From ded19a2d51e3a606621672b4dff3a52197ec7050 Mon Sep 17 00:00:00 2001
From: Ankur Dedania
Date: Tue, 30 Aug 2016 19:27:50 -0500
Subject: [PATCH 001/338] add compatibility support for PEP 519

https://github.com/pydata/pandas/issues/13823
---
 pandas/io/common.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6817c824ad786..fd8e69675329f 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -40,6 +40,13 @@
     _PY_PATH_INSTALLED = False
 
 
+try:
+    from os import fspath, PathLike
+    _OS_FSPATH_INSTALLED = True
+except ImportError:
+    _OS_FSPATH_INSTALLED = False
+
+
 if compat.PY3:
     from urllib.request import urlopen, pathname2url
     _urlopen = urlopen
@@ -183,6 +190,10 @@ def _stringify_path(filepath_or_buffer):
     -------
     str_filepath_or_buffer : the string version of the input path
     """
+    if _OS_FSPATH_INSTALLED and isinstance(filepath_or_buffer, PathLike):
+        return fspath(filepath_or_buffer)
+    elif hasattr(filepath_or_buffer, "__fspath__"):
+        return filepath_or_buffer.__fspath__()
     if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
         return text_type(filepath_or_buffer)
     if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):

From 3c8b99060d1a65b07ffb5b52ca99d3cc949dfce4 Mon Sep 17 00:00:00 2001
From: Ankur Dedania
Date: Tue, 30 Aug 2016 22:57:14 -0500
Subject: [PATCH 002/338] added __fspath__ to HDFStore class

---
 pandas/io/pytables.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index e474aeab1f6ca..7af200114f31c 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -44,7 +44,7 @@
 from pandas.core.index import _ensure_index
 from pandas.tools.merge import concat
 from pandas import compat
-from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter
+from pandas.compat import u_safe as u, PY3, range, lrange, string_types, text_type, filter
 from pandas.core.config import get_option
 from pandas.computation.pytables import Expr, maybe_expression
 
@@ -487,6 +487,9 @@ def __contains__(self, key):
                 return True
         return False
 
+    def __fspath__(self):
+        return text_type(self._path)
+
     def __len__(self):
         return len(self.groups())
 
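
A quick illustration of what patches 001-002 enable (an editorial sketch, not
part of the patch series; ``FakePath`` is a hypothetical stand-in for any
user-defined path-like class, and ``_stringify_path`` is the private helper
patched above):

    import pandas as pd
    from pandas.io.common import _stringify_path

    class FakePath(object):
        # Any object exposing __fspath__ is now unwrapped to its path
        # by the new branches in _stringify_path.
        def __fspath__(self):
            return "data/results.csv"

    _stringify_path(FakePath())      # -> 'data/results.csv'

    # HDFStore itself becomes path-like under patch 002 (needs PyTables):
    store = pd.HDFStore("store.h5")
    store.__fspath__()               # -> 'store.h5'
    store.close()
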
From a58221f1a8ff10ddcb3d42757cf4137ac9bd2b45 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 9 Jan 2017 12:52:23 -0500
Subject: [PATCH 003/338] TST/BLD: test 3.6 on appveyor (#15088)

CLN: clean up appveyor.yml a bit
---
 appveyor.yml                  |  32 +++-----
 ci/appveyor.recipe/meta.yaml  |   2 +-
 ci/install.ps1                |   6 +-
 ci/install_appveyor.ps1       | 133 ----------------------------------
 ci/requirements-3.6-64.run    |  13 ++++
 pandas/io/tests/test_excel.py |   2 +
 6 files changed, 30 insertions(+), 158 deletions(-)
 delete mode 100644 ci/install_appveyor.ps1
 create mode 100644 ci/requirements-3.6-64.run

diff --git a/appveyor.yml b/appveyor.yml
index a8e5218ab2c9f..33b8be57eba62 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -17,20 +17,19 @@ environment:
 
   matrix:
 
-    # disable python 3.4 ATM
-    #- PYTHON: "C:\\Python34_64"
-    #  PYTHON_VERSION: "3.4"
-    #  PYTHON_ARCH: "64"
-    #  CONDA_PY: "34"
-    #  CONDA_NPY: "19"
-
-    - PYTHON: "C:\\Python27_64"
+    - CONDA_ROOT: "C:\\Miniconda3.5_64"
+      PYTHON_VERSION: "3.6"
+      PYTHON_ARCH: "64"
+      CONDA_PY: "36"
+      CONDA_NPY: "111"
+
+    - CONDA_ROOT: "C:\\Miniconda3.5_64"
       PYTHON_VERSION: "2.7"
       PYTHON_ARCH: "64"
       CONDA_PY: "27"
       CONDA_NPY: "110"
 
-    - PYTHON: "C:\\Python35_64"
+    - CONDA_ROOT: "C:\\Miniconda3.5_64"
       PYTHON_VERSION: "3.5"
       PYTHON_ARCH: "64"
       CONDA_PY: "35"
@@ -45,9 +44,6 @@ platform:
 # all our python builds have to happen in tests_script...
 build: false
 
-init:
-  - "ECHO %PYTHON_VERSION% %PYTHON%"
-
 install:
   # cancel older builds for the same PR
   - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
@@ -58,7 +54,7 @@ install:
   # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit)
   # updates conda & installs: conda-build jinja2 anaconda-client
   - powershell .\ci\install.ps1
-  - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
+  - SET PATH=%CONDA_ROOT%;%CONDA_ROOT%\Scripts;%PATH%
   - echo "install"
   - cd
   - ls -ltr
@@ -70,13 +66,6 @@ install:
 
   # install our build environment
   - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false
   - cmd: conda update -q conda
-
-  # fix conda-build version
-  # https://github.com/conda/conda-build/issues/1001
-  # disabling 3.4 as windows complains upon compiling byte
-  # code
-
-  - cmd: conda install conda-build=1.21.7
   - cmd: conda config --set ssl_verify false
 
   # add the pandas channel *before* defaults to have defaults take priority
@@ -84,7 +73,6 @@ install:
   - cmd: conda config --add channels pandas
   - cmd: conda config --remove channels defaults
   - cmd: conda config --add channels defaults
-  - cmd: conda install anaconda-client
 
   # this is now the downloaded conda...
   - cmd: conda info -a
@@ -98,6 +86,8 @@ install:
   - SET REQ=ci\requirements-%PYTHON_VERSION%-%PYTHON_ARCH%.run
   - cmd: echo "installing requirements from %REQ%"
  - cmd: conda install -n pandas -q --file=%REQ%
+  - cmd: conda list -n pandas
+  - cmd: echo "installing requirements from %REQ% - done"
   - ps: conda install -n pandas (conda build ci\appveyor.recipe -q --output)
 
 test_script:
diff --git a/ci/appveyor.recipe/meta.yaml b/ci/appveyor.recipe/meta.yaml
index 6bf0a14bc7d0b..777fd9d682d48 100644
--- a/ci/appveyor.recipe/meta.yaml
+++ b/ci/appveyor.recipe/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: pandas
-  version: 0.18.1
+  version: 0.20.0
 
 build:
   number: {{environ.get('APPVEYOR_BUILD_NUMBER', 0)}}    # [win]
diff --git a/ci/install.ps1 b/ci/install.ps1
index 16c92dc76d273..64ec7f81884cd 100644
--- a/ci/install.ps1
+++ b/ci/install.ps1
@@ -84,9 +84,9 @@ function UpdateConda ($python_home) {
 
 function main () {
-    InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON
-    UpdateConda $env:PYTHON
-    InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client"
+    InstallMiniconda "3.5" $env:PYTHON_ARCH $env:CONDA_ROOT
+    UpdateConda $env:CONDA_ROOT
+    InstallCondaPackages $env:CONDA_ROOT "conda-build jinja2 anaconda-client"
 }
 
 main
diff --git a/ci/install_appveyor.ps1 b/ci/install_appveyor.ps1
deleted file mode 100644
index a022995dc7d58..0000000000000
--- a/ci/install_appveyor.ps1
+++ /dev/null
@@ -1,133 +0,0 @@
-# Sample script to install Miniconda under Windows
-# Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon
-# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/
-
-$MINICONDA_URL = "http://repo.continuum.io/miniconda/"
-
-
-function DownloadMiniconda ($python_version, $platform_suffix) {
-    $webclient = New-Object System.Net.WebClient
-    if ($python_version -match "3.4") {
-        $filename = "Miniconda3-3.5.5-Windows-" + $platform_suffix + ".exe"
-    } else {
-        $filename = "Miniconda-3.5.5-Windows-" + $platform_suffix + ".exe"
-    }
-    $url = $MINICONDA_URL + $filename
-
-    $basedir = $pwd.Path + "\"
-    $filepath = $basedir + $filename
-    if (Test-Path $filename) {
-        Write-Host "Reusing" $filepath
-        return $filepath
-    }
-
-    # Download and retry up to 3 times 
in case of network transient errors. - Write-Host "Downloading" $filename "from" $url - $retry_attempts = 2 - for($i=0; $i -lt $retry_attempts; $i++){ - try { - $webclient.DownloadFile($url, $filepath) - break - } - Catch [Exception]{ - Start-Sleep 1 - } - } - if (Test-Path $filepath) { - Write-Host "File saved at" $filepath - } else { - # Retry once to get the error message if any at the last try - $webclient.DownloadFile($url, $filepath) - } - return $filepath -} - -function Start-Executable { - param( - [String] $FilePath, - [String[]] $ArgumentList - ) - $OFS = " " - $process = New-Object System.Diagnostics.Process - $process.StartInfo.FileName = $FilePath - $process.StartInfo.Arguments = $ArgumentList - $process.StartInfo.UseShellExecute = $false - $process.StartInfo.RedirectStandardOutput = $true - if ( $process.Start() ) { - $output = $process.StandardOutput.ReadToEnd() ` - -replace "\r\n$","" - if ( $output ) { - if ( $output.Contains("`r`n") ) { - $output -split "`r`n" - } - elseif ( $output.Contains("`n") ) { - $output -split "`n" - } - else { - $output - } - } - $process.WaitForExit() - & "$Env:SystemRoot\system32\cmd.exe" ` - /c exit $process.ExitCode - } - } - -function InstallMiniconda ($python_version, $architecture, $python_home) { - Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home - if (Test-Path $python_home) { - Write-Host $python_home "already exists, skipping." - return $false - } - if ($architecture -match "32") { - $platform_suffix = "x86" - } else { - $platform_suffix = "x86_64" - } - - $filepath = DownloadMiniconda $python_version $platform_suffix - Write-Host "Installing" $filepath "to" $python_home - $install_log = $python_home + ".log" - $args = "/S /D=$python_home" - Write-Host $filepath $args - Start-Process -FilePath $filepath -ArgumentList $args -Wait - if (Test-Path $python_home) { - Write-Host "Python $python_version ($architecture) installation complete" - } else { - Write-Host "Failed to install Python in $python_home" - Get-Content -Path $install_log - Exit 1 - } -} - - -function InstallCondaPackages ($python_home, $spec) { - $conda_path = $python_home + "\Scripts\conda.exe" - $args = "install --yes --quiet " + $spec - Write-Host ("conda " + $args) - Start-Executable -FilePath "$conda_path" -ArgumentList $args -} -function InstallCondaPackagesFromFile ($python_home, $ver, $arch) { - $conda_path = $python_home + "\Scripts\conda.exe" - $args = "install --yes --quiet --file " + $env:APPVEYOR_BUILD_FOLDER + "\ci\requirements-" + $ver + "_" + $arch + ".txt" - Write-Host ("conda " + $args) - Start-Executable -FilePath "$conda_path" -ArgumentList $args -} - -function UpdateConda ($python_home) { - $conda_path = $python_home + "\Scripts\conda.exe" - Write-Host "Updating conda..." 
-    $args = "update --yes conda"
-    Write-Host $conda_path $args
-    Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait
-}
-
-
-function main () {
-    InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON
-    UpdateConda $env:PYTHON
-    InstallCondaPackages $env:PYTHON "pip setuptools nose"
-    InstallCondaPackagesFromFile $env:PYTHON $env:PYTHON_VERSION $env:PYTHON_ARCH
-}
-
-main
\ No newline at end of file
diff --git a/ci/requirements-3.6-64.run b/ci/requirements-3.6-64.run
new file mode 100644
index 0000000000000..58ba103504b2c
--- /dev/null
+++ b/ci/requirements-3.6-64.run
@@ -0,0 +1,13 @@
+python-dateutil
+pytz
+numpy
+openpyxl
+xlsxwriter
+xlrd
+#xlwt
+scipy
+feather-format
+numexpr
+pytables
+matplotlib
+blosc
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index 8db0e6202f7fc..4b8e7714a911e 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -448,6 +448,7 @@ def test_read_excel_blank_with_header(self):
 
     # GH 12292 : error when read one empty column from excel file
     def test_read_one_empty_col_no_header(self):
+        _skip_if_no_xlwt()
         df = pd.DataFrame(
             [["", 1, 100],
              ["", 2, 200],
@@ -504,6 +505,7 @@ def test_read_one_empty_col_with_header(self):
         tm.assert_frame_equal(actual_header_zero, expected_header_zero)
 
     def test_set_column_names_in_parameter(self):
+        _skip_if_no_xlwt()
         # GH 12870 : pass down column names associated with
         # keyword argument names
         refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'],

From 30ea130d81b34837875e86fbf30cc9031021aaf1 Mon Sep 17 00:00:00 2001
From: tomrod
Date: Mon, 9 Jan 2017 13:27:54 -0500
Subject: [PATCH 004/338] ERR: Clarified error in read_sas method when buffer
 object provided without a format

Author: tomrod

Closes #14947 from tomrod/sas_read_format_bugfix and squashes the following commits:

1285dbb [tomrod] flake8 testing
4cf9231 [tomrod] PEP8 compliance
ab76d80 [tomrod] Updating to match pep8 whitespace requirements in sasreader.py
ffdce1d [tomrod] Updating to match pep8 as per @jorisvandenbossche
aa1ada3 [tomrod] More specific error message, moved imports to top of files
bf60d23 [tomrod] Adding tests, creating updated information
5efdb85 [tomrod] Adding tests
f8166fc [tomrod] Updated based on feedback from jreback
b52f204 [tomrod] Clarified error in read_sas method when buffer object provided without format
---
 doc/source/whatsnew/v0.20.0.txt |  1 +
 pandas/io/sas/sasreader.py      |  7 +++++++
 pandas/io/tests/sas/test_sas.py | 13 +++++++++++++
 3 files changed, 21 insertions(+)
 create mode 100644 pandas/io/tests/sas/test_sas.py

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 0148a47068beb..8b6b765a81dba 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -282,6 +282,7 @@ Performance Improvements
 
 - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
+- ``pd.read_sas()`` now raises a more informative ``ValueError`` when a buffer object is provided without a specified ``format`` (:issue:`14947`)
 
 .. _whatsnew_0200.bug_fixes:
 
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index 081d780f71cb3..29e7f131fd9bc 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -1,10 +1,12 @@
 """
 Read SAS sas7bdat or xport files.
 """
+from pandas import compat
 
 
 def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
              chunksize=None, iterator=False):
+
     """
     Read SAS files stored as either XPORT or SAS7BDAT format files.
 
@@ -29,7 +31,12 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
         DataFrame if iterator=False and chunksize=None, else
         SAS7BDATReader or XportReader
     """
     if format is None:
+        buffer_error_msg = ("If this is a buffer object rather "
+                            "than a string name, you must specify "
+                            "a format string")
+        if not isinstance(filepath_or_buffer, compat.string_types):
+            raise ValueError(buffer_error_msg)
         try:
             fname = filepath_or_buffer.lower()
             if fname.endswith(".xpt"):
diff --git a/pandas/io/tests/sas/test_sas.py b/pandas/io/tests/sas/test_sas.py
new file mode 100644
index 0000000000000..237e3676c3b3d
--- /dev/null
+++ b/pandas/io/tests/sas/test_sas.py
@@ -0,0 +1,13 @@
+import pandas.util.testing as tm
+from pandas.compat import StringIO
+from pandas import read_sas
+
+
+class TestSas(tm.TestCase):
+
+    def test_sas_buffer_format(self):
+
+        # GH14947
+        b = StringIO("")
+        with self.assertRaises(ValueError):
+            read_sas(b)

From d432464b3c988f0aa71615ac8018e154e342b3ee Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 9 Jan 2017 13:48:53 -0500
Subject: [PATCH 005/338] BLD: add deps for 3.6 build (#15089)

---
 ci/requirements-3.6.pip |  1 +
 ci/requirements-3.6.run | 16 ++++++++++++++++
 ci/requirements-3.6.sh  |  7 +++++++
 3 files changed, 24 insertions(+)
 create mode 100644 ci/requirements-3.6.pip
 create mode 100644 ci/requirements-3.6.sh

diff --git a/ci/requirements-3.6.pip b/ci/requirements-3.6.pip
new file mode 100644
index 0000000000000..688866a18542f
--- /dev/null
+++ b/ci/requirements-3.6.pip
@@ -0,0 +1 @@
+xlwt
diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run
index 684dab333648a..daf07be2d1785 100644
--- a/ci/requirements-3.6.run
+++ b/ci/requirements-3.6.run
@@ -2,3 +2,19 @@ python-dateutil
 pytz
 numpy
 scipy
+openpyxl
+xlsxwriter
+xlrd
+# xlwt (installed via pip)
+numexpr
+pytables
+# matplotlib (not avail on defaults ATM)
+lxml
+html5lib
+jinja2
+sqlalchemy
+pymysql
+# psycopg2 (not avail on defaults ATM)
+beautifulsoup4
+s3fs
+xarray
diff --git a/ci/requirements-3.6.sh b/ci/requirements-3.6.sh
new file mode 100644
index 0000000000000..7d88ede751ec8
--- /dev/null
+++ b/ci/requirements-3.6.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source activate pandas
+
+echo "install 36"
+
+conda install -n pandas -c conda-forge feather-format
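
A minimal sketch of the new error from patch 004 (illustration only, not part
of the patch series; it mirrors the test added in test_sas.py):

    import pandas as pd
    from pandas.compat import StringIO

    try:
        pd.read_sas(StringIO(""))      # a buffer, but no format= given
    except ValueError as err:
        print(err)   # "If this is a buffer object rather than a string
                     #  name, you must specify a format string"

    # Passing the format explicitly keeps buffers usable, e.g.
    # pd.read_sas(buf, format="xport") for an assumed XPORT buffer.
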
From 38d6d7cb967ef030d98f795cb6220e327744cfde Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 9 Jan 2017 20:03:47 +0100
Subject: [PATCH 006/338] PERF: improve perf of float-based timeseries
 plotting (#15073)

---
 asv_bench/benchmarks/plotting.py       | 3 +++
 doc/source/whatsnew/v0.20.0.txt        | 4 ++++
 pandas/tseries/converter.py            | 2 +-
 pandas/tseries/tests/test_converter.py | 6 +++---
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
index 3350ddaccc496..757c3e27dd333 100644
--- a/asv_bench/benchmarks/plotting.py
+++ b/asv_bench/benchmarks/plotting.py
@@ -20,6 +20,9 @@ def setup(self):
     def time_plot_regular(self):
         self.df.plot()
 
+    def time_plot_regular_compat(self):
+        self.df.plot(x_compat=True)
+
 
 class Misc(object):
     goal_time = 0.6
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8b6b765a81dba..da13f724eb663 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -281,10 +281,14 @@ Performance Improvements
 
 - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
+- Improved performance of timeseries plotting with an irregular DatetimeIndex
+  (or with ``x_compat=True``) (:issue:`15073`).
+
 
 - ``pd.read_sas()`` now raises a more informative ``ValueError`` when a buffer object is provided without a specified ``format`` (:issue:`14947`)
 
+
 .. _whatsnew_0200.bug_fixes:
 
 Bug Fixes
diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py
index 8f8519a498a31..95ff9578fa3ee 100644
--- a/pandas/tseries/converter.py
+++ b/pandas/tseries/converter.py
@@ -212,7 +212,7 @@ def try_parse(values):
         try:
             values = tools.to_datetime(values)
             if isinstance(values, Index):
-                values = values.map(_dt_to_float_ordinal)
+                values = _dt_to_float_ordinal(values)
             else:
                 values = [_dt_to_float_ordinal(x) for x in values]
         except Exception:
diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py
index 7e4ed288e31c1..37d9c35639c32 100644
--- a/pandas/tseries/tests/test_converter.py
+++ b/pandas/tseries/tests/test_converter.py
@@ -3,7 +3,7 @@
 import nose
 import numpy as np
 
-from pandas import Timestamp, Period, Index
+from pandas import Timestamp, Period
 from pandas.compat import u
 import pandas.util.testing as tm
 from pandas.tseries.offsets import Second, Milli, Micro
@@ -104,8 +104,8 @@ def test_dateindex_conversion(self):
         for freq in ('B', 'L', 'S'):
             dateindex = tm.makeDateIndex(k=10, freq=freq)
             rs = self.dtc.convert(dateindex, None, None)
-            xp = Index(converter.dates.date2num(dateindex._mpl_repr()))
-            tm.assert_index_equal(rs, xp, decimals)
+            xp = converter.dates.date2num(dateindex._mpl_repr())
+            tm.assert_almost_equal(rs, xp, decimals)
 
     def test_resolution(self):
         def _assert_less(ts1, ts2):

From 07b3c420c0e4cef03de2dc500908179f99fd85ec Mon Sep 17 00:00:00 2001
From: "D.S. McNeil"
Date: Tue, 10 Jan 2017 08:16:15 -0500
Subject: [PATCH 007/338] BUG: read_fwf inference should respect skiprows
 (#11256)

Fix the fact that we don't skip the rows when inferring colspecs by
passing skiprows down the chain until it's needed.

- [X] closes #11256
- [X] 3 tests added / passed
- [X] passes `git diff upstream/master | flake8 --diff`
- [X] whatsnew entry

Author: D.S. McNeil

Closes #14028 from dsm054/bugfix/fwf_skiprows and squashes the following commits:

b5b3e66 [D.S. 
McNeil] BUG: read_fwf inference should respect skiprows (#11256) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 61 +++++++++++++++------ pandas/io/tests/parser/test_read_fwf.py | 72 +++++++++++++++++++------ 3 files changed, 101 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index da13f724eb663..c9ea7b427b3f2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -305,6 +305,7 @@ Bug Fixes - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`) - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) +- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8a9873b240602..41f1ab6fc16fb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -323,7 +323,7 @@ fields of each line as half-open intervals (i.e., [from, to[ ). String value 'infer' can be used to instruct the parser to try detecting the column specifications from the first 100 rows of - the data (default='infer'). + the data which are not being skipped via skiprows (default='infer'). widths : list of ints. optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. @@ -3034,13 +3034,13 @@ class FixedWidthReader(BaseIterator): A reader of fixed-width lines. """ - def __init__(self, f, colspecs, delimiter, comment): + def __init__(self, f, colspecs, delimiter, comment, skiprows=None): self.f = f self.buffer = None self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' self.comment = comment if colspecs == 'infer': - self.colspecs = self.detect_colspecs() + self.colspecs = self.detect_colspecs(skiprows=skiprows) else: self.colspecs = colspecs @@ -3049,7 +3049,6 @@ def __init__(self, f, colspecs, delimiter, comment): "input was a %r" % type(colspecs).__name__) for colspec in self.colspecs: - if not (isinstance(colspec, (tuple, list)) and len(colspec) == 2 and isinstance(colspec[0], (int, np.integer, type(None))) and @@ -3057,20 +3056,50 @@ def __init__(self, f, colspecs, delimiter, comment): raise TypeError('Each column specification must be ' '2 element tuple or list of integers') - def get_rows(self, n): - rows = [] - for i, row in enumerate(self.f, 1): - rows.append(row) - if i >= n: + def get_rows(self, n, skiprows=None): + """ + Read rows from self.f, skipping as specified. + + We distinguish buffer_rows (the first <= n lines) + from the rows returned to detect_colspecs because + it's simpler to leave the other locations with + skiprows logic alone than to modify them to deal + with the fact we skipped some rows here as well. + + Parameters + ---------- + n : int + Number of rows to read from self.f, not counting + rows that are skipped. + skiprows: set, optional + Indices of rows to skip. + + Returns + ------- + detect_rows : list of str + A list containing the rows to read. 
+ + """ + if skiprows is None: + skiprows = set() + buffer_rows = [] + detect_rows = [] + for i, row in enumerate(self.f): + if i not in skiprows: + detect_rows.append(row) + buffer_rows.append(row) + if len(detect_rows) >= n: break - self.buffer = iter(rows) - return rows + self.buffer = iter(buffer_rows) + return detect_rows - def detect_colspecs(self, n=100): + def detect_colspecs(self, n=100, skiprows=None): # Regex escape the delimiters delimiters = ''.join([r'\%s' % x for x in self.delimiter]) pattern = re.compile('([^%s]+)' % delimiters) - rows = self.get_rows(n) + rows = self.get_rows(n, skiprows) + if not rows: + raise EmptyDataError("No rows from which to infer column width") max_len = max(map(len, rows)) mask = np.zeros(max_len + 1, dtype=int) if self.comment is not None: @@ -3081,7 +3110,8 @@ def detect_colspecs(self, n=100): shifted = np.roll(mask, 1) shifted[0] = 0 edges = np.where((mask ^ shifted) == 1)[0] - return list(zip(edges[::2], edges[1::2])) + edge_pairs = list(zip(edges[::2], edges[1::2])) + return edge_pairs def __next__(self): if self.buffer is not None: @@ -3106,9 +3136,8 @@ class FixedWidthFieldParser(PythonParser): def __init__(self, f, **kwds): # Support iterators, convert to a list. self.colspecs = kwds.pop('colspecs') - PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - self.comment) + self.comment, self.skiprows) diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py index 42b1116280a1e..a423355081ac3 100644 --- a/pandas/io/tests/parser/test_read_fwf.py +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -16,7 +16,7 @@ from pandas import DataFrame from pandas import compat from pandas.compat import StringIO, BytesIO -from pandas.io.parsers import read_csv, read_fwf +from pandas.io.parsers import read_csv, read_fwf, EmptyDataError class TestFwfParsing(tm.TestCase): @@ -248,83 +248,83 @@ def test_bool_header_arg(self): def test_full_file(self): # File with all values - test = '''index A B C + test = """index A B C 2000-01-03T00:00:00 0.980268513777 3 foo 2000-01-04T00:00:00 1.04791624281 -4 bar 2000-01-05T00:00:00 0.498580885705 73 baz 2000-01-06T00:00:00 1.12020151869 1 foo 2000-01-07T00:00:00 0.487094399463 0 bar 2000-01-10T00:00:00 0.836648671666 2 baz -2000-01-11T00:00:00 0.157160753327 34 foo''' +2000-01-11T00:00:00 0.157160753327 34 foo""" colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_full_file_with_missing(self): # File with missing values - test = '''index A B C + test = """index A B C 2000-01-03T00:00:00 0.980268513777 3 foo 2000-01-04T00:00:00 1.04791624281 -4 bar 0.498580885705 73 baz 2000-01-06T00:00:00 1.12020151869 1 foo 2000-01-07T00:00:00 0 bar 2000-01-10T00:00:00 0.836648671666 2 baz - 34''' + 34""" colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_full_file_with_spaces(self): # File with spaces in columns - test = ''' + test = """ Account Name Balance CreditLimit AccountCreated 101 Keanu Reeves 9315.45 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 8/6/2003 868 Jennifer Love Hewitt 0 17000.00 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 5000.00 2/5/2007 -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), 
(56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_full_file_with_spaces_and_missing(self): # File with spaces and missing values in columsn - test = ''' + test = """ Account Name Balance CreditLimit AccountCreated 101 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 8/6/2003 868 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_messed_up_data(self): # Completely messed up file - test = ''' + test = """ Account Name Balance Credit Limit Account Created 101 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_multiple_delimiters(self): - test = r''' + test = r""" col1~~~~~col2 col3++++++++++++++++++col4 ~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves 33+++122.33\\\bar.........Gerard Butler ++44~~~~12.01 baz~~Jennifer Love Hewitt ~~55 11+++foo++++Jada Pinkett-Smith ..66++++++.03~~~bar Bill Murray -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=' +~.\\') @@ -335,11 +335,11 @@ def test_variable_width_unicode(self): if not compat.PY3: raise nose.SkipTest( 'Bytes-related test - only needs to work on Python 3') - test = ''' + test = """ שלום שלום ום שלל של ום -'''.strip('\r\n') +""".strip('\r\n') expected = read_fwf(BytesIO(test.encode('utf8')), colspecs=[(0, 4), (5, 9)], header=None, encoding='utf8') @@ -347,10 +347,10 @@ def test_variable_width_unicode(self): BytesIO(test.encode('utf8')), header=None, encoding='utf8')) def test_dtype(self): - data = ''' a b c + data = """ a b c 1 2 3.2 3 4 5.2 -''' +""" colspecs = [(0, 5), (5, 10), (10, None)] result = pd.read_fwf(StringIO(data), colspecs=colspecs) expected = pd.DataFrame({ @@ -365,3 +365,41 @@ def test_dtype(self): result = pd.read_fwf(StringIO(data), colspecs=colspecs, dtype={'a': 'float64', 'b': str, 'c': 'int32'}) tm.assert_frame_equal(result, expected) + + def test_skiprows_inference(self): + # GH11256 + test = """ +Text contained in the file header + +DataCol1 DataCol2 + 0.0 1.0 + 101.6 956.1 +""".strip() + expected = read_csv(StringIO(test), skiprows=2, + delim_whitespace=True) + tm.assert_frame_equal(expected, read_fwf( + StringIO(test), skiprows=2)) + + def test_skiprows_by_index_inference(self): + test = """ +To be skipped +Not To Be Skipped +Once more to be skipped +123 34 8 123 +456 78 9 456 +""".strip() + + expected = read_csv(StringIO(test), skiprows=[0, 2], + delim_whitespace=True) + tm.assert_frame_equal(expected, read_fwf( + StringIO(test), skiprows=[0, 2])) + + def test_skiprows_inference_empty(self): + test = """ +AA BBB C +12 345 6 +78 901 2 +""".strip() + + with tm.assertRaises(EmptyDataError): + read_fwf(StringIO(test), skiprows=3) From 7cad7dda9474597b37b73df9b01cab3ef1a34370 Mon Sep 17 00:00:00 2001 From: Lukasz Date: Wed, 11 Jan 2017 00:37:14 +0900 Subject: [PATCH 008/338] fix misspelled word (#15097) `colunwise` should be `columnwise` --- pandas/formats/style.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pandas/formats/style.py b/pandas/formats/style.py index 4d5e72a38bb98..b3e0f0f6c7462 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -781,7 +781,7 @@ def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0, low, high: float compress the range by these values. axis: int or str - 1 or 'columns' for colunwise, 0 or 'index' for rowwise + 1 or 'columns' for columnwise, 0 or 'index' for rowwise subset: IndexSlice a valid slice for ``data`` to limit the style application to From 0e3354ed23264c2845c29cc2087f9c1f1a25c848 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 Jan 2017 16:10:04 -0500 Subject: [PATCH 009/338] TST: close sas read handles properly in tests (#15100) --- pandas/io/tests/sas/test_sas7bdat.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index e20ea48247119..69073a90e9669 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -51,6 +51,7 @@ def test_from_buffer(self): iterator=True, encoding='utf-8') df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) + rdr.close() def test_from_iterator(self): for j in 0, 1: @@ -62,6 +63,7 @@ def test_from_iterator(self): tm.assert_frame_equal(df, df0.iloc[0:2, :]) df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) + rdr.close() def test_iterator_loop(self): # github #13654 @@ -74,6 +76,7 @@ def test_iterator_loop(self): for x in rdr: y += x.shape[0] self.assertTrue(y == rdr.row_count) + rdr.close() def test_iterator_read_too_much(self): # github #14734 @@ -82,9 +85,12 @@ def test_iterator_read_too_much(self): rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding='utf-8') d1 = rdr.read(rdr.row_count + 20) + rdr.close() + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") d2 = rdr.read(rdr.row_count + 20) tm.assert_frame_equal(d1, d2) + rdr.close() def test_encoding_options(): From b54b6a3642dd3628978401d0ceb185ca7de2b171 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 Jan 2017 16:11:26 -0500 Subject: [PATCH 010/338] CLN: check NULL and have similar __dealloc__ code for all hashtables (#15101) --- pandas/src/hashtable_class_helper.pxi.in | 26 +++++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index b26839599ef38..1d3c4b2cb5889 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -85,7 +85,9 @@ cdef class {{name}}Vector: self.data.data = <{{arg}}*> self.ao.data def __dealloc__(self): - PyMem_Free(self.data) + if self.data is not NULL: + PyMem_Free(self.data) + self.data = NULL def __len__(self): return self.data.n @@ -133,8 +135,11 @@ cdef class StringVector: self.data.data[i] = orig_data[i] def __dealloc__(self): - free(self.data.data) - PyMem_Free(self.data) + if self.data is not NULL: + if self.data.data is not NULL: + free(self.data.data) + PyMem_Free(self.data) + self.data = NULL def __len__(self): return self.data.n @@ -223,7 +228,9 @@ cdef class {{name}}HashTable(HashTable): return self.table.size def __dealloc__(self): - kh_destroy_{{dtype}}(self.table) + if self.table is not NULL: + kh_destroy_{{dtype}}(self.table) + self.table = NULL def __contains__(self, object key): cdef khiter_t k @@ -453,7 +460,9 @@ cdef class StringHashTable(HashTable): kh_resize_str(self.table, size_hint) def __dealloc__(self): - 
kh_destroy_str(self.table) + if self.table is not NULL: + kh_destroy_str(self.table) + self.table = NULL cpdef get_item(self, object val): cdef: @@ -691,7 +700,8 @@ cdef class PyObjectHashTable(HashTable): def __dealloc__(self): if self.table is not NULL: - self.destroy() + kh_destroy_pymap(self.table) + self.table = NULL def __len__(self): return self.table.size @@ -704,10 +714,6 @@ cdef class PyObjectHashTable(HashTable): k = kh_get_pymap(self.table, key) return k != self.table.n_buckets - def destroy(self): - kh_destroy_pymap(self.table) - self.table = NULL - cpdef get_item(self, object val): cdef khiter_t k if val != val or val is None: From 6b049bada57a5cdb99650958d1de1b011082b9e7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 Jan 2017 16:46:54 -0500 Subject: [PATCH 011/338] TST: add network decorator on parser network tests for compressed url's, was missing (#15102) --- pandas/io/tests/parser/test_network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index 8e71cf1cc7e4c..d84c2ae3beb0c 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -30,8 +30,9 @@ def __init__(self): self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/' 'pandas/io/tests/parser/data/salaries.csv') + @tm.network def test_compressed_urls(self): - """Test reading compressed tables from URL.""" + # Test reading compressed tables from URL. msg = ('Test reading {}-compressed tables from URL: ' 'compression="{}", engine="{}"') From c0142098d210e1fa631bf55141f3b4736ead74b4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 10 Jan 2017 23:57:20 -0800 Subject: [PATCH 012/338] BUG: Patch missing data handling with usecols (#15066) Closes gh-6710. Closes gh-8985. --- doc/source/io.rst | 13 +++++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 5 +++-- pandas/io/tests/parser/usecols.py | 25 +++++++++++++++++++++++++ pandas/parser.pyx | 3 ++- 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9683fedb78303..dae97f7bc7f34 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1215,6 +1215,19 @@ You can elect to skip bad lines: 0 1 2 3 1 8 9 10 +You can also use the ``usecols`` parameter to eliminate extraneous column +data that appear in some lines but not others: + +.. code-block:: ipython + + In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) + + Out[30]: + a b c + 0 1 2 3 + 1 4 5 6 + 2 8 9 10 + .. 
_io.quoting:
 
 Quoting and Escape Characters
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index c9ea7b427b3f2..70fddea3fe1a9 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -306,6 +306,7 @@ Bug Fixes
 - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`)
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
+- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 41f1ab6fc16fb..f2c3113fc2cdd 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -2295,11 +2295,12 @@ def _infer_columns(self):
                 columns = [lrange(ncols)]
             columns = self._handle_usecols(columns, columns[0])
         else:
-            if self.usecols is None or len(names) == num_original_columns:
+            if self.usecols is None or len(names) >= num_original_columns:
                 columns = self._handle_usecols([names], names)
                 num_original_columns = len(names)
             else:
-                if self.usecols and len(names) != len(self.usecols):
+                if (not callable(self.usecols) and
+                        len(names) != len(self.usecols)):
                     raise ValueError(
                         'Number of passed names did not match number of '
                         'header fields in the file'
diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
index c654859f8dc7d..96790e872abc3 100644
--- a/pandas/io/tests/parser/usecols.py
+++ b/pandas/io/tests/parser/usecols.py
@@ -440,3 +440,28 @@ def test_callable_usecols(self):
         expected = DataFrame()
         df = self.read_csv(StringIO(s), usecols=lambda x: False)
         tm.assert_frame_equal(df, expected)
+
+    def test_incomplete_first_row(self):
+        # see gh-6710
+        data = '1,2\n1,2,3'
+        names = ['a', 'b', 'c']
+        expected = DataFrame({'a': [1, 1],
+                              'c': [np.nan, 3]})
+
+        usecols = ['a', 'c']
+        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = lambda x: x in ['a', 'c']
+        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+    def test_uneven_length_cols(self):
+        # see gh-8985
+        usecols = [0, 1, 2]
+        data = '19,29,39\n' * 2 + '10,20,30,40'
+        expected = DataFrame([[19, 29, 39],
+                              [19, 29, 39],
+                              [10, 20, 30]])
+        df = self.read_csv(StringIO(data), header=None, usecols=usecols)
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index c5082e999d19c..7b31f7fe27c1e 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -1317,7 +1317,8 @@ cdef class TextReader:
 
     cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
         if self.has_usecols and self.names is not None:
-            if len(self.names) == len(self.usecols):
+            if (not callable(self.usecols) and
+                    len(self.names) == len(self.usecols)):
                 return self.names[nused]
             else:
                 return self.names[i - self.leading_cols]
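
A short sketch of the behaviour patch 012 fixes (illustration only, not part
of the patch series; it mirrors the added test_uneven_length_cols test):

    import pandas as pd
    from pandas.compat import StringIO

    data = '19,29,39\n19,29,39\n10,20,30,40'
    df = pd.read_csv(StringIO(data), header=None, usecols=[0, 1, 2])
    # The stray fourth field on the last row is now dropped instead of
    # mishandling the missing data:
    #     0   1   2
    # 0  19  29  39
    # 1  19  29  39
    # 2  10  20  30
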
From f2526232c1f39e8a3f9a8fbc84bea24c12fc56f1 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 11 Jan 2017 03:02:33 -0500
Subject: [PATCH 013/338] DEPR: deprecate sortlevel in favor of sort_index
 (#15099)

xref #14279
---
 doc/source/api.rst                     |  2 --
 doc/source/whatsnew/v0.20.0.txt        |  2 +-
 pandas/core/frame.py                   |  6 +++-
 pandas/core/reshape.py                 |  4 +--
 pandas/core/series.py                  |  4 +++
 pandas/sparse/frame.py                 |  2 +-
 pandas/sparse/tests/test_series.py     |  2 +-
 pandas/stats/plm.py                    |  4 +--
 pandas/tests/frame/test_misc_api.py    |  4 ---
 pandas/tests/frame/test_operators.py   |  6 ++--
 pandas/tests/frame/test_sorting.py     | 18 +++------
 pandas/tests/groupby/test_groupby.py   |  6 ++--
 pandas/tests/indexes/test_multi.py     |  2 +-
 pandas/tests/indexing/test_indexing.py | 32 ++++++++---------
 pandas/tests/series/test_analytics.py  | 13 +++----
 pandas/tests/series/test_sorting.py    |  2 ++
 pandas/tests/test_categorical.py       |  2 +-
 pandas/tests/test_multilevel.py        | 50 +++++++++++++-------------
 pandas/tests/test_panel.py             |  4 +--
 pandas/tools/pivot.py                  |  5 +--
 pandas/tools/tests/test_join.py        | 10 +++---
 pandas/tools/tests/test_pivot.py       |  2 +-
 22 files changed, 88 insertions(+), 94 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 272dfe72eafe7..b7a1b8a005d89 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -418,7 +418,6 @@ Reshaping, sorting
    Series.reorder_levels
    Series.sort_values
    Series.sort_index
-   Series.sortlevel
    Series.swaplevel
    Series.unstack
    Series.searchsorted
@@ -931,7 +930,6 @@ Reshaping, sorting, transposing
    DataFrame.reorder_levels
    DataFrame.sort_values
    DataFrame.sort_index
-   DataFrame.sortlevel
    DataFrame.nlargest
    DataFrame.nsmallest
    DataFrame.swaplevel
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ef731ad5e92df..70fddea3fe1a9 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -261,7 +261,7 @@ Deprecations
 - ``Categorical.searchsorted()`` and ``Series.searchsorted()`` have deprecated the ``v`` parameter in favor of ``value`` (:issue:`12662`)
 - ``TimedeltaIndex.searchsorted()``, ``DatetimeIndex.searchsorted()``, and ``PeriodIndex.searchsorted()`` have deprecated the ``key`` parameter in favor of ``value`` (:issue:`12662`)
 - ``DataFrame.astype()`` has deprecated the ``raise_on_error`` parameter in favor of ``errors`` (:issue:`14878`)
-
+- ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`)
 
 .. _whatsnew_0200.prior_deprecations:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b9290c0ce3457..4288e03c2cc49 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1274,7 +1274,7 @@ def to_panel(self):
 
         # minor axis must be sorted
         if self.index.lexsort_depth < 2:
-            selfsorted = self.sortlevel(0)
+            selfsorted = self.sort_index(level=0)
         else:
             selfsorted = self
 
@@ -3337,6 +3337,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
     def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
                   sort_remaining=True):
         """
+        DEPRECATED: use :meth:`DataFrame.sort_index`
+
         Sort multilevel index by chosen axis and primary level. Data will be
         lexicographically sorted by the chosen level followed by the other
         levels (in order)
@@ -3360,6 +3362,8 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
         DataFrame.sort_index(level=...) 
""" + warnings.warn("sortlevel is deprecated, use sort_index(level= ...)", + FutureWarning, stacklevel=2) return self.sort_index(level=level, axis=axis, ascending=ascending, inplace=inplace, sort_remaining=sort_remaining) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b359c54535b28..0831568e8c955 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -375,7 +375,7 @@ def pivot_simple(index, columns, values): hindex = MultiIndex.from_arrays([index, columns]) series = Series(values.ravel(), index=hindex) - series = series.sortlevel(0) + series = series.sort_index(level=0) return series.unstack() @@ -596,7 +596,7 @@ def _convert_level_number(level_num, columns): # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) - this = this.sortlevel(level_to_sort, axis=1) + this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: diff --git a/pandas/core/series.py b/pandas/core/series.py index 0b29e8c93a12d..ab1498c617467 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1988,6 +1988,8 @@ def nsmallest(self, n=5, keep='first'): def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ + DEPRECATED: use :meth:`Series.sort_index` + Sort Series with MultiIndex by chosen level. Data will be lexicographically sorted by the chosen level followed by the other levels (in order) @@ -2006,6 +2008,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Series.sort_index(level=...) """ + warnings.warn("sortlevel is deprecated, use sort_index(level=...)", + FutureWarning, stacklevel=2) return self.sort_index(level=level, ascending=ascending, sort_remaining=sort_remaining) diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 4cdcbc6fca32c..1fc93a967bdbb 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -816,7 +816,7 @@ def stack_sparse_frame(frame): lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=['foo']) - return lp.sortlevel(level=0) + return lp.sort_index(level=0) def homogenize(series_dict): diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 14339ab388a5d..b5ca3a730ed27 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -947,7 +947,7 @@ def setUp(self): micol = pd.MultiIndex.from_product( [['a', 'b', 'c'], ["1", "2"]], names=['col-foo', 'col-bar']) dense_multiindex_frame = pd.DataFrame( - index=miindex, columns=micol).sortlevel().sortlevel(axis=1) + index=miindex, columns=micol).sort_index().sort_index(axis=1) self.dense_multiindex_frame = dense_multiindex_frame.fillna(value=3.14) def test_to_sparse_preserve_multiindex_names_columns(self): diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index 099c45d5ec60b..806dc289f843a 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -792,8 +792,8 @@ def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid']) if cluster_axis == 1: - x = x.swaplevel(0, 1).sortlevel(0) - resid = resid.swaplevel(0, 1).sortlevel(0) + x = x.swaplevel(0, 1).sort_index(level=0) + resid = resid.swaplevel(0, 1).sort_index(level=0) m = _group_agg(x.values * resid.values, x.index._bounds, lambda x: np.sum(x, axis=0)) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index bc750727493a3..b739d087b8e93 100644 --- 
a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -456,10 +456,6 @@ def _check_f(base, f): f = lambda x: x.sort_index(inplace=True) _check_f(data.copy(), f) - # sortlevel - f = lambda x: x.sortlevel(0, inplace=True) - _check_f(data.set_index(['a', 'b']), f) - # fillna f = lambda x: x.fillna(0, inplace=True) _check_f(data.copy(), f) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 8462d5cd9bcf6..f7e821c0434aa 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -467,7 +467,7 @@ def test_binary_ops_align(self): df = DataFrame(np.arange(27 * 3).reshape(27, 3), index=index, - columns=['value1', 'value2', 'value3']).sortlevel() + columns=['value1', 'value2', 'value3']).sort_index() idx = pd.IndexSlice for op in ['add', 'sub', 'mul', 'div', 'truediv']: @@ -479,7 +479,7 @@ def test_binary_ops_align(self): result = getattr(df, op)(x, level='third', axis=0) expected = pd.concat([opa(df.loc[idx[:, :, i], :], v) - for i, v in x.iteritems()]).sortlevel() + for i, v in x.iteritems()]).sort_index() assert_frame_equal(result, expected) x = Series([1.0, 10.0], ['two', 'three']) @@ -487,7 +487,7 @@ def test_binary_ops_align(self): expected = (pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()]) - .reindex_like(df).sortlevel()) + .reindex_like(df).sort_index()) assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 579a4bf5d54d5..2abef59df284d 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -53,16 +53,6 @@ def test_sort_index_multiindex(self): mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) - result = df.sort_index(level='A', sort_remaining=False) - expected = df.sortlevel('A', sort_remaining=False) - assert_frame_equal(result, expected) - - # sort columns by specified level of multi-index - df = df.T - result = df.sort_index(level='A', axis=1, sort_remaining=False) - expected = df.sortlevel('A', axis=1, sort_remaining=False) - assert_frame_equal(result, expected) - # MI sort, but no level: sort_level has no effect mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) @@ -79,6 +69,8 @@ def test_sort(self): frame.sort(columns='A') with tm.assert_produces_warning(FutureWarning): frame.sort() + with tm.assert_produces_warning(FutureWarning): + frame.sortlevel() def test_sort_values(self): frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], @@ -453,13 +445,13 @@ def test_sort_index_duplicates(self): result = df.sort_values(by=('a', 1)) assert_frame_equal(result, expected) - def test_sortlevel(self): + def test_sort_index_level(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) - res = df.sortlevel('A', sort_remaining=False) + res = df.sort_index(level='A', sort_remaining=False) assert_frame_equal(df, res) - res = df.sortlevel(['A', 'B'], sort_remaining=False) + res = df.sort_index(level=['A', 'B'], sort_remaining=False) assert_frame_equal(df, res) def test_sort_datetimes(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e87b5d04271e8..dbeb36c894971 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3427,7 +3427,7 @@ def 
test_int64_overflow(self): left = lg.sum()['values'] right = rg.sum()['values'] - exp_index, _ = left.index.sortlevel(0) + exp_index, _ = left.index.sortlevel() self.assert_index_equal(left.index, exp_index) exp_index, _ = right.index.sortlevel(0) @@ -3708,7 +3708,7 @@ def test_more_flexible_frame_multi_function(self): exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) - expected = expected.swaplevel(0, 1, axis=1).sortlevel(0, axis=1) + expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) result = grouped.aggregate(d) @@ -4711,7 +4711,7 @@ def test_timegrouper_with_reg_groups(self): expected = df.groupby('user_id')[ 'whole_cost'].resample( freq).sum().dropna().reorder_levels( - ['date', 'user_id']).sortlevel().astype('int64') + ['date', 'user_id']).sort_index().astype('int64') expected.name = 'whole_cost' result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 7da3cb377e63d..16831219e0930 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -440,7 +440,7 @@ def test_set_value_keeps_names(self): np.random.randn(6, 4), columns=['one', 'two', 'three', 'four'], index=idx) - df = df.sortlevel() + df = df.sort_index() self.assertIsNone(df.is_copy) self.assertEqual(df.index.names, ('Name', 'Number')) df = df.set_value(('grethe', '4'), 'one', 99.34) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 4e5558309bad5..6fc24e41ee914 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -2104,7 +2104,7 @@ def test_ix_general(self): tm.assert_frame_equal(df.ix[key], df.iloc[2:]) # this is ok - df.sortlevel(inplace=True) + df.sort_index(inplace=True) res = df.ix[key] # col has float dtype, result should be Float64Index index = MultiIndex.from_arrays([[4.] 
* 3, [2012] * 3], @@ -2137,7 +2137,7 @@ def test_xs_multiindex(self): [('a', 'foo'), ('a', 'bar'), ('b', 'hello'), ('b', 'world')], names=['lvl0', 'lvl1']) df = DataFrame(np.random.randn(4, 4), columns=columns) - df.sortlevel(axis=1, inplace=True) + df.sort_index(axis=1, inplace=True) result = df.xs('a', level='lvl0', axis=1) expected = df.iloc[:, 0:2].loc[:, 'a'] tm.assert_frame_equal(result, expected) @@ -2180,7 +2180,7 @@ def test_per_axis_per_level_getitem(self): df = DataFrame( np.arange(16, dtype='int64').reshape( 4, 4), index=index, columns=columns) - df = df.sortlevel(axis=0).sortlevel(axis=1) + df = df.sort_index(axis=0).sort_index(axis=1) # identity result = df.loc[(slice(None), slice(None)), :] @@ -2249,7 +2249,7 @@ def f(): # not lexsorted self.assertEqual(df.index.lexsort_depth, 2) - df = df.sortlevel(level=1, axis=0) + df = df.sort_index(level=1, axis=0) self.assertEqual(df.index.lexsort_depth, 0) with tm.assertRaisesRegexp( UnsortedIndexError, @@ -2265,11 +2265,11 @@ def test_multiindex_slicers_non_unique(self): B=['a', 'a', 'a', 'a'], C=[1, 2, 1, 3], D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sortlevel()) + .set_index(['A', 'B', 'C']).sort_index()) self.assertFalse(df.index.is_unique) expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sortlevel()) + .set_index(['A', 'B', 'C']).sort_index()) result = df.loc[(slice(None), slice(None), 1), :] tm.assert_frame_equal(result, expected) @@ -2281,11 +2281,11 @@ def test_multiindex_slicers_non_unique(self): B=['a', 'a', 'a', 'a'], C=[1, 2, 1, 2], D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sortlevel()) + .set_index(['A', 'B', 'C']).sort_index()) self.assertFalse(df.index.is_unique) expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sortlevel()) + .set_index(['A', 'B', 'C']).sort_index()) result = df.loc[(slice(None), slice(None), 1), :] self.assertFalse(result.index.is_unique) tm.assert_frame_equal(result, expected) @@ -2357,7 +2357,7 @@ def test_multiindex_slicers_edges(self): df['DATE'] = pd.to_datetime(df['DATE']) df1 = df.set_index(['A', 'B', 'DATE']) - df1 = df1.sortlevel() + df1 = df1.sort_index() # A1 - Get all values under "A0" and "A1" result = df1.loc[(slice('A1')), :] @@ -2440,7 +2440,7 @@ def f(): df.loc['A1', (slice(None), 'foo')] self.assertRaises(UnsortedIndexError, f) - df = df.sortlevel(axis=1) + df = df.sort_index(axis=1) # slicing df.loc['A1', (slice(None), 'foo')] @@ -2459,7 +2459,7 @@ def test_loc_axis_arguments(self): df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') .reshape((len(index), len(columns))), index=index, - columns=columns).sortlevel().sortlevel(axis=1) + columns=columns).sort_index().sort_index(axis=1) # axis 0 result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] @@ -2551,7 +2551,7 @@ def test_per_axis_per_level_setitem(self): df_orig = DataFrame( np.arange(16, dtype='int64').reshape( 4, 4), index=index, columns=columns) - df_orig = df_orig.sortlevel(axis=0).sortlevel(axis=1) + df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) # identity df = df_orig.copy() @@ -2764,12 +2764,12 @@ def f(): idx = pd.MultiIndex.from_product([['A', 'B', 'C'], ['foo', 'bar', 'baz']], names=['one', 'two']) - s = pd.Series(np.arange(9, dtype='int64'), index=idx).sortlevel() + s = pd.Series(np.arange(9, dtype='int64'), index=idx).sort_index() exp_idx = pd.MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], names=['one', 'two']) expected = pd.Series(np.arange(3, 
dtype='int64'), - index=exp_idx).sortlevel() + index=exp_idx).sort_index() result = s.loc[['A']] tm.assert_series_equal(result, expected) @@ -2786,7 +2786,7 @@ def f(): idx = pd.IndexSlice expected = pd.Series([0, 3, 6], index=pd.MultiIndex.from_product( - [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sortlevel() + [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() result = s.loc[idx[:, ['foo']]] tm.assert_series_equal(result, expected) @@ -2799,7 +2799,7 @@ def f(): ['alpha', 'beta'])) df = DataFrame( np.random.randn(5, 6), index=range(5), columns=multi_index) - df = df.sortlevel(0, axis=1) + df = df.sort_index(level=0, axis=1) expected = DataFrame(index=range(5), columns=multi_index.reindex([])[0]) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 3896e255f0c2f..e66e34156b6e9 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1595,21 +1595,21 @@ def test_nsmallest_nlargest(self): expected = s.sort_values().head(3) assert_series_equal(result, expected) - def test_sortlevel(self): + def test_sort_index_level(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] - res = s.sortlevel('A') + res = s.sort_index(level='A') assert_series_equal(backwards, res) - res = s.sortlevel(['A', 'B']) + res = s.sort_index(level=['A', 'B']) assert_series_equal(backwards, res) - res = s.sortlevel('A', sort_remaining=False) + res = s.sort_index(level='A', sort_remaining=False) assert_series_equal(s, res) - res = s.sortlevel(['A', 'B'], sort_remaining=False) + res = s.sort_index(level=['A', 'B'], sort_remaining=False) assert_series_equal(s, res) def test_apply_categorical(self): @@ -1738,7 +1738,8 @@ def test_unstack(self): s = Series(np.random.randn(6), index=index) exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) - expected = DataFrame({'bar': s.values}, index=exp_index).sortlevel(0) + expected = DataFrame({'bar': s.values}, + index=exp_index).sort_index(level=0) unstacked = s.unstack(0) assert_frame_equal(unstacked, expected) diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 69e70c15cae50..fb3817eb84acd 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -23,6 +23,8 @@ def test_sort(self): with tm.assert_produces_warning(FutureWarning): ts.sort() # sorts inplace self.assert_series_equal(ts, self.ts.sort_values()) + with tm.assert_produces_warning(FutureWarning): + ts.sortlevel() def test_order(self): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 382f1dd1decfb..d159e1a20d069 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3100,7 +3100,7 @@ def test_groupby(self): ['foo', 'bar']], names=['A', 'B', 'C']) expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sortlevel() + np.nan, index=exp_index)}).sort_index() expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] result = gb.sum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 4e7ace4173227..59d9e1e094d9d 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -691,7 +691,7 @@ def test_getitem_partial(self): assert_frame_equal(result, expected) def test_getitem_slice_not_sorted(self): - df = self.frame.sortlevel(1).T + df = 
self.frame.sort_index(level=1).T # buglet with int typechecking result = df.ix[:, :np.int32(3)] @@ -744,31 +744,31 @@ def test_getitem_partial_column_select(self): self.assertRaises(KeyError, df.ix.__getitem__, (('a', 'foo'), slice(None, None))) - def test_sortlevel(self): + def test_sort_index_level(self): df = self.frame.copy() df.index = np.arange(len(df)) # axis=1 # series - a_sorted = self.frame['A'].sortlevel(0) + a_sorted = self.frame['A'].sort_index(level=0) # preserve names self.assertEqual(a_sorted.index.names, self.frame.index.names) # inplace rs = self.frame.copy() - rs.sortlevel(0, inplace=True) - assert_frame_equal(rs, self.frame.sortlevel(0)) + rs.sort_index(level=0, inplace=True) + assert_frame_equal(rs, self.frame.sort_index(level=0)) - def test_sortlevel_large_cardinality(self): + def test_sort_index_level_large_cardinality(self): # #2684 (int64) index = MultiIndex.from_arrays([np.arange(4000)] * 3) df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) # it works! - result = df.sortlevel(0) + result = df.sort_index(level=0) self.assertTrue(result.index.lexsort_depth == 3) # #2684 (int32) @@ -776,7 +776,7 @@ def test_sortlevel_large_cardinality(self): df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) # it works! - result = df.sortlevel(0) + result = df.sort_index(level=0) self.assertTrue((result.dtypes.values == df.dtypes.values).all()) self.assertTrue(result.index.lexsort_depth == 3) @@ -803,25 +803,25 @@ def test_reset_index_with_drop(self): deleveled = self.series.reset_index(drop=True) tm.assertIsInstance(deleveled, Series) - def test_sortlevel_by_name(self): + def test_sort_index_level_by_name(self): self.frame.index.names = ['first', 'second'] - result = self.frame.sortlevel(level='second') - expected = self.frame.sortlevel(level=1) + result = self.frame.sort_index(level='second') + expected = self.frame.sort_index(level=1) assert_frame_equal(result, expected) - def test_sortlevel_mixed(self): - sorted_before = self.frame.sortlevel(1) + def test_sort_index_level_mixed(self): + sorted_before = self.frame.sort_index(level=1) df = self.frame.copy() df['foo'] = 'bar' - sorted_after = df.sortlevel(1) + sorted_after = df.sort_index(level=1) assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) dft = self.frame.T - sorted_before = dft.sortlevel(1, axis=1) + sorted_before = dft.sort_index(level=1, axis=1) dft['foo', 'three'] = 'bar' - sorted_after = dft.sortlevel(1, axis=1) + sorted_after = dft.sort_index(level=1, axis=1) assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), sorted_after.drop([('foo', 'three')], axis=1)) @@ -915,21 +915,21 @@ def test_stack(self): restacked = unstacked.stack() assert_frame_equal(restacked, self.ymd) - unlexsorted = self.ymd.sortlevel(2) + unlexsorted = self.ymd.sort_index(level=2) unstacked = unlexsorted.unstack(2) restacked = unstacked.stack() - assert_frame_equal(restacked.sortlevel(0), self.ymd) + assert_frame_equal(restacked.sort_index(level=0), self.ymd) unlexsorted = unlexsorted[::-1] unstacked = unlexsorted.unstack(1) restacked = unstacked.stack().swaplevel(1, 2) - assert_frame_equal(restacked.sortlevel(0), self.ymd) + assert_frame_equal(restacked.sort_index(level=0), self.ymd) unlexsorted = unlexsorted.swaplevel(0, 1) unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) restacked = unstacked.stack(0).swaplevel(1, 2) - assert_frame_equal(restacked.sortlevel(0), self.ymd) + assert_frame_equal(restacked.sort_index(level=0), self.ymd) # columns unsorted unstacked = 
self.ymd.unstack() @@ -1025,7 +1025,7 @@ def test_unstack_odd_failure(self): def test_stack_mixed_dtype(self): df = self.frame.T df['foo', 'four'] = 'foo' - df = df.sortlevel(1, axis=1) + df = df.sort_index(level=1, axis=1) stacked = df.stack() result = df['foo'].stack() @@ -1084,7 +1084,7 @@ def test_stack_unstack_multiple(self): restacked = unstacked.stack(['year', 'month']) restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) - restacked = restacked.sortlevel(0) + restacked = restacked.sort_index(level=0) assert_frame_equal(restacked, self.ymd) self.assertEqual(restacked.index.names, self.ymd.index.names) @@ -1421,7 +1421,7 @@ def test_frame_getitem_view(self): # but not if it's mixed-type df['foo', 'four'] = 'foo' - df = df.sortlevel(0, axis=1) + df = df.sort_index(level=0, axis=1) # this will work, but will raise/warn as its chained assignment def f(): @@ -1698,7 +1698,7 @@ def test_getitem_lowerdim_corner(self): # in theory should be inserting in a sorted space???? self.frame.ix[('bar', 'three'), 'B'] = 0 - self.assertEqual(self.frame.sortlevel().ix[('bar', 'three'), 'B'], 0) + self.assertEqual(self.frame.sort_index().ix[('bar', 'three'), 'B'], 0) # --------------------------------------------------------------------- # AMBIGUOUS CASES! @@ -2147,7 +2147,7 @@ def test_duplicate_mi(self): ['bah', 'bam', 6.0, 6]], columns=list('ABCD')) df = df.set_index(['A', 'B']) - df = df.sortlevel(0) + df = df.sort_index(level=0) expected = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2], ['foo', 'bar', 5.0, 5]], columns=list('ABCD')).set_index(['A', 'B']) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 9cb2dd5a40ac4..78edd27877783 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2370,10 +2370,10 @@ def test_sort(self): def is_sorted(arr): return (arr[1:] > arr[:-1]).any() - sorted_minor = self.panel.sortlevel(level=1) + sorted_minor = self.panel.sort_index(level=1) self.assertTrue(is_sorted(sorted_minor.index.labels[1])) - sorted_major = sorted_minor.sortlevel(level=0) + sorted_major = sorted_minor.sort_index(level=0) self.assertTrue(is_sorted(sorted_major.index.labels[0])) def test_to_string(self): diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 0f56b0b076897..01eefe5f07173 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -158,10 +158,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass # it's a single level or a series if isinstance(table, DataFrame): - if isinstance(table.columns, MultiIndex): - table = table.sortlevel(axis=1) - else: - table = table.sort_index(axis=1) + table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index f33d5f16cd439..ecf0592c66eff 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -369,8 +369,8 @@ def test_join_multiindex(self): df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) - df1 = df1.sortlevel(0) - df2 = df2.sortlevel(0) + df1 = df1.sort_index(level=0) + df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = index1._tuple_index.union(index2._tuple_index) @@ -379,10 +379,10 @@ def test_join_multiindex(self): assert_frame_equal(joined, expected) self.assertEqual(joined.index.names, index1.names) - df1 = df1.sortlevel(1) - df2 = df2.sortlevel(1) + df1 = df1.sort_index(level=1) + df2 = df2.sort_index(level=1) - 
joined = df1.join(df2, how='outer').sortlevel(0) + joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = index1._tuple_index.union(index2._tuple_index) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index e63cfcc8c0590..b88c6167f6670 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -307,7 +307,7 @@ def _check_output(result, values_col, index=['A', 'B'], check_names=False) self.assertEqual(col_margins.name, margins_col) - result = result.sortlevel() + result = result.sort_index() index_margins = result.ix[(margins_col, '')].iloc[:-1] expected_ix_margins = self.data.groupby(columns)[values_col].mean() tm.assert_series_equal(index_margins, expected_ix_margins, From 32fafdb1bf823d23a8b72b93cd197c6c18289f11 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 11 Jan 2017 08:18:29 -0500 Subject: [PATCH 014/338] PERF: Cythonize Groupby.cummin/cummax (#15048) closes #15048 Author: Matt Roeschke Closes #15053 from mroeschke/improve_cummin_cummax and squashes the following commits: 5e8ba63 [Matt Roeschke] PERF: Cythonize Groupby.cummin/cummax (#15048) --- asv_bench/benchmarks/groupby.py | 3 - doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/groupby.py | 25 +++++++- pandas/src/algos_groupby_helper.pxi.in | 70 +++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 87 +++++++++++++++++++++++--- 5 files changed, 173 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index ad58cd0fc6d70..597b040b8075c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -690,6 +690,3 @@ def time_shift(self): def time_transform_dataframe(self): # GH 12737 self.df_nans.groupby('key').transform('first') - - - diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 70fddea3fe1a9..c82dc370e3e71 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -283,7 +283,7 @@ Performance Improvements - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). - +- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. 
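Before the implementation diff, a minimal usage sketch of the accelerated API (the frame below is invented for illustration and is not part of the patch):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                       'val': [3, 1, 2, 0]})
    df.groupby('key').cummin()   # running min within each group -> val: [3, 1, 2, 0]
    df.groupby('key').cummax()   # running max within each group -> val: [3, 3, 2, 2]

Both transforms return output aligned row-for-row with the input, matching the slower ``df.groupby('key').val.apply(lambda x: x.cummin())`` route that the new Cython kernels replace.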
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7eba32b4932d0..700e279cb0030 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -75,7 +75,7 @@ 'last', 'first', 'head', 'tail', 'median', 'mean', 'sum', 'min', 'max', - 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', + 'cumcount', 'resample', 'describe', 'rank', 'quantile', @@ -97,7 +97,8 @@ _dataframe_apply_whitelist = \ _common_apply_whitelist | frozenset(['dtypes', 'corrwith']) -_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift']) +_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', + 'cummin', 'cummax']) def _groupby_function(name, alias, npfunc, numeric_only=True, @@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs): return self._cython_transform('cumsum') + @Substitution(name='groupby') + @Appender(_doc_template) + def cummin(self, axis=0): + """Cumulative min for each group""" + if axis != 0: + return self.apply(lambda x: np.minimum.accumulate(x, axis)) + + return self._cython_transform('cummin') + + @Substitution(name='groupby') + @Appender(_doc_template) + def cummax(self, axis=0): + """Cumulative max for each group""" + if axis != 0: + return self.apply(lambda x: np.maximum.accumulate(x, axis)) + + return self._cython_transform('cummax') + @Substitution(name='groupby') @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): @@ -1752,6 +1771,8 @@ def get_group_levels(self): 'transform': { 'cumprod': 'group_cumprod', 'cumsum': 'group_cumsum', + 'cummin': 'group_cummin', + 'cummax': 'group_cummax', } } diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 5c704436ce3a0..70862e198edba 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -568,6 +568,76 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = minx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels, + ndarray[{{dest_type2}}, ndim=2] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + {{dest_type2}} val, min_val + int64_t lab + + N, K = (<object> values).shape + accum.fill({{inf_val}}) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + if val < accum[lab, j]: + min_val = val + accum[lab, j] = min_val + out[i, j] = accum[lab, j] + # val = nan + else: + out[i, j] = {{nan_val}} + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels, + ndarray[{{dest_type2}}, ndim=2] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + {{dest_type2}} val, max_val + int64_t lab + + N, K = (<object> values).shape + accum.fill(-{{inf_val}}) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + if val > accum[lab, j]: + max_val = val + accum[lab, j] = max_val + out[i, j] = accum[lab, j] + # val = nan + else: + out[i, j] = {{nan_val}} + {{endfor}} #---------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dbeb36c894971..b00dc62206f57 100644 --- a/pandas/tests/groupby/test_groupby.py +++ 
b/pandas/tests/groupby/test_groupby.py @@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self): 'max', 'head', 'tail', - 'cumsum', - 'cumprod', - 'cummin', - 'cummax', 'cumcount', 'resample', 'describe', @@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self): 'max', 'head', 'tail', - 'cumsum', - 'cumprod', - 'cummin', - 'cummax', 'cumcount', 'resample', 'describe', @@ -5777,6 +5769,85 @@ def test_agg_over_numpy_arrays(self): assert_frame_equal(result, expected) + def test_cummin_cummax(self): + # GH 15048 + num_types = [np.int32, np.int64, np.float32, np.float64] + num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, + np.finfo(np.float32).min, np.finfo(np.float64).min] + num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, + np.finfo(np.float32).max, np.finfo(np.float64).max] + base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], + 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + for dtype, min_val, max_val in zip(num_types, num_mins, num_max): + df = base_df.astype(dtype) + + # cummin + expected = pd.DataFrame({'B': expected_mins}).astype(dtype) + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummin w/ min value for dtype + df.loc[[2, 6], 'B'] = min_val + expected.loc[[2, 3, 6, 7], 'B'] = min_val + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # cummax + expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummax w/ max value for dtype + df.loc[[2, 6], 'B'] = max_val + expected.loc[[2, 3, 6, 7], 'B'] = max_val + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test nan in some values + base_df.loc[[0, 2, 4, 6], 'B'] = np.nan + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, + np.nan, 3, np.nan, 1]}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummin()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, + np.nan, 3, np.nan, 3]}) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummax()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + # Test nan in entire column + base_df['B'] = np.nan + expected = pd.DataFrame({'B': [np.nan] * 8}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(expected, result) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From e61547f0a7bef348b535cc0c2dfbd0bb2828506d Mon Sep 17 00:00:00 2001 From: Sarah Bird Date: Thu, 12 Jan 2017 
16:10:58 -0600 Subject: [PATCH 015/338] Fix typo in docstring (#15121) it -> int --- pandas/util/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 05517bf6cf53a..d96f57f2810e3 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1220,7 +1220,7 @@ def assert_frame_equal(left, right, check_dtype=True, are identical. check_frame_type : bool, default False Whether to check the DataFrame class is identical. - check_less_precise : bool or it, default False + check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare From c60511043f06bc38125f13205d3bf1574882d19d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 12 Jan 2017 14:33:48 -0800 Subject: [PATCH 016/338] TST: Add another test for uneven CSV rows (#15114) Closes gh-9549. --- pandas/io/tests/parser/usecols.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 96790e872abc3..4875282067fb3 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -465,3 +465,13 @@ def test_uneven_length_cols(self): [10, 20, 30]]) df = self.read_csv(StringIO(data), header=None, usecols=usecols) tm.assert_frame_equal(df, expected) + + # see gh-9549 + usecols = ['A', 'B', 'C'] + data = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n' + '1,2,3,,,1,\n1,2,3\n5,6,7') + expected = DataFrame({'A': [1, 3, 1, 1, 1, 5], + 'B': [2, 4, 2, 2, 2, 6], + 'C': [3, 5, 4, 3, 3, 7]}) + df = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(df, expected) From 174cb247fea17a2558511f1e2f9c48c383c57779 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 12 Jan 2017 17:42:08 -0500 Subject: [PATCH 017/338] BUG: indexing changes to .loc for compat to .ix for several situations handle iterator handle NamedTuple .loc retuns scalar selection dtypes correctly, closes #11617 xref #15113 Author: Jeff Reback Closes #15120 from jreback/indexing and squashes the following commits: 801c8d9 [Jeff Reback] BUG: indexing changes to .loc for compat to .ix for several situations --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/core/indexing.py | 80 ++++++++++++++++++++++++++--- pandas/tests/frame/test_indexing.py | 27 ++++++++++ 3 files changed, 102 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c82dc370e3e71..b1f85c68153cf 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -244,7 +244,7 @@ Other API Changes - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) - +- ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) - 
``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) @@ -318,7 +318,7 @@ Bug Fixes - Bug in ``Series`` construction with a datetimetz (:issue:`14928`) - Bug in compat for passing long integers to ``Timestamp.replace`` (:issue:`15030`) - +- Bug in ``.loc`` that would not return the correct dtype for scalar access for a DataFrame (:issue:`11617`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9fa5b67083b2d..6970d1891ee63 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -9,6 +9,7 @@ is_categorical_dtype, is_list_like, is_sequence, + is_iterator, is_scalar, is_sparse, _is_unorderable_exception, @@ -1300,17 +1301,24 @@ class _LocationIndexer(_NDFrameIndexer): _exception = Exception def __getitem__(self, key): - if isinstance(key, tuple): - key = tuple(com._apply_if_callable(x, self.obj) for x in key) - else: - # scalar callable may return tuple - key = com._apply_if_callable(key, self.obj) - if type(key) is tuple: + key = tuple(com._apply_if_callable(x, self.obj) for x in key) + try: + if self._is_scalar_access(key): + return self._getitem_scalar(key) + except (KeyError, IndexError): + pass return self._getitem_tuple(key) else: + key = com._apply_if_callable(key, self.obj) return self._getitem_axis(key, axis=0) + def _is_scalar_access(self, key): + raise NotImplementedError() + + def _getitem_scalar(self, key): + raise NotImplementedError() + def _getitem_axis(self, key, axis=0): raise NotImplementedError() @@ -1389,7 +1397,8 @@ def _has_valid_type(self, key, axis): return True # TODO: don't check the entire key unless necessary - if len(key) and np.all(ax.get_indexer_for(key) < 0): + if (not is_iterator(key) and len(key) and + np.all(ax.get_indexer_for(key) < 0)): raise KeyError("None of [%s] are in the [%s]" % (key, self.obj._get_axis_name(axis))) @@ -1420,6 +1429,36 @@ def error(): return True + def _is_scalar_access(self, key): + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if not hasattr(key, '__len__'): + return False + + if len(key) != self.ndim: + return False + + for i, k in enumerate(key): + if not is_scalar(k): + return False + + ax = self.obj.axes[i] + if isinstance(ax, MultiIndex): + return False + + if not ax.is_unique: + return False + + return True + + def _getitem_scalar(self, key): + # a fast-path to scalar access + # if not, raise + values = self.obj.get_value(*key) + return values + def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" @@ -1536,6 +1575,33 @@ def _has_valid_type(self, key, axis): def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) + def _is_scalar_access(self, key): + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if not hasattr(key, '__len__'): + return False + + if len(key) != self.ndim: + return False + + for i, k in enumerate(key): + if not is_integer(k): + return False + + ax = self.obj.axes[i] + if not ax.is_unique: + return False + + return True + + def _getitem_scalar(self, key): + # a fast-path to scalar access + # if not, raise + values = self.obj.get_value(*key, takeable=True) + 
return values + def _is_valid_integer(self, key, axis): # return a boolean if we have a valid integer indexer diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 720dcdd62dd89..abe40f7be1d90 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -93,6 +93,11 @@ def test_getitem_iterator(self): expected = self.frame.ix[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) + idx = iter(['A', 'B', 'C']) + result = self.frame.loc[:, idx] + expected = self.frame.loc[:, ['A', 'B', 'C']] + assert_frame_equal(result, expected) + def test_getitem_list(self): self.frame.columns.name = 'foo' @@ -1667,6 +1672,24 @@ def test_single_element_ix_dont_upcast(self): result = self.frame.ix[self.frame.index[5], 'E'] self.assertTrue(is_integer(result)) + result = self.frame.loc[self.frame.index[5], 'E'] + self.assertTrue(is_integer(result)) + + # GH 11617 + df = pd.DataFrame(dict(a=[1.23])) + df["b"] = 666 + + result = df.ix[0, "b"] + self.assertTrue(is_integer(result)) + result = df.loc[0, "b"] + self.assertTrue(is_integer(result)) + + expected = Series([666], [0], name='b') + result = df.ix[[0], "b"] + assert_series_equal(result, expected) + result = df.loc[[0], "b"] + assert_series_equal(result, expected) + def test_irow(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2)) @@ -2159,9 +2182,13 @@ def test_index_namedtuple(self): index = Index([idx1, idx2], name="composite_index", tupleize_cols=False) df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) + result = df.ix[IndexType("foo", "bar")]["A"] self.assertEqual(result, 1) + result = df.loc[IndexType("foo", "bar")]["A"] + self.assertEqual(result, 1) + def test_boolean_indexing(self): idx = lrange(3) cols = ['A', 'B', 'C'] From 519d4882d4037ee3cee81e046cf83b4f5b953553 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 12 Jan 2017 18:33:38 -0500 Subject: [PATCH 018/338] BUG: use more generic type inference for fast plotting (#15094) xref #15073 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/types/test_dtypes.py | 11 +++++--- pandas/tests/types/test_inference.py | 36 +++++++++++++++++++++++++ pandas/tseries/tests/test_converter.py | 23 ++++++++++++++-- pandas/tseries/tests/test_timeseries.py | 2 +- pandas/tseries/tools.py | 16 +++++------ pandas/types/common.py | 7 +++-- 7 files changed, 80 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index b1f85c68153cf..c534a2e0e540c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -248,6 +248,7 @@ Other API Changes - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) +- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` .. 
_whatsnew_0200.deprecations: diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index a2b0a9ebfa6cc..f190c85404ff9 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -11,7 +11,8 @@ is_datetime64tz_dtype, is_datetimetz, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype, is_string_dtype, + is_datetime64_dtype, + is_datetime64_any_dtype, is_string_dtype, _coerce_to_dtype) import pandas.util.testing as tm @@ -132,8 +133,12 @@ def test_coerce_to_dtype(self): DatetimeTZDtype('ns', 'Asia/Tokyo')) def test_compat(self): - self.assertFalse(is_datetime64_ns_dtype(self.dtype)) - self.assertFalse(is_datetime64_ns_dtype('datetime64[ns, US/Eastern]')) + self.assertTrue(is_datetime64tz_dtype(self.dtype)) + self.assertTrue(is_datetime64tz_dtype('datetime64[ns, US/Eastern]')) + self.assertTrue(is_datetime64_any_dtype(self.dtype)) + self.assertTrue(is_datetime64_any_dtype('datetime64[ns, US/Eastern]')) + self.assertTrue(is_datetime64_ns_dtype(self.dtype)) + self.assertTrue(is_datetime64_ns_dtype('datetime64[ns, US/Eastern]')) self.assertFalse(is_datetime64_dtype(self.dtype)) self.assertFalse(is_datetime64_dtype('datetime64[ns, US/Eastern]')) diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index 8180757d9e706..5c35112d0fe19 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -22,6 +22,10 @@ from pandas.types import inference from pandas.types.common import (is_timedelta64_dtype, is_timedelta64_ns_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, is_number, is_integer, is_float, @@ -805,6 +809,38 @@ def test_is_float(self): self.assertFalse(is_float(np.timedelta64(1, 'D'))) self.assertFalse(is_float(Timedelta('1 days'))) + def test_is_datetime_dtypes(self): + + ts = pd.date_range('20130101', periods=3) + tsa = pd.date_range('20130101', periods=3, tz='US/Eastern') + + self.assertTrue(is_datetime64_dtype('datetime64')) + self.assertTrue(is_datetime64_dtype('datetime64[ns]')) + self.assertTrue(is_datetime64_dtype(ts)) + self.assertFalse(is_datetime64_dtype(tsa)) + + self.assertFalse(is_datetime64_ns_dtype('datetime64')) + self.assertTrue(is_datetime64_ns_dtype('datetime64[ns]')) + self.assertTrue(is_datetime64_ns_dtype(ts)) + self.assertTrue(is_datetime64_ns_dtype(tsa)) + + self.assertTrue(is_datetime64_any_dtype('datetime64')) + self.assertTrue(is_datetime64_any_dtype('datetime64[ns]')) + self.assertTrue(is_datetime64_any_dtype(ts)) + self.assertTrue(is_datetime64_any_dtype(tsa)) + + self.assertFalse(is_datetime64tz_dtype('datetime64')) + self.assertFalse(is_datetime64tz_dtype('datetime64[ns]')) + self.assertFalse(is_datetime64tz_dtype(ts)) + self.assertTrue(is_datetime64tz_dtype(tsa)) + + for tz in ['US/Eastern', 'UTC']: + dtype = 'datetime64[ns, {}]'.format(tz) + self.assertFalse(is_datetime64_dtype(dtype)) + self.assertTrue(is_datetime64tz_dtype(dtype)) + self.assertTrue(is_datetime64_ns_dtype(dtype)) + self.assertTrue(is_datetime64_any_dtype(dtype)) + def test_is_timedelta(self): self.assertTrue(is_timedelta64_dtype('timedelta64')) self.assertTrue(is_timedelta64_dtype('timedelta64[ns]')) diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index 37d9c35639c32..f6cf11c871bba 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -3,10 +3,10 @@ import nose import numpy as np 
-from pandas import Timestamp, Period +from pandas import Timestamp, Period, Index from pandas.compat import u import pandas.util.testing as tm -from pandas.tseries.offsets import Second, Milli, Micro +from pandas.tseries.offsets import Second, Milli, Micro, Day from pandas.compat.numpy import np_datetime64_compat try: @@ -62,6 +62,25 @@ def test_conversion(self): np_datetime64_compat('2012-01-02 00:00:00+0000')]), None, None) self.assertEqual(rs[0], xp) + # we have a tz-aware date (constructed to that when we turn to utc it + # is the same as our sample) + ts = (Timestamp('2012-01-01') + .tz_localize('UTC') + .tz_convert('US/Eastern') + ) + rs = self.dtc.convert(ts, None, None) + self.assertEqual(rs, xp) + + rs = self.dtc.convert(ts.to_pydatetime(), None, None) + self.assertEqual(rs, xp) + + rs = self.dtc.convert(Index([ts - Day(1), ts]), None, None) + self.assertEqual(rs[1], xp) + + rs = self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), + None, None) + self.assertEqual(rs[1], xp) + def test_conversion_float(self): decimals = 9 diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 1834c56e59bb9..b7e77c2eb770a 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2404,7 +2404,7 @@ def test_to_datetime_tz_psycopg2(self): i = pd.DatetimeIndex([ '2000-01-01 08:00:00+00:00' ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) - self.assertFalse(is_datetime64_ns_dtype(i)) + self.assertTrue(is_datetime64_ns_dtype(i)) # tz coerceion result = pd.to_datetime(i, errors='coerce') diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 21e1c9744aa88..f746409aadfc9 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -309,7 +309,14 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): arg = np.array(arg, dtype='O') # these are shortcutable - if is_datetime64_ns_dtype(arg): + if is_datetime64tz_dtype(arg): + if not isinstance(arg, DatetimeIndex): + return DatetimeIndex(arg, tz=tz, name=name) + if utc: + arg = arg.tz_convert(None).tz_localize('UTC') + return arg + + elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) @@ -318,13 +325,6 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): return arg - elif is_datetime64tz_dtype(arg): - if not isinstance(arg, DatetimeIndex): - return DatetimeIndex(arg, tz=tz, name=name) - if utc: - arg = arg.tz_convert(None).tz_localize('UTC') - return arg - elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") diff --git a/pandas/types/common.py b/pandas/types/common.py index 96eb6d6968bfb..e58e0826ea49a 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -187,8 +187,11 @@ def is_datetime64_ns_dtype(arr_or_dtype): try: tipo = _get_dtype(arr_or_dtype) except TypeError: - return False - return tipo == _NS_DTYPE + if is_datetime64tz_dtype(arr_or_dtype): + tipo = _get_dtype(arr_or_dtype.dtype) + else: + return False + return tipo == _NS_DTYPE or getattr(tipo, 'base', None) == _NS_DTYPE def is_timedelta64_ns_dtype(arr_or_dtype): From be4e1cfff03b1c904c44b56f9f89f75a53e13576 Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Fri, 13 Jan 2017 12:34:49 -0500 Subject: [PATCH 019/338] BUG: Fix for GH #14848 for groupby().describe() with tuples as the Index closes #14848 Author: Dr-Irv Closes #15110 from Dr-Irv/Issue14848 and squashes the following commits: c18c6cb [Dr-Irv] Undo change to 
merge.py and make whatsnew a 2 line comment. db13c3b [Dr-Irv] Use not is_list_like fbd20f5 [Dr-Irv] Raise error when creating index of tuples with name parameter a string f3a7a21 [Dr-Irv] Changes per jreback requests 9489cb2 [Dr-Irv] BUG: Fix issue #14848 groupby().describe() on indices containing all tuples --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/indexes/multi.py | 4 ++++ pandas/tests/groupby/test_groupby.py | 13 +++++++++++++ pandas/tests/indexes/test_multi.py | 9 +++++++++ 4 files changed, 28 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c534a2e0e540c..b157112b6ff37 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -352,6 +352,8 @@ Bug Fixes - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) +- Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) +- Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 132543e0e386c..2afafaeb544d1 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -490,6 +490,10 @@ def _set_names(self, names, level=None, validate=True): that it only acts on copies """ + # GH 15110 + # Don't allow a single string for names in a MultiIndex + if names is not None and not is_list_like(names): + raise ValueError('Names should be list-like for a MultiIndex') names = list(names) if validate and level is not None and len(names) != len(level): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b00dc62206f57..8e61fa3a5fb66 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1490,6 +1490,19 @@ def test_frame_describe_multikey(self): for name, group in groupedT: assert_frame_equal(result[name], group.describe()) + def test_frame_describe_tupleindex(self): + + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, + 'y': [10, 20, 30, 40, 50] * 3, + 'z': [100, 200, 300, 400, 500] * 3}) + df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={'k': 'key'}) + result = df1.groupby('k').describe() + expected = df2.groupby('key').describe() + expected.index.set_names(result.index.names, inplace=True) + assert_frame_equal(result, expected) + def test_frame_groupby(self): grouped = self.tsframe.groupby(lambda x: x.weekday()) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 16831219e0930..2861a1f56b24b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2554,3 +2554,12 @@ def test_unsortedindex(self): with assertRaises(KeyError): df.loc(axis=0)['q', :] + + def test_tuples_with_name_string(self): + # GH 15110 and GH 14848 + + li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] + with assertRaises(ValueError): + pd.Index(li, name='abc') + with assertRaises(ValueError): + pd.Index(li, name='a') From 168850c1f49b917cfccae4aafc48fcd539091321 Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Fri, 13 Jan 2017 15:07:18 -0500 Subject: [PATCH 020/338] BUG: Fix to_json lines with escaped characters Updates existing to_json methodology by adding is_escaping variable, which ensures escaped chars are 
handled correctly. xref #14693 closes #15096 Author: Rouz Azari Closes #15117 from rouzazari/to_json_lines_with_escaping and squashes the following commits: d114455 [Rouz Azari] BUG: Fix to_json lines with escaped characters --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/tests/json/test_pandas.py | 9 +++++++++ pandas/lib.pyx | 7 +++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index b157112b6ff37..9ea7b740bae8f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -351,6 +351,7 @@ Bug Fixes - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) +- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index d7f903153fdae..aaa9752dc6d46 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -972,6 +972,15 @@ def test_to_jsonl(self): self.assertEqual(result, expected) assert_frame_equal(pd.read_json(result, lines=True), df) + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + self.assertEqual(result, expected) + assert_frame_equal(pd.read_json(result, lines=True), df) + def test_latin_encoding(self): if compat.PY2: self.assertRaisesRegexp( diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 761969491cfc7..fce6a3d03287e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1098,7 +1098,8 @@ def convert_json_to_lines(object arr): to quotes & brackets """ cdef: - Py_ssize_t i = 0, num_open_brackets_seen = 0, in_quotes = 0, length + Py_ssize_t i = 0, num_open_brackets_seen = 0, length + bint in_quotes = 0, is_escaping = 0 ndarray[uint8_t] narr unsigned char v, comma, left_bracket, right_brack, newline @@ -1113,8 +1114,10 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and i > 0 and narr[i - 1] != backslash: + if v == quote and i > 0 and not is_escaping: in_quotes = ~in_quotes + if v == backslash or is_escaping: + is_escaping = ~is_escaping if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: narr[i] = newline From c79089d7ef70df5fc6d8ddafc7dc6968a599274e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 13 Jan 2017 15:21:21 -0500 Subject: [PATCH 021/338] TST: tests.groupby needs to be a package to run tests (#15127) skip parts of cummin test, xref #15109 --- pandas/tests/groupby/__init__.py | 0 pandas/tests/groupby/test_groupby.py | 15 +++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/groupby/__init__.py diff --git a/pandas/tests/groupby/__init__.py b/pandas/tests/groupby/__init__.py new file mode 100644 index 
0000000000000..e69de29bb2d1d diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8e61fa3a5fb66..873c63ca257c4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5808,9 +5808,12 @@ def test_cummin_cummax(self): df.loc[[2, 6], 'B'] = min_val expected.loc[[2, 3, 6, 7], 'B'] = min_val result = df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) + + # TODO: GH 15019 + # overwriting NaNs + # tm.assert_frame_equal(result, expected) expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) + # tm.assert_frame_equal(result, expected) # cummax expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) @@ -5823,9 +5826,13 @@ def test_cummin_cummax(self): df.loc[[2, 6], 'B'] = max_val expected.loc[[2, 3, 6, 7], 'B'] = max_val result = df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) + + # TODO: GH 15019 + # overwriting NaNs + # tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) + # tm.assert_frame_equal(result, expected) # Test nan in some values base_df.loc[[0, 2, 4, 6], 'B'] = np.nan From 8f0c250482311e0c1cff2c8bf532c76edd1629e7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 13 Jan 2017 21:10:37 -0500 Subject: [PATCH 022/338] CI: fix conda version on appveyor --- appveyor.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 33b8be57eba62..2499e7069843d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -65,7 +65,8 @@ install: # install our build environment - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false - - cmd: conda update -q conda + #- cmd: conda update -q conda + - cmd: conda install conda=4.2.15 - cmd: conda config --set ssl_verify false # add the pandas channel *before* defaults to have defaults take priority From f234c21472353db34ecb54e08b3146c21b913baf Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 14 Jan 2017 12:00:36 -0500 Subject: [PATCH 023/338] BUG: catch overflow in timestamp + offset operations xref https://github.com/statsmodels/statsmodels/issues/3374 Author: Jeff Reback Closes #15126 from jreback/inc and squashes the following commits: 923eaa2 [Jeff Reback] BUG: catch overflow in timestamp + offset operations --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tseries/offsets.py | 21 ++++++++++++++++++--- pandas/tseries/tests/test_timedeltas.py | 5 +++++ pandas/tseries/tests/test_timeseries.py | 24 ++++++++++++++++++------ pandas/tslib.pyx | 2 +- 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9ea7b740bae8f..dee74241a7a28 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -356,6 +356,7 @@ Bug Fixes - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) +- Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index abb7018b2e25b..370dd00762896 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -2710,6 +2710,9 @@ def __add__(self, other): return self.apply(other) 
except ApplyTypeError: return NotImplemented + except OverflowError: + raise OverflowError("the add operation between {} and {} " + "will overflow".format(self, other)) def __eq__(self, other): if isinstance(other, compat.string_types): @@ -2748,14 +2751,26 @@ def nanos(self): def apply(self, other): # Timestamp can handle tz and nano sec, thus no need to use apply_wraps - if isinstance(other, (datetime, np.datetime64, date)): + if isinstance(other, Timestamp): + + # GH 15126 + # in order to avoid a recursive + # call of __add__ and __radd__ if there is + # an exception, when we call using the + operator, + # we directly call the known method + result = other.__add__(self) + if result == NotImplemented: + raise OverflowError + return result + elif isinstance(other, (datetime, np.datetime64, date)): return as_timestamp(other) + self + if isinstance(other, timedelta): return other + self.delta elif isinstance(other, type(self)): return type(self)(self.n + other.n) - else: - raise ApplyTypeError('Unhandled type: %s' % type(other).__name__) + + raise ApplyTypeError('Unhandled type: %s' % type(other).__name__) _prefix = 'undefined' diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 1d07b4ab39a99..48ef9a4db7e28 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -971,6 +971,11 @@ def test_overflow(self): s2 = s[0:1000] result = (s2 - s2.min()).sum() + def test_overflow_on_construction(self): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + value = pd.Timedelta('1day').value * 20169940 + self.assertRaises(OverflowError, pd.Timedelta, value) + def test_timedelta_ops_scalar(self): # GH 6808 base = pd.to_datetime('20130101 09:01:12.123456') diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index b7e77c2eb770a..aeb1b95261291 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3312,13 +3312,25 @@ def test_datetime64_with_DateOffset(self): assert_func(klass([x + op for x in s]), s + op) assert_func(klass([x - op for x in s]), s - op) assert_func(klass([op + x for x in s]), op + s) - # def test_add_timedelta64(self): - # rng = date_range('1/1/2000', periods=5) - # delta = rng.values[3] - rng.values[1] - # result = rng + delta - # expected = rng + timedelta(2) - # self.assertTrue(result.equals(expected)) + def test_overflow_offset(self): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + # ends up multiplying really large numbers which overflow + + t = Timestamp('2017-01-13 00:00:00', freq='D') + offset = 20169940 * pd.offsets.Day(1) + + def f(): + t + offset + self.assertRaises(OverflowError, f) + + def f(): + offset + t + self.assertRaises(OverflowError, f) + + def f(): + t - offset + self.assertRaises(OverflowError, f) def test_get_duplicates(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 9a20c36638bce..eb28adf59f88e 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3125,7 +3125,7 @@ class Timedelta(_Timedelta): if not (is_integer_object(other) or is_float_object(other)): return NotImplemented - return Timedelta(other *self.value, unit='ns') + return Timedelta(other * self.value, unit='ns') __rmul__ = __mul__ From ecbafbb1f6d36f44543bb6cb5212597734efc0e5 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 14 Jan 2017 12:03:04 -0500 Subject: [PATCH 024/338] ENH: Accept callable 
for skiprows in read_csv Title is self-explanatory. xref #10882. Author: gfyoung Closes #15059 from gfyoung/skiprows-callable and squashes the following commits: d15e3a3 [gfyoung] ENH: Accept callable for skiprows --- doc/source/io.rst | 10 ++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 28 ++++++++++++++++++++-------- pandas/io/tests/parser/skiprows.py | 25 +++++++++++++++++++++++++ pandas/parser.pyx | 26 ++++++++++++++++++++++---- pandas/src/parser/tokenizer.c | 30 ++++++++++++++++++++++++++++-- pandas/src/parser/tokenizer.h | 1 + 7 files changed, 107 insertions(+), 14 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index dae97f7bc7f34..9f5e6f2331bc5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False`` skiprows : list-like or integer, default ``None`` Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise: + + .. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) + skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). skip_footer : int, default ``0`` diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index dee74241a7a28..09ddd8fe23e77 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -110,6 +110,7 @@ Other enhancements - ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) +- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f2c3113fc2cdd..fdf26fdef6b25 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -132,9 +132,13 @@ Values to consider as False skipinitialspace : boolean, default False Skip spaces after delimiter. -skiprows : list-like or integer, default None +skiprows : list-like or integer or callable, default None Line numbers to skip (0-indexed) or number of lines to skip (int) - at the start of the file + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. 
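+    For instance, ``skiprows=lambda x: x % 2 != 0`` skips every odd-numbered + row (0-indexed) while keeping the header at row 0 — a usage sketch + mirroring the io.rst example above.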
skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c') skip_footer : int, default 0 @@ -930,7 +934,10 @@ def _clean_options(self, options, engine): if engine != 'c': if is_integer(skiprows): skiprows = lrange(skiprows) - skiprows = set() if skiprows is None else set(skiprows) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result['names'] = names @@ -1851,6 +1858,11 @@ def __init__(self, f, **kwds): self.memory_map = kwds['memory_map'] self.skiprows = kwds['skiprows'] + if callable(self.skiprows): + self.skipfunc = self.skiprows + else: + self.skipfunc = lambda x: x in self.skiprows + self.skipfooter = kwds['skipfooter'] self.delimiter = kwds['delimiter'] @@ -2006,7 +2018,7 @@ class MyDialect(csv.Dialect): # attempt to sniff the delimiter if sniff_sep: line = f.readline() - while self.pos in self.skiprows: + while self.skipfunc(self.pos): self.pos += 1 line = f.readline() @@ -2414,7 +2426,7 @@ def _empty(self, line): def _next_line(self): if isinstance(self.data, list): - while self.pos in self.skiprows: + while self.skipfunc(self.pos): self.pos += 1 while True: @@ -2433,7 +2445,7 @@ def _next_line(self): except IndexError: raise StopIteration else: - while self.pos in self.skiprows: + while self.skipfunc(self.pos): self.pos += 1 next(self.data) @@ -2685,7 +2697,7 @@ def _get_lines(self, rows=None): # Check for stop rows. n.b.: self.skiprows is a set. if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) - if i + self.pos not in self.skiprows] + if not self.skipfunc(i + self.pos)] lines.extend(new_rows) self.pos = new_pos @@ -2713,7 +2725,7 @@ def _get_lines(self, rows=None): except StopIteration: if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) - if self.pos + i not in self.skiprows] + if not self.skipfunc(i + self.pos)] lines.extend(new_rows) if len(lines) == 0: raise diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index 9f01adb6fabcb..c53e6a1579267 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -12,6 +12,7 @@ import pandas.util.testing as tm from pandas import DataFrame +from pandas.io.common import EmptyDataError from pandas.compat import StringIO, range, lrange @@ -198,3 +199,27 @@ def test_skiprows_infield_quote(self): df = self.read_csv(StringIO(data), skiprows=2) tm.assert_frame_equal(df, expected) + + def test_skiprows_callable(self): + data = 'a\n1\n2\n3\n4\n5' + + skiprows = lambda x: x % 2 == 0 + expected = DataFrame({'1': [3, 5]}) + df = self.read_csv(StringIO(data), skiprows=skiprows) + tm.assert_frame_equal(df, expected) + + expected = DataFrame({'foo': [3, 5]}) + df = self.read_csv(StringIO(data), skiprows=skiprows, + header=0, names=['foo']) + tm.assert_frame_equal(df, expected) + + skiprows = lambda x: True + msg = "No columns to parse from file" + with tm.assertRaisesRegexp(EmptyDataError, msg): + self.read_csv(StringIO(data), skiprows=skiprows) + + # This is a bad callable and should raise. 
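+ # (the ZeroDivisionError raised inside the callable must surface + # to the caller rather than being swallowed by the parser)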
+ msg = "by zero" + skiprows = lambda x: 1 / 0 + with tm.assertRaisesRegexp(ZeroDivisionError, msg): + self.read_csv(StringIO(data), skiprows=skiprows) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 7b31f7fe27c1e..bd793c98eef5b 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h": int header_end # header row end void *skipset + PyObject *skipfunc int64_t skip_first_N_rows int skipfooter double (*converter)(const char *, char **, char, char, char, int) nogil @@ -606,9 +607,11 @@ cdef class TextReader: cdef _make_skiprow_set(self): if isinstance(self.skiprows, (int, np.integer)): parser_set_skipfirstnrows(self.parser, self.skiprows) - else: + elif not callable(self.skiprows): for i in self.skiprows: parser_add_skiprow(self.parser, i) + else: + self.parser.skipfunc = self.skiprows cdef _setup_parser_source(self, source): cdef: @@ -2115,18 +2118,33 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: cdef raise_parser_error(object base, parser_t *parser): cdef: object old_exc + object exc_type PyObject *type PyObject *value PyObject *traceback if PyErr_Occurred(): - PyErr_Fetch(&type, &value, &traceback); - Py_XDECREF(type) + PyErr_Fetch(&type, &value, &traceback) Py_XDECREF(traceback) + if value != NULL: old_exc = value Py_XDECREF(value) - raise old_exc + + # PyErr_Fetch only returned the error message in *value, + # so the Exception class must be extracted from *type. + if isinstance(old_exc, compat.string_types): + if type != NULL: + exc_type = type + else: + exc_type = ParserError + + Py_XDECREF(type) + raise exc_type(old_exc) + else: + Py_XDECREF(type) + raise old_exc + message = '%s. C error: ' % base if parser.error_msg != NULL: if PY3: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index bc729cd3e7453..87e17fe5fb751 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) { self->thousands = '\0'; self->skipset = NULL; + self->skipfunc = NULL; self->skip_first_N_rows = -1; self->skip_footer = 0; } @@ -679,7 +680,27 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { } int skip_this_line(parser_t *self, int64_t rownum) { - if (self->skipset != NULL) { + int should_skip; + PyObject *result; + PyGILState_STATE state; + + if (self->skipfunc != NULL) { + state = PyGILState_Ensure(); + result = PyObject_CallFunction(self->skipfunc, "i", rownum); + + // Error occurred. It will be processed + // and caught at the Cython level. 
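+ // A NULL result means the Python callable raised; the -1 return + // value makes tokenize_bytes jump to its error path (see the + // should_skip == -1 check there) so the exception can be re-raised.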
+ if (result == NULL) { + should_skip = -1; + } else { + should_skip = PyObject_IsTrue(result); + } + + Py_XDECREF(result); + PyGILState_Release(state); + + return should_skip; + } else if (self->skipset != NULL) { return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != ((kh_int64_t *)self->skipset)->n_buckets); } else { @@ -689,6 +710,7 @@ int skip_this_line(parser_t *self, int64_t rownum) { int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { int i, slen; + int should_skip; long maxstreamsize; char c; char *stream; @@ -818,7 +840,11 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { case START_RECORD: // start of record - if (skip_this_line(self, self->file_lines)) { + should_skip = skip_this_line(self, self->file_lines); + + if (should_skip == -1) { + goto parsingerror; + } else if (should_skip) { if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index d31bf4b688c58..e7271cabb0752 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -198,6 +198,7 @@ typedef struct parser_t { int header_end; // header row end void *skipset; + PyObject *skipfunc; int64_t skip_first_N_rows; int skip_footer; double (*converter)(const char *, char **, char, char, char, int); From 11251d6a6e2d42aac5d006d6570a2b02fdd57899 Mon Sep 17 00:00:00 2001 From: discort Date: Sat, 14 Jan 2017 12:07:17 -0500 Subject: [PATCH 025/338] BUG: pandas.Series.dt.round inconsistent behaviour on NAT's with different arguments closes #14940 Author: discort Closes #15124 from discort/dt_round_bug_14940 and squashes the following commits: 9e77191 [discort] added a test for Timestamp 52c897a [discort] BUG: added maybe_mask_results to '_round' method --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/tseries/base.py | 1 + pandas/tseries/tests/test_timeseries.py | 18 ++++++++++++++++++ pandas/tslib.pyx | 7 ++++++- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 09ddd8fe23e77..5c26bb2985e75 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -368,3 +368,5 @@ Bug Fixes - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) + +- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) \ No newline at end of file diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 63e56e09e91fe..a8dd2238c2063 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -84,6 +84,7 @@ def _round(self, freq, rounder): values = _ensure_datetimelike_to_i8(self) result = (unit * rounder(values / float(unit))).astype('i8') + result = self._maybe_mask_results(result, fill_value=tslib.NaT) attribs = self._get_attributes_dict() if 'freq' in attribs: attribs['freq'] = None diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index aeb1b95261291..58a4457777ea0 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4453,6 +4453,15 @@ def test_nat_operations(self): self.assertEqual(s.min(), exp) self.assertEqual(s.max(), exp) + def test_round_nat(self): + # GH14940 + s = Series([pd.NaT]) + expected = Series(pd.NaT) + for method in ["round", "floor", "ceil"]: + round_method = getattr(s.dt, method) + for freq in ["s", "5s", "min", "5min", "h", 
"5h"]: + assert_series_equal(round_method(freq), expected) + class TestTimestamp(tm.TestCase): def test_class_ops_pytz(self): @@ -4861,6 +4870,15 @@ def test_is_leap_year(self): self.assertFalse(pd.NaT.is_leap_year) self.assertIsInstance(pd.NaT.is_leap_year, bool) + def test_round_nat(self): + # GH14940 + ts = Timestamp('nat') + print(dir(ts)) + for method in ["round", "floor", "ceil"]: + round_method = getattr(ts, method) + for freq in ["s", "5s", "min", "5min", "h", "5h"]: + self.assertIs(round_method(freq), ts) + class TestSlicing(tm.TestCase): def test_slice_year(self): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index eb28adf59f88e..ad11bd7039a40 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -885,7 +885,12 @@ def _make_nan_func(func_name): f.__name__ = func_name return f + +# GH14940 +_round_methods = ['round', 'floor', 'ceil'] + _nat_methods = ['date', 'now', 'replace', 'to_pydatetime', 'today'] +_nat_methods.extend(_round_methods) _nan_methods = ['weekday', 'isoweekday', 'total_seconds'] @@ -895,7 +900,7 @@ _implemented_methods.extend(_nan_methods) for _method_name in _nat_methods: # not all methods exist in all versions of Python - if hasattr(NaTType, _method_name): + if hasattr(NaTType, _method_name) or _method_name in _round_methods: setattr(NaTType, _method_name, _make_nat_func(_method_name)) for _method_name in _nan_methods: From 94a6d140f6edb0e25f460ae15a579cc9eebc4f78 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 14 Jan 2017 12:18:44 -0500 Subject: [PATCH 026/338] CLN: reorg NaT definitions in tslib.pyx --- pandas/tslib.pyx | 187 +++++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 81 deletions(-) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index ad11bd7039a40..cde7c0dc10740 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -854,68 +854,6 @@ class NaTType(_NaT): return NotImplemented -fields = ['year', 'quarter', 'month', 'day', 'hour', - 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', - 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', - 'weekday_name'] -for field in fields: - prop = property(fget=lambda self: np.nan) - setattr(NaTType, field, prop) - - -# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or -# return NaT create functions that raise, for binding to NaTType -def _make_error_func(func_name): - def f(*args, **kwargs): - raise ValueError("NaTType does not support " + func_name) - f.__name__ = func_name - return f - - -def _make_nat_func(func_name): - def f(*args, **kwargs): - return NaT - f.__name__ = func_name - return f - - -def _make_nan_func(func_name): - def f(*args, **kwargs): - return np.nan - f.__name__ = func_name - return f - - -# GH14940 -_round_methods = ['round', 'floor', 'ceil'] - -_nat_methods = ['date', 'now', 'replace', 'to_pydatetime', 'today'] -_nat_methods.extend(_round_methods) - -_nan_methods = ['weekday', 'isoweekday', 'total_seconds'] - -_implemented_methods = ['to_datetime', 'to_datetime64', 'isoformat'] -_implemented_methods.extend(_nat_methods) -_implemented_methods.extend(_nan_methods) - -for _method_name in _nat_methods: - # not all methods exist in all versions of Python - if hasattr(NaTType, _method_name) or _method_name in _round_methods: - setattr(NaTType, _method_name, _make_nat_func(_method_name)) - -for _method_name in _nan_methods: - if hasattr(NaTType, _method_name): - setattr(NaTType, _method_name, _make_nan_func(_method_name)) - -for _maybe_method_name in dir(NaTType): - _maybe_method = getattr(NaTType, 
_maybe_method_name) - if (callable(_maybe_method) - and not _maybe_method_name.startswith("_") - and _maybe_method_name not in _implemented_methods): - setattr(NaTType, _maybe_method_name, - _make_error_func(_maybe_method_name)) - - def __nat_unpickle(*args): # return constant defined in the module return NaT @@ -1417,22 +1355,6 @@ cdef class _NaT(_Timestamp): return NotImplemented -def _delta_to_nanoseconds(delta): - if isinstance(delta, np.ndarray): - return delta.astype('m8[ns]').astype('int64') - if hasattr(delta, 'nanos'): - return delta.nanos - if hasattr(delta, 'delta'): - delta = delta.delta - if is_timedelta64_object(delta): - return delta.astype("timedelta64[ns]").item() - if is_integer_object(delta): - return delta - return (delta.days * 24 * 60 * 60 * 1000000 - + delta.seconds * 1000000 - + delta.microseconds) * 1000 - - # lightweight C object to hold datetime & int64 pair cdef class _TSObject: cdef: @@ -3803,6 +3725,112 @@ def array_strptime(ndarray[object] values, object fmt, return result +#---------------------------------------------------------------------- +# NaT methods/property setups + + +# inject the Timestamp field properties +# these by definition return np.nan +fields = ['year', 'quarter', 'month', 'day', 'hour', + 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', + 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', + 'weekday_name'] +for field in fields: + prop = property(fget=lambda self: np.nan) + setattr(NaTType, field, prop) + + +# define how we are handling NaT methods & inject +# to the NaTType class; these can return NaT, np.nan +# or raise respectively +_nat_methods = ['date', 'now', 'replace', 'to_pydatetime', + 'today', 'round', 'floor', 'ceil'] +_nan_methods = ['weekday', 'isoweekday', 'total_seconds'] +_implemented_methods = ['to_datetime', 'to_datetime64', 'isoformat'] +_implemented_methods.extend(_nat_methods) +_implemented_methods.extend(_nan_methods) + + +def _get_docstring(_method_name): + # NaT serves double duty as Timestamp & Timedelta + # missing value, so need to acquire doc-strings for both + + try: + return getattr(Timestamp, _method_name).__doc__ + except AttributeError: + pass + + try: + return getattr(Timedelta, _method_name).__doc__ + except AttributeError: + pass + + return None + + +for _method_name in _nat_methods: + + def _make_nat_func(func_name): + def f(*args, **kwargs): + return NaT + f.__name__ = func_name + f.__doc__ = _get_docstring(_method_name) + return f + + setattr(NaTType, _method_name, _make_nat_func(_method_name)) + + +for _method_name in _nan_methods: + + def _make_nan_func(func_name): + def f(*args, **kwargs): + return np.nan + f.__name__ = func_name + f.__doc__ = _get_docstring(_method_name) + return f + + setattr(NaTType, _method_name, _make_nan_func(_method_name)) + + +# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or +# return NaT create functions that raise, for binding to NaTType +for _maybe_method_name in dir(NaTType): + _maybe_method = getattr(NaTType, _maybe_method_name) + if (callable(_maybe_method) + and not _maybe_method_name.startswith("_") + and _maybe_method_name not in _implemented_methods): + + def _make_error_func(func_name): + def f(*args, **kwargs): + raise ValueError("NaTType does not support " + func_name) + f.__name__ = func_name + f.__doc__ = _get_docstring(_method_name) + return f + + setattr(NaTType, _maybe_method_name, + _make_error_func(_maybe_method_name)) + + +#---------------------------------------------------------------------- +# 
Conversion routines + + +def _delta_to_nanoseconds(delta): + if isinstance(delta, np.ndarray): + return delta.astype('m8[ns]').astype('int64') + if hasattr(delta, 'nanos'): + return delta.nanos + if hasattr(delta, 'delta'): + delta = delta.delta + if is_timedelta64_object(delta): + return delta.astype("timedelta64[ns]").item() + if is_integer_object(delta): + return delta + return (delta.days * 24 * 60 * 60 * 1000000 + + delta.seconds * 1000000 + + delta.microseconds) * 1000 + + cdef inline _get_datetime64_nanos(object val): cdef: pandas_datetimestruct dts @@ -3891,9 +3919,6 @@ def cast_to_nanoseconds(ndarray arr): return result -#---------------------------------------------------------------------- -# Conversion routines - def pydt_to_i8(object pydt): """ From b12709140a3234f1fd0c3c2b05dd674ede901d75 Mon Sep 17 00:00:00 2001 From: Sami Salonen Date: Sun, 15 Jan 2017 18:09:02 +0200 Subject: [PATCH 027/338] MAINT: remove shebang from test modules (#15134) --- pandas/computation/tests/test_compat.py | 1 - pandas/computation/tests/test_eval.py | 1 - pandas/tests/plotting/test_boxplot_method.py | 1 - pandas/tests/plotting/test_frame.py | 1 - pandas/tests/plotting/test_groupby.py | 1 - pandas/tests/plotting/test_hist_method.py | 1 - pandas/tests/plotting/test_misc.py | 1 - pandas/tests/plotting/test_series.py | 1 - pandas/tests/test_config.py | 1 - pandas/tests/test_msgpack/test_buffer.py | 1 - pandas/tests/test_msgpack/test_case.py | 1 - pandas/tests/test_msgpack/test_except.py | 1 - pandas/tests/test_msgpack/test_format.py | 1 - pandas/tests/test_msgpack/test_limits.py | 1 - pandas/tests/test_msgpack/test_pack.py | 1 - pandas/tests/test_msgpack/test_seq.py | 1 - pandas/tests/test_msgpack/test_sequnpack.py | 1 - pandas/tests/test_msgpack/test_subtype.py | 1 - pandas/tests/test_testing.py | 1 - 19 files changed, 19 deletions(-) diff --git a/pandas/computation/tests/test_compat.py b/pandas/computation/tests/test_compat.py index 80b415739c647..8e8924379f153 100644 --- a/pandas/computation/tests/test_compat.py +++ b/pandas/computation/tests/test_compat.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # flake8: noqa diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 1b577a574350d..3a446bfc36c21 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # flake8: noqa diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 0916693ade2ce..f76cdb70c0240 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import nose diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 87cf89ebf0a9d..f4c64c3d7a3f8 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import nose diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 101a6556c61bf..3c682fbfbb89e 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import nose diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index c7bff5a31fc02..bde5544390b85 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -1,4 +1,3 @@ 
-#!/usr/bin/env python # coding: utf-8 import nose diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 6c313f5937602..2650ce2879db7 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import nose diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6878ca0e1bc06..f668c46a15173 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import nose diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index ea226851c9101..ed8c37fd6dd20 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -1,4 +1,3 @@ -#!/usr/bin/python # -*- coding: utf-8 -*- import pandas as pd import unittest diff --git a/pandas/tests/test_msgpack/test_buffer.py b/pandas/tests/test_msgpack/test_buffer.py index 43f5e64012885..caaa22bfd08fc 100644 --- a/pandas/tests/test_msgpack/test_buffer.py +++ b/pandas/tests/test_msgpack/test_buffer.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 from pandas.msgpack import packb, unpackb diff --git a/pandas/tests/test_msgpack/test_case.py b/pandas/tests/test_msgpack/test_case.py index 5e8bbff390d07..a8a45b5b37eb0 100644 --- a/pandas/tests/test_msgpack/test_case.py +++ b/pandas/tests/test_msgpack/test_case.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 from pandas.msgpack import packb, unpackb diff --git a/pandas/tests/test_msgpack/test_except.py b/pandas/tests/test_msgpack/test_except.py index 79290ebb891fd..76b91bb375bbc 100644 --- a/pandas/tests/test_msgpack/test_except.py +++ b/pandas/tests/test_msgpack/test_except.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import unittest diff --git a/pandas/tests/test_msgpack/test_format.py b/pandas/tests/test_msgpack/test_format.py index 203726ae6a5f9..a4b309ebb657d 100644 --- a/pandas/tests/test_msgpack/test_format.py +++ b/pandas/tests/test_msgpack/test_format.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 from pandas.msgpack import unpackb diff --git a/pandas/tests/test_msgpack/test_limits.py b/pandas/tests/test_msgpack/test_limits.py index 2cf52aae65f2a..9c08f328b90dd 100644 --- a/pandas/tests/test_msgpack/test_limits.py +++ b/pandas/tests/test_msgpack/test_limits.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 from __future__ import (absolute_import, division, print_function, unicode_literals) diff --git a/pandas/tests/test_msgpack/test_pack.py b/pandas/tests/test_msgpack/test_pack.py index 99c7453212b8b..005352691d908 100644 --- a/pandas/tests/test_msgpack/test_pack.py +++ b/pandas/tests/test_msgpack/test_pack.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import unittest diff --git a/pandas/tests/test_msgpack/test_seq.py b/pandas/tests/test_msgpack/test_seq.py index 76a21b98f22da..927c2622419a6 100644 --- a/pandas/tests/test_msgpack/test_seq.py +++ b/pandas/tests/test_msgpack/test_seq.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import io diff --git a/pandas/tests/test_msgpack/test_sequnpack.py b/pandas/tests/test_msgpack/test_sequnpack.py index 2f496b9fbbafa..fe089ccda1c7f 100644 --- a/pandas/tests/test_msgpack/test_sequnpack.py +++ b/pandas/tests/test_msgpack/test_sequnpack.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 import unittest diff --git a/pandas/tests/test_msgpack/test_subtype.py b/pandas/tests/test_msgpack/test_subtype.py index 
c89b36717a159..d6dd72c4d9850 100644 --- a/pandas/tests/test_msgpack/test_subtype.py +++ b/pandas/tests/test_msgpack/test_subtype.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 from pandas.msgpack import packb diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 7a217ed9dbd86..e2f295a5343bc 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -1,4 +1,3 @@ -#!/usr/bin/python # -*- coding: utf-8 -*- import pandas as pd import unittest From 01090d381ead99b8ab60770a7b7430905df5e02f Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Sun, 15 Jan 2017 11:10:23 -0500 Subject: [PATCH 028/338] BUG: Fix Series constructor when copy=True Updates Series constructor to include copy argument when dtype argument is also provided. closes #15125 Author: Rouz Azari Closes #15128 from rouzazari/series_dtype_param_no_side_effects_when_copy_true and squashes the following commits: 7b0c959 [Rouz Azari] BUG: Fix Series constructor when copy=True --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/series.py | 3 ++- pandas/tests/series/test_constructors.py | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5c26bb2985e75..2a825edd0e98a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -366,7 +366,7 @@ Bug Fixes - +- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) \ No newline at end of file diff --git a/pandas/core/series.py b/pandas/core/series.py index ab1498c617467..d967e2d02d41f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -237,7 +237,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, # create/copy the manager if isinstance(data, SingleBlockManager): if dtype is not None: - data = data.astype(dtype=dtype, raise_on_error=False) + data = data.astype(dtype=dtype, raise_on_error=False, + copy=copy) elif copy: data = data.copy() else: diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a7e3ebdfc43d0..ee4940384b612 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -251,6 +251,22 @@ def test_constructor_sanitize(self): s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8') self.assertEqual(s.dtype, np.dtype('f8')) + def test_constructor_copy(self): + # GH15125 + # test dtype parameter has no side effects on copy=True + for data in [[1.], np.array([1.])]: + x = Series(data) + y = pd.Series(x, copy=True, dtype=float) + + # copy=True maintains original data in Series + tm.assert_series_equal(x, y) + + # changes to origin of copy does not affect the copy + x[0] = 2. + self.assertFalse(x.equals(y)) + self.assertEqual(x[0], 2.) + self.assertEqual(y[0], 1.) + def test_constructor_pass_none(self): s = Series(None, index=lrange(5)) self.assertEqual(s.dtype, np.float64) From 1a835ef0221d536a0c1038ed7c53844ca4ed7d9d Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Tue, 17 Jan 2017 08:42:42 -0500 Subject: [PATCH 029/338] BUG: Allow tolerance when left_index/right_index enabled for merge_asof() (#15135) closes #15135 Author: Christopher C. 
Aycock Closes #15139 from chrisaycock/GH15135 and squashes the following commits: 17c902a [Christopher C. Aycock] BUG: Allow tolerance when left_index/right_index enabled for merge_asof() (#15135) --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tools/merge.py | 6 +++++- pandas/tools/tests/test_merge_asof.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2a825edd0e98a..1ae20803ca9fd 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -360,7 +360,7 @@ Bug Fixes - Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`) - +- Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 4012629aa3c90..c756a61faeefb 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1129,7 +1129,11 @@ def _get_merge_keys(self): # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: - lt = left_join_keys[-1] + if self.left_index: + lt = self.left.index + else: + lt = left_join_keys[-1] + msg = "incompatible tolerance, must be compat " \ "with type {0}".format(type(lt)) diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index bbbf1a3bdfff9..e05cbd0850169 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -518,6 +518,19 @@ def test_tolerance_tz(self): 'value2': list("BCDEE")}) assert_frame_equal(result, expected) + def test_index_tolerance(self): + # GH 15135 + expected = self.tolerance.set_index('time') + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + + result = pd.merge_asof(trades, quotes, + left_index=True, + right_index=True, + by='ticker', + tolerance=pd.Timedelta('1day')) + assert_frame_equal(result, expected) + def test_allow_exact_matches(self): result = merge_asof(self.trades, self.quotes, From 8b2031689382088e4604b0266d6682ce4ce5dc16 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 17 Jan 2017 08:45:11 -0500 Subject: [PATCH 030/338] ENH: Create and propagate UInt64Index 1) Introduces and propagates `UInt64Index`, an index specifically for `uint`. xref #14935 2) Patches bug from #14916 that makes `maybe_convert_objects` robust against the known `numpy` bug that `uint64` cannot be compared to `int64`. This bug was caught during testing of `UInt64Index`. 
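A rough sketch of the intended behaviour (a hypothetical session, not output from the final build; exact reprs may differ):

    idx = pd.UInt64Index([2**63, 2**63 + 10])
    s = pd.Series([1, 2], index=idx)
    s[2**63]       # -> 1, with no overflow and no silent cast to object
    s.index.dtype  # -> dtype('uint64')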
**UPDATE**: Patched in #14951 Author: gfyoung Closes #14937 from gfyoung/create-uint64-index and squashes the following commits: 8ab6fbd [gfyoung] ENH: Create and propagate UInt64Index --- doc/source/whatsnew/v0.20.0.txt | 26 +- pandas/api/tests/test_api.py | 2 +- pandas/core/api.py | 3 +- pandas/core/indexing.py | 20 +- pandas/indexes/api.py | 4 +- pandas/indexes/base.py | 72 ++- pandas/indexes/numeric.py | 112 +++- pandas/src/algos_common_helper.pxi.in | 1 + pandas/src/index_class_helper.pxi.in | 1 + pandas/src/join_helper.pxi.in | 5 +- pandas/src/joins_func_helper.pxi.in | 3 +- pandas/tests/frame/test_indexing.py | 58 +- pandas/tests/indexes/common.py | 27 +- pandas/tests/indexes/test_base.py | 71 ++- pandas/tests/indexes/test_numeric.py | 704 ++++++++++++++++--------- pandas/tests/indexing/test_indexing.py | 123 +++-- pandas/tests/types/test_generic.py | 1 + pandas/types/generic.py | 4 +- pandas/util/testing.py | 4 + setup.py | 3 +- 20 files changed, 855 insertions(+), 389 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1ae20803ca9fd..4ce7e6d2cd3b8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -91,6 +91,25 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). df = pd.read_table(url, compression='bz2') # explicitly specify compression df.head(2) +.. _whatsnew_0200.enhancements.uint64_support: + +Pandas has significantly improved support for operations involving unsigned, +or purely non-negative, integers. Previously, handling these integers would +result in improper rounding or data-type casting, leading to incorrect results. +Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`) + +.. ipython:: python + + idx = pd.UInt64Index([1, 2, 3]) + df = pd.DataFrame({'A': ['a', 'b', 'c']}, index=idx) + df.index + +- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) +- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`) +- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) +- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`) +- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) + .. _whatsnew_0200.enhancements.other: Other enhancements @@ -298,8 +317,6 @@ Bug Fixes - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) -- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) -- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. 
(:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) @@ -324,8 +341,6 @@ Bug Fixes -- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`) -- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) @@ -350,7 +365,6 @@ Bug Fixes - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) -- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) @@ -369,4 +383,4 @@ Bug Fixes - Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) -- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) \ No newline at end of file +- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index b13b4d7de60ca..78dfe46914200 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase): classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', - 'Period', 'PeriodIndex', 'RangeIndex', + 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', 'Series', 'SparseArray', 'SparseDataFrame', 'SparseSeries', 'TimeGrouper', 'Timedelta', 'TimedeltaIndex', 'Timestamp'] diff --git a/pandas/core/api.py b/pandas/core/api.py index b5e1de2063c7e..177e7b31cbd4f 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -10,7 +10,8 @@ from pandas.core.groupby import Grouper from pandas.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, - RangeIndex, Float64Index, MultiIndex) + UInt64Index, RangeIndex, Float64Index, + MultiIndex) from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6970d1891ee63..0db5103a18807 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -860,15 +860,20 @@ def _convert_for_reindex(self, key, axis=0): return labels[key] else: if isinstance(key, Index): - # want Index objects to pass through untouched - keyarr = key + keyarr = labels._convert_index_indexer(key) else: # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if is_integer_dtype(keyarr) and not labels.is_integer(): - keyarr = _ensure_platform_int(keyarr) - return labels.take(keyarr) + if is_integer_dtype(keyarr): + # Cast the indexer to uint64 if possible so + # that the values returned from indexing are + # also uint64. 
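+ # (this hook is a no-op for every index type except UInt64Index, which casts integer indexers to uint64)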
+ keyarr = labels._convert_arr_indexer(keyarr) + + if not labels.is_integer(): + keyarr = _ensure_platform_int(keyarr) + return labels.take(keyarr) return keyarr @@ -1044,11 +1049,10 @@ def _getitem_iterable(self, key, axis=0): return self.obj.take(inds, axis=axis, convert=False) else: if isinstance(key, Index): - # want Index objects to pass through untouched - keyarr = key + keyarr = labels._convert_index_indexer(key) else: - # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) + keyarr = labels._convert_arr_indexer(keyarr) if is_categorical_dtype(labels): keyarr = labels._shallow_copy(keyarr) diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py index 0b81c47488ef4..64992e46613e5 100644 --- a/pandas/indexes/api.py +++ b/pandas/indexes/api.py @@ -4,7 +4,7 @@ from pandas.indexes.category import CategoricalIndex # noqa from pandas.indexes.multi import MultiIndex # noqa from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa - Int64Index) + Int64Index, UInt64Index) from pandas.indexes.range import RangeIndex # noqa import pandas.core.common as com @@ -13,7 +13,7 @@ # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', - 'CategoricalIndex', 'RangeIndex', + 'CategoricalIndex', 'RangeIndex', 'UInt64Index', 'InvalidIndexError', '_new_Index', '_ensure_index', '_get_na_value', '_get_combined_index', diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b87fb5dc84782..d0bf4edfbc5d2 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -27,6 +27,8 @@ is_object_dtype, is_categorical_dtype, is_bool_dtype, + is_signed_integer_dtype, + is_unsigned_integer_dtype, is_integer_dtype, is_float_dtype, is_datetime64_any_dtype, is_timedelta64_dtype, @@ -199,14 +201,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, data = np.array(data, copy=copy, dtype=dtype) elif inferred in ['floating', 'mixed-integer-float']: - # if we are actually all equal to integers + # If we are actually all equal to integers, # then coerce to integer - from .numeric import Int64Index, Float64Index + from .numeric import (Int64Index, UInt64Index, + Float64Index) try: - res = data.astype('i8') + res = data.astype('i8', copy=False) if (res == data).all(): return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to + # overflow), so let's try now with uint64. 
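+ # (if the uint64 cast also fails, we fall through to the existing float handling below)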
+ try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, + name=name) except (TypeError, ValueError): pass @@ -235,10 +248,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, IncompatibleFrequency) if isinstance(data, PeriodIndex): return PeriodIndex(data, copy=copy, name=name, **kwargs) - if issubclass(data.dtype.type, np.integer): + if is_signed_integer_dtype(data.dtype): from .numeric import Int64Index return Int64Index(data, copy=copy, dtype=dtype, name=name) - elif issubclass(data.dtype.type, np.floating): + elif is_unsigned_integer_dtype(data.dtype): + from .numeric import UInt64Index + return UInt64Index(data, copy=copy, dtype=dtype, name=name) + elif is_float_dtype(data.dtype): from .numeric import Float64Index return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): @@ -254,9 +270,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if dtype is None: inferred = lib.infer_dtype(subarr) if inferred == 'integer': - from .numeric import Int64Index - return Int64Index(subarr.astype('i8'), copy=copy, - name=name) + from .numeric import Int64Index, UInt64Index + try: + return Int64Index(subarr.astype('i8'), copy=copy, + name=name) + except OverflowError: + return UInt64Index(subarr.astype('u8'), copy=copy, + name=name) elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index return Float64Index(subarr, copy=copy, name=name) @@ -1253,6 +1273,40 @@ def is_int(v): return indexer + _index_shared_docs['_convert_arr_indexer'] = """ + Convert an array-like indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : array-like + Indexer to convert. + + Returns + ------- + converted_keyarr : array-like + """ + + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + return keyarr + + _index_shared_docs['_convert_index_indexer'] = """ + Convert an Index indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. 
+ + Returns + ------- + converted_keyarr : Index (or sub-class) + """ + + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + return keyarr + def _convert_list_indexer(self, keyarr, kind=None): """ passed a key that is tuplesafe that is integer based @@ -3489,7 +3543,7 @@ def _validate_for_numeric_binop(self, other, op, opstr): raise ValueError("cannot evaluate a numeric op with " "unequal lengths") other = _values_from_object(other) - if other.dtype.kind not in ['f', 'i']: + if other.dtype.kind not in ['f', 'i', 'u']: raise TypeError("cannot evaluate a numeric op " "with a non-numeric dtype") elif isinstance(other, (DateOffset, np.timedelta64, diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index c71abe202226e..0b9b337731d7f 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -8,7 +8,7 @@ is_float_dtype, is_object_dtype, is_integer_dtype, is_scalar) from pandas.types.missing import isnull -from pandas.core.common import _values_from_object +from pandas.core.common import _asarray_tuplesafe, _values_from_object from pandas import compat from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs @@ -73,6 +73,13 @@ def _assert_safe_casting(cls, data, subarr): """ pass + @property + def is_all_dates(self): + """ + Checks that all the labels are datetime objects + """ + return False + _num_index_shared_docs['class_descr'] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object @@ -128,13 +135,6 @@ def asi8(self): # do not cache or you'll create a memory leak return self.values.view('i8') - @property - def is_all_dates(self): - """ - Checks that all the labels are datetime objects - """ - return False - @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): assert kind in ['ix', 'loc', 'getitem', 'iloc', None] @@ -154,7 +154,7 @@ def _assert_safe_casting(cls, data, subarr): """ Ensure incoming data can be represented as ints. 
""" - if not issubclass(data.dtype.type, np.integer): + if not issubclass(data.dtype.type, np.signedinteger): if not np.array_equal(data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') @@ -162,6 +162,84 @@ def _assert_safe_casting(cls, data, subarr): Int64Index._add_numeric_methods() Int64Index._add_logical_methods() +_uint64_descr_args = dict( + klass='UInt64Index', + ltype='unsigned integer', + dtype='uint64', + extra='' +) + + +class UInt64Index(NumericIndex): + __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args + + _typ = 'uint64index' + _arrmap = _algos.arrmap_uint64 + _left_indexer_unique = _join.left_join_indexer_unique_uint64 + _left_indexer = _join.left_join_indexer_uint64 + _inner_indexer = _join.inner_join_indexer_uint64 + _outer_indexer = _join.outer_join_indexer_uint64 + + _can_hold_na = False + _na_value = 0 + + _engine_type = _index.UInt64Engine + + _default_dtype = np.uint64 + + @property + def inferred_type(self): + return 'integer' + + @property + def asi8(self): + # do not cache or you'll create a memory leak + return self.values.view('u8') + + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + + # don't coerce ilocs to integers + if kind != 'iloc': + key = self._maybe_cast_indexer(key) + return (super(UInt64Index, self) + ._convert_scalar_indexer(key, kind=kind)) + + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + # Cast the indexer to uint64 if possible so + # that the values returned from indexing are + # also uint64. + if is_integer_dtype(keyarr): + return _asarray_tuplesafe(keyarr, dtype=np.uint64) + return keyarr + + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + # Cast the indexer to uint64 if possible so + # that the values returned from indexing are + # also uint64. + if keyarr.is_integer(): + return keyarr.astype(np.uint64) + return keyarr + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + return UInt64Index(joined, name=name) + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as uints. 
+ """ + if not issubclass(data.dtype.type, np.unsignedinteger): + if not np.array_equal(data, subarr): + raise TypeError('Unsafe NumPy casting, you must ' + 'explicitly cast') + +UInt64Index._add_numeric_methods() +UInt64Index._add_logical_methods() _float64_descr_args = dict( klass='Float64Index', @@ -207,15 +285,6 @@ def astype(self, dtype, copy=True): @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): - """ - convert a scalar indexer - - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem'} or None - """ - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] if kind == 'iloc': @@ -310,13 +379,6 @@ def get_loc(self, key, method=None, tolerance=None): return super(Float64Index, self).get_loc(key, method=method, tolerance=tolerance) - @property - def is_all_dates(self): - """ - Checks that all the labels are datetime objects - """ - return False - @cache_readonly def is_unique(self): return super(Float64Index, self).is_unique and self._nan_idxs.size < 2 diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index c1c190704b4c7..a579a5020f6e7 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -27,6 +27,7 @@ dtypes = [('float64', 'float64_t', 'np.float64', True, True), ('object', 'object', 'object', True, False), ('int32', 'int32_t', 'np.int32', False, True), ('int64', 'int64_t', 'np.int64', False, True), + ('uint64', 'uint64_t', 'np.uint64', False, True), ('bool', 'uint8_t', 'np.bool', False, True)] def get_dispatch(dtypes): diff --git a/pandas/src/index_class_helper.pxi.in b/pandas/src/index_class_helper.pxi.in index 315dd18009ad4..76c0deef7ebee 100644 --- a/pandas/src/index_class_helper.pxi.in +++ b/pandas/src/index_class_helper.pxi.in @@ -12,6 +12,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name, dtype, ctype dtypes = [('Float64', 'float64', 'float64_t'), + ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), ('Object', 'object', 'object')] }} diff --git a/pandas/src/join_helper.pxi.in b/pandas/src/join_helper.pxi.in index 5b55ec2b1bf6d..feb8cfb76a7f0 100644 --- a/pandas/src/join_helper.pxi.in +++ b/pandas/src/join_helper.pxi.in @@ -15,7 +15,8 @@ dtypes = [('float64', 'float64_t', 'np.float64'), ('float32', 'float32_t', 'np.float32'), ('object', 'object', 'object'), ('int32', 'int32_t', 'np.int32'), - ('int64', 'int64_t', 'np.int64')] + ('int64', 'int64_t', 'np.int64'), + ('uint64', 'uint64_t', 'np.uint64')] def get_dispatch(dtypes): @@ -404,4 +405,4 @@ def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left, return result, lindexer, rindexer -{{endfor}} \ No newline at end of file +{{endfor}} diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in index 33926a23f7f41..68c376492f8f2 100644 --- a/pandas/src/joins_func_helper.pxi.in +++ b/pandas/src/joins_func_helper.pxi.in @@ -12,7 +12,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # table_type, by_dtype -by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')] +by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t'), + ('UInt64HashTable', 'uint64_t')] # on_dtype on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index abe40f7be1d90..02d288bdf6ea8 100644 --- a/pandas/tests/frame/test_indexing.py +++ 
b/pandas/tests/frame/test_indexing.py @@ -2789,7 +2789,63 @@ def test_set_reset(self): result = df.reset_index() self.assertTrue(result['foo'].dtype, 'M8[ns, US/Eastern') - result = result.set_index('foo') + df = result.set_index('foo') + tm.assert_index_equal(df.index, idx) + + def test_transpose(self): + + result = self.df.T + expected = DataFrame(self.df.values.T) + expected.index = ['A', 'B'] + assert_frame_equal(result, expected) + + +class TestDataFrameIndexingUInt64(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ir = Index(np.arange(3), dtype=np.uint64) + self.idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo') + + self.df = DataFrame({'A': self.idx, 'B': self.ir}) + + def test_setitem(self): + + df = self.df + idx = self.idx + + # setitem + df['C'] = idx + assert_series_equal(df['C'], Series(idx, name='C')) + + df['D'] = 'foo' + df['D'] = idx + assert_series_equal(df['D'], Series(idx, name='D')) + del df['D'] + + # With NaN: because uint64 has no NaN element, + # the column should be cast to object. + df2 = df.copy() + df2.iloc[1, 1] = pd.NaT + df2.iloc[1, 2] = pd.NaT + result = df2['B'] + assert_series_equal(notnull(result), Series( + [True, False, True], name='B')) + assert_series_equal(df2.dtypes, Series([np.dtype('uint64'), + np.dtype('O'), np.dtype('O')], + index=['A', 'B', 'C'])) + + def test_set_reset(self): + + idx = self.idx + + # set/reset + df = DataFrame({'A': [0, 1, 2]}, index=idx) + result = df.reset_index() + self.assertEqual(result['foo'].dtype, np.dtype('uint64')) + + df = result.set_index('foo') tm.assert_index_equal(df.index, idx) def test_transpose(self): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1b373baf9b3c1..63e9fe580d73d 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,8 +5,8 @@ import numpy as np -from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex, - MultiIndex, CategoricalIndex, DatetimeIndex, +from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, + RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, notnull) from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp @@ -470,10 +470,11 @@ def test_where(self): expected = i tm.assert_index_equal(result, expected) - i2 = i.copy() - i2 = pd.Index([np.nan, np.nan] + i[2:].tolist()) - result = i.where(notnull(i2)) - expected = i2 + _nan = i._na_value + cond = [False] + [True] * len(i[1:]) + expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) + + result = i.where(cond) tm.assert_index_equal(result, expected) def test_setops_errorcases(self): @@ -660,6 +661,12 @@ def test_equals(self): self.assertFalse(idx.equals(list(idx))) self.assertFalse(idx.equals(np.array(idx))) + # Cannot pass in non-int64 dtype to RangeIndex + if not isinstance(idx, RangeIndex): + same_values = Index(idx, dtype=object) + self.assertTrue(idx.equals(same_values)) + self.assertTrue(same_values.equals(idx)) + if idx.nlevels == 1: # do not test MultiIndex self.assertFalse(idx.equals(pd.Series(idx))) @@ -744,7 +751,7 @@ def test_numpy_ufuncs(self): with tm.assertRaises(Exception): with np.errstate(all='ignore'): func(idx) - elif isinstance(idx, (Float64Index, Int64Index)): + elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): # coerces to float (e.g. 
np.sin) with np.errstate(all='ignore'): result = func(idx) @@ -765,7 +772,7 @@ def test_numpy_ufuncs(self): # raise TypeError or ValueError (PeriodIndex) with tm.assertRaises(Exception): func(idx) - elif isinstance(idx, (Float64Index, Int64Index)): + elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): # results in bool array result = func(idx) exp = func(idx.values) @@ -798,7 +805,7 @@ def test_hasnans_isnans(self): continue elif isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin): values[1] = pd.tslib.iNaT - elif isinstance(index, Int64Index): + elif isinstance(index, (Int64Index, UInt64Index)): continue else: values[1] = np.nan @@ -838,7 +845,7 @@ def test_fillna(self): if isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin): values[1] = pd.tslib.iNaT - elif isinstance(index, Int64Index): + elif isinstance(index, (Int64Index, UInt64Index)): continue else: values[1] = np.nan diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0e6773fd83404..a0f2a090c9a06 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -4,17 +4,18 @@ import pandas.util.testing as tm from pandas.indexes.api import Index, MultiIndex -from .common import Base +from pandas.tests.indexes.common import Base from pandas.compat import (range, lrange, lzip, u, - zip, PY3, PY36) + text_type, zip, PY3, PY36) import operator import os +import nose import numpy as np from pandas import (period_range, date_range, Series, - Float64Index, Int64Index, + DataFrame, Float64Index, Int64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) from pandas.core.index import _get_combined_index @@ -40,6 +41,7 @@ def setUp(self): periodIndex=tm.makePeriodIndex(100), tdIndex=tm.makeTimedeltaIndex(100), intIndex=tm.makeIntIndex(100), + uintIndex=tm.makeUIntIndex(100), rangeIndex=tm.makeIntIndex(100), floatIndex=tm.makeFloatIndex(100), boolIndex=Index([True, False]), @@ -449,7 +451,7 @@ def test_delete(self): self.assertEqual(result.name, expected.name) with tm.assertRaises((IndexError, ValueError)): - # either depeidnig on numpy version + # either depending on numpy version result = idx.delete(5) def test_identical(self): @@ -2020,3 +2022,64 @@ def test_repeat(self): with tm.assert_produces_warning(FutureWarning): result = idx.repeat(n=repeats) tm.assert_index_equal(result, expected) + + def test_is_monotonic_na(self): + examples = [pd.Index([np.nan]), + pd.Index([np.nan, 1]), + pd.Index([1, 2, np.nan]), + pd.Index(['a', 'b', np.nan]), + pd.to_datetime(['NaT']), + pd.to_datetime(['NaT', '2000-01-01']), + pd.to_datetime(['2000-01-01', 'NaT', '2000-01-02']), + pd.to_timedelta(['1 day', 'NaT']), ] + for index in examples: + self.assertFalse(index.is_monotonic_increasing) + self.assertFalse(index.is_monotonic_decreasing) + + def test_repr_summary(self): + with cf.option_context('display.max_seq_items', 10): + r = repr(pd.Index(np.arange(1000))) + self.assertTrue(len(r) < 200) + self.assertTrue("..." 
in r) + + def test_int_name_format(self): + index = Index(['a', 'b', 'c'], name=0) + s = Series(lrange(3), index) + df = DataFrame(lrange(3), index=index) + repr(s) + repr(df) + + def test_print_unicode_columns(self): + df = pd.DataFrame({u("\u05d0"): [1, 2, 3], + "\u05d1": [4, 5, 6], + "c": [7, 8, 9]}) + repr(df.columns) # should not raise UnicodeDecodeError + + def test_unicode_string_with_unicode(self): + idx = Index(lrange(1000)) + + if PY3: + str(idx) + else: + text_type(idx) + + def test_bytestring_with_unicode(self): + idx = Index(lrange(1000)) + if PY3: + bytes(idx) + else: + str(idx) + + def test_intersect_str_dates(self): + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + i1 = Index(dt_dates, dtype=object) + i2 = Index(['aa'], dtype=object) + res = i2.intersection(i1) + + self.assertEqual(len(res), 0) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index f7f072d5b5d2a..044d3477271ad 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -1,22 +1,20 @@ # -*- coding: utf-8 -*- from datetime import datetime -from pandas import compat -from pandas.compat import range, lrange, u, PY3 +from pandas.compat import range, PY3 +import nose import numpy as np -from pandas import (date_range, Series, DataFrame, - Index, Float64Index, Int64Index, RangeIndex) -from pandas.util.testing import assertRaisesRegexp +from pandas import (date_range, Series, Index, Float64Index, + Int64Index, UInt64Index, RangeIndex) import pandas.util.testing as tm -import pandas.core.config as cf import pandas as pd from pandas.lib import Timestamp -from .common import Base +from pandas.tests.indexes.common import Base def full_like(array, value): @@ -64,10 +62,11 @@ def test_numeric_compat(self): result = idx * np.array(5, dtype='int64') tm.assert_index_equal(result, idx * 5) - result = idx * np.arange(5, dtype='int64') + arr_dtype = 'uint64' if isinstance(idx, UInt64Index) else 'int64' + result = idx * np.arange(5, dtype=arr_dtype) tm.assert_index_equal(result, didx) - result = idx * Series(np.arange(5, dtype='int64')) + result = idx * Series(np.arange(5, dtype=arr_dtype)) tm.assert_index_equal(result, didx) result = idx * Series(np.arange(5, dtype='float64') + 0.1) @@ -448,7 +447,183 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) -class TestInt64Index(Numeric, tm.TestCase): +class NumericInt(Numeric): + + def test_view(self): + super(NumericInt, self).test_view() + + i = self._holder([], name='Foo') + i_view = i.view() + self.assertEqual(i_view.name, 'Foo') + + i_view = i.view(self._dtype) + tm.assert_index_equal(i, self._holder(i_view, name='Foo')) + + i_view = i.view(self._holder) + tm.assert_index_equal(i, self._holder(i_view, name='Foo')) + + def test_is_monotonic(self): + self.assertTrue(self.index.is_monotonic) + self.assertTrue(self.index.is_monotonic_increasing) + self.assertFalse(self.index.is_monotonic_decreasing) + + index = self._holder([4, 3, 2, 1]) + self.assertFalse(index.is_monotonic) + self.assertTrue(index.is_monotonic_decreasing) + + index = self._holder([1]) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + + def test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def 
test_identical(self): + i = Index(self.index.copy()) + self.assertTrue(i.identical(self.index)) + + same_values_different_type = Index(i, dtype=object) + self.assertFalse(i.identical(same_values_different_type)) + + i = self.index.copy(dtype=object) + i = i.rename('foo') + same_values = Index(i, dtype=object) + self.assertTrue(same_values.identical(i)) + + self.assertFalse(i.identical(self.index)) + self.assertTrue(Index(same_values, name='foo', dtype=object).identical( + i)) + + self.assertFalse(self.index.copy(dtype=object) + .identical(self.index.copy(dtype=self._dtype))) + + def test_join_non_unique(self): + left = Index([4, 4, 3, 3]) + + joined, lidx, ridx = left.join(left, return_indexers=True) + + exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + self.assert_index_equal(joined, exp_joined) + + exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(lidx, exp_lidx) + + exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(ridx, exp_ridx) + + def test_join_self(self): + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = self.index.join(self.index, how=kind) + self.assertIs(self.index, joined) + + def test_union_noncomparable(self): + from datetime import datetime, timedelta + # corner case, non-Int64Index + now = datetime.now() + other = Index([now + timedelta(i) for i in range(4)], dtype=object) + result = self.index.union(other) + expected = Index(np.concatenate((self.index, other))) + tm.assert_index_equal(result, expected) + + result = other.union(self.index) + expected = Index(np.concatenate((other, self.index))) + tm.assert_index_equal(result, expected) + + def test_cant_or_shouldnt_cast(self): + # can't + data = ['foo', 'bar', 'baz'] + self.assertRaises(TypeError, self._holder, data) + + # shouldn't + data = ['0', '1', '2'] + self.assertRaises(TypeError, self._holder, data) + + def test_view_index(self): + self.index.view(Index) + + def test_prevent_casting(self): + result = self.index.astype('O') + self.assertEqual(result.dtype, np.object_) + + def test_take_preserve_name(self): + index = self._holder([1, 2, 3, 4], name='foo') + taken = index.take([3, 0, 1]) + self.assertEqual(index.name, taken.name) + + def test_take_fill_value(self): + # see gh-12631 + idx = self._holder([1, 2, 3], name='xxx') + result = idx.take(np.array([1, 0, -1])) + expected = self._holder([2, 1, 3], name='xxx') + tm.assert_index_equal(result, expected) + + name = self._holder.__name__ + msg = ("Unable to fill values because " + "{name} cannot contain NA").format(name=name) + + # fill_value=True + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -1]), fill_value=True) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = self._holder([2, 1, 3], name='xxx') + tm.assert_index_equal(result, expected) + + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + idx.take(np.array([1, -5])) + + def test_slice_keep_name(self): + idx = self._holder([1, 2], name='asdf') + self.assertEqual(idx.name, idx[1:].name) + + def test_ufunc_coercions(self): + idx = self._holder([1, 2, 3, 4, 5], name='x') + + result = np.sqrt(idx) + tm.assertIsInstance(result, Float64Index) + exp = Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x') + tm.assert_index_equal(result, exp) + + 
result = np.divide(idx, 2.) + tm.assertIsInstance(result, Float64Index) + exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + tm.assert_index_equal(result, exp) + + # _evaluate_numeric_binop + result = idx + 2. + tm.assertIsInstance(result, Float64Index) + exp = Float64Index([3., 4., 5., 6., 7.], name='x') + tm.assert_index_equal(result, exp) + + result = idx - 2. + tm.assertIsInstance(result, Float64Index) + exp = Float64Index([-1., 0., 1., 2., 3.], name='x') + tm.assert_index_equal(result, exp) + + result = idx * 1. + tm.assertIsInstance(result, Float64Index) + exp = Float64Index([1., 2., 3., 4., 5.], name='x') + tm.assert_index_equal(result, exp) + + result = idx / 2. + tm.assertIsInstance(result, Float64Index) + exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + tm.assert_index_equal(result, exp) + + +class TestInt64Index(NumericInt, tm.TestCase): + _dtype = 'int64' _holder = Int64Index _multiprocess_can_split_ = True @@ -459,12 +634,6 @@ def setUp(self): def create_index(self): return Int64Index(np.arange(5, dtype='int64')) - def test_too_many_names(self): - def testit(): - self.index.names = ["roger", "harold"] - - assertRaisesRegexp(ValueError, "^Length", testit) - def test_constructor(self): # pass list, coerce fine index = Int64Index([-5, 0, 1, 2]) @@ -511,24 +680,6 @@ def test_constructor_corner(self): with tm.assertRaisesRegexp(TypeError, 'casting'): Int64Index(arr_with_floats) - def test_copy(self): - i = Int64Index([], name='Foo') - i_copy = i.copy() - self.assertEqual(i_copy.name, 'Foo') - - def test_view(self): - super(TestInt64Index, self).test_view() - - i = Int64Index([], name='Foo') - i_view = i.view() - self.assertEqual(i_view.name, 'Foo') - - i_view = i.view('i8') - tm.assert_index_equal(i, Int64Index(i_view, name='Foo')) - - i_view = i.view(Int64Index) - tm.assert_index_equal(i, Int64Index(i_view, name='Foo')) - def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) @@ -538,119 +689,33 @@ def test_coerce_list(self): arr = Index([1, 2, 3, 4], dtype=object) tm.assertIsInstance(arr, Index) - def test_dtype(self): - self.assertEqual(self.index.dtype, np.int64) - - def test_is_monotonic(self): - self.assertTrue(self.index.is_monotonic) - self.assertTrue(self.index.is_monotonic_increasing) - self.assertFalse(self.index.is_monotonic_decreasing) - - index = Int64Index([4, 3, 2, 1]) - self.assertFalse(index.is_monotonic) - self.assertTrue(index.is_monotonic_decreasing) - - index = Int64Index([1]) - self.assertTrue(index.is_monotonic) - self.assertTrue(index.is_monotonic_increasing) - self.assertTrue(index.is_monotonic_decreasing) - - def test_is_monotonic_na(self): - examples = [Index([np.nan]), - Index([np.nan, 1]), - Index([1, 2, np.nan]), - Index(['a', 'b', np.nan]), - pd.to_datetime(['NaT']), - pd.to_datetime(['NaT', '2000-01-01']), - pd.to_datetime(['2000-01-01', 'NaT', '2000-01-02']), - pd.to_timedelta(['1 day', 'NaT']), ] - for index in examples: - self.assertFalse(index.is_monotonic_increasing) - self.assertFalse(index.is_monotonic_decreasing) - - def test_equals(self): - same_values = Index(self.index, dtype=object) - self.assertTrue(self.index.equals(same_values)) - self.assertTrue(same_values.equals(self.index)) - - def test_logical_compat(self): - idx = self.create_index() - self.assertEqual(idx.all(), idx.values.all()) - self.assertEqual(idx.any(), idx.values.any()) - - def test_identical(self): - i = Index(self.index.copy()) - self.assertTrue(i.identical(self.index)) - - same_values_different_type = Index(i, dtype=object) - 
self.assertFalse(i.identical(same_values_different_type)) - - i = self.index.copy(dtype=object) - i = i.rename('foo') - same_values = Index(i, dtype=object) - self.assertTrue(same_values.identical(i)) - - self.assertFalse(i.identical(self.index)) - self.assertTrue(Index(same_values, name='foo', dtype=object).identical( - i)) - - self.assertFalse(self.index.copy(dtype=object) - .identical(self.index.copy(dtype='int64'))) - def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - def test_get_indexer_pad(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target, method='pad') expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - def test_get_indexer_backfill(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target, method='backfill') expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - def test_join_outer(self): - other = Int64Index([7, 12, 25, 1, 2, 5]) - other_mono = Int64Index([1, 2, 5, 7, 12, 25]) - - # not monotonic - # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') - self.assert_index_equal(res, noidx_res) - - eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) - elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.intp) - eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], - dtype=np.intp) - - tm.assertIsInstance(res, Int64Index) - self.assert_index_equal(res, eres) - tm.assert_numpy_array_equal(lidx, elidx) - tm.assert_numpy_array_equal(ridx, eridx) - - # monotonic - res, lidx, ridx = self.index.join(other_mono, how='outer', - return_indexers=True) - noidx_res = self.index.join(other_mono, how='outer') - self.assert_index_equal(res, noidx_res) + def test_intersection(self): + other = Index([1, 2, 3, 4, 5]) + result = self.index.intersection(other) + expected = Index(np.sort(np.intersect1d(self.index.values, + other.values))) + tm.assert_index_equal(result, expected) - elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.intp) - eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], - dtype=np.intp) - tm.assertIsInstance(res, Int64Index) - self.assert_index_equal(res, eres) - tm.assert_numpy_array_equal(lidx, elidx) - tm.assert_numpy_array_equal(ridx, eridx) + result = other.intersection(self.index) + expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, + other.values)))) + tm.assert_index_equal(result, expected) def test_join_inner(self): other = Int64Index([7, 12, 25, 1, 2, 5]) @@ -789,28 +854,92 @@ def test_join_non_int_index(self): right2 = other.join(self.index, how='right') self.assert_index_equal(right2, self.index.astype(object)) - def test_join_non_unique(self): - left = Index([4, 4, 3, 3]) + def test_join_outer(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) - joined, lidx, ridx = left.join(left, return_indexers=True) + # not monotonic + # guarantee of sortedness + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assert_index_equal(res, noidx_res) - exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) 
- self.assert_index_equal(joined, exp_joined) + eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], + dtype=np.intp) + eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], + dtype=np.intp) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) - tm.assert_numpy_array_equal(lidx, exp_lidx) + tm.assertIsInstance(res, Int64Index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(ridx, exp_ridx) + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='outer', + return_indexers=True) + noidx_res = self.index.join(other_mono, how='outer') + self.assert_index_equal(res, noidx_res) - def test_join_self(self): - kinds = 'outer', 'inner', 'left', 'right' - for kind in kinds: - joined = self.index.join(self.index, how=kind) - self.assertIs(self.index, joined) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], + dtype=np.intp) + eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], + dtype=np.intp) + tm.assertIsInstance(res, Int64Index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + +class TestUInt64Index(NumericInt, tm.TestCase): + + _dtype = 'uint64' + _holder = UInt64Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(index=UInt64Index([2**63, 2**63 + 10, 2**63 + 15, + 2**63 + 20, 2**63 + 25])) + self.setup_indices() + + def create_index(self): + return UInt64Index(np.arange(5, dtype='uint64')) + + def test_constructor(self): + idx = UInt64Index([1, 2, 3]) + res = Index([1, 2, 3], dtype=np.uint64) + tm.assert_index_equal(res, idx) + + idx = UInt64Index([1, 2**63]) + res = Index([1, 2**63], dtype=np.uint64) + tm.assert_index_equal(res, idx) + + idx = UInt64Index([1, 2**63]) + res = Index([1, 2**63]) + tm.assert_index_equal(res, idx) + + def test_get_indexer(self): + target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) + indexer = self.index.get_indexer(target) + expected = np.array([0, -1, 1, 2, 3, 4, + -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) + indexer = self.index.get_indexer(target, method='pad') + expected = np.array([0, 0, 1, 2, 3, 4, + 4, 4, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) + indexer = self.index.get_indexer(target, method='backfill') + expected = np.array([0, 1, 1, 2, 3, 4, + -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) def test_intersection(self): - other = Index([1, 2, 3, 4, 5]) + other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20]) result = self.index.intersection(other) expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) @@ -821,147 +950,198 @@ def test_intersection(self): other.values)))) tm.assert_index_equal(result, expected) - def test_intersect_str_dates(self): - dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + def test_join_inner(self): + other = UInt64Index(2**63 + np.array( + [7, 12, 25, 1, 2, 10], dtype='uint64')) + other_mono = UInt64Index(2**63 + np.array( + [1, 2, 7, 10, 12, 25], dtype='uint64')) - i1 = Index(dt_dates, dtype=object) - i2 = 
Index(['aa'], dtype=object) - res = i2.intersection(i1) + # not monotonic + res, lidx, ridx = self.index.join(other, how='inner', + return_indexers=True) - self.assertEqual(len(res), 0) + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) - def test_union_noncomparable(self): - from datetime import datetime, timedelta - # corner case, non-Int64Index - now = datetime.now() - other = Index([now + timedelta(i) for i in range(4)], dtype=object) - result = self.index.union(other) - expected = Index(np.concatenate((self.index, other))) - tm.assert_index_equal(result, expected) + eres = UInt64Index(2**63 + np.array([10, 25], dtype='uint64')) + elidx = np.array([1, 4], dtype=np.intp) + eridx = np.array([5, 2], dtype=np.intp) - result = other.union(self.index) - expected = Index(np.concatenate((other, self.index))) - tm.assert_index_equal(result, expected) + tm.assertIsInstance(res, UInt64Index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) - def test_cant_or_shouldnt_cast(self): - # can't - data = ['foo', 'bar', 'baz'] - self.assertRaises(TypeError, Int64Index, data) + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='inner', + return_indexers=True) - # shouldn't - data = ['0', '1', '2'] - self.assertRaises(TypeError, Int64Index, data) + res2 = self.index.intersection(other_mono) + self.assert_index_equal(res, res2) - def test_view_Index(self): - self.index.view(Index) + elidx = np.array([1, 4], dtype=np.intp) + eridx = np.array([3, 5], dtype=np.intp) - def test_prevent_casting(self): - result = self.index.astype('O') - self.assertEqual(result.dtype, np.object_) + tm.assertIsInstance(res, UInt64Index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) - def test_take_preserve_name(self): - index = Int64Index([1, 2, 3, 4], name='foo') - taken = index.take([3, 0, 1]) - self.assertEqual(index.name, taken.name) + def test_join_left(self): + other = UInt64Index(2**63 + np.array( + [7, 12, 25, 1, 2, 10], dtype='uint64')) + other_mono = UInt64Index(2**63 + np.array( + [1, 2, 7, 10, 12, 25], dtype='uint64')) - def test_take_fill_value(self): - # GH 12631 - idx = pd.Int64Index([1, 2, 3], name='xxx') - result = idx.take(np.array([1, 0, -1])) - expected = pd.Int64Index([2, 1, 3], name='xxx') - tm.assert_index_equal(result, expected) + # not monotonic + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + eres = self.index + eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp) - # fill_value - msg = "Unable to fill values because Int64Index cannot contain NA" - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -1]), fill_value=True) + tm.assertIsInstance(res, UInt64Index) + self.assert_index_equal(res, eres) + self.assertIsNone(lidx) + tm.assert_numpy_array_equal(ridx, eridx) - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Int64Index([2, 1, 3], name='xxx') - tm.assert_index_equal(result, expected) + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='left', + return_indexers=True) + eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp) - msg = "Unable to fill values because Int64Index cannot contain NA" - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with 
tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) + tm.assertIsInstance(res, UInt64Index) + self.assert_index_equal(res, eres) + self.assertIsNone(lidx) + tm.assert_numpy_array_equal(ridx, eridx) - with tm.assertRaises(IndexError): - idx.take(np.array([1, -5])) + # non-unique + idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64')) + idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64')) + res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) - def test_int_name_format(self): - index = Index(['a', 'b', 'c'], name=0) - s = Series(lrange(3), index) - df = DataFrame(lrange(3), index=index) - repr(s) - repr(df) - - def test_print_unicode_columns(self): - df = pd.DataFrame({u("\u05d0"): [1, 2, 3], - "\u05d1": [4, 5, 6], - "c": [7, 8, 9]}) - repr(df.columns) # should not raise UnicodeDecodeError - - def test_repr_summary(self): - with cf.option_context('display.max_seq_items', 10): - r = repr(pd.Index(np.arange(1000))) - self.assertTrue(len(r) < 200) - self.assertTrue("..." in r) + # 1 is in idx2, so it should be x2 + eres = UInt64Index(2**63 + np.array( + [1, 1, 2, 5, 7, 9], dtype='uint64')) + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) - def test_repr_roundtrip(self): - tm.assert_index_equal(eval(repr(self.index)), self.index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) - def test_unicode_string_with_unicode(self): - idx = Index(lrange(1000)) + def test_join_right(self): + other = UInt64Index(2**63 + np.array( + [7, 12, 25, 1, 2, 10], dtype='uint64')) + other_mono = UInt64Index(2**63 + np.array( + [1, 2, 7, 10, 12, 25], dtype='uint64')) - if PY3: - str(idx) - else: - compat.text_type(idx) + # not monotonic + res, lidx, ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp) - def test_bytestring_with_unicode(self): - idx = Index(lrange(1000)) - if PY3: - bytes(idx) - else: - str(idx) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assertIsInstance(other, UInt64Index) + self.assert_index_equal(res, eres) + self.assertIsNone(ridx) - def test_slice_keep_name(self): - idx = Int64Index([1, 2], name='asdf') - self.assertEqual(idx.name, idx[1:].name) + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='right', + return_indexers=True) + eres = other_mono + elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp) - def test_ufunc_coercions(self): - idx = Int64Index([1, 2, 3, 4, 5], name='x') + tm.assertIsInstance(other, UInt64Index) + tm.assert_numpy_array_equal(lidx, elidx) + self.assert_index_equal(res, eres) + self.assertIsNone(ridx) - result = np.sqrt(idx) - tm.assertIsInstance(result, Float64Index) - exp = Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x') - tm.assert_index_equal(result, exp) + # non-unique + idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64')) + idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64')) + res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) - result = np.divide(idx, 2.) 
- tm.assertIsInstance(result, Float64Index) - exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') - tm.assert_index_equal(result, exp) + # 1 is in idx2, so it should be x2 + eres = UInt64Index(2**63 + np.array( + [1, 1, 2, 5, 7, 9], dtype='uint64')) + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) - # _evaluate_numeric_binop - result = idx + 2. - tm.assertIsInstance(result, Float64Index) - exp = Float64Index([3., 4., 5., 6., 7.], name='x') - tm.assert_index_equal(result, exp) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) - result = idx - 2. - tm.assertIsInstance(result, Float64Index) - exp = Float64Index([-1., 0., 1., 2., 3.], name='x') - tm.assert_index_equal(result, exp) + def test_join_non_int_index(self): + other = Index(2**63 + np.array( + [1, 5, 7, 10, 20], dtype='uint64'), dtype=object) - result = idx * 1. - tm.assertIsInstance(result, Float64Index) - exp = Float64Index([1., 2., 3., 4., 5.], name='x') - tm.assert_index_equal(result, exp) + outer = self.index.join(other, how='outer') + outer2 = other.join(self.index, how='outer') + expected = Index(2**63 + np.array( + [0, 1, 5, 7, 10, 15, 20, 25], dtype='uint64')) + self.assert_index_equal(outer, outer2) + self.assert_index_equal(outer, expected) - result = idx / 2. - tm.assertIsInstance(result, Float64Index) - exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') - tm.assert_index_equal(result, exp) + inner = self.index.join(other, how='inner') + inner2 = other.join(self.index, how='inner') + expected = Index(2**63 + np.array([10, 20], dtype='uint64')) + self.assert_index_equal(inner, inner2) + self.assert_index_equal(inner, expected) + + left = self.index.join(other, how='left') + self.assert_index_equal(left, self.index.astype(object)) + + left2 = other.join(self.index, how='left') + self.assert_index_equal(left2, other) + + right = self.index.join(other, how='right') + self.assert_index_equal(right, other) + + right2 = other.join(self.index, how='right') + self.assert_index_equal(right2, self.index.astype(object)) + + def test_join_outer(self): + other = UInt64Index(2**63 + np.array( + [7, 12, 25, 1, 2, 10], dtype='uint64')) + other_mono = UInt64Index(2**63 + np.array( + [1, 2, 7, 10, 12, 25], dtype='uint64')) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assert_index_equal(res, noidx_res) + + eres = UInt64Index(2**63 + np.array( + [0, 1, 2, 7, 10, 12, 15, 20, 25], dtype='uint64')) + elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) + eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp) + + tm.assertIsInstance(res, UInt64Index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='outer', + return_indexers=True) + noidx_res = self.index.join(other_mono, how='outer') + self.assert_index_equal(res, noidx_res) + + elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) + eridx = np.array([-1, 0, 1, 2, 3, 4, -1, -1, 5], dtype=np.intp) + + tm.assertIsInstance(res, UInt64Index) + self.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', 
'--pdb-failure'], + exit=False) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 6fc24e41ee914..a50027f1d0343 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -20,7 +20,7 @@ from pandas import option_context from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, - MultiIndex, Timestamp, Timedelta) + MultiIndex, Timestamp, Timedelta, UInt64Index) from pandas.formats.printing import pprint_thing from pandas import concat from pandas.core.common import PerformanceWarning, UnsortedIndexError @@ -100,7 +100,8 @@ class TestIndexing(tm.TestCase): _multiprocess_can_split_ = True _objs = set(['series', 'frame', 'panel']) - _typs = set(['ints', 'labels', 'mixed', 'ts', 'floats', 'empty', 'ts_rev']) + _typs = set(['ints', 'uints', 'labels', 'mixed', + 'ts', 'floats', 'empty', 'ts_rev']) def setUp(self): @@ -116,6 +117,16 @@ def setUp(self): major_axis=lrange(0, 12, 3), minor_axis=lrange(0, 16, 4)) + self.series_uints = Series(np.random.rand(4), + index=UInt64Index(lrange(0, 8, 2))) + self.frame_uints = DataFrame(np.random.randn(4, 4), + index=UInt64Index(lrange(0, 8, 2)), + columns=UInt64Index(lrange(0, 12, 3))) + self.panel_uints = Panel(np.random.rand(4, 4, 4), + items=UInt64Index(lrange(0, 8, 2)), + major_axis=UInt64Index(lrange(0, 12, 3)), + minor_axis=UInt64Index(lrange(0, 16, 4))) + self.series_labels = Series(np.random.randn(4), index=list('abcd')) self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) @@ -197,10 +208,6 @@ def _print(result, error=None): pprint_thing(v) try: - # if (name == 'bool' and t == 'empty' and o == 'series' and - # method1 == 'loc'): - # import pdb; pdb.set_trace() - rs = getattr(obj, method1).__getitem__(_axify(obj, k1, a)) try: @@ -210,6 +217,8 @@ def _print(result, error=None): _print(result) return + detail = None + try: if is_scalar(rs) and is_scalar(xp): self.assertEqual(rs, xp) @@ -220,7 +229,8 @@ def _print(result, error=None): elif xp.ndim == 3: tm.assert_panel_equal(rs, xp) result = 'ok' - except (AssertionError): + except AssertionError as e: + detail = str(e) result = 'fail' # reverse the checks @@ -228,10 +238,9 @@ def _print(result, error=None): if result == 'fail': result = 'ok (fail)' - if not result.startswith('ok'): - raise AssertionError(_print(result)) - _print(result) + if not result.startswith('ok'): + raise AssertionError(detail) except AssertionError: raise @@ -309,16 +318,17 @@ def _check(f, func, values=False): d = getattr(self, o) # iat - _check(d['ints'], 'iat', values=True) + for f in [d['ints'], d['uints']]: + _check(f, 'iat', values=True) + for f in [d['labels'], d['ts'], d['floats']]: if f is not None: self.assertRaises(ValueError, self.check_values, f, 'iat') # at - _check(d['ints'], 'at') - _check(d['labels'], 'at') - _check(d['ts'], 'at') - _check(d['floats'], 'at') + for f in [d['ints'], d['uints'], d['labels'], + d['ts'], d['floats']]: + _check(f, 'at') def test_at_and_iat_set(self): def _check(f, func, values=False): @@ -334,16 +344,18 @@ def _check(f, func, values=False): d = getattr(self, t) - _check(d['ints'], 'iat', values=True) + # iat + for f in [d['ints'], d['uints']]: + _check(f, 'iat', values=True) + for f in [d['labels'], d['ts'], d['floats']]: if f is not None: self.assertRaises(ValueError, _check, f, 'iat') # at - _check(d['ints'], 'at') - _check(d['labels'], 'at') - _check(d['ts'], 'at') - 
_check(d['floats'], 'at') + for f in [d['ints'], d['uints'], d['labels'], + d['ts'], d['floats']]: + _check(f, 'at') def test_at_iat_coercion(self): @@ -508,7 +520,7 @@ def test_iloc_getitem_int(self): # integer self.check_result('integer', 'iloc', 2, 'ix', - {0: 4, 1: 6, 2: 8}, typs=['ints']) + {0: 4, 1: 6, 2: 8}, typs=['ints', 'uints']) self.check_result('integer', 'iloc', 2, 'indexer', 2, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -517,7 +529,7 @@ def test_iloc_getitem_neg_int(self): # neg integer self.check_result('neg int', 'iloc', -1, 'ix', - {0: 6, 1: 9, 2: 12}, typs=['ints']) + {0: 6, 1: 9, 2: 12}, typs=['ints', 'uints']) self.check_result('neg int', 'iloc', -1, 'indexer', -1, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -527,9 +539,9 @@ def test_iloc_getitem_list_int(self): # list of ints self.check_result('list int', 'iloc', [0, 1, 2], 'ix', {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, - typs=['ints']) + typs=['ints', 'uints']) self.check_result('list int', 'iloc', [2], 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints']) + {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2], typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -539,9 +551,9 @@ def test_iloc_getitem_list_int(self): self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'ix', {0: [0, 2, 4], 1: [0, 3, 6], - 2: [0, 4, 8]}, typs=['ints']) + 2: [0, 4, 8]}, typs=['ints', 'uints']) self.check_result('array int', 'iloc', np.array([2]), 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints']) + {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', [0, 1, 2], typs=['labels', 'mixed', 'ts', 'floats', 'empty'], @@ -579,7 +591,7 @@ def test_iloc_getitem_dups(self): # no dups in panel (bug?) 
self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, - objs=['series', 'frame'], typs=['ints']) + objs=['series', 'frame'], typs=['ints', 'uints']) # GH 6766 df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) @@ -601,13 +613,13 @@ def test_iloc_getitem_array(self): s = Series(index=lrange(1, 4)) self.check_result('array like', 'iloc', s.index, 'ix', {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, - typs=['ints']) + typs=['ints', 'uints']) def test_iloc_getitem_bool(self): # boolean indexers b = [True, False, True, False, ] - self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints']) + self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints', 'uints']) self.check_result('bool', 'iloc', b, 'ix', b, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -617,7 +629,7 @@ def test_iloc_getitem_slice(self): # slices self.check_result('slice', 'iloc', slice(1, 3), 'ix', {0: [2, 4], 1: [3, 6], 2: [4, 8]}, - typs=['ints']) + typs=['ints', 'uints']) self.check_result('slice', 'iloc', slice(1, 3), 'indexer', slice(1, 3), typs=['labels', 'mixed', 'ts', 'floats', 'empty'], @@ -1124,14 +1136,14 @@ def check(result, expected): def test_loc_getitem_int(self): # int label - self.check_result('int label', 'loc', 2, 'ix', 2, typs=['ints'], - axes=0) - self.check_result('int label', 'loc', 3, 'ix', 3, typs=['ints'], - axes=1) - self.check_result('int label', 'loc', 4, 'ix', 4, typs=['ints'], - axes=2) - self.check_result('int label', 'loc', 2, 'ix', 2, typs=['label'], - fails=KeyError) + self.check_result('int label', 'loc', 2, 'ix', 2, + typs=['ints', 'uints'], axes=0) + self.check_result('int label', 'loc', 3, 'ix', 3, + typs=['ints', 'uints'], axes=1) + self.check_result('int label', 'loc', 4, 'ix', 4, + typs=['ints', 'uints'], axes=2) + self.check_result('int label', 'loc', 2, 'ix', 2, + typs=['label'], fails=KeyError) def test_loc_getitem_label(self): @@ -1150,12 +1162,12 @@ def test_loc_getitem_label_out_of_range(self): # out of range label self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['ints', 'labels', 'mixed', 'ts'], + typs=['ints', 'uints', 'labels', 'mixed', 'ts'], fails=KeyError) self.check_result('label range', 'loc', 'f', 'ix', 'f', typs=['floats'], fails=TypeError) self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['ints', 'mixed'], fails=KeyError) + typs=['ints', 'uints', 'mixed'], fails=KeyError) self.check_result('label range', 'loc', 20, 'ix', 20, typs=['labels'], fails=TypeError) self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'], @@ -1167,11 +1179,11 @@ def test_loc_getitem_label_list(self): # list of labels self.check_result('list lbl', 'loc', [0, 2, 4], 'ix', [0, 2, 4], - typs=['ints'], axes=0) + typs=['ints', 'uints'], axes=0) self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9], - typs=['ints'], axes=1) + typs=['ints', 'uints'], axes=1) self.check_result('list lbl', 'loc', [4, 8, 12], 'ix', [4, 8, 12], - typs=['ints'], axes=2) + typs=['ints', 'uints'], axes=2) self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix', ['a', 'b', 'd'], typs=['labels'], axes=0) self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix', @@ -1188,27 +1200,27 @@ def test_loc_getitem_label_list(self): self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2], typs=['empty'], fails=KeyError) self.check_result('list lbl', 'loc', [0, 2, 3], 'ix', [0, 2, 3], - typs=['ints'], axes=0, fails=KeyError) + typs=['ints', 'uints'], axes=0, fails=KeyError) self.check_result('list 
lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7], - typs=['ints'], axes=1, fails=KeyError) + typs=['ints', 'uints'], axes=1, fails=KeyError) self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], - typs=['ints'], axes=2, fails=KeyError) + typs=['ints', 'uints'], axes=2, fails=KeyError) def test_loc_getitem_label_list_fails(self): # fails self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints'], axes=1, fails=KeyError) + typs=['ints', 'uints'], axes=1, fails=KeyError) self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints'], axes=2, fails=KeyError) + typs=['ints', 'uints'], axes=2, fails=KeyError) def test_loc_getitem_label_array_like(self): # array like self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, - 'ix', [0, 2, 4], typs=['ints'], axes=0) + 'ix', [0, 2, 4], typs=['ints', 'uints'], axes=0) self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index, - 'ix', [3, 6, 9], typs=['ints'], axes=1) + 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) self.check_result('array like', 'loc', Series(index=[4, 8, 12]).index, - 'ix', [4, 8, 12], typs=['ints'], axes=2) + 'ix', [4, 8, 12], typs=['ints', 'uints'], axes=2) def test_loc_getitem_series(self): # GH14730 @@ -1236,7 +1248,8 @@ def test_loc_getitem_bool(self): # boolean indexers b = [True, False, True, False] self.check_result('bool', 'loc', b, 'ix', b, - typs=['ints', 'labels', 'mixed', 'ts', 'floats']) + typs=['ints', 'uints', 'labels', + 'mixed', 'ts', 'floats']) self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'], fails=KeyError) @@ -1244,11 +1257,11 @@ def test_loc_getitem_int_slice(self): # ok self.check_result('int slice2', 'loc', slice(2, 4), 'ix', [2, 4], - typs=['ints'], axes=0) + typs=['ints', 'uints'], axes=0) self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6], - typs=['ints'], axes=1) + typs=['ints', 'uints'], axes=1) self.check_result('int slice2', 'loc', slice(4, 8), 'ix', [4, 8], - typs=['ints'], axes=2) + typs=['ints', 'uints'], axes=2) # GH 3053 # loc should treat integer slices like label slices diff --git a/pandas/tests/types/test_generic.py b/pandas/tests/types/test_generic.py index 89913de6f6069..28600687e8062 100644 --- a/pandas/tests/types/test_generic.py +++ b/pandas/tests/types/test_generic.py @@ -24,6 +24,7 @@ class TestABCClasses(tm.TestCase): def test_abc_types(self): self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex) self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index) + self.assertIsInstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index) self.assertIsInstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index) self.assertIsInstance(self.multi_index, gt.ABCMultiIndex) self.assertIsInstance(self.datetime_index, gt.ABCDatetimeIndex) diff --git a/pandas/types/generic.py b/pandas/types/generic.py index 0d576eed43d45..86d266f4595e2 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -16,6 +16,8 @@ def _check(cls, inst): ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", )) ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index", )) +ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", + ("uint64index", )) ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex", )) ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", @@ -32,7 +34,7 @@ def _check(cls, inst): ("categoricalindex", )) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", "rangeindex", - 
"float64index", + "float64index", "uint64index", "multiindex", "datetimeindex", "timedeltaindex", "periodindex", "categoricalindex")) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d96f57f2810e3..d39ce7acf0029 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1573,6 +1573,10 @@ def makeIntIndex(k=10, name=None): return Index(lrange(k), name=name) +def makeUIntIndex(k=10, name=None): + return Index([2**63 + i for i in lrange(k)], name=name) + + def makeRangeIndex(k=10, name=None): return RangeIndex(0, k, 1, name=name) diff --git a/setup.py b/setup.py index 0a84cf527bfb1..a53464f8f7987 100755 --- a/setup.py +++ b/setup.py @@ -490,7 +490,8 @@ def pxd(name): index={'pyxfile': 'index', 'sources': ['pandas/src/datetime/np_datetime.c', 'pandas/src/datetime/np_datetime_strings.c'], - 'pxdfiles': ['src/util']}, + 'pxdfiles': ['src/util'], + 'depends': _pxi_dep['index']}, algos={'pyxfile': 'algos', 'pxdfiles': ['src/util'], 'depends': _pxi_dep['algos']}, From d096b060c1bb50077dbfdfe7f07e4105826bb07e Mon Sep 17 00:00:00 2001 From: Jim Crist Date: Wed, 18 Jan 2017 09:22:40 -0500 Subject: [PATCH 031/338] BUG: Categoricals hash consistently Previously categorical values were hashed using just their codes. This meant that the hash value depended on the ordering of the categories, rather than on the values the series represented. This caused problems in dask, where different partitions might have different categorical mappings. This PR makes the hashing dependent on the values the categorical represents, rather than on the codes. The categories are first hashed, and then the codes are remapped to the hashed values. This is slightly slower than before (still need to hash the categories, where we didn't before), but allows for more consistent hashing. Related to this work in dask: https://github.com/dask/dask/pull/1877. Author: Jim Crist Closes #15143 from jcrist/categories_hash_consistently and squashes the following commits: f1aea13 [Jim Crist] Address comments 7878c55 [Jim Crist] Categoricals hash consistently --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/hashing.py | 100 +++++++++++++++++++---------- pandas/tools/tests/test_hashing.py | 17 +++++ 3 files changed, 83 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4ce7e6d2cd3b8..8f7b8be613c8a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -326,6 +326,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) +- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. 
(:issue:`15143`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index aa18b8bc70c37..6d2186fdab34c 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -4,15 +4,17 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index -from pandas.lib import infer_dtype +from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame -from pandas.types.common import is_categorical_dtype +from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, + is_datetime64_dtype, is_timedelta64_dtype) # 16 byte long hashing key _default_hash_key = '0123456789123456' -def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): +def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, + categorize=True): """ Return a data hash of the Index/Series/DataFrame @@ -25,6 +27,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. versionadded:: 0.20.0 Returns ------- @@ -39,28 +46,33 @@ def adder(h, hashed_to_add): return np.add(h, hashed_to_add, h) if isinstance(obj, ABCIndexClass): - h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64') h = Series(h, index=obj, dtype='uint64') elif isinstance(obj, ABCSeries): - h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64') if index: h = adder(h, hash_pandas_object(obj.index, index=False, encoding=encoding, - hash_key=hash_key).values) + hash_key=hash_key, + categorize=categorize).values) h = Series(h, index=obj.index, dtype='uint64') elif isinstance(obj, ABCDataFrame): cols = obj.iteritems() first_series = next(cols)[1] h = hash_array(first_series.values, encoding, - hash_key).astype('uint64') + hash_key, categorize).astype('uint64') for _, col in cols: - h = adder(h, hash_array(col.values, encoding, hash_key)) + h = adder(h, hash_array(col.values, encoding, hash_key, + categorize)) if index: h = adder(h, hash_pandas_object(obj.index, index=False, encoding=encoding, - hash_key=hash_key).values) + hash_key=hash_key, + categorize=categorize).values) h = Series(h, index=obj.index, dtype='uint64') else: @@ -68,7 +80,27 @@ def adder(h, hashed_to_add): return h -def hash_array(vals, encoding='utf8', hash_key=None): +def _hash_categorical(c, encoding, hash_key): + """ + Hash a Categorical by hashing its categories, and then mapping the codes + to the hashes + + Parameters + ---------- + c : Categorical + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + ndarray of hashed values array, same size as len(c) + """ + cat_hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False).astype(np.uint64, copy=False) + return c.rename_categories(cat_hashed).astype(np.uint64) + + +def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. 
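The scheme `_hash_categorical` implements above (hash the small set of categories once, then remap each code to its category's hash) can be illustrated outside the patched internals. A minimal sketch, assuming only NumPy and pandas; `toy_hash` and `hash_categorical_sketch` are hypothetical names for this illustration, not functions from the patch:

import numpy as np
import pandas as pd

def toy_hash(values):
    # Hypothetical stand-in for the patch's 64-bit object hasher; any
    # per-element uint64 hash that is stable within a process will do.
    return np.array([hash(v) & 0xFFFFFFFFFFFFFFFF for v in values],
                    dtype=np.uint64)

def hash_categorical_sketch(cat):
    # Hash each distinct category exactly once...
    hashed_categories = toy_hash(cat.categories)
    # ...then remap: codes are positions into `categories`, so a take()
    # turns per-category hashes into per-element hashes (-1 marks NaN).
    mask = cat.codes == -1
    out = hashed_categories.take(np.where(mask, 0, cat.codes))
    out[mask] = 0  # placeholder hash for missing values
    return out

a = pd.Categorical(['x', 'y', 'x'], categories=['x', 'y'])
b = pd.Categorical(['x', 'y', 'x'], categories=['y', 'x'])  # reordered
# Same values, different codes, identical hashes:
assert (hash_categorical_sketch(a) == hash_categorical_sketch(b)).all()

Because the remapping goes through the category values rather than the codes, two Categoricals that represent the same data hash identically regardless of category ordering, which is exactly the dask partitioning problem the commit message describes.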
@@ -80,6 +112,11 @@ def hash_array(vals, encoding='utf8', hash_key=None): encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. versionadded:: 0.20.0 Returns ------- @@ -87,46 +124,39 @@ """ - # work with cagegoricals as ints. (This check is above the complex - # check so that we don't ask numpy if categorical is a subdtype of - # complex, as it will choke. if hash_key is None: hash_key = _default_hash_key + # For categoricals, we hash the categories, then remap the codes to the + # hash values. (This check is above the complex check so that we don't ask + # numpy if categorical is a subdtype of complex, as it will choke.) if is_categorical_dtype(vals.dtype): - vals = vals.codes + return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) - # MAIN LOGIC: - inferred = infer_dtype(vals) - # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - if inferred == 'boolean': + if is_bool_array(vals): vals = vals.astype('u8') - - if (np.issubdtype(vals.dtype, np.datetime64) or - np.issubdtype(vals.dtype, np.timedelta64) or - np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8: - + elif ((is_datetime64_dtype(vals) or + is_timedelta64_dtype(vals) or + is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: - - # its MUCH faster to categorize object dtypes, then hash and rename - codes, categories = factorize(vals, sort=False) - categories = Index(categories) - c = Series(Categorical(codes, categories, - ordered=False, fastpath=True)) - vals = _hash.hash_object_array(categories.values, - hash_key, - encoding) - - # rename & extract - vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values + # With repeated values, it's MUCH faster to categorize object dtypes, + # then hash and rename categories. We allow skipping the categorization + # when the values are known/likely to be unique. 
+ if categorize: + codes, categories = factorize(vals, sort=False) + cat = Categorical(codes, Index(categories), + ordered=False, fastpath=True) + return _hash_categorical(cat, encoding, hash_key) + + vals = _hash.hash_object_array(vals, hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 6e5f30fb7a52d..7913706f5658b 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -90,6 +90,23 @@ def test_hash_pandas_empty_object(self): # these are by-definition the same with # or w/o the index as the data is empty + def test_categorical_consistency(self): + # GH15143 + # Check that categoricals hash consistent with their values, not codes + # This should work for categoricals of any dtype + for s1 in [Series(['a', 'b', 'c', 'd']), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4))]: + s2 = s1.astype('category').cat.set_categories(s1) + s3 = s2.cat.set_categories(list(reversed(s1))) + for categorize in [True, False]: + # These should all hash identically + h1 = hash_pandas_object(s1, categorize=categorize) + h2 = hash_pandas_object(s2, categorize=categorize) + h3 = hash_pandas_object(s3, categorize=categorize) + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) + def test_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: From 66e0e202bbd0dd145de5ccb77e1ea415bcd25dc3 Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Wed, 18 Jan 2017 11:05:17 -0500 Subject: [PATCH 032/338] ENH: Added 'direction' parameter to merge_asof() (#14887) closes #14887 Author: Christopher C. Aycock Closes #15129 from chrisaycock/GH14887 and squashes the following commits: da38483 [Christopher C. Aycock] Description of direction parameters is now a bullet-point list 879c9f0 [Christopher C. Aycock] Tweak prose to include statement on version and parameters ce5caaa [Christopher C. Aycock] Split Cython functions according to direction 50431ad [Christopher C. Aycock] ENH: Added 'direction' parameter to merge_asof() (#14887) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/src/joins_func_helper.pxi.in | 216 +++++++++++++++++++++++++- pandas/tools/merge.py | 76 ++++++--- pandas/tools/tests/test_merge_asof.py | 173 ++++++++++++++++++++- 4 files changed, 436 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8f7b8be613c8a..2ff7323e5f8ef 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -135,6 +135,7 @@ Other enhancements - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) +- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) .. 
_whatsnew_0200.api_breaking: diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in index 68c376492f8f2..486c65368602b 100644 --- a/pandas/src/joins_func_helper.pxi.in +++ b/pandas/src/joins_func_helper.pxi.in @@ -29,7 +29,8 @@ from hashtable cimport * {{for on_dtype in on_dtypes}} -def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, +def asof_join_backward_{{on_dtype}}_by_{{by_dtype}}( + ndarray[{{on_dtype}}] left_values, ndarray[{{on_dtype}}] right_values, ndarray[{{by_dtype}}] left_by_values, ndarray[{{by_dtype}}] right_by_values, @@ -41,6 +42,7 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 {{on_dtype}} tolerance_ + {{on_dtype}} diff {{table_type}} hash_table {{by_dtype}} by_value @@ -63,7 +65,7 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, if right_pos < 0: right_pos = 0 - # find last position in right whose value is less than left's value + # find last position in right whose value is less than left's if allow_exact_matches: while right_pos < right_size and\ right_values[right_pos] <= left_values[left_pos]: @@ -91,6 +93,119 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, return left_indexer, right_indexer + +def asof_join_forward_{{on_dtype}}_by_{{by_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + ndarray[{{by_dtype}}] left_by_values, + ndarray[{{by_dtype}}] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + {{on_dtype}} diff + {{table_type}} hash_table + {{by_dtype}} by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = {{table_type}}(right_size) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while right_pos >= 0 and\ + right_values[right_pos] >= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + else: + while right_pos >= 0 and\ + right_values[right_pos] > left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = right_values[found_right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_nearest_{{on_dtype}}_by_{{by_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + ndarray[{{by_dtype}}] left_by_values, + ndarray[{{by_dtype}}] 
right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + {{on_dtype}} bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri =\ + asof_join_backward_{{on_dtype}}_by_{{by_dtype}}(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + fli, fri =\ + asof_join_forward_{{on_dtype}}_by_{{by_dtype}}(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer + {{endfor}} {{endfor}} @@ -111,7 +226,8 @@ dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', {{for on_dtype in dtypes}} -def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, +def asof_join_backward_{{on_dtype}}( + ndarray[{{on_dtype}}] left_values, ndarray[{{on_dtype}}] right_values, bint allow_exact_matches=1, tolerance=None): @@ -121,6 +237,7 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 {{on_dtype}} tolerance_ + {{on_dtype}} diff # if we are using tolerance, set our objects if tolerance is not None: @@ -139,7 +256,7 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, if right_pos < 0: right_pos = 0 - # find last position in right whose value is less than left's value + # find last position in right whose value is less than left's if allow_exact_matches: while right_pos < right_size and\ right_values[right_pos] <= left_values[left_pos]: @@ -162,5 +279,96 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, return left_indexer, right_indexer + +def asof_join_forward_{{on_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + {{on_dtype}} diff + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while right_pos >= 0 and\ + right_values[right_pos] >= left_values[left_pos]: + right_pos -= 1 + else: + while right_pos >= 0 and\ + right_values[right_pos] > left_values[left_pos]: + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos\ + if right_pos != 
right_size else -1 + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != right_size: + diff = right_values[right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_nearest_{{on_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + {{on_dtype}} bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri = asof_join_backward_{{on_dtype}}(left_values, right_values, + allow_exact_matches, tolerance) + fli, fri = asof_join_forward_{{on_dtype}}(left_values, right_values, + allow_exact_matches, tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer + {{endfor}} diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index c756a61faeefb..3fbd83a6f3245 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -266,16 +266,29 @@ def merge_asof(left, right, on=None, by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, - allow_exact_matches=True): + allow_exact_matches=True, + direction='backward'): """Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. - For each row in the left DataFrame, we select the last row in the right - DataFrame whose 'on' key is less than or equal to the left's key. Both - DataFrames must be sorted by the key. + Both DataFrames must be sorted by the key. - Optionally match on equivalent keys with 'by' before searching for nearest - match with 'on'. + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + 'on' key is less than or equal to the left's key. + + - A "forward" search selects the first row in the right DataFrame whose + 'on' key is greater than or equal to the left's key. + + - A "nearest" search selects the row in the right DataFrame whose 'on' + key is closest in absolute distance to the left's key. + + The default is "backward" and is compatible with versions below 0.20.0. + The direction parameter was added in version 0.20.0 and introduces + "forward" and "nearest". + + Optionally match on equivalent keys with 'by' before searching with 'on'. .. versionadded:: 0.19.0 @@ -323,9 +336,14 @@ def merge_asof(left, right, on=None, allow_exact_matches : boolean, default True - If True, allow matching the same 'on' value - (i.e. less-than-or-equal-to) + (i.e. less-than-or-equal-to / greater-than-or-equal-to) - If False, don't match the same 'on' value - (i.e., stricly less-than) + (i.e., strictly less-than / strictly greater-than) + + direction : 'backward' (default), 'forward', or 'nearest' + Whether to search for prior, subsequent, or closest matches. + + .. 
versionadded:: 0.20.0 Returns ------- @@ -359,17 +377,17 @@ def merge_asof(left, right, on=None, 1 5 b 3.0 2 10 c 7.0 - For this example, we can achieve a similar result thru - ``pd.merge_ordered()``, though its not nearly as performant. - - >>> (pd.merge_ordered(left, right, on='a') - ... .ffill() - ... .drop_duplicates(['left_val']) - ... ) + >>> pd.merge_asof(left, right, on='a', direction='forward') a left_val right_val 0 1 a 1.0 - 3 5 b 3.0 - 6 10 c 7.0 + 1 5 b 6.0 + 2 10 c NaN + + >>> pd.merge_asof(left, right, on='a', direction='nearest') + a left_val right_val + 0 1 a 1 + 1 5 b 6 + 2 10 c 7 We can use indexed DataFrames as well. @@ -467,7 +485,8 @@ def merge_asof(left, right, on=None, by=by, left_by=left_by, right_by=right_by, suffixes=suffixes, how='asof', tolerance=tolerance, - allow_exact_matches=allow_exact_matches) + allow_exact_matches=allow_exact_matches, + direction=direction) return op.get_result() @@ -999,12 +1018,13 @@ def get_result(self): return result -def _asof_function(on_type): - return getattr(_join, 'asof_join_%s' % on_type, None) +def _asof_function(direction, on_type): + return getattr(_join, 'asof_join_%s_%s' % (direction, on_type), None) -def _asof_by_function(on_type, by_type): - return getattr(_join, 'asof_join_%s_by_%s' % (on_type, by_type), None) +def _asof_by_function(direction, on_type, by_type): + return getattr(_join, 'asof_join_%s_%s_by_%s' % + (direction, on_type, by_type), None) _type_casters = { @@ -1056,13 +1076,15 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None, axis=1, suffixes=('_x', '_y'), copy=True, fill_method=None, how='asof', tolerance=None, - allow_exact_matches=True): + allow_exact_matches=True, + direction='backward'): self.by = by self.left_by = left_by self.right_by = right_by self.tolerance = tolerance self.allow_exact_matches = allow_exact_matches + self.direction = direction _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, right_on=right_on, left_index=left_index, @@ -1108,6 +1130,10 @@ def _validate_specification(self): self.left_on = self.left_by + list(self.left_on) self.right_on = self.right_by + list(self.right_on) + # check 'direction' is valid + if self.direction not in ['backward', 'forward', 'nearest']: + raise MergeError('direction invalid: ' + self.direction) + @property def _asof_key(self): """ This is our asof key, the 'on' """ @@ -1208,7 +1234,7 @@ def flip(xs): # choose appropriate function by type on_type = _get_cython_type(left_values.dtype) - func = _asof_by_function(on_type, by_type) + func = _asof_by_function(self.direction, on_type, by_type) return func(left_values, right_values, left_by_values, @@ -1218,7 +1244,7 @@ def flip(xs): else: # choose appropriate function by type on_type = _get_cython_type(left_values.dtype) - func = _asof_function(on_type) + func = _asof_function(self.direction, on_type) return func(left_values, right_values, self.allow_exact_matches, diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index e05cbd0850169..ef7b25008e80a 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -42,7 +42,12 @@ def test_examples1(self): right = pd.DataFrame({'a': [1, 2, 3, 6, 7], 'right_val': [1, 2, 3, 6, 7]}) - pd.merge_asof(left, right, on='a') + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, 3, 7]}) + + result = pd.merge_asof(left, right, on='a') + assert_frame_equal(result, expected) def test_examples2(self): """ doc-string examples 
""" @@ -94,6 +99,38 @@ def test_examples2(self): tolerance=pd.Timedelta('10ms'), allow_exact_matches=False) + def test_examples3(self): + """ doc-string examples """ + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, 6, np.nan]}) + + result = pd.merge_asof(left, right, on='a', direction='forward') + assert_frame_equal(result, expected) + + def test_examples4(self): + """ doc-string examples """ + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, 6, 7]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest') + assert_frame_equal(result, expected) + def test_basic(self): expected = self.asof @@ -495,6 +532,38 @@ def test_tolerance(self): expected = self.tolerance assert_frame_equal(result, expected) + def test_tolerance_forward(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, np.nan, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='forward', + tolerance=1) + assert_frame_equal(result, expected) + + def test_tolerance_nearest(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, np.nan, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest', + tolerance=1) + assert_frame_equal(result, expected) + def test_tolerance_tz(self): # GH 14844 left = pd.DataFrame( @@ -540,6 +609,38 @@ def test_allow_exact_matches(self): expected = self.allow_exact_matches assert_frame_equal(result, expected) + def test_allow_exact_matches_forward(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [2, 7, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='forward', + allow_exact_matches=False) + assert_frame_equal(result, expected) + + def test_allow_exact_matches_nearest(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [2, 3, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest', + allow_exact_matches=False) + assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance(self): result = merge_asof(self.trades, self.quotes, @@ -602,6 +703,76 @@ def test_allow_exact_matches_and_tolerance3(self): 'version': [np.nan, np.nan]}) assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance_forward(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 3, 4, 6, 11], + 
'right_val': [1, 3, 4, 6, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [np.nan, 6, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='forward', + allow_exact_matches=False, tolerance=1) + assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance_nearest(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 3, 4, 6, 11], + 'right_val': [1, 3, 4, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [np.nan, 4, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest', + allow_exact_matches=False, tolerance=1) + assert_frame_equal(result, expected) + + def test_forward_by(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Y', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e']}) + right = pd.DataFrame({'a': [1, 6, 11, 15, 16], + 'b': ['X', 'Z', 'Y', 'Z', 'Y'], + 'right_val': [1, 6, 11, 15, 16]}) + + expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Y', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e'], + 'right_val': [1, np.nan, 11, 15, 16]}) + + result = pd.merge_asof(left, right, on='a', by='b', + direction='forward') + assert_frame_equal(result, expected) + + def test_nearest_by(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Z', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e']}) + right = pd.DataFrame({'a': [1, 6, 11, 15, 16], + 'b': ['X', 'Z', 'Z', 'Z', 'Y'], + 'right_val': [1, 6, 11, 15, 16]}) + + expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Z', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e'], + 'right_val': [1, 1, 11, 11, 16]}) + + result = pd.merge_asof(left, right, on='a', by='b', + direction='nearest') + assert_frame_equal(result, expected) + def test_by_int(self): # we specialize by type, so test that this is correct df1 = pd.DataFrame({ From f795d817b45b007903b5c77746bbf9fa8868c9ed Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 18 Jan 2017 11:10:25 -0500 Subject: [PATCH 033/338] DEPR: deprecate .ix in favor of .loc/.iloc closes #14218 closes #15116 Author: Jeff Reback Closes #15113 from jreback/ix and squashes the following commits: 1544f50 [Jeff Reback] DEPR: deprecate .ix in favor of .loc/.iloc --- doc/source/advanced.rst | 23 +- doc/source/api.rst | 9 +- doc/source/basics.rst | 6 +- doc/source/categorical.rst | 3 +- doc/source/computation.rst | 10 +- doc/source/cookbook.rst | 21 +- doc/source/gotchas.rst | 60 +- doc/source/indexing.rst | 79 +- doc/source/io.rst | 14 +- doc/source/merging.rst | 8 +- doc/source/missing_data.rst | 10 +- doc/source/reshaping.rst | 2 +- doc/source/sparse.rst | 2 +- doc/source/visualization.rst | 4 +- doc/source/whatsnew/v0.20.0.txt | 49 ++ pandas/core/frame.py | 14 +- pandas/core/generic.py | 17 +- pandas/core/groupby.py | 2 +- pandas/core/indexing.py | 33 +- pandas/core/panel.py | 2 +- pandas/core/reshape.py | 8 +- pandas/core/series.py | 2 +- pandas/io/pytables.py | 10 +- pandas/io/tests/json/test_pandas.py | 23 +- pandas/io/tests/parser/common.py | 4 +- pandas/io/tests/parser/parse_dates.py | 4 +- pandas/io/tests/test_date_converters.py | 16 +- pandas/io/tests/test_excel.py | 14 +- pandas/io/tests/test_html.py | 4 +- pandas/io/tests/test_packers.py | 4 +- pandas/io/tests/test_pytables.py | 141 ++-- pandas/io/tests/test_sql.py | 8 +- pandas/io/tests/test_stata.py | 2 +- 
pandas/sparse/tests/test_frame.py | 26 +- pandas/sparse/tests/test_indexing.py | 8 +- pandas/sparse/tests/test_series.py | 4 +- pandas/stats/tests/test_ols.py | 18 +- pandas/tests/formats/test_format.py | 8 +- pandas/tests/frame/test_alter_axes.py | 12 +- pandas/tests/frame/test_analytics.py | 157 ++-- pandas/tests/frame/test_apply.py | 4 +- pandas/tests/frame/test_asof.py | 4 +- .../tests/frame/test_axis_select_reindex.py | 40 +- pandas/tests/frame/test_block_internals.py | 16 +- pandas/tests/frame/test_combine_concat.py | 10 +- pandas/tests/frame/test_constructors.py | 8 +- pandas/tests/frame/test_indexing.py | 777 ++++++++++-------- pandas/tests/frame/test_misc_api.py | 24 +- pandas/tests/frame/test_missing.py | 50 +- pandas/tests/frame/test_nonunique_indexes.py | 4 +- pandas/tests/frame/test_operators.py | 33 +- pandas/tests/frame/test_replace.py | 10 +- pandas/tests/frame/test_reshape.py | 2 +- pandas/tests/frame/test_sorting.py | 16 +- pandas/tests/frame/test_subclass.py | 4 +- pandas/tests/frame/test_timeseries.py | 6 +- pandas/tests/frame/test_to_csv.py | 4 +- pandas/tests/groupby/test_filters.py | 4 +- pandas/tests/groupby/test_groupby.py | 69 +- pandas/tests/indexes/test_datetimelike.py | 4 +- pandas/tests/indexes/test_multi.py | 2 +- pandas/tests/indexing/test_callable.py | 44 +- pandas/tests/indexing/test_categorical.py | 32 +- pandas/tests/indexing/test_floats.py | 112 ++- pandas/tests/indexing/test_indexing.py | 163 ++-- pandas/tests/plotting/test_datetimelike.py | 2 +- pandas/tests/plotting/test_frame.py | 12 +- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_constructors.py | 6 +- pandas/tests/series/test_indexing.py | 106 +-- pandas/tests/series/test_repr.py | 2 +- pandas/tests/series/test_subclass.py | 2 +- pandas/tests/test_categorical.py | 40 +- pandas/tests/test_generic.py | 8 +- pandas/tests/test_internals.py | 2 +- pandas/tests/test_multilevel.py | 251 +++--- pandas/tests/test_panel.py | 100 +-- pandas/tests/test_panel4d.py | 31 +- pandas/tests/test_panelnd.py | 4 +- pandas/tests/test_strings.py | 2 +- pandas/tests/test_window.py | 2 +- pandas/tools/tests/test_concat.py | 42 +- pandas/tools/tests/test_join.py | 36 +- pandas/tools/tests/test_merge.py | 20 +- pandas/tools/tests/test_merge_ordered.py | 4 +- pandas/tools/tests/test_pivot.py | 7 +- pandas/tseries/tests/test_period.py | 8 +- pandas/tseries/tests/test_resample.py | 10 +- pandas/tseries/tests/test_timedeltas.py | 4 +- pandas/tseries/tests/test_timeseries.py | 70 +- 90 files changed, 1657 insertions(+), 1399 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 7b6b2a09f6037..9247652388c5b 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -230,7 +230,7 @@ of tuples: Advanced indexing with hierarchical index ----------------------------------------- -Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc/.ix`` is a +Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc`` is a bit challenging, but we've made every effort to do so. for example the following works as you would expect: @@ -258,7 +258,7 @@ Passing a list of labels or tuples works similar to reindexing: .. ipython:: python - df.ix[[('bar', 'two'), ('qux', 'one')]] + df.loc[[('bar', 'two'), ('qux', 'one')]] .. _advanced.mi_slicers: @@ -604,7 +604,7 @@ intended to work on boolean indices and may return unexpected results. 
ser = pd.Series(np.random.randn(10)) ser.take([False, False, True, True]) - ser.ix[[0, 1]] + ser.iloc[[0, 1]] Finally, as a small note on performance, because the ``take`` method handles a narrower range of inputs, it can offer performance that is a good deal @@ -620,7 +620,7 @@ faster than fancy indexing. timeit arr.take(indexer, axis=0) ser = pd.Series(arr[:, 0]) - timeit ser.ix[indexer] + timeit ser.iloc[indexer] timeit ser.take(indexer) .. _indexing.index_types: @@ -661,7 +661,7 @@ Setting the index, will create create a ``CategoricalIndex`` df2 = df.set_index('B') df2.index -Indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an ``Index`` with duplicates. +Indexing with ``__getitem__/.iloc/.loc`` works similarly to an ``Index`` with duplicates. The indexers MUST be in the category or the operation will raise. .. ipython:: python @@ -759,14 +759,12 @@ same. sf = pd.Series(range(5), index=indexf) sf -Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) +Scalar selection for ``[],.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) .. ipython:: python sf[3] sf[3.0] - sf.ix[3] - sf.ix[3.0] sf.loc[3] sf.loc[3.0] @@ -783,7 +781,6 @@ Slicing is ALWAYS on the values of the index, for ``[],ix,loc`` and ALWAYS posit .. ipython:: python sf[2:4] - sf.ix[2:4] sf.loc[2:4] sf.iloc[2:4] @@ -813,14 +810,6 @@ In non-float indexes, slicing using floats will raise a ``TypeError`` In [3]: pd.Series(range(5)).iloc[3.0] TypeError: cannot do positional indexing on with these indexers [3.0] of - Further the treatment of ``.ix`` with a float indexer on a non-float index, will be label based, and thus coerce the index. - - .. ipython:: python - - s2 = pd.Series([1, 2, 3], index=list('abc')) - s2 - s2.ix[1.0] = 10 - s2 Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat irregular timedelta-like indexing scheme, but the data is recorded as floats. This could for diff --git a/doc/source/api.rst b/doc/source/api.rst index b7a1b8a005d89..04a85bf63a6f8 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -268,13 +268,12 @@ Indexing, iteration Series.get Series.at Series.iat - Series.ix Series.loc Series.iloc Series.__iter__ Series.iteritems -For more information on ``.at``, ``.iat``, ``.ix``, ``.loc``, and +For more information on ``.at``, ``.iat``, ``.loc``, and ``.iloc``, see the :ref:`indexing documentation `. Binary operator functions @@ -774,7 +773,6 @@ Indexing, iteration DataFrame.head DataFrame.at DataFrame.iat - DataFrame.ix DataFrame.loc DataFrame.iloc DataFrame.insert @@ -791,7 +789,7 @@ Indexing, iteration DataFrame.mask DataFrame.query -For more information on ``.at``, ``.iat``, ``.ix``, ``.loc``, and +For more information on ``.at``, ``.iat``, ``.loc``, and ``.iloc``, see the :ref:`indexing documentation `. @@ -1090,7 +1088,6 @@ Indexing, iteration, slicing Panel.at Panel.iat - Panel.ix Panel.loc Panel.iloc Panel.__iter__ @@ -1100,7 +1097,7 @@ Indexing, iteration, slicing Panel.major_xs Panel.minor_xs -For more information on ``.at``, ``.iat``, ``.ix``, ``.loc``, and +For more information on ``.at``, ``.iat``, ``.loc``, and ``.iloc``, see the :ref:`indexing documentation `. 
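As a quick illustration of the four accessors these API listings retain, a minimal sketch (the frame ``df`` below is hypothetical, not taken from the diff):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc'))

    df.loc['b', 'A']   # label-based selection        -> 2
    df.iloc[1, 0]      # position-based selection     -> 2
    df.at['b', 'A']    # fast scalar access by label  -> 2
    df.iat[1, 0]       # fast scalar access by position -> 2

``.at``/``.iat`` are the scalar-only fast paths, while ``.loc``/``.iloc`` also accept slices, lists and boolean masks.
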
Binary operator functions diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 2e8abe0a5c329..6f711323695c8 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -145,7 +145,7 @@ either match on the *index* or *columns* via the **axis** keyword: 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) df - row = df.ix[1] + row = df.iloc[1] column = df['two'] df.sub(row, axis='columns') @@ -556,7 +556,7 @@ course): series[::2] = np.nan series.describe() frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) - frame.ix[::2] = np.nan + frame.iloc[::2] = np.nan frame.describe() You can select specific percentiles to include in the output: @@ -1081,7 +1081,7 @@ objects either on the DataFrame's index or columns using the ``axis`` argument: .. ipython:: python - df.align(df2.ix[0], axis=1) + df.align(df2.iloc[0], axis=1) .. _basics.reindex_fill: diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 86d0c61398be1..18e429cfc92fa 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -482,7 +482,7 @@ Pivot tables: Data munging ------------ -The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix`` ``.at``, and ``.iat``, +The optimized pandas data access methods ``.loc``, ``.iloc``, ``.at``, and ``.iat``, work as normal. The only difference is the return type (for getting) and that only values already in `categories` can be assigned. @@ -501,7 +501,6 @@ the ``category`` dtype is preserved. df.iloc[2:4,:] df.iloc[2:4,:].dtypes df.loc["h":"j","cats"] - df.ix["h":"j",0:1] df[df["cats"] == "b"] An example where the category type is not preserved is if you take one single row: the diff --git a/doc/source/computation.rst b/doc/source/computation.rst index a19a56f6f1905..57480a244f308 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -84,8 +84,8 @@ in order to have a valid result. .. ipython:: python frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.ix[:5, 'a'] = np.nan - frame.ix[5:10, 'b'] = np.nan + frame.loc[frame.index[:5], 'a'] = np.nan + frame.loc[frame.index[5:10], 'b'] = np.nan frame.cov() @@ -120,7 +120,7 @@ All of these are currently computed using pairwise complete observations. .. ipython:: python frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) - frame.ix[::2] = np.nan + frame.iloc[::2] = np.nan # Series with Series frame['a'].corr(frame['b']) @@ -137,8 +137,8 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: .. ipython:: python frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.ix[:5, 'a'] = np.nan - frame.ix[5:10, 'b'] = np.nan + frame.loc[frame.index[:5], 'a'] = np.nan + frame.loc[frame.index[5:10], 'b'] = np.nan frame.corr() diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index d2df72b284a12..841195de3da47 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -66,19 +66,19 @@ An if-then on one column .. ipython:: python - df.ix[df.AAA >= 5,'BBB'] = -1; df + df.loc[df.AAA >= 5,'BBB'] = -1; df An if-then with assignment to 2 columns: .. ipython:: python - df.ix[df.AAA >= 5,['BBB','CCC']] = 555; df + df.loc[df.AAA >= 5,['BBB','CCC']] = 555; df Add another line with different logic, to do the -else .. 
ipython:: python - df.ix[df.AAA < 5,['BBB','CCC']] = 2000; df + df.loc[df.AAA < 5,['BBB','CCC']] = 2000; df Or use pandas where after you've set up a mask @@ -149,7 +149,7 @@ Building Criteria {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df aValue = 43.0 - df.ix[(df.CCC-aValue).abs().argsort()] + df.loc[(df.CCC-aValue).abs().argsort()] `Dynamically reduce a list of criteria using a binary operators `__ @@ -217,9 +217,9 @@ There are 2 explicit slicing methods, with a third general case df.loc['bar':'kar'] #Label - #Generic - df.ix[0:3] #Same as .iloc[0:3] - df.ix['bar':'kar'] #Same as .loc['bar':'kar'] + # Generic + df.iloc[0:3] + df.loc['bar':'kar'] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. @@ -231,9 +231,6 @@ Ambiguity arises when an index consists of integers with a non-zero start or non df2.loc[1:3] #Label-oriented - df2.ix[1:3] #General, will mimic loc (label-oriented) - df2.ix[0:3] #General, will mimic iloc (position-oriented), as loc[0:3] would raise a KeyError - `Using inverse operator (~) to take the complement of a mask `__ @@ -440,7 +437,7 @@ Fill forward a reversed timeseries .. ipython:: python df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A')) - df.ix[3,'A'] = np.nan + df.loc[df.index[3], 'A'] = np.nan df df.reindex(df.index[::-1]).ffill() @@ -545,7 +542,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data') - sorted_df = df.ix[agg_n_sort_order.index] + sorted_df = df.loc[agg_n_sort_order.index] sorted_df diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index a1c12044adc34..e582d4f42894b 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -221,7 +221,7 @@ Label-based indexing with integer axis labels is a thorny topic. It has been discussed heavily on mailing lists and among various members of the scientific Python community. In pandas, our general viewpoint is that labels matter more than integer locations. Therefore, with an integer axis index *only* -label-based indexing is possible with the standard tools like ``.ix``. The +label-based indexing is possible with the standard tools like ``.loc``. The following code will generate exceptions: .. code-block:: python @@ -230,7 +230,7 @@ following code will generate exceptions: s[-1] df = pd.DataFrame(np.random.randn(5, 4)) df - df.ix[-2:] + df.loc[-2:] This deliberate decision was made to prevent ambiguities and subtle bugs (many users reported finding bugs when the API change was made to stop "falling back" @@ -305,7 +305,7 @@ index can be somewhat complicated. For example, the following does not work: :: - s.ix['c':'e'+1] + s.loc['c':'e'+1] A very common use case is to limit a time series to start and end at two specific dates. To enable this, we made the design design to make label-based @@ -313,7 +313,7 @@ slicing include both endpoints: .. ipython:: python - s.ix['c':'e'] + s.loc['c':'e'] This is most definitely a "practicality beats purity" sort of thing, but it is something to watch out for if you expect label-based slicing to behave exactly @@ -322,58 +322,6 @@ in the way that standard Python integer slicing works. 
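The endpoint behaviour described above is easy to check in isolation; a minimal sketch, assuming a hypothetical five-element series:

.. code-block:: python

    import pandas as pd

    s = pd.Series(range(5), index=list('abcde'))

    s.loc['c':'e']   # label slice: includes BOTH endpoints -> c, d, e
    s.iloc[2:4]      # positional slice: half-open as in Python -> c, d

Only the label-based slice includes the stop label; positional slicing stays half-open, exactly as the gotcha warns.
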
Miscellaneous indexing gotchas ------------------------------ -Reindex versus ix gotchas -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Many users will find themselves using the ``ix`` indexing capabilities as a -concise means of selecting data from a pandas object: - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(6, 4), columns=['one', 'two', 'three', 'four'], - index=list('abcdef')) - df - df.ix[['b', 'c', 'e']] - -This is, of course, completely equivalent *in this case* to using the -``reindex`` method: - -.. ipython:: python - - df.reindex(['b', 'c', 'e']) - -Some might conclude that ``ix`` and ``reindex`` are 100% equivalent based on -this. This is indeed true **except in the case of integer indexing**. For -example, the above operation could alternately have been expressed as: - -.. ipython:: python - - df.ix[[1, 2, 4]] - -If you pass ``[1, 2, 4]`` to ``reindex`` you will get another thing entirely: - -.. ipython:: python - - df.reindex([1, 2, 4]) - -So it's important to remember that ``reindex`` is **strict label indexing -only**. This can lead to some potentially surprising results in pathological -cases where an index contains, say, both integers and strings: - -.. ipython:: python - - s = pd.Series([1, 2, 3], index=['a', 0, 1]) - s - s.ix[[0, 1]] - s.reindex([0, 1]) - -Because the index in this case does not contain solely integers, ``ix`` falls -back on integer indexing. By contrast, ``reindex`` only looks for the values -passed in the index, thus finding the integers ``0`` and ``1``. While it would -be possible to insert some logic to check whether a passed sequence is all -contained in the index, that logic would exact a very high cost in large data -sets. - Reindex potentially changes underlying Series dtype ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 1ea6662a4edb0..9c653f6d9eeff 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -61,6 +61,8 @@ See the :ref:`MultiIndex / Advanced Indexing ` for ``MultiIndex`` and See the :ref:`cookbook` for some advanced strategies +.. _indexing.choice: + Different Choices for Indexing ------------------------------ @@ -104,24 +106,13 @@ of multi-axis indexing. See more at :ref:`Selection by Position ` -- ``.ix`` supports mixed integer and label based access. It is primarily label - based, but will fall back to integer positional access unless the corresponding - axis is of integer type. ``.ix`` is the most general and will - support any of the inputs in ``.loc`` and ``.iloc``. ``.ix`` also supports floating point - label schemes. ``.ix`` is exceptionally useful when dealing with mixed positional - and label based hierarchical indexes. - - However, when an axis is integer based, ONLY - label based access and not positional access is supported. - Thus, in such cases, it's usually better to be explicit and use ``.iloc`` or ``.loc``. - See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical `. -- ``.loc``, ``.iloc``, ``.ix`` and also ``[]`` indexing can accept a ``callable`` as indexer. See more at :ref:`Selection By Callable `. +- ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. See more at :ref:`Selection By Callable `. Getting values from an object with multi-axes selection uses the following -notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as +notation (using ``.loc`` as an example, but applies to ``.iloc`` as well). Any of the axes accessors may be the null slice ``:``. 
Axes left out of the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to ``p.loc['a', :, :]``) @@ -193,7 +184,7 @@ columns. .. warning:: - pandas aligns all AXES when setting ``Series`` and ``DataFrame`` from ``.loc``, ``.iloc`` and ``.ix``. + pandas aligns all AXES when setting ``Series`` and ``DataFrame`` from ``.loc``, and ``.iloc``. This will **not** modify ``df`` because the column alignment is before value assignment. @@ -526,7 +517,7 @@ Selection By Callable .. versionadded:: 0.18.1 -``.loc``, ``.iloc``, ``.ix`` and also ``[]`` indexing can accept a ``callable`` as indexer. +``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. The ``callable`` must be a function with one argument (the calling Series, DataFrame or Panel) and that returns valid output for indexing. .. ipython:: python @@ -559,6 +550,58 @@ without using temporary variable. (bb.groupby(['year', 'team']).sum() .loc[lambda df: df.r > 100]) +.. _indexing.deprecate_ix: + +IX Indexer is Deprecated +------------------------ + +.. warning:: + + Starting in 0.20.0, the ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` +and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to +do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*. This has caused +quite a bit of user confusion over the years. + +The recommended methods of indexing are: + +.. ipython:: python + + dfd = pd.DataFrame({'A': [1, 2, 3], + 'B': [4, 5, 6]}, + index=list('abc')) + + dfd + +Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. + +.. code-block:: ipython + + In [3]: dfd.ix[[0, 2], 'A'] + Out[3]: + a 1 + c 3 + Name: A, dtype: int64 + +Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing. + +.. ipython:: python + + dfd.loc[df.index[[0, 2]], 'A'] + +This can also be expressed using ``.iloc``, by explicitly getting locations on the indexers, and using +*positional* indexing to select things. + +.. ipython:: python + + dfd.iloc[[0, 2], dfd.columns.get_loc('A')] + +For getting *multiple* indexers, using ``.get_indexer`` + +.. ipython:: python + + dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])] + + .. _indexing.basics.partial_setting: Selecting Random Samples @@ -641,7 +684,7 @@ Setting With Enlargement .. versionadded:: 0.13 -The ``.loc/.ix/[]`` operations can perform enlargement when setting a non-existant key for that axis. +The ``.loc/[]`` operations can perform enlargement when setting a non-existant key for that axis. In the ``Series`` case this is effectively an appending operation @@ -906,7 +949,7 @@ without creating a copy: Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), such that partial selection with setting is possible. This is analogous to -partial setting via ``.ix`` (but on the contents rather than the axis labels) +partial setting via ``.loc`` (but on the contents rather than the axis labels) .. ipython:: python @@ -1716,7 +1759,7 @@ A chained assignment can also crop up in setting in a mixed dtype frame. .. 
note:: - These setting rules apply to all of ``.loc/.iloc/.ix`` + These setting rules apply to all of ``.loc/.iloc`` This is the correct access method diff --git a/doc/source/io.rst b/doc/source/io.rst index 9f5e6f2331bc5..5483248e6e3d4 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1400,7 +1400,7 @@ returned object: df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) df - df.ix[1978] + df.iloc[1978] .. _io.multi_index_columns: @@ -3280,7 +3280,7 @@ defaults to `nan`. 'bool' : True, 'datetime64' : pd.Timestamp('20010102')}, index=list(range(8))) - df_mixed.ix[3:5,['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) df_mixed1 = store.select('df_mixed') @@ -3560,10 +3560,10 @@ be data_columns df_dc = df.copy() df_dc['string'] = 'foo' - df_dc.ix[4:6,'string'] = np.nan - df_dc.ix[7:9,'string'] = 'bar' + df_dc.loc[df_dc.index[4:6], 'string'] = np.nan + df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc.ix[1:3,['B','C']] = 1.0 + df_dc.loc[df_dc.index[1:3], ['B','C']] = 1.0 df_dc # on-disk operations @@ -3727,7 +3727,7 @@ results. df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' - df_mt.ix[1, ('A', 'B')] = np.nan + df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan # you can also create the tables individually store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, @@ -4686,7 +4686,7 @@ destination DataFrame as well as a preferred column order as follows: Starting with 0.20.0, you can specify the query config as parameter to use additional options of your job. -For more information about query configuration parameters see +For more information about query configuration parameters see `here `__. .. code-block:: python diff --git a/doc/source/merging.rst b/doc/source/merging.rst index f95987afd4c77..f732f0a4cc749 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -132,7 +132,7 @@ means that we can now do stuff like select out each chunk by key: .. ipython:: python - result.ix['y'] + result.loc['y'] It's not a stretch to see how this can be very useful. More detail on this functionality below. @@ -692,7 +692,7 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=['left', 'right'], vertical=False); plt.close('all'); - + Here is another example with duplicate join keys in DataFrames: .. ipython:: python @@ -710,10 +710,10 @@ Here is another example with duplicate join keys in DataFrames: p.plot([left, right], result, labels=['left', 'right'], vertical=False); plt.close('all'); - + .. warning:: - Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, + Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames. .. _merging.indicator: diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 3ede97a902696..37930775885e3 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -113,7 +113,7 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``. 
df2 = df.copy() df2['timestamp'] = pd.Timestamp('20120101') df2 - df2.ix[['a','c','h'],['one','timestamp']] = np.nan + df2.loc[['a','c','h'],['one','timestamp']] = np.nan df2 df2.get_dtype_counts() @@ -155,9 +155,9 @@ objects. .. ipython:: python :suppress: - df = df2.ix[:, ['one', 'two', 'three']] - a = df2.ix[:5, ['one', 'two']].fillna(method='pad') - b = df2.ix[:5, ['one', 'two', 'three']] + df = df2.loc[:, ['one', 'two', 'three']] + a = df2.loc[df2.index[:5], ['one', 'two']].fillna(method='pad') + b = df2.loc[df2.index[:5], ['one', 'two', 'three']] .. ipython:: python @@ -237,7 +237,7 @@ we can use the `limit` keyword: .. ipython:: python :suppress: - df.ix[2:4, :] = np.nan + df.iloc[2:4, :] = np.nan .. ipython:: python diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index f90c2960fa30c..eccaa9474bf6d 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -217,7 +217,7 @@ calling ``sort_index``, of course). Here is a more complex example: ('one', 'two')], names=['first', 'second']) df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns) - df2 = df.ix[[0, 1, 2, 4, 5, 7]] + df2 = df.iloc[[0, 1, 2, 4, 5, 7]] df2 As mentioned above, ``stack`` can be called with a ``level`` argument to select diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index d3f921f8762cc..2bc5d3f6dd0f5 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -45,7 +45,7 @@ large, mostly NA DataFrame: .. ipython:: python df = pd.DataFrame(randn(10000, 4)) - df.ix[:9998] = np.nan + df.iloc[:9998] = np.nan sdf = df.to_sparse() sdf sdf.density diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index e3b186abe53fc..2b2012dbf0b8a 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -134,7 +134,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.ix[5].plot(kind='bar'); plt.axhline(0, color='k') + df.iloc[5].plot(kind='bar'); plt.axhline(0, color='k') .. versionadded:: 0.17.0 @@ -179,7 +179,7 @@ For labeled, non-time series data, you may wish to produce a bar plot: plt.figure(); @savefig bar_plot_ex.png - df.ix[5].plot.bar(); plt.axhline(0, color='k') + df.iloc[5].plot.bar(); plt.axhline(0, color='k') Calling a DataFrame's :meth:`plot.bar() ` method produces a multiple bar plot: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2ff7323e5f8ef..f7c05a3f373f5 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -10,6 +10,7 @@ users upgrade to this version. Highlights include: - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) +- The ``.ix`` indexer has been deprecated, see :ref:`here ` Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -143,6 +144,54 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew.api_breaking.deprecate_ix + +Deprecate .ix +^^^^^^^^^^^^^ + +The ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*. This has caused quite a bit of user confusion over the years. The full indexing documentation are :ref:`here `. (:issue:`14218`) + + +The recommended methods of indexing are: + +- ``.loc`` if you want to *label* index +- ``.iloc`` if you want to *positionally* index. 
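Because this change makes ``.ix`` warn (see the ``_IXIndexer.__init__`` hunk in ``pandas/core/indexing.py`` further down), one rough way to smoke-test code during migration is to record the warning explicitly. A sketch, assuming the patched pandas and a hypothetical frame ``df``:

.. code-block:: python

    import warnings
    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]}, index=list('abc'))

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        df.ix['a', 'A']   # deprecated; df.loc['a', 'A'] is the replacement
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
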
+ +Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code `here `_ + + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': [4, 5, 6]}, + index=list('abc')) + + df + +Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. + +.. code-block:: ipython + + In [3]: df.ix[[0, 2], 'A'] + Out[3]: + a 1 + c 3 + Name: A, dtype: int64 + +Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing. + +.. ipython:: python + + df.loc[df.index[[0, 2]], 'A'] + +Using ``.iloc``. Here we will get the location of the 'A' column, then use *positional* indexing to select things. + +.. ipython:: python + + df.iloc[[0, 2], df.columns.get_loc('A')] + + .. _whatsnew.api_breaking.index_map Map on Index types now return other Index types diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4288e03c2cc49..504554d6410f9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1961,7 +1961,7 @@ def _ixs(self, i, axis=0): if isinstance(i, slice): # need to return view lab_slice = slice(label[0], label[-1]) - return self.ix[:, lab_slice] + return self.loc[:, lab_slice] else: if isinstance(label, Index): return self.take(i, axis=1, convert=True) @@ -2056,7 +2056,7 @@ def _getitem_array(self, key): indexer = key.nonzero()[0] return self.take(indexer, axis=0, convert=False) else: - indexer = self.ix._convert_to_indexer(key, axis=1) + indexer = self.loc._convert_to_indexer(key, axis=1) return self.take(indexer, axis=1, convert=True) def _getitem_multilevel(self, key): @@ -2389,7 +2389,7 @@ def __setitem__(self, key, value): def _setitem_slice(self, key, value): self._check_setitem_copy() - self.ix._setitem_with_indexer(key, value) + self.loc._setitem_with_indexer(key, value) def _setitem_array(self, key, value): # also raises Exception if object array with NA values @@ -2400,7 +2400,7 @@ def _setitem_array(self, key, value): key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() - self.ix._setitem_with_indexer(indexer, value) + self.loc._setitem_with_indexer(indexer, value) else: if isinstance(value, DataFrame): if len(value.columns) != len(key): @@ -2408,9 +2408,9 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: - indexer = self.ix._convert_to_indexer(key, axis=1) + indexer = self.loc._convert_to_indexer(key, axis=1) self._check_setitem_copy() - self.ix._setitem_with_indexer((slice(None), indexer), value) + self.loc._setitem_with_indexer((slice(None), indexer), value) def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. 
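The hunks above reroute ``DataFrame.__setitem__`` internals from ``self.ix`` to ``self.loc``. The user-facing forms that exercise those paths look roughly like this (a sketch with a hypothetical frame; the patch changes only the internal indexer used, not the behaviour):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({'A': [1, -2, 3], 'B': [4, 5, 6]})

    # slice key -> _setitem_slice -> .loc machinery
    df[0:2] = 0

    # boolean-mask key -> _setitem_array -> .loc machinery
    df[df['B'] > 4] = -1
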
@@ -4403,7 +4403,7 @@ def append(self, other, ignore_index=False, verify_integrity=False): elif isinstance(other, list) and not isinstance(other[0], DataFrame): other = DataFrame(other) if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.ix[:, self.columns] + other = other.loc[:, self.columns] from pandas.tools.merge import concat if isinstance(other, (list, tuple)): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b5767da74cad..f96cc9e8515de 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1809,18 +1809,12 @@ def xs(self, key, axis=0, level=None, drop_level=True): loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) - # convert to a label indexer if needed - if isinstance(loc, slice): - lev_num = labels._get_level_number(level) - if labels.levels[lev_num].inferred_type == 'integer': - loc = labels[loc] - # create the tuple of the indexer indexer = [slice(None)] * self.ndim indexer[axis] = loc indexer = tuple(indexer) - result = self.ix[indexer] + result = self.iloc[indexer] setattr(result, result._get_axis_name(axis), new_ax) return result @@ -1983,7 +1977,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer - result = self.ix[tuple(slicer)] + result = self.loc[tuple(slicer)] if inplace: self._update_inplace(result) @@ -4332,8 +4326,9 @@ def first(self, offset): if not offset.isAnchored() and hasattr(offset, '_inc'): if end_date in self.index: end = self.index.searchsorted(end_date, side='left') + return self.iloc[:end] - return self.ix[:end] + return self.loc[:end] def last(self, offset): """ @@ -4364,7 +4359,7 @@ def last(self, offset): start_date = start = self.index[-1] - offset start = self.index.searchsorted(start_date, side='right') - return self.ix[start:] + return self.iloc[start:] def rank(self, axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False): @@ -5078,7 +5073,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): slicer = [slice(None, None)] * self._AXIS_LEN slicer[axis] = slice(before, after) - result = self.ix[tuple(slicer)] + result = self.loc[tuple(slicer)] if isinstance(ax, MultiIndex): setattr(result, self._get_axis_name(axis), diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 700e279cb0030..31df22cc11b1d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4103,7 +4103,7 @@ def _chop(self, sdata, slice_obj): if self.axis == 0: return sdata.iloc[slice_obj] else: - return sdata._slice(slice_obj, axis=1) # ix[:, slice_obj] + return sdata._slice(slice_obj, axis=1) # .loc[:, slice_obj] class NDFrameSplitter(DataSplitter): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0db5103a18807..40050d6d769a6 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,5 +1,6 @@ # pylint: disable=W0223 +import warnings import numpy as np from pandas.compat import range, zip import pandas.compat as compat @@ -119,8 +120,12 @@ def _get_setitem_indexer(self, key): except Exception: pass - if isinstance(key, tuple) and not self.ndim < len(key): - return self._convert_tuple(key, is_setter=True) + if isinstance(key, tuple): + try: + return self._convert_tuple(key, is_setter=True) + except IndexingError: + pass + if isinstance(key, range): return self._convert_range(key, is_setter=True) @@ -182,6 +187,8 @@ def _convert_tuple(self, key, is_setter=False): keyidx.append(slice(None)) 
else: for i, k in enumerate(key): + if i >= self.obj.ndim: + raise IndexingError('Too many indexers') idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) keyidx.append(idx) return tuple(keyidx) @@ -1284,6 +1291,20 @@ class _IXIndexer(_NDFrameIndexer): """ + def __init__(self, obj, name): + + _ix_deprecation_warning = """ +.ix is deprecated. Please use +.loc for label based indexing or +.iloc for positional indexing + +See the documentation here: +http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix""" + + warnings.warn(_ix_deprecation_warning, + DeprecationWarning, stacklevel=3) + super(_IXIndexer, self).__init__(obj, name) + def _has_valid_type(self, key, axis): if isinstance(key, slice): return True @@ -1619,11 +1640,17 @@ def _is_valid_list_like(self, key, axis): # return a boolean if we are a valid list-like (e.g. that we don't # have out-of-bounds values) + # a tuple should already have been caught by this point + # so don't treat a tuple as a valid indexer + if isinstance(key, tuple): + raise IndexingError('Too many indexers') + # coerce the key to not exceed the maximum size of the index arr = np.array(key) ax = self.obj._get_axis(axis) l = len(ax) - if len(arr) and (arr.max() >= l or arr.min() < -l): + if (hasattr(arr, '__len__') and len(arr) and + (arr.max() >= l or arr.min() < -l)): raise IndexError("positional indexers are out-of-bounds") return True diff --git a/pandas/core/panel.py b/pandas/core/panel.py index f708774dd84ff..a11ef53de1af9 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -272,7 +272,7 @@ def __getitem__(self, key): return self._getitem_multilevel(key) if not (is_list_like(key) or isinstance(key, slice)): return super(Panel, self).__getitem__(key) - return self.ix[key] + return self.loc[key] def _getitem_multilevel(self, key): info = self._info_axis diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 0831568e8c955..d6287f17c8387 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -450,7 +450,7 @@ def _unstack_frame(obj, level, fill_value=None): result = DataFrame(BlockManager(new_blocks, new_axes)) mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) - return result.ix[:, mask_frame.sum(0) > 0] + return result.loc[:, mask_frame.sum(0) > 0] else: unstacker = _Unstacker(obj.values, obj.index, level=level, value_columns=obj.columns, @@ -625,12 +625,12 @@ def _convert_level_number(level_num, columns): drop_cols.append(key) continue elif slice_len != levsize: - chunk = this.ix[:, this.columns[loc]] + chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: - value_slice = this.ix[:, this.columns[loc]].values + value_slice = this.loc[:, this.columns[loc]].values else: value_slice = this.values[:, loc] @@ -771,7 +771,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if value_vars is not None: if not isinstance(value_vars, (tuple, list, np.ndarray)): value_vars = [value_vars] - frame = frame.ix[:, id_vars + value_vars] + frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() diff --git a/pandas/core/series.py b/pandas/core/series.py index d967e2d02d41f..9845e1cd4ad47 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -685,7 +685,7 @@ def _get_with(self, key): try: # handle the dup indexing case (GH 4246) if isinstance(key, (list, tuple)): - return self.ix[key] + return self.loc[key] return self.reindex(key) except 
Exception: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7af200114f31c..7b4800115b23b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1043,7 +1043,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, valid_index = next(idxs) for index in idxs: valid_index = valid_index.intersection(index) - value = value.ix[valid_index] + value = value.loc[valid_index] # append for k, v in d.items(): @@ -3579,8 +3579,8 @@ def process_filter(field, filt): filt = filt.union(Index(self.levels)) takers = op(axis_values, filt) - return obj.ix._getitem_axis(takers, - axis=axis_number) + return obj.loc._getitem_axis(takers, + axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: @@ -3593,8 +3593,8 @@ def process_filter(field, filt): if isinstance(obj, DataFrame): axis_number = 1 - axis_number takers = op(values, filt) - return obj.ix._getitem_axis(takers, - axis=axis_number) + return obj.loc._getitem_axis(takers, + axis=axis_number) raise ValueError( "cannot find the field [%s] for filtering!" % field) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index aaa9752dc6d46..48e92feeb3efe 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -438,9 +438,9 @@ def test_v12_compat(self): columns=['A', 'B', 'C', 'D'], index=pd.date_range('2000-01-03', '2000-01-07')) df['date'] = pd.Timestamp('19920106 18:21:32.12') - df.ix[3, 'date'] = pd.Timestamp('20130101') + df.iloc[3, df.columns.get_loc('date')] = pd.Timestamp('20130101') df['modified'] = df['date'] - df.ix[1, 'modified'] = pd.NaT + df.iloc[1, df.columns.get_loc('modified')] = pd.NaT v12_json = os.path.join(self.dirpath, 'tsframe_v012.json') df_unser = pd.read_json(v12_json) @@ -650,8 +650,8 @@ def test_date_format_frame(self): def test_w_date(date, date_unit=None): df['date'] = Timestamp(date) - df.ix[1, 'date'] = pd.NaT - df.ix[5, 'date'] = pd.NaT + df.iloc[1, df.columns.get_loc('date')] = pd.NaT + df.iloc[5, df.columns.get_loc('date')] = pd.NaT if date_unit: json = df.to_json(date_format='iso', date_unit=date_unit) else: @@ -671,8 +671,8 @@ def test_w_date(date, date_unit=None): def test_date_format_series(self): def test_w_date(date, date_unit=None): ts = Series(Timestamp(date), index=self.ts.index) - ts.ix[1] = pd.NaT - ts.ix[5] = pd.NaT + ts.iloc[1] = pd.NaT + ts.iloc[5] = pd.NaT if date_unit: json = ts.to_json(date_format='iso', date_unit=date_unit) else: @@ -693,9 +693,10 @@ def test_w_date(date, date_unit=None): def test_date_unit(self): df = self.tsframe.copy() df['date'] = Timestamp('20130101 20:43:42') - df.ix[1, 'date'] = Timestamp('19710101 20:43:42') - df.ix[2, 'date'] = Timestamp('21460101 20:43:42') - df.ix[4, 'date'] = pd.NaT + dl = df.columns.get_loc('date') + df.iloc[1, dl] = Timestamp('19710101 20:43:42') + df.iloc[2, dl] = Timestamp('21460101 20:43:42') + df.iloc[4, dl] = pd.NaT for unit in ('s', 'ms', 'us', 'ns'): json = df.to_json(date_format='epoch', date_unit=unit) @@ -894,14 +895,14 @@ def test_datetime_tz(self): def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = pd.DataFrame(np.random.randn(10, 4)) - df.ix[:8] = np.nan + df.loc[:8] = np.nan sdf = df.to_sparse() expected = df.to_json() self.assertEqual(expected, sdf.to_json()) s = pd.Series(np.random.randn(10)) - s.ix[:8] = np.nan + s.loc[:8] = np.nan ss = s.to_sparse() expected = s.to_json() diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 
6a05dada05e0d..9655c481b763a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -275,7 +275,7 @@ def test_read_csv_no_index_name(self): pd.Index(['A', 'B', 'C', 'D', 'E'])) self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) - self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, + self.assertEqual(df.loc[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) tm.assert_frame_equal(df, df2) @@ -666,7 +666,7 @@ def test_skipinitialspace(self): # it's 33 columns result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'], header=None, skipinitialspace=True) - self.assertTrue(pd.isnull(result.ix[0, 29])) + self.assertTrue(pd.isnull(result.iloc[0, 29])) def test_utf16_bom_skiprows(self): # #2298 diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index 9fe49f616c5f2..e4af1ff70a498 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -64,7 +64,7 @@ def func(*date_cols): self.assertNotIn('X3', df) d = datetime(1999, 1, 27, 19, 0) - self.assertEqual(df.ix[0, 'nominal'], d) + self.assertEqual(df.loc[0, 'nominal'], d) df = self.read_csv(StringIO(data), header=None, date_parser=func, @@ -96,7 +96,7 @@ def func(*date_cols): self.assertNotIn('X3', df) d = datetime(1999, 1, 27, 19, 0) - self.assertEqual(df.ix[0, 'X1_X2'], d) + self.assertEqual(df.loc[0, 'X1_X2'], d) df = self.read_csv(StringIO(data), header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True) diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py index 3a0dd4eaa09e5..99abbacb604fa 100644 --- a/pandas/io/tests/test_date_converters.py +++ b/pandas/io/tests/test_date_converters.py @@ -40,7 +40,7 @@ def test_parse_date_time(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_time) self.assertIn('date_time', df) - self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0)) + self.assertEqual(df.date_time.loc[0], datetime(2001, 1, 5, 10, 0, 0)) data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" @@ -65,7 +65,7 @@ def test_parse_date_fields(self): parse_dates=datecols, date_parser=conv.parse_date_fields) self.assertIn('ymd', df) - self.assertEqual(df.ymd.ix[0], datetime(2001, 1, 10)) + self.assertEqual(df.ymd.loc[0], datetime(2001, 1, 10)) def test_datetime_six_col(self): result = conv.parse_all_fields(self.years, self.months, self.days, @@ -82,7 +82,7 @@ def test_datetime_six_col(self): parse_dates=datecols, date_parser=conv.parse_all_fields) self.assertIn('ymdHMS', df) - self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0)) + self.assertEqual(df.ymdHMS.loc[0], datetime(2001, 1, 5, 10, 0, 0)) def test_datetime_fractional_seconds(self): data = """\ @@ -95,10 +95,10 @@ def test_datetime_fractional_seconds(self): parse_dates=datecols, date_parser=conv.parse_all_fields) self.assertIn('ymdHMS', df) - self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0, - microsecond=123456)) - self.assertEqual(df.ymdHMS.ix[1], datetime(2001, 1, 5, 10, 0, 0, - microsecond=500000)) + self.assertEqual(df.ymdHMS.loc[0], datetime(2001, 1, 5, 10, 0, 0, + microsecond=123456)) + self.assertEqual(df.ymdHMS.loc[1], datetime(2001, 1, 5, 10, 0, 0, + microsecond=500000)) def test_generic(self): data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
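The hunks in this file convert scalar lookups on frames whose date columns are combined at parse time. A standalone sketch of that pattern, using the stock ``parse_dates`` column-combining form of ``read_csv`` rather than the test suite's custom ``date_parser`` helpers (the inline data is hypothetical):

.. code-block:: python

    from io import StringIO
    import pandas as pd

    data = "year,month,day,value\n2001,1,10,10.0\n2001,2,1,11.0\n"
    df = pd.read_csv(StringIO(data),
                     parse_dates={'ymd': ['year', 'month', 'day']})

    df.ymd.loc[0]   # Timestamp('2001-01-10 00:00:00'), via label lookup
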
@@ -108,7 +108,7 @@ def test_generic(self): parse_dates=datecols, date_parser=dateconverter) self.assertIn('ym', df) - self.assertEqual(df.ym.ix[0], date(2001, 1, 1)) + self.assertEqual(df.ym.loc[0], date(2001, 1, 1)) def test_dateparser_resolution_if_not_ns(self): # issue 10245 diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 4b8e7714a911e..9dd2739608b7a 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -276,12 +276,12 @@ def test_excel_table_sheet_by_index(self): df3 = read_excel(excel, 0, index_col=0, skipfooter=1) df4 = read_excel(excel, 0, index_col=0, skip_footer=1) - tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df1.iloc[:-1]) tm.assert_frame_equal(df3, df4) df3 = excel.parse(0, index_col=0, skipfooter=1) df4 = excel.parse(0, index_col=0, skip_footer=1) - tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df1.iloc[:-1]) tm.assert_frame_equal(df3, df4) import xlrd @@ -302,7 +302,7 @@ def test_excel_table(self): skipfooter=1) df4 = self.get_exceldf('test1', 'Sheet1', index_col=0, skip_footer=1) - tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df1.iloc[:-1]) tm.assert_frame_equal(df3, df4) def test_reader_special_dtypes(self): @@ -328,7 +328,7 @@ def test_reader_special_dtypes(self): # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) - float_expected.loc[1, "Str2Col"] = 3.0 + float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 actual = self.get_exceldf(basename, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) @@ -1670,15 +1670,15 @@ def roundtrip(df, header=True, parser_hdr=0, index=True): # no nans for r in range(len(res.index)): for c in range(len(res.columns)): - self.assertTrue(res.ix[r, c] is not np.nan) + self.assertTrue(res.iloc[r, c] is not np.nan) res = roundtrip(DataFrame([0])) self.assertEqual(res.shape, (1, 1)) - self.assertTrue(res.ix[0, 0] is not np.nan) + self.assertTrue(res.iloc[0, 0] is not np.nan) res = roundtrip(DataFrame([0]), False, None) self.assertEqual(res.shape, (1, 2)) - self.assertTrue(res.ix[0, 0] is not np.nan) + self.assertTrue(res.iloc[0, 0] is not np.nan) def test_excel_010_hemstring_raises_NotImplementedError(self): # This test was failing only for j>1 and header=False, diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index f4eec864da572..9ac8def3a074d 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -144,7 +144,7 @@ def test_spam_no_types(self): df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) - self.assertEqual(df1[0].ix[0, 0], 'Proximates') + self.assertEqual(df1[0].iloc[0, 0], 'Proximates') self.assertEqual(df1[0].columns[0], 'Nutrient') def test_spam_with_types(self): @@ -152,7 +152,7 @@ def test_spam_with_types(self): df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) - self.assertEqual(df1[0].ix[0, 0], 'Proximates') + self.assertEqual(df1[0].iloc[0, 0], 'Proximates') self.assertEqual(df1[0].columns[0], 'Nutrient') def test_spam_no_match(self): diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 63c2ffc629ca6..6b368bb2bb5ce 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -531,8 +531,8 @@ def test_sparse_series(self): def test_sparse_frame(self): s = tm.makeDataFrame() - s.ix[3:5, 1:3] = np.nan - s.ix[8:10, 
-2] = np.nan
+        s.loc[3:5, 1:3] = np.nan
+        s.loc[8:10, -2] = np.nan
 
         ss = s.to_sparse()
         self._check_roundtrip(ss, tm.assert_frame_equal,
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index b23d0b89fe850..40db10c42d5a7 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -426,7 +426,7 @@ def test_repr(self):
         df['timestamp2'] = Timestamp('20010103')
         df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
         df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
-        df.ix[3:6, ['obj1']] = np.nan
+        df.loc[3:6, ['obj1']] = np.nan
         df = df.consolidate()._convert(datetime=True)
 
         warnings.filterwarnings('ignore', category=PerformanceWarning)
@@ -770,7 +770,7 @@ def test_put_mixed_type(self):
         df['timestamp2'] = Timestamp('20010103')
         df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
         df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
-        df.ix[3:6, ['obj1']] = np.nan
+        df.loc[3:6, ['obj1']] = np.nan
         df = df.consolidate()._convert(datetime=True)
 
         with ensure_clean_store(self.path) as store:
@@ -815,8 +815,8 @@ def test_append(self):
             # panel
             wp = tm.makePanel()
             _maybe_remove(store, 'wp1')
-            store.append('wp1', wp.ix[:, :10, :])
-            store.append('wp1', wp.ix[:, 10:, :])
+            store.append('wp1', wp.iloc[:, :10, :])
+            store.append('wp1', wp.iloc[:, 10:, :])
             assert_panel_equal(store['wp1'], wp)
 
             # ndim
@@ -824,15 +824,15 @@ def test_append(self):
                     check_stacklevel=False):
                 p4d = tm.makePanel4D()
                 _maybe_remove(store, 'p4d')
-                store.append('p4d', p4d.ix[:, :, :10, :])
-                store.append('p4d', p4d.ix[:, :, 10:, :])
+                store.append('p4d', p4d.iloc[:, :, :10, :])
+                store.append('p4d', p4d.iloc[:, :, 10:, :])
                 assert_panel4d_equal(store['p4d'], p4d)
 
                 # test using axis labels
                 _maybe_remove(store, 'p4d')
-                store.append('p4d', p4d.ix[:, :, :10, :], axes=[
+                store.append('p4d', p4d.iloc[:, :, :10, :], axes=[
                     'items', 'major_axis', 'minor_axis'])
-                store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
+                store.append('p4d', p4d.iloc[:, :, 10:, :], axes=[
                     'items', 'major_axis', 'minor_axis'])
                 assert_panel4d_equal(store['p4d'], p4d)
 
@@ -847,16 +847,16 @@ def test_append(self):
 
             # test using different order of items on the non-index axes
             _maybe_remove(store, 'wp1')
-            wp_append1 = wp.ix[:, :10, :]
+            wp_append1 = wp.iloc[:, :10, :]
             store.append('wp1', wp_append1)
-            wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1])
+            wp_append2 = wp.iloc[:, 10:, :].reindex(items=wp.items[::-1])
             store.append('wp1', wp_append2)
             assert_panel_equal(store['wp1'], wp)
 
             # dtype issues - mixed type in a single object column
             df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
             df['mixed_column'] = 'testing'
-            df.ix[2, 'mixed_column'] = np.nan
+            df.loc[2, 'mixed_column'] = np.nan
             _maybe_remove(store, 'df')
             store.append('df', df)
             tm.assert_frame_equal(store['df'], df)
@@ -1040,14 +1040,14 @@ def test_append_some_nans(self):
                            index=np.arange(20))
             # some nans
             _maybe_remove(store, 'df1')
-            df.ix[0:15, ['A1', 'B', 'D', 'E']] = np.nan
+            df.loc[0:15, ['A1', 'B', 'D', 'E']] = np.nan
             store.append('df1', df[:10])
             store.append('df1', df[10:])
             tm.assert_frame_equal(store['df1'], df)
 
             # first column
             df1 = df.copy()
-            df1.ix[:, 'A1'] = np.nan
+            df1.loc[:, 'A1'] = np.nan
             _maybe_remove(store, 'df1')
             store.append('df1', df1[:10])
             store.append('df1', df1[10:])
@@ -1055,7 +1055,7 @@
 
             # 2nd column
             df2 = df.copy()
-            df2.ix[:, 'A2'] = np.nan
+            df2.loc[:, 'A2'] = np.nan
             _maybe_remove(store, 'df2')
             store.append('df2', df2[:10])
             store.append('df2', df2[10:])
@@ -1063,7 +1063,7 @@ def test_append_some_nans(self):
 
             # datetimes
             df3 = df.copy()
-            df3.ix[:, 'E'] = np.nan
+            df3.loc[:, 'E'] = np.nan
             _maybe_remove(store, 'df3')
             store.append('df3', df3[:10])
             store.append('df3', df3[10:])
@@ -1076,7 +1076,7 @@ def test_append_all_nans(self):
 
             df = DataFrame({'A1': np.random.randn(20),
                             'A2': np.random.randn(20)},
                            index=np.arange(20))
-            df.ix[0:15, :] = np.nan
+            df.loc[0:15, :] = np.nan
 
             # nan some entire rows (dropna=True)
             _maybe_remove(store, 'df')
@@ -1109,7 +1109,7 @@ def test_append_all_nans(self):
                             'B': 'foo', 'C': 'bar'},
                            index=np.arange(20))
-            df.ix[0:15, :] = np.nan
+            df.loc[0:15, :] = np.nan
 
             _maybe_remove(store, 'df')
             store.append('df', df[:10], dropna=True)
@@ -1130,7 +1130,7 @@ def test_append_all_nans(self):
                             'E': datetime.datetime(2001, 1, 2, 0, 0)},
                            index=np.arange(20))
 
-            df.ix[0:15, :] = np.nan
+            df.loc[0:15, :] = np.nan
 
             _maybe_remove(store, 'df')
             store.append('df', df[:10], dropna=True)
@@ -1173,8 +1173,8 @@ def test_append_frame_column_oriented(self):
             # column oriented
             df = tm.makeTimeDataFrame()
             _maybe_remove(store, 'df1')
-            store.append('df1', df.ix[:, :2], axes=['columns'])
-            store.append('df1', df.ix[:, 2:])
+            store.append('df1', df.iloc[:, :2], axes=['columns'])
+            store.append('df1', df.iloc[:, 2:])
             tm.assert_frame_equal(store['df1'], df)
 
             result = store.select('df1', 'columns=A')
@@ -1250,37 +1250,37 @@ def check_indexers(key, indexers):
             indexers = ['items', 'major_axis', 'minor_axis']
 
             _maybe_remove(store, 'p4d')
-            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
-            store.append('p4d', p4d.ix[:, :, 10:, :])
+            store.append('p4d', p4d.iloc[:, :, :10, :], axes=indexers)
+            store.append('p4d', p4d.iloc[:, :, 10:, :])
             assert_panel4d_equal(store.select('p4d'), p4d)
             check_indexers('p4d', indexers)
 
             # same as above, but try to append with different axes
             _maybe_remove(store, 'p4d')
-            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
-            store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
+            store.append('p4d', p4d.iloc[:, :, :10, :], axes=indexers)
+            store.append('p4d', p4d.iloc[:, :, 10:, :], axes=[
                 'labels', 'items', 'major_axis'])
             assert_panel4d_equal(store.select('p4d'), p4d)
             check_indexers('p4d', indexers)
 
             # pass incorrect number of axes
             _maybe_remove(store, 'p4d')
-            self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[
+            self.assertRaises(ValueError, store.append, 'p4d', p4d.iloc[
                 :, :, :10, :], axes=['major_axis', 'minor_axis'])
 
             # different than default indexables #1
             indexers = ['labels', 'major_axis', 'minor_axis']
             _maybe_remove(store, 'p4d')
-            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
-            store.append('p4d', p4d.ix[:, :, 10:, :])
+            store.append('p4d', p4d.iloc[:, :, :10, :], axes=indexers)
+            store.append('p4d', p4d.iloc[:, :, 10:, :])
             assert_panel4d_equal(store['p4d'], p4d)
             check_indexers('p4d', indexers)
 
             # different than default indexables #2
             indexers = ['major_axis', 'labels', 'minor_axis']
             _maybe_remove(store, 'p4d')
-            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
-            store.append('p4d', p4d.ix[:, :, 10:, :])
+            store.append('p4d', p4d.iloc[:, :, :10, :], axes=indexers)
+            store.append('p4d', p4d.iloc[:, :, 10:, :])
             assert_panel4d_equal(store['p4d'], p4d)
             check_indexers('p4d', indexers)
 
@@ -1392,11 +1392,11 @@ def check_col(key, name, size):
             _maybe_remove(store, 'df')
             df = tm.makeTimeDataFrame()
             df['string'] = 'foo'
-            df.ix[1:4, 'string'] = np.nan
+            df.loc[1:4, 'string'] = np.nan
             df['string2'] = 'bar'
-            df.ix[4:8, 'string2'] = np.nan
+            df.loc[4:8, 'string2'] = np.nan
             df['string3'] = 'bah'
-            df.ix[1:, 'string3'] = np.nan
+            df.loc[1:, 'string3'] = np.nan
             store.append('df', df)
             result = store.select('df')
             tm.assert_frame_equal(result, df)
@@ -1466,7 +1466,7 @@ def test_append_with_data_columns(self):
 
         with ensure_clean_store(self.path) as store:
             df = tm.makeTimeDataFrame()
-            df.loc[:, 'B'].iloc[0] = 1.
+            df.iloc[0, df.columns.get_loc('B')] = 1.
             _maybe_remove(store, 'df')
             store.append('df', df[:2], data_columns=['B'])
             store.append('df', df[2:])
@@ -1533,14 +1533,18 @@ def check_col(key, name, size):
         with ensure_clean_store(self.path) as store:
             # multiple data columns
             df_new = df.copy()
-            df_new.ix[0, 'A'] = 1.
-            df_new.ix[0, 'B'] = -1.
+            df_new.iloc[0, df_new.columns.get_loc('A')] = 1.
+            df_new.iloc[0, df_new.columns.get_loc('B')] = -1.
             df_new['string'] = 'foo'
-            df_new.loc[1:4, 'string'] = np.nan
-            df_new.loc[5:6, 'string'] = 'bar'
+
+            sl = df_new.columns.get_loc('string')
+            df_new.iloc[1:4, sl] = np.nan
+            df_new.iloc[5:6, sl] = 'bar'
+
             df_new['string2'] = 'foo'
-            df_new.loc[2:5, 'string2'] = np.nan
-            df_new.loc[7:8, 'string2'] = 'bar'
+            sl = df_new.columns.get_loc('string2')
+            df_new.iloc[2:5, sl] = np.nan
+            df_new.iloc[7:8, sl] = 'bar'
             _maybe_remove(store, 'df')
             store.append(
                 'df', df_new, data_columns=['A', 'B', 'string', 'string2'])
@@ -1561,12 +1565,12 @@ def check_col(key, name, size):
             # doc example
             df_dc = df.copy()
             df_dc['string'] = 'foo'
-            df_dc.ix[4:6, 'string'] = np.nan
-            df_dc.ix[7:9, 'string'] = 'bar'
+            df_dc.loc[4:6, 'string'] = np.nan
+            df_dc.loc[7:9, 'string'] = 'bar'
             df_dc['string2'] = 'cool'
             df_dc['datetime'] = Timestamp('20010102')
             df_dc = df_dc._convert(datetime=True)
-            df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan
+            df_dc.loc[3:5, ['A', 'B', 'datetime']] = np.nan
 
             _maybe_remove(store, 'df_dc')
             store.append('df_dc', df_dc,
@@ -1590,9 +1594,9 @@ def check_col(key, name, size):
             df_dc = DataFrame(np.random.randn(8, 3), index=index,
                               columns=['A', 'B', 'C'])
             df_dc['string'] = 'foo'
-            df_dc.ix[4:6, 'string'] = np.nan
-            df_dc.ix[7:9, 'string'] = 'bar'
-            df_dc.ix[:, ['B', 'C']] = df_dc.ix[:, ['B', 'C']].abs()
+            df_dc.loc[4:6, 'string'] = np.nan
+            df_dc.loc[7:9, 'string'] = 'bar'
+            df_dc.loc[:, ['B', 'C']] = df_dc.loc[:, ['B', 'C']].abs()
             df_dc['string2'] = 'cool'
 
             # on-disk operations
@@ -1695,8 +1699,9 @@ def col(t, column):
     def test_append_diff_item_order(self):
 
         wp = tm.makePanel()
-        wp1 = wp.ix[:, :10, :]
-        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]
+        wp1 = wp.iloc[:, :10, :]
+        wp2 = wp.iloc[wp.items.get_indexer(['ItemC', 'ItemB', 'ItemA']),
+                      10:, :]
 
         with ensure_clean_store(self.path) as store:
             store.put('panel', wp1, format='table')
@@ -2080,7 +2085,7 @@ def test_table_mixed_dtypes(self):
         df['timestamp2'] = Timestamp('20010103')
         df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
         df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
-        df.ix[3:6, ['obj1']] = np.nan
+        df.loc[3:6, ['obj1']] = np.nan
         df = df.consolidate()._convert(datetime=True)
 
         with ensure_clean_store(self.path) as store:
@@ -2177,7 +2182,7 @@ def test_append_with_timedelta(self):
         df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp(
             '20130101') + timedelta(days=i, seconds=10) for i in range(10)]))
         df['C'] = df['A'] - df['B']
-        df.ix[3:5, 'C'] = np.nan
+        df.loc[3:5, 'C'] = np.nan
 
         with ensure_clean_store(self.path) as store:
 
@@ -2439,7 +2444,7 @@ def test_invalid_terms(self):
 
             df = tm.makeTimeDataFrame()
             df['string'] = 'foo'
-            df.ix[0:4, 'string'] = 'bar'
+            df.loc[0:4, 'string'] = 'bar'
             wp = tm.makePanel()
 
             p4d = tm.makePanel4D()
@@ -2726,7 +2731,7 @@ def test_series(self):
 
     def test_sparse_series(self):
         s = tm.makeStringSeries()
-        s[3:5] = np.nan
+        s.iloc[3:5] = np.nan
         ss = s.to_sparse()
         self._check_roundtrip(ss, tm.assert_series_equal,
                               check_series_type=True)
@@ -2742,8 +2747,8 @@ def test_sparse_series(self):
 
     def test_sparse_frame(self):
         s = tm.makeDataFrame()
-        s.ix[3:5, 1:3] = np.nan
-        s.ix[8:10, -2] = np.nan
+        s.iloc[3:5, 1:3] = np.nan
+        s.iloc[8:10, -2] = np.nan
         ss = s.to_sparse()
 
         self._check_double_roundtrip(ss, tm.assert_frame_equal,
@@ -3199,7 +3204,7 @@ def test_select_dtypes(self):
             # bool columns (GH #2849)
             df = DataFrame(np.random.randn(5, 2), columns=['A', 'B'])
             df['object'] = 'foo'
-            df.ix[4:5, 'object'] = 'bar'
+            df.loc[4:5, 'object'] = 'bar'
             df['boolv'] = df['A'] > 0
             _maybe_remove(store, 'df')
             store.append('df', df, data_columns=True)
@@ -3718,11 +3723,11 @@ def test_frame_select(self):
             crit3 = ('columns=A')
 
             result = store.select('frame', [crit1, crit2])
-            expected = df.ix[date:, ['A', 'D']]
+            expected = df.loc[date:, ['A', 'D']]
             tm.assert_frame_equal(result, expected)
 
             result = store.select('frame', [crit3])
-            expected = df.ix[:, ['A']]
+            expected = df.loc[:, ['A']]
             tm.assert_frame_equal(result, expected)
 
             # invalid terms
@@ -3881,7 +3886,7 @@ def test_string_select(self):
 
             # test string ==/!=
             df['x'] = 'none'
-            df.ix[2:7, 'x'] = ''
+            df.loc[2:7, 'x'] = ''
 
             store.append('df', df, data_columns=['x'])
 
@@ -3908,7 +3913,7 @@ def test_string_select(self):
 
             # int ==/!=
             df['int'] = 1
-            df.ix[2:7, 'int'] = 2
+            df.loc[2:7, 'int'] = 2
 
             store.append('df3', df, data_columns=['int'])
 
@@ -3954,7 +3959,7 @@ def f():
             # a data column with NaNs, result excludes the NaNs
             df3 = df.copy()
             df3['string'] = 'foo'
-            df3.ix[4:6, 'string'] = np.nan
+            df3.loc[4:6, 'string'] = np.nan
             store.append('df3', df3, data_columns=['string'])
             result = store.select_column('df3', 'string')
             tm.assert_almost_equal(result.values, df3['string'].values)
@@ -4005,13 +4010,13 @@ def test_coordinates(self):
             c = store.select_as_coordinates('df', ['index<3'])
             assert((c.values == np.arange(3)).all())
             result = store.select('df', where=c)
-            expected = df.ix[0:2, :]
+            expected = df.loc[0:2, :]
             tm.assert_frame_equal(result, expected)
 
             c = store.select_as_coordinates('df', ['index>=3', 'index<=4'])
             assert((c.values == np.arange(2) + 3).all())
             result = store.select('df', where=c)
-            expected = df.ix[3:4, :]
+            expected = df.loc[3:4, :]
             tm.assert_frame_equal(result, expected)
             self.assertIsInstance(c, Index)
 
@@ -4113,7 +4118,7 @@ def test_append_to_multiple(self):
    def test_append_to_multiple_dropna(self):
         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
-        df1.ix[1, ['A', 'B']] = np.nan
+        df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan
         df = concat([df1, df2], axis=1)
 
         with ensure_clean_store(self.path) as store:
@@ -4226,14 +4231,14 @@ def test_start_stop_table(self):
             result = store.select(
                 'df', [Term("columns=['A']")], start=0, stop=5)
-            expected = df.ix[0:4, ['A']]
+            expected = df.loc[0:4, ['A']]
             tm.assert_frame_equal(result, expected)
 
             # out of range
             result = store.select(
                 'df', [Term("columns=['A']")], start=30, stop=40)
             self.assertTrue(len(result) == 0)
-            expected = df.ix[30:40, ['A']]
+            expected = df.loc[30:40, ['A']]
             tm.assert_frame_equal(result, expected)
 
     def test_start_stop_fixed(self):
@@ -4275,8 +4280,8 @@ def test_start_stop_fixed(self):
 
             # sparse; not implemented
             df = tm.makeDataFrame()
-            df.ix[3:5, 1:3] = np.nan
-            df.ix[8:10, -2] = np.nan
+            df.iloc[3:5, 1:3] = np.nan
+            df.iloc[8:10, -2] = np.nan
             dfs = df.to_sparse()
             store.put('dfs', dfs)
             with self.assertRaises(NotImplementedError):
@@ -4293,11 +4298,11 @@ def test_select_filter_corner(self):
 
             crit = Term('columns=df.columns[:75]')
             result = store.select('frame', [crit])
-            tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])
+            tm.assert_frame_equal(result, df.loc[:, df.columns[:75]])
 
             crit = Term('columns=df.columns[:75:2]')
             result = store.select('frame', [crit])
-            tm.assert_frame_equal(result, df.ix[:, df.columns[:75:2]])
+            tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])
 
     def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index cb08944e8dc57..9e639f7ef6057 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -2140,7 +2140,7 @@ def test_basic(self):
 
     def test_write_row_by_row(self):
         frame = tm.makeTimeDataFrame()
-        frame.ix[0, 0] = np.nan
+        frame.iloc[0, 0] = np.nan
         create_sql = sql.get_schema(frame, 'test')
         cur = self.conn.cursor()
         cur.execute(create_sql)
@@ -2165,7 +2165,7 @@ def test_execute(self):
         cur.execute(create_sql)
         ins = "INSERT INTO test VALUES (?, ?, ?, ?)"
 
-        row = frame.ix[0]
+        row = frame.iloc[0]
         sql.execute(ins, self.conn, params=tuple(row))
         self.conn.commit()
 
@@ -2430,7 +2430,7 @@ def test_write_row_by_row(self):
         _skip_if_no_pymysql()
         frame = tm.makeTimeDataFrame()
-        frame.ix[0, 0] = np.nan
+        frame.iloc[0, 0] = np.nan
         drop_sql = "DROP TABLE IF EXISTS test"
         create_sql = sql.get_schema(frame, 'test')
         cur = self.conn.cursor()
@@ -2474,7 +2474,7 @@ def test_execute(self):
         cur.execute(create_sql)
         ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
 
-        row = frame.ix[0].values.tolist()
+        row = frame.iloc[0].values.tolist()
         sql.execute(ins, self.conn, params=tuple(row))
         self.conn.commit()
 
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index cd972868a6e32..08fcde8d3022e 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -336,7 +336,7 @@ def test_write_preserves_original(self):
         # 9795
         np.random.seed(423)
         df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))
-        df.ix[2, 'a':'c'] = np.nan
+        df.loc[2, 'a':'c'] = np.nan
         df_copy = df.copy()
         with tm.ensure_clean() as path:
             df.to_stata(path, write_index=False)
diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py
index 83b6a89811ee6..b9e8a31393931 100644
--- a/pandas/sparse/tests/test_frame.py
+++ b/pandas/sparse/tests/test_frame.py
@@ -180,14 +180,14 @@ def test_constructor_from_series(self):
         x = Series(np.random.randn(10000), name='a')
         y = Series(np.random.randn(10000), name='b')
         x2 = x.astype(float)
-        x2.ix[:9998] = np.NaN
+        x2.loc[:9998] = np.NaN
         # TODO: x_sparse is unused...fix
         x_sparse = x2.to_sparse(fill_value=np.NaN)  # noqa
 
         # Currently fails too with weird ufunc error
         # df1 = SparseDataFrame([x_sparse, y])
 
-        y.ix[:9998] = 0
+        y.loc[:9998] = 0
         # TODO: y_sparse is unused...fix
         y_sparse = y.to_sparse(fill_value=0)  # noqa
 
         # without sparse value raises error
@@ -232,7 +232,7 @@ def test_constructor_nan_dataframe(self):
 
     def test_dtypes(self):
         df = DataFrame(np.random.randn(10000, 4))
-        df.ix[:9998] = np.nan
+        df.loc[:9998] = np.nan
         sdf = df.to_sparse()
 
         result = sdf.get_dtype_counts()
@@ -248,7 +248,7 @@ def test_shape(self):
 
     def test_str(self):
         df = DataFrame(np.random.randn(10000, 4))
-        df.ix[:9998] = np.nan
+        df.loc[:9998] = np.nan
 
         sdf = df.to_sparse()
         str(sdf)
@@ -364,7 +364,7 @@ def _compare_to_dense(a, b, da, db, op):
             _compare_to_dense(s, frame, s, frame.to_dense(), op)
 
         # it works!
-        result = self.frame + self.frame.ix[:, ['A', 'B']]  # noqa
+        result = self.frame + self.frame.loc[:, ['A', 'B']]  # noqa
 
     def test_op_corners(self):
         empty = self.empty + self.empty
@@ -427,12 +427,12 @@ def test_set_value(self):
 
     def test_fancy_index_misc(self):
         # axis = 0
-        sliced = self.frame.ix[-2:, :]
+        sliced = self.frame.iloc[-2:, :]
         expected = self.frame.reindex(index=self.frame.index[-2:])
         tm.assert_sp_frame_equal(sliced, expected)
 
         # axis = 1
-        sliced = self.frame.ix[:, -2:]
+        sliced = self.frame.iloc[:, -2:]
         expected = self.frame.reindex(columns=self.frame.columns[-2:])
         tm.assert_sp_frame_equal(sliced, expected)
 
@@ -556,10 +556,10 @@ def test_append(self):
         appended = a.append(b)
         tm.assert_sp_frame_equal(appended, self.frame, exact_indices=False)
 
-        a = self.frame.ix[:5, :3]
-        b = self.frame.ix[5:]
+        a = self.frame.iloc[:5, :3]
+        b = self.frame.iloc[5:]
         appended = a.append(b)
-        tm.assert_sp_frame_equal(appended.ix[:, :3], self.frame.ix[:, :3],
+        tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3],
                                  exact_indices=False)
 
     def test_apply(self):
@@ -721,12 +721,12 @@ def test_describe(self):
         desc = self.frame.describe()  # noqa
 
     def test_join(self):
-        left = self.frame.ix[:, ['A', 'B']]
-        right = self.frame.ix[:, ['C', 'D']]
+        left = self.frame.loc[:, ['A', 'B']]
+        right = self.frame.loc[:, ['C', 'D']]
         joined = left.join(right)
         tm.assert_sp_frame_equal(joined, self.frame, exact_indices=False)
 
-        right = self.frame.ix[:, ['B', 'D']]
+        right = self.frame.loc[:, ['B', 'D']]
         self.assertRaises(Exception, left.join, right)
 
         with tm.assertRaisesRegexp(ValueError,
diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py
index c0d4b70c41dc4..a634c34139186 100644
--- a/pandas/sparse/tests/test_indexing.py
+++ b/pandas/sparse/tests/test_indexing.py
@@ -562,8 +562,8 @@ def test_getitem(self):
         tm.assert_sp_frame_equal(sparse[[True, False, True, True]],
                                  orig[[True, False, True, True]].to_sparse())
 
-        tm.assert_sp_frame_equal(sparse[[1, 2]],
-                                 orig[[1, 2]].to_sparse())
+        tm.assert_sp_frame_equal(sparse.iloc[[1, 2]],
+                                 orig.iloc[[1, 2]].to_sparse())
 
     def test_getitem_fill_value(self):
         orig = pd.DataFrame([[1, np.nan, 0],
@@ -589,9 +589,9 @@ def test_getitem_fill_value(self):
         exp._default_fill_value = np.nan
         tm.assert_sp_frame_equal(sparse[indexer], exp)
 
-        exp = orig[[1, 2]].to_sparse(fill_value=0)
+        exp = orig.iloc[[1, 2]].to_sparse(fill_value=0)
         exp._default_fill_value = np.nan
-        tm.assert_sp_frame_equal(sparse[[1, 2]], exp)
+        tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], exp)
 
     def test_loc(self):
         orig = pd.DataFrame([[1, np.nan, np.nan],
diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py
index b5ca3a730ed27..06d76bdd4dd3d 100644
--- a/pandas/sparse/tests/test_series.py
+++ b/pandas/sparse/tests/test_series.py
@@ -295,8 +295,8 @@ def test_constructor_scalar(self):
         data = 5
         sp = SparseSeries(data, np.arange(100))
         sp = sp.reindex(np.arange(200))
-        self.assertTrue((sp.ix[:99] == data).all())
-        self.assertTrue(isnull(sp.ix[100:]).all())
+        self.assertTrue((sp.loc[:99] == data).all())
+        self.assertTrue(isnull(sp.loc[100:]).all())
 
         data = np.nan
         sp = SparseSeries(data, np.arange(100))
diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py
index 6f688649affb0..2935f986cca9f 100644
--- a/pandas/stats/tests/test_ols.py
+++ b/pandas/stats/tests/test_ols.py
@@ -111,7 +111,7 @@ def testWLS(self):
 
         self._check_wls(X, Y, weights)
 
-        weights.ix[[5, 15]] = np.nan
+        weights.loc[[5, 15]] = np.nan
         Y[[2, 21]] = np.nan
         self._check_wls(X, Y, weights)
 
@@ -421,8 +421,8 @@ def test_various_attributes(self):
             model._results
 
     def test_catch_regressor_overlap(self):
-        df1 = tm.makeTimeDataFrame().ix[:, ['A', 'B']]
-        df2 = tm.makeTimeDataFrame().ix[:, ['B', 'C', 'D']]
+        df1 = tm.makeTimeDataFrame().loc[:, ['A', 'B']]
+        df2 = tm.makeTimeDataFrame().loc[:, ['B', 'C', 'D']]
         y = tm.makeTimeSeries()
 
         data = {'foo': df1, 'bar': df2}
@@ -562,10 +562,10 @@ def test_wls_panel(self):
         x = Panel({'x1': tm.makeTimeDataFrame(),
                    'x2': tm.makeTimeDataFrame()})
 
-        y.ix[[1, 7], 'A'] = np.nan
-        y.ix[[6, 15], 'B'] = np.nan
-        y.ix[[3, 20], 'C'] = np.nan
-        y.ix[[5, 11], 'D'] = np.nan
+        y.iloc[[1, 7], y.columns.get_loc('A')] = np.nan
+        y.iloc[[6, 15], y.columns.get_loc('B')] = np.nan
+        y.iloc[[3, 20], y.columns.get_loc('C')] = np.nan
+        y.iloc[[5, 11], y.columns.get_loc('D')] = np.nan
 
         stack_y = y.stack()
         stack_x = DataFrame(dict((k, v.stack())
@@ -615,7 +615,7 @@ def testWithEntityEffects(self):
                           index=result._x.index,
                           columns=['FE_B', 'x1', 'x2', 'intercept'],
                           dtype=float)
-        tm.assert_frame_equal(result._x, exp_x.ix[:, result._x.columns])
+        tm.assert_frame_equal(result._x, exp_x.loc[:, result._x.columns])
         # _check_non_raw_results(result)
 
     def testWithEntityEffectsAndDroppedDummies(self):
@@ -630,7 +630,7 @@ def testWithEntityEffectsAndDroppedDummies(self):
                           index=result._x.index,
                           columns=['FE_A', 'x1', 'x2', 'intercept'],
                           dtype=float)
-        tm.assert_frame_equal(result._x, exp_x.ix[:, result._x.columns])
+        tm.assert_frame_equal(result._x, exp_x.loc[:, result._x.columns])
         # _check_non_raw_results(result)
 
     def testWithXEffects(self):
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index 00e5e002ca48d..b27e051d4f409 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -136,7 +136,7 @@ def test_repr_embedded_ndarray(self):
         df.to_string()
 
     def test_eng_float_formatter(self):
-        self.frame.ix[5] = 0
+        self.frame.loc[5] = 0
 
         fmt.set_eng_float_format()
         repr(self.frame)
@@ -1884,7 +1884,7 @@ def test_index_with_nan(self):
 
         # all-nan in mi
         df2 = df.copy()
-        df2.ix[:, 'id2'] = np.nan
+        df2.loc[:, 'id2'] = np.nan
         y = df2.set_index('id2')
         result = y.to_string()
         expected = u(
@@ -1893,7 +1893,7 @@ def test_index_with_nan(self):
 
         # partial nan in mi
         df2 = df.copy()
-        df2.ix[:, 'id2'] = np.nan
+        df2.loc[:, 'id2'] = np.nan
         y = df2.set_index(['id2', 'id3'])
         result = y.to_string()
         expected = u(
@@ -3715,7 +3715,7 @@ def test_mixed_datetime64(self):
         df = DataFrame({'A': [1, 2], 'B': ['2012-01-01', '2012-01-02']})
         df['B'] = pd.to_datetime(df.B)
 
-        result = repr(df.ix[0])
+        result = repr(df.loc[0])
         self.assertTrue('2012-01-01' in result)
 
     def test_period(self):
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
index 46f0fff7bb4b8..edeca0a664a87 100644
--- a/pandas/tests/frame/test_alter_axes.py
+++ b/pandas/tests/frame/test_alter_axes.py
@@ -39,10 +39,10 @@ def test_set_index_cast(self):
         # issue casting an index then set_index
         df = DataFrame({'A': [1.1, 2.2, 3.3], 'B': [5.0, 6.1, 7.2]},
                        index=[2010, 2011, 2012])
-        expected = df.ix[2010]
+        expected = df.loc[2010]
         new_index = df.index.astype(np.int32)
         df.index = new_index
-        result = df.ix[2010]
+        result = df.loc[2010]
         assert_series_equal(result, expected)
 
     def test_set_index2(self):
@@ -58,7 +58,7 @@ def test_set_index2(self):
 
         index = Index(df['C'], name='C')
 
-        expected = df.ix[:, ['A', 'B', 'D', 'E']]
+        expected = df.loc[:, ['A', 'B', 'D', 'E']]
         expected.index = index
 
         expected_nodrop = df.copy()
@@ -86,7 +86,7 @@ def test_set_index2(self):
 
         index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B'])
 
-        expected = df.ix[:, ['C', 'D', 'E']]
+        expected = df.loc[:, ['C', 'D', 'E']]
         expected.index = index
 
         expected_nodrop = df.copy()
@@ -301,8 +301,8 @@ def test_set_index_multiindexcolumns(self):
         columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)])
         df = DataFrame(np.random.randn(3, 3), columns=columns)
         rs = df.set_index(df.columns[0])
-        xp = df.ix[:, 1:]
-        xp.index = df.ix[:, 0].values
+        xp = df.iloc[:, 1:]
+        xp.index = df.iloc[:, 0].values
         xp.index.names = [df.columns[0]]
         assert_frame_equal(rs, xp)
 
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index f6081e14d4081..7e50fbd7378e0 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -58,7 +58,7 @@ def _check_method(self, method='pearson', check_minp=False):
         else:
             result = self.frame.corr(min_periods=len(self.frame) - 8)
             expected = self.frame.corr()
-            expected.ix['A', 'B'] = expected.ix['B', 'A'] = nan
+            expected.loc['A', 'B'] = expected.loc['B', 'A'] = nan
             tm.assert_frame_equal(result, expected)
 
     def test_corr_non_numeric(self):
@@ -68,7 +68,7 @@ def test_corr_non_numeric(self):
 
         # exclude non-numeric types
         result = self.mixed_frame.corr()
-        expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].corr()
+        expected = self.mixed_frame.loc[:, ['A', 'B', 'C', 'D']].corr()
         tm.assert_frame_equal(result, expected)
 
     def test_corr_nooverlap(self):
@@ -81,11 +81,11 @@ def test_corr_nooverlap(self):
                             'C': [np.nan, np.nan, np.nan, np.nan,
                                   np.nan, np.nan]})
             rs = df.corr(meth)
-            self.assertTrue(isnull(rs.ix['A', 'B']))
-            self.assertTrue(isnull(rs.ix['B', 'A']))
-            self.assertEqual(rs.ix['A', 'A'], 1)
-            self.assertEqual(rs.ix['B', 'B'], 1)
-            self.assertTrue(isnull(rs.ix['C', 'C']))
+            self.assertTrue(isnull(rs.loc['A', 'B']))
+            self.assertTrue(isnull(rs.loc['B', 'A']))
+            self.assertEqual(rs.loc['A', 'A'], 1)
+            self.assertEqual(rs.loc['B', 'B'], 1)
+            self.assertTrue(isnull(rs.loc['C', 'C']))
 
     def test_corr_constant(self):
         tm._skip_if_no_scipy()
@@ -135,8 +135,8 @@ def test_cov(self):
         frame['B'][5:10] = nan
         result = self.frame.cov(min_periods=len(self.frame) - 8)
         expected = self.frame.cov()
-        expected.ix['A', 'B'] = np.nan
-        expected.ix['B', 'A'] = np.nan
+        expected.loc['A', 'B'] = np.nan
+        expected.loc['B', 'A'] = np.nan
 
         # regular
         self.frame['A'][:5] = nan
@@ -148,7 +148,7 @@ def test_cov(self):
 
         # exclude non-numeric types
         result = self.mixed_frame.cov()
-        expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].cov()
+        expected = self.mixed_frame.loc[:, ['A', 'B', 'C', 'D']].cov()
         tm.assert_frame_equal(result, expected)
 
        # Single column frame
@@ -157,7 +157,7 @@ def test_cov(self):
         expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
                              index=df.columns, columns=df.columns)
         tm.assert_frame_equal(result, expected)
-        df.ix[0] = np.nan
+        df.loc[0] = np.nan
         result = df.cov()
         expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)),
                              index=df.columns, columns=df.columns)
@@ -193,7 +193,8 @@ def test_corrwith(self):
         df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
         correls = df1.corrwith(df2, axis=1)
         for row in index[:4]:
-            tm.assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row]))
+            tm.assert_almost_equal(correls[row],
+                                   df1.loc[row].corr(df2.loc[row]))
 
     def test_corrwith_with_objects(self):
         df1 = tm.makeTimeDataFrame()
@@ -204,11 +205,11 @@ def test_corrwith_with_objects(self):
         df2['obj'] = 'bar'
 
         result = df1.corrwith(df2)
-        expected = df1.ix[:, cols].corrwith(df2.ix[:, cols])
+        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
         tm.assert_series_equal(result, expected)
 
         result = df1.corrwith(df2, axis=1)
-        expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1)
+        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
         tm.assert_series_equal(result, expected)
 
     def test_corrwith_series(self):
@@ -463,9 +464,9 @@ def test_min(self):
         self._check_stat_op('min', np.min, frame=self.intframe)
 
     def test_cummin(self):
-        self.tsframe.ix[5:10, 0] = nan
-        self.tsframe.ix[10:15, 1] = nan
-        self.tsframe.ix[15:, 2] = nan
+        self.tsframe.loc[5:10, 0] = nan
+        self.tsframe.loc[10:15, 1] = nan
+        self.tsframe.loc[15:, 2] = nan
 
         # axis = 0
         cummin = self.tsframe.cummin()
@@ -486,9 +487,9 @@ def test_cummin(self):
         self.assertEqual(np.shape(cummin_xs), np.shape(self.tsframe))
 
     def test_cummax(self):
-        self.tsframe.ix[5:10, 0] = nan
-        self.tsframe.ix[10:15, 1] = nan
-        self.tsframe.ix[15:, 2] = nan
+        self.tsframe.loc[5:10, 0] = nan
+        self.tsframe.loc[10:15, 1] = nan
+        self.tsframe.loc[15:, 2] = nan
 
         # axis = 0
         cummax = self.tsframe.cummax()
@@ -545,11 +546,11 @@ def test_numeric_only_flag(self):
         methods = ['sem', 'var', 'std']
         df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
         # set one entry to a number in str format
-        df1.ix[0, 'foo'] = '100'
+        df1.loc[0, 'foo'] = '100'
 
         df2 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
         # set one entry to a non-number str
-        df2.ix[0, 'foo'] = 'a'
+        df2.loc[0, 'foo'] = 'a'
 
         for meth in methods:
             result = getattr(df1, meth)(axis=1, numeric_only=True)
@@ -567,9 +568,9 @@ def test_numeric_only_flag(self):
                               (axis=1, numeric_only=False))
 
     def test_cumsum(self):
-        self.tsframe.ix[5:10, 0] = nan
-        self.tsframe.ix[10:15, 1] = nan
-        self.tsframe.ix[15:, 2] = nan
+        self.tsframe.loc[5:10, 0] = nan
+        self.tsframe.loc[10:15, 1] = nan
+        self.tsframe.loc[15:, 2] = nan
 
         # axis = 0
         cumsum = self.tsframe.cumsum()
@@ -590,9 +591,9 @@ def test_cumsum(self):
         self.assertEqual(np.shape(cumsum_xs), np.shape(self.tsframe))
 
     def test_cumprod(self):
-        self.tsframe.ix[5:10, 0] = nan
-        self.tsframe.ix[10:15, 1] = nan
-        self.tsframe.ix[15:, 2] = nan
+        self.tsframe.loc[5:10, 0] = nan
+        self.tsframe.loc[10:15, 1] = nan
+        self.tsframe.loc[15:, 2] = nan
 
         # axis = 0
         cumprod = self.tsframe.cumprod()
@@ -864,8 +865,8 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
         if frame is None:
             frame = self.frame
             # set some NAs
-            frame.ix[5:10] = np.nan
-            frame.ix[15:20, -2:] = np.nan
+            frame.loc[5:10] = np.nan
+            frame.loc[15:20, -2:] = np.nan
 
         f = getattr(frame, name)
 
@@ -1008,16 +1009,16 @@ def test_operators_timedelta64(self):
 
         # min
         result = diffs.min()
-        self.assertEqual(result[0], diffs.ix[0, 'A'])
-        self.assertEqual(result[1], diffs.ix[0, 'B'])
+        self.assertEqual(result[0], diffs.loc[0, 'A'])
+        self.assertEqual(result[1], diffs.loc[0, 'B'])
 
         result = diffs.min(axis=1)
-        self.assertTrue((result == diffs.ix[0, 'B']).all())
+        self.assertTrue((result == diffs.loc[0, 'B']).all())
 
         # max
         result = diffs.max()
-        self.assertEqual(result[0], diffs.ix[2, 'A'])
-        self.assertEqual(result[1], diffs.ix[2, 'B'])
+        self.assertEqual(result[0], diffs.loc[2, 'A'])
+        self.assertEqual(result[1], diffs.loc[2, 'B'])
 
         result = diffs.max(axis=1)
         self.assertTrue((result == diffs['A']).all())
@@ -1153,8 +1154,8 @@ def test_sum_bools(self):
 
     def test_idxmin(self):
         frame = self.frame
-        frame.ix[5:10] = np.nan
-        frame.ix[15:20, -2:] = np.nan
+        frame.loc[5:10] = np.nan
+        frame.loc[15:20, -2:] = np.nan
         for skipna in [True, False]:
             for axis in [0, 1]:
                 for df in [frame, self.intframe]:
@@ -1167,8 +1168,8 @@ def test_idxmin(self):
 
     def test_idxmax(self):
         frame = self.frame
-        frame.ix[5:10] = np.nan
-        frame.ix[15:20, -2:] = np.nan
+        frame.loc[5:10] = np.nan
+        frame.loc[15:20, -2:] = np.nan
         for skipna in [True, False]:
             for axis in [0, 1]:
                 for df in [frame, self.intframe]:
@@ -1219,8 +1220,8 @@ def _check_bool_op(self, name, alternative, frame=None, has_skipna=True,
             # set some NAs
             frame = DataFrame(frame.values.astype(object), frame.index,
                               frame.columns)
-            frame.ix[5:10] = np.nan
-            frame.ix[15:20, -2:] = np.nan
+            frame.loc[5:10] = np.nan
+            frame.loc[15:20, -2:] = np.nan
 
         f = getattr(frame, name)
 
@@ -1495,43 +1496,43 @@ def test_drop_duplicates(self):
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('AAA', keep='last')
-        expected = df.ix[[6, 7]]
+        expected = df.loc[[6, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('AAA', keep=False)
-        expected = df.ix[[]]
+        expected = df.loc[[]]
         tm.assert_frame_equal(result, expected)
         self.assertEqual(len(result), 0)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates('AAA', take_last=True)
-            expected = df.ix[[6, 7]]
+            expected = df.loc[[6, 7]]
             tm.assert_frame_equal(result, expected)
 
         # multi column
-        expected = df.ix[[0, 1, 2, 3]]
+        expected = df.loc[[0, 1, 2, 3]]
         result = df.drop_duplicates(np.array(['AAA', 'B']))
         tm.assert_frame_equal(result, expected)
         result = df.drop_duplicates(['AAA', 'B'])
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AAA', 'B'), keep='last')
-        expected = df.ix[[0, 5, 6, 7]]
+        expected = df.loc[[0, 5, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AAA', 'B'), keep=False)
-        expected = df.ix[[0]]
+        expected = df.loc[[0]]
         tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
        with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(('AAA', 'B'), take_last=True)
-        expected = df.ix[[0, 5, 6, 7]]
+        expected = df.loc[[0, 5, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         # consider everything
-        df2 = df.ix[:, ['AAA', 'B', 'C']]
+        df2 = df.loc[:, ['AAA', 'B', 'C']]
 
         result = df2.drop_duplicates()
         # in this case only
@@ -1643,22 +1644,22 @@ def test_drop_duplicates_tuple(self):
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AA', 'AB'), keep='last')
-        expected = df.ix[[6, 7]]
+        expected = df.loc[[6, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AA', 'AB'), keep=False)
-        expected = df.ix[[]]  # empty df
+        expected = df.loc[[]]  # empty df
         self.assertEqual(len(result), 0)
         tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(('AA', 'AB'), take_last=True)
-        expected = df.ix[[6, 7]]
+        expected = df.loc[[6, 7]]
         tm.assert_frame_equal(result, expected)
 
         # multi column
-        expected = df.ix[[0, 1, 2, 3]]
+        expected = df.loc[[0, 1, 2, 3]]
         result = df.drop_duplicates((('AA', 'AB'), 'B'))
         tm.assert_frame_equal(result, expected)
 
@@ -1673,41 +1674,41 @@ def test_drop_duplicates_NA(self):
         # single column
         result = df.drop_duplicates('A')
-        expected = df.ix[[0, 2, 3]]
+        expected = df.loc[[0, 2, 3]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('A', keep='last')
-        expected = df.ix[[1, 6, 7]]
+        expected = df.loc[[1, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('A', keep=False)
-        expected = df.ix[[]]  # empty df
+        expected = df.loc[[]]  # empty df
         tm.assert_frame_equal(result, expected)
         self.assertEqual(len(result), 0)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates('A', take_last=True)
-        expected = df.ix[[1, 6, 7]]
+        expected = df.loc[[1, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         # multi column
         result = df.drop_duplicates(['A', 'B'])
-        expected = df.ix[[0, 2, 3, 6]]
+        expected = df.loc[[0, 2, 3, 6]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['A', 'B'], keep='last')
-        expected = df.ix[[1, 5, 6, 7]]
+        expected = df.loc[[1, 5, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['A', 'B'], keep=False)
-        expected = df.ix[[6]]
+        expected = df.loc[[6]]
         tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(['A', 'B'], take_last=True)
-        expected = df.ix[[1, 5, 6, 7]]
+        expected = df.loc[[1, 5, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         # nan
@@ -1724,37 +1725,37 @@ def test_drop_duplicates_NA(self):
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('C', keep='last')
-        expected = df.ix[[3, 7]]
+        expected = df.loc[[3, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('C', keep=False)
-        expected = df.ix[[]]  # empty df
+        expected = df.loc[[]]  # empty df
         tm.assert_frame_equal(result, expected)
         self.assertEqual(len(result), 0)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates('C', take_last=True)
-        expected = df.ix[[3, 7]]
+        expected = df.loc[[3, 7]]
         tm.assert_frame_equal(result, expected)
 
         # multi column
         result = df.drop_duplicates(['C', 'B'])
-        expected = df.ix[[0, 1, 2, 4]]
+        expected = df.loc[[0, 1, 2, 4]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['C', 'B'], keep='last')
-        expected = df.ix[[1, 3, 6, 7]]
+        expected = df.loc[[1, 3, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['C', 'B'], keep=False)
-        expected = df.ix[[1]]
+        expected = df.loc[[1]]
         tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(['C', 'B'], take_last=True)
-        expected = df.ix[[1, 3, 6, 7]]
+        expected = df.loc[[1, 3, 6, 7]]
         tm.assert_frame_equal(result, expected)
 
     def test_drop_duplicates_NA_for_take_all(self):
@@ -1808,13 +1809,13 @@ def test_drop_duplicates_inplace(self):
 
         df = orig.copy()
         df.drop_duplicates('A', keep='last', inplace=True)
-        expected = orig.ix[[6, 7]]
+        expected = orig.loc[[6, 7]]
         result = df
         tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates('A', keep=False, inplace=True)
-        expected = orig.ix[[]]
+        expected = orig.loc[[]]
         result = df
         tm.assert_frame_equal(result, expected)
         self.assertEqual(len(df), 0)
@@ -1823,26 +1824,26 @@ def test_drop_duplicates_inplace(self):
         df = orig.copy()
         with tm.assert_produces_warning(FutureWarning):
             df.drop_duplicates('A', take_last=True, inplace=True)
-        expected = orig.ix[[6, 7]]
+        expected = orig.loc[[6, 7]]
         result = df
         tm.assert_frame_equal(result, expected)
 
         # multi column
         df = orig.copy()
         df.drop_duplicates(['A', 'B'], inplace=True)
-        expected = orig.ix[[0, 1, 2, 3]]
+        expected = orig.loc[[0, 1, 2, 3]]
         result = df
         tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates(['A', 'B'], keep='last', inplace=True)
-        expected = orig.ix[[0, 5, 6, 7]]
+        expected = orig.loc[[0, 5, 6, 7]]
         result = df
         tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates(['A', 'B'], keep=False, inplace=True)
-        expected = orig.ix[[0]]
+        expected = orig.loc[[0]]
         result = df
         tm.assert_frame_equal(result, expected)
 
@@ -1850,12 +1851,12 @@ def test_drop_duplicates_inplace(self):
         df = orig.copy()
         with tm.assert_produces_warning(FutureWarning):
             df.drop_duplicates(['A', 'B'], take_last=True, inplace=True)
-        expected = orig.ix[[0, 5, 6, 7]]
+        expected = orig.loc[[0, 5, 6, 7]]
         result = df
         tm.assert_frame_equal(result, expected)
 
         # consider everything
-        orig2 = orig.ix[:, ['A', 'B', 'C']].copy()
+        orig2 = orig.loc[:, ['A', 'B', 'C']].copy()
 
         df2 = orig2.copy()
         df2.drop_duplicates(inplace=True)
@@ -2160,10 +2161,10 @@ def test_dot(self):
         self.assertTrue(result.name is None)
 
         # can pass correct-length arrays
-        row = a.ix[0].values
+        row = a.iloc[0].values
 
         result = a.dot(row)
-        exp = a.dot(a.ix[0])
+        exp = a.dot(a.iloc[0])
         tm.assert_series_equal(result, exp)
 
         with tm.assertRaisesRegexp(ValueError, 'Dot product shape mismatch'):
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index 9e68b7e76d78f..fe04d1005e003 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -70,7 +70,7 @@ def test_apply_empty(self):
         expected = Series(np.nan, index=self.frame.columns)
         assert_series_equal(result, expected)
 
-        no_cols = self.frame.ix[:, []]
+        no_cols = self.frame.loc[:, []]
         result = no_cols.apply(lambda x: x.mean(), axis=1)
         expected = Series(np.nan, index=self.frame.index)
         assert_series_equal(result, expected)
@@ -224,7 +224,7 @@ def test_apply_yield_list(self):
         assert_frame_equal(result, self.frame)
 
     def test_apply_reduce_Series(self):
-        self.frame.ix[::2, 'A'] = np.nan
+        self.frame.loc[::2, 'A'] = np.nan
         expected = self.frame.mean(1)
         result = self.frame.apply(np.mean, axis=1)
         assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py
index 6c15c75cb5427..47d56c39757db 100644
--- a/pandas/tests/frame/test_asof.py
+++ b/pandas/tests/frame/test_asof.py
@@ -23,7 +23,7 @@ def setUp(self):
 
     def test_basic(self):
         df = self.df.copy()
-        df.ix[15:30, 'A'] = np.nan
+        df.loc[15:30, 'A'] = np.nan
         dates = date_range('1/1/1990', periods=self.N * 3,
                            freq='25s')
 
@@ -46,7 +46,7 @@ def test_subset(self):
         rng = date_range('1/1/1990', periods=N, freq='53s')
         df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
                        index=rng)
-        df.ix[4:8, 'A'] = np.nan
+        df.loc[4:8, 'A'] = np.nan
         dates = date_range('1/1/1990', periods=N * 3,
                            freq='25s')
 
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
index ecce17f96a672..ff6215531fc64 100644
--- a/pandas/tests/frame/test_axis_select_reindex.py
+++ b/pandas/tests/frame/test_axis_select_reindex.py
@@ -80,9 +80,9 @@ def test_drop(self):
         assert_frame_equal(simple.drop("A", axis=1), simple[['B']])
         assert_frame_equal(simple.drop(["A", "B"], axis='columns'),
                            simple[[]])
-        assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.ix[[2], :])
+        assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
         assert_frame_equal(simple.drop(
-            [0, 3], axis='index'), simple.ix[[1, 2], :])
+            [0, 3], axis='index'), simple.loc[[1, 2], :])
 
         self.assertRaises(ValueError, simple.drop, 5)
         self.assertRaises(ValueError, simple.drop, 'C', 1)
@@ -92,7 +92,7 @@ def test_drop(self):
         # errors = 'ignore'
         assert_frame_equal(simple.drop(5, errors='ignore'), simple)
         assert_frame_equal(simple.drop([0, 5], errors='ignore'),
-                           simple.ix[[1, 2, 3], :])
+                           simple.loc[[1, 2, 3], :])
         assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple)
         assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'),
                            simple[['B']])
@@ -105,8 +105,8 @@ def test_drop(self):
         nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X']))
         nu_df.columns = list('abc')
-        assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.ix[["Y"], :])
-        assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.ix[[], :])
+        assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :])
+        assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :])
 
         # inplace cache issue
         # GH 5628
@@ -417,7 +417,7 @@ def test_align(self):
         self.assertIs(af._data, self.frame._data)
 
         # axis = 0
-        other = self.frame.ix[:-5, :3]
+        other = self.frame.iloc[:-5, :3]
         af, bf = self.frame.align(other, axis=0, fill_value=-1)
         self.assert_index_equal(bf.columns, other.columns)
         # test fill value
@@ -434,7 +434,7 @@ def test_align(self):
         self.assert_index_equal(af.index, other.index)
 
         # axis = 1
-        other = self.frame.ix[:-5, :3].copy()
+        other = self.frame.iloc[:-5, :3].copy()
         af, bf = self.frame.align(other, axis=1)
         self.assert_index_equal(bf.columns, self.frame.columns)
         self.assert_index_equal(bf.index, other.index)
@@ -464,25 +464,25 @@ def test_align(self):
                                         join='inner', axis=1, method='pad')
         self.assert_index_equal(bf.columns, self.mixed_frame.columns)
 
-        af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1,
+        af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1,
                                   method=None, fill_value=None)
         self.assert_index_equal(bf.index, Index([]))
 
-        af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1,
+        af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1,
                                   method=None, fill_value=0)
         self.assert_index_equal(bf.index, Index([]))
 
         # mixed floats/ints
-        af, bf = self.mixed_float.align(other.ix[:, 0], join='inner', axis=1,
+        af, bf = self.mixed_float.align(other.iloc[:, 0], join='inner', axis=1,
                                         method=None, fill_value=0)
         self.assert_index_equal(bf.index, Index([]))
 
-        af, bf = self.mixed_int.align(other.ix[:, 0], join='inner', axis=1,
+        af, bf = self.mixed_int.align(other.iloc[:, 0], join='inner', axis=1,
                                       method=None, fill_value=0)
         self.assert_index_equal(bf.index, Index([]))
 
         # try to align dataframe to series along bad axis
-        self.assertRaises(ValueError, self.frame.align, af.ix[0, :3],
+        self.assertRaises(ValueError, self.frame.align, af.iloc[0, :3],
                           join='inner', axis=2)
 
         # align dataframe to series with broadcast or not
@@ -561,9 +561,9 @@ def test_align_fill_method_right(self):
                 self._check_align_fill('right', meth, ax, fax)
 
     def _check_align_fill(self, kind, meth, ax, fax):
-        left = self.frame.ix[0:4, :10]
-        right = self.frame.ix[2:, 6:]
-        empty = self.frame.ix[:0, :0]
+        left = self.frame.iloc[0:4, :10]
+        right = self.frame.iloc[2:, 6:]
+        empty = self.frame.iloc[:0, :0]
 
         self._check_align(left, right, axis=ax, fill_axis=fax,
                           how=kind, method=meth)
@@ -779,7 +779,7 @@ def test_take(self):
 
             # axis = 1
             result = df.take(order, axis=1)
-            expected = df.ix[:, ['D', 'B', 'C', 'A']]
+            expected = df.loc[:, ['D', 'B', 'C', 'A']]
             assert_frame_equal(result, expected, check_names=False)
 
        # neg indices
@@ -792,7 +792,7 @@ def test_take(self):
 
             # axis = 1
             result = df.take(order, axis=1)
-            expected = df.ix[:, ['C', 'B', 'D']]
+            expected = df.loc[:, ['C', 'B', 'D']]
             assert_frame_equal(result, expected, check_names=False)
 
         # illegal indices
@@ -811,7 +811,7 @@ def test_take(self):
 
             # axis = 1
             result = df.take(order, axis=1)
-            expected = df.ix[:, ['foo', 'B', 'C', 'A', 'D']]
+            expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']]
             assert_frame_equal(result, expected)
 
         # neg indices
@@ -824,7 +824,7 @@ def test_take(self):
 
             # axis = 1
             result = df.take(order, axis=1)
-            expected = df.ix[:, ['foo', 'B', 'D']]
+            expected = df.loc[:, ['foo', 'B', 'D']]
             assert_frame_equal(result, expected)
 
         # by dtype
@@ -837,7 +837,7 @@ def test_take(self):
 
             # axis = 1
             result = df.take(order, axis=1)
-            expected = df.ix[:, ['B', 'C', 'A', 'D']]
+            expected = df.loc[:, ['B', 'C', 'A', 'D']]
             assert_frame_equal(result, expected)
 
     def test_reindex_boolean(self):
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
index 706820b06b12e..33550670720c3 100644
--- a/pandas/tests/frame/test_block_internals.py
+++ b/pandas/tests/frame/test_block_internals.py
@@ -318,7 +318,7 @@ def test_copy_blocks(self):
         blocks = df.as_blocks()
         for dtype, _df in blocks.items():
             if column in _df:
-                _df.ix[:, column] = _df[column] + 1
+                _df.loc[:, column] = _df[column] + 1
 
         # make sure we did not change the original DataFrame
         self.assertFalse(_df[column].equals(df[column]))
@@ -332,7 +332,7 @@ def test_no_copy_blocks(self):
         blocks = df.as_blocks(copy=False)
         for dtype, _df in blocks.items():
             if column in _df:
-                _df.ix[:, column] = _df[column] + 1
+                _df.loc[:, column] = _df[column] + 1
 
         # make sure we did change the original DataFrame
         self.assertTrue(_df[column].equals(df[column]))
@@ -423,12 +423,12 @@ def test_get_numeric_data(self):
                        index=np.arange(10))
 
         result = df._get_numeric_data()
-        expected = df.ix[:, ['a', 'b', 'd', 'e', 'f']]
+        expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
         assert_frame_equal(result, expected)
 
-        only_obj = df.ix[:, ['c', 'g']]
+        only_obj = df.loc[:, ['c', 'g']]
         result = only_obj._get_numeric_data()
-        expected = df.ix[:, []]
+        expected = df.loc[:, []]
         assert_frame_equal(result, expected)
 
         df = DataFrame.from_dict(
@@ -457,7 +457,7 @@ def test_convert_objects(self):
         l = len(self.mixed_frame)
         self.mixed_frame['J'] = '1.'
         self.mixed_frame['K'] = '1'
-        self.mixed_frame.ix[0:5, ['J', 'K']] = 'garbled'
+        self.mixed_frame.loc[0:5, ['J', 'K']] = 'garbled'
         converted = self.mixed_frame._convert(datetime=True, numeric=True)
         self.assertEqual(converted['H'].dtype, 'float64')
         self.assertEqual(converted['I'].dtype, 'int64')
@@ -535,6 +535,6 @@ def test_strange_column_corruption_issue(self):
 
         myid = 100
 
-        first = len(df.ix[pd.isnull(df[myid]), [myid]])
-        second = len(df.ix[pd.isnull(df[myid]), [myid]])
+        first = len(df.loc[pd.isnull(df[myid]), [myid]])
+        second = len(df.loc[pd.isnull(df[myid]), [myid]])
         self.assertTrue(first == second == 0)
diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py
index c6b69dad3e6b5..71b6500e7184a 100644
--- a/pandas/tests/frame/test_combine_concat.py
+++ b/pandas/tests/frame/test_combine_concat.py
@@ -79,7 +79,7 @@ def test_append_series_dict(self):
         df = DataFrame(np.random.randn(5, 4),
                        columns=['foo', 'bar', 'baz', 'qux'])
 
-        series = df.ix[4]
+        series = df.loc[4]
         with assertRaisesRegexp(ValueError,
                                 'Indexes have overlapping values'):
             df.append(series, verify_integrity=True)
         series.name = None
@@ -99,10 +99,10 @@ def test_append_series_dict(self):
         result = df.append(series[::-1][:3], ignore_index=True)
         expected = df.append(DataFrame({0: series[::-1][:3]}).T,
                              ignore_index=True)
-        assert_frame_equal(result, expected.ix[:, result.columns])
+        assert_frame_equal(result, expected.loc[:, result.columns])
 
         # can append when name set
-        row = df.ix[4]
+        row = df.loc[4]
         row.name = 5
         result = df.append(row)
         expected = df.append(df[-1:], ignore_index=True)
@@ -534,9 +534,9 @@ def test_combine_first_mixed_bug(self):
         result = df.combine_first(other)
         assert_frame_equal(result, df)
 
-        df.ix[0, 'A'] = np.nan
+        df.loc[0, 'A'] = np.nan
         result = df.combine_first(other)
-        df.ix[0, 'A'] = 45
+        df.loc[0, 'A'] = 45
         assert_frame_equal(result, df)
 
         # doc example
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index bf0fabaf3e402..07cf6816330bc 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -96,8 +96,8 @@ def test_constructor_dtype_nocast_view(self):
 
     def test_constructor_dtype_list_data(self):
         df = DataFrame([[1, '2'],
                         [None, 'a']], dtype=object)
-        self.assertIsNone(df.ix[1, 0])
-        self.assertEqual(df.ix[0, 1], '2')
+        self.assertIsNone(df.loc[1, 0])
+        self.assertEqual(df.loc[0, 1], '2')
 
     def test_constructor_list_frames(self):
 
@@ -1147,7 +1147,7 @@ def test_constructor_from_items(self):
 
         # pass some columns
         recons = DataFrame.from_items(items, columns=['C', 'B', 'A'])
-        tm.assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']])
+        tm.assert_frame_equal(recons, self.frame.loc[:, ['C', 'B', 'A']])
 
         # orient='index'
 
@@ -1186,7 +1186,7 @@ def test_constructor_from_items(self):
     def test_constructor_mix_series_nonseries(self):
         df = DataFrame({'A': self.frame['A'],
                         'B': list(self.frame['B'])}, columns=['A', 'B'])
-        tm.assert_frame_equal(df, self.frame.ix[:, ['A', 'B']])
+        tm.assert_frame_equal(df, self.frame.loc[:, ['A', 'B']])
 
         with tm.assertRaisesRegexp(ValueError, 'does not match index length'):
             DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]})
diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py
index 02d288bdf6ea8..bc0a68f765903 100644
--- a/pandas/tests/frame/test_indexing.py
+++ b/pandas/tests/frame/test_indexing.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from __future__ import print_function
 
+from warnings import catch_warnings
 from datetime import datetime, date, timedelta, time
 
@@ -89,8 +90,8 @@ def test_get(self):
 
     def test_getitem_iterator(self):
         idx = iter(['A', 'B', 'C'])
-        result = self.frame.ix[:, idx]
-        expected = self.frame.ix[:, ['A', 'B', 'C']]
+        result = self.frame.loc[:, idx]
+        expected = self.frame.loc[:, ['A', 'B', 'C']]
         assert_frame_equal(result, expected)
 
         idx = iter(['A', 'B', 'C'])
@@ -104,7 +105,7 @@ def test_getitem_list(self):
         result = self.frame[['B', 'A']]
         result2 = self.frame[Index(['B', 'A'])]
 
-        expected = self.frame.ix[:, ['B', 'A']]
+        expected = self.frame.loc[:, ['B', 'A']]
         expected.columns.name = 'foo'
 
         assert_frame_equal(result, expected)
@@ -123,7 +124,7 @@ def test_getitem_list(self):
                                         ('peek', 'aboo')],
                                        name=['sth', 'sth2']))
 
         result = df[[('foo', 'bar'), ('baz', 'qux')]]
-        expected = df.ix[:, :2]
+        expected = df.iloc[:, :2]
         assert_frame_equal(result, expected)
         self.assertEqual(result.columns.names, ['sth', 'sth2'])
 
@@ -157,15 +158,15 @@ def test_setitem_list(self):
         data['A'] = range(len(data.index) - 1)
 
         df = DataFrame(0, lrange(3), ['tt1', 'tt2'], dtype=np.int_)
-        df.ix[1, ['tt1', 'tt2']] = [1, 2]
+        df.loc[1, ['tt1', 'tt2']] = [1, 2]
 
-        result = df.ix[1, ['tt1', 'tt2']]
+        result = df.loc[df.index[1], ['tt1', 'tt2']]
         expected = Series([1, 2], df.columns, dtype=np.int_, name=1)
         assert_series_equal(result, expected)
 
         df['tt1'] = df['tt2'] = '0'
-        df.ix[1, ['tt1', 'tt2']] = ['1', '2']
-        result = df.ix[1, ['tt1', 'tt2']]
+        df.loc[df.index[1], ['tt1', 'tt2']] = ['1', '2']
+        result = df.loc[df.index[1], ['tt1', 'tt2']]
         expected = Series(['1', '2'], df.columns, name=1)
         assert_series_equal(result, expected)
 
@@ -297,7 +298,7 @@ def test_getitem_boolean_casting(self):
         assert_series_equal(result, expected)
 
         # int block splitting
-        df.ix[1:3, ['E1', 'F1']] = 0
+        df.loc[df.index[1:3], ['E1', 'F1']] = 0
         casted = df[df > 0]
         result = casted.get_dtype_counts()
         expected = Series({'float64': 6, 'int32': 1, 'int64': 1})
@@ -318,7 +319,7 @@ def test_getitem_boolean_list(self):
 
         def _checkit(lst):
             result = df[lst]
-            expected = df.ix[df.index[lst]]
+            expected = df.loc[df.index[lst]]
             assert_frame_equal(result, expected)
 
         _checkit([True, False, True])
@@ -350,12 +351,13 @@ def test_getitem_ix_mixed_integer(self):
         df = DataFrame(np.random.randn(4, 3),
                        index=[1, 10, 'C', 'E'], columns=[1, 2, 3])
 
-        result = df.ix[:-1]
-        expected = df.ix[df.index[:-1]]
+        result = df.iloc[:-1]
+        expected = df.loc[df.index[:-1]]
         assert_frame_equal(result, expected)
 
-        result = df.ix[[1, 10]]
-        expected = df.ix[Index([1, 10], dtype=object)]
+        with catch_warnings(record=True):
+            result = df.ix[[1, 10]]
+            expected = df.ix[Index([1, 10], dtype=object)]
         assert_frame_equal(result, expected)
 
         # 11320
@@ -372,28 +374,35 @@ def test_getitem_ix_mixed_integer(self):
         assert_frame_equal(result, expected)
 
     def test_getitem_setitem_ix_negative_integers(self):
-        result = self.frame.ix[:, -1]
+        with catch_warnings(record=True):
+            result = self.frame.ix[:, -1]
         assert_series_equal(result, self.frame['D'])
 
-        result = self.frame.ix[:, [-1]]
+        with catch_warnings(record=True):
+            result = self.frame.ix[:, [-1]]
        assert_frame_equal(result, self.frame[['D']])
 
-        result = self.frame.ix[:, [-1, -2]]
+        with catch_warnings(record=True):
+            result = self.frame.ix[:, [-1, -2]]
         assert_frame_equal(result, self.frame[['D', 'C']])
 
-        self.frame.ix[:, [-1]] = 0
+        with catch_warnings(record=True):
+            self.frame.ix[:, [-1]] = 0
         self.assertTrue((self.frame['D'] == 0).all())
 
         df = DataFrame(np.random.randn(8, 4))
-        self.assertTrue(isnull(df.ix[:, [-1]].values).all())
+        with catch_warnings(record=True):
+            self.assertTrue(isnull(df.ix[:, [-1]].values).all())
 
         # #1942
         a = DataFrame(randn(20, 2), index=[chr(x + 65) for x in range(20)])
-        a.ix[-1] = a.ix[-2]
+        with catch_warnings(record=True):
+            a.ix[-1] = a.ix[-2]
 
-        assert_series_equal(a.ix[-1], a.ix[-2], check_names=False)
-        self.assertEqual(a.ix[-1].name, 'T')
-        self.assertEqual(a.ix[-2].name, 'S')
+        with catch_warnings(record=True):
+            assert_series_equal(a.ix[-1], a.ix[-2], check_names=False)
+            self.assertEqual(a.ix[-1].name, 'T')
+            self.assertEqual(a.ix[-2].name, 'S')
 
     def test_getattr(self):
         assert_series_equal(self.frame.A, self.frame['A'])
@@ -576,7 +585,7 @@ def test_setitem_boolean_column(self):
         expected = self.frame.copy()
         mask = self.frame['A'] > 0
 
-        self.frame.ix[mask, 'B'] = 0
+        self.frame.loc[mask, 'B'] = 0
         expected.values[mask.values, 1] = 0
 
         assert_frame_equal(self.frame, expected)
@@ -632,11 +641,11 @@ def test_setitem_corner2(self):
         df = DataFrame(data)
         ix = df[df['title'] == 'bar'].index
 
-        df.ix[ix, ['title']] = 'foobar'
-        df.ix[ix, ['cruft']] = 0
+        df.loc[ix, ['title']] = 'foobar'
+        df.loc[ix, ['cruft']] = 0
 
-        assert(df.ix[1, 'title'] == 'foobar')
-        assert(df.ix[1, 'cruft'] == 0)
+        self.assertEqual(df.loc[1, 'title'], 'foobar')
+        self.assertEqual(df.loc[1, 'cruft'], 0)
 
     def test_setitem_ambig(self):
         # difficulties with mixed-type data
@@ -671,7 +680,7 @@ def test_setitem_clear_caches(self):
         # cache it
         foo = df['z']
 
-        df.ix[2:, 'z'] = 42
+        df.loc[df.index[2:], 'z'] = 42
 
         expected = Series([np.nan, np.nan, 42, 42], index=df.index, name='z')
         self.assertIsNot(df['z'], foo)
@@ -728,72 +737,80 @@ def test_delitem_corner(self):
 
     def test_getitem_fancy_2d(self):
         f = self.frame
-        ix = f.ix
 
-        assert_frame_equal(ix[:, ['B', 'A']], f.reindex(columns=['B', 'A']))
+        with catch_warnings(record=True):
+            assert_frame_equal(f.ix[:, ['B', 'A']],
                               f.reindex(columns=['B', 'A']))
 
         subidx = self.frame.index[[5, 4, 1]]
-        assert_frame_equal(ix[subidx, ['B', 'A']],
-                           f.reindex(index=subidx, columns=['B', 'A']))
+        with catch_warnings(record=True):
+            assert_frame_equal(f.ix[subidx, ['B', 'A']],
                               f.reindex(index=subidx, columns=['B', 'A']))
 
         # slicing rows, etc.
-        assert_frame_equal(ix[5:10], f[5:10])
-        assert_frame_equal(ix[5:10, :], f[5:10])
-        assert_frame_equal(ix[:5, ['A', 'B']],
-                           f.reindex(index=f.index[:5], columns=['A', 'B']))
+        with catch_warnings(record=True):
+            assert_frame_equal(f.ix[5:10], f[5:10])
+            assert_frame_equal(f.ix[5:10, :], f[5:10])
+            assert_frame_equal(f.ix[:5, ['A', 'B']],
                               f.reindex(index=f.index[:5],
                                          columns=['A', 'B']))
 
         # slice rows with labels, inclusive!
-        expected = ix[5:11]
-        result = ix[f.index[5]:f.index[10]]
+        with catch_warnings(record=True):
+            expected = f.ix[5:11]
+            result = f.ix[f.index[5]:f.index[10]]
         assert_frame_equal(expected, result)
 
         # slice columns
-        assert_frame_equal(ix[:, :2], f.reindex(columns=['A', 'B']))
+        with catch_warnings(record=True):
+            assert_frame_equal(f.ix[:, :2], f.reindex(columns=['A', 'B']))
 
         # get view
-        exp = f.copy()
-        ix[5:10].values[:] = 5
-        exp.values[5:10] = 5
-        assert_frame_equal(f, exp)
+        with catch_warnings(record=True):
+            exp = f.copy()
+            f.ix[5:10].values[:] = 5
+            exp.values[5:10] = 5
+            assert_frame_equal(f, exp)
 
-        self.assertRaises(ValueError, ix.__getitem__, f > 0.5)
+        with catch_warnings(record=True):
+            self.assertRaises(ValueError, f.ix.__getitem__, f > 0.5)
 
     def test_slice_floats(self):
         index = [52195.504153, 52196.303147, 52198.369883]
         df = DataFrame(np.random.rand(3, 2), index=index)
 
-        s1 = df.ix[52195.1:52196.5]
+        s1 = df.loc[52195.1:52196.5]
         self.assertEqual(len(s1), 2)
 
-        s1 = df.ix[52195.1:52196.6]
+        s1 = df.loc[52195.1:52196.6]
         self.assertEqual(len(s1), 2)
 
-        s1 = df.ix[52195.1:52198.9]
+        s1 = df.loc[52195.1:52198.9]
         self.assertEqual(len(s1), 3)
 
     def test_getitem_fancy_slice_integers_step(self):
         df = DataFrame(np.random.randn(10, 5))
 
         # this is OK
-        result = df.ix[:8:2]  # noqa
-        df.ix[:8:2] = np.nan
-        self.assertTrue(isnull(df.ix[:8:2]).values.all())
+        result = df.iloc[:8:2]  # noqa
+        df.iloc[:8:2] = np.nan
+        self.assertTrue(isnull(df.iloc[:8:2]).values.all())
 
     def test_getitem_setitem_integer_slice_keyerrors(self):
         df = DataFrame(np.random.randn(10, 5), index=lrange(0, 20, 2))
 
         # this is OK
         cp = df.copy()
-        cp.ix[4:10] = 0
-        self.assertTrue((cp.ix[4:10] == 0).values.all())
+        cp.iloc[4:10] = 0
+        self.assertTrue((cp.iloc[4:10] == 0).values.all())
 
         # so is this
         cp = df.copy()
-        cp.ix[3:11] = 0
-        self.assertTrue((cp.ix[3:11] == 0).values.all())
+        cp.iloc[3:11] = 0
+        self.assertTrue((cp.iloc[3:11] == 0).values.all())
 
-        result = df.ix[4:10]
-        result2 = df.ix[3:11]
+        result = df.iloc[2:6]
+        result2 = df.loc[3:11]
         expected = df.reindex([4, 6, 8, 10])
 
         assert_frame_equal(result, expected)
@@ -801,18 +818,17 @@ def test_getitem_setitem_integer_slice_keyerrors(self):
 
         # non-monotonic, raise KeyError
         df2 = df.iloc[lrange(5) + lrange(5, 10)[::-1]]
-        self.assertRaises(KeyError, df2.ix.__getitem__, slice(3, 11))
-        self.assertRaises(KeyError, df2.ix.__setitem__, slice(3, 11), 0)
+        self.assertRaises(KeyError, df2.loc.__getitem__, slice(3, 11))
+        self.assertRaises(KeyError, df2.loc.__setitem__, slice(3, 11), 0)
 
     def test_setitem_fancy_2d(self):
-        f = self.frame
-
-        ix = f.ix  # noqa
 
         # case 1
         frame = self.frame.copy()
         expected = frame.copy()
-        frame.ix[:, ['B', 'A']] = 1
+
+        with catch_warnings(record=True):
+            frame.ix[:, ['B', 'A']] = 1
         expected['B'] = 1.
         expected['A'] = 1.
         assert_frame_equal(frame, expected)
@@ -826,11 +842,12 @@ def test_setitem_fancy_2d(self):
         subidx = self.frame.index[[5, 4, 1]]
         values = randn(3, 2)
 
-        frame.ix[subidx, ['B', 'A']] = values
-        frame2.ix[[5, 4, 1], ['B', 'A']] = values
+        with catch_warnings(record=True):
+            frame.ix[subidx, ['B', 'A']] = values
+            frame2.ix[[5, 4, 1], ['B', 'A']] = values
 
-        expected['B'].ix[subidx] = values[:, 0]
-        expected['A'].ix[subidx] = values[:, 1]
+            expected['B'].ix[subidx] = values[:, 0]
+            expected['A'].ix[subidx] = values[:, 1]
 
         assert_frame_equal(frame, expected)
         assert_frame_equal(frame2, expected)
 
         # case 3: slicing rows, etc.
         frame = self.frame.copy()
 
-        expected1 = self.frame.copy()
-        frame.ix[5:10] = 1.
-        expected1.values[5:10] = 1.
+        with catch_warnings(record=True):
+            expected1 = self.frame.copy()
+            frame.ix[5:10] = 1.
+            expected1.values[5:10] = 1.
         assert_frame_equal(frame, expected1)
 
-        expected2 = self.frame.copy()
-        arr = randn(5, len(frame.columns))
-        frame.ix[5:10] = arr
-        expected2.values[5:10] = arr
+        with catch_warnings(record=True):
+            expected2 = self.frame.copy()
+            arr = randn(5, len(frame.columns))
+            frame.ix[5:10] = arr
+            expected2.values[5:10] = arr
         assert_frame_equal(frame, expected2)
 
         # case 4
-        frame = self.frame.copy()
-        frame.ix[5:10, :] = 1.
-        assert_frame_equal(frame, expected1)
-        frame.ix[5:10, :] = arr
+        with catch_warnings(record=True):
+            frame = self.frame.copy()
+            frame.ix[5:10, :] = 1.
+            assert_frame_equal(frame, expected1)
+            frame.ix[5:10, :] = arr
         assert_frame_equal(frame, expected2)
 
         # case 5
-        frame = self.frame.copy()
-        frame2 = self.frame.copy()
+        with catch_warnings(record=True):
+            frame = self.frame.copy()
            frame2 = self.frame.copy()
 
-        expected = self.frame.copy()
-        values = randn(5, 2)
+            expected = self.frame.copy()
+            values = randn(5, 2)
 
-        frame.ix[:5, ['A', 'B']] = values
-        expected['A'][:5] = values[:, 0]
-        expected['B'][:5] = values[:, 1]
+            frame.ix[:5, ['A', 'B']] = values
+            expected['A'][:5] = values[:, 0]
+            expected['B'][:5] = values[:, 1]
         assert_frame_equal(frame, expected)
 
-        frame2.ix[:5, [0, 1]] = values
+        with catch_warnings(record=True):
+            frame2.ix[:5, [0, 1]] = values
         assert_frame_equal(frame2, expected)
 
         # case 6: slice rows with labels, inclusive!
-        frame = self.frame.copy()
-        expected = self.frame.copy()
+        with catch_warnings(record=True):
+            frame = self.frame.copy()
+            expected = self.frame.copy()
 
-        frame.ix[frame.index[5]:frame.index[10]] = 5.
-        expected.values[5:11] = 5
+            frame.ix[frame.index[5]:frame.index[10]] = 5.
+            expected.values[5:11] = 5
         assert_frame_equal(frame, expected)
 
         # case 7: slice columns
-        frame = self.frame.copy()
-        frame2 = self.frame.copy()
-        expected = self.frame.copy()
+        with catch_warnings(record=True):
+            frame = self.frame.copy()
+            frame2 = self.frame.copy()
+            expected = self.frame.copy()
 
-        # slice indices
-        frame.ix[:, 1:3] = 4.
-        expected.values[:, 1:3] = 4.
-        assert_frame_equal(frame, expected)
+            # slice indices
+            frame.ix[:, 1:3] = 4.
+            expected.values[:, 1:3] = 4.
+            assert_frame_equal(frame, expected)
 
-        # slice with labels
-        frame.ix[:, 'B':'C'] = 4.
-        assert_frame_equal(frame, expected)
+            # slice with labels
+            frame.ix[:, 'B':'C'] = 4.
+            assert_frame_equal(frame, expected)
 
         # new corner case of boolean slicing / setting
         frame = DataFrame(lzip([2, 3, 9, 6, 7], [np.nan] * 5),
@@ -904,12 +928,12 @@ def test_setitem_fancy_2d(self):
         assert_frame_equal(frame, expected)
 
     def test_fancy_getitem_slice_mixed(self):
-        sliced = self.mixed_frame.ix[:, -3:]
+        sliced = self.mixed_frame.iloc[:, -3:]
         self.assertEqual(sliced['D'].dtype, np.float64)
 
         # get view with single block
         # setting it triggers setting with copy
-        sliced = self.frame.ix[:, -3:]
+        sliced = self.frame.iloc[:, -3:]
 
         def f():
             sliced['C'] = 4.
@@ -921,21 +945,24 @@ def test_fancy_setitem_int_labels(self):
 
         df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2))
 
-        tmp = df.copy()
-        exp = df.copy()
-        tmp.ix[[0, 2, 4]] = 5
-        exp.values[:3] = 5
+        with catch_warnings(record=True):
+            tmp = df.copy()
+            exp = df.copy()
+            tmp.ix[[0, 2, 4]] = 5
+            exp.values[:3] = 5
         assert_frame_equal(tmp, exp)
 
-        tmp = df.copy()
-        exp = df.copy()
-        tmp.ix[6] = 5
-        exp.values[3] = 5
+        with catch_warnings(record=True):
+            tmp = df.copy()
+            exp = df.copy()
+            tmp.ix[6] = 5
+            exp.values[3] = 5
         assert_frame_equal(tmp, exp)
 
-        tmp = df.copy()
-        exp = df.copy()
-        tmp.ix[:, 2] = 5
+        with catch_warnings(record=True):
+            tmp = df.copy()
+            exp = df.copy()
+            tmp.ix[:, 2] = 5
 
         # tmp correctly sets the dtype
         # so match the exp way
@@ -945,133 +972,151 @@ def test_fancy_getitem_int_labels(self):
         df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2))
 
-        result = df.ix[[4, 2, 0], [2, 0]]
-        expected = df.reindex(index=[4, 2, 0], columns=[2, 0])
+        with catch_warnings(record=True):
+            result = df.ix[[4, 2, 0], [2, 0]]
+            expected = df.reindex(index=[4, 2, 0], columns=[2, 0])
         assert_frame_equal(result, expected)
 
-        result = df.ix[[4, 2, 0]]
-        expected = df.reindex(index=[4, 2, 0])
+        with catch_warnings(record=True):
+            result = df.ix[[4, 2, 0]]
+            expected = df.reindex(index=[4, 2, 0])
         assert_frame_equal(result, expected)
 
-        result = df.ix[4]
-        expected = df.xs(4)
+        with catch_warnings(record=True):
+            result = df.ix[4]
+            expected = df.xs(4)
         assert_series_equal(result, expected)
 
-        result = df.ix[:, 3]
-        expected = df[3]
+        with catch_warnings(record=True):
+            result = df.ix[:, 3]
+            expected = df[3]
         assert_series_equal(result, expected)
 
     def test_fancy_index_int_labels_exceptions(self):
         df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2))
 
-        # labels that aren't contained
-        self.assertRaises(KeyError, df.ix.__setitem__,
-                          ([0, 1, 2], [2, 3, 4]), 5)
+        with catch_warnings(record=True):
 
-        # try to set indices not contained in frame
-        self.assertRaises(KeyError,
-                          self.frame.ix.__setitem__,
-                          ['foo', 'bar', 'baz'], 1)
-        self.assertRaises(KeyError,
-                          self.frame.ix.__setitem__,
-                          (slice(None, None), ['E']), 1)
+            # labels that aren't contained
+            self.assertRaises(KeyError, df.ix.__setitem__,
                              ([0, 1, 2], [2, 3, 4]), 5)
 
+            # try to set indices not contained in frame
+            self.assertRaises(KeyError,
                              self.frame.ix.__setitem__,
                              ['foo', 'bar', 'baz'], 1)
            self.assertRaises(KeyError,
                              self.frame.ix.__setitem__,
                              (slice(None, None), ['E']), 1)
 
-        # partial setting now allows this GH2578
-        # self.assertRaises(KeyError,
-        #                  self.frame.ix.__setitem__,
-        #                  (slice(None, None), 'E'), 1)
+            # partial setting now allows this GH2578
+            # self.assertRaises(KeyError,
+            #                  self.frame.ix.__setitem__,
+            #                  (slice(None, None), 'E'), 1)
 
     def test_setitem_fancy_mixed_2d(self):
-        self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5
-        result = self.mixed_frame.ix[:5, ['C', 'B', 'A']]
-        self.assertTrue((result.values == 5).all())
 
-        self.mixed_frame.ix[5] = np.nan
-        self.assertTrue(isnull(self.mixed_frame.ix[5]).all())
+        with catch_warnings(record=True):
+            self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5
+            result = self.mixed_frame.ix[:5, ['C', 'B', 'A']]
+            self.assertTrue((result.values == 5).all())
+
+            self.mixed_frame.ix[5] = np.nan
+            self.assertTrue(isnull(self.mixed_frame.ix[5]).all())
 
-        self.mixed_frame.ix[5] = self.mixed_frame.ix[6]
-        assert_series_equal(self.mixed_frame.ix[5], self.mixed_frame.ix[6],
-                            check_names=False)
+            self.mixed_frame.ix[5] = self.mixed_frame.ix[6]
+            assert_series_equal(self.mixed_frame.ix[5],
                                self.mixed_frame.ix[6], check_names=False)
 
         # #1432
-        df = DataFrame({1: [1., 2., 3.],
-                        2: [3, 4, 5]})
-        self.assertTrue(df._is_mixed_type)
+        with catch_warnings(record=True):
+            df = DataFrame({1: [1., 2., 3.],
                            2: [3, 4, 5]})
            self.assertTrue(df._is_mixed_type)
 
-        df.ix[1] = [5, 10]
+            df.ix[1] = [5, 10]
 
-        expected = DataFrame({1: [1., 5., 3.],
-                              2: [3, 10, 5]})
+            expected = DataFrame({1: [1., 5., 3.],
                                  2: [3, 10, 5]})
 
-        assert_frame_equal(df, expected)
+            assert_frame_equal(df, expected)
 
     def test_ix_align(self):
         b = Series(randn(10), name=0).sort_values()
         df_orig = DataFrame(randn(10, 4))
         df = df_orig.copy()
 
-        df.ix[:, 0] = b
-        assert_series_equal(df.ix[:, 0].reindex(b.index), b)
-
-        dft = df_orig.T
-        dft.ix[0, :] = b
-        assert_series_equal(dft.ix[0, :].reindex(b.index), b)
-
-        df = df_orig.copy()
-        df.ix[:5, 0] = b
-        s = df.ix[:5, 0]
-        assert_series_equal(s, b.reindex(s.index))
-
-        dft = df_orig.T
-        dft.ix[0, :5] = b
-        s = dft.ix[0, :5]
-        assert_series_equal(s, b.reindex(s.index))
-
-        df = df_orig.copy()
-        idx = [0, 1, 3, 5]
-        df.ix[idx, 0] = b
-        s = df.ix[idx, 0]
-        assert_series_equal(s, b.reindex(s.index))
-
-        dft = df_orig.T
-        dft.ix[0, idx] = b
-        s = dft.ix[0, idx]
-        assert_series_equal(s, b.reindex(s.index))
+        with catch_warnings(record=True):
+            df.ix[:, 0] = b
+            assert_series_equal(df.ix[:, 0].reindex(b.index), b)
+
+        with catch_warnings(record=True):
+            dft = df_orig.T
+            dft.ix[0, :] = b
+            assert_series_equal(dft.ix[0, :].reindex(b.index), b)
+
+        with catch_warnings(record=True):
+            df = df_orig.copy()
+            df.ix[:5, 0] = b
+            s = df.ix[:5, 0]
+            assert_series_equal(s, b.reindex(s.index))
+
+        with catch_warnings(record=True):
+            dft = df_orig.T
+            dft.ix[0, :5] = b
+            s = dft.ix[0, :5]
+            assert_series_equal(s, b.reindex(s.index))
+
+        with catch_warnings(record=True):
+            df = df_orig.copy()
+            idx = [0, 1, 3, 5]
+            df.ix[idx, 0] = b
+            s = df.ix[idx, 0]
+            assert_series_equal(s, b.reindex(s.index))
+
+        with catch_warnings(record=True):
+            dft = df_orig.T
+            dft.ix[0, idx] = b
+            s = dft.ix[0, idx]
+            assert_series_equal(s, b.reindex(s.index))
 
     def test_ix_frame_align(self):
         b = DataFrame(np.random.randn(3, 4))
         df_orig = DataFrame(randn(10, 4))
         df = df_orig.copy()
 
-        df.ix[:3] = b
-        out = b.ix[:3]
-        assert_frame_equal(out, b)
+        with catch_warnings(record=True):
+            df.ix[:3] = b
+            out = b.ix[:3]
+            assert_frame_equal(out, b)
 
         b.sort_index(inplace=True)
 
-        df = df_orig.copy()
-        df.ix[[0, 1, 2]] = b
-        out = df.ix[[0, 1, 2]].reindex(b.index)
-        assert_frame_equal(out, b)
+        with catch_warnings(record=True):
+            df = df_orig.copy()
+            df.ix[[0, 1, 2]] = b
+            out = df.ix[[0, 1, 2]].reindex(b.index)
+            assert_frame_equal(out, b)
 
-        df = df_orig.copy()
-        df.ix[:3] = b
-        out = df.ix[:3]
-        assert_frame_equal(out, b.reindex(out.index))
+        with catch_warnings(record=True):
+            df = df_orig.copy()
+            df.ix[:3] = b
+            out = df.ix[:3]
+            assert_frame_equal(out, b.reindex(out.index))
 
     def test_getitem_setitem_non_ix_labels(self):
         df = tm.makeTimeDataFrame()
 
         start, end = df.index[[5, 10]]
 
-        result = df.ix[start:end]
+        result = df.loc[start:end]
         result2 = df[start:end]
         expected = df[5:11]
         assert_frame_equal(result, expected)
         assert_frame_equal(result2, expected)
 
         result = df.copy()
-        result.ix[start:end] = 0
+        result.loc[start:end] = 0
         result2 = df.copy()
         result2[start:end] = 0
         expected = df.copy()
@@ -1081,13 +1126,13 @@ def test_getitem_setitem_non_ix_labels(self):
 
     def test_ix_multi_take(self):
         df = DataFrame(np.random.randn(3, 2))
-        rs = df.ix[df.index == 0, :]
+        rs = 
df.loc[df.index == 0, :] xp = df.reindex([0]) assert_frame_equal(rs, xp) """ #1321 df = DataFrame(np.random.randn(3, 2)) - rs = df.ix[df.index==0, df.columns==1] + rs = df.loc[df.index==0, df.columns==1] xp = df.reindex([0], [1]) assert_frame_equal(rs, xp) """ @@ -1095,14 +1140,16 @@ def test_ix_multi_take(self): def test_ix_multi_take_nonint_index(self): df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], columns=['a', 'b']) - rs = df.ix[[0], [0]] + with catch_warnings(record=True): + rs = df.ix[[0], [0]] xp = df.reindex(['x'], columns=['a']) assert_frame_equal(rs, xp) def test_ix_multi_take_multiindex(self): df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], columns=[['a', 'b'], ['1', '2']]) - rs = df.ix[[0], [0]] + with catch_warnings(record=True): + rs = df.ix[[0], [0]] xp = df.reindex(['x'], columns=[('a', '1')]) assert_frame_equal(rs, xp) @@ -1110,55 +1157,66 @@ def test_ix_dup(self): idx = Index(['a', 'a', 'b', 'c', 'd', 'd']) df = DataFrame(np.random.randn(len(idx), 3), idx) - sub = df.ix[:'d'] - assert_frame_equal(sub, df) + with catch_warnings(record=True): + sub = df.ix[:'d'] + assert_frame_equal(sub, df) - sub = df.ix['a':'c'] - assert_frame_equal(sub, df.ix[0:4]) + with catch_warnings(record=True): + sub = df.ix['a':'c'] + assert_frame_equal(sub, df.ix[0:4]) - sub = df.ix['b':'d'] - assert_frame_equal(sub, df.ix[2:]) + with catch_warnings(record=True): + sub = df.ix['b':'d'] + assert_frame_equal(sub, df.ix[2:]) def test_getitem_fancy_1d(self): f = self.frame - ix = f.ix # return self if no slicing...for now - self.assertIs(ix[:, :], f) + with catch_warnings(record=True): + self.assertIs(f.ix[:, :], f) # low dimensional slice - xs1 = ix[2, ['C', 'B', 'A']] + with catch_warnings(record=True): + xs1 = f.ix[2, ['C', 'B', 'A']] xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) assert_series_equal(xs1, xs2) - ts1 = ix[5:10, 2] + with catch_warnings(record=True): + ts1 = f.ix[5:10, 2] ts2 = f[f.columns[2]][5:10] assert_series_equal(ts1, ts2) # positional xs - xs1 = ix[0] + with catch_warnings(record=True): + xs1 = f.ix[0] xs2 = f.xs(f.index[0]) assert_series_equal(xs1, xs2) - xs1 = ix[f.index[5]] + with catch_warnings(record=True): + xs1 = f.ix[f.index[5]] xs2 = f.xs(f.index[5]) assert_series_equal(xs1, xs2) # single column - assert_series_equal(ix[:, 'A'], f['A']) + with catch_warnings(record=True): + assert_series_equal(f.ix[:, 'A'], f['A']) # return view - exp = f.copy() - exp.values[5] = 4 - ix[5][:] = 4 + with catch_warnings(record=True): + exp = f.copy() + exp.values[5] = 4 + f.ix[5][:] = 4 assert_frame_equal(exp, f) - exp.values[:, 1] = 6 - ix[:, 1][:] = 6 + with catch_warnings(record=True): + exp.values[:, 1] = 6 + f.ix[:, 1][:] = 6 assert_frame_equal(exp, f) # slice of mixed-frame - xs = self.mixed_frame.ix[5] + with catch_warnings(record=True): + xs = self.mixed_frame.ix[5] exp = self.mixed_frame.xs(self.mixed_frame.index[5]) assert_series_equal(xs, exp) @@ -1168,52 +1226,60 @@ def test_setitem_fancy_1d(self): frame = self.frame.copy() expected = self.frame.copy() - frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.] + with catch_warnings(record=True): + frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.] expected['C'][2] = 1. expected['B'][2] = 2. expected['A'][2] = 3. assert_frame_equal(frame, expected) - frame2 = self.frame.copy() - frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] + with catch_warnings(record=True): + frame2 = self.frame.copy() + frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] 
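
The `with catch_warnings(record=True):` wrapper threaded through these tests captures the DeprecationWarning that every remaining `.ix` access now emits, so the legacy behaviour can still be asserted without noisy test output. A stand-in sketch of the mechanism (the `legacy_indexer` function is hypothetical, not the real indexer):

    import warnings

    def legacy_indexer():
        # stand-in for a deprecated access such as DataFrame.ix
        warnings.warn("ix is deprecated", DeprecationWarning)
        return 42

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = legacy_indexer()

    assert result == 42
    assert issubclass(caught[-1].category, DeprecationWarning)
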
assert_frame_equal(frame, expected) # case 2, set a section of a column frame = self.frame.copy() expected = self.frame.copy() - vals = randn(5) - expected.values[5:10, 2] = vals - frame.ix[5:10, 2] = vals + with catch_warnings(record=True): + vals = randn(5) + expected.values[5:10, 2] = vals + frame.ix[5:10, 2] = vals assert_frame_equal(frame, expected) - frame2 = self.frame.copy() - frame2.ix[5:10, 'B'] = vals + with catch_warnings(record=True): + frame2 = self.frame.copy() + frame2.ix[5:10, 'B'] = vals assert_frame_equal(frame, expected) # case 3: full xs frame = self.frame.copy() expected = self.frame.copy() - frame.ix[4] = 5. - expected.values[4] = 5. + with catch_warnings(record=True): + frame.ix[4] = 5. + expected.values[4] = 5. assert_frame_equal(frame, expected) - frame.ix[frame.index[4]] = 6. - expected.values[4] = 6. + with catch_warnings(record=True): + frame.ix[frame.index[4]] = 6. + expected.values[4] = 6. assert_frame_equal(frame, expected) # single column frame = self.frame.copy() expected = self.frame.copy() - frame.ix[:, 'A'] = 7. - expected['A'] = 7. + with catch_warnings(record=True): + frame.ix[:, 'A'] = 7. + expected['A'] = 7. assert_frame_equal(frame, expected) def test_getitem_fancy_scalar(self): f = self.frame - ix = f.ix + ix = f.loc + # individual value for col in f.columns: ts = f[col] @@ -1223,7 +1289,8 @@ def test_getitem_fancy_scalar(self): def test_setitem_fancy_scalar(self): f = self.frame expected = self.frame.copy() - ix = f.ix + ix = f.loc + # individual value for j, col in enumerate(f.columns): ts = f[col] # noqa @@ -1231,19 +1298,20 @@ def test_setitem_fancy_scalar(self): i = f.index.get_loc(idx) val = randn() expected.values[i, j] = val + ix[idx, col] = val assert_frame_equal(f, expected) def test_getitem_fancy_boolean(self): f = self.frame - ix = f.ix + ix = f.loc expected = f.reindex(columns=['B', 'D']) result = ix[:, [False, True, False, True]] assert_frame_equal(result, expected) expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) - result = ix[5:10, [False, True, False, True]] + result = ix[f.index[5:10], [False, True, False, True]] assert_frame_equal(result, expected) boolvec = f.index > f.index[7] @@ -1253,7 +1321,7 @@ def test_getitem_fancy_boolean(self): result = ix[boolvec, :] assert_frame_equal(result, expected) - result = ix[boolvec, 2:] + result = ix[boolvec, f.columns[2:]] expected = f.reindex(index=f.index[boolvec], columns=['C', 'D']) assert_frame_equal(result, expected) @@ -1264,27 +1332,27 @@ def test_setitem_fancy_boolean(self): expected = self.frame.copy() mask = frame['A'] > 0 - frame.ix[mask] = 0. + frame.loc[mask] = 0. expected.values[mask.values] = 0. assert_frame_equal(frame, expected) frame = self.frame.copy() expected = self.frame.copy() - frame.ix[mask, ['A', 'B']] = 0. + frame.loc[mask, ['A', 'B']] = 0. expected.values[mask.values, :2] = 0. 
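
The boolean-setitem conversions around here rely on `.loc` accepting a boolean Series for the row axis, optionally combined with a column list. A self-contained sketch (toy data, not the fixture):

    import pandas as pd

    df = pd.DataFrame({"A": [1.0, -2.0, 3.0], "B": [4.0, 5.0, -6.0]})
    mask = df["A"] > 0

    out = df.copy()
    out.loc[mask] = 0.0          # zero whole rows where the mask is True

    out2 = df.copy()
    out2.loc[mask, ["B"]] = 0.0  # or only selected columns on those rows
    assert (out2.loc[mask, "B"] == 0.0).all()
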
assert_frame_equal(frame, expected) def test_getitem_fancy_ints(self): - result = self.frame.ix[[1, 4, 7]] - expected = self.frame.ix[self.frame.index[[1, 4, 7]]] + result = self.frame.iloc[[1, 4, 7]] + expected = self.frame.loc[self.frame.index[[1, 4, 7]]] assert_frame_equal(result, expected) - result = self.frame.ix[:, [2, 0, 1]] - expected = self.frame.ix[:, self.frame.columns[[2, 0, 1]]] + result = self.frame.iloc[:, [2, 0, 1]] + expected = self.frame.loc[:, self.frame.columns[[2, 0, 1]]] assert_frame_equal(result, expected) def test_getitem_setitem_fancy_exceptions(self): - ix = self.frame.ix + ix = self.frame.iloc with assertRaisesRegexp(IndexingError, 'Too many indexers'): ix[:, :, :] @@ -1295,14 +1363,14 @@ def test_getitem_setitem_boolean_misaligned(self): # boolean index misaligned labels mask = self.frame['A'][::-1] > 1 - result = self.frame.ix[mask] - expected = self.frame.ix[mask[::-1]] + result = self.frame.loc[mask] + expected = self.frame.loc[mask[::-1]] assert_frame_equal(result, expected) cp = self.frame.copy() expected = self.frame.copy() - cp.ix[mask] = 0 - expected.ix[mask] = 0 + cp.loc[mask] = 0 + expected.loc[mask] = 0 assert_frame_equal(cp, expected) def test_getitem_setitem_boolean_multi(self): @@ -1311,41 +1379,41 @@ def test_getitem_setitem_boolean_multi(self): # get k1 = np.array([True, False, True]) k2 = np.array([False, True]) - result = df.ix[k1, k2] - expected = df.ix[[0, 2], [1]] + result = df.loc[k1, k2] + expected = df.loc[[0, 2], [1]] assert_frame_equal(result, expected) expected = df.copy() - df.ix[np.array([True, False, True]), - np.array([False, True])] = 5 - expected.ix[[0, 2], [1]] = 5 + df.loc[np.array([True, False, True]), + np.array([False, True])] = 5 + expected.loc[[0, 2], [1]] = 5 assert_frame_equal(df, expected) def test_getitem_setitem_float_labels(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.random.randn(5, 5), index=index) - result = df.ix[1.5:4] + result = df.loc[1.5:4] expected = df.reindex([1.5, 2, 3, 4]) assert_frame_equal(result, expected) self.assertEqual(len(result), 4) - result = df.ix[4:5] + result = df.loc[4:5] expected = df.reindex([4, 5]) # reindex with int assert_frame_equal(result, expected, check_index_type=False) self.assertEqual(len(result), 2) - result = df.ix[4:5] + result = df.loc[4:5] expected = df.reindex([4.0, 5.0]) # reindex with float assert_frame_equal(result, expected) self.assertEqual(len(result), 2) # loc_float changes this to work properly - result = df.ix[1:2] + result = df.loc[1:2] expected = df.iloc[0:2] assert_frame_equal(result, expected) - df.ix[1:2] = 0 + df.loc[1:2] = 0 result = df[1:2] self.assertTrue((result == 0).all().all()) @@ -1380,36 +1448,36 @@ def f(): self.assertTrue((cp.iloc[0:4] == df.iloc[0:4]).values.all()) # float slicing - result = df.ix[1.0:5] + result = df.loc[1.0:5] expected = df assert_frame_equal(result, expected) self.assertEqual(len(result), 5) - result = df.ix[1.1:5] + result = df.loc[1.1:5] expected = df.reindex([2.5, 3.5, 4.5, 5.0]) assert_frame_equal(result, expected) self.assertEqual(len(result), 4) - result = df.ix[4.51:5] + result = df.loc[4.51:5] expected = df.reindex([5.0]) assert_frame_equal(result, expected) self.assertEqual(len(result), 1) - result = df.ix[1.0:5.0] + result = df.loc[1.0:5.0] expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0]) assert_frame_equal(result, expected) self.assertEqual(len(result), 5) cp = df.copy() - cp.ix[1.0:5.0] = 0 - result = cp.ix[1.0:5.0] + cp.loc[1.0:5.0] = 0 + result = cp.loc[1.0:5.0] self.assertTrue((result == 
0).values.all()) def test_setitem_single_column_mixed(self): df = DataFrame(randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], columns=['foo', 'bar', 'baz']) df['str'] = 'qux' - df.ix[::2, 'str'] = nan + df.loc[df.index[::2], 'str'] = nan expected = np.array([nan, 'qux', nan, 'qux', nan], dtype=object) assert_almost_equal(df['str'].values, expected) @@ -1426,27 +1494,28 @@ def test_setitem_single_column_mixed_datetime(self): # set an allowable datetime64 type from pandas import tslib - df.ix['b', 'timestamp'] = tslib.iNaT - self.assertTrue(isnull(df.ix['b', 'timestamp'])) + df.loc['b', 'timestamp'] = tslib.iNaT + self.assertTrue(isnull(df.loc['b', 'timestamp'])) # allow this syntax - df.ix['c', 'timestamp'] = nan - self.assertTrue(isnull(df.ix['c', 'timestamp'])) + df.loc['c', 'timestamp'] = nan + self.assertTrue(isnull(df.loc['c', 'timestamp'])) # allow this syntax - df.ix['d', :] = nan - self.assertTrue(isnull(df.ix['c', :]).all() == False) # noqa + df.loc['d', :] = nan + self.assertTrue(isnull(df.loc['c', :]).all() == False) # noqa # as of GH 3216 this will now work! # try to set with a list like item # self.assertRaises( - # Exception, df.ix.__setitem__, ('d', 'timestamp'), [nan]) + # Exception, df.loc.__setitem__, ('d', 'timestamp'), [nan]) def test_setitem_frame(self): - piece = self.frame.ix[:2, ['A', 'B']] - self.frame.ix[-2:, ['A', 'B']] = piece.values - assert_almost_equal(self.frame.ix[-2:, ['A', 'B']].values, - piece.values) + piece = self.frame.loc[self.frame.index[:2], ['A', 'B']] + self.frame.loc[self.frame.index[-2]:, ['A', 'B']] = piece.values + result = self.frame.loc[self.frame.index[-2:], ['A', 'B']].values + expected = piece.values + assert_almost_equal(result, expected) # GH 3216 @@ -1455,8 +1524,8 @@ def test_setitem_frame(self): piece = DataFrame([[1., 2.], [3., 4.]], index=f.index[0:2], columns=['A', 'B']) key = (slice(None, 2), ['A', 'B']) - f.ix[key] = piece - assert_almost_equal(f.ix[0:2, ['A', 'B']].values, + f.loc[key] = piece + assert_almost_equal(f.loc[f.index[0:2], ['A', 'B']].values, piece.values) # rows unaligned @@ -1465,60 +1534,61 @@ def test_setitem_frame(self): index=list(f.index[0:2]) + ['foo', 'bar'], columns=['A', 'B']) key = (slice(None, 2), ['A', 'B']) - f.ix[key] = piece - assert_almost_equal(f.ix[0:2:, ['A', 'B']].values, + f.loc[key] = piece + assert_almost_equal(f.loc[f.index[0:2:], ['A', 'B']].values, piece.values[0:2]) # key is unaligned with values f = self.mixed_frame.copy() - piece = f.ix[:2, ['A']] + piece = f.loc[f.index[:2], ['A']] piece.index = f.index[-2:] key = (slice(-2, None), ['A', 'B']) - f.ix[key] = piece + f.loc[key] = piece piece['B'] = np.nan - assert_almost_equal(f.ix[-2:, ['A', 'B']].values, + assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, piece.values) # ndarray f = self.mixed_frame.copy() - piece = self.mixed_frame.ix[:2, ['A', 'B']] + piece = self.mixed_frame.loc[f.index[:2], ['A', 'B']] key = (slice(-2, None), ['A', 'B']) - f.ix[key] = piece.values - assert_almost_equal(f.ix[-2:, ['A', 'B']].values, + f.loc[key] = piece.values + assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, piece.values) # needs upcasting df = DataFrame([[1, 2, 'foo'], [3, 4, 'bar']], columns=['A', 'B', 'C']) df2 = df.copy() - df2.ix[:, ['A', 'B']] = df.ix[:, ['A', 'B']] + 0.5 + df2.loc[:, ['A', 'B']] = df.loc[:, ['A', 'B']] + 0.5 expected = df.reindex(columns=['A', 'B']) expected += 0.5 expected['C'] = df['C'] assert_frame_equal(df2, expected) def test_setitem_frame_align(self): - piece = self.frame.ix[:2, ['A', 'B']] + piece = 
self.frame.loc[self.frame.index[:2], ['A', 'B']] piece.index = self.frame.index[-2:] piece.columns = ['A', 'B'] - self.frame.ix[-2:, ['A', 'B']] = piece - assert_almost_equal(self.frame.ix[-2:, ['A', 'B']].values, - piece.values) + self.frame.loc[self.frame.index[-2:], ['A', 'B']] = piece + result = self.frame.loc[self.frame.index[-2:], ['A', 'B']].values + expected = piece.values + assert_almost_equal(result, expected) def test_getitem_setitem_ix_duplicates(self): # #1201 df = DataFrame(np.random.randn(5, 3), index=['foo', 'foo', 'bar', 'baz', 'bar']) - result = df.ix['foo'] + result = df.loc['foo'] expected = df[:2] assert_frame_equal(result, expected) - result = df.ix['bar'] - expected = df.ix[[2, 4]] + result = df.loc['bar'] + expected = df.iloc[[2, 4]] assert_frame_equal(result, expected) - result = df.ix['baz'] - expected = df.ix[3] + result = df.loc['baz'] + expected = df.iloc[3] assert_series_equal(result, expected) def test_getitem_ix_boolean_duplicates_multiple(self): @@ -1526,15 +1596,15 @@ def test_getitem_ix_boolean_duplicates_multiple(self): df = DataFrame(np.random.randn(5, 3), index=['foo', 'foo', 'bar', 'baz', 'bar']) - result = df.ix[['bar']] - exp = df.ix[[2, 4]] + result = df.loc[['bar']] + exp = df.iloc[[2, 4]] assert_frame_equal(result, exp) - result = df.ix[df[1] > 0] + result = df.loc[df[1] > 0] exp = df[df[1] > 0] assert_frame_equal(result, exp) - result = df.ix[df[0] > 0] + result = df.loc[df[0] > 0] exp = df[df[0] > 0] assert_frame_equal(result, exp) @@ -1542,11 +1612,11 @@ def test_getitem_setitem_ix_bool_keyerror(self): # #2199 df = DataFrame({'a': [1, 2, 3]}) - self.assertRaises(KeyError, df.ix.__getitem__, False) - self.assertRaises(KeyError, df.ix.__getitem__, True) + self.assertRaises(KeyError, df.loc.__getitem__, False) + self.assertRaises(KeyError, df.loc.__getitem__, True) - self.assertRaises(KeyError, df.ix.__setitem__, False, 0) - self.assertRaises(KeyError, df.ix.__setitem__, True, 0) + self.assertRaises(KeyError, df.loc.__setitem__, False, 0) + self.assertRaises(KeyError, df.loc.__setitem__, True, 0) def test_getitem_list_duplicates(self): # #1943 @@ -1556,7 +1626,7 @@ def test_getitem_list_duplicates(self): result = df[['B', 'C']] self.assertEqual(result.columns.name, 'foo') - expected = df.ix[:, 2:] + expected = df.iloc[:, 2:] assert_frame_equal(result, expected) def test_get_value(self): @@ -1669,8 +1739,9 @@ def test_single_element_ix_dont_upcast(self): self.assertTrue(issubclass(self.frame['E'].dtype.type, (int, np.integer))) - result = self.frame.ix[self.frame.index[5], 'E'] - self.assertTrue(is_integer(result)) + with catch_warnings(record=True): + result = self.frame.ix[self.frame.index[5], 'E'] + self.assertTrue(is_integer(result)) result = self.frame.loc[self.frame.index[5], 'E'] self.assertTrue(is_integer(result)) @@ -1679,13 +1750,15 @@ def test_single_element_ix_dont_upcast(self): df = pd.DataFrame(dict(a=[1.23])) df["b"] = 666 - result = df.ix[0, "b"] + with catch_warnings(record=True): + result = df.ix[0, "b"] self.assertTrue(is_integer(result)) result = df.loc[0, "b"] self.assertTrue(is_integer(result)) expected = Series([666], [0], name='b') - result = df.ix[[0], "b"] + with catch_warnings(record=True): + result = df.ix[[0], "b"] assert_series_equal(result, expected) result = df.loc[[0], "b"] assert_series_equal(result, expected) @@ -1698,16 +1771,16 @@ def test_irow(self): df.irow(1) result = df.iloc[1] - exp = df.ix[2] + exp = df.loc[2] assert_series_equal(result, exp) result = df.iloc[2] - exp = df.ix[4] + exp = df.loc[4] 
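
The pairs above (`iloc[1]` against `loc[2]`, `iloc[2]` against `loc[4]`) only line up because the frame's index is `range(0, 20, 2)`, so position i carries label 2*i. Making that choice explicit is the whole point of retiring `.ix`, which had to guess between the two. A sketch of the distinction (toy frame):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(10).reshape(5, 2), index=[0, 2, 4, 6, 8])

    by_position = df.iloc[2]  # third row
    by_label = df.loc[4]      # row labelled 4, i.e. the same row here

    assert (by_position == by_label).all()
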
assert_series_equal(result, exp) # slice result = df.iloc[slice(4, 8)] - expected = df.ix[8:14] + expected = df.loc[8:14] assert_frame_equal(result, expected) # verify slice is view @@ -1733,16 +1806,16 @@ def test_icol(self): df.icol(1) result = df.iloc[:, 1] - exp = df.ix[:, 2] + exp = df.loc[:, 2] assert_series_equal(result, exp) result = df.iloc[:, 2] - exp = df.ix[:, 4] + exp = df.loc[:, 4] assert_series_equal(result, exp) # slice result = df.iloc[:, slice(4, 8)] - expected = df.ix[:, 8:14] + expected = df.loc[:, 8:14] assert_frame_equal(result, expected) # verify slice is view @@ -1764,13 +1837,15 @@ def test_irow_icol_duplicates(self): index=list('aab')) result = df.iloc[0] - result2 = df.ix[0] + with catch_warnings(record=True): + result2 = df.ix[0] tm.assertIsInstance(result, Series) assert_almost_equal(result.values, df.values[0]) assert_series_equal(result, result2) - result = df.T.iloc[:, 0] - result2 = df.T.ix[:, 0] + with catch_warnings(record=True): + result = df.T.iloc[:, 0] + result2 = df.T.ix[:, 0] tm.assertIsInstance(result, Series) assert_almost_equal(result.values, df.values[0]) assert_series_equal(result, result2) @@ -1780,16 +1855,19 @@ def test_irow_icol_duplicates(self): columns=[['i', 'i', 'j'], ['A', 'A', 'B']], index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) - rs = df.iloc[0] - xp = df.ix[0] + with catch_warnings(record=True): + rs = df.iloc[0] + xp = df.ix[0] assert_series_equal(rs, xp) - rs = df.iloc[:, 0] - xp = df.T.ix[0] + with catch_warnings(record=True): + rs = df.iloc[:, 0] + xp = df.T.ix[0] assert_series_equal(rs, xp) - rs = df.iloc[:, [0]] - xp = df.ix[:, [0]] + with catch_warnings(record=True): + rs = df.iloc[:, [0]] + xp = df.ix[:, [0]] assert_frame_equal(rs, xp) # #2259 @@ -1956,7 +2034,8 @@ def test_getitem_ix_float_duplicates(self): index=[0.1, 0.2, 0.2], columns=list('abc')) expect = df.iloc[1:] assert_frame_equal(df.loc[0.2], expect) - assert_frame_equal(df.ix[0.2], expect) + with catch_warnings(record=True): + assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] assert_series_equal(df.loc[0.2, 'a'], expect) @@ -1964,7 +2043,8 @@ def test_getitem_ix_float_duplicates(self): df.index = [1, 0.2, 0.2] expect = df.iloc[1:] assert_frame_equal(df.loc[0.2], expect) - assert_frame_equal(df.ix[0.2], expect) + with catch_warnings(record=True): + assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] assert_series_equal(df.loc[0.2, 'a'], expect) @@ -1973,7 +2053,8 @@ def test_getitem_ix_float_duplicates(self): index=[1, 0.2, 0.2, 1], columns=list('abc')) expect = df.iloc[1:-1] assert_frame_equal(df.loc[0.2], expect) - assert_frame_equal(df.ix[0.2], expect) + with catch_warnings(record=True): + assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:-1, 0] assert_series_equal(df.loc[0.2, 'a'], expect) @@ -1981,7 +2062,8 @@ def test_getitem_ix_float_duplicates(self): df.index = [0.1, 0.2, 2, 0.2] expect = df.iloc[[1, -1]] assert_frame_equal(df.loc[0.2], expect) - assert_frame_equal(df.ix[0.2], expect) + with catch_warnings(record=True): + assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[[1, -1], 0] assert_series_equal(df.loc[0.2, 'a'], expect) @@ -2033,11 +2115,13 @@ def test_setitem_datetimelike_with_inference(self): df['A'] = np.array([1 * one_hour] * 4, dtype='m8[ns]') df.loc[:, 'B'] = np.array([2 * one_hour] * 4, dtype='m8[ns]') df.loc[:3, 'C'] = np.array([3 * one_hour] * 3, dtype='m8[ns]') - df.ix[:, 'D'] = np.array([4 * one_hour] * 4, dtype='m8[ns]') - df.ix[:3, 'E'] = np.array([5 * one_hour] * 3, dtype='m8[ns]') + df.loc[:, 'D'] = 
np.array([4 * one_hour] * 4, dtype='m8[ns]') + df.loc[df.index[:3], 'E'] = np.array([5 * one_hour] * 3, + dtype='m8[ns]') df['F'] = np.timedelta64('NaT') - df.ix[:-1, 'F'] = np.array([6 * one_hour] * 3, dtype='m8[ns]') - df.ix[-3:, 'G'] = date_range('20130101', periods=3) + df.loc[df.index[:-1], 'F'] = np.array([6 * one_hour] * 3, + dtype='m8[ns]') + df.loc[df.index[-3]:, 'G'] = date_range('20130101', periods=3) df['H'] = np.datetime64('NaT') result = df.dtypes expected = Series([np.dtype('timedelta64[ns]')] * 6 + @@ -2054,41 +2138,41 @@ def test_at_time_between_time_datetimeindex(self): binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172] result = df.at_time(akey) - expected = df.ix[akey] - expected2 = df.ix[ainds] + expected = df.loc[akey] + expected2 = df.iloc[ainds] assert_frame_equal(result, expected) assert_frame_equal(result, expected2) self.assertEqual(len(result), 4) result = df.between_time(bkey.start, bkey.stop) - expected = df.ix[bkey] - expected2 = df.ix[binds] + expected = df.loc[bkey] + expected2 = df.iloc[binds] assert_frame_equal(result, expected) assert_frame_equal(result, expected2) self.assertEqual(len(result), 12) result = df.copy() - result.ix[akey] = 0 - result = result.ix[akey] - expected = df.ix[akey].copy() - expected.ix[:] = 0 + result.loc[akey] = 0 + result = result.loc[akey] + expected = df.loc[akey].copy() + expected.loc[:] = 0 assert_frame_equal(result, expected) result = df.copy() - result.ix[akey] = 0 - result.ix[akey] = df.ix[ainds] + result.loc[akey] = 0 + result.loc[akey] = df.iloc[ainds] assert_frame_equal(result, df) result = df.copy() - result.ix[bkey] = 0 - result = result.ix[bkey] - expected = df.ix[bkey].copy() - expected.ix[:] = 0 + result.loc[bkey] = 0 + result = result.loc[bkey] + expected = df.loc[bkey].copy() + expected.loc[:] = 0 assert_frame_equal(result, expected) result = df.copy() - result.ix[bkey] = 0 - result.ix[bkey] = df.ix[binds] + result.loc[bkey] = 0 + result.loc[bkey] = df.iloc[binds] assert_frame_equal(result, df) def test_xs(self): @@ -2183,7 +2267,8 @@ def test_index_namedtuple(self): name="composite_index", tupleize_cols=False) df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) - result = df.ix[IndexType("foo", "bar")]["A"] + with catch_warnings(record=True): + result = df.ix[IndexType("foo", "bar")]["A"] self.assertEqual(result, 1) result = df.loc[IndexType("foo", "bar")]["A"] @@ -2278,7 +2363,7 @@ def _check_get(df, cond, check_dtypes=True): df = DataFrame(dict([(c, Series([1] * 3, dtype=c)) for c in ['int64', 'int32', 'float32', 'float64']])) - df.ix[1, :] = 0 + df.iloc[1, :] = 0 result = df.where(df >= 0).get_dtype_counts() # when we don't preserve boolean casts @@ -2338,7 +2423,7 @@ def _check_align(df, cond, other, check_dtypes=True): err1 = (df + 1).values[0:2, :] self.assertRaises(ValueError, df.where, cond, err1) - err2 = cond.ix[:2, :].values + err2 = cond.iloc[:2, :].values other1 = _safe_add(df) self.assertRaises(ValueError, df.where, err2, other1) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index b739d087b8e93..f5719fa1d8b85 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -47,10 +47,10 @@ def test_getitem_pop_assign_name(self): s = self.frame.pop('A') self.assertEqual(s.name, 'A') - s = self.frame.ix[:, 'B'] + s = self.frame.loc[:, 'B'] self.assertEqual(s.name, 'B') - s2 = s.ix[:] + s2 = s.loc[:] self.assertEqual(s2.name, 'B') def test_get_value(self): @@ -105,8 +105,8 @@ def test_join_index(self): 
self.frame.join(self.frame, how=how) def test_join_index_more(self): - af = self.frame.ix[:, ['A', 'B']] - bf = self.frame.ix[::2, ['C', 'D']] + af = self.frame.loc[:, ['A', 'B']] + bf = self.frame.loc[::2, ['C', 'D']] expected = af.copy() expected['C'] = self.frame['C'][::2] @@ -119,7 +119,7 @@ def test_join_index_more(self): assert_frame_equal(result, expected[::2]) result = bf.join(af, how='right') - assert_frame_equal(result, expected.ix[:, result.columns]) + assert_frame_equal(result, expected.loc[:, result.columns]) def test_join_index_series(self): df = self.frame.copy() @@ -133,18 +133,18 @@ def test_join_index_series(self): assertRaisesRegexp(ValueError, 'must have a name', df.join, s) def test_join_overlap(self): - df1 = self.frame.ix[:, ['A', 'B', 'C']] - df2 = self.frame.ix[:, ['B', 'C', 'D']] + df1 = self.frame.loc[:, ['A', 'B', 'C']] + df2 = self.frame.loc[:, ['B', 'C', 'D']] joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') - df1_suf = df1.ix[:, ['B', 'C']].add_suffix('_df1') - df2_suf = df2.ix[:, ['B', 'C']].add_suffix('_df2') + df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') + df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') - no_overlap = self.frame.ix[:, ['A', 'D']] + no_overlap = self.frame.loc[:, ['A', 'D']] expected = df1_suf.join(df2_suf).join(no_overlap) # column order not necessarily sorted - assert_frame_equal(joined, expected.ix[:, joined.columns]) + assert_frame_equal(joined, expected.loc[:, joined.columns]) def test_add_prefix_suffix(self): with_prefix = self.frame.add_prefix('foo#') @@ -258,7 +258,7 @@ def test_itertuples(self): for i, tup in enumerate(self.frame.itertuples()): s = Series(tup[1:]) s.name = tup[0] - expected = self.frame.ix[i, :].reset_index(drop=True) + expected = self.frame.iloc[i, :].reset_index(drop=True) assert_series_equal(s, expected) df = DataFrame({'floats': np.random.randn(5), diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index d7466f5ede06f..c4f037e85edf6 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -83,14 +83,14 @@ def test_dropna(self): df[2][:2] = nan dropped = df.dropna(axis=1) - expected = df.ix[:, [0, 1, 3]] + expected = df.loc[:, [0, 1, 3]] inp = df.copy() inp.dropna(axis=1, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=0) - expected = df.ix[lrange(2, 6)] + expected = df.loc[lrange(2, 6)] inp = df.copy() inp.dropna(axis=0, inplace=True) assert_frame_equal(dropped, expected) @@ -98,14 +98,14 @@ def test_dropna(self): # threshold dropped = df.dropna(axis=1, thresh=5) - expected = df.ix[:, [0, 1, 3]] + expected = df.loc[:, [0, 1, 3]] inp = df.copy() inp.dropna(axis=1, thresh=5, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=0, thresh=4) - expected = df.ix[lrange(2, 6)] + expected = df.loc[lrange(2, 6)] inp = df.copy() inp.dropna(axis=0, thresh=4, inplace=True) assert_frame_equal(dropped, expected) @@ -130,7 +130,7 @@ def test_dropna(self): df[2] = nan dropped = df.dropna(axis=1, how='all') - expected = df.ix[:, [0, 1, 3]] + expected = df.loc[:, [0, 1, 3]] assert_frame_equal(dropped, expected) # bad input @@ -177,19 +177,23 @@ def test_dropna_multiple_axes(self): assert_frame_equal(inp, expected) def test_fillna(self): - self.tsframe.ix[:5, 'A'] = nan - self.tsframe.ix[-5:, 'A'] = nan + tf = self.tsframe + tf.loc[tf.index[:5], 'A'] = nan + tf.loc[tf.index[-5:], 'A'] = nan zero_filled = self.tsframe.fillna(0) 
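
`tf.loc[tf.index[:5], 'A']` is this patch's standard translation for positional row slices that used to go through `.ix` on a non-integer index: materialise the labels at those positions first, then index by label. A sketch of the idiom (hypothetical frame, not the tsframe fixture):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": np.arange(6.0)},
                      index=pd.date_range("2000-01-01", periods=6))

    # positional intent, spelled with labels
    df.loc[df.index[:3], "A"] = np.nan

    assert df["A"].isnull().sum() == 3
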
- self.assertTrue((zero_filled.ix[:5, 'A'] == 0).all()) + self.assertTrue((zero_filled.loc[zero_filled.index[:5], 'A'] == 0 + ).all()) padded = self.tsframe.fillna(method='pad') - self.assertTrue(np.isnan(padded.ix[:5, 'A']).all()) - self.assertTrue((padded.ix[-5:, 'A'] == padded.ix[-5, 'A']).all()) + self.assertTrue(np.isnan(padded.loc[padded.index[:5], 'A']).all()) + self.assertTrue((padded.loc[padded.index[-5:], 'A'] == + padded.loc[padded.index[-5], 'A']).all()) # mixed type - self.mixed_frame.ix[5:20, 'foo'] = nan - self.mixed_frame.ix[-10:, 'A'] = nan + mf = self.mixed_frame + mf.loc[mf.index[5:20], 'foo'] = nan + mf.loc[mf.index[-10:], 'A'] = nan result = self.mixed_frame.fillna(value=0) result = self.mixed_frame.fillna(method='pad') @@ -198,7 +202,7 @@ def test_fillna(self): # mixed numeric (but no float16) mf = self.mixed_float.reindex(columns=['A', 'B', 'D']) - mf.ix[-10:, 'A'] = nan + mf.loc[mf.index[-10:], 'A'] = nan result = mf.fillna(value=0) _check_mixed_float(result, dtype=dict(C=None)) @@ -243,7 +247,8 @@ def test_fillna(self): }) expected = df.copy() - expected['Date'] = expected['Date'].fillna(df.ix[0, 'Date2']) + expected['Date'] = expected['Date'].fillna( + df.loc[df.index[0], 'Date2']) result = df.fillna(value={'Date': df['Date2']}) assert_frame_equal(result, expected) @@ -425,11 +430,12 @@ def test_fillna_col_reordering(self): self.assertEqual(df.columns.tolist(), filled.columns.tolist()) def test_fill_corner(self): - self.mixed_frame.ix[5:20, 'foo'] = nan - self.mixed_frame.ix[-10:, 'A'] = nan + mf = self.mixed_frame + mf.loc[mf.index[5:20], 'foo'] = nan + mf.loc[mf.index[-10:], 'A'] = nan filled = self.mixed_frame.fillna(value=0) - self.assertTrue((filled.ix[5:20, 'foo'] == 0).all()) + self.assertTrue((filled.loc[filled.index[5:20], 'foo'] == 0).all()) del self.mixed_frame['foo'] empty_float = self.frame.reindex(columns=[]) @@ -543,8 +549,8 @@ def test_interp_alt_scipy(self): 'C': [1, 2, 3, 5, 8, 13, 21]}) result = df.interpolate(method='barycentric') expected = df.copy() - expected.ix[2, 'A'] = 3 - expected.ix[5, 'A'] = 6 + expected.loc[2, 'A'] = 3 + expected.loc[5, 'A'] = 6 assert_frame_equal(result, expected) result = df.interpolate(method='barycentric', downcast='infer') @@ -558,12 +564,12 @@ def test_interp_alt_scipy(self): _skip_if_no_pchip() import scipy result = df.interpolate(method='pchip') - expected.ix[2, 'A'] = 3 + expected.loc[2, 'A'] = 3 if LooseVersion(scipy.__version__) >= '0.17.0': - expected.ix[5, 'A'] = 6.0 + expected.loc[5, 'A'] = 6.0 else: - expected.ix[5, 'A'] = 6.125 + expected.loc[5, 'A'] = 6.125 assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 220d29f624942..835c18ffc6081 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -353,13 +353,13 @@ def check(result, expected=None): index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'C', 'D', 'E']) z = df[['A', 'C', 'A']].copy() - expected = z.ix[['a', 'c', 'a']] + expected = z.loc[['a', 'c', 'a']] df = DataFrame(np.arange(25.).reshape(5, 5), index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'C', 'D', 'E']) z = df[['A', 'C', 'A']] - result = z.ix[['a', 'c', 'a']] + result = z.loc[['a', 'c', 'a']] check(result, expected) def test_column_dups_indexing2(self): diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index f7e821c0434aa..ef6297fc0b52e 100644 --- a/pandas/tests/frame/test_operators.py +++ 
b/pandas/tests/frame/test_operators.py @@ -570,7 +570,7 @@ def test_bool_flex_frame(self): # Unaligned def _check_unaligned_frame(meth, op, df, other): - part_o = other.ix[3:, 1:].copy() + part_o = other.loc[3:, 1:].copy() rs = meth(part_o) xp = op(df, part_o.reindex(index=df.index, columns=df.columns)) assert_frame_equal(rs, xp) @@ -635,19 +635,19 @@ def _test_seq(df, idx_ser, col_ser): _test_seq(df, idx_ser.values, col_ser.values) # NA - df.ix[0, 0] = np.nan + df.loc[0, 0] = np.nan rs = df.eq(df) - self.assertFalse(rs.ix[0, 0]) + self.assertFalse(rs.loc[0, 0]) rs = df.ne(df) - self.assertTrue(rs.ix[0, 0]) + self.assertTrue(rs.loc[0, 0]) rs = df.gt(df) - self.assertFalse(rs.ix[0, 0]) + self.assertFalse(rs.loc[0, 0]) rs = df.lt(df) - self.assertFalse(rs.ix[0, 0]) + self.assertFalse(rs.loc[0, 0]) rs = df.ge(df) - self.assertFalse(rs.ix[0, 0]) + self.assertFalse(rs.loc[0, 0]) rs = df.le(df) - self.assertFalse(rs.ix[0, 0]) + self.assertFalse(rs.loc[0, 0]) # complex arr = np.array([np.nan, 1, 6, np.nan]) @@ -941,12 +941,12 @@ def test_comparison_protected_from_errstate(self): def test_string_comparison(self): df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) mask_a = df.a > 1 - assert_frame_equal(df[mask_a], df.ix[1:1, :]) - assert_frame_equal(df[-mask_a], df.ix[0:0, :]) + assert_frame_equal(df[mask_a], df.loc[1:1, :]) + assert_frame_equal(df[-mask_a], df.loc[0:0, :]) mask_b = df.b == "foo" - assert_frame_equal(df[mask_b], df.ix[0:0, :]) - assert_frame_equal(df[-mask_b], df.ix[1:1, :]) + assert_frame_equal(df[mask_b], df.loc[0:0, :]) + assert_frame_equal(df[-mask_b], df.loc[1:1, :]) def test_float_none_comparison(self): df = DataFrame(np.random.randn(8, 3), index=lrange(8), @@ -1094,17 +1094,18 @@ def test_combineMult(self): def test_combine_generic(self): df1 = self.frame - df2 = self.frame.ix[:-5, ['A', 'B', 'C']] + df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']] combined = df1.combine(df2, np.add) combined2 = df2.combine(df1, np.add) self.assertTrue(combined['D'].isnull().all()) self.assertTrue(combined2['D'].isnull().all()) - chunk = combined.ix[:-5, ['A', 'B', 'C']] - chunk2 = combined2.ix[:-5, ['A', 'B', 'C']] + chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']] + chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']] - exp = self.frame.ix[:-5, ['A', 'B', 'C']].reindex_like(chunk) * 2 + exp = self.frame.loc[self.frame.index[:-5], + ['A', 'B', 'C']].reindex_like(chunk) * 2 assert_frame_equal(chunk, exp) assert_frame_equal(chunk2, exp) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 3bc388da5bec8..adc7af225588c 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -37,8 +37,9 @@ def test_replace_inplace(self): self.assertRaises(TypeError, self.tsframe.replace, nan) # mixed type - self.mixed_frame.ix[5:20, 'foo'] = nan - self.mixed_frame.ix[-10:, 'A'] = nan + mf = self.mixed_frame + mf.iloc[5:20, mf.columns.get_loc('foo')] = nan + mf.iloc[-10:, mf.columns.get_loc('A')] = nan result = self.mixed_frame.replace(np.nan, 0) expected = self.mixed_frame.fillna(value=0) @@ -639,8 +640,9 @@ def test_replace_convert(self): assert_series_equal(expec, res) def test_replace_mixed(self): - self.mixed_frame.ix[5:20, 'foo'] = nan - self.mixed_frame.ix[-10:, 'A'] = nan + mf = self.mixed_frame + mf.iloc[5:20, mf.columns.get_loc('foo')] = nan + mf.iloc[-10:, mf.columns.get_loc('A')] = nan result = self.mixed_frame.replace(np.nan, -18) expected = self.mixed_frame.fillna(value=-18) diff --git 
a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 6b0dd38cdb82c..705270b695b77 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -659,7 +659,7 @@ def verify(df): right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - left = df.ix[17264:].copy().set_index(['s_id', 'dosage', 'agent']) + left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent']) assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 2abef59df284d..bbd8dd9b48b5c 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -28,7 +28,7 @@ def test_sort_index(self): columns=['A', 'B', 'C', 'D']) # axis=0 : sort rows by index labels - unordered = frame.ix[[3, 2, 4, 1]] + unordered = frame.loc[[3, 2, 4, 1]] result = unordered.sort_index(axis=0) expected = frame assert_frame_equal(result, expected) @@ -38,12 +38,12 @@ def test_sort_index(self): assert_frame_equal(result, expected) # axis=1 : sort columns by column names - unordered = frame.ix[:, [2, 1, 3, 0]] + unordered = frame.iloc[:, [2, 1, 3, 0]] result = unordered.sort_index(axis=1) assert_frame_equal(result, frame) result = unordered.sort_index(axis=1, ascending=False) - expected = frame.ix[:, ::-1] + expected = frame.iloc[:, ::-1] assert_frame_equal(result, expected) def test_sort_index_multiindex(self): @@ -79,12 +79,12 @@ def test_sort_values(self): # by column (axis=0) sorted_df = frame.sort_values(by='A') indexer = frame['A'].argsort().values - expected = frame.ix[frame.index[indexer]] + expected = frame.loc[frame.index[indexer]] assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by='A', ascending=False) indexer = indexer[::-1] - expected = frame.ix[frame.index[indexer]] + expected = frame.loc[frame.index[indexer]] assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by='A', ascending=False) @@ -336,7 +336,7 @@ def test_sort_index_inplace(self): columns=['A', 'B', 'C', 'D']) # axis=0 - unordered = frame.ix[[3, 2, 4, 1]] + unordered = frame.loc[[3, 2, 4, 1]] a_id = id(unordered['A']) df = unordered.copy() df.sort_index(inplace=True) @@ -350,7 +350,7 @@ def test_sort_index_inplace(self): assert_frame_equal(df, expected) # axis=1 - unordered = frame.ix[:, ['D', 'B', 'C', 'A']] + unordered = frame.loc[:, ['D', 'B', 'C', 'A']] df = unordered.copy() df.sort_index(axis=1, inplace=True) expected = frame @@ -358,7 +358,7 @@ def test_sort_index_inplace(self): df = unordered.copy() df.sort_index(axis=1, ascending=False, inplace=True) - expected = frame.ix[:, ::-1] + expected = frame.iloc[:, ::-1] assert_frame_equal(df, expected) def test_sort_index_different_sortorder(self): diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 6a57f67a6cb3d..8bd6d3ba54371 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -107,7 +107,7 @@ def test_indexing_sliced(self): tm.assert_series_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedSeries) - res = df.ix[:, 'Z'] + res = df.loc[:, 'Z'] exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z') tm.assert_series_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedSeries) @@ -122,7 +122,7 @@ def test_indexing_sliced(self): tm.assert_series_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedSeries) - res = df.ix['c', :] + res = df.loc['c', :] exp = 
tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c') tm.assert_series_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedSeries) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index c6c3b4f43b55a..1a47294dc8af2 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -156,8 +156,8 @@ def test_shift(self): unshifted = shifted.shift(-1) self.assert_index_equal(shifted.index, ps.index) self.assert_index_equal(unshifted.index, ps.index) - tm.assert_numpy_array_equal(unshifted.ix[:, 0].valid().values, - ps.ix[:-1, 0].values) + tm.assert_numpy_array_equal(unshifted.iloc[:, 0].valid().values, + ps.iloc[:-1, 0].values) shifted2 = ps.shift(1, 'B') shifted3 = ps.shift(1, offsets.BDay()) @@ -244,7 +244,7 @@ def test_tshift(self): assert_frame_equal(shifted, self.tsframe.tshift(1)) assert_frame_equal(unshifted, inferred_ts) - no_freq = self.tsframe.ix[[0, 5, 7], :] + no_freq = self.tsframe.iloc[[0, 5, 7], :] self.assertRaises(ValueError, no_freq.tshift) def test_truncate(self): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 1eb3454519ce3..b585462365606 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -646,10 +646,10 @@ def create_cols(name): index=df_float.index, columns=create_cols('date')) # add in some nans - df_float.ix[30:50, 1:3] = np.nan + df_float.loc[30:50, 1:3] = np.nan # ## this is a bug in read_csv right now #### - # df_dt.ix[30:50,1:3] = np.nan + # df_dt.loc[30:50,1:3] = np.nan df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index fb0f52886ec31..40d8039f71576 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -130,7 +130,7 @@ def test_filter_out_all_groups(self): grouper = df['A'].apply(lambda x: x % 2) grouped = df.groupby(grouper) assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) + grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) def test_filter_out_no_groups(self): s = pd.Series([1, 3, 20, 5, 22, 24, 7]) @@ -278,7 +278,7 @@ def test_filter_using_len(self): assert_frame_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) - expected = df.ix[[]] + expected = df.loc[[]] assert_frame_equal(actual, expected) # Series have always worked properly, but we'll test anyway. 
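
One detail behind the `df.loc[[]]` conversions in the filter tests above: indexing with an empty label list is the idiomatic empty selection, giving a zero-row frame that keeps the original columns and dtypes, which is exactly what `filter` returns when no group passes. A sketch:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 1, 2], "B": [10.0, 20.0, 30.0]})

    # no group has more than 4 rows, so everything is filtered away...
    result = df.groupby("A").filter(lambda g: len(g) > 4)

    # ...and df.loc[[]] is the natural expected value
    expected = df.loc[[]]
    assert result.shape == expected.shape == (0, 2)
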
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 873c63ca257c4..952ec3ca01e16 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -135,7 +135,7 @@ def test_first_last_nth(self): # tests for first / last / nth grouped = self.df.groupby('A') first = grouped.first() - expected = self.df.ix[[1, 0], ['B', 'C', 'D']] + expected = self.df.loc[[1, 0], ['B', 'C', 'D']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(first, expected) @@ -144,7 +144,7 @@ def test_first_last_nth(self): assert_frame_equal(nth, expected) last = grouped.last() - expected = self.df.ix[[5, 7], ['B', 'C', 'D']] + expected = self.df.loc[[5, 7], ['B', 'C', 'D']] expected.index = Index(['bar', 'foo'], name='A') assert_frame_equal(last, expected) @@ -152,7 +152,7 @@ def test_first_last_nth(self): assert_frame_equal(nth, expected) nth = grouped.nth(1) - expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy() + expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy() expected.index = Index(['foo', 'bar'], name='A') expected = expected.sort_index() assert_frame_equal(nth, expected) @@ -187,19 +187,19 @@ def test_first_last_nth_dtypes(self): # tests for first / last / nth grouped = df.groupby('A') first = grouped.first() - expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']] + expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(first, expected) last = grouped.last() - expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']] + expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(last, expected) nth = grouped.nth(1) - expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] + expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(nth, expected) @@ -225,7 +225,7 @@ def test_nth(self): assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) assert_frame_equal(g[['B']].nth(0), - df.ix[[0, 2], ['A', 'B']].set_index('A')) + df.loc[[0, 2], ['A', 'B']].set_index('A')) exp = df.set_index('A') assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) @@ -763,7 +763,7 @@ def test_groupby_nonobject_dtype(self): df['value'] = lrange(len(df)) def max_value(group): - return group.ix[group['value'].idxmax()] + return group.loc[group['value'].idxmax()] applied = df.groupby('A').apply(max_value) result = applied.get_dtype_counts().sort_values() @@ -1024,14 +1024,14 @@ def test_groups(self): self.assertIs(groups, grouped.groups) # caching works for k, v in compat.iteritems(grouped.groups): - self.assertTrue((self.df.ix[v]['A'] == k).all()) + self.assertTrue((self.df.loc[v]['A'] == k).all()) grouped = self.df.groupby(['A', 'B']) groups = grouped.groups self.assertIs(groups, grouped.groups) # caching works for k, v in compat.iteritems(grouped.groups): - self.assertTrue((self.df.ix[v]['A'] == k[0]).all()) - self.assertTrue((self.df.ix[v]['B'] == k[1]).all()) + self.assertTrue((self.df.loc[v]['A'] == k[0]).all()) + self.assertTrue((self.df.loc[v]['B'] == k[1]).all()) def test_basic_regression(self): # regression @@ -1471,7 +1471,7 @@ def test_series_describe_single(self): assert_series_equal(result, expected) def test_series_index_name(self): - grouped = 
self.df.ix[:, ['C']].groupby(self.df['A']) + grouped = self.df.loc[:, ['C']].groupby(self.df['A']) result = grouped.agg(lambda x: x.mean()) self.assertEqual(result.index.name, 'A') @@ -1625,10 +1625,10 @@ def test_multi_iter_frame(self): # things get sorted! iterated = list(grouped) idx = df.index - expected = [('a', '1', df.ix[idx[[4]]]), - ('a', '2', df.ix[idx[[3, 5]]]), - ('b', '1', df.ix[idx[[0, 2]]]), - ('b', '2', df.ix[idx[[1]]])] + expected = [('a', '1', df.loc[idx[[4]]]), + ('a', '2', df.loc[idx[[3, 5]]]), + ('b', '1', df.loc[idx[[0, 2]]]), + ('b', '2', df.loc[idx[[1]]])] for i, ((one, two), three) in enumerate(iterated): e1, e2, e3 = expected[i] self.assertEqual(e1, one) @@ -1669,8 +1669,11 @@ def test_multi_func(self): grouped = self.df.groupby([col1.get, col2.get]) agged = grouped.mean() expected = self.df.groupby(['A', 'B']).mean() - assert_frame_equal(agged.ix[:, ['C', 'D']], expected.ix[:, ['C', 'D']], - check_names=False) # TODO groupby get drops names + + # TODO groupby get drops names + assert_frame_equal(agged.loc[:, ['C', 'D']], + expected.loc[:, ['C', 'D']], + check_names=False) # some "groups" with no data df = DataFrame({'v1': np.random.randn(6), @@ -1723,7 +1726,7 @@ def _check_op(op): expected = defaultdict(dict) for n1, gp1 in data.groupby('A'): for n2, gp2 in gp1.groupby('B'): - expected[n1][n2] = op(gp2.ix[:, ['C', 'D']]) + expected[n1][n2] = op(gp2.loc[:, ['C', 'D']]) expected = dict((k, DataFrame(v)) for k, v in compat.iteritems(expected)) expected = Panel.fromDict(expected).swapaxes(0, 1) @@ -1931,22 +1934,22 @@ def test_as_index_series_return_frame(self): grouped2 = self.df.groupby(['A', 'B'], as_index=False) result = grouped['C'].agg(np.sum) - expected = grouped.agg(np.sum).ix[:, ['A', 'C']] + expected = grouped.agg(np.sum).loc[:, ['A', 'C']] tm.assertIsInstance(result, DataFrame) assert_frame_equal(result, expected) result2 = grouped2['C'].agg(np.sum) - expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']] + expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] tm.assertIsInstance(result2, DataFrame) assert_frame_equal(result2, expected2) result = grouped['C'].sum() - expected = grouped.sum().ix[:, ['A', 'C']] + expected = grouped.sum().loc[:, ['A', 'C']] tm.assertIsInstance(result, DataFrame) assert_frame_equal(result, expected) result2 = grouped2['C'].sum() - expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']] + expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] tm.assertIsInstance(result2, DataFrame) assert_frame_equal(result2, expected2) @@ -1981,7 +1984,7 @@ def test_groupby_as_index_series_scalar(self): # GH #421 result = grouped['C'].agg(len) - expected = grouped.agg(len).ix[:, ['A', 'B', 'C']] + expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) def test_groupby_as_index_corner(self): @@ -2106,14 +2109,14 @@ def test_omit_nuisance(self): grouped = self.df.groupby('A') result = grouped.mean() - expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean() + expected = self.df.loc[:, ['A', 'C', 'D']].groupby('A').mean() assert_frame_equal(result, expected) agged = grouped.agg(np.mean) exp = grouped.mean() assert_frame_equal(agged, exp) - df = self.df.ix[:, ['A', 'C', 'D']] + df = self.df.loc[:, ['A', 'C', 'D']] df['E'] = datetime.now() grouped = df.groupby('A') result = grouped.agg(np.sum) @@ -2568,7 +2571,7 @@ def f(group): result = grouped.apply(f) for key, group in grouped: - assert_frame_equal(result.ix[key], f(group)) + assert_frame_equal(result.loc[key], f(group)) def test_mutate_groups(self): @@ -3025,7 
+3028,7 @@ def f(group): with option_context('mode.chained_assignment', None): for key, group in grouped: res = f(group) - assert_frame_equal(res, result.ix[key]) + assert_frame_equal(res, result.loc[key]) def test_groupby_wrong_multi_labels(self): from pandas import read_csv @@ -3623,7 +3626,7 @@ def func(ser): return ser.sum() result = grouped.aggregate(func) - exp_grouped = self.three_group.ix[:, self.three_group.columns != 'C'] + exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] expected = exp_grouped.groupby(['A', 'B']).aggregate(func) assert_frame_equal(result, expected) @@ -3662,7 +3665,7 @@ def test_getitem_list_of_columns(self): result2 = df.groupby('A')['C', 'D'].mean() result3 = df.groupby('A')[df.columns[2:4]].mean() - expected = df.ix[:, ['A', 'C', 'D']].groupby('A').mean() + expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() assert_frame_equal(result, expected) assert_frame_equal(result2, expected) @@ -3678,7 +3681,7 @@ def test_getitem_numeric_column_names(self): result2 = df.groupby(0)[2, 4].mean() result3 = df.groupby(0)[[2, 4]].mean() - expected = df.ix[:, [0, 2, 4]].groupby(0).mean() + expected = df.loc[:, [0, 2, 4]].groupby(0).mean() assert_frame_equal(result, expected) assert_frame_equal(result2, expected) @@ -5091,8 +5094,8 @@ def test_regression_whitelist_methods(self): names=['first', 'second']) raw_frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) - raw_frame.ix[1, [1, 2]] = np.nan - raw_frame.ix[7, [0, 1]] = np.nan + raw_frame.iloc[1, [1, 2]] = np.nan + raw_frame.iloc[7, [0, 1]] = np.nan for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2), lrange(2), diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 0017271fe6c97..2cd73ec8d254a 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -954,8 +954,8 @@ def test_period_index_indexer(self): # GH4125 idx = pd.period_range('2002-01', '2003-12', freq='M') df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) - self.assert_frame_equal(df, df.ix[idx]) - self.assert_frame_equal(df, df.ix[list(idx)]) + self.assert_frame_equal(df, df.loc[idx]) + self.assert_frame_equal(df, df.loc[list(idx)]) self.assert_frame_equal(df, df.loc[list(idx)]) self.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) self.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 2861a1f56b24b..343078aeafaf0 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1820,7 +1820,7 @@ def test_insert(self): pd.MultiIndex.from_tuples(idx[:-2])) left.loc[('test', 17)] = 11 - left.ix[('test', 18)] = 12 + left.loc[('test', 18)] = 12 right = pd.Series(np.linspace(0, 12, 13), pd.MultiIndex.from_tuples(idx)) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 3465d776bfa85..ab225f72934ce 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -21,51 +21,51 @@ def test_frame_loc_ix_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.ix[lambda x: x.A > 2] - tm.assert_frame_equal(res, df.ix[df.A > 2]) + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) res = df.loc[lambda x: x.A > 2, ] tm.assert_frame_equal(res, df.loc[df.A > 2, ]) - res = df.ix[lambda x: x.A > 2, ] - 
tm.assert_frame_equal(res, df.ix[df.A > 2, ]) + res = df.loc[lambda x: x.A > 2, ] + tm.assert_frame_equal(res, df.loc[df.A > 2, ]) res = df.loc[lambda x: x.B == 'b', :] tm.assert_frame_equal(res, df.loc[df.B == 'b', :]) - res = df.ix[lambda x: x.B == 'b', :] - tm.assert_frame_equal(res, df.ix[df.B == 'b', :]) + res = df.loc[lambda x: x.B == 'b', :] + tm.assert_frame_equal(res, df.loc[df.B == 'b', :]) res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B'] tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - res = df.ix[lambda x: x.A > 2, lambda x: x.columns == 'B'] - tm.assert_frame_equal(res, df.ix[df.A > 2, [False, True, False]]) + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B'] + tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) res = df.loc[lambda x: x.A > 2, lambda x: 'B'] tm.assert_series_equal(res, df.loc[df.A > 2, 'B']) - res = df.ix[lambda x: x.A > 2, lambda x: 'B'] - tm.assert_series_equal(res, df.ix[df.A > 2, 'B']) + res = df.loc[lambda x: x.A > 2, lambda x: 'B'] + tm.assert_series_equal(res, df.loc[df.A > 2, 'B']) res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']] tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) - res = df.ix[lambda x: x.A > 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.ix[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']] + tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']] tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']]) - res = df.ix[lambda x: x.A == 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.ix[df.A == 2, ['A', 'B']]) + res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']] + tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']]) # scalar res = df.loc[lambda x: 1, lambda x: 'A'] self.assertEqual(res, df.loc[1, 'A']) - res = df.ix[lambda x: 1, lambda x: 'A'] - self.assertEqual(res, df.ix[1, 'A']) + res = df.loc[lambda x: 1, lambda x: 'A'] + self.assertEqual(res, df.loc[1, 'A']) def test_frame_loc_ix_callable_mixture(self): # GH 11485 @@ -75,20 +75,20 @@ def test_frame_loc_ix_callable_mixture(self): res = df.loc[lambda x: x.A > 2, ['A', 'B']] tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) - res = df.ix[lambda x: x.A > 2, ['A', 'B']] - tm.assert_frame_equal(res, df.ix[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, ['A', 'B']] + tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) res = df.loc[[2, 3], lambda x: ['A', 'B']] tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']]) - res = df.ix[[2, 3], lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.ix[[2, 3], ['A', 'B']]) + res = df.loc[[2, 3], lambda x: ['A', 'B']] + tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']]) res = df.loc[3, lambda x: ['A', 'B']] tm.assert_series_equal(res, df.loc[3, ['A', 'B']]) - res = df.ix[3, lambda x: ['A', 'B']] - tm.assert_series_equal(res, df.ix[3, ['A', 'B']]) + res = df.loc[3, lambda x: ['A', 'B']] + tm.assert_series_equal(res, df.loc[3, ['A', 'B']]) def test_frame_loc_callable(self): # GH 11485 diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9ef2802cb950f..b8a24cb2dcb03 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -190,22 +190,22 @@ def test_ix_categorical_index(self): cdf.index = pd.CategoricalIndex(df.index) cdf.columns = pd.CategoricalIndex(df.columns) - expect = pd.Series(df.ix['A', :], index=cdf.columns, name='A') - 
assert_series_equal(cdf.ix['A', :], expect) + expect = pd.Series(df.loc['A', :], index=cdf.columns, name='A') + assert_series_equal(cdf.loc['A', :], expect) - expect = pd.Series(df.ix[:, 'X'], index=cdf.index, name='X') - assert_series_equal(cdf.ix[:, 'X'], expect) + expect = pd.Series(df.loc[:, 'X'], index=cdf.index, name='X') + assert_series_equal(cdf.loc[:, 'X'], expect) exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) - expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, + expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, index=exp_index) - assert_frame_equal(cdf.ix[['A', 'B'], :], expect) + assert_frame_equal(cdf.loc[['A', 'B'], :], expect) exp_columns = pd.CategoricalIndex(list('XY'), categories=['X', 'Y', 'Z']) - expect = pd.DataFrame(df.ix[:, ['X', 'Y']], index=cdf.index, + expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, columns=exp_columns) - assert_frame_equal(cdf.ix[:, ['X', 'Y']], expect) + assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) # non-unique df = pd.DataFrame(np.random.randn(3, 3), @@ -215,22 +215,22 @@ def test_ix_categorical_index(self): cdf.columns = pd.CategoricalIndex(df.columns) exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B']) - expect = pd.DataFrame(df.ix['A', :], columns=cdf.columns, + expect = pd.DataFrame(df.loc['A', :], columns=cdf.columns, index=exp_index) - assert_frame_equal(cdf.ix['A', :], expect) + assert_frame_equal(cdf.loc['A', :], expect) exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y']) - expect = pd.DataFrame(df.ix[:, 'X'], index=cdf.index, + expect = pd.DataFrame(df.loc[:, 'X'], index=cdf.index, columns=exp_columns) - assert_frame_equal(cdf.ix[:, 'X'], expect) + assert_frame_equal(cdf.loc[:, 'X'], expect) - expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, + expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, index=pd.CategoricalIndex(list('AAB'))) - assert_frame_equal(cdf.ix[['A', 'B'], :], expect) + assert_frame_equal(cdf.loc[['A', 'B'], :], expect) - expect = pd.DataFrame(df.ix[:, ['X', 'Y']], index=cdf.index, + expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, columns=pd.CategoricalIndex(list('XXY'))) - assert_frame_equal(cdf.ix[:, ['X', 'Y']], expect) + assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) def test_read_only_source(self): # GH 10043 diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 920aefa24b576..8f0fa2d56113b 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from warnings import catch_warnings import numpy as np from pandas import Series, DataFrame, Index, Float64Index from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -77,7 +78,8 @@ def test_scalar_non_numeric(self): (lambda x: x, True)]: def f(): - idxr(s)[3.0] + with catch_warnings(record=True): + idxr(s)[3.0] # gettitem on a DataFrame is a KeyError as it is indexing # via labels on the columns @@ -131,7 +133,8 @@ def f(): for idxr in [lambda x: x.ix, lambda x: x]: s2 = s.copy() - idxr(s2)[3.0] = 0 + with catch_warnings(record=True): + idxr(s2)[3.0] = 0 self.assertTrue(s2.index.is_object()) # fallsback to position selection, series only @@ -151,7 +154,8 @@ def test_scalar_with_mixed(self): lambda x: x.iloc]: def f(): - idxr(s2)[1.0] + with catch_warnings(record=True): + idxr(s2)[1.0] self.assertRaises(TypeError, f) @@ -167,7 +171,8 @@ def f(): lambda x: x]: def f(): - 
idxr(s3)[1.0] + with catch_warnings(record=True): + idxr(s3)[1.0] self.assertRaises(TypeError, f) @@ -199,7 +204,8 @@ def test_scalar_integer(self): (lambda x: x.loc, False), (lambda x: x, True)]: - result = idxr(s)[3.0] + with catch_warnings(record=True): + result = idxr(s)[3.0] self.check(result, s, 3, getitem) # coerce to equal int @@ -220,13 +226,14 @@ def test_scalar_integer(self): index=range(len(s)), name=3) s2 = s.copy() - idxr(s2)[3.0] = 100 + with catch_warnings(record=True): + idxr(s2)[3.0] = 100 - result = idxr(s2)[3.0] - compare(result, expected) + result = idxr(s2)[3.0] + compare(result, expected) - result = idxr(s2)[3] - compare(result, expected) + result = idxr(s2)[3] + compare(result, expected) # contains # coerce to equal int @@ -247,19 +254,23 @@ def test_scalar_float(self): (lambda x: x, True)]: # getting - result = idxr(s)[indexer] + with catch_warnings(record=True): + result = idxr(s)[indexer] self.check(result, s, 3, getitem) # setting s2 = s.copy() def f(): - idxr(s2)[indexer] = expected - result = idxr(s2)[indexer] + with catch_warnings(record=True): + idxr(s2)[indexer] = expected + with catch_warnings(record=True): + result = idxr(s2)[indexer] self.check(result, s, 3, getitem) # random integer is a KeyError - self.assertRaises(KeyError, lambda: idxr(s)[3.5]) + with catch_warnings(record=True): + self.assertRaises(KeyError, lambda: idxr(s)[3.5]) # contains self.assertTrue(3.0 in s) @@ -308,7 +319,8 @@ def f(): lambda x: x]: def f(): - idxr(s)[l] + with catch_warnings(record=True): + idxr(s)[l] self.assertRaises(TypeError, f) # setitem @@ -325,7 +337,8 @@ def f(): lambda x: x.iloc, lambda x: x]: def f(): - idxr(s)[l] = 0 + with catch_warnings(record=True): + idxr(s)[l] = 0 self.assertRaises(TypeError, f) def test_slice_integer(self): @@ -349,7 +362,8 @@ def test_slice_integer(self): for idxr in [lambda x: x.loc, lambda x: x.ix]: - result = idxr(s)[l] + with catch_warnings(record=True): + result = idxr(s)[l] # these are all label indexing # except getitem which is positional @@ -372,7 +386,8 @@ def f(): for idxr in [lambda x: x.loc, lambda x: x.ix]: - result = idxr(s)[l] + with catch_warnings(record=True): + result = idxr(s)[l] # these are all label indexing # except getitem which is positional @@ -397,7 +412,8 @@ def f(): for idxr in [lambda x: x.loc, lambda x: x.ix]: - result = idxr(s)[l] + with catch_warnings(record=True): + result = idxr(s)[l] if oob: res = slice(0, 0) else: @@ -419,8 +435,9 @@ def f(): for idxr in [lambda x: x.loc, lambda x: x.ix]: sc = s.copy() - idxr(sc)[l] = 0 - result = idxr(sc)[l].values.ravel() + with catch_warnings(record=True): + idxr(sc)[l] = 0 + result = idxr(sc)[l].values.ravel() self.assertTrue((result == 0).all()) # positional indexing @@ -467,7 +484,8 @@ def test_slice_integer_frame_getitem(self): slice(0, 1.0), slice(0.0, 1.0)]: - result = idxr(s)[l] + with catch_warnings(record=True): + result = idxr(s)[l] indexer = slice(0, 2) self.check(result, s, indexer, False) @@ -495,7 +513,8 @@ def f(): (slice(0, 0.5), slice(0, 1)), (slice(0.5, 1.5), slice(1, 2))]: - result = idxr(s)[l] + with catch_warnings(record=True): + result = idxr(s)[l] self.check(result, s, res, False) # positional indexing @@ -510,8 +529,9 @@ def f(): slice(3.0, 4.0)]: sc = s.copy() - idxr(sc)[l] = 0 - result = idxr(sc)[l].values.ravel() + with catch_warnings(record=True): + idxr(sc)[l] = 0 + result = idxr(sc)[l].values.ravel() self.assertTrue((result == 0).all()) # positional indexing @@ -537,15 +557,17 @@ def test_slice_float(self): lambda x: x]: # getitem - 
result = idxr(s)[l] + with catch_warnings(record=True): + result = idxr(s)[l] if isinstance(s, Series): self.assert_series_equal(result, expected) else: self.assert_frame_equal(result, expected) # setitem s2 = s.copy() - idxr(s2)[l] = 0 - result = idxr(s2)[l].values.ravel() + with catch_warnings(record=True): + idxr(s2)[l] = 0 + result = idxr(s2)[l].values.ravel() self.assertTrue((result == 0).all()) def test_floating_index_doc_example(self): @@ -553,7 +575,7 @@ def test_floating_index_doc_example(self): index = Index([1.5, 2, 3, 4.5, 5]) s = Series(range(5), index=index) self.assertEqual(s[3], 2) - self.assertEqual(s.ix[3], 2) + self.assertEqual(s.loc[3], 2) self.assertEqual(s.loc[3], 2) self.assertEqual(s.iloc[3], 3) @@ -565,7 +587,7 @@ def test_floating_misc(self): # label based slicing result1 = s[1.0:3.0] - result2 = s.ix[1.0:3.0] + result2 = s.loc[1.0:3.0] result3 = s.loc[1.0:3.0] assert_series_equal(result1, result2) assert_series_equal(result1, result3) @@ -573,13 +595,13 @@ def test_floating_misc(self): # exact indexing when found result1 = s[5.0] result2 = s.loc[5.0] - result3 = s.ix[5.0] + result3 = s.loc[5.0] self.assertEqual(result1, result2) self.assertEqual(result1, result3) result1 = s[5] result2 = s.loc[5] - result3 = s.ix[5] + result3 = s.loc[5] self.assertEqual(result1, result2) self.assertEqual(result1, result3) @@ -589,7 +611,7 @@ def test_floating_misc(self): # scalar integers self.assertRaises(KeyError, lambda: s.loc[4]) - self.assertRaises(KeyError, lambda: s.ix[4]) + self.assertRaises(KeyError, lambda: s.loc[4]) self.assertRaises(KeyError, lambda: s[4]) # fancy floats/integers create the correct entry (as nan) @@ -598,13 +620,13 @@ def test_floating_misc(self): for fancy_idx in [[5.0, 0.0], np.array([5.0, 0.0])]: # float assert_series_equal(s[fancy_idx], expected) assert_series_equal(s.loc[fancy_idx], expected) - assert_series_equal(s.ix[fancy_idx], expected) + assert_series_equal(s.loc[fancy_idx], expected) expected = Series([2, 0], index=Index([5, 0], dtype='int64')) for fancy_idx in [[5, 0], np.array([5, 0])]: # int assert_series_equal(s[fancy_idx], expected) assert_series_equal(s.loc[fancy_idx], expected) - assert_series_equal(s.ix[fancy_idx], expected) + assert_series_equal(s.loc[fancy_idx], expected) # all should return the same as we are slicing 'the same' result1 = s.loc[2:5] @@ -624,17 +646,17 @@ def test_floating_misc(self): assert_series_equal(result1, result3) assert_series_equal(result1, result4) - result1 = s.ix[2:5] - result2 = s.ix[2.0:5.0] - result3 = s.ix[2.0:5] - result4 = s.ix[2.1:5] + result1 = s.loc[2:5] + result2 = s.loc[2.0:5.0] + result3 = s.loc[2.0:5] + result4 = s.loc[2.1:5] assert_series_equal(result1, result2) assert_series_equal(result1, result3) assert_series_equal(result1, result4) # combined test result1 = s.loc[2:5] - result2 = s.ix[2:5] + result2 = s.loc[2:5] result3 = s[2:5] assert_series_equal(result1, result2) @@ -643,7 +665,7 @@ def test_floating_misc(self): # list selection result1 = s[[0.0, 5, 10]] result2 = s.loc[[0.0, 5, 10]] - result3 = s.ix[[0.0, 5, 10]] + result3 = s.loc[[0.0, 5, 10]] result4 = s.iloc[[0, 2, 4]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) @@ -651,14 +673,14 @@ def test_floating_misc(self): result1 = s[[1.6, 5, 10]] result2 = s.loc[[1.6, 5, 10]] - result3 = s.ix[[1.6, 5, 10]] + result3 = s.loc[[1.6, 5, 10]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) assert_series_equal(result1, Series( [np.nan, 2, 4], index=[1.6, 5, 10])) result1 = s[[0, 1, 
2]] - result2 = s.ix[[0, 1, 2]] + result2 = s.loc[[0, 1, 2]] result3 = s.loc[[0, 1, 2]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) @@ -666,12 +688,12 @@ def test_floating_misc(self): [0.0, np.nan, np.nan], index=[0, 1, 2])) result1 = s.loc[[2.5, 5]] - result2 = s.ix[[2.5, 5]] + result2 = s.loc[[2.5, 5]] assert_series_equal(result1, result2) assert_series_equal(result1, Series([1, 2], index=[2.5, 5.0])) result1 = s[[2.5]] - result2 = s.ix[[2.5]] + result2 = s.loc[[2.5]] result3 = s.loc[[2.5]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a50027f1d0343..e44471efb9871 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -4,6 +4,7 @@ import nose import itertools import warnings +from warnings import catch_warnings from datetime import datetime from pandas.types.common import (is_integer_dtype, @@ -60,7 +61,8 @@ def _get_value(f, i, values=False): # for a in reversed(i): # v = v.__getitem__(a) # return v - return f.ix[i] + with catch_warnings(record=True): + return f.ix[i] def _get_result(obj, method, key, axis): @@ -105,9 +107,6 @@ class TestIndexing(tm.TestCase): def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) - self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2)) self.frame_ints = DataFrame(np.random.randn(4, 4), index=lrange(0, 8, 2), @@ -288,6 +287,14 @@ def _print(result, error=None): k2 = key2 _eq(t, o, a, obj, key1, k2) + def test_ix_deprecation(self): + # GH 15114 + + df = DataFrame({'A': [1, 2, 3]}) + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + df.ix[1, 'A'] + def test_indexer_caching(self): # GH5727 # make sure that indexers are in the _internal_names_set @@ -748,13 +755,17 @@ def test_ix_loc_setitem_consistency(self): # chained indexing assignment df = DataFrame({'a': [0, 1, 2]}) expected = df.copy() - expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] + with catch_warnings(record=True): + expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] + with catch_warnings(record=True): + df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] tm.assert_frame_equal(df, expected) df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype('float64') + 0.5 + with catch_warnings(record=True): + df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype( + 'float64') + 0.5 expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) tm.assert_frame_equal(df, expected) @@ -777,7 +788,8 @@ def test_ix_loc_setitem_consistency(self): tm.assert_frame_equal(df2, expected) df2 = df.copy() - df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') + with catch_warnings(record=True): + df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') tm.assert_frame_equal(df2, expected) def test_ix_loc_consistency(self): @@ -801,7 +813,8 @@ def compare(result, expected): tm.makeDateIndex, tm.makePeriodIndex, tm.makeTimedeltaIndex]: df.index = index(len(df.index)) - df.ix[key] + with catch_warnings(record=True): + df.ix[key] self.assertRaises(TypeError, lambda: df.loc[key]) @@ -820,7 +833,8 @@ def compare(result, expected): # if the expected raises, then compare the exceptions try: - expected = df.ix[key] + with catch_warnings(record=True): + expected = df.ix[key] except KeyError: self.assertRaises(KeyError, 
lambda: df.loc[key]) continue @@ -832,7 +846,8 @@ def compare(result, expected): df1 = df.copy() df2 = df.copy() - df1.ix[key] = 10 + with catch_warnings(record=True): + df1.ix[key] = 10 df2.loc[key] = 10 compare(df2, df1) @@ -840,7 +855,8 @@ def compare(result, expected): s = Series([1, 2, 3, 4], index=list('abde')) result1 = s['a':'c'] - result2 = s.ix['a':'c'] + with catch_warnings(record=True): + result2 = s.ix['a':'c'] result3 = s.loc['a':'c'] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) @@ -848,11 +864,13 @@ def compare(result, expected): # now work rather than raising KeyError s = Series(range(5), [-2, -1, 1, 2, 3]) - result1 = s.ix[-10:3] + with catch_warnings(record=True): + result1 = s.ix[-10:3] result2 = s.loc[-10:3] tm.assert_series_equal(result1, result2) - result1 = s.ix[0:3] + with catch_warnings(record=True): + result1 = s.ix[0:3] result2 = s.loc[0:3] tm.assert_series_equal(result1, result2) @@ -905,9 +923,12 @@ def check(target, indexers, value, compare_fn, expected=None): df['F'] = 99 row_selection = df['A'] % 2 == 0 col_selection = ['B', 'C'] - df.ix[row_selection, col_selection] = df['F'] + with catch_warnings(record=True): + df.ix[row_selection, col_selection] = df['F'] output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) - tm.assert_frame_equal(df.ix[row_selection, col_selection], output) + with catch_warnings(record=True): + tm.assert_frame_equal(df.ix[row_selection, col_selection], + output) check(target=df, indexers=(row_selection, col_selection), value=df['F'], @@ -1271,7 +1292,8 @@ def test_loc_getitem_int_slice(self): [6, 7, 8], ['a', 'b'])]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] - expected = df.ix[6:8, :] + with catch_warnings(record=True): + expected = df.ix[6:8, :] tm.assert_frame_equal(result, expected) index = MultiIndex.from_tuples([t @@ -1279,12 +1301,14 @@ def test_loc_getitem_int_slice(self): [10, 20, 30], ['a', 'b'])]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] - expected = df.ix[20:30, :] + with catch_warnings(record=True): + expected = df.ix[20:30, :] tm.assert_frame_equal(result, expected) # doc examples result = df.loc[10, :] - expected = df.ix[10, :] + with catch_warnings(record=True): + expected = df.ix[10, :] tm.assert_frame_equal(result, expected) result = df.loc[:, 10] @@ -1521,7 +1545,8 @@ def test_loc_setitem_frame(self): df.loc[:, 'B':'D'] = 0 expected = df.loc[:, 'B':'D'] - result = df.ix[:, 1:] + with catch_warnings(record=True): + result = df.ix[:, 1:] tm.assert_frame_equal(result, expected) # GH 6254 @@ -1603,45 +1628,54 @@ def test_iloc_getitem_frame(self): columns=lrange(0, 8, 2)) result = df.iloc[2] - exp = df.ix[4] + with catch_warnings(record=True): + exp = df.ix[4] tm.assert_series_equal(result, exp) result = df.iloc[2, 2] - exp = df.ix[4, 4] + with catch_warnings(record=True): + exp = df.ix[4, 4] self.assertEqual(result, exp) # slice result = df.iloc[4:8] - expected = df.ix[8:14] + with catch_warnings(record=True): + expected = df.ix[8:14] tm.assert_frame_equal(result, expected) result = df.iloc[:, 2:3] - expected = df.ix[:, 4:5] + with catch_warnings(record=True): + expected = df.ix[:, 4:5] tm.assert_frame_equal(result, expected) # list of integers result = df.iloc[[0, 1, 3]] - expected = df.ix[[0, 2, 6]] + with catch_warnings(record=True): + expected = df.ix[[0, 2, 6]] tm.assert_frame_equal(result, expected) result = df.iloc[[0, 1, 3], [0, 1]] - expected = df.ix[[0, 2, 6], [0, 2]] + with 
catch_warnings(record=True): + expected = df.ix[[0, 2, 6], [0, 2]] tm.assert_frame_equal(result, expected) # neg indicies result = df.iloc[[-1, 1, 3], [-1, 1]] - expected = df.ix[[18, 2, 6], [6, 2]] + with catch_warnings(record=True): + expected = df.ix[[18, 2, 6], [6, 2]] tm.assert_frame_equal(result, expected) # dups indicies result = df.iloc[[-1, -1, 1, 3], [-1, 1]] - expected = df.ix[[18, 18, 2, 6], [6, 2]] + with catch_warnings(record=True): + expected = df.ix[[18, 18, 2, 6], [6, 2]] tm.assert_frame_equal(result, expected) # with index-like s = Series(index=lrange(1, 5)) result = df.iloc[s.index] - expected = df.ix[[2, 4, 6, 8]] + with catch_warnings(record=True): + expected = df.ix[[2, 4, 6, 8]] tm.assert_frame_equal(result, expected) def test_iloc_getitem_labelled_frame(self): @@ -1650,16 +1684,16 @@ def test_iloc_getitem_labelled_frame(self): index=list('abcdefghij'), columns=list('ABCD')) result = df.iloc[1, 1] - exp = df.ix['b', 'B'] + exp = df.loc['b', 'B'] self.assertEqual(result, exp) result = df.iloc[:, 2:3] - expected = df.ix[:, ['C']] + expected = df.loc[:, ['C']] tm.assert_frame_equal(result, expected) # negative indexing result = df.iloc[-1, -1] - exp = df.ix['j', 'D'] + exp = df.loc['j', 'D'] self.assertEqual(result, exp) # out-of-bounds exception @@ -1853,15 +1887,23 @@ def test_setitem_ndarray_1d(self): # invalid def f(): - df.ix[2:5, 'bar'] = np.array([2.33j, 1.23 + 0.1j, 2.2]) + with catch_warnings(record=True): + df.ix[2:5, 'bar'] = np.array([2.33j, 1.23 + 0.1j, 2.2]) + + self.assertRaises(ValueError, f) + + def f(): + df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j, + 2.2, 1.0]) self.assertRaises(ValueError, f) # valid - df.ix[2:5, 'bar'] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) + df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j, + 2.2, 1.0]) - result = df.ix[2:5, 'bar'] - expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[2, 3, 4, 5], + result = df.loc[df.index[2:6], 'bar'] + expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name='bar') tm.assert_series_equal(result, expected) @@ -1935,24 +1977,28 @@ def test_iloc_getitem_multiindex(self): # the first row rs = mi_int.iloc[0] - xp = mi_int.ix[4].ix[8] + with catch_warnings(record=True): + xp = mi_int.ix[4].ix[8] tm.assert_series_equal(rs, xp, check_names=False) self.assertEqual(rs.name, (4, 8)) self.assertEqual(xp.name, 8) # 2nd (last) columns rs = mi_int.iloc[:, 2] - xp = mi_int.ix[:, 2] + with catch_warnings(record=True): + xp = mi_int.ix[:, 2] tm.assert_series_equal(rs, xp) # corner column rs = mi_int.iloc[2, 2] - xp = mi_int.ix[:, 2].ix[2] + with catch_warnings(record=True): + xp = mi_int.ix[:, 2].ix[2] self.assertEqual(rs, xp) # this is basically regular indexing rs = mi_labels.iloc[2, 2] - xp = mi_labels.ix['j'].ix[:, 'j'].ix[0, 0] + with catch_warnings(record=True): + xp = mi_labels.ix['j'].ix[:, 'j'].ix[0, 0] self.assertEqual(rs, xp) def test_loc_multiindex(self): @@ -1967,26 +2013,31 @@ def test_loc_multiindex(self): # the first row rs = mi_labels.loc['i'] - xp = mi_labels.ix['i'] + with catch_warnings(record=True): + xp = mi_labels.ix['i'] tm.assert_frame_equal(rs, xp) # 2nd (last) columns rs = mi_labels.loc[:, 'j'] - xp = mi_labels.ix[:, 'j'] + with catch_warnings(record=True): + xp = mi_labels.ix[:, 'j'] tm.assert_frame_equal(rs, xp) # corner column rs = mi_labels.loc['j'].loc[:, 'j'] - xp = mi_labels.ix['j'].ix[:, 'j'] + with catch_warnings(record=True): + xp = mi_labels.ix['j'].ix[:, 'j'] tm.assert_frame_equal(rs, xp) # with a tuple rs = 
mi_labels.loc[('i', 'X')] - xp = mi_labels.ix[('i', 'X')] + with catch_warnings(record=True): + xp = mi_labels.ix[('i', 'X')] tm.assert_frame_equal(rs, xp) rs = mi_int.loc[4] - xp = mi_int.ix[4] + with catch_warnings(record=True): + xp = mi_int.ix[4] tm.assert_frame_equal(rs, xp) def test_loc_multiindex_indexer_none(self): @@ -2069,7 +2120,7 @@ def test_series_getitem_multiindex(self): expected = Series([1], index=[0]) tm.assert_series_equal(result, expected) - result = s.ix[:, 1] + result = s.loc[:, 1] expected = Series([2, 3], index=[1, 2]) tm.assert_series_equal(result, expected) @@ -2114,11 +2165,12 @@ def test_ix_general(self): # emits a PerformanceWarning, ok with self.assert_produces_warning(PerformanceWarning): - tm.assert_frame_equal(df.ix[key], df.iloc[2:]) + tm.assert_frame_equal(df.loc[key], df.iloc[2:]) # this is ok df.sort_index(inplace=True) - res = df.ix[key] + res = df.loc[key] + # col has float dtype, result should be Float64Index index = MultiIndex.from_arrays([[4.] * 3, [2012] * 3], names=['col', 'year']) @@ -2129,7 +2181,7 @@ def test_ix_weird_slicing(self): # http://stackoverflow.com/q/17056560/1240268 df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], 'two': [1, 2, 3, 4, 5]}) - df.ix[df['one'] > 1, 'two'] = -df['two'] + df.loc[df['one'] > 1, 'two'] = -df['two'] expected = DataFrame({'one': {0: 1.0, 1: 2.0, @@ -2837,7 +2889,7 @@ def test_setitem_dtype_upcast(self): df['c'] = np.nan self.assertEqual(df['c'].dtype, np.float64) - df.ix[0, 'c'] = 'foo' + df.loc[0, 'c'] = 'foo' expected = DataFrame([{"a": 1, "c": 'foo'}, {"a": 3, "b": 2, "c": np.nan}]) tm.assert_frame_equal(df, expected) @@ -2916,10 +2968,10 @@ def test_dups_fancy_indexing(self): {'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) - result = df.ix[rows] + result = df.loc[rows] tm.assert_frame_equal(result, expected) - result = df.ix[Index(rows)] + result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] @@ -2928,7 +2980,7 @@ def test_dups_fancy_indexing(self): 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan]}, index=rows) - result = df.ix[rows] + result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer @@ -2938,7 +2990,7 @@ def test_dups_fancy_indexing(self): 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]}, index=rows) - result = df.ix[rows] + result = df.loc[rows] tm.assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when values are @@ -3020,7 +3072,8 @@ def test_multitype_list_index_access(self): # GH 10610 df = pd.DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) - with self.assertRaises(IndexError): + + with self.assertRaises(KeyError): df[[22, 26, -8]] self.assertEqual(df[21].shape[0], df.shape[0]) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index f07aadba175f2..6486c8aa21c1b 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -273,7 +273,7 @@ def test_irreg_hf(self): idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) df = DataFrame(np.random.randn(len(idx), 2), idx) - irreg = df.ix[[0, 1, 3, 4]] + irreg = df.iloc[[0, 1, 3, 4]] ax = irreg.plot() diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index f4c64c3d7a3f8..fba554b03f191 100644 --- a/pandas/tests/plotting/test_frame.py +++ 
b/pandas/tests/plotting/test_frame.py @@ -699,7 +699,7 @@ def test_bar_colors(self): self._check_colors(ax.patches[::5], facecolors=rgba_colors) tm.close() - ax = df.ix[:, [0]].plot.bar(color='DodgerBlue') + ax = df.loc[:, [0]].plot.bar(color='DodgerBlue') self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) tm.close() @@ -1594,7 +1594,7 @@ def test_line_colors(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - ax = df.ix[:, [0]].plot(color='DodgerBlue') + ax = df.loc[:, [0]].plot(color='DodgerBlue') self._check_colors(ax.lines, linecolors=['DodgerBlue']) ax = df.plot(color='red') @@ -1681,7 +1681,7 @@ def test_line_colors_and_styles_subplots(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - axes = df.ix[:, [0]].plot(color='DodgerBlue', subplots=True) + axes = df.loc[:, [0]].plot(color='DodgerBlue', subplots=True) self._check_colors(axes[0].lines, linecolors=['DodgerBlue']) # single character style @@ -1784,7 +1784,7 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=rgba_colors) tm.close() - ax = df.ix[:, [0]].plot.hist(color='DodgerBlue') + ax = df.loc[:, [0]].plot.hist(color='DodgerBlue') self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) ax = df.plot(kind='hist', color='green') @@ -1856,8 +1856,8 @@ def test_kde_colors_and_styles_subplots(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - axes = df.ix[:, [0]].plot(kind='kde', color='DodgerBlue', - subplots=True) + axes = df.loc[:, [0]].plot(kind='kde', color='DodgerBlue', + subplots=True) self._check_colors(axes[0].lines, linecolors=['DodgerBlue']) # single character style diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index e66e34156b6e9..07e1be609670f 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1033,7 +1033,7 @@ def test_rank(self): rng = date_range('1/1/1990', periods=5) iseries = Series(np.arange(5), rng) + 1 - iseries.ix[4] = np.nan + iseries.iloc[4] = np.nan exp = iseries / 4.0 iranks = iseries.rank(pct=True) assert_series_equal(iranks, exp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ee4940384b612..05818b013ac52 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -368,7 +368,7 @@ def test_constructor_dtype_datetime64(self): s = Series(dates) self.assertEqual(s.dtype, 'M8[ns]') - s.ix[0] = np.nan + s.iloc[0] = np.nan self.assertEqual(s.dtype, 'M8[ns]') # invalid astypes @@ -581,8 +581,8 @@ def test_constructor_dict(self): d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) expected = Series(np.nan, pidx) - expected.ix[0] = 0 - expected.ix[1] = 1 + expected.iloc[0] = 0 + expected.iloc[1] = 1 assert_series_equal(result, expected) def test_constructor_dict_multiindex(self): diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index c44a7a898bb8d..e6209a853e958 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -185,7 +185,7 @@ def test_iget(self): # pass a slice result = s.iloc[slice(1, 3)] - expected = s.ix[2:4] + expected = s.loc[2:4] assert_series_equal(result, expected) # test slice is a view @@ -330,10 +330,10 @@ def test_getitem_setitem_boolean_corner(self): # ts[mask_shifted] # ts[mask_shifted] = 1 - 
self.assertRaises(Exception, ts.ix.__getitem__, mask_shifted) - self.assertRaises(Exception, ts.ix.__setitem__, mask_shifted, 1) - # ts.ix[mask_shifted] - # ts.ix[mask_shifted] = 2 + self.assertRaises(Exception, ts.loc.__getitem__, mask_shifted) + self.assertRaises(Exception, ts.loc.__setitem__, mask_shifted, 1) + # ts.loc[mask_shifted] + # ts.loc[mask_shifted] = 2 def test_getitem_setitem_slice_integers(self): s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16]) @@ -358,8 +358,8 @@ def test_getitem_setitem_integers(self): # caused bug without test s = Series([1, 2, 3], ['a', 'b', 'c']) - self.assertEqual(s.ix[0], s['a']) - s.ix[0] = 5 + self.assertEqual(s.iloc[0], s['a']) + s.iloc[0] = 5 self.assertAlmostEqual(s['a'], 5) def test_getitem_box_float64(self): @@ -369,7 +369,7 @@ def test_getitem_box_float64(self): def test_getitem_ambiguous_keyerror(self): s = Series(lrange(10), index=lrange(0, 20, 2)) self.assertRaises(KeyError, s.__getitem__, 1) - self.assertRaises(KeyError, s.ix.__getitem__, 1) + self.assertRaises(KeyError, s.loc.__getitem__, 1) def test_getitem_unordered_dup(self): obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) @@ -378,10 +378,10 @@ def test_getitem_unordered_dup(self): def test_getitem_dups_with_missing(self): - # breaks reindex, so need to use .ix internally + # breaks reindex, so need to use .loc internally # GH 4246 s = Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah']) - expected = s.ix[['foo', 'bar', 'bah', 'bam']] + expected = s.loc[['foo', 'bar', 'bah', 'bam']] result = s[['foo', 'bar', 'bah', 'bam']] assert_series_equal(result, expected) @@ -419,7 +419,7 @@ def test_setitem_ambiguous_keyerror(self): assert_series_equal(s2, expected) s2 = s.copy() - s2.ix[1] = 5 + s2.loc[1] = 5 expected = s.append(Series([5], index=[1])) assert_series_equal(s2, expected) @@ -428,7 +428,7 @@ def test_setitem_float_labels(self): s = Series(['a', 'b', 'c'], index=[0, 0.5, 1]) tmp = s.copy() - s.ix[1] = 'zoo' + s.loc[1] = 'zoo' tmp.iloc[2] = 'zoo' assert_series_equal(s, tmp) @@ -490,14 +490,14 @@ def f(): def test_slice_floats2(self): s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float)) - self.assertEqual(len(s.ix[12.0:]), 8) - self.assertEqual(len(s.ix[12.5:]), 7) + self.assertEqual(len(s.loc[12.0:]), 8) + self.assertEqual(len(s.loc[12.5:]), 7) i = np.arange(10, 20, dtype=float) i[2] = 12.2 s.index = i - self.assertEqual(len(s.ix[12.0:]), 8) - self.assertEqual(len(s.ix[12.5:]), 7) + self.assertEqual(len(s.loc[12.0:]), 8) + self.assertEqual(len(s.loc[12.5:]), 7) def test_slice_float64(self): @@ -635,7 +635,7 @@ def test_basic_getitem_with_labels(self): assert_series_equal(result, expected) result = self.ts[indices[0]:indices[2]] - expected = self.ts.ix[indices[0]:indices[2]] + expected = self.ts.loc[indices[0]:indices[2]] assert_series_equal(result, expected) # integer indexes, be careful @@ -668,13 +668,13 @@ def test_basic_setitem_with_labels(self): cp = self.ts.copy() exp = self.ts.copy() cp[indices] = 0 - exp.ix[indices] = 0 + exp.loc[indices] = 0 assert_series_equal(cp, exp) cp = self.ts.copy() exp = self.ts.copy() cp[indices[0]:indices[2]] = 0 - exp.ix[indices[0]:indices[2]] = 0 + exp.loc[indices[0]:indices[2]] = 0 assert_series_equal(cp, exp) # integer indexes, be careful @@ -685,13 +685,13 @@ def test_basic_setitem_with_labels(self): cp = s.copy() exp = s.copy() s[inds] = 0 - s.ix[inds] = 0 + s.loc[inds] = 0 assert_series_equal(cp, exp) cp = s.copy() exp = s.copy() s[arr_inds] = 0 - s.ix[arr_inds] = 0 + s.loc[arr_inds] = 0 
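
The conversions in this hunk split every `.ix` access into an explicit label
(`.loc`) or positional (`.iloc`) access. A minimal sketch of the distinction on
an integer-labeled Series, the case where `.ix` was most ambiguous (toy data,
not part of the patch):

    import pandas as pd

    s = pd.Series([10, 20, 30], index=[2, 4, 6])
    s.loc[4]   # 20, strictly label-based lookup
    s.iloc[1]  # 20, strictly position-based lookup
    # .ix resolved 4 as a label here, but fell back to positions on a
    # non-integer index; that guessing is exactly what the migration removes.
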
assert_series_equal(cp, exp) inds_notfound = [0, 4, 5, 6] @@ -719,48 +719,48 @@ def test_basic_setitem_with_labels(self): result = s2['a'] self.assertEqual(result, expected) - def test_ix_getitem(self): + def test_loc_getitem(self): inds = self.series.index[[3, 4, 7]] - assert_series_equal(self.series.ix[inds], self.series.reindex(inds)) - assert_series_equal(self.series.ix[5::2], self.series[5::2]) + assert_series_equal(self.series.loc[inds], self.series.reindex(inds)) + assert_series_equal(self.series.iloc[5::2], self.series[5::2]) # slice with indices d1, d2 = self.ts.index[[5, 15]] - result = self.ts.ix[d1:d2] + result = self.ts.loc[d1:d2] expected = self.ts.truncate(d1, d2) assert_series_equal(result, expected) # boolean mask = self.series > self.series.median() - assert_series_equal(self.series.ix[mask], self.series[mask]) + assert_series_equal(self.series.loc[mask], self.series[mask]) # ask for index value - self.assertEqual(self.ts.ix[d1], self.ts[d1]) - self.assertEqual(self.ts.ix[d2], self.ts[d2]) + self.assertEqual(self.ts.loc[d1], self.ts[d1]) + self.assertEqual(self.ts.loc[d2], self.ts[d2]) - def test_ix_getitem_not_monotonic(self): + def test_loc_getitem_not_monotonic(self): d1, d2 = self.ts.index[[5, 15]] ts2 = self.ts[::2][[1, 2, 0]] - self.assertRaises(KeyError, ts2.ix.__getitem__, slice(d1, d2)) - self.assertRaises(KeyError, ts2.ix.__setitem__, slice(d1, d2), 0) + self.assertRaises(KeyError, ts2.loc.__getitem__, slice(d1, d2)) + self.assertRaises(KeyError, ts2.loc.__setitem__, slice(d1, d2), 0) - def test_ix_getitem_setitem_integer_slice_keyerrors(self): + def test_loc_getitem_setitem_integer_slice_keyerrors(self): s = Series(np.random.randn(10), index=lrange(0, 20, 2)) # this is OK cp = s.copy() - cp.ix[4:10] = 0 - self.assertTrue((cp.ix[4:10] == 0).all()) + cp.iloc[4:10] = 0 + self.assertTrue((cp.iloc[4:10] == 0).all()) # so is this cp = s.copy() - cp.ix[3:11] = 0 - self.assertTrue((cp.ix[3:11] == 0).values.all()) + cp.iloc[3:11] = 0 + self.assertTrue((cp.iloc[3:11] == 0).values.all()) - result = s.ix[4:10] - result2 = s.ix[3:11] + result = s.iloc[2:6] + result2 = s.loc[3:11] expected = s.reindex([4, 6, 8, 10]) assert_series_equal(result, expected) @@ -768,12 +768,12 @@ def test_ix_getitem_setitem_integer_slice_keyerrors(self): # non-monotonic, raise KeyError s2 = s.iloc[lrange(5) + lrange(5, 10)[::-1]] - self.assertRaises(KeyError, s2.ix.__getitem__, slice(3, 11)) - self.assertRaises(KeyError, s2.ix.__setitem__, slice(3, 11), 0) + self.assertRaises(KeyError, s2.loc.__getitem__, slice(3, 11)) + self.assertRaises(KeyError, s2.loc.__setitem__, slice(3, 11), 0) - def test_ix_getitem_iterator(self): + def test_loc_getitem_iterator(self): idx = iter(self.series.index[:10]) - result = self.series.ix[idx] + result = self.series.loc[idx] assert_series_equal(result, self.series[:10]) def test_setitem_with_tz(self): @@ -883,7 +883,7 @@ def test_where(self): assert_series_equal(rs, expected) expected = s2.abs() - expected.ix[0] = s2[0] + expected.iloc[0] = s2[0] rs = s2.where(cond[:3], -s2) assert_series_equal(rs, expected) @@ -1228,25 +1228,25 @@ def test_ix_setitem(self): inds = self.series.index[[3, 4, 7]] result = self.series.copy() - result.ix[inds] = 5 + result.loc[inds] = 5 expected = self.series.copy() expected[[3, 4, 7]] = 5 assert_series_equal(result, expected) - result.ix[5:10] = 10 + result.iloc[5:10] = 10 expected[5:10] = 10 assert_series_equal(result, expected) # set slice with indices d1, d2 = self.series.index[[5, 15]] - result.ix[d1:d2] = 6 + result.loc[d1:d2] = 6 
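
The `[5:16]` on the next line compensates for label slices being closed on
both ends, unlike positional slices. A short illustration (toy index, not from
the patch):

    import pandas as pd

    s = pd.Series(range(5), index=list('abcde'))
    s.loc['b':'d']  # rows b, c and d; label slicing includes both endpoints
    s.iloc[1:4]     # the same rows; positional slicing excludes the stop
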
expected[5:16] = 6 # because it's inclusive assert_series_equal(result, expected) # set index value - self.series.ix[d1] = 4 - self.series.ix[d2] = 6 + self.series.loc[d1] = 4 + self.series.loc[d2] = 6 self.assertEqual(self.series[d1], 4) self.assertEqual(self.series[d2], 6) @@ -1295,15 +1295,15 @@ def test_ix_setitem_boolean(self): mask = self.series > self.series.median() result = self.series.copy() - result.ix[mask] = 0 + result.loc[mask] = 0 expected = self.series expected[mask] = 0 assert_series_equal(result, expected) def test_ix_setitem_corner(self): inds = list(self.series.index[[5, 8, 12]]) - self.series.ix[inds] = 5 - self.assertRaises(Exception, self.series.ix.__setitem__, + self.series.loc[inds] = 5 + self.assertRaises(Exception, self.series.loc.__setitem__, inds + ['foo'], 5) def test_get_set_boolean_different_order(self): @@ -1494,7 +1494,7 @@ def test_drop(self): result = s.drop('bc', errors='ignore') assert_series_equal(result, s) result = s.drop(['a', 'd'], errors='ignore') - expected = s.ix[1:] + expected = s.iloc[1:] assert_series_equal(result, expected) # bad axis @@ -1943,7 +1943,7 @@ def test_multilevel_preserve_name(self): s = Series(np.random.randn(len(index)), index=index, name='sth') result = s['foo'] - result2 = s.ix['foo'] + result2 = s.loc['foo'] self.assertEqual(result.name, s.name) self.assertEqual(result2.name, s.name) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 68ad29f1418c3..af52f6e712e61 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -178,5 +178,5 @@ def test_timeseries_repr_object_dtype(self): ts = tm.makeTimeSeries(1000) self.assertTrue(repr(ts).splitlines()[-1].startswith('Freq:')) - ts2 = ts.ix[np.random.randint(0, len(ts) - 1, 400)] + ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)] repr(ts2).splitlines()[-1] diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index cc07c7d9dd59b..5bcf258020349 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -22,7 +22,7 @@ def test_indexing_sliced(self): tm.assert_series_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedSeries) - res = s.ix[['a', 'b']] + res = s.loc[['a', 'b']] exp = tm.SubclassedSeries([1, 2], index=list('ab')) tm.assert_series_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedSeries) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d159e1a20d069..2321467d04c79 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3367,23 +3367,23 @@ def test_slicing_and_getting_ops(self): # ix # frame - # res_df = df.ix["j":"k",[0,1]] # doesn't work? - res_df = df.ix["j":"k", :] + # res_df = df.loc["j":"k",[0,1]] # doesn't work? 
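
Several conversions below turn `df.ix["j", 0]` into
`df.loc["j", df.columns[0]]`: `.ix` accepted a label on one axis and a
position on the other, while `.loc` wants labels on both axes, so the
positional half is translated first. A sketch of the two idiomatic
translations (toy frame, not from the patch):

    import pandas as pd

    df = pd.DataFrame({'cats': list('abb'), 'values': [1, 2, 2]},
                      index=list('hij'))
    # label row plus positional column: translate the position into a label
    df.loc['j', df.columns[0]]
    # or translate the label into a position and stay purely positional
    df.iloc[df.index.get_loc('j'), 0]
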
+ res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) self.assertTrue(is_categorical_dtype(res_df["cats"])) # row - res_row = df.ix["j", :] + res_row = df.loc["j", :] tm.assert_series_equal(res_row, exp_row) tm.assertIsInstance(res_row["cats"], compat.string_types) # col - res_col = df.ix[:, "cats"] + res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) self.assertTrue(is_categorical_dtype(res_col)) # single value - res_val = df.ix["j", 0] + res_val = df.loc["j", df.columns[0]] self.assertEqual(res_val, exp_val) # iat @@ -3456,7 +3456,7 @@ def test_slicing_doc_examples(self): index=['h', 'i', 'j'], name='cats') tm.assert_series_equal(result, expected) - result = df.ix["h":"j", 0:1] + result = df.loc["h":"j", df.columns[0:1]] expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], categories=['a', 'b', 'c'])}, index=['h', 'i', 'j']) @@ -3657,73 +3657,73 @@ def f(): with tm.assertRaises(ValueError): df.loc["j":"k", "cats"] = ["c", "c"] - # ix + # loc # ############## # - assign a single value -> exp_single_cats_value df = orig.copy() - df.ix["j", 0] = "b" + df.loc["j", df.columns[0]] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() - df.ix[df.index == "j", 0] = "b" + df.loc[df.index == "j", df.columns[0]] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set def f(): df = orig.copy() - df.ix["j", 0] = "c" + df.loc["j", df.columns[0]] = "c" self.assertRaises(ValueError, f) # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() - df.ix["j", :] = ["b", 2] + df.loc["j", :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() - df.ix["j", :] = ["c", 2] + df.loc["j", :] = ["c", 2] self.assertRaises(ValueError, f) # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() - df.ix["j":"k", :] = [["b", 2], ["b", 2]] + df.loc["j":"k", :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) def f(): df = orig.copy() - df.ix["j":"k", :] = [["c", 2], ["c", 2]] + df.loc["j":"k", :] = [["c", 2], ["c", 2]] self.assertRaises(ValueError, f) # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical(["b", "b"], categories=["a", "b"]) + df.loc["j":"k", df.columns[0]] = pd.Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical( + df.loc["j":"k", df.columns[0]] = pd.Categorical( ["b", "b"], categories=["a", "b", "c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical(["c", "c"], - categories=["a", "b", "c"]) + df.loc["j":"k", df.columns[0]] = pd.Categorical( + ["c", "c"], categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() - df.ix["j":"k", 0] = ["b", "b"] + df.loc["j":"k", df.columns[0]] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - df.ix["j":"k", 0] = ["c", "c"] + df.loc["j":"k", df.columns[0]] = ["c", "c"] # iat df = orig.copy() diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index f32990ff32cbe..f7b7ae8c66382 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1119,7 +1119,7 @@ 
def test_describe_empty_int_columns(self): desc = df[df[0] < 0].describe() # works assert_series_equal(desc.xs('count'), Series([0, 0], dtype=float, name='count')) - self.assertTrue(isnull(desc.ix[1:]).all().all()) + self.assertTrue(isnull(desc.iloc[1:]).all().all()) def test_describe_objects(self): df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']}) @@ -1751,7 +1751,7 @@ def test_squeeze(self): tm.assert_frame_equal(p.squeeze(), p['ItemA']) p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A']) - tm.assert_series_equal(p.squeeze(), p.ix['ItemA', :, 'A']) + tm.assert_series_equal(p.squeeze(), p.loc['ItemA', :, 'A']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): p4d = tm.makePanel4D().reindex(labels=['label1']) @@ -1759,7 +1759,7 @@ def test_squeeze(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA']) - tm.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA']) + tm.assert_frame_equal(p4d.squeeze(), p4d.loc['label1', 'ItemA']) # don't fail with 0 length dimensions GH11229 & GH8999 empty_series = pd.Series([], name='five') @@ -1915,7 +1915,7 @@ def test_equals(self): df1['end'] = date_range('2000-1-1', periods=10, freq='D') df1['diff'] = df1['end'] - df1['start'] df1['bool'] = (np.arange(10) % 3 == 0) - df1.ix[::2] = nan + df1.loc[::2] = nan df2 = df1.copy() self.assertTrue(df1['text'].equals(df2['text'])) self.assertTrue(df1['start'].equals(df2['start'])) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 22addd4c23817..5000d6d4510fb 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -822,7 +822,7 @@ def test_unicode_repr_doesnt_raise(self): def test_missing_unicode_key(self): df = DataFrame({"a": [1]}) try: - df.ix[:, u("\u05d0")] # should not raise UnicodeEncodeError + df.loc[:, u("\u05d0")] # should not raise UnicodeEncodeError except KeyError: pass # this is the expected exception diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 59d9e1e094d9d..37bfe667b0205 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=W0612,E1101,W0141 +from warnings import catch_warnings import datetime import itertools import nose @@ -188,8 +189,12 @@ def _test_roundtrip(frame): _test_roundtrip(self.ymd.T) def test_reindex(self): - reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] - expected = self.frame.ix[[0, 3]] + expected = self.frame.iloc[[0, 3]] + reindexed = self.frame.loc[[('foo', 'one'), ('bar', 'one')]] + assert_frame_equal(reindexed, expected) + + with catch_warnings(record=True): + reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): @@ -197,14 +202,18 @@ def test_reindex_preserve_levels(self): chunk = self.ymd.reindex(new_index) self.assertIs(chunk.index, new_index) - chunk = self.ymd.ix[new_index] + chunk = self.ymd.loc[new_index] + self.assertIs(chunk.index, new_index) + + with catch_warnings(record=True): + chunk = self.ymd.ix[new_index] self.assertIs(chunk.index, new_index) ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) self.assertIs(chunk.columns, new_index) - chunk = ymdT.ix[:, new_index] + chunk = ymdT.loc[:, new_index] self.assertIs(chunk.columns, new_index) def test_sort_index_preserve_levels(self): @@ -286,7 +295,7 @@ def test_series_getitem(self): 
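
Throughout this patch, surviving `.ix` calls are wrapped in
`catch_warnings(record=True)` so the new DeprecationWarning is captured rather
than printed or escalated by test runs that turn warnings into errors. The
pattern, roughly, assuming the pandas version this patch targets (where `.ix`
still exists, merely deprecated):

    from warnings import catch_warnings

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})
    with catch_warnings(record=True):
        # record=True installs an always-on filter and logs warnings to a
        # list, so the deprecated access runs silently inside the block
        value = df.ix[1, 'A']
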
result = s[2000, 3] # TODO(wesm): unused? - # result2 = s.ix[2000, 3] + # result2 = s.loc[2000, 3] expected = s.reindex(s.index[42:65]) expected.index = expected.index.droplevel(0).droplevel(0) @@ -297,8 +306,12 @@ def test_series_getitem(self): self.assertEqual(result, expected) # fancy - result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] expected = s.reindex(s.index[49:51]) + result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] + assert_series_equal(result, expected) + + with catch_warnings(record=True): + result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] assert_series_equal(result, expected) # key error @@ -356,13 +369,13 @@ def test_frame_getitem_setitem_boolean(self): def test_frame_getitem_setitem_slice(self): # getitem - result = self.frame.ix[:4] + result = self.frame.iloc[:4] expected = self.frame[:4] assert_frame_equal(result, expected) # setitem cp = self.frame.copy() - cp.ix[:4] = 0 + cp.iloc[:4] = 0 self.assertTrue((cp.values[:4] == 0).all()) self.assertTrue((cp.values[4:] != 0).all()) @@ -373,21 +386,25 @@ def test_frame_getitem_setitem_multislice(self): midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) - result = df.ix[:, 'value'] + result = df.loc[:, 'value'] assert_series_equal(df['value'], result) - result = df.ix[1:3, 'value'] + with catch_warnings(record=True): + result = df.ix[:, 'value'] + assert_series_equal(df['value'], result) + + result = df.loc[df.index[1:3], 'value'] assert_series_equal(df['value'][1:3], result) - result = df.ix[:, :] + result = df.loc[:, :] assert_frame_equal(df, result) result = df - df.ix[:, 'value'] = 10 + df.loc[:, 'value'] = 10 result['value'] = 10 assert_frame_equal(df, result) - df.ix[:, :] = 10 + df.loc[:, :] = 10 assert_frame_equal(df, result) def test_frame_getitem_multicolumn_empty_level(self): @@ -444,20 +461,23 @@ def test_getitem_tuple_plus_slice(self): idf = df.set_index(['a', 'b']) - result = idf.ix[(0, 0), :] - expected = idf.ix[0, 0] + result = idf.loc[(0, 0), :] + expected = idf.loc[0, 0] expected2 = idf.xs((0, 0)) + with catch_warnings(record=True): + expected3 = idf.ix[0, 0] assert_series_equal(result, expected) assert_series_equal(result, expected2) + assert_series_equal(result, expected3) def test_getitem_setitem_tuple_plus_columns(self): # GH #1013 df = self.ymd[:5] - result = df.ix[(2000, 1, 6), ['A', 'B', 'C']] - expected = df.ix[2000, 1, 6][['A', 'B', 'C']] + result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] + expected = df.loc[2000, 1, 6][['A', 'B', 'C']] assert_series_equal(result, expected) def test_getitem_multilevel_index_tuple_unsorted(self): @@ -466,7 +486,7 @@ def test_getitem_multilevel_index_tuple_unsorted(self): columns=index_columns + ["data"]) df = df.set_index(index_columns) query_index = df.index[:1] - rs = df.ix[query_index, "data"] + rs = df.loc[query_index, "data"] xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) xp = Series(['x'], index=xp_idx, name='data') @@ -474,7 +494,7 @@ def test_getitem_multilevel_index_tuple_unsorted(self): def test_xs(self): xs = self.frame.xs(('bar', 'two')) - xs2 = self.frame.ix[('bar', 'two')] + xs2 = self.frame.loc[('bar', 'two')] assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) @@ -500,13 +520,13 @@ def test_xs(self): def test_xs_partial(self): result = self.frame.xs('foo') - result2 = self.frame.ix['foo'] + result2 = self.frame.loc['foo'] expected = self.frame.T['foo'].T assert_frame_equal(result, expected) assert_frame_equal(result, result2) result = 
self.ymd.xs((2000, 4)) - expected = self.ymd.ix[2000, 4] + expected = self.ymd.loc[2000, 4] assert_frame_equal(result, expected) # ex from #1796 @@ -518,7 +538,7 @@ def test_xs_partial(self): columns=list('abcd')) result = df.xs(['foo', 'one']) - expected = df.ix['foo', 'one'] + expected = df.loc['foo', 'one'] assert_frame_equal(result, expected) def test_xs_level(self): @@ -576,8 +596,9 @@ def f(x): idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) idx.names = ['date', 'secid'] df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) + rs = df.xs(20111201, level='date') - xp = df.ix[20111201, :] + xp = df.loc[20111201, :] assert_frame_equal(rs, xp) def test_xs_level0(self): @@ -603,7 +624,7 @@ def test_xs_level_series(self): s = self.ymd['A'] result = s[2000, 5] - expected = self.ymd.ix[2000, 5]['A'] + expected = self.ymd.loc[2000, 5]['A'] assert_series_equal(result, expected) # not implementing this for now @@ -633,7 +654,7 @@ def test_getitem_toplevel(self): assert_frame_equal(result, expected) result = df['bar'] - result2 = df.ix[:, 'bar'] + result2 = df.loc[:, 'bar'] expected = df.reindex(columns=df.columns[3:5]) expected.columns = expected.columns.droplevel(0) @@ -646,21 +667,21 @@ def test_getitem_setitem_slice_integers(self): frame = DataFrame(np.random.randn(len(index), 4), index=index, columns=['a', 'b', 'c', 'd']) - res = frame.ix[1:2] + res = frame.loc[1:2] exp = frame.reindex(frame.index[2:]) assert_frame_equal(res, exp) - frame.ix[1:2] = 7 - self.assertTrue((frame.ix[1:2] == 7).values.all()) + frame.loc[1:2] = 7 + self.assertTrue((frame.loc[1:2] == 7).values.all()) series = Series(np.random.randn(len(index)), index=index) - res = series.ix[1:2] + res = series.loc[1:2] exp = series.reindex(series.index[2:]) assert_series_equal(res, exp) - series.ix[1:2] = 7 - self.assertTrue((series.ix[1:2] == 7).values.all()) + series.loc[1:2] = 7 + self.assertTrue((series.loc[1:2] == 7).values.all()) def test_getitem_int(self): levels = [[0, 1], [0, 1, 2]] @@ -669,16 +690,16 @@ def test_getitem_int(self): frame = DataFrame(np.random.randn(6, 2), index=index) - result = frame.ix[1] + result = frame.loc[1] expected = frame[-3:] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) # raises exception - self.assertRaises(KeyError, frame.ix.__getitem__, 3) + self.assertRaises(KeyError, frame.loc.__getitem__, 3) # however this will work - result = self.frame.ix[2] + result = self.frame.iloc[2] expected = self.frame.xs(self.frame.index[2]) assert_series_equal(result, expected) @@ -694,7 +715,7 @@ def test_getitem_slice_not_sorted(self): df = self.frame.sort_index(level=1).T # buglet with int typechecking - result = df.ix[:, :np.int32(3)] + result = df.iloc[:, :np.int32(3)] expected = df.reindex(columns=df.columns[:3]) assert_frame_equal(result, expected) @@ -709,21 +730,27 @@ def test_setitem_change_dtype(self): assert_series_equal(reindexed['foo', 'two'], s > s.median()) def test_frame_setitem_ix(self): - self.frame.ix[('bar', 'two'), 'B'] = 5 - self.assertEqual(self.frame.ix[('bar', 'two'), 'B'], 5) + self.frame.loc[('bar', 'two'), 'B'] = 5 + self.assertEqual(self.frame.loc[('bar', 'two'), 'B'], 5) # with integer labels df = self.frame.copy() df.columns = lrange(3) - df.ix[('bar', 'two'), 1] = 7 - self.assertEqual(df.ix[('bar', 'two'), 1], 7) + df.loc[('bar', 'two'), 1] = 7 + self.assertEqual(df.loc[('bar', 'two'), 1], 7) + + with catch_warnings(record=True): + df = self.frame.copy() + df.columns = lrange(3) + df.ix[('bar', 'two'), 1] = 7 + 
self.assertEqual(df.loc[('bar', 'two'), 1], 7) def test_fancy_slice_partial(self): - result = self.frame.ix['bar':'baz'] + result = self.frame.loc['bar':'baz'] expected = self.frame[3:7] assert_frame_equal(result, expected) - result = self.ymd.ix[(2000, 2):(2000, 4)] + result = self.ymd.loc[(2000, 2):(2000, 4)] lev = self.ymd.index.labels[1] expected = self.ymd[(lev >= 1) & (lev <= 3)] assert_frame_equal(result, expected) @@ -733,15 +760,19 @@ def test_getitem_partial_column_select(self): levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) df = DataFrame(np.random.rand(3, 2), index=idx) - result = df.ix[('a', 'y'), :] - expected = df.ix[('a', 'y')] + result = df.loc[('a', 'y'), :] + expected = df.loc[('a', 'y')] + assert_frame_equal(result, expected) + + result = df.loc[('a', 'y'), [1, 0]] + expected = df.loc[('a', 'y')][[1, 0]] assert_frame_equal(result, expected) - result = df.ix[('a', 'y'), [1, 0]] - expected = df.ix[('a', 'y')][[1, 0]] + with catch_warnings(record=True): + result = df.ix[('a', 'y'), [1, 0]] assert_frame_equal(result, expected) - self.assertRaises(KeyError, df.ix.__getitem__, + self.assertRaises(KeyError, df.loc.__getitem__, (('a', 'foo'), slice(None, None))) def test_sort_index_level(self): @@ -834,10 +865,10 @@ def _check_counts(frame, axis=0): expected = expected.reindex_like(result).astype('i8') assert_frame_equal(result, expected) - self.frame.ix[1, [1, 2]] = np.nan - self.frame.ix[7, [0, 1]] = np.nan - self.ymd.ix[1, [1, 2]] = np.nan - self.ymd.ix[7, [0, 1]] = np.nan + self.frame.iloc[1, [1, 2]] = np.nan + self.frame.iloc[7, [0, 1]] = np.nan + self.ymd.iloc[1, [1, 2]] = np.nan + self.ymd.iloc[7, [0, 1]] = np.nan _check_counts(self.frame) _check_counts(self.ymd) @@ -953,7 +984,7 @@ def test_stack(self): assert_frame_equal(result, expected) # not all levels present in each echelon - unstacked = self.ymd.unstack(2).ix[:, ::3] + unstacked = self.ymd.unstack(2).loc[:, ::3] stacked = unstacked.stack().stack() ymd_stacked = self.ymd.stack() assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) @@ -1096,7 +1127,7 @@ def test_stack_unstack_multiple(self): unstacked = self.ymd.unstack([2, 1]) expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') - assert_frame_equal(unstacked, expected.ix[:, unstacked.columns]) + assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) def test_stack_names_and_numbers(self): unstacked = self.ymd.unstack(['year', 'month']) @@ -1214,7 +1245,7 @@ def test_stack_multiple_bug(self): down = unst.resample('W-THU').mean() rs = down.stack('ID') - xp = unst.ix[:, ['VAR1']].resample('W-THU').mean().stack('ID') + xp = unst.loc[:, ['VAR1']].resample('W-THU').mean().stack('ID') xp.columns.name = 'Params' assert_frame_equal(rs, xp) @@ -1306,8 +1337,8 @@ def test_groupby_level_no_obs(self): self.assertTrue((result.columns == ['f2', 'f3']).all()) def test_join(self): - a = self.frame.ix[:5, ['A']] - b = self.frame.ix[2:, ['B', 'C']] + a = self.frame.loc[self.frame.index[:5], ['A']] + b = self.frame.loc[self.frame.index[2:], ['B', 'C']] joined = a.join(b, how='outer').reindex(self.frame.index) expected = self.frame.copy() @@ -1443,7 +1474,7 @@ def test_frame_getitem_not_sorted(self): arrays = [np.array(x) for x in zip(*df.columns._tuple_index)] result = df['foo'] - result2 = df.ix[:, 'foo'] + result2 = df.loc[:, 'foo'] expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) @@ -1451,7 +1482,7 @@ def test_frame_getitem_not_sorted(self): df = 
df.T result = df.xs('foo') - result2 = df.ix['foo'] + result2 = df.loc['foo'] expected = df.reindex(df.index[arrays[0] == 'foo']) expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) @@ -1467,7 +1498,7 @@ def test_series_getitem_not_sorted(self): arrays = [np.array(x) for x in zip(*index._tuple_index)] result = s['qux'] - result2 = s.ix['qux'] + result2 = s.loc['qux'] expected = s[arrays[0] == 'qux'] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) @@ -1515,8 +1546,8 @@ def test_series_group_min_max(self): assert_series_equal(leftside, rightside) def test_frame_group_ops(self): - self.frame.ix[1, [1, 2]] = np.nan - self.frame.ix[7, [0, 1]] = np.nan + self.frame.iloc[1, [1, 2]] = np.nan + self.frame.iloc[7, [0, 1]] = np.nan for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2), lrange(2), @@ -1620,13 +1651,13 @@ def test_multilevel_consolidate(self): df = df.consolidate() def test_ix_preserve_names(self): - result = self.ymd.ix[2000] - result2 = self.ymd['A'].ix[2000] + result = self.ymd.loc[2000] + result2 = self.ymd['A'].loc[2000] self.assertEqual(result.index.names, self.ymd.index.names[1:]) self.assertEqual(result2.index.names, self.ymd.index.names[1:]) - result = self.ymd.ix[2000, 2] - result2 = self.ymd['A'].ix[2000, 2] + result = self.ymd.loc[2000, 2] + result2 = self.ymd['A'].loc[2000, 2] self.assertEqual(result.index.name, self.ymd.index.names[2]) self.assertEqual(result2.index.name, self.ymd.index.names[2]) @@ -1634,20 +1665,20 @@ def test_partial_set(self): # GH #397 df = self.ymd.copy() exp = self.ymd.copy() - df.ix[2000, 4] = 0 - exp.ix[2000, 4].values[:] = 0 + df.loc[2000, 4] = 0 + exp.loc[2000, 4].values[:] = 0 assert_frame_equal(df, exp) - df['A'].ix[2000, 4] = 1 - exp['A'].ix[2000, 4].values[:] = 1 + df['A'].loc[2000, 4] = 1 + exp['A'].loc[2000, 4].values[:] = 1 assert_frame_equal(df, exp) - df.ix[2000] = 5 - exp.ix[2000].values[:] = 5 + df.loc[2000] = 5 + exp.loc[2000].values[:] = 5 assert_frame_equal(df, exp) # this works...for now - df['A'].ix[14] = 5 + df['A'].iloc[14] = 5 self.assertEqual(df['A'][14], 5) def test_unstack_preserve_types(self): @@ -1693,12 +1724,12 @@ def test_unstack_group_index_overflow(self): self.assertEqual(result.shape, (500, 2)) def test_getitem_lowerdim_corner(self): - self.assertRaises(KeyError, self.frame.ix.__getitem__, + self.assertRaises(KeyError, self.frame.loc.__getitem__, (('bar', 'three'), 'B')) # in theory should be inserting in a sorted space???? - self.frame.ix[('bar', 'three'), 'B'] = 0 - self.assertEqual(self.frame.sort_index().ix[('bar', 'three'), 'B'], 0) + self.frame.loc[('bar', 'three'), 'B'] = 0 + self.assertEqual(self.frame.sort_index().loc[('bar', 'three'), 'B'], 0) # --------------------------------------------------------------------- # AMBIGUOUS CASES! @@ -1706,18 +1737,18 @@ def test_getitem_lowerdim_corner(self): def test_partial_ix_missing(self): raise nose.SkipTest("skipping for now") - result = self.ymd.ix[2000, 0] - expected = self.ymd.ix[2000]['A'] + result = self.ymd.loc[2000, 0] + expected = self.ymd.loc[2000]['A'] assert_series_equal(result, expected) # need to put in some work here - # self.ymd.ix[2000, 0] = 0 - # self.assertTrue((self.ymd.ix[2000]['A'] == 0).all()) + # self.ymd.loc[2000, 0] = 0 + # self.assertTrue((self.ymd.loc[2000]['A'] == 0).all()) # Pretty sure the second (and maybe even the first) is already wrong. 
- self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6)) - self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0) + self.assertRaises(Exception, self.ymd.loc.__getitem__, (2000, 6)) + self.assertRaises(Exception, self.ymd.loc.__getitem__, (2000, 6), 0) # --------------------------------------------------------------------- @@ -1735,7 +1766,7 @@ def test_level_with_tuples(self): frame = DataFrame(np.random.randn(6, 4), index=index) result = series[('foo', 'bar', 0)] - result2 = series.ix[('foo', 'bar', 0)] + result2 = series.loc[('foo', 'bar', 0)] expected = series[:2] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) @@ -1743,7 +1774,7 @@ def test_level_with_tuples(self): self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2)) - result = frame.ix[('foo', 'bar', 0)] + result = frame.loc[('foo', 'bar', 0)] result2 = frame.xs(('foo', 'bar', 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) @@ -1758,13 +1789,13 @@ def test_level_with_tuples(self): frame = DataFrame(np.random.randn(6, 4), index=index) result = series[('foo', 'bar')] - result2 = series.ix[('foo', 'bar')] + result2 = series.loc[('foo', 'bar')] expected = series[:2] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) - result = frame.ix[('foo', 'bar')] + result = frame.loc[('foo', 'bar')] result2 = frame.xs(('foo', 'bar')) expected = frame[:2] expected.index = expected.index.droplevel(0) @@ -1859,7 +1890,7 @@ def test_drop_nonunique(self): columns=["var1", "var2", "var3", "var4"]) grp_size = df.groupby("var1").size() - drop_idx = grp_size.ix[grp_size == 1] + drop_idx = grp_size.loc[grp_size == 1] idf = df.set_index(["var1", "var2", "var3"]) @@ -1896,65 +1927,65 @@ def test_mixed_depth_pop(self): def test_reindex_level_partial_selection(self): result = self.frame.reindex(['foo', 'qux'], level=0) - expected = self.frame.ix[[0, 1, 2, 7, 8, 9]] + expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]] assert_frame_equal(result, expected) result = self.frame.T.reindex_axis(['foo', 'qux'], axis=1, level=0) assert_frame_equal(result, expected.T) - result = self.frame.ix[['foo', 'qux']] + result = self.frame.loc[['foo', 'qux']] assert_frame_equal(result, expected) - result = self.frame['A'].ix[['foo', 'qux']] + result = self.frame['A'].loc[['foo', 'qux']] assert_series_equal(result, expected['A']) - result = self.frame.T.ix[:, ['foo', 'qux']] + result = self.frame.T.loc[:, ['foo', 'qux']] assert_frame_equal(result, expected.T) def test_setitem_multiple_partial(self): expected = self.frame.copy() result = self.frame.copy() - result.ix[['foo', 'bar']] = 0 - expected.ix['foo'] = 0 - expected.ix['bar'] = 0 + result.loc[['foo', 'bar']] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 assert_frame_equal(result, expected) expected = self.frame.copy() result = self.frame.copy() - result.ix['foo':'bar'] = 0 - expected.ix['foo'] = 0 - expected.ix['bar'] = 0 + result.loc['foo':'bar'] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 assert_frame_equal(result, expected) expected = self.frame['A'].copy() result = self.frame['A'].copy() - result.ix[['foo', 'bar']] = 0 - expected.ix['foo'] = 0 - expected.ix['bar'] = 0 + result.loc[['foo', 'bar']] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 assert_series_equal(result, expected) expected = self.frame['A'].copy() result = self.frame['A'].copy() - result.ix['foo':'bar'] = 0 - expected.ix['foo'] = 0 - expected.ix['bar'] = 0 + 
result.loc['foo':'bar'] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 assert_series_equal(result, expected) def test_drop_level(self): result = self.frame.drop(['bar', 'qux'], level='first') - expected = self.frame.ix[[0, 1, 2, 5, 6]] + expected = self.frame.iloc[[0, 1, 2, 5, 6]] assert_frame_equal(result, expected) result = self.frame.drop(['two'], level='second') - expected = self.frame.ix[[0, 2, 3, 6, 7, 9]] + expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] assert_frame_equal(result, expected) result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') - expected = self.frame.ix[[0, 1, 2, 5, 6]].T + expected = self.frame.iloc[[0, 1, 2, 5, 6]].T assert_frame_equal(result, expected) result = self.frame.T.drop(['two'], axis=1, level='second') - expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T + expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T assert_frame_equal(result, expected) def test_drop_level_nonunique_datetime(self): @@ -2028,12 +2059,12 @@ def test_join_segfault(self): def test_set_column_scalar_with_ix(self): subset = self.frame.index[[1, 4, 5]] - self.frame.ix[subset] = 99 - self.assertTrue((self.frame.ix[subset].values == 99).all()) + self.frame.loc[subset] = 99 + self.assertTrue((self.frame.loc[subset].values == 99).all()) col = self.frame['B'] col[subset] = 97 - self.assertTrue((self.frame.ix[subset, 'B'] == 97).all()) + self.assertTrue((self.frame.loc[subset, 'B'] == 97).all()) def test_frame_dict_constructor_empty_series(self): s1 = Series([ @@ -2057,7 +2088,7 @@ def test_indexing_ambiguity_bug_1678(self): frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns) - result = frame.ix[:, 1] + result = frame.iloc[:, 1] exp = frame.loc[:, ('Ohio', 'Red')] tm.assertIsInstance(result, Series) assert_series_equal(result, exp) @@ -2069,7 +2100,7 @@ def test_nonunique_assignment_1750(self): df = df.set_index(['A', 'B']) ix = MultiIndex.from_tuples([(1, 1)]) - df.ix[ix, "C"] = '_' + df.loc[ix, "C"] = '_' self.assertTrue((df.xs((1, 1))['C'] == '_').all()) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 78edd27877783..d79081a06dbc0 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -140,8 +140,8 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): obj = self.panel # # set some NAs - # obj.ix[5:10] = np.nan - # obj.ix[15:20, -2:] = np.nan + # obj.loc[5:10] = np.nan + # obj.loc[15:20, -2:] = np.nan f = getattr(obj, name) @@ -358,7 +358,7 @@ def test_raise_when_not_implemented(self): items=['ItemA', 'ItemB', 'ItemC'], major_axis=pd.date_range('20130101', periods=4), minor_axis=list('ABCDE')) - d = p.sum(axis=1).ix[0] + d = p.sum(axis=1).iloc[0] ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'div', 'mod', 'pow'] for op in ops: with self.assertRaises(NotImplementedError): @@ -491,7 +491,7 @@ def test_setitem(self): self.assertEqual(self.panel['ItemP'].values.dtype, np.bool_) self.assertRaises(TypeError, self.panel.__setitem__, 'foo', - self.panel.ix[['ItemP']]) + self.panel.loc[['ItemP']]) # bad shape p = Panel(np.random.randn(4, 3, 2)) @@ -591,25 +591,25 @@ def test_getitem_fancy_labels(self): cols = ['D', 'C', 'F'] # all 3 specified - assert_panel_equal(p.ix[items, dates, cols], + assert_panel_equal(p.loc[items, dates, cols], p.reindex(items=items, major=dates, minor=cols)) # 2 specified - assert_panel_equal(p.ix[:, dates, cols], + assert_panel_equal(p.loc[:, dates, cols], p.reindex(major=dates, minor=cols)) - assert_panel_equal(p.ix[items, :, cols], + 
assert_panel_equal(p.loc[items, :, cols], p.reindex(items=items, minor=cols)) - assert_panel_equal(p.ix[items, dates, :], + assert_panel_equal(p.loc[items, dates, :], p.reindex(items=items, major=dates)) # only 1 - assert_panel_equal(p.ix[items, :, :], p.reindex(items=items)) + assert_panel_equal(p.loc[items, :, :], p.reindex(items=items)) - assert_panel_equal(p.ix[:, dates, :], p.reindex(major=dates)) + assert_panel_equal(p.loc[:, dates, :], p.reindex(major=dates)) - assert_panel_equal(p.ix[:, :, cols], p.reindex(minor=cols)) + assert_panel_equal(p.loc[:, :, cols], p.reindex(minor=cols)) def test_getitem_fancy_slice(self): pass @@ -618,8 +618,8 @@ def test_getitem_fancy_ints(self): p = self.panel # #1603 - result = p.ix[:, -1, :] - expected = p.ix[:, p.major_axis[-1], :] + result = p.iloc[:, -1, :] + expected = p.loc[:, p.major_axis[-1], :] assert_frame_equal(result, expected) def test_getitem_fancy_xs(self): @@ -631,22 +631,22 @@ def test_getitem_fancy_xs(self): # get DataFrame # item - assert_frame_equal(p.ix[item], p[item]) - assert_frame_equal(p.ix[item, :], p[item]) - assert_frame_equal(p.ix[item, :, :], p[item]) + assert_frame_equal(p.loc[item], p[item]) + assert_frame_equal(p.loc[item, :], p[item]) + assert_frame_equal(p.loc[item, :, :], p[item]) # major axis, axis=1 - assert_frame_equal(p.ix[:, date], p.major_xs(date)) - assert_frame_equal(p.ix[:, date, :], p.major_xs(date)) + assert_frame_equal(p.loc[:, date], p.major_xs(date)) + assert_frame_equal(p.loc[:, date, :], p.major_xs(date)) # minor axis, axis=2 - assert_frame_equal(p.ix[:, :, 'C'], p.minor_xs('C')) + assert_frame_equal(p.loc[:, :, 'C'], p.minor_xs('C')) # get Series - assert_series_equal(p.ix[item, date], p[item].ix[date]) - assert_series_equal(p.ix[item, date, :], p[item].ix[date]) - assert_series_equal(p.ix[item, :, col], p[item][col]) - assert_series_equal(p.ix[:, date, col], p.major_xs(date).ix[col]) + assert_series_equal(p.loc[item, date], p[item].loc[date]) + assert_series_equal(p.loc[item, date, :], p[item].loc[date]) + assert_series_equal(p.loc[item, :, col], p[item][col]) + assert_series_equal(p.loc[:, date, col], p.major_xs(date).loc[col]) def test_getitem_fancy_xs_check_view(self): item = 'ItemB' @@ -685,9 +685,9 @@ def test_ix_setitem_slice_dataframe(self): b = DataFrame(np.random.randn(2, 3), index=[111, 333], columns=[1, 2, 3]) - a.ix[:, 22, [111, 333]] = b + a.loc[:, 22, [111, 333]] = b - assert_frame_equal(a.ix[:, 22, [111, 333]], b) + assert_frame_equal(a.loc[:, 22, [111, 333]], b) def test_ix_align(self): from pandas import Series @@ -696,28 +696,28 @@ def test_ix_align(self): df_orig = Panel(np.random.randn(3, 10, 2)) df = df_orig.copy() - df.ix[0, :, 0] = b - assert_series_equal(df.ix[0, :, 0].reindex(b.index), b) + df.loc[0, :, 0] = b + assert_series_equal(df.loc[0, :, 0].reindex(b.index), b) df = df_orig.swapaxes(0, 1) - df.ix[:, 0, 0] = b - assert_series_equal(df.ix[:, 0, 0].reindex(b.index), b) + df.loc[:, 0, 0] = b + assert_series_equal(df.loc[:, 0, 0].reindex(b.index), b) df = df_orig.swapaxes(1, 2) - df.ix[0, 0, :] = b - assert_series_equal(df.ix[0, 0, :].reindex(b.index), b) + df.loc[0, 0, :] = b + assert_series_equal(df.loc[0, 0, :].reindex(b.index), b) def test_ix_frame_align(self): p_orig = tm.makePanel() - df = p_orig.ix[0].copy() + df = p_orig.iloc[0].copy() assert_frame_equal(p_orig['ItemA'], df) p = p_orig.copy() - p.ix[0, :, :] = df + p.iloc[0, :, :] = df assert_panel_equal(p, p_orig) p = p_orig.copy() - p.ix[0] = df + p.iloc[0] = df assert_panel_equal(p, p_orig) p = p_orig.copy() @@ 
-741,8 +741,8 @@ def test_ix_frame_align(self): assert_panel_equal(p, p_orig) p = p_orig.copy() - p.ix[0, [0, 1, 3, 5], -2:] = df - out = p.ix[0, [0, 1, 3, 5], -2:] + p.iloc[0, [0, 1, 3, 5], -2:] = df + out = p.iloc[0, [0, 1, 3, 5], -2:] assert_frame_equal(out, df.iloc[[0, 1, 3, 5], [2, 3]]) # GH3830, panel assignent by values/frame @@ -773,10 +773,10 @@ def test_ix_frame_align(self): def _check_view(self, indexer, comp): cp = self.panel.copy() - obj = cp.ix[indexer] + obj = cp.loc[indexer] obj.values[:] = 0 self.assertTrue((obj.values == 0).all()) - comp(cp.ix[indexer].reindex_like(obj), obj) + comp(cp.loc[indexer].reindex_like(obj), obj) def test_logical_with_nas(self): d = Panel({'ItemA': {'a': [np.nan, False]}, @@ -1757,7 +1757,7 @@ def test_to_panel_na_handling(self): [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]]) panel = df.to_panel() - self.assertTrue(isnull(panel[0].ix[1, [0, 1]]).all()) + self.assertTrue(isnull(panel[0].loc[1, [0, 1]]).all()) def test_to_panel_duplicates(self): # #2441 @@ -1900,7 +1900,7 @@ def test_tshift(self): assert_panel_equal(shifted, panel.tshift(1)) assert_panel_equal(unshifted, inferred_ts) - no_freq = panel.ix[:, [0, 5, 7], :] + no_freq = panel.iloc[:, [0, 5, 7], :] self.assertRaises(ValueError, no_freq.tshift) def test_pct_change(self): @@ -2000,7 +2000,7 @@ def test_multiindex_get(self): major_axis=np.arange(5), minor_axis=np.arange(5)) f1 = wp['a'] - f2 = wp.ix['a'] + f2 = wp.loc['a'] assert_panel_equal(f1, f2) self.assertTrue((f1.items == [1, 2]).all()) @@ -2099,10 +2099,10 @@ def test_to_excel_xlsxwriter(self): def test_dropna(self): p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde')) - p.ix[:, ['b', 'd'], 0] = np.nan + p.loc[:, ['b', 'd'], 0] = np.nan result = p.dropna(axis=1) - exp = p.ix[:, ['a', 'c', 'e'], :] + exp = p.loc[:, ['a', 'c', 'e'], :] assert_panel_equal(result, exp) inp = p.copy() inp.dropna(axis=1, inplace=True) @@ -2111,24 +2111,24 @@ def test_dropna(self): result = p.dropna(axis=1, how='all') assert_panel_equal(result, p) - p.ix[:, ['b', 'd'], :] = np.nan + p.loc[:, ['b', 'd'], :] = np.nan result = p.dropna(axis=1, how='all') - exp = p.ix[:, ['a', 'c', 'e'], :] + exp = p.loc[:, ['a', 'c', 'e'], :] assert_panel_equal(result, exp) p = Panel(np.random.randn(4, 5, 6), items=list('abcd')) - p.ix[['b'], :, 0] = np.nan + p.loc[['b'], :, 0] = np.nan result = p.dropna() - exp = p.ix[['a', 'c', 'd']] + exp = p.loc[['a', 'c', 'd']] assert_panel_equal(result, exp) result = p.dropna(how='all') assert_panel_equal(result, p) - p.ix['b'] = np.nan + p.loc['b'] = np.nan result = p.dropna(how='all') - exp = p.ix[['a', 'c', 'd']] + exp = p.loc[['a', 'c', 'd']] assert_panel_equal(result, exp) def test_drop(self): @@ -2334,7 +2334,7 @@ def test_combine_series(self): expected = DataFrame.add(self.panel, s, axis=0) assert_frame_equal(result, expected) - s = self.panel.ix[5] + s = self.panel.iloc[5] result = self.panel + s expected = DataFrame.add(self.panel, s, axis=1) assert_frame_equal(result, expected) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 1b5a7b6ee1e83..0769b8916a11b 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -116,8 +116,8 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): obj = self.panel4d # # set some NAs - # obj.ix[5:10] = np.nan - # obj.ix[15:20, -2:] = np.nan + # obj.loc[5:10] = np.nan + # obj.loc[15:20, -2:] = np.nan f = getattr(obj, name) @@ -531,33 +531,33 @@ def test_getitem_fancy_labels(self): cols = ['D', 'C', 'F'] # all 4 specified - 
assert_panel4d_equal(panel4d.ix[labels, items, dates, cols], + assert_panel4d_equal(panel4d.loc[labels, items, dates, cols], panel4d.reindex(labels=labels, items=items, major=dates, minor=cols)) # 3 specified - assert_panel4d_equal(panel4d.ix[:, items, dates, cols], + assert_panel4d_equal(panel4d.loc[:, items, dates, cols], panel4d.reindex(items=items, major=dates, minor=cols)) # 2 specified - assert_panel4d_equal(panel4d.ix[:, :, dates, cols], + assert_panel4d_equal(panel4d.loc[:, :, dates, cols], panel4d.reindex(major=dates, minor=cols)) - assert_panel4d_equal(panel4d.ix[:, items, :, cols], + assert_panel4d_equal(panel4d.loc[:, items, :, cols], panel4d.reindex(items=items, minor=cols)) - assert_panel4d_equal(panel4d.ix[:, items, dates, :], + assert_panel4d_equal(panel4d.loc[:, items, dates, :], panel4d.reindex(items=items, major=dates)) # only 1 - assert_panel4d_equal(panel4d.ix[:, items, :, :], + assert_panel4d_equal(panel4d.loc[:, items, :, :], panel4d.reindex(items=items)) - assert_panel4d_equal(panel4d.ix[:, :, dates, :], + assert_panel4d_equal(panel4d.loc[:, :, dates, :], panel4d.reindex(major=dates)) - assert_panel4d_equal(panel4d.ix[:, :, :, cols], + assert_panel4d_equal(panel4d.loc[:, :, :, cols], panel4d.reindex(minor=cols)) def test_getitem_fancy_slice(self): @@ -693,12 +693,13 @@ def test_ctor_dict(self): l1 = self.panel4d['l1'] l2 = self.panel4d['l2'] - d = {'A': l1, 'B': l2.ix[['ItemB'], :, :]} + d = {'A': l1, 'B': l2.loc[['ItemB'], :, :]} panel4d = Panel4D(d) assert_panel_equal(panel4d['A'], self.panel4d['l1']) - assert_frame_equal(panel4d.ix['B', 'ItemB', :, :], - self.panel4d.ix['l2', ['ItemB'], :, :]['ItemB']) + assert_frame_equal(panel4d.loc['B', 'ItemB', :, :], + self.panel4d.loc['l2', ['ItemB'], + :, :]['ItemB']) def test_constructor_dict_mixed(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -807,8 +808,8 @@ def test_reindex(self): larger = smaller.reindex(major=self.panel4d.major_axis, method='pad') - assert_panel_equal(larger.ix[:, :, self.panel4d.major_axis[1], :], - smaller.ix[:, :, smaller_major[0], :]) + assert_panel_equal(larger.loc[:, :, self.panel4d.major_axis[1], :], + smaller.loc[:, :, smaller_major[0], :]) # don't necessarily copy result = self.panel4d.reindex( diff --git a/pandas/tests/test_panelnd.py b/pandas/tests/test_panelnd.py index 2a69a65e8d55e..92805f3b30ec6 100644 --- a/pandas/tests/test_panelnd.py +++ b/pandas/tests/test_panelnd.py @@ -94,8 +94,8 @@ def test_5d_construction(self): p5d = Panel5D(dict(C1=p4d)) # slice back to 4d - results = p5d.ix['C1', :, :, 0:3, :] - expected = p4d.ix[:, :, 0:3, :] + results = p5d.iloc[p5d.cool1.get_loc('C1'), :, :, 0:3, :] + expected = p4d.iloc[:, :, 0:3, :] assert_panel_equal(results['L1'], expected['L1']) # test a transpose diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bbcd856250c51..bb1aab3bc7ffb 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -229,7 +229,7 @@ def test_contains(self): # na values = Series(['om', 'foo', np.nan]) res = values.str.contains('foo', na="foo") - self.assertEqual(res.ix[2], "foo") + self.assertEqual(res.loc[2], "foo") def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 929ff43bfaaad..22c3cee95f314 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2190,7 +2190,7 @@ def get_result(obj, obj2=None): return getattr(getattr(obj, 
dispatch)(**kwargs), name)(obj2) panel = get_result(self.frame) - actual = panel.ix[:, 1, 5] + actual = panel.loc[:, 1, 5] expected = get_result(self.frame[1], self.frame[5]) tm.assert_series_equal(actual, expected, check_names=False) self.assertEqual(actual.name, 5) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 172eee99b7c6b..2be7e75573d6e 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -785,8 +785,8 @@ def test_append_different_columns(self): 'floats': np.random.randn(10), 'strings': ['foo', 'bar'] * 5}) - a = df[:5].ix[:, ['bools', 'ints', 'floats']] - b = df[5:].ix[:, ['strings', 'ints', 'floats']] + a = df[:5].loc[:, ['bools', 'ints', 'floats']] + b = df[5:].loc[:, ['strings', 'ints', 'floats']] appended = a.append(b) self.assertTrue(isnull(appended['strings'][0:4]).all()) @@ -802,7 +802,7 @@ def test_append_many(self): chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) + tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame) self.assertTrue((result['foo'][15:] == 'bar').all()) self.assertTrue(result['foo'][:15].isnull().all()) @@ -929,7 +929,7 @@ def test_concat_with_group_keys(self): def test_concat_keys_specific_levels(self): df = DataFrame(np.random.randn(10, 4)) - pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] level = ['three', 'two', 'one', 'zero'] result = concat(pieces, axis=1, keys=['one', 'two', 'three'], levels=[level], @@ -1024,8 +1024,8 @@ def test_concat_multiindex_with_keys(self): result = concat([frame, frame], keys=[0, 1], names=['iteration']) self.assertEqual(result.index.names, ('iteration',) + index.names) - tm.assert_frame_equal(result.ix[0], frame) - tm.assert_frame_equal(result.ix[1], frame) + tm.assert_frame_equal(result.loc[0], frame) + tm.assert_frame_equal(result.loc[1], frame) self.assertEqual(result.index.nlevels, 3) def test_concat_multiindex_with_tz(self): @@ -1202,7 +1202,7 @@ def test_handle_empty_objects(self): frames = [baz, empty, empty, df[5:]] concatted = concat(frames, axis=0) - expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] + expected = df.loc[:, ['a', 'b', 'c', 'd', 'foo']] expected['foo'] = expected['foo'].astype('O') expected.loc[0:4, 'foo'] = 'bar' @@ -1326,28 +1326,28 @@ def test_dtype_coerceion(self): def test_panel_concat_other_axes(self): panel = tm.makePanel() - p1 = panel.ix[:, :5, :] - p2 = panel.ix[:, 5:, :] + p1 = panel.iloc[:, :5, :] + p2 = panel.iloc[:, 5:, :] result = concat([p1, p2], axis=1) tm.assert_panel_equal(result, panel) - p1 = panel.ix[:, :, :2] - p2 = panel.ix[:, :, 2:] + p1 = panel.iloc[:, :, :2] + p2 = panel.iloc[:, :, 2:] result = concat([p1, p2], axis=2) tm.assert_panel_equal(result, panel) # if things are a bit misbehaved - p1 = panel.ix[:2, :, :2] - p2 = panel.ix[:, :, 2:] + p1 = panel.iloc[:2, :, :2] + p2 = panel.iloc[:, :, 2:] p1['ItemC'] = 'baz' result = concat([p1, p2], axis=2) expected = panel.copy() expected['ItemC'] = expected['ItemC'].astype('O') - expected.ix['ItemC', :, :2] = 'baz' + expected.loc['ItemC', :, :2] = 'baz' tm.assert_panel_equal(result, expected) def test_panel_concat_buglet(self): @@ -1379,14 +1379,14 @@ def test_panel4d_concat(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): p4d = tm.makePanel4D() - p1 = p4d.ix[:, :, :5, :] - p2 = p4d.ix[:, :, 5:, :] + p1 = p4d.iloc[:, :, 
:5, :] + p2 = p4d.iloc[:, :, 5:, :] result = concat([p1, p2], axis=2) tm.assert_panel4d_equal(result, p4d) - p1 = p4d.ix[:, :, :, :2] - p2 = p4d.ix[:, :, :, 2:] + p1 = p4d.iloc[:, :, :, :2] + p2 = p4d.iloc[:, :, :, 2:] result = concat([p1, p2], axis=3) tm.assert_panel4d_equal(result, p4d) @@ -1396,15 +1396,15 @@ def test_panel4d_concat_mixed_type(self): p4d = tm.makePanel4D() # if things are a bit misbehaved - p1 = p4d.ix[:, :2, :, :2] - p2 = p4d.ix[:, :, :, 2:] + p1 = p4d.iloc[:, :2, :, :2] + p2 = p4d.iloc[:, :, :, 2:] p1['L5'] = 'baz' result = concat([p1, p2], axis=3) p2['L5'] = np.nan expected = concat([p1, p2], axis=3) - expected = expected.ix[result.labels] + expected = expected.loc[result.labels] tm.assert_panel4d_equal(result, expected) diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index ecf0592c66eff..4a2b64d080b4b 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -428,7 +428,7 @@ def test_join_inner_multiindex(self): self.assertTrue(joined.index.is_monotonic) assert_frame_equal(joined, expected) - # _assert_same_contents(expected, expected2.ix[:, expected.columns]) + # _assert_same_contents(expected, expected2.loc[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 @@ -500,7 +500,7 @@ def test_join_many_non_unique_index(self): result = result.reset_index() - assert_frame_equal(result, expected.ix[:, result.columns]) + assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', @@ -597,9 +597,9 @@ def _check_diff_index(df_list, result, exp_index): def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df['key'] = ['foo', 'bar'] * 4 - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 'D']] - df3 = df.ix[:, ['key']] + df1 = df.loc[:, ['A', 'B']] + df2 = df.loc[:, ['C', 'D']] + df3 = df.loc[:, ['key']] result = df1.join([df2, df3]) assert_frame_equal(result, df) @@ -637,8 +637,8 @@ def test_panel_join(self): panel = tm.makePanel() tm.add_nans(panel) - p1 = panel.ix[:2, :10, :3] - p2 = panel.ix[2:, 5:, 2:] + p1 = panel.iloc[:2, :10, :3] + p2 = panel.iloc[2:, 5:, 2:] # left join result = p1.join(p2) @@ -656,7 +656,7 @@ def test_panel_join(self): # inner join result = p1.join(p2, how='inner') - expected = panel.ix[:, 5:10, 2:3] + expected = panel.iloc[:, 5:10, 2:3] tm.assert_panel_equal(result, expected) # outer join @@ -671,16 +671,16 @@ def test_panel_join_overlap(self): panel = tm.makePanel() tm.add_nans(panel) - p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] - p2 = panel.ix[['ItemB', 'ItemC']] + p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.loc[['ItemB', 'ItemC']] # Expected index is # # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') - p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') - p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') - no_overlap = panel.ix[['ItemA']] + p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap = panel.loc[['ItemA']] expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) @@ -689,12 +689,14 @@ def test_panel_join_many(self): panel = tm.makePanel() tm.K = 4 - panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]] joined = panels[0].join(panels[1:]) tm.assert_panel_equal(joined, panel) - panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], 
panel.ix[6:, 5:-7]] + panels = [panel.iloc[:2, :-5], + panel.iloc[2:6, 2:], + panel.iloc[6:, 5:-7]] data_dict = {} for p in panels: @@ -757,13 +759,13 @@ def _restrict_to_columns(group, columns, suffix): if c in columns or c.replace(suffix, '') in columns] # filter - group = group.ix[:, found] + group = group.loc[:, found] # get rid of suffixes, if any group = group.rename(columns=lambda x: x.replace(suffix, '')) # put in the right order... - group = group.ix[:, columns] + group = group.loc[:, columns] return group diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index f078959608f91..e08074649f7e8 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -72,13 +72,13 @@ def test_merge_index_singlekey_right_vs_left(self): right_index=True, how='left', sort=False) merged2 = merge(right, left, right_on='key', left_index=True, how='right', sort=False) - assert_frame_equal(merged1, merged2.ix[:, merged1.columns]) + assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) merged1 = merge(left, right, left_on='key', right_index=True, how='left', sort=True) merged2 = merge(right, left, right_on='key', left_index=True, how='right', sort=True) - assert_frame_equal(merged1, merged2.ix[:, merged1.columns]) + assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) def test_merge_index_singlekey_inner(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], @@ -89,13 +89,13 @@ def test_merge_index_singlekey_inner(self): # inner join result = merge(left, right, left_on='key', right_index=True, how='inner') - expected = left.join(right, on='key').ix[result.index] + expected = left.join(right, on='key').loc[result.index] assert_frame_equal(result, expected) result = merge(right, left, right_on='key', left_index=True, how='inner') - expected = left.join(right, on='key').ix[result.index] - assert_frame_equal(result, expected.ix[:, result.columns]) + expected = left.join(right, on='key').loc[result.index] + assert_frame_equal(result, expected.loc[:, result.columns]) def test_merge_misspecified(self): self.assertRaises(ValueError, merge, self.left, self.right, @@ -763,11 +763,11 @@ def test_merge_on_multikey(self): columns=self.to_join.columns)) # TODO: columns aren't in the same order yet - assert_frame_equal(joined, expected.ix[:, joined.columns]) + assert_frame_equal(joined, expected.loc[:, joined.columns]) left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True) - right = expected.ix[:, joined.columns].sort_values(['key1', 'key2'], - kind='mergesort') + right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'], + kind='mergesort') assert_frame_equal(left, right) def test_left_join_multi_index(self): @@ -840,7 +840,7 @@ def test_merge_right_vs_left(self): left_index=True, how='right', sort=sort) - merged2 = merged2.ix[:, merged1.columns] + merged2 = merged2.loc[:, merged1.columns] assert_frame_equal(merged1, merged2) def test_compress_group_combinations(self): @@ -904,7 +904,7 @@ def test_left_join_index_preserve_order(self): # do a right join for an extra test joined = merge(right, left, left_index=True, right_on=['k1', 'k2'], how='right') - tm.assert_frame_equal(joined.ix[:, expected.columns], expected) + tm.assert_frame_equal(joined.loc[:, expected.columns], expected) def test_left_join_index_multi_match_multiindex(self): left = DataFrame([ diff --git a/pandas/tools/tests/test_merge_ordered.py b/pandas/tools/tests/test_merge_ordered.py index 0511a0ca6d1cf..d163468abc88e 100644 --- 
a/pandas/tools/tests/test_merge_ordered.py +++ b/pandas/tools/tests/test_merge_ordered.py @@ -54,11 +54,11 @@ def test_multigroup(self): 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) expected['group'] = ['a'] * 6 + ['b'] * 6 - assert_frame_equal(result, expected.ix[:, result.columns]) + assert_frame_equal(result, expected.loc[:, result.columns]) result2 = merge_ordered(self.right, left, on='key', right_by='group', fill_method='ffill') - assert_frame_equal(result, result2.ix[:, result.columns]) + assert_frame_equal(result, result2.loc[:, result.columns]) result = merge_ordered(left, self.right, on='key', left_by='group') self.assertTrue(result['group'].notnull().all()) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index b88c6167f6670..9cc520a7adb05 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -301,14 +301,15 @@ def test_margins(self): def _check_output(result, values_col, index=['A', 'B'], columns=['C'], margins_col='All'): - col_margins = result.ix[:-1, margins_col] + col_margins = result.loc[result.index[:-1], margins_col] expected_col_margins = self.data.groupby(index)[values_col].mean() tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) self.assertEqual(col_margins.name, margins_col) result = result.sort_index() - index_margins = result.ix[(margins_col, '')].iloc[:-1] + index_margins = result.loc[(margins_col, '')].iloc[:-1] + expected_ix_margins = self.data.groupby(columns)[values_col].mean() tm.assert_series_equal(index_margins, expected_ix_margins, check_names=False) @@ -987,7 +988,7 @@ def test_crosstab_margins(self): tm.assert_series_equal(all_cols, exp_cols) - all_rows = result.ix['All'] + all_rows = result.loc['All'] exp_rows = df.groupby(['b', 'c']).size().astype('i8') exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')])) exp_rows.name = 'All' diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index ad4f669fceb42..a707cc3eb74ce 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -2187,7 +2187,7 @@ def test_slice_with_negative_step(self): def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - tm.assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) @@ -2213,7 +2213,7 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: ts.loc[::0]) self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.ix[::0]) + lambda: ts.loc[::0]) def test_contains(self): rng = period_range('2007-01', freq='M', periods=10) @@ -2399,13 +2399,13 @@ def test_as_frame_columns(self): df = DataFrame(randn(10, 5), columns=rng) ts = df[rng[0]] - tm.assert_series_equal(ts, df.ix[:, 0]) + tm.assert_series_equal(ts, df.iloc[:, 0]) # GH # 1211 repr(df) ts = df['1/1/2000'] - tm.assert_series_equal(ts, df.ix[:, 0]) + tm.assert_series_equal(ts, df.iloc[:, 0]) def test_indexing(self): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 26c311b4a72f8..424226c0bfb7e 100755 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -110,7 +110,7 @@ def test_api_changes_v018(self): r = self.series.resample('H') 
self.assertRaises(ValueError, lambda: r.iloc[0]) self.assertRaises(ValueError, lambda: r.iat[0]) - self.assertRaises(ValueError, lambda: r.ix[0]) + self.assertRaises(ValueError, lambda: r.loc[0]) self.assertRaises(ValueError, lambda: r.loc[ Timestamp('2013-01-01 00:00:00', offset='H')]) self.assertRaises(ValueError, lambda: r.at[ @@ -1410,13 +1410,13 @@ def _ohlc(group): resampled = ts.resample('5min', closed='right', label='right').ohlc() - self.assertTrue((resampled.ix['1/1/2000 00:00'] == ts[0]).all()) + self.assertTrue((resampled.loc['1/1/2000 00:00'] == ts[0]).all()) exp = _ohlc(ts[1:31]) - self.assertTrue((resampled.ix['1/1/2000 00:05'] == exp).all()) + self.assertTrue((resampled.loc['1/1/2000 00:05'] == exp).all()) exp = _ohlc(ts['1/1/2000 5:55:01':]) - self.assertTrue((resampled.ix['1/1/2000 6:00:00'] == exp).all()) + self.assertTrue((resampled.loc['1/1/2000 6:00:00'] == exp).all()) def test_downsample_non_unique(self): rng = date_range('1/1/2000', '2/29/2000') @@ -2495,7 +2495,7 @@ def test_resample_irregular_sparse(self): subset = s[:'2012-01-04 06:55'] result = subset.resample('10min').apply(len) - expected = s.resample('10min').apply(len).ix[result.index] + expected = s.resample('10min').apply(len).loc[result.index] assert_series_equal(result, expected) def test_resample_weekly_all_na(self): diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 48ef9a4db7e28..e849dc0412c35 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1914,7 +1914,7 @@ def test_slice_with_negative_step(self): def assert_slices_equivalent(l_slc, i_slc): assert_series_equal(ts[l_slc], ts.iloc[i_slc]) assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1]) assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1]) @@ -1939,7 +1939,7 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: ts.loc[::0]) self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.ix[::0]) + lambda: ts.loc[::0]) def test_tdi_ops_attributes(self): rng = timedelta_range('2 days', periods=5, freq='2D', name='x') diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 58a4457777ea0..704aebd815a29 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -173,8 +173,8 @@ def test_indexing_over_size_cutoff(self): self.assertIn(timestamp, df.index) # it works! 
- df.ix[timestamp] - self.assertTrue(len(df.ix[[timestamp]]) > 0) + df.loc[timestamp] + self.assertTrue(len(df.loc[[timestamp]]) > 0) finally: _index._SIZE_CUTOFF = old_cutoff @@ -1373,7 +1373,7 @@ def test_at_time(self): df = DataFrame(np.random.randn(len(rng), 3), index=rng) result = ts[time(9, 30)] - result_df = df.ix[time(9, 30)] + result_df = df.loc[time(9, 30)] expected = ts[(rng.hour == 9) & (rng.minute == 30)] exp_df = df[(rng.hour == 9) & (rng.minute == 30)] @@ -1382,8 +1382,8 @@ def test_at_time(self): assert_series_equal(result, expected) tm.assert_frame_equal(result_df, exp_df) - chunk = df.ix['1/4/2000':] - result = chunk.ix[time(9, 30)] + chunk = df.loc['1/4/2000':] + result = chunk.loc[time(9, 30)] expected = result_df[-1:] tm.assert_frame_equal(result, expected) @@ -1412,8 +1412,8 @@ def test_at_time_frame(self): expected = ts.at_time(time(9, 30)) assert_frame_equal(result, expected) - result = ts.ix[time(9, 30)] - expected = ts.ix[(rng.hour == 9) & (rng.minute == 30)] + result = ts.loc[time(9, 30)] + expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)] assert_frame_equal(result, expected) @@ -3192,7 +3192,7 @@ def test_string_index_series_name_converted(self): df = DataFrame(np.random.randn(10, 4), index=date_range('1/1/2000', periods=10)) - result = df.ix['1/3/2000'] + result = df.loc['1/3/2000'] self.assertEqual(result.name, df.index[2]) result = df.T['1/3/2000'] @@ -3916,7 +3916,7 @@ def test_slice_with_negative_step(self): def assert_slices_equivalent(l_slc, i_slc): assert_series_equal(ts[l_slc], ts.iloc[i_slc]) assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) @@ -3943,7 +3943,7 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: ts.loc[::0]) self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.ix[::0]) + lambda: ts.loc[::0]) def test_slice_bounds_empty(self): # GH 14354 @@ -4313,7 +4313,7 @@ def test_slice_locs_indexerror(self): times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)] s = Series(lrange(100000), times) - s.ix[datetime(1900, 1, 1):datetime(2100, 1, 1)] + s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] def test_slicing_datetimes(self): @@ -4323,17 +4323,17 @@ def test_slicing_datetimes(self): df = DataFrame(np.arange(4., dtype='float64'), index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]]) - result = df.ix[datetime(2001, 1, 1, 10):] + result = df.loc[datetime(2001, 1, 1, 10):] assert_frame_equal(result, df) - result = df.ix[:datetime(2001, 1, 4, 10)] + result = df.loc[:datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.ix[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.ix[datetime(2001, 1, 1, 11):] + result = df.loc[datetime(2001, 1, 1, 11):] expected = df.iloc[1:] assert_frame_equal(result, expected) - result = df.ix['20010101 11':] + result = df.loc['20010101 11':] assert_frame_equal(result, expected) # duplicates @@ -4341,17 +4341,17 @@ def test_slicing_datetimes(self): index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]]) - result = df.ix[datetime(2001, 1, 1, 10):] + result = df.loc[datetime(2001, 1, 1, 10):] assert_frame_equal(result, df) - 
result = df.ix[:datetime(2001, 1, 4, 10)] + result = df.loc[:datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.ix[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.ix[datetime(2001, 1, 1, 11):] + result = df.loc[datetime(2001, 1, 1, 11):] expected = df.iloc[1:] assert_frame_equal(result, expected) - result = df.ix['20010101 11':] + result = df.loc['20010101 11':] assert_frame_equal(result, expected) @@ -4890,7 +4890,7 @@ def test_slice_year(self): assert_series_equal(result, expected) df = DataFrame(np.random.rand(len(dti), 5), index=dti) - result = df.ix['2005'] + result = df.loc['2005'] expected = df[df.index.year == 2005] assert_frame_equal(result, expected) @@ -4907,7 +4907,7 @@ def test_slice_quarter(self): self.assertEqual(len(s['2001Q1']), 90) df = DataFrame(np.random.rand(len(dti), 5), index=dti) - self.assertEqual(len(df.ix['1Q01']), 90) + self.assertEqual(len(df.loc['1Q01']), 90) def test_slice_month(self): dti = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) @@ -4915,7 +4915,7 @@ def test_slice_month(self): self.assertEqual(len(s['2005-11']), 30) df = DataFrame(np.random.rand(len(dti), 5), index=dti) - self.assertEqual(len(df.ix['2005-11']), 30) + self.assertEqual(len(df.loc['2005-11']), 30) assert_series_equal(s['2005-11'], s['11-2005']) @@ -4945,7 +4945,7 @@ def test_partial_slice_daily(self): s = Series(np.arange(len(rng)), index=rng) result = s['2005-1-31'] - assert_series_equal(result, s.ix[:24]) + assert_series_equal(result, s.iloc[:24]) self.assertRaises(Exception, s.__getitem__, '2004-12-31 00') @@ -4955,12 +4955,12 @@ def test_partial_slice_hourly(self): s = Series(np.arange(len(rng)), index=rng) result = s['2005-1-1'] - assert_series_equal(result, s.ix[:60 * 4]) + assert_series_equal(result, s.iloc[:60 * 4]) result = s['2005-1-1 20'] - assert_series_equal(result, s.ix[:60]) + assert_series_equal(result, s.iloc[:60]) - self.assertEqual(s['2005-1-1 20:00'], s.ix[0]) + self.assertEqual(s['2005-1-1 20:00'], s.iloc[0]) self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:15') def test_partial_slice_minutely(self): @@ -4969,12 +4969,12 @@ def test_partial_slice_minutely(self): s = Series(np.arange(len(rng)), index=rng) result = s['2005-1-1 23:59'] - assert_series_equal(result, s.ix[:60]) + assert_series_equal(result, s.iloc[:60]) result = s['2005-1-1'] - assert_series_equal(result, s.ix[:60]) + assert_series_equal(result, s.iloc[:60]) - self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.ix[0]) + self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.iloc[0]) self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') def test_partial_slice_second_precision(self): @@ -5104,8 +5104,8 @@ def f(): assert_series_equal(result, expected) df2 = pd.DataFrame(s) - expected = df2.ix['2000-1-4'] - result = df2.ix[pd.Timestamp('2000-1-4')] + expected = df2.xs('2000-1-4') + result = df2.loc[pd.Timestamp('2000-1-4')] assert_frame_equal(result, expected) def test_date_range_normalize(self): @@ -5283,10 +5283,10 @@ def test_partial_slice_doesnt_require_monotonicity(self): r"Timestamp\('2014-01-10 00:00:00'\)", lambda: nonmonotonic[timestamp:]) - assert_series_equal(nonmonotonic.ix['2014-01-10':], expected) + assert_series_equal(nonmonotonic.loc['2014-01-10':], expected) self.assertRaisesRegexp(KeyError, r"Timestamp\('2014-01-10 00:00:00'\)", - lambda: nonmonotonic.ix[timestamp:]) + lambda: nonmonotonic.loc[timestamp:]) class 
TimeConversionFormats(tm.TestCase): From d8d866fe6c117b9e57b5a3a1038063efc3c721af Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 18 Jan 2017 11:13:31 -0500 Subject: [PATCH 034/338] BUG: make sure that we are passing thru kwargs to groupby BUG: allow timedelta64 to work in groupby with numeric_only=False closes #5724 Author: Jeff Reback Closes #15054 from jreback/groupby_arg and squashes the following commits: 768fce1 [Jeff Reback] BUG: make sure that we are passing thru kwargs to groupby BUG: allow timedelta64 to work in groupby with numeric_only=False --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/compat/numpy/function.py | 10 +- pandas/core/groupby.py | 140 +++++++++++++++++++++------ pandas/tests/groupby/test_groupby.py | 123 +++++++++++++++++++++++ 4 files changed, 240 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f7c05a3f373f5..e67d617fcc099 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -400,6 +400,7 @@ Bug Fixes +- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) - Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index adc17c7514832..895a376457f09 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -306,12 +306,18 @@ def validate_expanding_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_groupby_func(name, args, kwargs): +def validate_groupby_func(name, args, kwargs, allowed=None): """ - 'args' and 'kwargs' should be empty because all of + 'args' and 'kwargs' should be empty, except for allowed + kwargs because all of their necessary parameters are explicitly listed in the function signature """ + if allowed is None: + allowed = [] + + kwargs = set(kwargs) - set(allowed) + if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall(( "numpy operations are not valid " diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 31df22cc11b1d..fc9cd894162e4 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -19,6 +19,7 @@ is_categorical_dtype, is_datetimelike, is_datetime_or_timedelta_dtype, + is_datetime64_any_dtype, is_bool, is_integer_dtype, is_complex_dtype, is_bool_dtype, @@ -109,10 +110,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True, @Substitution(name='groupby', f=name) @Appender(_doc_template) @Appender(_local_template) - def f(self): + def f(self, **kwargs): + if 'numeric_only' not in kwargs: + kwargs['numeric_only'] = numeric_only self._set_group_selection() try: - return self._cython_agg_general(alias, numeric_only=numeric_only) + return self._cython_agg_general(alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: @@ -127,7 +130,9 @@ def f(self): def _first_compat(x, axis=0): + def _first(x): + x = np.asarray(x) x = x[notnull(x)] if len(x) == 0: @@ -142,6 +147,7 @@ def _first(x): def _last_compat(x, axis=0): def _last(x): + x = np.asarray(x) x = x[notnull(x)] if len(x) == 0: @@ -775,7 +781,7 @@ def _try_cast(self, result, obj): return result def _cython_transform(self, how, numeric_only=True): - output = {} + output = collections.OrderedDict() for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -783,6 +789,8 @@ def _cython_transform(self, how, 
numeric_only=True): try: result, names = self.grouper.transform(obj.values, how) + except NotImplementedError: + continue except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -792,7 +800,7 @@ def _cython_transform(self, how, numeric_only=True): return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -1015,26 +1023,26 @@ def mean(self, *args, **kwargs): For multiple groupings, the result index will be a MultiIndex """ - nv.validate_groupby_func('mean', args, kwargs) + nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) try: - return self._cython_agg_general('mean') + return self._cython_agg_general('mean', **kwargs) except GroupByError: raise except Exception: # pragma: no cover self._set_group_selection() - f = lambda x: x.mean(axis=self.axis) + f = lambda x: x.mean(axis=self.axis, **kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @Appender(_doc_template) - def median(self): + def median(self, **kwargs): """ Compute median of groups, excluding missing values For multiple groupings, the result index will be a MultiIndex """ try: - return self._cython_agg_general('median') + return self._cython_agg_general('median', **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1044,7 +1052,7 @@ def median(self): def f(x): if isinstance(x, np.ndarray): x = Series(x) - return x.median(axis=self.axis) + return x.median(axis=self.axis, **kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @@ -1063,7 +1071,7 @@ def std(self, ddof=1, *args, **kwargs): # TODO: implement at Cython level? 
nv.validate_groupby_func('std', args, kwargs) - return np.sqrt(self.var(ddof=ddof)) + return np.sqrt(self.var(ddof=ddof, **kwargs)) @Substitution(name='groupby') @Appender(_doc_template) @@ -1080,10 +1088,10 @@ def var(self, ddof=1, *args, **kwargs): """ nv.validate_groupby_func('var', args, kwargs) if ddof == 1: - return self._cython_agg_general('var') + return self._cython_agg_general('var', **kwargs) else: self._set_group_selection() - f = lambda x: x.var(ddof=ddof) + f = lambda x: x.var(ddof=ddof, **kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @@ -1400,39 +1408,39 @@ def cumcount(self, ascending=True): @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" - nv.validate_groupby_func('cumprod', args, kwargs) + nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only']) if axis != 0: - return self.apply(lambda x: x.cumprod(axis=axis)) + return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) - return self._cython_transform('cumprod') + return self._cython_transform('cumprod', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" - nv.validate_groupby_func('cumsum', args, kwargs) + nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only']) if axis != 0: - return self.apply(lambda x: x.cumsum(axis=axis)) + return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) - return self._cython_transform('cumsum') + return self._cython_transform('cumsum', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) - def cummin(self, axis=0): + def cummin(self, axis=0, **kwargs): """Cumulative min for each group""" if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform('cummin') + return self._cython_transform('cummin', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) - def cummax(self, axis=0): + def cummax(self, axis=0, **kwargs): """Cumulative max for each group""" if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform('cummax') + return self._cython_transform('cummax', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) @@ -1828,6 +1836,28 @@ def wrapper(*args, **kwargs): def _cython_operation(self, kind, values, how, axis): assert kind in ['transform', 'aggregate'] + # can we do this operation with our cython functions + # if not raise NotImplementedError + + # we raise NotImplemented if this is an invalid operation + # entirely, e.g. 
adding datetimes
+
+        # categoricals are only 1d, so we
+        # are not set up for dim transforming
+        if is_categorical_dtype(values):
+            raise NotImplementedError(
+                "categoricals are not supported in cython ops ATM")
+        elif is_datetime64_any_dtype(values):
+            if how in ['add', 'prod', 'cumsum', 'cumprod']:
+                raise NotImplementedError(
+                    "datetime64 type does not support {} "
+                    "operations".format(how))
+        elif is_timedelta64_dtype(values):
+            if how in ['prod', 'cumprod']:
+                raise NotImplementedError(
+                    "timedelta64 type does not support {} "
+                    "operations".format(how))
+
         arity = self._cython_arity.get(how, 1)

         vdim = values.ndim
@@ -3155,9 +3185,9 @@ def _iterate_slices(self):
                 continue
             yield val, slicer(val)

-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         new_items, new_blocks = self._cython_agg_blocks(
-            how, numeric_only=numeric_only)
+            how, alt=alt, numeric_only=numeric_only)
         return self._wrap_agged_blocks(new_items, new_blocks)

     def _wrap_agged_blocks(self, items, blocks):
@@ -3183,29 +3213,75 @@ def _wrap_agged_blocks(self, items, blocks):

     _block_agg_axis = 0

-    def _cython_agg_blocks(self, how, numeric_only=True):
-        data, agg_axis = self._get_data_to_aggregate()
+    def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
+        # TODO: the actual managing of mgr_locs is a PITA
+        # here, it should happen via BlockManager.combine

-        new_blocks = []
+        data, agg_axis = self._get_data_to_aggregate()

         if numeric_only:
             data = data.get_numeric_data(copy=False)

+        new_blocks = []
+        new_items = []
+        deleted_items = []
         for block in data.blocks:
-            result, _ = self.grouper.aggregate(
-                block.values, how, axis=agg_axis)
+            locs = block.mgr_locs.as_array
+            try:
+                result, _ = self.grouper.aggregate(
+                    block.values, how, axis=agg_axis)
+            except NotImplementedError:
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+
+                if alt is None:
+                    # we cannot perform the operation
+                    # in an alternate way, exclude the block
+                    deleted_items.append(locs)
+                    continue
+
+                # call our grouper again with only this block
+                obj = self.obj[data.items[locs]]
+                s = groupby(obj, self.grouper)
+                result = s.aggregate(lambda x: alt(x, axis=self.axis))
+                result = result._data.blocks[0]

             # see if we can cast the block back to the original dtype
             result = block._try_coerce_and_cast_result(result)

-            newb = make_block(result, placement=block.mgr_locs)
+            new_items.append(locs)
+            newb = block.make_block_same_class(result)
             new_blocks.append(newb)

         if len(new_blocks) == 0:
             raise DataError('No numeric types to aggregate')

-        return data.items, new_blocks
+        # reset the locs in the blocks to correspond to our
+        # current ordering
+        indexer = np.concatenate(new_items)
+        new_items = data.items.take(np.sort(indexer))
+
+        if len(deleted_items):
+
+            # we need to adjust the indexer to account for the
+            # items we have removed
+            # really should be done in internals :<
+
+            deleted = np.concatenate(deleted_items)
+            ai = np.arange(len(data))
+            mask = np.zeros(len(data))
+            mask[deleted] = 1
+            indexer = (ai - mask.cumsum())[indexer]
+
+        offset = 0
+        for b in new_blocks:
+            l = len(b.mgr_locs)
+            b.mgr_locs = indexer[offset:(offset + l)]
+            offset += l
+
+        return new_items, new_blocks

     def _get_data_to_aggregate(self):
         obj = self._obj_with_exclusions
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 952ec3ca01e16..bbfd934946af7 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ 
b/pandas/tests/groupby/test_groupby.py @@ -2273,9 +2273,131 @@ def test_max_min_non_numeric(self): result = aa.groupby('nn').max() self.assertTrue('ss' in result) + result = aa.groupby('nn').max(numeric_only=False) + self.assertTrue('ss' in result) + result = aa.groupby('nn').min() self.assertTrue('ss' in result) + result = aa.groupby('nn').min(numeric_only=False) + self.assertTrue('ss' in result) + + def test_arg_passthru(self): + # make sure that we are passing thru kwargs + # to our agg functions + + # GH3668 + # GH5724 + df = pd.DataFrame( + {'group': [1, 1, 2], + 'int': [1, 2, 3], + 'float': [4., 5., 6.], + 'string': list('abc'), + 'category_string': pd.Series(list('abc')).astype('category'), + 'category_int': [7, 8, 9], + 'datetime': pd.date_range('20130101', periods=3), + 'datetimetz': pd.date_range('20130101', + periods=3, + tz='US/Eastern'), + 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, + columns=['group', 'int', 'float', 'string', + 'category_string', 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + + expected_columns_numeric = Index(['int', 'float', 'category_int']) + + # mean / median + expected = pd.DataFrame( + {'category_int': [7.5, 9], + 'float': [4.5, 6.], + 'timedelta': [pd.Timedelta('1.5s'), + pd.Timedelta('3s')], + 'int': [1.5, 3], + 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), + pd.Timestamp('2013-01-03 00:00:00')], + 'datetimetz': [ + pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), + pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, + index=Index([1, 2], name='group'), + columns=['int', 'float', 'category_int', + 'datetime', 'datetimetz', 'timedelta']) + for attr in ['mean', 'median']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + assert_frame_equal(result.reindex_like(expected), expected) + + # TODO: min, max *should* handle + # categorical (ordered) dtype + expected_columns = Index(['int', 'float', 'string', + 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['min', 'max']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'string', + 'category_string', 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['first', 'last']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'string', + 'category_int', 'timedelta']) + for attr in ['sum']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'category_int']) + for attr in ['prod', 'cumprod']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + # like min, max, but don't include strings + expected_columns = Index(['int', 'float', + 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['cummin', 'cummax']: + f = 
getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'category_int', + 'timedelta']) + for attr in ['cumsum']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + def test_cython_agg_boolean(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': np.random.randint(0, 2, 50).astype('bool')}) @@ -3452,6 +3574,7 @@ def test_int64_overflow(self): tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' ]].values)) tups = com._asarray_tuplesafe(tups) + expected = df.groupby(tups).sum()['values'] for k, v in compat.iteritems(expected): From 48055e0e7c962c883386560ab2ed8fcd8750708c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 18 Jan 2017 21:01:44 -0500 Subject: [PATCH 035/338] DOC: fix doc-build for .ix deprecations (#15158) --- doc/source/indexing.rst | 2 +- doc/source/whatsnew/v0.10.1.txt | 7 +++-- doc/source/whatsnew/v0.11.0.txt | 2 +- doc/source/whatsnew/v0.13.0.txt | 2 -- doc/source/whatsnew/v0.13.1.txt | 4 +-- doc/source/whatsnew/v0.14.0.txt | 2 +- doc/source/whatsnew/v0.15.0.txt | 2 +- doc/source/whatsnew/v0.15.2.txt | 2 +- doc/source/whatsnew/v0.16.0.txt | 9 +++++-- doc/source/whatsnew/v0.18.0.txt | 20 +++++++------- doc/source/whatsnew/v0.7.0.txt | 48 ++++++++++++++++++++++++++------- doc/source/whatsnew/v0.9.1.txt | 2 +- 12 files changed, 66 insertions(+), 36 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 9c653f6d9eeff..e2916c2d969d6 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -586,7 +586,7 @@ Using ``.loc``. Here we will select the appropriate indexes from the index, then .. ipython:: python - dfd.loc[df.index[[0, 2]], 'A'] + dfd.loc[dfd.index[[0, 2]], 'A'] This can also be expressed using ``.iloc``, by explicitly getting locations on the indexers, and using *positional* indexing to select things. diff --git a/doc/source/whatsnew/v0.10.1.txt b/doc/source/whatsnew/v0.10.1.txt index b61d31932c60d..edc628fe85027 100644 --- a/doc/source/whatsnew/v0.10.1.txt +++ b/doc/source/whatsnew/v0.10.1.txt @@ -51,8 +51,8 @@ perform queries on a table, by passing a list to ``data_columns`` df = DataFrame(randn(8, 3), index=date_range('1/1/2000', periods=8), columns=['A', 'B', 'C']) df['string'] = 'foo' - df.ix[4:6,'string'] = np.nan - df.ix[7:9,'string'] = 'bar' + df.loc[df.index[4:6], 'string'] = np.nan + df.loc[df.index[7:9], 'string'] = 'bar' df['string2'] = 'cool' df @@ -78,7 +78,7 @@ You can now store ``datetime64`` in data columns df_mixed = df.copy() df_mixed['datetime64'] = Timestamp('20010102') - df_mixed.ix[3:4,['A','B']] = np.nan + df_mixed.loc[df_mixed.index[3:4], ['A','B']] = np.nan store.append('df_mixed', df_mixed) df_mixed1 = store.select('df_mixed') @@ -208,4 +208,3 @@ combined result, by using ``where`` on a selector table. See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. 
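[Editor's note, not part of the patch series: the ``.ix`` -> ``.loc`` conversions applied throughout the docs above follow a single pattern. A minimal sketch of that pattern, with a hypothetical frame (the data and labels here are illustrative only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(8, 3),
                      index=pd.date_range('2000-01-01', periods=8),
                      columns=['A', 'B', 'C'])

    # deprecated spelling: df.ix[4:6, 'A'] = np.nan
    # label-based replacement: take the labels positionally from the
    # index, then select with .loc
    df.loc[df.index[4:6], 'A'] = np.nan

    # purely positional alternative with .iloc
    df.iloc[4:6, df.columns.get_loc('A')] = np.nan

A positional slice like ``4:6`` on ``.ix`` becomes ``df.index[4:6]`` under ``.loc``, which is exactly the rewrite these hunks apply.]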
- diff --git a/doc/source/whatsnew/v0.11.0.txt b/doc/source/whatsnew/v0.11.0.txt index 50b74fc5af090..ea149595e681f 100644 --- a/doc/source/whatsnew/v0.11.0.txt +++ b/doc/source/whatsnew/v0.11.0.txt @@ -190,7 +190,7 @@ Furthermore ``datetime64[ns]`` columns are created by default, when passed datet df.get_dtype_counts() # use the traditional nan, which is mapped to NaT internally - df.ix[2:4,['A','timestamp']] = np.nan + df.loc[df.index[2:4], ['A','timestamp']] = np.nan df Astype conversion on ``datetime64[ns]`` to ``object``, implicity converts ``NaT`` to ``np.nan`` diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt index 6ecd4b487c798..118632cc2c0ee 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -274,7 +274,6 @@ Float64Index API Change .. ipython:: python s[3] - s.ix[3] s.loc[3] The only positional indexing is via ``iloc`` @@ -290,7 +289,6 @@ Float64Index API Change .. ipython:: python s[2:4] - s.ix[2:4] s.loc[2:4] s.iloc[2:4] diff --git a/doc/source/whatsnew/v0.13.1.txt b/doc/source/whatsnew/v0.13.1.txt index 349acf508bbf3..d5d54ba43b622 100644 --- a/doc/source/whatsnew/v0.13.1.txt +++ b/doc/source/whatsnew/v0.13.1.txt @@ -36,7 +36,7 @@ Highlights include: .. ipython:: python df = DataFrame(dict(A = np.array(['foo','bar','bah','foo','bar']))) - df.ix[0,'A'] = np.nan + df.loc[0,'A'] = np.nan df Output Formatting Enhancements @@ -121,7 +121,7 @@ API changes .. ipython:: python :okwarning: - + df = DataFrame({'col':['foo', 0, np.nan]}) df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) df.equals(df2) diff --git a/doc/source/whatsnew/v0.14.0.txt b/doc/source/whatsnew/v0.14.0.txt index 78f96e3c0e049..f1feab4b909dc 100644 --- a/doc/source/whatsnew/v0.14.0.txt +++ b/doc/source/whatsnew/v0.14.0.txt @@ -516,7 +516,7 @@ See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :is names=['lvl0', 'lvl1']) df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), index=index, - columns=columns).sortlevel().sortlevel(axis=1) + columns=columns).sort_index().sort_index(axis=1) df Basic multi-index slicing using slices, lists, and labels. diff --git a/doc/source/whatsnew/v0.15.0.txt b/doc/source/whatsnew/v0.15.0.txt index df1171fb34486..aff8ec9092cdc 100644 --- a/doc/source/whatsnew/v0.15.0.txt +++ b/doc/source/whatsnew/v0.15.0.txt @@ -705,7 +705,7 @@ Other notable API changes: s = Series(np.arange(3,dtype='int64'), index=MultiIndex.from_product([['A'],['foo','bar','baz']], names=['one','two']) - ).sortlevel() + ).sort_index() s try: s.loc[['D']] diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 3a62ac38f7260..3bb472e9361e7 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -35,7 +35,7 @@ API changes df.loc[(1, 'z')] # lexically sorting - df2 = df.sortlevel() + df2 = df.sort_index() df2 df2.index.lexsort_depth df2.loc[(1,'z')] diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 4255f4839bca0..4d43660960597 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -300,9 +300,14 @@ The behavior of a small sub-set of edge cases for using ``.loc`` have changed (: New Behavior - .. ipython:: python + .. code-block:: python - s.ix[-1.0:2] + In [2]: s.ix[-1.0:2] + Out[2]: + -1 1 + 1 2 + 2 3 + dtype: int64 - Provide a useful exception for indexing with an invalid type for that index when using ``.loc``. 
For example trying to use ``.loc`` on an index of type ``DatetimeIndex`` or ``PeriodIndex`` or ``TimedeltaIndex``, with an integer (or a float). diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 7418cd0e6baa3..893922b719b34 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -448,7 +448,7 @@ New Behavior: columns=list('abc'), index=[[4,4,8], [8,10,12]]) df - df.ix[4, 'c'] = np.array([0., 1.]) + df.loc[4, 'c'] = np.array([0., 1.]) df .. _whatsnew_0180.enhancements.xarray: @@ -1115,7 +1115,6 @@ Other indexers will coerce to a like integer for both getting and setting. The ` s[5.0] s.loc[5.0] - s.ix[5.0] and setting @@ -1127,30 +1126,31 @@ and setting s_copy = s.copy() s_copy.loc[5.0] = 10 s_copy - s_copy = s.copy() - s_copy.ix[5.0] = 10 - s_copy Positional setting with ``.ix`` and a float indexer will ADD this value to the index, rather than previously setting the value by position. -.. ipython:: python +.. code-block:: python - s2.ix[1.0] = 10 - s2 + In [3]: s2.ix[1.0] = 10 + In [4]: s2 + Out[4]: + a 1 + b 2 + c 3 + 1.0 10 + dtype: int64 Slicing will also coerce integer-like floats to integers for a non-``Float64Index``. .. ipython:: python s.loc[5.0:6] - s.ix[5.0:6] Note that for floats that are NOT coercible to ints, the label based bounds will be excluded .. ipython:: python s.loc[5.1:6] - s.ix[5.1:6] Float indexing on a ``Float64Index`` is unchanged. diff --git a/doc/source/whatsnew/v0.7.0.txt b/doc/source/whatsnew/v0.7.0.txt index cfba2ad3d05b6..21d91950e7b78 100644 --- a/doc/source/whatsnew/v0.7.0.txt +++ b/doc/source/whatsnew/v0.7.0.txt @@ -169,16 +169,30 @@ API tweaks regarding label-based slicing Label-based slicing using ``ix`` now requires that the index be sorted (monotonic) **unless** both the start and endpoint are contained in the index: -.. ipython:: python +.. code-block:: python - s = Series(randn(6), index=list('gmkaec')) - s + In [1]: s = Series(randn(6), index=list('gmkaec')) + + In [2]: s + Out[2]: + g -1.182230 + m -0.276183 + k -0.243550 + a 1.628992 + e 0.073308 + c -0.539890 + dtype: float64 Then this is OK: -.. ipython:: python +.. code-block:: python - s.ix['k':'e'] + In [3]: s.ix['k':'e'] + Out[3]: + k -0.243550 + a 1.628992 + e 0.073308 + dtype: float64 But this is not: @@ -189,11 +203,26 @@ But this is not: If the index had been sorted, the "range selection" would have been possible: -.. ipython:: python +.. 
code-block:: python + + In [4]: s2 = s.sort_index() - s2 = s.sort_index() - s2 - s2.ix['b':'h'] + In [5]: s2 + Out[5]: + a 1.628992 + c -0.539890 + e 0.073308 + g -1.182230 + k -0.243550 + m -0.276183 + dtype: float64 + + In [6]: s2.ix['b':'h'] + Out[6]: + c -0.539890 + e 0.073308 + g -1.182230 + dtype: float64 Changes to Series ``[]`` operator ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -269,4 +298,3 @@ Performance improvements ``level`` parameter passed (:issue:`545`) - Ported skiplist data structure to C to speed up ``rolling_median`` by about 5-10x in most typical use cases (:issue:`374`) - diff --git a/doc/source/whatsnew/v0.9.1.txt b/doc/source/whatsnew/v0.9.1.txt index 51788f77a6f0f..9dd29a5fe7bf7 100644 --- a/doc/source/whatsnew/v0.9.1.txt +++ b/doc/source/whatsnew/v0.9.1.txt @@ -36,7 +36,7 @@ New features df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) - df.ix[2:4] = np.nan + df.loc[2:4] = np.nan df.rank() From c6b6ea7358e3605bef4313ac6f92e862ec326996 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 18 Jan 2017 21:22:14 -0500 Subject: [PATCH 036/338] CI: add .pxi.in to build caching (#15159) closes #15154 --- ci/prep_cython_cache.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh index cadc356b641f9..e091bb00ccedc 100755 --- a/ci/prep_cython_cache.sh +++ b/ci/prep_cython_cache.sh @@ -3,8 +3,8 @@ ls "$HOME/.cache/" PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd"` -pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx" -o -name "*.pxd"` +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` +pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` CACHE_File="$HOME/.cache/cython_files.tar" From 4b5ec17c149f5d31b39cd5f24799220f62e94aac Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Thu, 19 Jan 2017 08:53:17 -0500 Subject: [PATCH 037/338] BUG: unicode characters when reading JSON lines Fixes UnicodeDecodeError bug when reading JSON lines input with Ascii decoder, which is often the default setting in Python 2.7. Avoids issues with mixing unicode and ascii strings. 
closes #15132 Author: Rouz Azari Closes #15149 from rouzazari/GH_15132_json_lines_with_unicode_chars_py2 and squashes the following commits: e117889 [Rouz Azari] BUG: unicode characters when reading JSON lines --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/json.py | 2 +- pandas/io/tests/json/test_pandas.py | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e67d617fcc099..e4dacfaf40417 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -436,3 +436,4 @@ Bug Fixes - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) +- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) diff --git a/pandas/io/json.py b/pandas/io/json.py index 0a6b8af179e12..74a1ac43f1dd5 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -274,7 +274,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. lines = list(StringIO(json.strip())) - json = u'[' + u','.join(lines) + u']' + json = '[' + ','.join(lines) + ']' obj = None if typ == 'frame': diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 48e92feeb3efe..345d181a0e53a 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # pylint: disable-msg=W0612,E1101 import nose from pandas.compat import range, lrange, StringIO, OrderedDict @@ -960,6 +961,25 @@ def test_read_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) + def test_read_jsonl_unicode_chars(self): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + def test_to_jsonl(self): # GH9180 df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) From 72518896edd3eb65f4387afede65cd660f4bef23 Mon Sep 17 00:00:00 2001 From: bastewart Date: Thu, 19 Jan 2017 13:57:18 +0000 Subject: [PATCH 038/338] DOC: Clarified wording of coerce_float (#15164) --- pandas/core/frame.py | 2 +- pandas/io/sql.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 504554d6410f9..ef8b7a0d16bdc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -962,7 +962,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, in the result (any names not found in the data will become all-NA columns) coerce_float : boolean, default False - Attempt to convert values to non-string, non-numeric objects (like + Attempt to convert values of non-string, non-numeric objects (like 
decimal.Decimal) to floating point, useful for SQL result sets Returns diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c9f8d32e1b504..9fa01c413aca8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -214,7 +214,7 @@ def read_sql_table(table_name, con, schema=None, index_col=None, index_col : string or list of strings, optional, default: None Column(s) to set as index(MultiIndex) coerce_float : boolean, default True - Attempt to convert values to non-string, non-numeric objects (like + Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point. Can result in loss of Precision. parse_dates : list or dict, default: None - List of column names to parse as dates @@ -289,7 +289,7 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, index_col : string or list of strings, optional, default: None Column(s) to set as index(MultiIndex) coerce_float : boolean, default True - Attempt to convert values to non-string, non-numeric objects (like + Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets params : list, tuple or dict, optional, default: None List of parameters to pass to execute method. The syntax used @@ -348,7 +348,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, index_col : string or list of strings, optional, default: None Column(s) to set as index(MultiIndex) coerce_float : boolean, default True - Attempt to convert values to non-string, non-numeric objects (like + Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets params : list, tuple or dict, optional, default: None List of parameters to pass to execute method. The syntax used @@ -986,7 +986,7 @@ def read_table(self, table_name, index_col=None, coerce_float=True, index_col : string, optional, default: None Column to set as index coerce_float : boolean, default True - Attempt to convert values to non-string, non-numeric objects + Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point. This can result in loss of precision. parse_dates : list or dict, default: None @@ -1048,7 +1048,7 @@ def read_query(self, sql, index_col=None, coerce_float=True, index_col : string, optional, default: None Column name to use as index for the returned DataFrame object. coerce_float : boolean, default True - Attempt to convert values to non-string, non-numeric objects (like + Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets params : list, tuple or dict, optional, default: None List of parameters to pass to execute method. The syntax used From 7045bbf939a6306384f364e87bb313ffe6956cb0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 09:31:29 -0500 Subject: [PATCH 039/338] DOC: fix links in whatsnew --- doc/source/whatsnew/v0.20.0.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e4dacfaf40417..c5510d4b8151c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -10,7 +10,7 @@ users upgrade to this version. 
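[Editor's note, illustration only: a small, hypothetical sketch of the ``coerce_float`` behavior whose wording the patch above clarifies:

    from decimal import Decimal

    import pandas as pd

    rows = [(Decimal('1.5'), 'a'), (Decimal('2.5'), 'b')]

    # values of non-string, non-numeric objects (here decimal.Decimal)
    # are converted to floating point when coerce_float=True
    df = pd.DataFrame.from_records(rows, columns=['x', 'y'], coerce_float=True)
    print(df.dtypes)  # x becomes float64, y stays object

The same keyword (default ``True``) appears on the ``read_sql*`` functions touched above.]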
Highlights include: - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) -- The ``.ix`` indexer has been deprecated, see :ref:`here ` +- The ``.ix`` indexer has been deprecated, see :ref:`here ` Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -144,8 +144,7 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew.api_breaking.deprecate_ix +.. _whatsnew_0200.api_breaking.deprecate_ix Deprecate .ix ^^^^^^^^^^^^^ @@ -158,7 +157,7 @@ The recommended methods of indexing are: - ``.loc`` if you want to *label* index - ``.iloc`` if you want to *positionally* index. -Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code `here `_ +Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `. .. ipython:: python @@ -192,7 +191,7 @@ Using ``.iloc``. Here we will get the location of the 'A' column, then use *posi df.iloc[[0, 2], df.columns.get_loc('A')] -.. _whatsnew.api_breaking.index_map +.. _whatsnew_0200.api_breaking.index_map Map on Index types now return other Index types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 59701d582eedcad60bcb94618bcdca0fd542b841 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 11:21:41 -0500 Subject: [PATCH 040/338] CI: add .pxi.in to submit caching --- ci/submit_cython_cache.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh index 5c98c3df61736..01c37d16da2fc 100755 --- a/ci/submit_cython_cache.sh +++ b/ci/submit_cython_cache.sh @@ -2,7 +2,7 @@ CACHE_File="$HOME/.cache/cython_files.tar" PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd"` +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd -o -name "*.pxi.in""` rm -rf $CACHE_File rm -rf $PYX_CACHE_DIR From a2e454bab64a4066ae11251533371da62028183f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 11:21:41 -0500 Subject: [PATCH 041/338] CLN: remove ci/ironcache, speedpack --- ci/ironcache/get.py | 41 --------- ci/ironcache/put.py | 48 ---------- ci/speedpack/Vagrantfile | 22 ----- ci/speedpack/build.sh | 117 ------------------------- ci/speedpack/nginx/nginx.conf.template | 48 ---------- 5 files changed, 276 deletions(-) delete mode 100644 ci/ironcache/get.py delete mode 100644 ci/ironcache/put.py delete mode 100644 ci/speedpack/Vagrantfile delete mode 100755 ci/speedpack/build.sh delete mode 100644 ci/speedpack/nginx/nginx.conf.template diff --git a/ci/ironcache/get.py b/ci/ironcache/get.py deleted file mode 100644 index a4663472b955c..0000000000000 --- a/ci/ironcache/get.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import re -import os -import time -import json -import base64 -from hashlib import sha1 -from iron_cache import * -import traceback as tb - -key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), - os.environ.get('JOB_NAME','unk')) -print(key) - -if sys.version_info[0] > 2: - key = bytes(key,encoding='utf8') - -key = sha1(key).hexdigest()[:8]+'.' 
- -b = b'' -cache = IronCache() -for i in range(20): - print("getting %s" % key+str(i)) - try: - item = cache.get(cache="travis", key=key+str(i)) - v = item.value - if sys.version_info[0] > 2: - v = bytes(v,encoding='utf8') - b += bytes(base64.b64decode(v)) - except Exception as e: - try: - print(tb.format_exc(e)) - except: - print("exception during exception, oh my") - break - -with open(os.path.join(os.environ.get('HOME',''),"ccache.7z"),'wb') as f: - f.write(b) diff --git a/ci/ironcache/put.py b/ci/ironcache/put.py deleted file mode 100644 index f6aef3a327e87..0000000000000 --- a/ci/ironcache/put.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import re -import os -import time -import json -import base64 -from hashlib import sha1 -from iron_cache import * - -key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), - os.environ.get('JOB_NAME','unk')) - -key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), - os.environ.get('JOB_NAME','unk')) -print(key) - -if sys.version_info[0] > 2: - key = bytes(key,encoding='utf8') - -key = sha1(key).hexdigest()[:8]+'.' - -os.chdir(os.environ.get('HOME')) - -cache = IronCache() - -i=0 - -for i, fname in enumerate(sorted([x for x in os.listdir('.') if re.match("ccache.\d+$",x)])): - print("Putting %s" % key+str(i)) - with open(fname,"rb") as f: - s= f.read() - value=base64.b64encode(s) - if isinstance(value, bytes): - value = value.decode('ascii') - item = cache.put(cache="travis", key=key+str(i), value=value,options=dict(expires_in=24*60*60)) - -# print("foo") -for i in range(i+1,20): - - try: - item = cache.delete(key+str(i),cache='travis') - print("Deleted %s" % key+str(i)) - except: - break - pass diff --git a/ci/speedpack/Vagrantfile b/ci/speedpack/Vagrantfile deleted file mode 100644 index ec939b7c0a937..0000000000000 --- a/ci/speedpack/Vagrantfile +++ /dev/null @@ -1,22 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : -Vagrant.configure("2") do |config| - config.vm.box = "precise64" - config.vm.box_url = "http://files.vagrantup.com/precise64.box" - -# config.vbguest.auto_update = true -# config.vbguest.no_remote = true - - config.vm.synced_folder File.expand_path("..", Dir.pwd), "/reqf" - config.vm.synced_folder "wheelhouse", "/wheelhouse" - - config.vm.provider :virtualbox do |vb| - vb.customize ["modifyvm", :id, "--cpus", "4"] - vb.customize ["modifyvm", :id, "--memory", "2048"] - vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] - vb.customize ["modifyvm", :id, "--natdnsproxy1", "on"] - end - - config.vm.provision :shell, :path => "build.sh" - -end diff --git a/ci/speedpack/build.sh b/ci/speedpack/build.sh deleted file mode 100755 index 330d8984ea7b7..0000000000000 --- a/ci/speedpack/build.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# This script is meant to run on a mint precise64 VM. -# The generated wheel files should be compatible -# with travis-ci as of 07/2013. -# -# Runtime can be up to an hour or more. - -echo "Building wheels..." 
- -# print a trace for everything; RTFM -set -x - -# install and update some basics -apt-get update -apt-get install python-software-properties git -y -apt-add-repository ppa:fkrull/deadsnakes -y -apt-get update - -# install some deps and virtualenv -apt-get install python-pip libfreetype6-dev libpng12-dev libhdf5-serial-dev \ - g++ libatlas-base-dev gfortran libreadline-dev zlib1g-dev flex bison \ - libxml2-dev libxslt-dev libssl-dev -y -pip install virtualenv -apt-get build-dep python-lxml -y - -# install sql servers -apt-get install postgresql-client libpq-dev -y - -export PYTHONIOENCODING='utf-8' -export VIRTUALENV_DISTRIBUTE=0 - -function create_fake_pandas() { - local site_pkg_dir="$1" - rm -rf $site_pkg_dir/pandas - mkdir $site_pkg_dir/pandas - touch $site_pkg_dir/pandas/__init__.py - echo "version = '0.10.0-phony'" > $site_pkg_dir/pandas/version.py -} - - -function get_site_pkgs_dir() { - python$1 -c 'import distutils; print(distutils.sysconfig.get_python_lib())' -} - - -function create_wheel() { - local pip_args="$1" - local wheelhouse="$2" - local n="$3" - local pyver="$4" - - local site_pkgs_dir="$(get_site_pkgs_dir $pyver)" - - - if [[ "$n" == *statsmodels* ]]; then - create_fake_pandas $site_pkgs_dir && \ - pip wheel $pip_args --wheel-dir=$wheelhouse $n && \ - pip install $pip_args --no-index $n && \ - rm -Rf $site_pkgs_dir - else - pip wheel $pip_args --wheel-dir=$wheelhouse $n - pip install $pip_args --no-index $n - fi -} - - -function generate_wheels() { - # get the requirements file - local reqfile="$1" - - # get the python version - local TAG=$(echo $reqfile | grep -Po "(\d\.?[\d\-](_\w+)?)") - - # base dir for wheel dirs - local WHEELSTREET=/wheelhouse - local WHEELHOUSE="$WHEELSTREET/$TAG" - - local PY_VER="${TAG:0:3}" - local PY_MAJOR="${PY_VER:0:1}" - local PIP_ARGS="--use-wheel --find-links=$WHEELHOUSE --download-cache /tmp" - - # install the python version if not installed - apt-get install python$PY_VER python$PY_VER-dev -y - - # create a new virtualenv - rm -Rf /tmp/venv - virtualenv -p python$PY_VER /tmp/venv - source /tmp/venv/bin/activate - - # install pip setuptools - pip install -I --download-cache /tmp 'git+https://github.com/pypa/pip@42102e9d#egg=pip' - pip install -I -U --download-cache /tmp setuptools - pip install -I --download-cache /tmp wheel - - # make the dir if it doesn't exist - mkdir -p $WHEELHOUSE - - # put the requirements file in the wheelhouse - cp $reqfile $WHEELHOUSE - - # install and build the wheels - cat $reqfile | while read N; do - create_wheel "$PIP_ARGS" "$WHEELHOUSE" "$N" "$PY_VER" - done -} - - -# generate a single wheel version -# generate_wheels "/reqf/requirements-2.7.txt" -# -# if vagrant is already up -# run as vagrant provision - -for reqfile in $(ls -1 /reqf/requirements-*.*); do - generate_wheels "$reqfile" -done diff --git a/ci/speedpack/nginx/nginx.conf.template b/ci/speedpack/nginx/nginx.conf.template deleted file mode 100644 index e2cfeaf053d08..0000000000000 --- a/ci/speedpack/nginx/nginx.conf.template +++ /dev/null @@ -1,48 +0,0 @@ -#user nobody; -worker_processes 1; - -#error_log logs/error.log; -#error_log logs/error.log notice; -#error_log logs/error.log info; - -#pid logs/nginx.pid; - - -events { - worker_connections 1024; -} - - -http { - include mime.types; - default_type application/octet-stream; - - #log_format main '$remote_addr - $remote_user [$time_local] "$request" ' - # '$status $body_bytes_sent "$http_referer" ' - # '"$http_user_agent" "$http_x_forwarded_for"'; - - #access_log logs/access.log 
on; - - sendfile on; - #tcp_nopush on; - - #keepalive_timeout 0; - keepalive_timeout 65; - - #gzip on; - - server { - listen $OPENSHIFT_IP:$OPENSHIFT_PORT; - - access_log access.log ; - sendfile on; - - location / { - root ../../app-root/data/store/; - autoindex on; - } - - - } - -} From 11c1bca7123cfecd5d54c42753fe1b571d10f081 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 12:53:53 -0500 Subject: [PATCH 042/338] CI: typo in submit_cache --- ci/submit_cython_cache.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh index 01c37d16da2fc..cfbced4988357 100755 --- a/ci/submit_cython_cache.sh +++ b/ci/submit_cython_cache.sh @@ -2,7 +2,7 @@ CACHE_File="$HOME/.cache/cython_files.tar" PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd -o -name "*.pxi.in""` +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` rm -rf $CACHE_File rm -rf $PYX_CACHE_DIR From a8f9ec5d14bf1688841987aec73de07235afeb1b Mon Sep 17 00:00:00 2001 From: watercrossing Date: Thu, 19 Jan 2017 15:13:45 -0500 Subject: [PATCH 043/338] Bug in groupby.get_group on categoricalindex closes #15155 Author: watercrossing Closes #15163 from watercrossing/indexgroup and squashes the following commits: 742d4a5 [watercrossing] BUG: GroupBy.get_group failing with a categorical grouper (#15155) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/indexes/category.py | 3 +++ pandas/tests/groupby/test_categorical.py | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c5510d4b8151c..0c065937c3622 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -388,6 +388,7 @@ Bug Fixes - Bug in compat for passing long integers to ``Timestamp.replace`` (:issue:`15030`) - Bug in ``.loc`` that would not return the correct dtype for scalar access for a DataFrame (:issue:`11617`) +- Bug in ``GroupBy.get_group()`` failing with a categorical grouper (:issue:`15155`) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 2c89f72975ade..e3ffa40f5f94a 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -255,6 +255,9 @@ def categories(self): def ordered(self): return self._data.ordered + def _reverse_indexer(self): + return self._data._reverse_indexer() + def __contains__(self, key): hash(key) return key in self.values diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 99bea3a10115b..81aa183426be9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -67,6 +67,26 @@ def setUp(self): 'E': np.random.randn(11), 'F': np.random.randn(11)}) + def test_level_groupby_get_group(self): + # GH15155 + df = DataFrame(data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + labels=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"])) + g = df.groupby(level=["Index1"]) + + # expected should equal test.loc[["a"]] + # GH15166 + expected = DataFrame(data=np.arange(2, 12, 2), + index=pd.MultiIndex(levels=[pd.CategoricalIndex( + ["a", "b"]), range(5)], + labels=[[0] * 5, range(5)], + names=["Index1", "Index2"])) + result = g.get_group('a') + + assert_frame_equal(result, expected) + def test_apply_use_categorical_name(self): from pandas import qcut cats = qcut(self.df.C, 4) From 
2692d0a9b0b4e6a89e610c75ab94f5b4cbc214c9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 15:58:29 -0500 Subject: [PATCH 044/338] DOC: doc building edits --- doc/source/groupby.rst | 2 +- doc/source/io.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 45af02cb60b25..8484ccd69a983 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -810,7 +810,7 @@ next). This enables some operations to be carried out rather succinctly: tsdf = pd.DataFrame(np.random.randn(1000, 3), index=pd.date_range('1/1/2000', periods=1000), columns=['A', 'B', 'C']) - tsdf.ix[::2] = np.nan + tsdf.iloc[::2] = np.nan grouped = tsdf.groupby(lambda x: x.year) grouped.fillna(method='pad') diff --git a/doc/source/io.rst b/doc/source/io.rst index 5483248e6e3d4..c53083238e098 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1400,7 +1400,7 @@ returned object: df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) df - df.iloc[1978] + df.loc[1978] .. _io.multi_index_columns: From 6158cb97f574953dcdf473ec1f012199c21292ca Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 16:50:38 -0500 Subject: [PATCH 045/338] DOC: small change in .ix deprecation docs --- doc/source/indexing.rst | 13 +++++++++---- doc/source/whatsnew/v0.20.0.txt | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e2916c2d969d6..bc8997b313053 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -557,13 +557,18 @@ IX Indexer is Deprecated .. warning:: - Starting in 0.20.0, the ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` -and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to -do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*. This has caused -quite a bit of user confusion over the years. + Starting in 0.20.0, the ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` + and ``.loc`` indexers. + +``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide +to index *positionally* OR via *labels* depending on the data type of the index. This has caused quite a +bit of user confusion over the years. The recommended methods of indexing are: +- ``.loc`` if you want to *label* index +- ``.iloc`` if you want to *positionally* index. + .. ipython:: python dfd = pd.DataFrame({'A': [1, 2, 3], diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0c065937c3622..f2d64a059282f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -149,7 +149,7 @@ Backwards incompatible API changes Deprecate .ix ^^^^^^^^^^^^^ -The ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*. This has caused quite a bit of user confusion over the years. The full indexing documentation are :ref:`here `. (:issue:`14218`) +The ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*, depending on the data type of the index. This has caused quite a bit of user confusion over the years. 
The full indexing documentation is :ref:`here `. (:issue:`14218`) The recommended methods of indexing are: - ``.loc`` if you want to *label* index - ``.iloc`` if you want to *positionally* index. From 032c38027cc1d1af617d5298754de873ba8d6a00 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 Jan 2017 16:53:37 -0500 Subject: [PATCH 046/338] CI: have 34_SLOW build use matplotlib latest from conda-forge (#15170) CI: change 34_SLOW to use numpy 1.10* as mpl 2.0 requires anyhow --- ci/requirements-3.4_SLOW.build | 2 +- ci/requirements-3.4_SLOW.run | 2 +- ci/requirements-3.4_SLOW.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/requirements-3.4_SLOW.build b/ci/requirements-3.4_SLOW.build index de36b1afb9fa4..c05a68a14b402 100644 --- a/ci/requirements-3.4_SLOW.build +++ b/ci/requirements-3.4_SLOW.build @@ -1,4 +1,4 @@ python-dateutil pytz -numpy=1.9.3 +numpy=1.10* cython diff --git a/ci/requirements-3.4_SLOW.run b/ci/requirements-3.4_SLOW.run index f9f226e3f1465..215f840381ada 100644 --- a/ci/requirements-3.4_SLOW.run +++ b/ci/requirements-3.4_SLOW.run @@ -1,6 +1,6 @@ python-dateutil pytz -numpy=1.9.3 +numpy=1.10* openpyxl xlsxwriter xlrd diff --git a/ci/requirements-3.4_SLOW.sh b/ci/requirements-3.4_SLOW.sh index bc8fb79147d2c..24f1e042ed69e 100644 --- a/ci/requirements-3.4_SLOW.sh +++ b/ci/requirements-3.4_SLOW.sh @@ -4,4 +4,4 @@ source activate pandas echo "install 34_slow" -conda install -n pandas -c conda-forge/label/rc -c conda-forge matplotlib +conda install -n pandas -c conda-forge matplotlib From 2aeb77062c4720098e794be29fec9f8f92a8270d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 20 Jan 2017 07:19:21 -0500 Subject: [PATCH 047/338] BUG: no need to validate monotonicity when groupby-rolling closes #15130 Author: Jeff Reback Closes #15175 from jreback/groupby_rolling and squashes the following commits: 5831b8e [Jeff Reback] BUG: no need to validate monotonicity when groupby-rolling --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/window.py | 51 ++++++++++++++++++++++++--------- pandas/tests/test_window.py | 20 +++++++++++++ 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f2d64a059282f..7e92b08ba4b25 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -392,7 +392,7 @@ Bug Fixes - +- Bug in ``.groupby(...).rolling(...)`` when ``on`` is specified and using a ``DatetimeIndex`` (:issue:`15130`) diff --git a/pandas/core/window.py b/pandas/core/window.py index b7276aed506de..b330a12110923 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1025,19 +1025,8 @@ def validate(self): if (self.is_datetimelike and isinstance(self.window, (compat.string_types, DateOffset))): - # must be monotonic for on - if not self._on.is_monotonic: - formatted = self.on or 'index' - raise ValueError("{0} must be " - "monotonic".format(formatted)) - - from pandas.tseries.frequencies import to_offset - try: - freq = to_offset(self.window) - except (TypeError, ValueError): - raise ValueError("passed window {0} in not " - "compat with a datetimelike " - "index".format(self.window)) + self._validate_monotonic() + freq = self._validate_freq() # we don't allow center if self.center: @@ -1058,6 +1047,23 @@ def validate(self): elif self.window < 0: raise ValueError("window must be non-negative") + def _validate_monotonic(self): + """ validate on is monotonic """ + if not self._on.is_monotonic: + formatted = self.on or 'index' + raise ValueError("{0} must be " + "monotonic".format(formatted)) + + def _validate_freq(self): + """ 
validate & return our freq """ + from pandas.tseries.frequencies import to_offset + try: + return to_offset(self.window) + except (TypeError, ValueError): + raise ValueError("passed window {0} in not " + "compat with a datetimelike " + "index".format(self.window)) + @Substitution(name='rolling') @Appender(SelectionMixin._see_also_template) @Appender(SelectionMixin._agg_doc) @@ -1175,6 +1181,25 @@ class RollingGroupby(_GroupByMixin, Rolling): def _constructor(self): return Rolling + def _gotitem(self, key, ndim, subset=None): + + # we are setting the index on the actual object + # here so our index is carried thru to the selected obj + # when we do the splitting for the groupby + if self.on is not None: + self._groupby.obj = self._groupby.obj.set_index(self._on) + self.on = None + return super(RollingGroupby, self)._gotitem(key, ndim, subset=subset) + + def _validate_monotonic(self): + """ + validate that on is monotonic; + we don't care for groupby.rolling + because we have already validated at a higher + level + """ + pass + class Expanding(_Rolling_and_Expanding): """ diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 22c3cee95f314..1afd5dad404c8 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -3616,3 +3616,23 @@ def agg_by_day(x): agg_by_day).reset_index(level=0, drop=True) tm.assert_frame_equal(result, expected) + + def test_groupby_monotonic(self): + + # GH 15130 + # we don't need to validate monotonicity when grouping + + data = [ + ['David', '1/1/2015', 100], ['David', '1/5/2015', 500], + ['David', '5/30/2015', 50], ['David', '7/25/2015', 50], + ['Ryan', '1/4/2014', 100], ['Ryan', '1/19/2015', 500], + ['Ryan', '3/31/2016', 50], ['Joe', '7/1/2015', 100], + ['Joe', '9/9/2015', 500], ['Joe', '10/15/2015', 50]] + + df = pd.DataFrame(data=data, columns=['name', 'date', 'amount']) + df['date'] = pd.to_datetime(df['date']) + + expected = df.set_index('date').groupby('name').apply( + lambda x: x.rolling('180D')['amount'].sum()) + result = df.groupby('name').rolling('180D', on='date')['amount'].sum() + tm.assert_series_equal(result, expected) From 1d5c4fb7d3398cba69b1f836ea4f9669d1f4bc05 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Jan 2017 07:22:24 -0500 Subject: [PATCH 048/338] ENH: Timedelta isoformat Author: Tom Augspurger Closes #15136 from TomAugspurger/timedelta-isoformat and squashes the following commits: 9fda713 [Tom Augspurger] ENH: Timedelta isoformat --- doc/source/timedeltas.rst | 14 +++++++ doc/source/whatsnew/v0.20.0.txt | 3 ++ pandas/tseries/tests/test_timedeltas.py | 39 +++++++++++++++++++ pandas/tslib.pyx | 50 +++++++++++++++++++++++++ 4 files changed, 106 insertions(+) diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index f7aa879fa7216..e4a2b09f616a1 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -310,6 +310,20 @@ similarly to the ``Series``. These are the *displayed* values of the ``Timedelta td.dt.components td.dt.components.seconds +.. _timedeltas.isoformat: + +You can convert a ``Timedelta`` to an ISO 8601 Duration string with the +``.isoformat`` method + +.. versionadded:: 0.20.0 + +.. ipython:: python + pd.Timedelta(days=6, minutes=50, seconds=3, + milliseconds=10, microseconds=10, + nanoseconds=12).isoformat() + +.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations + .. 
_timedeltas.index: TimedeltaIndex diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7e92b08ba4b25..50161b5b7eff3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -133,11 +133,14 @@ Other enhancements - The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) +- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs ` (:issue:`15136`) - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) +.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations + .. _whatsnew_0200.api_breaking: diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index e849dc0412c35..6efa024d81b98 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1347,6 +1347,45 @@ def test_components(self): self.assertFalse(result.iloc[0].isnull().all()) self.assertTrue(result.iloc[1].isnull().all()) + def test_isoformat(self): + td = Timedelta(days=6, minutes=50, seconds=3, + milliseconds=10, microseconds=10, nanoseconds=12) + expected = 'P6DT0H50M3.010010012S' + result = td.isoformat() + self.assertEqual(result, expected) + + td = Timedelta(days=4, hours=12, minutes=30, seconds=5) + result = td.isoformat() + expected = 'P4DT12H30M5S' + self.assertEqual(result, expected) + + td = Timedelta(nanoseconds=123) + result = td.isoformat() + expected = 'P0DT0H0M0.000000123S' + self.assertEqual(result, expected) + + # trim nano + td = Timedelta(microseconds=10) + result = td.isoformat() + expected = 'P0DT0H0M0.00001S' + self.assertEqual(result, expected) + + # trim micro + td = Timedelta(milliseconds=1) + result = td.isoformat() + expected = 'P0DT0H0M0.001S' + self.assertEqual(result, expected) + + # NaT + result = Timedelta('NaT').isoformat() + expected = 'NaT' + self.assertEqual(result, expected) + + # don't strip every 0 + result = Timedelta(minutes=1).isoformat() + expected = 'P0DT0H1M0S' + self.assertEqual(result, expected) + def test_constructor(self): expected = TimedeltaIndex(['1 days', '1 days 00:00:05', '2 days', '2 days 00:00:02', '0 days 00:00:03']) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index cde7c0dc10740..15b279d20021a 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2966,6 +2966,56 @@ class Timedelta(_Timedelta): """ return 1e-9 *self.value + def isoformat(self): + """ + Format Timedelta as ISO 8601 Duration like + `P[n]Y[n]M[n]DT[n]H[n]M[n]S`, where the `[n]`s are replaced by the + values. See https://en.wikipedia.org/wiki/ISO_8601#Durations + + .. versionadded:: 0.20.0 + + Returns + ------- + formatted : str + + Notes + ----- + The longest component is days, whose value may be larger than + 365. + Every component is always included, even if its value is 0. + Pandas uses nanosecond precision, so up to 9 decimal places may + be included in the seconds component. 
+ Trailing 0's are removed from the seconds component after the decimal. + We do not 0 pad components, so it's `...T5H...`, not `...T05H...` + + Examples + -------- + >>> td = pd.Timedelta(days=6, minutes=50, seconds=3, + ... milliseconds=10, microseconds=10, nanoseconds=12) + >>> td.isoformat() + 'P6DT0H50M3.010010012S' + >>> pd.Timedelta(hours=1, seconds=10).isoformat() + 'P0DT1H0M10S' + >>> pd.Timedelta(minutes=1).isoformat() + 'P0DT0H1M0S' + >>> pd.Timedelta(days=500.5).isoformat() + 'P500DT12H0M0S' + + See Also + -------- + Timestamp.isoformat + """ + components = self.components + seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds, + components.milliseconds, + components.microseconds, + components.nanoseconds) + # Trim unnecessary 0s, 1.000000000 -> 1 + seconds = seconds.rstrip('0').rstrip('.') + tpl = 'P{td.days}DT{td.hours}H{td.minutes}M{seconds}S'.format( + td=components, seconds=seconds) + return tpl + def __setstate__(self, state): (value) = state self.value = value From 1e91081766d9af395527a88ad7faa4a59206fcde Mon Sep 17 00:00:00 2001 From: Phil Ruffwind Date: Fri, 20 Jan 2017 07:27:18 -0500 Subject: [PATCH 049/338] BUG: Segfault due to float_precision='round_trip' `round_trip` calls back into Python, so the GIL must be held. It also fails to silence the Python exception, leading to spurious errors. Closes #15140. Author: Phil Ruffwind Closes #15148 from Rufflewind/master and squashes the following commits: c513d2e [Phil Ruffwind] BUG: Segfault due to float_precision='round_trip' --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/io/tests/parser/c_parser_only.py | 7 ++++ pandas/parser.pyx | 43 ++++++++++++++++++------- pandas/src/parser/tokenizer.c | 8 ++--- pandas/src/parser/tokenizer.h | 6 +++- 5 files changed, 49 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 50161b5b7eff3..f3d60fa8cc05f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -440,3 +440,5 @@ Bug Fixes - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) - Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) + +- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index c6ef68fcac9a0..6df3c513faf4a 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -388,3 +388,10 @@ def test_read_nrows_large(self): df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010) self.assertTrue(df.size == 1010 * 10) + + def test_float_precision_round_trip_with_text(self): + # gh-15140 - This should not segfault on Python 2.7+ + df = self.read_csv(StringIO('a'), + float_precision='round_trip', + header=None) + tm.assert_frame_equal(df, DataFrame({0: ['a']})) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index bd793c98eef5b..b6e5ad0d73b7c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -181,7 +181,11 @@ cdef extern from "parser/tokenizer.h": PyObject *skipfunc int64_t skip_first_N_rows int skipfooter - double (*converter)(const char *, char **, char, char, char, int) nogil + # pick one, depending on whether the converter requires GIL + double (*double_converter_nogil)(const char *, char **, + char, char, char, int) nogil + double 
(*double_converter_withgil)(const char *, char **, + char, char, char, int) # error handling char *warn_msg @@ -482,11 +486,14 @@ cdef class TextReader: self.verbose = verbose self.low_memory = low_memory - self.parser.converter = xstrtod + self.parser.double_converter_nogil = xstrtod + self.parser.double_converter_withgil = NULL if float_precision == 'high': - self.parser.converter = precise_xstrtod - elif float_precision == 'round_trip': - self.parser.converter = round_trip + self.parser.double_converter_nogil = precise_xstrtod + self.parser.double_converter_withgil = NULL + elif float_precision == 'round_trip': # avoid gh-15140 + self.parser.double_converter_nogil = NULL + self.parser.double_converter_withgil = round_trip # encoding if encoding is not None: @@ -1699,8 +1706,19 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.float64) data = result.data na_fset = kset_float64_from_list(na_flist) - with nogil: - error = _try_double_nogil(parser, col, line_start, line_end, + if parser.double_converter_nogil != NULL: # if it can run without the GIL + with nogil: + error = _try_double_nogil(parser, parser.double_converter_nogil, + col, line_start, line_end, + na_filter, na_hashset, use_na_flist, + na_fset, NA, data, &na_count) + else: + assert parser.double_converter_withgil != NULL + error = _try_double_nogil(parser, + parser.double_converter_withgil, + col, line_start, line_end, na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) @@ -1708,8 +1726,11 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, return None, None return result, na_count -cdef inline int _try_double_nogil(parser_t *parser, int col, - int line_start, int line_end, +cdef inline int _try_double_nogil(parser_t *parser, + double (*double_converter)( + const char *, char **, char, + char, char, int) nogil, + int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, bint use_na_flist, const kh_float64_t *na_flist, @@ -1739,7 +1760,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, na_count[0] += 1 data[0] = NA else: - data[0] = parser.converter(word, &p_end, parser.decimal, + data[0] = double_converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: if (strcasecmp(word, cinf) == 0 or @@ -1760,7 +1781,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, else: for i in range(lines): COLITER_NEXT(it, word) - data[0] = parser.converter(word, &p_end, parser.decimal, + data[0] = double_converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: if (strcasecmp(word, cinf) == 0 or diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 87e17fe5fb751..7cddefd40cfc5 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1773,11 +1773,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing) { -#if PY_VERSION_HEX >= 0x02070000 - return PyOS_string_to_double(p, q, 0); -#else - return strtod(p, q); -#endif + double r = PyOS_string_to_double(p, q, 0); + PyErr_Clear(); + return r; } // End of xstrtod code diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index e7271cabb0752..6c1bc630ab547 100644 --- a/pandas/src/parser/tokenizer.h +++ 
b/pandas/src/parser/tokenizer.h @@ -201,7 +201,11 @@ typedef struct parser_t { PyObject *skipfunc; int64_t skip_first_N_rows; int skip_footer; - double (*converter)(const char *, char **, char, char, char, int); + // pick one, depending on whether the converter requires GIL + double (*double_converter_nogil)(const char *, char **, + char, char, char, int); + double (*double_converter_withgil)(const char *, char **, + char, char, char, int); // error handling char *warn_msg; From 31f4721c96863b68193dc446720d9f1f1d274295 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 20 Jan 2017 18:12:57 +0100 Subject: [PATCH 050/338] Apply fontsize to both x- and y-axis tick labels (#15161) * Apply fontsize to both x- and y-axis tick labels * Add tests * Add note in whatsnew --- doc/source/whatsnew/v0.20.0.txt | 3 +++ pandas/tests/plotting/test_boxplot_method.py | 10 ++++++++++ pandas/tools/plotting.py | 6 ++++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f3d60fa8cc05f..798151971363e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -439,6 +439,9 @@ Bug Fixes - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) + - Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) + +- Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index f76cdb70c0240..289d48ba6d4cc 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -158,6 +158,11 @@ def test_boxplot_empty_column(self): df.loc[:, 0] = np.nan _check_plot_works(df.boxplot, return_type='axes') + def test_fontsize(self): + df = DataFrame({"a": [1, 2, 3, 4, 5, 6]}) + self._check_ticks_props(df.boxplot("a", fontsize=16), + xlabelsize=16, ylabelsize=16) + @tm.mplskip class TestDataFrameGroupByPlots(TestPlotBase): @@ -369,6 +374,11 @@ def test_grouped_box_multiple_axes(self): with tm.assert_produces_warning(UserWarning): axes = df.groupby('classroom').boxplot(ax=axes) + def test_fontsize(self): + df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}) + self._check_ticks_props(df.boxplot("a", by="b", fontsize=16), + xlabelsize=16, ylabelsize=16) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index bd9933b12b580..012d67d29cc3f 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2768,10 +2768,12 @@ def plot_group(keys, values, ax): keys = [pprint_thing(x) for x in keys] values = [remove_na(v) for v in values] bp = ax.boxplot(values, **kwds) + if fontsize is not None: + ax.tick_params(axis='both', labelsize=fontsize) if kwds.get('vert', 1): - ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) + ax.set_xticklabels(keys, rotation=rot) else: - ax.set_yticklabels(keys, rotation=rot, fontsize=fontsize) + ax.set_yticklabels(keys, rotation=rot) maybe_color_bp(bp) # Return axes in multiplot 
case, maybe revisit later # 985 From 20f63bd6dd1851ff3a5a4d8afdb881aac9b763e4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 20 Jan 2017 12:53:10 -0800 Subject: [PATCH 051/338] BLD: Remove csv dialect warning in io.rst (#15177) --- doc/source/io.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index c53083238e098..6152f83675867 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -419,6 +419,7 @@ finds the closing double quote. We can get around this using ``dialect`` .. ipython:: python + :okwarning: dia = csv.excel() dia.quoting = csv.QUOTE_NONE From fb8e3e1027ce9ebb4309c7ffeda430bd11fdd374 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 20 Jan 2017 07:09:42 -0500 Subject: [PATCH 052/338] DOC: reord dialect in io.rst closes #15179 --- doc/source/io.rst | 145 ++++++++++++++++++---------------------------- 1 file changed, 56 insertions(+), 89 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6152f83675867..f637bf71bb49a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -357,95 +357,6 @@ warn_bad_lines : boolean, default ``True`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output (only valid with C parser). -.. ipython:: python - :suppress: - - f = open('foo.csv','w') - f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') - f.close() - -Consider a typical CSV file containing, in this case, some time series data: - -.. ipython:: python - - print(open('foo.csv').read()) - -The default for `read_csv` is to create a DataFrame with simple numbered rows: - -.. ipython:: python - - pd.read_csv('foo.csv') - -In the case of indexed data, you can pass the column number or column name you -wish to use as the index: - -.. ipython:: python - - pd.read_csv('foo.csv', index_col=0) - -.. ipython:: python - - pd.read_csv('foo.csv', index_col='date') - -You can also use a list of columns to create a hierarchical index: - -.. ipython:: python - - pd.read_csv('foo.csv', index_col=[0, 'A']) - -.. _io.dialect: - -The ``dialect`` keyword gives greater flexibility in specifying the file format. -By default it uses the Excel dialect but you can specify either the dialect name -or a :class:`python:csv.Dialect` instance. - -.. ipython:: python - :suppress: - - data = ('label1,label2,label3\n' - 'index1,"a,c,e\n' - 'index2,b,d,f') - -Suppose you had data with unenclosed quotes: - -.. ipython:: python - - print(data) - -By default, ``read_csv`` uses the Excel dialect and treats the double quote as -the quote character, which causes it to fail when it finds a newline before it -finds the closing double quote. - -We can get around this using ``dialect`` - -.. ipython:: python - :okwarning: - - dia = csv.excel() - dia.quoting = csv.QUOTE_NONE - pd.read_csv(StringIO(data), dialect=dia) - -All of the dialect options can be specified separately by keyword arguments: - -.. ipython:: python - - data = 'a,b,c~1,2,3~4,5,6' - pd.read_csv(StringIO(data), lineterminator='~') - -Another common dialect option is ``skipinitialspace``, to skip any whitespace -after a delimiter: - -.. ipython:: python - - data = 'a, b, c\n1, 2, 3\n4, 5, 6' - print(data) - pd.read_csv(StringIO(data), skipinitialspace=True) - -The parsers make every attempt to "do the right thing" and not be very -fragile. Type inference is a pretty big deal. So if a column can be coerced to -integer dtype without altering the contents, it will do so. 
Any non-numeric -columns will come through as object dtype as with the rest of pandas objects. - .. _io.dtypes: Specifying column data types @@ -1239,6 +1150,62 @@ data that appear in some lines but not others: 1 4 5 6 2 8 9 10 +.. _io.dialect: + +Dialect +''''''' + +The ``dialect`` keyword gives greater flexibility in specifying the file format. +By default it uses the Excel dialect but you can specify either the dialect name +or a :class:`python:csv.Dialect` instance. + +.. ipython:: python + :suppress: + + data = ('label1,label2,label3\n' + 'index1,"a,c,e\n' + 'index2,b,d,f') + +Suppose you had data with unenclosed quotes: + +.. ipython:: python + + print(data) + +By default, ``read_csv`` uses the Excel dialect and treats the double quote as +the quote character, which causes it to fail when it finds a newline before it +finds the closing double quote. + +We can get around this using ``dialect`` + +.. ipython:: python + :okwarning: + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + pd.read_csv(StringIO(data), dialect=dia) + +All of the dialect options can be specified separately by keyword arguments: + +.. ipython:: python + + data = 'a,b,c~1,2,3~4,5,6' + pd.read_csv(StringIO(data), lineterminator='~') + +Another common dialect option is ``skipinitialspace``, to skip any whitespace +after a delimiter: + +.. ipython:: python + + data = 'a, b, c\n1, 2, 3\n4, 5, 6' + print(data) + pd.read_csv(StringIO(data), skipinitialspace=True) + +The parsers make every attempt to "do the right thing" and not be very +fragile. Type inference is a pretty big deal. So if a column can be coerced to +integer dtype without altering the contents, it will do so. Any non-numeric +columns will come through as object dtype as with the rest of pandas objects. + .. _io.quoting: Quoting and Escape Characters From 7a0fc07962b20c25a360ff9285adc264ffa875d0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 21 Jan 2017 10:05:47 -0500 Subject: [PATCH 053/338] DOC: make sure foo.csv is properly created in io.rst DOC: remove some experimental tags --- doc/source/io.rst | 33 +++++++++++++++++++++++---------- pandas/io/json.py | 1 + 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f637bf71bb49a..d4bbb3d114b52 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -37,9 +37,9 @@ object. * :ref:`read_feather` * :ref:`read_sql` * :ref:`read_json` - * :ref:`read_msgpack` (experimental) + * :ref:`read_msgpack` * :ref:`read_html` - * :ref:`read_gbq` (experimental) + * :ref:`read_gbq` * :ref:`read_stata` * :ref:`read_sas` * :ref:`read_clipboard` @@ -53,9 +53,9 @@ The corresponding ``writer`` functions are object methods that are accessed like * :ref:`to_feather` * :ref:`to_sql` * :ref:`to_json` - * :ref:`to_msgpack` (experimental) + * :ref:`to_msgpack` * :ref:`to_html` - * :ref:`to_gbq` (experimental) + * :ref:`to_gbq` * :ref:`to_stata` * :ref:`to_clipboard` * :ref:`to_pickle` @@ -428,8 +428,8 @@ worth trying. :okwarning: df = pd.DataFrame({'col_1': list(range(500000)) + ['a', 'b'] + list(range(500000))}) - df.to_csv('foo') - mixed_df = pd.read_csv('foo') + df.to_csv('foo.csv') + mixed_df = pd.read_csv('foo.csv') mixed_df['col_1'].apply(type).value_counts() mixed_df['col_1'].dtype @@ -438,6 +438,11 @@ worth trying. data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. +.. ipython:: python + :suppress: + + os.remove('foo.csv') + .. 
_io.categorical: Specifying Categorical dtype @@ -570,6 +575,7 @@ The ``usecols`` argument can also be used to specify which columns not to use in the final result: .. ipython:: python + pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c']) In this case, the callable is specifying that we exclude the "a" and "c" @@ -730,6 +736,13 @@ input text data into ``datetime`` objects. The simplest case is to just pass in ``parse_dates=True``: +.. ipython:: python + :suppress: + + f = open('foo.csv','w') + f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + .. ipython:: python # Use a column as an index, and parse it as dates. @@ -2826,8 +2839,8 @@ any pickled pandas object (or any other pickled object) from file: .. _io.msgpack: -msgpack (experimental) ----------------------- +msgpack +------- .. versionadded:: 0.13.0 @@ -4547,8 +4560,8 @@ And then issue the following queries: .. _io.bigquery: -Google BigQuery (Experimental) ------------------------------- +Google BigQuery +--------------- .. versionadded:: 0.13.0 diff --git a/pandas/io/json.py b/pandas/io/json.py index 74a1ac43f1dd5..767a2212d92da 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -745,6 +745,7 @@ def json_normalize(data, record_path=None, meta=None, path to records is ['foo', 'bar'] meta_prefix : string, default None errors : {'raise', 'ignore'}, default 'raise' + * ignore : will ignore KeyError if keys listed in meta are not always present * raise : will raise KeyError if keys listed in meta are not From 27013f966868bb01b4a523498ead802416c7c972 Mon Sep 17 00:00:00 2001 From: Giacomo Ferroni Date: Sat, 21 Jan 2017 10:30:56 -0500 Subject: [PATCH 054/338] BUG: Comparisons result in different dtypes for empty DataFrames #15077 closes #15077 Author: Giacomo Ferroni Author: Giacomo Ferroni Author: mralgos Closes #15115 from mralgos/gh15077 and squashes the following commits: a5ca359 [mralgos] Merge branch 'master' into gh15077 dc0803b [Giacomo Ferroni] Merge branch 'master' into gh15077 b2f2d1e [Giacomo Ferroni] Merge branch 'gh15077' of https://github.com/mralgos/pandas into gh15077 fcbcb5b [Giacomo Ferroni] Apply review changes 9723c5d [Giacomo Ferroni] Merge branch 'master' into gh15077 eb7d9fd [Giacomo Ferroni] Delete blank lines 28437bb [Giacomo Ferroni] Check for bool dtype return added also for Series. Minor update to whatsnew 19296f1 [Giacomo Ferroni] Added test for gh15077 (cf. gh15115) and whatsnew note ea11867 [Giacomo Ferroni] [gh15077] Bugfix --- doc/source/whatsnew/v0.20.0.txt | 3 ++- pandas/core/frame.py | 3 --- pandas/tests/frame/test_operators.py | 16 ++++++++++++++++ pandas/tests/series/test_operators.py | 14 ++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 798151971363e..8c7e3acc97305 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -437,9 +437,10 @@ Bug Fixes - Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) - +- Incorrect dtyped ``Series`` was returned by comparison methods (e.g., ``lt``, ``gt``, ...) 
against a constant for an empty ``DataFrame`` (:issue:`15077`) - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) + - Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ef8b7a0d16bdc..799e2f266a137 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3595,9 +3595,6 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): return self._constructor(new_data) def _combine_const(self, other, func, raise_on_error=True): - if self.empty: - return self - new_data = self._data.eval(func=func, other=other, raise_on_error=raise_on_error) return self._constructor(new_data) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index ef6297fc0b52e..f843a5c08ce05 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -671,6 +671,22 @@ def _test_seq(df, idx_ser, col_ser): exp = DataFrame({'col': [False, True, False]}) assert_frame_equal(result, exp) + def test_return_dtypes_bool_op_costant(self): + # GH15077 + df = DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + const = 2 + + # not empty DataFrame + for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + result = getattr(df, op)(const).get_dtype_counts() + self.assert_series_equal(result, Series([2], ['bool'])) + + # empty DataFrame + empty = df.iloc[:0] + for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + result = getattr(empty, op)(const).get_dtype_counts() + self.assert_series_equal(result, Series([2], ['bool'])) + def test_dti_tz_convert_to_utc(self): base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='UTC') diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 6650a171b818b..b013e1a6f1c10 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1294,6 +1294,20 @@ def test_comparison_flex_alignment_fill(self): exp = pd.Series([True, True, False, False], index=list('abcd')) assert_series_equal(left.gt(right, fill_value=0), exp) + def test_return_dtypes_bool_op_costant(self): + # gh15115 + s = pd.Series([1, 3, 2], index=range(3)) + const = 2 + for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + result = getattr(s, op)(const).get_dtype_counts() + self.assert_series_equal(result, Series([1], ['bool'])) + + # empty Series + empty = s.iloc[:0] + for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + result = getattr(empty, op)(const).get_dtype_counts() + self.assert_series_equal(result, Series([1], ['bool'])) + def test_operators_bitwise(self): # GH 9016: support bitwise op for integer types index = list('bca') From adc745512028e9b90d8a9a797c6d08d8e6ee156d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 21 Jan 2017 11:49:21 -0500 Subject: [PATCH 055/338] DEPR: remove openpyxl deprecation warnings (#15185) closes #15090 --- pandas/io/excel.py | 12 +++++++++--- pandas/io/tests/test_excel.py | 23 +++++++++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 6b7c597ecfcdc..f34ba65cf7b51 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -788,9 +788,15 @@ def __init__(self, path, engine=None, **engine_kwargs): # Create workbook object with default 
optimized_write=True. self.book = Workbook() + # Openpyxl 1.6.1 adds a dummy sheet. We remove it. if self.book.worksheets: - self.book.remove_sheet(self.book.worksheets[0]) + try: + self.book.remove(self.book.worksheets[0]) + except AttributeError: + + # compat + self.book.remove_sheet(self.book.worksheets[0]) def save(self): """ @@ -911,7 +917,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) + xcell = wks["%s%s" % (colletter, startrow + cell.row + 1)] xcell.value = _conv_value(cell.val) style_kwargs = {} @@ -952,7 +958,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): # Ignore first cell. It is already handled. continue colletter = get_column_letter(col) - xcell = wks.cell("%s%s" % (colletter, row)) + xcell = wks["%s%s" % (colletter, row)] xcell.style = xcell.style.copy(**style_kwargs) @classmethod diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 9dd2739608b7a..a2162320d9d1b 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -2006,8 +2006,8 @@ def test_write_cells_merge_styled(self): writer.write_cells(merge_cells, sheet_name=sheet_name) wks = writer.sheets[sheet_name] - xcell_b1 = wks.cell('B1') - xcell_a2 = wks.cell('A2') + xcell_b1 = wks['B1'] + xcell_a2 = wks['A2'] self.assertEqual(xcell_b1.style, openpyxl_sty_merged) self.assertEqual(xcell_a2.style, openpyxl_sty_merged) @@ -2118,8 +2118,8 @@ def test_write_cells_merge_styled(self): writer.write_cells(merge_cells, sheet_name=sheet_name) wks = writer.sheets[sheet_name] - xcell_b1 = wks.cell('B1') - xcell_a2 = wks.cell('A2') + xcell_b1 = wks['B1'] + xcell_a2 = wks['A2'] self.assertEqual(xcell_b1.font, openpyxl_sty_merged) self.assertEqual(xcell_a2.font, openpyxl_sty_merged) @@ -2213,11 +2213,18 @@ def test_column_format(self): writer.save() read_workbook = openpyxl.load_workbook(path) - read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1') + try: + read_worksheet = read_workbook['Sheet1'] + except TypeError: + # compat + read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1') - # Get the number format from the cell. This method is backward - # compatible with older versions of openpyxl. - cell = read_worksheet.cell('B2') + # Get the number format from the cell. 
+ try: + cell = read_worksheet['B2'] + except TypeError: + # compat + cell = read_worksheet.cell('B2') try: read_num_format = cell.number_format From 4c4bf2191847f317cd47e3c81b01ee1dfcf26373 Mon Sep 17 00:00:00 2001 From: Nick Chmura Date: Fri, 4 Nov 2016 01:37:55 -0400 Subject: [PATCH 056/338] ENH: add fill_value to asfreq closes #3715 closes #14791 --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/core/generic.py | 73 ++++++++++++++++++++++++-- pandas/tests/frame/test_timeseries.py | 20 +++++++ pandas/tests/series/test_timeseries.py | 12 +++-- pandas/tseries/resample.py | 39 ++++++++++---- pandas/tseries/tests/test_resample.py | 37 +++++++++++++ 6 files changed, 164 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8c7e3acc97305..4689a5a8a99d0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -135,13 +135,13 @@ Other enhancements - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs ` (:issue:`15136`) - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) +- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). +- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations - .. _whatsnew_0200.api_breaking: Backwards incompatible API changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f96cc9e8515de..2bb492055bcd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4066,12 +4066,17 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, sort=sort, group_keys=group_keys, squeeze=squeeze, **kwargs) - def asfreq(self, freq, method=None, how=None, normalize=False): + def asfreq(self, freq, method=None, how=None, normalize=False, + fill_value=None): """ Convert TimeSeries to specified frequency. Optionally provide filling method to pad/backfill missing values. + Returns the original data conformed to a new index with the specified + frequency. ``resample`` is more appropriate if an operation, such as + summarization, is necessary to represent the data at the new frequency. + Parameters ---------- freq : DateOffset object, or string @@ -4086,18 +4091,79 @@ def asfreq(self, freq, method=None, how=None, normalize=False): For PeriodIndex only, see PeriodIndex.asfreq normalize : bool, default False Whether to reset output index to midnight + fill_value: scalar, optional + Value to use for missing values, applied during upsampling (note + this does not fill NaNs that already were present). + + .. versionadded:: 0.20.0 Returns ------- converted : type of caller + Examples + -------- + + Start by creating a series with 4 one minute timestamps. 
+ + >>> index = pd.date_range('1/1/2000', periods=4, freq='T') + >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) + >>> df = pd.DataFrame({'s':series}) + >>> df + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:01:00 NaN + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:03:00 3.0 + + Upsample the series into 30 second bins. + + >>> df.asfreq(freq='30S') + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 NaN + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 NaN + 2000-01-01 00:03:00 3.0 + + Upsample again, providing a ``fill value``. + + >>> df.asfreq(freq='30S', fill_value=9.0) + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 9.0 + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 9.0 + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 9.0 + 2000-01-01 00:03:00 3.0 + + Upsample again, providing a ``method``. + + >>> df.asfreq(freq='30S', method='bfill') + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 2.0 + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 3.0 + 2000-01-01 00:03:00 3.0 + + See Also + -------- + reindex + Notes ----- To learn more about the frequency strings, please see `this link `__. """ from pandas.tseries.resample import asfreq - return asfreq(self, freq, method=method, how=how, normalize=normalize) + return asfreq(self, freq, method=method, how=how, normalize=normalize, + fill_value=fill_value) def at_time(self, time, asof=False): """ @@ -4178,9 +4244,6 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, .. versionadded:: 0.19.0 - Notes - ----- - To learn more about the offset strings, please see `this link `__. diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 1a47294dc8af2..85967e9eda0d6 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -323,6 +323,26 @@ def test_asfreq_datetimeindex(self): ts = df['A'].asfreq('B') tm.assertIsInstance(ts.index, DatetimeIndex) + def test_asfreq_fillvalue(self): + # test for fill value during upsampling, related to issue 3715 + + # setup + rng = pd.date_range('1/1/2016', periods=10, freq='2S') + ts = pd.Series(np.arange(len(rng)), index=rng) + df = pd.DataFrame({'one': ts}) + + # insert pre-existing missing value + df.loc['2016-01-01 00:00:08', 'one'] = None + + actual_df = df.asfreq(freq='1S', fill_value=9.0) + expected_df = df.asfreq(freq='1S').fillna(9.0) + expected_df.loc['2016-01-01 00:00:08', 'one'] = None + assert_frame_equal(expected_df, actual_df) + + expected_series = ts.asfreq(freq='1S').fillna(9.0) + actual_series = ts.asfreq(freq='1S', fill_value=9.0) + assert_series_equal(expected_series, actual_series) + def test_first_last_valid(self): N = len(self.frame.index) mat = randn(N) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 6e3d52366a4ec..df7ab24430746 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -410,20 +410,26 @@ def test_asfreq(self): daily_ts = ts.asfreq('B') monthly_ts = daily_ts.asfreq('BM') - self.assert_series_equal(monthly_ts, ts) + assert_series_equal(monthly_ts, ts) daily_ts = ts.asfreq('B', method='pad') monthly_ts = daily_ts.asfreq('BM') - self.assert_series_equal(monthly_ts, ts) + assert_series_equal(monthly_ts, ts) daily_ts = ts.asfreq(BDay()) monthly_ts = daily_ts.asfreq(BMonthEnd()) - self.assert_series_equal(monthly_ts, ts) + assert_series_equal(monthly_ts, ts) result = ts[:0].asfreq('M') 
self.assertEqual(len(result), 0) self.assertIsNot(result, ts) + daily_ts = ts.asfreq('D', fill_value=-1) + result = daily_ts.value_counts().sort_index() + expected = Series([60, 1, 1, 1], + index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + assert_series_equal(result, expected) + def test_diff(self): # Just run the function self.ts.diff() diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index e6d500144fa44..e93e5637099c1 100755 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -353,7 +353,7 @@ def transform(self, arg, *args, **kwargs): def _downsample(self, f): raise AbstractMethodError(self) - def _upsample(self, f, limit=None): + def _upsample(self, f, limit=None, fill_value=None): raise AbstractMethodError(self) def _gotitem(self, key, ndim, subset=None): @@ -509,12 +509,25 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, limit_direction=limit_direction, downcast=downcast, **kwargs) - def asfreq(self): + def asfreq(self, fill_value=None): """ return the values at the new freq, - essentially a reindex with (no filling) + essentially a reindex + + Parameters + ---------- + fill_value: scalar, optional + Value to use for missing values, applied during upsampling (note + this does not fill NaNs that already were present). + + .. versionadded:: 0.20.0 + + See Also + -------- + Series.asfreq + DataFrame.asfreq """ - return self._upsample('asfreq') + return self._upsample('asfreq', fill_value=fill_value) def std(self, ddof=1, *args, **kwargs): """ @@ -713,12 +726,14 @@ def _adjust_binner_for_upsample(self, binner): binner = binner[:-1] return binner - def _upsample(self, method, limit=None): + def _upsample(self, method, limit=None, fill_value=None): """ method : string {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'} method for upsampling limit : int, default None Maximum size gap to fill when reindexing + fill_value : scalar, default None + Value to use for missing values See also -------- @@ -745,7 +760,7 @@ def _upsample(self, method, limit=None): result.index = res_index else: result = obj.reindex(res_index, method=method, - limit=limit) + limit=limit, fill_value=fill_value) return self._wrap_result(result) @@ -865,12 +880,14 @@ def _downsample(self, how, **kwargs): 'Frequency {} cannot be resampled to {}, as they are not ' 'sub or super periods'.format(ax.freq, self.freq)) - def _upsample(self, method, limit=None): + def _upsample(self, method, limit=None, fill_value=None): """ method : string {'backfill', 'bfill', 'pad', 'ffill'} method for upsampling limit : int, default None Maximum size gap to fill when reindexing + fill_value : scalar, default None + Value to use for missing values See also -------- @@ -884,8 +901,8 @@ def _upsample(self, method, limit=None): " datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': - return super(PeriodIndexResampler, self)._upsample(method, - limit=limit) + return super(PeriodIndexResampler, self)._upsample( + method, limit=limit, fill_value=fill_value) ax = self.ax obj = self.obj @@ -1346,7 +1363,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): Timestamp(lresult).tz_localize(last_tzinfo, ambiguous=last_dst)) -def asfreq(obj, freq, method=None, how=None, normalize=False): +def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): """ Utility frequency conversion method for Series/DataFrame """ @@ -1366,7 +1383,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False): return 
obj.copy()
     dti = date_range(obj.index[0], obj.index[-1], freq=freq)
     dti.name = obj.index.name
-    rs = obj.reindex(dti, method=method)
+    rs = obj.reindex(dti, method=method, fill_value=fill_value)
     if normalize:
         rs.index = rs.index.normalize()
     return rs
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 424226c0bfb7e..fbf0e0095a2f9 100755
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -693,6 +693,24 @@ def test_asfreq_upsample(self):
         expected = frame.reindex(new_index)
         assert_frame_equal(result, expected)
 
+    def test_asfreq_fill_value(self):
+        # test for fill value during resampling, issue 3715
+
+        s = self.create_series()
+
+        result = s.resample('1H').asfreq()
+        new_index = self.create_index(s.index[0], s.index[-1], freq='1H')
+        expected = s.reindex(new_index)
+        assert_series_equal(result, expected)
+
+        frame = s.to_frame('value')
+        frame.iloc[1] = None
+        result = frame.resample('1H').asfreq(fill_value=4.0)
+        new_index = self.create_index(frame.index[0],
+                                      frame.index[-1], freq='1H')
+        expected = frame.reindex(new_index, fill_value=4.0)
+        assert_frame_equal(result, expected)
+
     def test_resample_interpolate(self):
         # # 12925
         df = self.create_series().to_frame('value')
@@ -2159,6 +2177,25 @@ def test_asfreq_upsample(self):
         result = frame.resample('1H').asfreq()
         assert_frame_equal(result, expected)
 
+    def test_asfreq_fill_value(self):
+        # test for fill value during resampling, issue 3715
+
+        s = self.create_series()
+        new_index = date_range(s.index[0].to_timestamp(how='start'),
+                               (s.index[-1]).to_timestamp(how='start'),
+                               freq='1H')
+        expected = s.to_timestamp().reindex(new_index, fill_value=4.0)
+        result = s.resample('1H', kind='timestamp').asfreq(fill_value=4.0)
+        assert_series_equal(result, expected)
+
+        frame = s.to_frame('value')
+        new_index = date_range(frame.index[0].to_timestamp(how='start'),
+                               (frame.index[-1]).to_timestamp(how='start'),
+                               freq='1H')
+        expected = frame.to_timestamp().reindex(new_index, fill_value=3.0)
+        result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0)
+        assert_frame_equal(result, expected)
+
     def test_selection(self):
         index = self.create_series().index
         # This is a bug, these should be implemented
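The new ``fill_value`` keyword simply threads through to ``reindex``, so its effect is easy to see outside the test suite. A minimal sketch (illustrative only, not part of the patch; assumes a pandas build that includes this change):

    import pandas as pd

    rng = pd.date_range('2016-01-01', periods=3, freq='2H')
    s = pd.Series([1.0, 2.0, 3.0], index=rng)

    # Upsampling from 2H to 1H creates new slots; fill_value fills only those
    # slots, while NaNs already present in the data are left untouched.
    print(s.resample('1H').asfreq(fill_value=9.0))
    # 2016-01-01 00:00:00    1.0
    # 2016-01-01 01:00:00    9.0
    # 2016-01-01 02:00:00    2.0
    # 2016-01-01 03:00:00    9.0
    # 2016-01-01 04:00:00    3.0
    # Freq: H, dtype: float64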
From 53b056a7abc39fcc1bdb80cd3802ce525ad0dccc Mon Sep 17 00:00:00 2001
From: Dr-Irv
Date: Fri, 23 Dec 2016 16:58:20 -0500
Subject: [PATCH 057/338] BUG: Incorrect index label displayed on MultiIndex
 DataFrame

closes #14882
closes #14975
---
 doc/source/whatsnew/v0.20.0.txt     |  10 +
 pandas/formats/format.py            |  21 +-
 pandas/tests/formats/test_format.py | 548 ++++++++++++++++++++++++++++
 3 files changed, 576 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 4689a5a8a99d0..1d83dee7a92bf 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -421,12 +421,14 @@ Bug Fixes
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
+
 - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
 - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`)
 - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`)
 - Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`)
+- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`)
 - Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`)
@@ -445,4 +447,12 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
+
+
+
+
+
+
+
+
 - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`)
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
index a3319437474c2..adfb54c02d926 100644
--- a/pandas/formats/format.py
+++ b/pandas/formats/format.py
@@ -1248,6 +1248,7 @@ def _write_hierarchical_rows(self, fmt_values, indent):
             # Insert ... row and adjust idx_values and
             # level_lengths to take this into account.
             ins_row = self.fmt.tr_row_num
+            inserted = False
             for lnum, records in enumerate(level_lengths):
                 rec_new = {}
                 for tag, span in list(records.items()):
@@ -1255,9 +1256,17 @@ def _write_hierarchical_rows(self, fmt_values, indent):
                         rec_new[tag + 1] = span
                     elif tag + span > ins_row:
                         rec_new[tag] = span + 1
-                        dot_row = list(idx_values[ins_row - 1])
-                        dot_row[-1] = u('...')
-                        idx_values.insert(ins_row, tuple(dot_row))
+
+                        # GH 14882 - Make sure insertion done once
+                        if not inserted:
+                            dot_row = list(idx_values[ins_row - 1])
+                            dot_row[-1] = u('...')
+                            idx_values.insert(ins_row, tuple(dot_row))
+                            inserted = True
+                        else:
+                            dot_row = list(idx_values[ins_row])
+                            dot_row[inner_lvl - lnum] = u('...')
+                            idx_values[ins_row] = tuple(dot_row)
                     else:
                         rec_new[tag] = span
                     # If ins_row lies between tags, all cols idx cols
@@ -1267,6 +1276,12 @@ def _write_hierarchical_rows(self, fmt_values, indent):
                     if lnum == 0:
                         idx_values.insert(ins_row, tuple(
                             [u('...')] * len(level_lengths)))
+
+                    # GH 14882 - Place ... in correct level
+                    elif inserted:
+                        dot_row = list(idx_values[ins_row])
+                        dot_row[inner_lvl - lnum] = u('...')
+                        idx_values[ins_row] = tuple(dot_row)
                 level_lengths[lnum] = rec_new
 
             level_lengths[inner_lvl][ins_row] = 1
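Before the test diff that follows, a sketch of the behaviour being pinned down (illustrative only, not part of the patch): 63 rows truncated to ``max_rows=60`` must produce exactly one ellipsis row.

    import pandas as pd

    mi = pd.MultiIndex.from_product([[100, 200, 300], [10, 20, 30],
                                     [1, 2, 3, 4, 5, 6, 7]],
                                    names=['a', 'b', 'c'])
    df = pd.DataFrame({'n': range(len(mi))}, index=mi)

    # Before the fix, the '...' marker row could be inserted once per index
    # level spanning the truncation point, shifting the remaining labels;
    # with the fix a single marker row is emitted, with '...' drawn at each
    # affected level of that one row.
    html = df.to_html(max_rows=60)
    print(html.count('>...<'))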
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index b27e051d4f409..9eff64b40625d 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -1214,6 +1214,554 @@ def test_to_html_multiindex_sparsify(self):
         self.assertEqual(result, expected)
 
+    def test_to_html_multiindex_odd_even_truncate(self):
+        # GH 14882 - Issue on truncation with odd length DataFrame
+        mi = MultiIndex.from_product([[100, 200, 300],
+                                      [10, 20, 30],
+                                      [1, 2, 3, 4, 5, 6, 7]],
+                                     names=['a', 'b', 'c'])
+        df = DataFrame({'n': range(len(mi))}, index=mi)
+        result = df.to_html(max_rows=60)
+        expected = """\
+        [expected HTML elided: the patch embeds the full <table> markup for
+        rows n=0..62, with a single "..." row at the innermost level "c";
+        the tags were stripped during extraction and only the cell text
+        survived]
""" + self.assertEqual(result, expected) + + # Test that ... appears in a middle level + result = df.to_html(max_rows=56) + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+        [expected HTML elided: the same table rendered with max_rows=56,
+        where the "..." row spans the middle level "b" as well; markup
+        stripped during extraction]
""" + self.assertEqual(result, expected) + def test_to_html_index_formatter(self): df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], columns=['foo', None], index=lrange(4)) From 512ac024c7a41483c5d420fe2351517f497b8b65 Mon Sep 17 00:00:00 2001 From: Thoralf Gutierrez Date: Sun, 22 Jan 2017 12:35:12 +0100 Subject: [PATCH 058/338] DOC: fix listing na_values in read_csv docstring - add subsequent_indent to wrapped text (#15173) --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fdf26fdef6b25..f8905dfa315c4 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -148,7 +148,8 @@ na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as - NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'`. + NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), + 70, subsequent_indent=" ") + """'`. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. From 4394e0e5ba5389b37976a4bacfe26678cd05efc3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 21 Jan 2017 13:58:12 +0100 Subject: [PATCH 059/338] DOC: build fixes --- doc/source/timedeltas.rst | 5 +++-- doc/source/tutorials.rst | 4 ++-- doc/source/whatsnew/v0.20.0.txt | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index e4a2b09f616a1..07effcfdff33b 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -312,17 +312,18 @@ similarly to the ``Series``. These are the *displayed* values of the ``Timedelta .. _timedeltas.isoformat: -You can convert a ``Timedelta`` to an ISO 8601 Duration string with the +You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the ``.isoformat`` method .. versionadded:: 0.20.0 .. ipython:: python + pd.Timedelta(days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10, nanoseconds=12).isoformat() -.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations +.. _ISO 8601 Duration: https://en.wikipedia.org/wiki/ISO_8601#Durations .. _timedeltas.index: diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 5ff4e955a9ed2..c1c1c81915c46 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -57,7 +57,7 @@ See `How to use this cookbook `_. +For more resources, please visit the main `repository `__. - `01 - Lesson: `_ - Importing libraries @@ -128,7 +128,7 @@ There are four sections covering selected topics as follows: Exercises for New Users ----------------------- Practice your skills with real data sets and exercises. -For more resources, please visit the main `repository `_. +For more resources, please visit the main `repository `__. - `01 - Getting & Knowing Your Data `_ diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1d83dee7a92bf..2587c666acd07 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -83,6 +83,7 @@ URLs and paths are now both inferred using their file extensions. Additionally, support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). .. 
From 4394e0e5ba5389b37976a4bacfe26678cd05efc3 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 21 Jan 2017 13:58:12 +0100
Subject: [PATCH 059/338] DOC: build fixes

---
 doc/source/timedeltas.rst       | 5 +++--
 doc/source/tutorials.rst        | 4 ++--
 doc/source/whatsnew/v0.20.0.txt | 5 +++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst
index e4a2b09f616a1..07effcfdff33b 100644
--- a/doc/source/timedeltas.rst
+++ b/doc/source/timedeltas.rst
@@ -312,17 +312,18 @@ similarly to the ``Series``. These are the *displayed* values of the ``Timedelta
 .. _timedeltas.isoformat:
 
-You can convert a ``Timedelta`` to an ISO 8601 Duration string with the
+You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the
 ``.isoformat`` method
 
 .. versionadded:: 0.20.0
 
 .. ipython:: python
+
    pd.Timedelta(days=6, minutes=50, seconds=3,
                 milliseconds=10, microseconds=10,
                 nanoseconds=12).isoformat()
 
-.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
+.. _ISO 8601 Duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
 .. _timedeltas.index:
diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst
index 5ff4e955a9ed2..c1c1c81915c46 100644
--- a/doc/source/tutorials.rst
+++ b/doc/source/tutorials.rst
@@ -57,7 +57,7 @@ See `How to use this cookbook `_.
-For more resources, please visit the main `repository `__.
+For more resources, please visit the main `repository `__.
 
 - `01 - Lesson: `_
   - Importing libraries
@@ -128,7 +128,7 @@ There are four sections covering selected topics as follows:
 Exercises for New Users
 -----------------------
 Practice your skills with real data sets and exercises.
-For more resources, please visit the main `repository `_.
+For more resources, please visit the main `repository `__.
 
 - `01 - Getting & Knowing Your Data `_
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 1d83dee7a92bf..2587c666acd07 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -83,6 +83,7 @@ URLs and paths are now both inferred using their file extensions. Additionally,
 support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
 
 .. ipython:: python
+
    url = 'https://github.com/{repo}/raw/{branch}/{path}'.format(
        repo = 'pandas-dev/pandas',
        branch = 'master',
@@ -147,7 +148,7 @@ Other enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. _whatsnew_0200.api_breaking.deprecate_ix
+.. _whatsnew_0200.api_breaking.deprecate_ix:
 
 Deprecate .ix
 ^^^^^^^^^^^^^
@@ -194,7 +195,7 @@ Using ``.iloc``. Here we will get the location of the 'A' column, then use *posi
 
     df.iloc[[0, 2], df.columns.get_loc('A')]
 
-.. _whatsnew_0200.api_breaking.index_map
+.. _whatsnew_0200.api_breaking.index_map:
 
 Map on Index types now return other Index types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

From 7c6b73ece52ff70b045e13780c3f2989c4521d51 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 16 Jan 2017 16:44:07 +0100
Subject: [PATCH 060/338] CLN/DEPR: remove deprecated pandas.rpy module
 (GH9602)

---
 ci/lint.sh                      |   3 +-
 doc/source/r_interface.rst      | 140 ++----------
 doc/source/whatsnew/v0.20.0.txt |   3 +
 pandas/api/tests/test_api.py    |   2 +-
 pandas/rpy/__init__.py          |  18 --
 pandas/rpy/base.py              |  14 --
 pandas/rpy/common.py            | 372 --------------------------------
 pandas/rpy/mass.py              |   2 -
 pandas/rpy/tests/__init__.py    |   0
 pandas/rpy/tests/test_common.py | 216 -------------------
 pandas/rpy/vars.py              |  22 --
 setup.py                        |   1 -
 12 files changed, 27 insertions(+), 766 deletions(-)
 delete mode 100644 pandas/rpy/__init__.py
 delete mode 100644 pandas/rpy/base.py
 delete mode 100644 pandas/rpy/common.py
 delete mode 100644 pandas/rpy/mass.py
 delete mode 100644 pandas/rpy/tests/__init__.py
 delete mode 100644 pandas/rpy/tests/test_common.py
 delete mode 100644 pandas/rpy/vars.py

diff --git a/ci/lint.sh b/ci/lint.sh
index 5c4613f88649e..2cbfdadf486b8 100755
--- a/ci/lint.sh
+++ b/ci/lint.sh
@@ -8,10 +8,9 @@ RET=0
 
 if [ "$LINT" ]; then
 
-    # pandas/rpy is deprecated and will be removed.
     # pandas/src is C code, so no need to search there.
     echo "Linting *.py"
-    flake8 pandas --filename=*.py --exclude pandas/rpy,pandas/src
+    flake8 pandas --filename=*.py --exclude pandas/src
     if [ $? -ne "0" ]; then
         RET=1
     fi
diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst
index f2a8668bbda91..b5d699cad69d5 100644
--- a/doc/source/r_interface.rst
+++ b/doc/source/r_interface.rst
@@ -1,5 +1,3 @@
-.. currentmodule:: pandas.rpy
-
 .. _rpy:
 
 .. ipython:: python
@@ -15,132 +13,55 @@ rpy2 / R interface
 
 .. warning::
 
-   In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be
-   removed in a future version**. Similar functionality can be accessed
-   through the `rpy2 `__ project.
-   See the :ref:`updating ` section for a guide to port your
-   code from the ``pandas.rpy`` to ``rpy2`` functions.
-
-
-.. _rpy.updating:
-
-Updating your code to use rpy2 functions
-----------------------------------------
-
-In v0.16.0, the ``pandas.rpy`` module has been **deprecated** and users are
-pointed to the similar functionality in ``rpy2`` itself (rpy2 >= 2.4).
+   Up to pandas 0.19, a ``pandas.rpy`` module existed with functionality to
+   convert between pandas and ``rpy2`` objects. This functionality now lives in
+   the `rpy2 `__ project itself.
+   See the `updating section `__
+   of the previous documentation for a guide to port your code from the
+   removed ``pandas.rpy`` to ``rpy2`` functions.
-Instead of importing ``import pandas.rpy.common as com``, the following imports -should be done to activate the pandas conversion support in rpy2:: - - from rpy2.robjects import pandas2ri - pandas2ri.activate() +`rpy2 `__ is an interface to R running embedded in a Python process, and also includes functionality to deal with pandas DataFrames. Converting data frames back and forth between rpy2 and pandas should be largely automated (no need to convert explicitly, it will be done on the fly in most rpy2 functions). - To convert explicitly, the functions are ``pandas2ri.py2ri()`` and -``pandas2ri.ri2py()``. So these functions can be used to replace the existing -functions in pandas: - -- ``com.convert_to_r_dataframe(df)`` should be replaced with ``pandas2ri.py2ri(df)`` -- ``com.convert_robj(rdf)`` should be replaced with ``pandas2ri.ri2py(rdf)`` - -Note: these functions are for the latest version (rpy2 2.5.x) and were called -``pandas2ri.pandas2ri()`` and ``pandas2ri.ri2pandas()`` previously. - -Some of the other functionality in `pandas.rpy` can be replaced easily as well. -For example to load R data as done with the ``load_data`` function, the -current method:: - - df_iris = com.load_data('iris') - -can be replaced with:: - - from rpy2.robjects import r - r.data('iris') - df_iris = pandas2ri.ri2py(r[name]) - -The ``convert_to_r_matrix`` function can be replaced by the normal -``pandas2ri.py2ri`` to convert dataframes, with a subsequent call to R -``as.matrix`` function. - -.. warning:: - - Not all conversion functions in rpy2 are working exactly the same as the - current methods in pandas. If you experience problems or limitations in - comparison to the ones in pandas, please report this at the - `issue tracker `_. - -See also the documentation of the `rpy2 `__ project. - +``pandas2ri.ri2py()``. -R interface with rpy2 ---------------------- -If your computer has R and rpy2 (> 2.2) installed (which will be left to the -reader), you will be able to leverage the below functionality. On Windows, -doing this is quite an ordeal at the moment, but users on Unix-like systems -should find it quite easy. rpy2 evolves in time, and is currently reaching -its release 2.3, while the current interface is -designed for the 2.2.x series. We recommend to use 2.2.x over other series -unless you are prepared to fix parts of the code, yet the rpy2-2.3.0 -introduces improvements such as a better R-Python bridge memory management -layer so it might be a good idea to bite the bullet and submit patches for -the few minor differences that need to be fixed. +See also the documentation of the `rpy2 `__ project: https://rpy2.readthedocs.io. +In the remainder of this page, a few examples of explicit conversion is given. The pandas conversion of rpy2 needs first to be activated: -:: - - # if installing for the first time - hg clone http://bitbucket.org/lgautier/rpy2 - - cd rpy2 - hg pull - hg update version_2.2.x - sudo python setup.py install - -.. note:: - - To use R packages with this interface, you will need to install - them inside R yourself. At the moment it cannot install them for - you. +.. ipython:: python -Once you have done installed R and rpy2, you should be able to import -``pandas.rpy.common`` without a hitch. 
+ from rpy2.robjects import r, pandas2ri + pandas2ri.activate() Transferring R data sets into Python ------------------------------------ -The **load_data** function retrieves an R data set and converts it to the +The ``pandas2ri.ri2py`` function retrieves an R data set and converts it to the appropriate pandas object (most likely a DataFrame): - .. ipython:: python - :okwarning: - - import pandas.rpy.common as com - infert = com.load_data('infert') - infert.head() + r.data('iris') + df_iris = pandas2ri.ri2py(r['iris']) + df_iris.head() Converting DataFrames into R objects ------------------------------------ -.. versionadded:: 0.8 - -Starting from pandas 0.8, there is **experimental** support to convert +The ``pandas2ri.py2ri`` function support the reverse operation to convert DataFrames into the equivalent R object (that is, **data.frame**): .. ipython:: python - import pandas.rpy.common as com df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, index=["one", "two", "three"]) - r_dataframe = com.convert_to_r_dataframe(df) - + r_dataframe = pandas2ri.py2ri(df) print(type(r_dataframe)) print(r_dataframe) @@ -148,24 +69,7 @@ DataFrames into the equivalent R object (that is, **data.frame**): The DataFrame's index is stored as the ``rownames`` attribute of the data.frame instance. -You can also use **convert_to_r_matrix** to obtain a ``Matrix`` instance, but -bear in mind that it will only work with homogeneously-typed DataFrames (as -R matrices bear no information on the data type): - -.. ipython:: python - - import pandas.rpy.common as com - r_matrix = com.convert_to_r_matrix(df) - - print(type(r_matrix)) - print(r_matrix) - - -Calling R functions with pandas objects ---------------------------------------- - - - -High-level interface to R estimators ------------------------------------- +.. + Calling R functions with pandas objects + High-level interface to R estimators diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2587c666acd07..e862b94f878fa 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -343,6 +343,9 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- The ``pandas.rpy`` module is removed. Similar functionality can be accessed + through the `rpy2 `__ project. + See the :ref:`R interfacing docs ` for more details. - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 78dfe46914200..410d70c65404f 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -30,7 +30,7 @@ class TestPDApi(Base, tm.TestCase): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'rpy', 'locale'] + ignored = ['tests', 'locale'] # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', diff --git a/pandas/rpy/__init__.py b/pandas/rpy/__init__.py deleted file mode 100644 index b771a3d8374a3..0000000000000 --- a/pandas/rpy/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ - -# GH9602 -# deprecate rpy to instead directly use rpy2 - -# flake8: noqa - -import warnings -warnings.warn("The pandas.rpy module is deprecated and will be " - "removed in a future version. We refer to external packages " - "like rpy2. 
" - "\nSee here for a guide on how to port your code to rpy2: " - "http://pandas.pydata.org/pandas-docs/stable/r_interface.html", - FutureWarning, stacklevel=2) - -try: - from .common import importr, r, load_data -except ImportError: - pass diff --git a/pandas/rpy/base.py b/pandas/rpy/base.py deleted file mode 100644 index ac339dd366b0b..0000000000000 --- a/pandas/rpy/base.py +++ /dev/null @@ -1,14 +0,0 @@ -# flake8: noqa - -import pandas.rpy.util as util - - -class lm(object): - """ - Examples - -------- - >>> model = lm('x ~ y + z', data) - >>> model.coef - """ - def __init__(self, formula, data): - pass diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py deleted file mode 100644 index 95a072154dc68..0000000000000 --- a/pandas/rpy/common.py +++ /dev/null @@ -1,372 +0,0 @@ -""" -Utilities for making working with rpy2 more user- and -developer-friendly. -""" - -# flake8: noqa - -from __future__ import print_function - -from distutils.version import LooseVersion -from pandas.compat import zip, range -import numpy as np - -import pandas as pd -import pandas.core.common as com -import pandas.util.testing as _test - -from rpy2.robjects.packages import importr -from rpy2.robjects import r -import rpy2.robjects as robj - -import itertools as IT - - -__all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe', - 'convert_to_r_matrix'] - - -def load_data(name, package=None, convert=True): - if package: - importr(package) - - r.data(name) - - robj = r[name] - - if convert: - return convert_robj(robj) - else: - return robj - - -def _rclass(obj): - """ - Return R class name for input object - """ - return r['class'](obj)[0] - - -def _is_null(obj): - return _rclass(obj) == 'NULL' - - -def _convert_list(obj): - """ - Convert named Vector to dict, factors to list - """ - try: - values = [convert_robj(x) for x in obj] - keys = r['names'](obj) - return dict(zip(keys, values)) - except TypeError: - # For state.division and state.region - factors = list(r['factor'](obj)) - level = list(r['levels'](obj)) - result = [level[index-1] for index in factors] - return result - - -def _convert_array(obj): - """ - Convert Array to DataFrame - """ - def _list(item): - try: - return list(item) - except TypeError: - return [] - - # For iris3, HairEyeColor, UCBAdmissions, Titanic - dim = list(obj.dim) - values = np.array(list(obj)) - names = r['dimnames'](obj) - try: - columns = list(r['names'](names))[::-1] - except TypeError: - columns = ['X{:d}'.format(i) for i in range(len(names))][::-1] - columns.append('value') - name_list = [(_list(x) or range(d)) for x, d in zip(names, dim)][::-1] - arr = np.array(list(IT.product(*name_list))) - arr = np.column_stack([arr,values]) - df = pd.DataFrame(arr, columns=columns) - return df - - -def _convert_vector(obj): - if isinstance(obj, robj.IntVector): - return _convert_int_vector(obj) - elif isinstance(obj, robj.StrVector): - return _convert_str_vector(obj) - # Check if the vector has extra information attached to it that can be used - # as an index - try: - attributes = set(r['attributes'](obj).names) - except AttributeError: - return list(obj) - if 'names' in attributes: - return pd.Series(list(obj), index=r['names'](obj)) - elif 'tsp' in attributes: - return pd.Series(list(obj), index=r['time'](obj)) - elif 'labels' in attributes: - return pd.Series(list(obj), index=r['labels'](obj)) - if _rclass(obj) == 'dist': - # For 'eurodist'. WARNING: This results in a DataFrame, not a Series or list. 
- matrix = r['as.matrix'](obj) - return convert_robj(matrix) - else: - return list(obj) - -NA_INTEGER = -2147483648 - - -def _convert_int_vector(obj): - arr = np.asarray(obj) - mask = arr == NA_INTEGER - if mask.any(): - arr = arr.astype(float) - arr[mask] = np.nan - return arr - - -def _convert_str_vector(obj): - arr = np.asarray(obj, dtype=object) - mask = arr == robj.NA_Character - if mask.any(): - arr[mask] = np.nan - return arr - - -def _convert_DataFrame(rdf): - columns = list(rdf.colnames) - rows = np.array(rdf.rownames) - - data = {} - for i, col in enumerate(columns): - vec = rdf.rx2(i + 1) - values = _convert_vector(vec) - - if isinstance(vec, robj.FactorVector): - levels = np.asarray(vec.levels) - if com.is_float_dtype(values): - mask = np.isnan(values) - notmask = -mask - result = np.empty(len(values), dtype=object) - result[mask] = np.nan - - locs = (values[notmask] - 1).astype(np.int_) - result[notmask] = levels.take(locs) - values = result - else: - values = np.asarray(vec.levels).take(values - 1) - - data[col] = values - - return pd.DataFrame(data, index=_check_int(rows), columns=columns) - - -def _convert_Matrix(mat): - columns = mat.colnames - rows = mat.rownames - - columns = None if _is_null(columns) else list(columns) - index = r['time'](mat) if _is_null(rows) else list(rows) - return pd.DataFrame(np.array(mat), index=_check_int(index), - columns=columns) - - -def _check_int(vec): - try: - # R observation numbers come through as strings - vec = vec.astype(int) - except Exception: - pass - - return vec - -_pandas_converters = [ - (robj.DataFrame, _convert_DataFrame), - (robj.Matrix, _convert_Matrix), - (robj.StrVector, _convert_vector), - (robj.FloatVector, _convert_vector), - (robj.Array, _convert_array), - (robj.Vector, _convert_list), -] - -_converters = [ - (robj.DataFrame, lambda x: _convert_DataFrame(x).toRecords(index=False)), - (robj.Matrix, lambda x: _convert_Matrix(x).toRecords(index=False)), - (robj.IntVector, _convert_vector), - (robj.StrVector, _convert_vector), - (robj.FloatVector, _convert_vector), - (robj.Array, _convert_array), - (robj.Vector, _convert_list), -] - - -def convert_robj(obj, use_pandas=True): - """ - Convert rpy2 object to a pandas-friendly form - - Parameters - ---------- - obj : rpy2 object - - Returns - ------- - Non-rpy data structure, mix of NumPy and pandas objects - """ - if not isinstance(obj, robj.RObjectMixin): - return obj - - converters = _pandas_converters if use_pandas else _converters - - for rpy_type, converter in converters: - if isinstance(obj, rpy_type): - return converter(obj) - - raise TypeError('Do not know what to do with %s object' % type(obj)) - - -def convert_to_r_posixct(obj): - """ - Convert DatetimeIndex or np.datetime array to R POSIXct using - m8[s] format. 
- - Parameters - ---------- - obj : source pandas object (one of [DatetimeIndex, np.datetime]) - - Returns - ------- - An R POSIXct vector (rpy2.robjects.vectors.POSIXct) - - """ - import time - from rpy2.rinterface import StrSexpVector - - # convert m8[ns] to m8[s] - vals = robj.vectors.FloatSexpVector(obj.values.view('i8') / 1E9) - as_posixct = robj.baseenv.get('as.POSIXct') - origin = StrSexpVector([time.strftime("%Y-%m-%d", - time.gmtime(0)), ]) - - # We will be sending ints as UTC - tz = obj.tz.zone if hasattr( - obj, 'tz') and hasattr(obj.tz, 'zone') else 'UTC' - tz = StrSexpVector([tz]) - utc_tz = StrSexpVector(['UTC']) - - posixct = as_posixct(vals, origin=origin, tz=utc_tz) - posixct.do_slot_assign('tzone', tz) - return posixct - - -VECTOR_TYPES = {np.float64: robj.FloatVector, - np.float32: robj.FloatVector, - np.float: robj.FloatVector, - np.int: robj.IntVector, - np.int32: robj.IntVector, - np.int64: robj.IntVector, - np.object_: robj.StrVector, - np.str: robj.StrVector, - np.bool: robj.BoolVector} - - -NA_TYPES = {np.float64: robj.NA_Real, - np.float32: robj.NA_Real, - np.float: robj.NA_Real, - np.int: robj.NA_Integer, - np.int32: robj.NA_Integer, - np.int64: robj.NA_Integer, - np.object_: robj.NA_Character, - np.str: robj.NA_Character, - np.bool: robj.NA_Logical} - - -if LooseVersion(np.__version__) >= LooseVersion('1.8'): - for dict_ in (VECTOR_TYPES, NA_TYPES): - dict_.update({ - np.bool_: dict_[np.bool], - np.int_: dict_[np.int], - np.float_: dict_[np.float], - np.string_: dict_[np.str] - }) - - -def convert_to_r_dataframe(df, strings_as_factors=False): - """ - Convert a pandas DataFrame to a R data.frame. - - Parameters - ---------- - df: The DataFrame being converted - strings_as_factors: Whether to turn strings into R factors (default: False) - - Returns - ------- - A R data.frame - - """ - - import rpy2.rlike.container as rlc - - columns = rlc.OrdDict() - - # FIXME: This doesn't handle MultiIndex - - for column in df: - value = df[column] - value_type = value.dtype.type - - if value_type == np.datetime64: - value = convert_to_r_posixct(value) - else: - value = [item if pd.notnull(item) else NA_TYPES[value_type] - for item in value] - - value = VECTOR_TYPES[value_type](value) - - if not strings_as_factors: - I = robj.baseenv.get("I") - value = I(value) - - columns[column] = value - - r_dataframe = robj.DataFrame(columns) - - del columns - - r_dataframe.rownames = robj.StrVector(df.index) - - return r_dataframe - - -def convert_to_r_matrix(df, strings_as_factors=False): - - """ - Convert a pandas DataFrame to a R matrix. 
- - Parameters - ---------- - df: The DataFrame being converted - strings_as_factors: Whether to turn strings into R factors (default: False) - - Returns - ------- - A R matrix - - """ - - if df._is_mixed_type: - raise TypeError("Conversion to matrix only possible with non-mixed " - "type DataFrames") - - r_dataframe = convert_to_r_dataframe(df, strings_as_factors) - as_matrix = robj.baseenv.get("as.matrix") - r_matrix = as_matrix(r_dataframe) - - return r_matrix - -if __name__ == '__main__': - pass diff --git a/pandas/rpy/mass.py b/pandas/rpy/mass.py deleted file mode 100644 index 12fbbdfa4dc98..0000000000000 --- a/pandas/rpy/mass.py +++ /dev/null @@ -1,2 +0,0 @@ -class rlm(object): - pass diff --git a/pandas/rpy/tests/__init__.py b/pandas/rpy/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py deleted file mode 100644 index c3f09e21b1545..0000000000000 --- a/pandas/rpy/tests/test_common.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Testing that functions from rpy work as expected -""" - -# flake8: noqa - -import pandas as pd -import numpy as np -import unittest -import nose -import warnings -import pandas.util.testing as tm - -try: - import pandas.rpy.common as com - from rpy2.robjects import r - import rpy2.robjects as robj -except ImportError: - raise nose.SkipTest('R not installed') - - -class TestCommon(unittest.TestCase): - def test_convert_list(self): - obj = r('list(a=1, b=2, c=3)') - - converted = com.convert_robj(obj) - expected = {'a': [1], 'b': [2], 'c': [3]} - - tm.assert_dict_equal(converted, expected) - - def test_convert_nested_list(self): - obj = r('list(a=list(foo=1, bar=2))') - - converted = com.convert_robj(obj) - expected = {'a': {'foo': [1], 'bar': [2]}} - - tm.assert_dict_equal(converted, expected) - - def test_convert_frame(self): - # built-in dataset - df = r['faithful'] - - converted = com.convert_robj(df) - - assert np.array_equal(converted.columns, ['eruptions', 'waiting']) - assert np.array_equal(converted.index, np.arange(1, 273)) - - def _test_matrix(self): - r('mat <- matrix(rnorm(9), ncol=3)') - r('colnames(mat) <- c("one", "two", "three")') - r('rownames(mat) <- c("a", "b", "c")') - - return r['mat'] - - def test_convert_matrix(self): - mat = self._test_matrix() - - converted = com.convert_robj(mat) - - assert np.array_equal(converted.index, ['a', 'b', 'c']) - assert np.array_equal(converted.columns, ['one', 'two', 'three']) - - def test_convert_r_dataframe(self): - - is_na = robj.baseenv.get("is.na") - - seriesd = tm.getSeriesData() - frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) - - # Null data - frame["E"] = [np.nan for item in frame["A"]] - # Some mixed type data - frame["F"] = ["text" if item % - 2 == 0 else np.nan for item in range(30)] - - r_dataframe = com.convert_to_r_dataframe(frame) - - assert np.array_equal( - com.convert_robj(r_dataframe.rownames), frame.index) - assert np.array_equal( - com.convert_robj(r_dataframe.colnames), frame.columns) - assert all(is_na(item) for item in r_dataframe.rx2("E")) - - for column in frame[["A", "B", "C", "D"]]: - coldata = r_dataframe.rx2(column) - original_data = frame[column] - assert np.array_equal(com.convert_robj(coldata), original_data) - - for column in frame[["D", "E"]]: - for original, converted in zip(frame[column], - r_dataframe.rx2(column)): - - if pd.isnull(original): - assert is_na(converted) - else: - assert original == converted - - def test_convert_r_matrix(self): - - is_na = 
robj.baseenv.get("is.na") - - seriesd = tm.getSeriesData() - frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) - # Null data - frame["E"] = [np.nan for item in frame["A"]] - - r_dataframe = com.convert_to_r_matrix(frame) - - assert np.array_equal( - com.convert_robj(r_dataframe.rownames), frame.index) - assert np.array_equal( - com.convert_robj(r_dataframe.colnames), frame.columns) - assert all(is_na(item) for item in r_dataframe.rx(True, "E")) - - for column in frame[["A", "B", "C", "D"]]: - coldata = r_dataframe.rx(True, column) - original_data = frame[column] - assert np.array_equal(com.convert_robj(coldata), - original_data) - - # Pandas bug 1282 - frame["F"] = ["text" if item % - 2 == 0 else np.nan for item in range(30)] - - try: - wrong_matrix = com.convert_to_r_matrix(frame) - except TypeError: - pass - except Exception: - raise - - def test_dist(self): - for name in ('eurodist',): - df = com.load_data(name) - dist = r[name] - labels = r['labels'](dist) - assert np.array_equal(df.index, labels) - assert np.array_equal(df.columns, labels) - - def test_timeseries(self): - """ - Test that the series has an informative index. - Unfortunately the code currently does not build a DateTimeIndex - """ - for name in ( - 'austres', 'co2', 'fdeaths', 'freeny.y', 'JohnsonJohnson', - 'ldeaths', 'mdeaths', 'nottem', 'presidents', 'sunspot.month', 'sunspots', - 'UKDriverDeaths', 'UKgas', 'USAccDeaths', - 'airmiles', 'discoveries', 'EuStockMarkets', - 'LakeHuron', 'lh', 'lynx', 'nhtemp', 'Nile', - 'Seatbelts', 'sunspot.year', 'treering', 'uspop'): - series = com.load_data(name) - ts = r[name] - assert np.array_equal(series.index, r['time'](ts)) - - def test_numeric(self): - for name in ('euro', 'islands', 'precip'): - series = com.load_data(name) - numeric = r[name] - names = numeric.names - assert np.array_equal(series.index, names) - - def test_table(self): - iris3 = pd.DataFrame({'X0': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'}, - 'X1': {0: 'Sepal L.', - 1: 'Sepal L.', - 2: 'Sepal L.', - 3: 'Sepal L.', - 4: 'Sepal L.'}, - 'X2': {0: 'Setosa', - 1: 'Setosa', - 2: 'Setosa', - 3: 'Setosa', - 4: 'Setosa'}, - 'value': {0: '5.1', 1: '4.9', 2: '4.7', 3: '4.6', 4: '5.0'}}) - hec = pd.DataFrame( - { - 'Eye': {0: 'Brown', 1: 'Brown', 2: 'Brown', 3: 'Brown', 4: 'Blue'}, - 'Hair': {0: 'Black', 1: 'Brown', 2: 'Red', 3: 'Blond', 4: 'Black'}, - 'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Male'}, - 'value': {0: '32.0', 1: '53.0', 2: '10.0', 3: '3.0', 4: '11.0'}}) - titanic = pd.DataFrame( - { - 'Age': {0: 'Child', 1: 'Child', 2: 'Child', 3: 'Child', 4: 'Child'}, - 'Class': {0: '1st', 1: '2nd', 2: '3rd', 3: 'Crew', 4: '1st'}, - 'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Female'}, - 'Survived': {0: 'No', 1: 'No', 2: 'No', 3: 'No', 4: 'No'}, - 'value': {0: '0.0', 1: '0.0', 2: '35.0', 3: '0.0', 4: '0.0'}}) - for name, expected in zip(('HairEyeColor', 'Titanic', 'iris3'), - (hec, titanic, iris3)): - df = com.load_data(name) - table = r[name] - names = r['dimnames'](table) - try: - columns = list(r['names'](names))[::-1] - except TypeError: - columns = ['X{:d}'.format(i) for i in range(len(names))][::-1] - columns.append('value') - assert np.array_equal(df.columns, columns) - result = df.head() - cond = ((result.sort(axis=1) == expected.sort(axis=1))).values - assert np.all(cond) - - def test_factor(self): - for name in ('state.division', 'state.region'): - vector = r[name] - factors = list(r['factor'](vector)) - level = list(r['levels'](vector)) - factors = [level[index - 1] for index 
in factors] - result = com.load_data(name) - assert np.equal(result, factors) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - # '--with-coverage', '--cover-package=pandas.core'], - exit=False) diff --git a/pandas/rpy/vars.py b/pandas/rpy/vars.py deleted file mode 100644 index 2073b47483141..0000000000000 --- a/pandas/rpy/vars.py +++ /dev/null @@ -1,22 +0,0 @@ -# flake8: noqa - -import pandas.rpy.util as util - - -class VAR(object): - """ - - Parameters - ---------- - y : - p : - type : {"const", "trend", "both", "none"} - season : - exogen : - lag_max : - ic : {"AIC", "HQ", "SC", "FPE"} - Information criterion to use, if lag_max is not None - """ - def __init__(y, p=1, type="none", season=None, exogen=None, - lag_max=None, ic=None): - pass diff --git a/setup.py b/setup.py index a53464f8f7987..8cdd72b2f4682 100755 --- a/setup.py +++ b/setup.py @@ -631,7 +631,6 @@ def pxd(name): 'pandas.io', 'pandas.io.sas', 'pandas.formats', - 'pandas.rpy', 'pandas.sparse', 'pandas.sparse.tests', 'pandas.stats', From 13f16c483f06aedeae56e5845c1554f50c78b82a Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 22 Jan 2017 10:32:21 -0500 Subject: [PATCH 061/338] BUG, TST: Patch handling of uint64 in algorithms 1) Patches handling of `uint64` in `value_counts` 2) Adds tests for handling of `uint64` in `factorize` xref #14934 Author: gfyoung Closes #15162 from gfyoung/core-algorithms-uint64-three and squashes the following commits: 1fb256b [gfyoung] BUG, TST: Patch uint64 behavior in value_counts and factorize --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 8 +-- pandas/indexes/base.py | 78 +++++++++++++++++++--------- pandas/tests/indexes/test_numeric.py | 4 ++ pandas/tests/test_algos.py | 30 +++++++++++ 5 files changed, 92 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e862b94f878fa..79ca5a5e13d02 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -111,6 +111,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) - Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`) - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) +- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`) .. _whatsnew_0200.enhancements.other: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b4a61b26aceb3..05cfb1bd9ec27 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): - # TODO: This constructor is bugged for uint's, especially - # np.uint64 due to overflow. Test this for uint behavior - # once constructor has been fixed. 
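The TODO removed just above is obsolete once this patch lands: ``Index`` construction, ``factorize`` and ``value_counts`` handle uint64 without overflow or truncation. A doctest-style sketch of what the new tests below pin down (reprs as expected on a 64-bit build; treat the exact output as illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> labels, uniques = pd.factorize(np.array([2**63, 1, 2**63], dtype=np.uint64))
>>> labels
array([0, 1, 0])
>>> uniques
array([9223372036854775808, 1], dtype=uint64)
>>> pd.value_counts(np.array([2**63], dtype=np.uint64))
9223372036854775808    1
dtype: int64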
uniques = Index(uniques) return labels, uniques @@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True): if is_period_type: keys = PeriodIndex._simple_new(keys, freq=freq) - elif is_integer_dtype(dtype): + elif is_signed_integer_dtype(dtype): values = _ensure_int64(values) keys, counts = htable.value_count_int64(values, dropna) + elif is_unsigned_integer_dtype(dtype): + values = _ensure_uint64(values) + keys, counts = htable.value_count_uint64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) keys, counts = htable.value_count_float64(values, dropna) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index d0bf4edfbc5d2..2009a064afd73 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif inferred in ['floating', 'mixed-integer-float']: # If we are actually all equal to integers, - # then coerce to integer - from .numeric import (Int64Index, UInt64Index, - Float64Index) + # then coerce to integer. try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, - name=name) - except (OverflowError, TypeError, ValueError): + return cls._try_convert_to_int_index( + data, copy, name) + except ValueError: pass - # Conversion to int64 failed (possibly due to - # overflow), so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, - name=name) - except (TypeError, ValueError): - pass - - # return an actual float index + # Return an actual float index. + from .numeric import Float64Index return Float64Index(data, copy=copy, dtype=dtype, name=name) @@ -270,13 +257,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if dtype is None: inferred = lib.infer_dtype(subarr) if inferred == 'integer': - from .numeric import Int64Index, UInt64Index try: - return Int64Index(subarr.astype('i8'), copy=copy, - name=name) - except OverflowError: - return UInt64Index(subarr.astype('u8'), copy=copy, - name=name) + return cls._try_convert_to_int_index( + subarr, copy, name) + except ValueError: + pass + + return Index(subarr, copy=copy, + dtype=object, name=name) elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index return Float64Index(subarr, copy=copy, name=name) @@ -597,6 +585,46 @@ def ravel(self, order='C'): return self._values.ravel(order=order) # construction helpers + @classmethod + def _try_convert_to_int_index(cls, data, copy, name): + """ + Attempt to convert an array of data into an integer index. + + Parameters + ---------- + data : The data to convert. + copy : Whether to copy the data or not. + name : The name of the index returned. + + Returns + ------- + int_index : data converted to either an Int64Index or a + UInt64Index + + Raises + ------ + ValueError if the conversion was not successful. + """ + + from .numeric import Int64Index, UInt64Index + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to + # overflow), so let's try now with uint64. + try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, name=name) + except (TypeError, ValueError): + pass + + raise ValueError + @classmethod def _scalar_data_error(cls, data): raise TypeError('{0}(...) 
must be called with a collection of some ' diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 044d3477271ad..c7acbf51a17e5 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -919,6 +919,10 @@ def test_constructor(self): res = Index([1, 2**63]) tm.assert_index_equal(res, idx) + idx = Index([-1, 2**63], dtype=object) + res = Index(np.array([-1, 2**63], dtype=object)) + tm.assert_index_equal(res, idx) + def test_get_indexer(self): target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) indexer = self.index.get_indexer(target) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 75dd887c9d290..99453b9793007 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -287,6 +287,23 @@ def test_complex_sorting(self): self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True) + def test_uint64_factorize(self): + data = np.array([2**63, 1, 2**63], dtype=np.uint64) + exp_labels = np.array([0, 1, 0], dtype=np.intp) + exp_uniques = np.array([2**63, 1], dtype=np.uint64) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + + data = np.array([2**63, -1, 2**63], dtype=object) + exp_labels = np.array([0, 1, 0], dtype=np.intp) + exp_uniques = np.array([2**63, -1], dtype=object) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + class TestUnique(tm.TestCase): _multiprocess_can_split_ = True @@ -626,6 +643,19 @@ def test_value_counts_normalized(self): index=Series([2.0, 1.0], dtype=t)) tm.assert_series_equal(result, expected) + def test_value_counts_uint64(self): + arr = np.array([2**63], dtype=np.uint64) + expected = Series([1], index=[2**63]) + result = algos.value_counts(arr) + + tm.assert_series_equal(result, expected) + + arr = np.array([-1, 2**63], dtype=object) + expected = Series([1, 1], index=[-1, 2**63]) + result = algos.value_counts(arr) + + tm.assert_series_equal(result, expected) + class TestDuplicated(tm.TestCase): From e375f0a2bae755ad35245440e3bb2dd39422a76b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 22 Jan 2017 11:20:57 -0500 Subject: [PATCH 062/338] CI: add matplotlib, xlwt to build for 3.6 (#15189) --- ci/requirements-3.6.pip | 1 - ci/requirements-3.6.run | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ci/requirements-3.6.pip b/ci/requirements-3.6.pip index 688866a18542f..e69de29bb2d1d 100644 --- a/ci/requirements-3.6.pip +++ b/ci/requirements-3.6.pip @@ -1 +0,0 @@ -xlwt diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index daf07be2d1785..5d9cb05a7b402 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -5,10 +5,10 @@ scipy openpyxl xlsxwriter xlrd -# xlwt (installed via pip) +xlwt numexpr pytables -# matplotlib (not avail on defaults ATM) +matplotlib lxml html5lib jinja2 From 4b7b2fd270f9581764ae0e9a617e885bf19298dc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 22 Jan 2017 12:20:18 -0500 Subject: [PATCH 063/338] CLN: remove build warnings in join_asof indexers & algos (#15188) --- pandas/src/algos_common_helper.pxi.in | 6 +++--- pandas/src/algos_groupby_helper.pxi.in | 4 ++-- pandas/src/joins_func_helper.pxi.in | 17 ++++++++--------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 
a579a5020f6e7..5e87528943005 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -374,7 +374,7 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): n = len(arr) if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN return False, False, True else: @@ -382,14 +382,14 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): elif n < 2: return True, True, True - if timelike and arr[0] == iNaT: + if timelike and arr[0] == iNaT: return False, False, True {{nogil_str}} {{tab}}prev = arr[0] {{tab}}for i in range(1, n): {{tab}} cur = arr[i] - {{tab}} if timelike and cur == iNaT: + {{tab}} if timelike and cur == iNaT: {{tab}} is_monotonic_inc = 0 {{tab}} is_monotonic_dec = 0 {{tab}} break diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 70862e198edba..4b031afed3d43 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -580,7 +580,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, min_val + {{dest_type2}} val, min_val = 0 int64_t lab N, K = ( values).shape @@ -615,7 +615,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, max_val + {{dest_type2}} val, max_val = 0 int64_t lab N, K = ( values).shape diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in index 486c65368602b..9cca9bba2a197 100644 --- a/pandas/src/joins_func_helper.pxi.in +++ b/pandas/src/joins_func_helper.pxi.in @@ -41,8 +41,8 @@ def asof_join_backward_{{on_dtype}}_by_{{by_dtype}}( Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 - {{on_dtype}} tolerance_ - {{on_dtype}} diff + {{on_dtype}} tolerance_ = 0 + {{on_dtype}} diff = 0 {{table_type}} hash_table {{by_dtype}} by_value @@ -106,8 +106,8 @@ def asof_join_forward_{{on_dtype}}_by_{{by_dtype}}( Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 - {{on_dtype}} tolerance_ - {{on_dtype}} diff + {{on_dtype}} tolerance_ = 0 + {{on_dtype}} diff = 0 {{table_type}} hash_table {{by_dtype}} by_value @@ -236,8 +236,8 @@ def asof_join_backward_{{on_dtype}}( Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 - {{on_dtype}} tolerance_ - {{on_dtype}} diff + {{on_dtype}} tolerance_ = 0 + {{on_dtype}} diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -290,8 +290,8 @@ def asof_join_forward_{{on_dtype}}( Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 - {{on_dtype}} tolerance_ - {{on_dtype}} diff + {{on_dtype}} tolerance_ = 0 + {{on_dtype}} diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -371,4 +371,3 @@ def asof_join_nearest_{{on_dtype}}( return left_indexer, right_indexer {{endfor}} - From d30be64c4e6b8a205d7909ca0437ed224b8f3181 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 22 Jan 2017 16:42:56 -0500 Subject: [PATCH 064/338] BLD: remove more build warnings (#15191) --- pandas/src/numpy_helper.h | 2 +- pandas/tslib.pyx | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/pandas/src/numpy_helper.h b/pandas/src/numpy_helper.h index 17d5ec12f4f79..809edb2e99fa2 100644 --- a/pandas/src/numpy_helper.h +++ b/pandas/src/numpy_helper.h @@ -125,7 +125,7 @@ PyObject* sarr_from_data(PyArray_Descr* descr, int length, void* data) { void transfer_object_column(char* dst, char* src, size_t stride, size_t length) { - int i; + size_t i; size_t sz = sizeof(PyObject*); for (i = 0; i < length; ++i) { diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 15b279d20021a..7c65ab5422309 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1945,7 +1945,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, object freq): cdef: object ret - int year, quarter, month, mnum, date_len + int year, quarter = -1, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 assert util.is_string_object(date_string) From 51e32d0e7cbb66cc3387ff9c44cc4ace7f3437cc Mon Sep 17 00:00:00 2001 From: Sebastian Bank Date: Mon, 23 Jan 2017 08:38:39 -0500 Subject: [PATCH 065/338] API: add DataFrame.nunique() and DataFrameGroupBy.nunique() closes #14336 Author: Sebastian Bank Closes #14376 from xflr6/nunique and squashes the following commits: a0558e7 [Sebastian Bank] use apply()-kwargs instead of partial, more tests, better examples c8d3ac4 [Sebastian Bank] extend docs and tests fd0f22d [Sebastian Bank] add simple benchmarks 5c4b325 [Sebastian Bank] API: add DataFrame.nunique() and DataFrameGroupBy.nunique() --- asv_bench/benchmarks/frame_methods.py | 14 ++++++++ asv_bench/benchmarks/groupby.py | 16 +++++++++ doc/source/whatsnew/v0.20.0.txt | 3 ++ pandas/core/frame.py | 31 +++++++++++++++++ pandas/core/groupby.py | 48 +++++++++++++++++++++++++++ pandas/tests/frame/test_analytics.py | 16 +++++++++ pandas/tests/groupby/test_groupby.py | 38 ++++++++++++++++++--- 7 files changed, 161 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index adbe73aa5c5ef..9f491302a4d6f 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -433,6 +433,20 @@ def time_frame_from_records_generator_nrows(self): +#----------------------------------------------------------------------------- +# nunique + +class frame_nunique(object): + + def setup(self): + self.data = np.random.randn(10000, 1000) + self.df = DataFrame(self.data) + + def time_frame_nunique(self): + self.df.nunique() + + + #----------------------------------------------------------------------------- # duplicated diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 597b040b8075c..03ff62568b405 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -251,6 +251,22 @@ def time_groupby_int_count(self): self.df.groupby(['key1', 'key2']).count() +#---------------------------------------------------------------------- +# nunique() speed + +class groupby_nunique(object): + + def setup(self): + self.n = 10000 + self.df = DataFrame({'key1': randint(0, 500, size=self.n), + 'key2': randint(0, 100, size=self.n), + 'ints': randint(0, 1000, size=self.n), + 'ints2': randint(0, 1000, size=self.n), }) + + def time_groupby_nunique(self): + self.df.groupby(['key1', 'key2']).nunique() + + #---------------------------------------------------------------------- # group with different functions per column diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 79ca5a5e13d02..cc3bf696ee4c7 100644 --- 
a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -119,6 +119,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) +- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). +- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`). + - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 799e2f266a137..cf306034001db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4976,6 +4976,37 @@ def f(x): return Series(result, index=labels) + def nunique(self, axis=0, dropna=True): + """ + Return Series with number of distinct observations over requested + axis. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + dropna : boolean, default True + Don't include NaN in the counts. + + Returns + ------- + nunique : Series + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df.nunique() + A 3 + B 1 + + >>> df.nunique(axis=1) + 0 1 + 1 2 + 2 2 + """ + return self.apply(Series.nunique, axis=axis, dropna=dropna) + def idxmin(self, axis=0, skipna=True): """ Return index of first occurrence of minimum over requested axis. diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fc9cd894162e4..3bbf248ece1d3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3996,6 +3996,54 @@ def count(self): return self._wrap_agged_blocks(data.items, list(blk)) + def nunique(self, dropna=True): + """ + Return DataFrame with number of distinct observations per group for + each column. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + dropna : boolean, default True + Don't include NaN in the counts. + + Returns + ------- + nunique: DataFrame + + Examples + -------- + >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', + ... 'ham', 'ham'], + ... 'value1': [1, 5, 5, 2, 5, 5], + ... 
'value2': list('abbaxy')}) + >>> df + id value1 value2 + 0 spam 1 a + 1 egg 5 b + 2 egg 5 b + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + + >>> df.groupby('id').nunique() + id value1 value2 + id + egg 1 1 1 + ham 1 1 2 + spam 1 2 1 + + # check for rows with the same id but conflicting values + >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + id value1 value2 + 0 spam 1 a + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + """ + return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna)) + from pandas.tools.plotting import boxplot_frame_groupby # noqa DataFrameGroupBy.boxplot = boxplot_frame_groupby diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 7e50fbd7378e0..5d51306363053 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -16,6 +16,7 @@ MultiIndex, date_range, Timestamp) import pandas as pd import pandas.core.nanops as nanops +import pandas.core.algorithms as algorithms import pandas.formats.printing as printing import pandas.util.testing as tm @@ -411,6 +412,21 @@ def test_count(self): expected = Series(0, index=[]) tm.assert_series_equal(result, expected) + def test_nunique(self): + f = lambda s: len(algorithms.unique1d(s.dropna())) + self._check_stat_op('nunique', f, has_skipna=False, + check_dtype=False, check_dates=True) + + df = DataFrame({'A': [1, 1, 1], + 'B': [1, 2, 3], + 'C': [1, np.nan, 3]}) + tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2})) + tm.assert_series_equal(df.nunique(dropna=False), + Series({'A': 1, 'B': 3, 'C': 3})) + tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) + tm.assert_series_equal(df.nunique(axis=1, dropna=False), + Series({0: 1, 1: 3, 2: 2})) + def test_sum(self): self._check_stat_op('sum', np.sum, has_numeric_only=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bbfd934946af7..7140eb5a6fd12 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2938,6 +2938,34 @@ def test_count_cross_type(self): # GH8169 result = df.groupby(['c', 'd']).count() tm.assert_frame_equal(result, expected) + def test_nunique(self): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + def test_non_cython_api(self): # GH5610 @@ -5281,11 +5309,11 @@ def test_tab_completion(self): 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'head', 'irow', 'describe', 'cummax', 'quantile', 'rank', - 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', - 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take', - 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', - 'dtypes', 'ndim', 
'diff', 'idxmax', 'idxmin', + 'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) self.assertEqual(results, expected) From 726a028723a88489a3b55e7b6a11067f0e9bdfc4 Mon Sep 17 00:00:00 2001 From: Joost Kranendonk Date: Mon, 23 Jan 2017 13:06:38 -0500 Subject: [PATCH 066/338] BUG: Fix for .str.replace with repl function .str.replace now accepts a callable (function) as replacement string. It now raises a TypeError when repl is not string like nor a callable. Docstring updated accordingly. closes #15055 Author: Joost Kranendonk Author: Joost Kranendonk Closes #15056 from hzpc-joostk/pandas-GH15055-patch-1 and squashes the following commits: 826730c [Joost Kranendonk] simplify .str.replace TypeError reraising and test 90779ce [Joost Kranendonk] fix linting issues c2cc13a [Joost Kranendonk] Update v0.20.0.txt e15dcdf [Joost Kranendonk] fix bug catch TypeError with wrong number of args 40c0d97 [Joost Kranendonk] improve .str.replace with callable f15ee2a [Joost Kranendonk] improve test .str.replace with callable 14beb21 [Joost Kranendonk] Add test for .str.replace with regex named groups 27065a2 [Joost Kranendonk] Reraise TypeError only with wrong number of args ae04a3e [Joost Kranendonk] Add whatsnew for .str.replace with callable repl 067a7a8 [Joost Kranendonk] Fix testing bug for .str.replace 30d4727 [Joost Kranendonk] Bug fix in .str.replace type checking done wrong 4baa0a7 [Joost Kranendonk] add tests for .str.replace with callable repl 91c883d [Joost Kranendonk] Update .str.replace docstring 6ecc43d [Joost Kranendonk] BUG: Fix for .str.replace with repl function --- doc/source/text.rst | 21 +++++++++- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/strings.py | 70 +++++++++++++++++++++++++++++---- pandas/tests/test_strings.py | 37 +++++++++++++++++ 4 files changed, 121 insertions(+), 8 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 3a4a57ff4da95..52e05c5d511bc 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -146,6 +146,25 @@ following code will cause trouble because of the regular expression meaning of # We need to escape the special character (for >1 len patterns) dollars.str.replace(r'-\$', '-') +The ``replace`` method can also take a callable as replacement. It is called +on every ``pat`` using :func:`re.sub`. The callable should expect one +positional argument (a regex object) and return a string. + +.. versionadded:: 0.20.0 + +.. 
ipython:: python + + # Reverse every lowercase alphabetic word + pat = r'[a-z]+' + repl = lambda m: m.group(0)[::-1] + pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(pat, repl) + + # Using regex groups + pat = r"(?P\w+) (?P\w+) (?P\w+)" + repl = lambda m: m.group('two').swapcase() + pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) + + Indexing with ``.str`` ---------------------- @@ -406,7 +425,7 @@ Method Summary :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex with some other string + :meth:`~Series.str.replace`;Replace occurrences of pattern/regex with some other string or the return value of a callable given the occurrence :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" :meth:`~Series.str.center`;Equivalent to ``str.center`` diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cc3bf696ee4c7..7e4fa44ea8ded 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -24,6 +24,7 @@ New features ~~~~~~~~~~~~ - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. +- ``.str.replace`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3041b17b99b17..c48defe39a011 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -167,7 +167,17 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): try: convert = not all(mask) result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError): + except (TypeError, AttributeError) as e: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + if compat.PY2: + p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?' + else: + p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' + r'(?(3)required )positional arguments?') + + if len(e.args) >= 1 and re.search(p_err, e.args[0]): + raise e def g(x): try: @@ -303,8 +313,13 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): ---------- pat : string Character sequence or regular expression - repl : string - Replacement sequence + repl : string or callable + Replacement string or a callable. The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + + .. versionadded:: 0.20.0 + n : int, default -1 (all) Number of replacements to make from start case : boolean, default True @@ -315,12 +330,53 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Returns ------- replaced : Series/Index of objects + + Examples + -------- + When ``repl`` is a string, every ``pat`` is replaced as with + :meth:`str.replace`. NaN value(s) in the Series are left as is. + + >>> Series(['foo', 'fuz', np.nan]).str.replace('f', 'b') + 0 boo + 1 buz + 2 NaN + dtype: object + + When ``repl`` is a callable, it is called on every ``pat`` using + :func:`re.sub`. 
The callable should expect one positional argument + (a regex object) and return a string. + + To get the idea: + + >>> Series(['foo', 'fuz', np.nan]).str.replace('f', repr) + 0 <_sre.SRE_Match object; span=(0, 1), match='f'>oo + 1 <_sre.SRE_Match object; span=(0, 1), match='f'>uz + 2 NaN + dtype: object + + Reverse every lowercase alphabetic word: + + >>> repl = lambda m: m.group(0)[::-1] + >>> Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) + 0 oof 123 + 1 rab zab + 2 NaN + dtype: object + + Using regex groups: + + >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" + >>> repl = lambda m: m.group('two').swapcase() + >>> Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) + 0 bAR + 1 NaN + dtype: object """ - # Check whether repl is valid (GH 13438) - if not is_string_like(repl): - raise TypeError("repl must be a string") - use_re = not case or len(pat) > 1 or flags + # Check whether repl is valid (GH 13438, GH 15055) + if not (is_string_like(repl) or callable(repl)): + raise TypeError("repl must be a string or callable") + use_re = not case or len(pat) > 1 or flags or callable(repl) if use_re: if not case: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bb1aab3bc7ffb..f59127c853ed1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -436,6 +436,43 @@ def test_replace(self): values = klass(data) self.assertRaises(TypeError, values.str.replace, 'a', repl) + def test_replace_callable(self): + # GH 15055 + values = Series(['fooBAD__barBAD', NA]) + + # test with callable + repl = lambda m: m.group(0).swapcase() + result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) + exp = Series(['foObaD__baRbaD', NA]) + tm.assert_series_equal(result, exp) + + # test with wrong number of arguments, raising an error + if compat.PY2: + p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?' 
+ else: + p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' + r'(?(3)required )positional arguments?') + + repl = lambda: None + with tm.assertRaisesRegexp(TypeError, p_err): + values.str.replace('a', repl) + + repl = lambda m, x: None + with tm.assertRaisesRegexp(TypeError, p_err): + values.str.replace('a', repl) + + repl = lambda m, x, y=None: None + with tm.assertRaisesRegexp(TypeError, p_err): + values.str.replace('a', repl) + + # test regex named groups + values = Series(['Foo Bar Baz', NA]) + pat = r"(?P\w+) (?P\w+) (?P\w+)" + repl = lambda m: m.group('middle').swapcase() + result = values.str.replace(pat, repl) + exp = Series(['bAR', NA]) + tm.assert_series_equal(result, exp) + def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) From 983fdd2893a5f34801b1364911934dda86f66b3d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 24 Jan 2017 09:19:28 -0500 Subject: [PATCH 067/338] PERF: DataFrame.groupby.nunique closes #15197 Author: Jeff Reback Closes #15201 from jreback/nunique and squashes the following commits: 6d02616 [Jeff Reback] PERF: DataFrame.groupby.nunique --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/groupby.py | 23 +++++++++++++++++++++-- pandas/tests/groupby/test_groupby.py | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7e4fa44ea8ded..c738d8d9c3499 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -121,7 +121,7 @@ Other enhancements - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). -- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`). +- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`). - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. 
'0.5min' is parsed as '30s') (:issue:`8419`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3bbf248ece1d3..84858ef5cf8b3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -34,7 +34,9 @@ from pandas.types.cast import _possibly_downcast_to_dtype from pandas.types.missing import isnull, notnull, _maybe_fill -from pandas.core.common import _values_from_object, AbstractMethodError +from pandas.core.common import (_values_from_object, AbstractMethodError, + _default_index) + from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) from pandas.core.categorical import Categorical @@ -4042,7 +4044,24 @@ def nunique(self, dropna=True): 4 ham 5 x 5 ham 5 y """ - return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna)) + + obj = self._selected_obj + + def groupby_series(obj, col=None): + return SeriesGroupBy(obj, + selection=col, + grouper=self.grouper).nunique(dropna=dropna) + + if isinstance(obj, Series): + results = groupby_series(obj) + else: + from pandas.tools.merge import concat + results = [groupby_series(obj[col], col) for col in obj.columns] + results = concat(results, axis=1) + + if not self.as_index: + results.index = _default_index(len(results)) + return results from pandas.tools.plotting import boxplot_frame_groupby # noqa diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7140eb5a6fd12..68689189a92b2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2,6 +2,7 @@ from __future__ import print_function import nose +from string import ascii_lowercase from datetime import datetime from numpy import nan @@ -1807,22 +1808,22 @@ def test_groupby_as_index_agg(self): assert_frame_equal(left, right) def test_series_groupby_nunique(self): - from itertools import product - from string import ascii_lowercase - def check_nunique(df, keys): - for sort, dropna in product((False, True), repeat=2): - gr = df.groupby(keys, sort=sort) + def check_nunique(df, keys, as_index=True): + for sort, dropna in cart_product((False, True), repeat=2): + gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) - gr = df.groupby(keys, sort=sort) + gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) - assert_series_equal(left, right) + assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10) - for n, m in product(10 ** np.arange(2, 6), (10, 100, 1000)): + for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): frame = DataFrame({ 'jim': np.random.choice( list(ascii_lowercase), n), @@ -1841,6 +1842,8 @@ def check_nunique(df, keys): check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ['jim'], as_index=False) + check_nunique(frame, ['jim', 'joe'], as_index=False) def test_series_groupby_value_counts(self): from itertools import product From 94d02bc964970264461fee196ff6608825ceaae9 Mon Sep 17 00:00:00 2001 From: Giacomo Ferroni Date: Tue, 24 Jan 2017 12:54:54 -0500 Subject: [PATCH 068/338] BUG: Fix bug in window function count should count anything non-null closes #12541 Author: Giacomo Ferroni Author: Giacomo Closes #15196 from mralgos/gh12541 and squashes the following commits: 65d70eb [Giacomo Ferroni] Added Periods to the test 94084b4 [Giacomo] Merge branch 'master' into gh12541 9621315 [Giacomo Ferroni] 
Added extra test and updated whatsnew cb84935 [Giacomo Ferroni] pylint checks 26c86a5 [Giacomo Ferroni] Test added for GH12541 ea49e77 [Giacomo Ferroni] Fix for GH12541 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/window.py | 12 +-------- pandas/tests/test_window.py | 43 +++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c738d8d9c3499..49e0f4cb08aee 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -432,6 +432,7 @@ Bug Fixes - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) +- Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`) - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) diff --git a/pandas/core/window.py b/pandas/core/window.py index b330a12110923..bda134dd8a2a4 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -762,17 +762,7 @@ def count(self): results = [] for b in blocks: - - if needs_i8_conversion(b.values): - result = b.notnull().astype(int) - else: - try: - result = np.isfinite(b).astype(float) - except TypeError: - result = np.isfinite(b.astype(float)).astype(float) - - result[pd.isnull(result)] = 0 - + result = b.notnull().astype(int) result = self._constructor(result, window=window, min_periods=0, center=self.center).sum() results.append(result) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 1afd5dad404c8..c05316ae34f15 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -207,6 +207,49 @@ def f(): 'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')]) tm.assert_frame_equal(result, expected, check_like=True) + def test_count_nonnumeric_types(self): + # GH12541 + cols = ['int', 'float', 'string', 'datetime', 'timedelta', 'periods', + 'fl_inf', 'fl_nan', 'str_nan', 'dt_nat', 'periods_nat'] + + df = DataFrame( + {'int': [1, 2, 3], + 'float': [4., 5., 6.], + 'string': list('abc'), + 'datetime': pd.date_range('20170101', periods=3), + 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s'), + 'periods': [pd.Period('2012-01'), pd.Period('2012-02'), + pd.Period('2012-03')], + 'fl_inf': [1., 2., np.Inf], + 'fl_nan': [1., 2., np.NaN], + 'str_nan': ['aa', 'bb', np.NaN], + 'dt_nat': [pd.Timestamp('20170101'), pd.Timestamp('20170203'), + pd.Timestamp(None)], + 'periods_nat': [pd.Period('2012-01'), pd.Period('2012-02'), + pd.Period(None)]}, + columns=cols) + + expected = DataFrame( + {'int': [1., 2., 2.], + 'float': [1., 2., 2.], + 'string': [1., 2., 2.], + 'datetime': [1., 2., 2.], + 'timedelta': [1., 2., 2.], + 'periods': [1., 2., 2.], + 'fl_inf': [1., 2., 2.], + 'fl_nan': [1., 2., 1.], + 'str_nan': [1., 2., 1.], + 'dt_nat': [1., 2., 1.], + 'periods_nat': [1., 2., 1.]}, + columns=cols) + + result = df.rolling(window=2).count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(1).count() + expected = df.notnull().astype(float) + tm.assert_frame_equal(result, expected) + def test_window_with_args(self): tm._skip_if_no_scipy() From 4fd2a2237049754cea8d25b7a7a0da578cc25edd Mon Sep 
17 00:00:00 2001 From: Dr-Irv Date: Tue, 24 Jan 2017 13:44:35 -0500 Subject: BUG: GH14233 resample().median() failed if duplicate column names wer… MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple fix for median issue. Should use cython implementation. closes #14233 Author: Dr-Irv Closes #15202 from Dr-Irv/Issue14233 and squashes the following commits: 6e0d900 [Dr-Irv] Use randn in test 1a3b4aa [Dr-Irv] BUG: GH14233 resample().median() failed if duplicate column names were present --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/groupby.py | 1 - pandas/tseries/tests/test_resample.py | 14 ++++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 49e0f4cb08aee..6aaed803c5352 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -433,6 +433,7 @@ Bug Fixes - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) - Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`) +- Bug in ``DataFrame.resample().median()`` if duplicate column names are present (:issue:`14233`) - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 84858ef5cf8b3..e4edbcacd5b0a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2205,7 +2205,6 @@ def agg_series(self, obj, func): # cython aggregation _cython_functions = copy.deepcopy(BaseGrouper._cython_functions) - _cython_functions['aggregate'].pop('median') class Grouping(object): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index fbf0e0095a2f9..56953541265a6 100755 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -2931,6 +2931,20 @@ def test_consistency_with_window(self): self.assertEqual(result.index.nlevels, 2) tm.assert_index_equal(result.index.levels[0], expected) + def test_median_duplicate_columns(self): + # GH 14233 + + df = pd.DataFrame(np.random.randn(20, 3), + columns=list('aaa'), + index=pd.date_range('2012-01-01', + periods=20, freq='s')) + df2 = df.copy() + df2.columns = ['a', 'b', 'c'] + expected = df2.resample('5s').median() + result = df.resample('5s').median() + expected.columns = result.columns + assert_frame_equal(result, expected) + class TestTimeGrouper(tm.TestCase): def setUp(self): From a9aa94a73b2c3edeed79b11fe0bb8d727a44f048 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Jan 2017 13:47:36 -0500 Subject: [PATCH 070/338] DOC: clean-up docstring str.replace (GH15056) xref #15056 Author: Joris Van den Bossche Closes #15210 from jorisvandenbossche/doc-replace-callable and squashes the following commits: 811cc34 [Joris Van den Bossche] DOC: clean-up docstring str.replace (GH15056) --- pandas/core/strings.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index c48defe39a011..ac8d1db6a0bf3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -318,7 +318,8 @@ def str_replace(arr, pat, repl, n=-1,
case=True, flags=0): match object and must return a replacement string to be used. See :func:`re.sub`. - .. versionadded:: 0.20.0 + .. versionadded:: 0.20.0 + `repl` also accepts a callable. n : int, default -1 (all) Number of replacements to make from start @@ -333,22 +334,22 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Examples -------- - When ``repl`` is a string, every ``pat`` is replaced as with + When `repl` is a string, every `pat` is replaced as with :meth:`str.replace`. NaN value(s) in the Series are left as is. - >>> Series(['foo', 'fuz', np.nan]).str.replace('f', 'b') + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', 'b') 0 boo 1 buz 2 NaN dtype: object - When ``repl`` is a callable, it is called on every ``pat`` using + When `repl` is a callable, it is called on every `pat` using :func:`re.sub`. The callable should expect one positional argument (a regex object) and return a string. To get the idea: - >>> Series(['foo', 'fuz', np.nan]).str.replace('f', repr) + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) 0 <_sre.SRE_Match object; span=(0, 1), match='f'>oo 1 <_sre.SRE_Match object; span=(0, 1), match='f'>uz 2 NaN @@ -357,19 +358,19 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Reverse every lowercase alphabetic word: >>> repl = lambda m: m.group(0)[::-1] - >>> Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) + >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) 0 oof 123 1 rab zab 2 NaN dtype: object - Using regex groups: + Using regex groups (extract second group and swap case): >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" >>> repl = lambda m: m.group('two').swapcase() - >>> Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) - 0 bAR - 1 NaN + >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) + 0 tWO + 1 bAR dtype: object """ From 051777d3a72aaf6f937eea9bbf63b07af9e3cfb5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 24 Jan 2017 13:51:51 -0500 Subject: [PATCH 071/338] COMPAT: don't rely on dtype being False after numpy >= 1.13 closes #15199 Author: Jeff Reback Closes #15200 from jreback/numpy-dev and squashes the following commits: 53d663e [Jeff Reback] BLD: pull wheel for numpy builds 5475db3 [Jeff Reback] COMPAT: numpy-dev compat on bool(dtype) change --- ci/install_travis.sh | 48 +++++++++++-------- ...sh => requirements-3.5_NUMPY_DEV.build.sh} | 5 -- pandas/compat/numpy/__init__.py | 1 + pandas/core/ops.py | 4 +- pandas/tseries/index.py | 2 +- pandas/tseries/tdi.py | 3 +- 6 files changed, 34 insertions(+), 29 deletions(-) rename ci/{requirements-3.5_NUMPY_DEV.sh => requirements-3.5_NUMPY_DEV.build.sh} (66%) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 542d22d9fa871..ded428c677f17 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -16,28 +16,29 @@ function edit_init() { if [ -n "$LOCALE_OVERRIDE" ]; then - echo "Adding locale to the first line of pandas/__init__.py" + echo "[Adding locale to the first line of pandas/__init__.py]" rm -f pandas/__init__.pyc sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" sed -i "$sedc" pandas/__init__.py - echo "head -4 pandas/__init__.py" + echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py echo fi } +echo "[install_travis]" edit_init home_dir=$(pwd) -echo "home_dir: [$home_dir]" +echo "[home_dir: $home_dir]" MINICONDA_DIR="$HOME/miniconda3" if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE" ]; then - echo "Miniconda 
install already present from cache: $MINICONDA_DIR" + echo "[Miniconda install already present from cache: $MINICONDA_DIR]" conda config --set always_yes yes --set changeps1 no || exit 1 - echo "update conda" + echo "[update conda]" conda update -q conda || exit 1 # Useful for debugging any issues with conda @@ -45,18 +46,18 @@ if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE # set the compiler cache to work if [ "${TRAVIS_OS_NAME}" == "linux" ]; then - echo "Using ccache" + echo "[Using ccache]" export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH gcc=$(which gcc) - echo "gcc: $gcc" + echo "[gcc: $gcc]" ccache=$(which ccache) - echo "ccache: $ccache" + echo "[ccache: $ccache]" export CC='ccache gcc' fi else - echo "Using clean Miniconda install" - echo "Not using ccache" + echo "[Using clean Miniconda install]" + echo "[Not using ccache]" rm -rf "$MINICONDA_DIR" # install miniconda if [ "${TRAVIS_OS_NAME}" == "osx" ]; then @@ -66,14 +67,14 @@ else fi bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 - echo "update conda" + echo "[update conda]" conda config --set ssl_verify false || exit 1 conda config --set always_yes true --set changeps1 false || exit 1 conda update -q conda # add the pandas channel to take priority # to add extra packages - echo "add channels" + echo "[add channels]" conda config --add channels pandas || exit 1 conda config --remove channels defaults || exit 1 conda config --add channels defaults || exit 1 @@ -103,13 +104,19 @@ else fi # build deps +echo "[build installs]" REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.build" - -# install deps if [ -e ${REQ} ]; then time conda install -n pandas --file=${REQ} || exit 1 fi +# may have addtl installation instructions for this build +echo "[build addtl installs]" +REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.build.sh" +if [ -e ${REQ} ]; then + time bash $REQ || exit 1 +fi + source activate pandas if [ "$BUILD_TEST" ]; then @@ -122,24 +129,25 @@ if [ "$BUILD_TEST" ]; then else # build but don't install - echo "build em" + echo "[build em]" time python setup.py build_ext --inplace || exit 1 # we may have run installations - echo "conda installs" + echo "[conda installs]" REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.run" if [ -e ${REQ} ]; then time conda install -n pandas --file=${REQ} || exit 1 fi # we may have additional pip installs - echo "pip installs" + echo "[pip installs]" REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip" if [ -e ${REQ} ]; then pip install --upgrade -r $REQ fi # may have addtl installation instructions for this build + echo "[addtl installs]" REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.sh" if [ -e ${REQ} ]; then time bash $REQ || exit 1 @@ -147,14 +155,14 @@ else # remove any installed pandas package # w/o removing anything else - echo "removing installed pandas" + echo "[removing installed pandas]" conda remove pandas --force # install our pandas - echo "running setup.py develop" + echo "[running setup.py develop]" python setup.py develop || exit 1 fi -echo "done" +echo "[done]" exit 0 diff --git a/ci/requirements-3.5_NUMPY_DEV.sh b/ci/requirements-3.5_NUMPY_DEV.build.sh similarity index 66% rename from ci/requirements-3.5_NUMPY_DEV.sh rename to ci/requirements-3.5_NUMPY_DEV.build.sh index 946ec43ad9f1a..91fa15491bbf7 100644 --- a/ci/requirements-3.5_NUMPY_DEV.sh +++ b/ci/requirements-3.5_NUMPY_DEV.build.sh @@ -7,11 +7,6 @@ echo "install numpy master wheel" # remove the system installed numpy pip uninstall numpy -y -# we need these for numpy - -# 
these wheels don't play nice with the conda libgfortran / openblas -# time conda install -n pandas libgfortran openblas || exit 1 - # install numpy wheel from master pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy scipy diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 358ac3c30b8e7..bfd770d7af2c6 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -14,6 +14,7 @@ _np_version_under1p10 = _nlv < '1.10' _np_version_under1p11 = _nlv < '1.11' _np_version_under1p12 = _nlv < '1.12' +_np_version_under1p13 = _nlv < '1.13' if _nlv < '1.7.0': raise ImportError('this version of pandas is incompatible with ' diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 396b0e048bc49..697a99f63f62f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -444,13 +444,15 @@ def _convert_to_array(self, values, name=None, other=None): supplied_dtype = None if not is_list_like(values): values = np.array([values]) + # if this is a Series that contains relevant dtype info, then use this # instead of the inferred type; this avoids coercing Series([NaT], # dtype='datetime64[ns]') to Series([NaT], dtype='timedelta64[ns]') elif (isinstance(values, pd.Series) and (is_timedelta64_dtype(values) or is_datetime64_dtype(values))): supplied_dtype = values.dtype - inferred_type = supplied_dtype or lib.infer_dtype(values) + + inferred_type = lib.infer_dtype(values) if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or is_datetimetz(inferred_type)): # if we have a other of timedelta, but use pd.NaT here we diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index aca962c8178d3..6cbb696783e09 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -557,7 +557,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, if we are passed a non-dtype compat, then coerce using the constructor """ - if not getattr(values, 'dtype', None): + if getattr(values, 'dtype', None) is None: # empty, but with dtype compat if values is None: values = np.empty(0, dtype=_NS_DTYPE) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 1585aac0c8ead..2e050445f788a 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -272,8 +272,7 @@ def _box_func(self): @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): - if not getattr(values, 'dtype', None): - values = np.array(values, copy=False) + values = np.array(values, copy=False) if values.dtype == np.object_: values = tslib.array_to_timedelta64(values) if values.dtype != _TD_DTYPE: From 4acfbaaea5f564ff64b85ac5c974cb1990f05b8b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 24 Jan 2017 14:12:44 -0500 Subject: [PATCH 072/338] STYLE: pep8 fix --- pandas/tests/test_window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index c05316ae34f15..dc23469976e35 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -226,7 +226,7 @@ def test_count_nonnumeric_types(self): 'dt_nat': [pd.Timestamp('20170101'), pd.Timestamp('20170203'), pd.Timestamp(None)], 'periods_nat': [pd.Period('2012-01'), pd.Period('2012-02'), - pd.Period(None)]}, + pd.Period(None)]}, columns=cols) expected = DataFrame( From de8bab012f2277b89e88ef406b2d48c92c09d31a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 24 Jan 2017 14:51:32 -0500 Subject: [PATCH 073/338] BLD: add locale to 
34_nslow build Author: Jeff Reback Closes #15211 from jreback/nslow and squashes the following commits: 7dd0fac [Jeff Reback] TST: skip some non-us locale tests with guessing datetimes a7c90e1 [Jeff Reback] BLD: add locale to 34_nslow build --- .travis.yml | 4 ++++ pandas/tseries/tests/test_timeseries.py | 3 +++ pandas/util/testing.py | 8 ++++++++ 3 files changed, 15 insertions(+) diff --git a/.travis.yml b/.travis.yml index b2a1a8a63cfe6..be2058950d8ec 100644 --- a/.travis.yml +++ b/.travis.yml @@ -132,6 +132,7 @@ matrix: env: - PYTHON_VERSION=3.4 - JOB_NAME: "34_nslow" + - LOCALE_OVERRIDE="zh_CN.UTF-8" - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel @@ -141,6 +142,7 @@ matrix: apt: packages: - xsel + - language-pack-zh-hans # In allow_failures - python: 3.4 env: @@ -229,6 +231,7 @@ matrix: env: - PYTHON_VERSION=3.4 - JOB_NAME: "34_nslow" + - LOCALE_OVERRIDE="zh_CN.UTF-8" - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel @@ -238,6 +241,7 @@ matrix: apt: packages: - xsel + - language-pack-zh-hans - python: 3.5 env: - PYTHON_VERSION=3.5 diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 704aebd815a29..4d286fc62764c 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -5492,7 +5492,9 @@ def test_to_datetime_iso8601_noleading_0s(self): class TestGuessDatetimeFormat(tm.TestCase): + def test_guess_datetime_format_with_parseable_formats(self): + tm._skip_if_not_us_locale() dt_string_to_format = (('20111230', '%Y%m%d'), ('2011-12-30', '%Y-%m-%d'), ('30-12-2011', '%d-%m-%Y'), @@ -5567,6 +5569,7 @@ def test_guess_datetime_format_nopadding(self): ) def test_guess_datetime_format_for_array(self): + tm._skip_if_not_us_locale() expected_format = '%Y-%m-%d %H:%M:%S.%f' dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d39ce7acf0029..6ea91543677a7 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -404,6 +404,14 @@ def _skip_if_has_locale(): import nose raise nose.SkipTest("Specific locale is set {0}".format(lang)) + +def _skip_if_not_us_locale(): + import locale + lang, _ = locale.getlocale() + if lang != 'en_US': + import nose + raise nose.SkipTest("Specific locale is set {0}".format(lang)) + # ----------------------------------------------------------------------------- # locale utilities From d8cffb7c4296191c021e143c2924d57f63460313 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Jan 2017 16:23:23 -0500 Subject: [PATCH 074/338] TST: fix excel tests when openpyxl is not installed Some tests fail when you don't have openpyxl installed, so explicitly skipping those (they need it for certain exts because they write a frame in the test) Author: Joris Van den Bossche Closes #15209 from jorisvandenbossche/openpyxl-noimport and squashes the following commits: f1cbe8f [Joris Van den Bossche] TST: fix excel tests when openpyxl is not installed --- pandas/io/tests/test_excel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index a2162320d9d1b..12aecfd50c3a6 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -449,6 +449,8 @@ def test_read_excel_blank_with_header(self): # GH 12292 : error when read one empty column from excel file def test_read_one_empty_col_no_header(self): _skip_if_no_xlwt() + _skip_if_no_openpyxl() + df = 
pd.DataFrame( [["", 1, 100], ["", 2, 200], @@ -506,6 +508,8 @@ def test_read_one_empty_col_with_header(self): def test_set_column_names_in_parameter(self): _skip_if_no_xlwt() + _skip_if_no_openpyxl() + # GH 12870 : pass down column names associated with # keyword argument names refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], @@ -1798,7 +1802,8 @@ def test_bytes_io(self): bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) - writer = ExcelWriter(bio) + # pass engine explicitly as there is no file path to infer from + writer = ExcelWriter(bio, engine=self.engine_name) df.to_excel(writer) writer.save() bio.seek(0) From 80317cb02c9ea180ec1dbb2e302382159b5cd242 Mon Sep 17 00:00:00 2001 From: Jeff Carey Date: Tue, 24 Jan 2017 18:10:13 -0500 Subject: [PATCH 075/338] BUG: Fixed incorrect stream size check (#14125) closes #14125 Previously, self->stream_cap was copied into a local variable called maxstreamsize each time tokenize_bytes ran, and then this was checked in the PUSH_CHAR macro. However, there is one other place in the file where function make_stream_space() is called (in end_line()), and when this happens self->stream_cap is increased but maxstreamsize is not updated, making the check incorrect. In rare circumstances (see original issue or test case) this could cause a crash. The resolution is to just check self->stream_cap directly. Author: Jeff Carey Closes #15195 from jeffcarey/fix/14125 and squashes the following commits: d3c5f28 [Jeff Carey] BUG: Fixed incorrect stream size check (#14125) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/tests/parser/c_parser_only.py | 12 ++++++++++++ pandas/src/parser/tokenizer.c | 8 +++----- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6aaed803c5352..9895afdc4c7b5 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -387,6 +387,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) +- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`) - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. 
(:issue:`15143`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 6df3c513faf4a..73edda90720af 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -395,3 +395,15 @@ def test_float_precision_round_trip_with_text(self): float_precision='round_trip', header=None) tm.assert_frame_equal(df, DataFrame({0: ['a']})) + + def test_large_difference_in_columns(self): + # gh-14125 + count = 10000 + large_row = ('X,' * count)[:-1] + '\n' + normal_row = 'XXXXXX XXXXXX,111111111111111\n' + test_input = (large_row + normal_row * 6)[:-1] + result = self.read_csv(StringIO(test_input), header=None, usecols=[0]) + rows = test_input.split('\n') + expected = DataFrame([row.split(',')[0] for row in rows]) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 7cddefd40cfc5..b6428c8b76743 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -592,9 +592,9 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { TRACE( \ ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= maxstreamsize) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, \ - maxstreamsize)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ int bufsize = 100; \ self->error_msg = (char *)malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ @@ -711,7 +711,6 @@ int skip_this_line(parser_t *self, int64_t rownum) { int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { int i, slen; int should_skip; - long maxstreamsize; char c; char *stream; char *buf = self->data + self->datapos; @@ -723,7 +722,6 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { stream = self->stream + self->stream_len; slen = self->stream_len; - maxstreamsize = self->stream_cap; TRACE(("%s\n", buf)); From 2ad7c44f30f63fd8373d7c931a86cc554416b0c3 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 25 Jan 2017 06:58:48 -0500 Subject: [PATCH 076/338] BUG: Remove locale conversion from Stata file date Prevent locale from affecting Stata file date creation, which must be en_US. 
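A standalone sketch of the technique the diff below applies (the helper name
here is illustrative, not part of the patch): strftime("%b") renders the
month abbreviation in the active locale (e.g. "févr." under fr_FR), which
yields a timestamp Stata does not expect, so the month name is taken from a
fixed en_US table instead.

    import datetime

    # fixed en_US month abbreviations, independent of the active locale
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    month_lookup = {i + 1: month for i, month in enumerate(months)}

    def stata_header_timestamp(time_stamp):
        """Format a .dta header timestamp without the locale-aware %b."""
        return (time_stamp.strftime("%d ") +
                month_lookup[time_stamp.month] +
                time_stamp.strftime(" %Y %H:%M"))

    print(stata_header_timestamp(datetime.datetime(2000, 2, 29, 14, 21)))
    # -> '29 Feb 2000 14:21' regardless of locale settings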
xref #13856

Author: Kevin Sheppard 

Closes #15208 from bashtage/stata-locale-date-fix and squashes the following commits:

a55bfd8 [Kevin Sheppard] BUG: Remove locale conversion from Stata file date
---
 doc/source/whatsnew/v0.20.0.txt |  1 +
 pandas/io/stata.py              | 12 +++++++++---
 pandas/io/tests/test_stata.py   |  4 +---
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9895afdc4c7b5..1018190f8ab5f 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -459,6 +459,7 @@ Bug Fixes


 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
+- Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which caused incorrectly formatted files to be produced for some locales (:issue:`13856`)



diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 512a224555577..2be7657883e88 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -2157,9 +2157,15 @@ def _write_header(self, data_label=None, time_stamp=None):
             time_stamp = datetime.datetime.now()
         elif not isinstance(time_stamp, datetime.datetime):
             raise ValueError("time_stamp should be datetime type")
-        self._file.write(
-            self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"))
-        )
+        # GH #13856
+        # Avoid locale-specific month conversion
+        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
+                  'Sep', 'Oct', 'Nov', 'Dec']
+        month_lookup = {i + 1: month for i, month in enumerate(months)}
+        ts = (time_stamp.strftime("%d ") +
+              month_lookup[time_stamp.month] +
+              time_stamp.strftime(" %Y %H:%M"))
+        self._file.write(self._null_terminate(ts))

     def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
                            fmtlist=None, lbllist=None):
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 08fcde8d3022e..8cfd5d98fe05f 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -484,9 +484,7 @@ def test_timestamp_and_label(self):
                                data_label=data_label)

             with StataReader(path) as reader:
-                parsed_time_stamp = dt.datetime.strptime(
-                    reader.time_stamp, ('%d %b %Y %H:%M'))
-                assert parsed_time_stamp == time_stamp
+                assert reader.time_stamp == '29 Feb 2000 14:21'
                 assert reader.data_label == data_label

     def test_numeric_column_names(self):

From a54213dcffbd07c705b599cc9521d884521f6854 Mon Sep 17 00:00:00 2001
From: Jeff Reback 
Date: Wed, 25 Jan 2017 07:01:33 -0500
Subject: [PATCH 077/338] BLD: hashtable.pxd is a pxd depends for _join,
 hashtable, and index (#15218)

---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 8cdd72b2f4682..1c9972defabbe 100755
--- a/setup.py
+++ b/setup.py
@@ -490,13 +490,13 @@ def pxd(name):
     index={'pyxfile': 'index',
            'sources': ['pandas/src/datetime/np_datetime.c',
                        'pandas/src/datetime/np_datetime_strings.c'],
-           'pxdfiles': ['src/util'],
+           'pxdfiles': ['src/util', 'hashtable'],
            'depends': _pxi_dep['index']},
     algos={'pyxfile': 'algos',
-           'pxdfiles': ['src/util'],
+           'pxdfiles': ['src/util', 'hashtable'],
            'depends': _pxi_dep['algos']},
     _join={'pyxfile': 'src/join',
-           'pxdfiles': ['src/util'],
+           'pxdfiles': ['src/util', 'hashtable'],
            'depends': _pxi_dep['_join']},
     _window={'pyxfile': 'window',
             'pxdfiles': ['src/skiplist', 'src/util'],

From 4504798b3ffcaba703300271e47cf60be96ecf74 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche 
Date: Wed, 25 Jan 2017 16:43:13 +0100
Subject: [PATCH 078/338] CLN: remove deprecated google-analytics 
module (GH11308) (#15223) --- doc/source/remote_data.rst | 60 ----- doc/source/whatsnew/v0.15.2.txt | 2 +- doc/source/whatsnew/v0.20.0.txt | 2 + pandas/io/ga.py | 463 -------------------------------- pandas/io/tests/test_ga.py | 198 -------------- 5 files changed, 3 insertions(+), 722 deletions(-) delete mode 100644 pandas/io/ga.py delete mode 100644 pandas/io/tests/test_ga.py diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index 019aa82fed1aa..7980133582125 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -29,63 +29,3 @@ modules to be independently updated to your pandas installation. The API for .. code-block:: python from pandas_datareader import data, wb - - -.. _remote_data.ga: - -Google Analytics ----------------- - -The :mod:`~pandas.io.ga` module provides a wrapper for -`Google Analytics API `__ -to simplify retrieving traffic data. -Result sets are parsed into a pandas DataFrame with a shape and data types -derived from the source table. - -Configuring Access to Google Analytics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The first thing you need to do is to setup accesses to Google Analytics API. Follow the steps below: - -#. In the `Google Developers Console `__ - #. enable the Analytics API - #. create a new project - #. create a new Client ID for an "Installed Application" (in the "APIs & auth / Credentials section" of the newly created project) - #. download it (JSON file) -#. On your machine - #. rename it to ``client_secrets.json`` - #. move it to the ``pandas/io`` module directory - -The first time you use the :func:`read_ga` function, a browser window will open to ask you to authentify to the Google API. Do proceed. - -Using the Google Analytics API -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The following will fetch users and pageviews (metrics) data per day of the week, for the first semester of 2014, from a particular property. - -.. code-block:: python - - import pandas.io.ga as ga - ga.read_ga( - account_id = "2360420", - profile_id = "19462946", - property_id = "UA-2360420-5", - metrics = ['users', 'pageviews'], - dimensions = ['dayOfWeek'], - start_date = "2014-01-01", - end_date = "2014-08-01", - index_col = 0, - filters = "pagePath=~aboutus;ga:country==France", - ) - -The only mandatory arguments are ``metrics,`` ``dimensions`` and ``start_date``. We strongly recommend that you always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics. - -The ``index_col`` argument indicates which dimension(s) has to be taken as index. - -The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page URL has to contain ``aboutus`` AND the visitors country has to be France. - -Detailed information in the following: - -* `pandas & google analytics, by yhat `__ -* `Google Analytics integration in pandas, by Chang She `__ -* `Google Analytics Dimensions and Metrics Reference `_ diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 3bb472e9361e7..feba3d6224e65 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -163,7 +163,7 @@ Other enhancements: p.all() - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here`. +- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here`__. 
- ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithemtic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1018190f8ab5f..dc39d5b79fa95 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -351,6 +351,8 @@ Removal of prior version deprecations/changes - The ``pandas.rpy`` module is removed. Similar functionality can be accessed through the `rpy2 `__ project. See the :ref:`R interfacing docs ` for more details. +- The ``pandas.io.ga`` module with a ``google-analytics`` interface is removed (:issue:`11308`). + Similar functionality can be found in the `Google2Pandas `__ package. - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`) diff --git a/pandas/io/ga.py b/pandas/io/ga.py deleted file mode 100644 index 45424e78ddbe7..0000000000000 --- a/pandas/io/ga.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -1. Goto https://console.developers.google.com/iam-admin/projects -2. Create new project -3. Goto APIs and register for OAuth2.0 for installed applications -4. Download JSON secret file and move into same directory as this file -""" -# flake8: noqa - -from datetime import datetime -import re -from pandas import compat -import numpy as np -from pandas import DataFrame -import pandas as pd -import pandas.io.parsers as psr -import pandas.lib as lib -from pandas.io.date_converters import generic_parser -import pandas.io.auth as auth -from pandas.util.decorators import Appender, Substitution - -from apiclient.errors import HttpError -from oauth2client.client import AccessTokenRefreshError -from pandas.compat import zip, u - -# GH11038 -import warnings -warnings.warn("The pandas.io.ga module is deprecated and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) - -TYPE_MAP = {u('INTEGER'): int, u('FLOAT'): float, u('TIME'): int} - -NO_CALLBACK = auth.OOB_CALLBACK_URN -DOC_URL = auth.DOC_URL - -_QUERY_PARAMS = """metrics : list of str - Un-prefixed metric names (e.g., 'visitors' and not 'ga:visitors') -dimensions : list of str - Un-prefixed dimension variable names -start_date : str/date/datetime -end_date : str/date/datetime, optional, default is None but internally set as today -segment : list of str, optional, default: None -filters : list of str, optional, default: None -start_index : int, default 1 -max_results : int, default 10000 - If >10000, must specify chunksize or ValueError will be raised""" - -_QUERY_DOC = """ -Construct a google analytics query using given parameters -Metrics and dimensions do not need the 'ga:' prefix - -Parameters ----------- -profile_id : str -%s -""" % _QUERY_PARAMS - -_GA_READER_DOC = """Given query parameters, return a DataFrame with all the -data or an iterator that returns DataFrames containing chunks of the data - -Parameters ----------- -%s -sort : bool/list, default True - Sort output by index or list of columns -chunksize : int, optional - If max_results >10000, specifies the number of rows per iteration -index_col : str/list of str/dict, optional, default: None - If unspecified then dimension variables are set as index -parse_dates : bool/list/dict, default: True -keep_date_col : boolean, default: False -date_parser : 
optional, default: None -na_values : optional, default: None -converters : optional, default: None -dayfirst : bool, default False - Informs date parsing -account_name : str, optional, default: None -account_id : str, optional, default: None -property_name : str, optional, default: None -property_id : str, optional, default: None -profile_name : str, optional, default: None -profile_id : str, optional, default: None -%%(extras)s -Returns -------- -data : DataFrame or DataFrame yielding iterator -""" % _QUERY_PARAMS - -_AUTH_PARAMS = """secrets : str, optional - File path to the secrets file -scope : str, optional - Authentication scope -token_file_name : str, optional - Path to token storage -redirect : str, optional - Local host redirect if unspecified -""" - - -def reset_token_store(): - """ - Deletes the default token store - """ - auth.reset_default_token_store() - - -@Substitution(extras=_AUTH_PARAMS) -@Appender(_GA_READER_DOC) -def read_ga(metrics, dimensions, start_date, **kwargs): - lst = ['secrets', 'scope', 'token_file_name', 'redirect'] - reader_kwds = dict((p, kwargs.pop(p)) for p in lst if p in kwargs) - reader = GAnalytics(**reader_kwds) - return reader.get_data(metrics=metrics, start_date=start_date, - dimensions=dimensions, **kwargs) - - -class OAuthDataReader(object): - """ - Abstract class for handling OAuth2 authentication using the Google - oauth2client library - """ - def __init__(self, scope, token_file_name, redirect): - """ - Parameters - ---------- - scope : str - Designates the authentication scope - token_file_name : str - Location of cache for authenticated tokens - redirect : str - Redirect URL - """ - self.scope = scope - self.token_store = auth.make_token_store(token_file_name) - self.redirect_url = redirect - - def authenticate(self, secrets): - """ - Run the authentication process and return an authorized - http object - - Parameters - ---------- - secrets : str - File name for client secrets - - Notes - ----- - See google documention for format of secrets file - %s - """ % DOC_URL - flow = self._create_flow(secrets) - return auth.authenticate(flow, self.token_store) - - def _create_flow(self, secrets): - """ - Create an authentication flow based on the secrets file - - Parameters - ---------- - secrets : str - File name for client secrets - - Notes - ----- - See google documentation for format of secrets file - %s - """ % DOC_URL - return auth.get_flow(secrets, self.scope, self.redirect_url) - - -class GDataReader(OAuthDataReader): - """ - Abstract class for reading data from google APIs using OAuth2 - Subclasses must implement create_query method - """ - def __init__(self, scope=auth.DEFAULT_SCOPE, - token_file_name=auth.DEFAULT_TOKEN_FILE, - redirect=NO_CALLBACK, secrets=auth.DEFAULT_SECRETS): - super(GDataReader, self).__init__(scope, token_file_name, redirect) - self._service = self._init_service(secrets) - - @property - def service(self): - """The authenticated request service object""" - return self._service - - def _init_service(self, secrets): - """ - Build an authenticated google api request service using the given - secrets file - """ - http = self.authenticate(secrets) - return auth.init_service(http) - - def get_account(self, name=None, id=None, **kwargs): - """ Retrieve an account that matches the name, id, or some account - attribute specified in **kwargs - - Parameters - ---------- - name : str, optional, default: None - id : str, optional, default: None - """ - accounts = self.service.management().accounts().list().execute() - return 
_get_match(accounts, name, id, **kwargs) - - def get_web_property(self, account_id=None, name=None, id=None, **kwargs): - """ - Retrieve a web property given and account and property name, id, or - custom attribute - - Parameters - ---------- - account_id : str, optional, default: None - name : str, optional, default: None - id : str, optional, default: None - """ - prop_store = self.service.management().webproperties() - kwds = {} - if account_id is not None: - kwds['accountId'] = account_id - prop_for_acct = prop_store.list(**kwds).execute() - return _get_match(prop_for_acct, name, id, **kwargs) - - def get_profile(self, account_id=None, web_property_id=None, name=None, - id=None, **kwargs): - - """ - Retrieve the right profile for the given account, web property, and - profile attribute (name, id, or arbitrary parameter in kwargs) - - Parameters - ---------- - account_id : str, optional, default: None - web_property_id : str, optional, default: None - name : str, optional, default: None - id : str, optional, default: None - """ - profile_store = self.service.management().profiles() - kwds = {} - if account_id is not None: - kwds['accountId'] = account_id - if web_property_id is not None: - kwds['webPropertyId'] = web_property_id - profiles = profile_store.list(**kwds).execute() - return _get_match(profiles, name, id, **kwargs) - - def create_query(self, *args, **kwargs): - raise NotImplementedError() - - @Substitution(extras='') - @Appender(_GA_READER_DOC) - def get_data(self, metrics, start_date, end_date=None, - dimensions=None, segment=None, filters=None, start_index=1, - max_results=10000, index_col=None, parse_dates=True, - keep_date_col=False, date_parser=None, na_values=None, - converters=None, sort=True, dayfirst=False, - account_name=None, account_id=None, property_name=None, - property_id=None, profile_name=None, profile_id=None, - chunksize=None): - if chunksize is None and max_results > 10000: - raise ValueError('Google API returns maximum of 10,000 rows, ' - 'please set chunksize') - - account = self.get_account(account_name, account_id) - web_property = self.get_web_property(account.get('id'), property_name, - property_id) - profile = self.get_profile(account.get('id'), web_property.get('id'), - profile_name, profile_id) - - profile_id = profile.get('id') - - if index_col is None and dimensions is not None: - if isinstance(dimensions, compat.string_types): - dimensions = [dimensions] - index_col = _clean_index(list(dimensions), parse_dates) - - def _read(start, result_size): - query = self.create_query(profile_id, metrics, start_date, - end_date=end_date, dimensions=dimensions, - segment=segment, filters=filters, - start_index=start, - max_results=result_size) - - try: - rs = query.execute() - rows = rs.get('rows', []) - col_info = rs.get('columnHeaders', []) - return self._parse_data(rows, col_info, index_col, - parse_dates=parse_dates, - keep_date_col=keep_date_col, - date_parser=date_parser, - dayfirst=dayfirst, - na_values=na_values, - converters=converters, sort=sort) - except HttpError as inst: - raise ValueError('Google API error %s: %s' % (inst.resp.status, - inst._get_reason())) - - if chunksize is None: - return _read(start_index, max_results) - - def iterator(): - curr_start = start_index - - while curr_start < max_results: - yield _read(curr_start, chunksize) - curr_start += chunksize - return iterator() - - def _parse_data(self, rows, col_info, index_col, parse_dates=True, - keep_date_col=False, date_parser=None, dayfirst=False, - na_values=None, 
converters=None, sort=True): - # TODO use returned column types - col_names = _get_col_names(col_info) - df = psr._read(rows, dict(index_col=index_col, parse_dates=parse_dates, - date_parser=date_parser, dayfirst=dayfirst, - na_values=na_values, - keep_date_col=keep_date_col, - converters=converters, - header=None, names=col_names)) - - if isinstance(sort, bool) and sort: - return df.sort_index() - elif isinstance(sort, (compat.string_types, list, tuple, np.ndarray)): - return df.sort_index(by=sort) - - return df - - -class GAnalytics(GDataReader): - - @Appender(_QUERY_DOC) - def create_query(self, profile_id, metrics, start_date, end_date=None, - dimensions=None, segment=None, filters=None, - start_index=None, max_results=10000, **kwargs): - qry = format_query(profile_id, metrics, start_date, end_date=end_date, - dimensions=dimensions, segment=segment, - filters=filters, start_index=start_index, - max_results=max_results, **kwargs) - try: - return self.service.data().ga().get(**qry) - except TypeError as error: - raise ValueError('Error making query: %s' % error) - - -def format_query(ids, metrics, start_date, end_date=None, dimensions=None, - segment=None, filters=None, sort=None, start_index=None, - max_results=10000, **kwargs): - if isinstance(metrics, compat.string_types): - metrics = [metrics] - met = ','.join(['ga:%s' % x for x in metrics]) - - start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d') - if end_date is None: - end_date = datetime.today() - end_date = pd.to_datetime(end_date).strftime('%Y-%m-%d') - - qry = dict(ids='ga:%s' % str(ids), - metrics=met, - start_date=start_date, - end_date=end_date) - qry.update(kwargs) - - names = ['dimensions', 'filters', 'sort'] - lst = [dimensions, filters, sort] - [_maybe_add_arg(qry, n, d) for n, d in zip(names, lst)] - - if isinstance(segment, compat.string_types): - if re.match("^[a-zA-Z0-9\-\_]+$", segment): - _maybe_add_arg(qry, 'segment', segment, 'gaid:') - else: - _maybe_add_arg(qry, 'segment', segment, 'dynamic::ga') - elif isinstance(segment, int): - _maybe_add_arg(qry, 'segment', segment, 'gaid:') - elif segment: - raise ValueError("segment must be string for dynamic and int ID") - - if start_index is not None: - qry['start_index'] = str(start_index) - - if max_results is not None: - qry['max_results'] = str(max_results) - - return qry - - -def _maybe_add_arg(query, field, data, prefix='ga'): - if data is not None: - if isinstance(data, (compat.string_types, int)): - data = [data] - data = ','.join(['%s:%s' % (prefix, x) for x in data]) - query[field] = data - - -def _get_match(obj_store, name, id, **kwargs): - key, val = None, None - if len(kwargs) > 0: - key = list(kwargs.keys())[0] - val = list(kwargs.values())[0] - - if name is None and id is None and key is None: - return obj_store.get('items')[0] - - name_ok = lambda item: name is not None and item.get('name') == name - id_ok = lambda item: id is not None and item.get('id') == id - key_ok = lambda item: key is not None and item.get(key) == val - - match = None - if obj_store.get('items'): - # TODO look up gapi for faster lookup - for item in obj_store.get('items'): - if name_ok(item) or id_ok(item) or key_ok(item): - return item - - -def _clean_index(index_dims, parse_dates): - _should_add = lambda lst: pd.Index(lst).isin(index_dims).all() - to_remove = [] - to_add = [] - - if isinstance(parse_dates, (list, tuple, np.ndarray)): - for lst in parse_dates: - if isinstance(lst, (list, tuple, np.ndarray)): - if _should_add(lst): - to_add.append('_'.join(lst)) - 
to_remove.extend(lst) - elif isinstance(parse_dates, dict): - for name, lst in compat.iteritems(parse_dates): - if isinstance(lst, (list, tuple, np.ndarray)): - if _should_add(lst): - to_add.append(name) - to_remove.extend(lst) - - index_dims = pd.Index(index_dims) - to_remove = pd.Index(set(to_remove)) - to_add = pd.Index(set(to_add)) - - return index_dims.difference(to_remove).union(to_add) - - -def _get_col_names(header_info): - return [x['name'][3:] for x in header_info] - - -def _get_column_types(header_info): - return [(x['name'][3:], x['columnType']) for x in header_info] - - -def _get_dim_names(header_info): - return [x['name'][3:] for x in header_info - if x['columnType'] == u('DIMENSION')] - - -def _get_met_names(header_info): - return [x['name'][3:] for x in header_info - if x['columnType'] == u('METRIC')] - - -def _get_data_types(header_info): - return [(x['name'][3:], TYPE_MAP.get(x['dataType'], object)) - for x in header_info] diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py deleted file mode 100644 index 469e121f633d7..0000000000000 --- a/pandas/io/tests/test_ga.py +++ /dev/null @@ -1,198 +0,0 @@ -# flake8: noqa - -import os -from datetime import datetime - -import warnings -import nose -import pandas as pd -from pandas import compat -from pandas.util.testing import (network, assert_frame_equal, - with_connectivity_check, slow) -import pandas.util.testing as tm - -if compat.PY3: - raise nose.SkipTest("python-gflags does not support Python 3 yet") - -try: - import httplib2 - import apiclient - - # deprecated - with warnings.catch_warnings(record=True): - import pandas.io.ga as ga - - from pandas.io.ga import GAnalytics, read_ga - from pandas.io.auth import AuthenticationConfigError, reset_default_token_store - from pandas.io import auth -except ImportError: - raise nose.SkipTest("need httplib2 and auth libs") - - -class TestGoogle(tm.TestCase): - - _multiprocess_can_split_ = True - - def test_remove_token_store(self): - auth.DEFAULT_TOKEN_FILE = 'test.dat' - with open(auth.DEFAULT_TOKEN_FILE, 'w') as fh: - fh.write('test') - - reset_default_token_store() - self.assertFalse(os.path.exists(auth.DEFAULT_TOKEN_FILE)) - - @with_connectivity_check("http://www.google.com") - def test_getdata(self): - try: - end_date = datetime.now() - start_date = end_date - pd.offsets.Day() * 5 - end_date = end_date.strftime('%Y-%m-%d') - start_date = start_date.strftime('%Y-%m-%d') - - reader = GAnalytics() - df = reader.get_data( - metrics=['avgTimeOnSite', 'visitors', 'newVisits', - 'pageviewsPerVisit'], - start_date=start_date, - end_date=end_date, - dimensions=['date', 'hour'], - parse_dates={'ts': ['date', 'hour']}, - index_col=0) - - self.assertIsInstance(df, pd.DataFrame) - self.assertIsInstance(df.index, pd.DatetimeIndex) - self.assertGreater(len(df), 1) - self.assertTrue('date' not in df) - self.assertTrue('hour' not in df) - self.assertEqual(df.index.name, 'ts') - self.assertTrue('avgTimeOnSite' in df) - self.assertTrue('visitors' in df) - self.assertTrue('newVisits' in df) - self.assertTrue('pageviewsPerVisit' in df) - - df2 = read_ga( - metrics=['avgTimeOnSite', 'visitors', 'newVisits', - 'pageviewsPerVisit'], - start_date=start_date, - end_date=end_date, - dimensions=['date', 'hour'], - parse_dates={'ts': ['date', 'hour']}, - index_col=0) - - assert_frame_equal(df, df2) - - except AuthenticationConfigError: - raise nose.SkipTest("authentication error") - - @with_connectivity_check("http://www.google.com") - def test_iterator(self): - try: - reader = GAnalytics() - 
- it = reader.get_data( - metrics='visitors', - start_date='2005-1-1', - dimensions='date', - max_results=10, chunksize=5, - index_col=0) - - df1 = next(it) - df2 = next(it) - - for df in [df1, df2]: - self.assertIsInstance(df, pd.DataFrame) - self.assertIsInstance(df.index, pd.DatetimeIndex) - self.assertEqual(len(df), 5) - self.assertTrue('date' not in df) - self.assertEqual(df.index.name, 'date') - self.assertTrue('visitors' in df) - - self.assertTrue((df2.index > df1.index).all()) - - except AuthenticationConfigError: - raise nose.SkipTest("authentication error") - - def test_v2_advanced_segment_format(self): - advanced_segment_id = 1234567 - query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) - self.assertEqual(query['segment'], 'gaid::' + str(advanced_segment_id), "An integer value should be formatted as an advanced segment.") - - def test_v2_dynamic_segment_format(self): - dynamic_segment_id = 'medium==referral' - query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=dynamic_segment_id) - self.assertEqual(query['segment'], 'dynamic::ga:' + str(dynamic_segment_id), "A string value with more than just letters and numbers should be formatted as a dynamic segment.") - - def test_v3_advanced_segment_common_format(self): - advanced_segment_id = 'aZwqR234' - query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) - self.assertEqual(query['segment'], 'gaid::' + str(advanced_segment_id), "A string value with just letters and numbers should be formatted as an advanced segment.") - - def test_v3_advanced_segment_weird_format(self): - advanced_segment_id = '_aZwqR234-s1' - query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) - self.assertEqual(query['segment'], 'gaid::' + str(advanced_segment_id), "A string value with just letters, numbers, and hyphens should be formatted as an advanced segment.") - - def test_v3_advanced_segment_with_underscore_format(self): - advanced_segment_id = 'aZwqR234_s1' - query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) - self.assertEqual(query['segment'], 'gaid::' + str(advanced_segment_id), "A string value with just letters, numbers, and underscores should be formatted as an advanced segment.") - - @with_connectivity_check("http://www.google.com") - def test_segment(self): - try: - end_date = datetime.now() - start_date = end_date - pd.offsets.Day() * 5 - end_date = end_date.strftime('%Y-%m-%d') - start_date = start_date.strftime('%Y-%m-%d') - - reader = GAnalytics() - df = reader.get_data( - metrics=['avgTimeOnSite', 'visitors', 'newVisits', - 'pageviewsPerVisit'], - start_date=start_date, - end_date=end_date, - segment=-2, - dimensions=['date', 'hour'], - parse_dates={'ts': ['date', 'hour']}, - index_col=0) - - self.assertIsInstance(df, pd.DataFrame) - self.assertIsInstance(df.index, pd.DatetimeIndex) - self.assertGreater(len(df), 1) - self.assertTrue('date' not in df) - self.assertTrue('hour' not in df) - self.assertEqual(df.index.name, 'ts') - self.assertTrue('avgTimeOnSite' in df) - self.assertTrue('visitors' in df) - self.assertTrue('newVisits' in df) - self.assertTrue('pageviewsPerVisit' in df) - - # dynamic - df = read_ga( - metrics=['avgTimeOnSite', 'visitors', 'newVisits', - 'pageviewsPerVisit'], - start_date=start_date, - end_date=end_date, - segment="source=~twitter", - dimensions=['date', 'hour'], - parse_dates={'ts': ['date', 'hour']}, - 
index_col=0) - - assert isinstance(df, pd.DataFrame) - assert isinstance(df.index, pd.DatetimeIndex) - self.assertGreater(len(df), 1) - self.assertTrue('date' not in df) - self.assertTrue('hour' not in df) - self.assertEqual(df.index.name, 'ts') - self.assertTrue('avgTimeOnSite' in df) - self.assertTrue('visitors' in df) - self.assertTrue('newVisits' in df) - self.assertTrue('pageviewsPerVisit' in df) - - except AuthenticationConfigError: - raise nose.SkipTest("authentication error") - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) From 41f059075b5906a66df657147a6724976482919f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 25 Jan 2017 14:04:28 -0500 Subject: [PATCH 079/338] BUG: groupby.cummin/cummax nans inf values for int64 closes #15109 Author: Matt Roeschke Closes #15205 from mroeschke/fix_15109 and squashes the following commits: 717afb4 [Matt Roeschke] BUG: cummin/cummax nans inf values for int64 --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/groupby.py | 35 +++++++++++++++----------- pandas/src/algos_groupby_helper.pxi.in | 22 ++++++++++------ pandas/tests/groupby/test_groupby.py | 30 ++++++++-------------- 4 files changed, 47 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index dc39d5b79fa95..a73bb24521b0d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -367,7 +367,7 @@ Performance Improvements - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). -- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`) +- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. 
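For context on the diff that follows: iNaT, the sentinel these cython
transforms use for missing values, is numerically np.iinfo('int64').min, so
genuine int64 data holding that value was masked to NaN on output. A minimal
reproduction in the spirit of this patch's tests (the frame below is an
illustration, not copied from the patch):

    import numpy as np
    import pandas as pd

    # np.iinfo('int64').min shares its bit pattern with iNaT, the
    # missing-value marker used inside the groupby transforms
    min_val = np.iinfo(np.int64).min
    df = pd.DataFrame({'A': [1, 1, 2], 'B': [min_val, 0, min_val]})

    # pre-patch these entries came back as NaN; post-patch the values are
    # promoted to float64 up front so the true minimum survives
    print(df.groupby('A').cummin())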
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e4edbcacd5b0a..99220232114ce 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -18,13 +18,13 @@ is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype, is_datetimelike, - is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, is_bool, is_integer_dtype, is_complex_dtype, is_bool_dtype, is_scalar, is_list_like, + needs_i8_conversion, _ensure_float64, _ensure_platform_int, _ensure_int64, @@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis): "supported for the 'how' argument") out_shape = (self.ngroups,) + values.shape[1:] + is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) - if is_datetime_or_timedelta_dtype(values.dtype): + if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = _ensure_float64(values) elif is_integer_dtype(values): - values = values.astype('int64', copy=False) + # we use iNaT for the missing value on ints + # so pre-convert to guard this condition + if (values == tslib.iNaT).any(): + values = _ensure_float64(values) + else: + values = values.astype('int64', copy=False) elif is_numeric and not is_complex_dtype(values): values = _ensure_float64(values) else: @@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis): fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, counts, values, labels, func, is_numeric) + result, counts, values, labels, func, is_numeric, + is_datetimelike) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) - # temporary storange for running-total type tranforms - accum = np.empty(out_shape, dtype=out_dtype) result = self._transform( - result, accum, values, labels, func, is_numeric) + result, values, labels, func, is_numeric, is_datetimelike) if is_integer_dtype(result): - if len(result[result == tslib.iNaT]) > 0: + mask = result == tslib.iNaT + if mask.any(): result = result.astype('float64') - result[result == tslib.iNaT] = np.nan + result[mask] = np.nan if kind == 'aggregate' and \ self._filter_empty_groups and not counts.all(): @@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0): return self._cython_operation('transform', values, how, axis) def _aggregate(self, result, counts, values, comp_ids, agg_func, - is_numeric): + is_numeric, is_datetimelike): if values.ndim > 3: # punting for now raise NotImplementedError("number of dimensions is currently " @@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, return result - def _transform(self, result, accum, values, comp_ids, transform_func, - is_numeric): + def _transform(self, result, values, comp_ids, transform_func, + is_numeric, is_datetimelike): + comp_ids, _, ngroups = self.group_info if values.ndim > 3: # punting for now @@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func, chunk = chunk.squeeze() transform_func(result[:, :, i], values, - comp_ids, accum) + comp_ids, is_datetimelike) else: - transform_func(result, values, comp_ids, accum) + transform_func(result, values, comp_ids, is_datetimelike) return result diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 4b031afed3d43..fda1e51bd2b1f 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -574,16 +574,18 @@ def 
group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{dest_type2}}, ndim=2] values, ndarray[int64_t] labels, - ndarray[{{dest_type2}}, ndim=2] accum): + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size {{dest_type2}} val, min_val = 0 + ndarray[{{dest_type2}}, ndim=2] accum int64_t lab N, K = ( values).shape + accum = np.empty_like(values) accum.fill({{inf_val}}) with nogil: @@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, accum[lab, j] = min_val out[i, j] = accum[lab, j] # val = nan - else: + elif is_datetimelike: out[i, j] = {{nan_val}} @@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{dest_type2}}, ndim=2] values, ndarray[int64_t] labels, - ndarray[{{dest_type2}}, ndim=2] accum): + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size {{dest_type2}} val, max_val = 0 + ndarray[{{dest_type2}}, ndim=2] accum int64_t lab N, K = ( values).shape + accum = np.empty_like(values) accum.fill(-{{inf_val}}) with nogil: @@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, accum[lab, j] = max_val out[i, j] = accum[lab, j] # val = nan - else: + elif is_datetimelike: out[i, j] = {{nan_val}} {{endfor}} @@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, def group_cumprod_float64(float64_t[:, :] out, float64_t[:, :] values, int64_t[:] labels, - float64_t[:, :] accum): + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size float64_t val + float64_t[:, :] accum int64_t lab N, K = ( values).shape - accum = np.ones_like(accum) + accum = np.ones_like(values) with nogil: for i in range(N): @@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out, def group_cumsum(numeric[:, :] out, numeric[:, :] values, int64_t[:] labels, - numeric[:, :] accum): + is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size numeric val + numeric[:, :] accum int64_t lab N, K = ( values).shape - accum = np.zeros_like(accum) + accum = np.zeros_like(values) with nogil: for i in range(N): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 68689189a92b2..ffb6025163a6b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5507,13 +5507,13 @@ def test_cython_group_transform_algos(self): ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]), (pd.algos.group_cumsum, np.cumsum, dtypes)] + is_datetimelike = False for pd_op, np_op, dtypes in ops: for dtype in dtypes: data = np.array([[1], [2], [3], [4]], dtype=dtype) ans = np.zeros_like(data) - accum = np.array([[0]], dtype=dtype) labels = np.array([0, 0, 0, 0], dtype=np.int64) - pd_op(ans, data, labels, accum) + pd_op(ans, data, labels, is_datetimelike) self.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) @@ -5521,25 +5521,24 @@ def test_cython_group_transform_algos(self): labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') - accum = np.array([[0.0]]) actual = np.zeros_like(data) actual.fill(np.nan) - pd.algos.group_cumprod_float64(actual, data, labels, accum) + pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike) expected = np.array([1, 2, 6, np.nan, 24], 
dtype='float64') self.assert_numpy_array_equal(actual[:, 0], expected) - accum = np.array([[0.0]]) actual = np.zeros_like(data) actual.fill(np.nan) - pd.algos.group_cumsum(actual, data, labels, accum) + pd.algos.group_cumsum(actual, data, labels, is_datetimelike) expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') self.assert_numpy_array_equal(actual[:, 0], expected) # timedelta + is_datetimelike = True data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] - accum = np.array([[0]], dtype='int64') actual = np.zeros_like(data, dtype='int64') - pd.algos.group_cumsum(actual, data.view('int64'), labels, accum) + pd.algos.group_cumsum(actual, data.view('int64'), labels, + is_datetimelike) expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), np.timedelta64(5, 'ns')]) @@ -5965,12 +5964,9 @@ def test_cummin_cummax(self): df.loc[[2, 6], 'B'] = min_val expected.loc[[2, 3, 6, 7], 'B'] = min_val result = df.groupby('A').cummin() - - # TODO: GH 15019 - # overwriting NaNs - # tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - # tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # cummax expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) @@ -5983,13 +5979,9 @@ def test_cummin_cummax(self): df.loc[[2, 6], 'B'] = max_val expected.loc[[2, 3, 6, 7], 'B'] = max_val result = df.groupby('A').cummax() - - # TODO: GH 15019 - # overwriting NaNs - # tm.assert_frame_equal(result, expected) - + tm.assert_frame_equal(result, expected) expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - # tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Test nan in some values base_df.loc[[0, 2, 4, 6], 'B'] = np.nan From 4888ded7560b44ac90ccd2acc720f2d9f89b563c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Jan 2017 16:19:46 -0500 Subject: [PATCH 080/338] ENH: add MultiIndex.to_dataframe closes #12397 Author: Jeff Reback Closes #15216 from jreback/to_dataframe and squashes the following commits: b744fb5 [Jeff Reback] ENH: add MultiIndex.to_dataframe --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/indexes/multi.py | 24 +++++++++++++++++++ pandas/tests/indexes/test_multi.py | 37 ++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 04a85bf63a6f8..92f290b5ee0a9 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1460,6 +1460,7 @@ MultiIndex Components MultiIndex.set_levels MultiIndex.set_labels MultiIndex.to_hierarchical + MultiIndex.to_frame MultiIndex.is_lexsorted MultiIndex.droplevel MultiIndex.swaplevel diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a73bb24521b0d..86fa919fec304 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -131,7 +131,7 @@ Other enhancements - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack of sorting or an incorrect key. 
See :ref:`here ` - +- ``MultiIndex`` has gained a ``.to_frame()`` method to convert to a ``DataFrame`` (:issue:`12397`) - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) - ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 2afafaeb544d1..d8991a0982db2 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -827,6 +827,30 @@ def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) + def to_frame(self, index=True): + """ + Create a DataFrame with the columns the levels of the MultiIndex + + .. versionadded:: 0.20.0 + + Parameters + ---------- + index : boolean, default True + return this MultiIndex as the index + + Returns + ------- + DataFrame + """ + + from pandas import DataFrame + result = DataFrame({(name or level): self.get_level_values(level) + for name, level in + zip(self.names, range(len(self.levels)))}) + if index: + result.index = self + return result + def to_hierarchical(self, n_repeat, n_shuffle=1): """ Return a MultiIndex reshaped to conform to the diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 343078aeafaf0..7d9ceb526b912 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1348,6 +1348,43 @@ def test_format_sparse_config(self): warnings.filters = warn_filters + def test_to_frame(self): + tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] + + index = MultiIndex.from_tuples(tuples) + result = index.to_frame(index=False) + expected = DataFrame(tuples) + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + result = index.to_frame(index=False) + expected = DataFrame(tuples) + expected.columns = ['first', 'second'] + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_product([range(5), + pd.date_range('20130101', periods=3)]) + result = index.to_frame(index=False) + expected = DataFrame( + {0: np.repeat(np.arange(5, dtype='int64'), 3), + 1: np.tile(pd.date_range('20130101', periods=3), 5)}) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_product([range(5), + pd.date_range('20130101', periods=3)]) + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + def test_to_hierarchical(self): index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two')]) From adb7d52fac2cbe416aa03fe527ff27a5b6529960 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Jan 2017 18:47:25 -0500 Subject: [PATCH 081/338] DOC: Merge FAQ and gotcha (rebase of GH13768) Rebase and clean-up of https://github.com/pandas-dev/pandas/pull/13768 closes #9809 Author: Joris Van den Bossche Author: sinhrks Closes #15222 from jorisvandenbossche/pr/13768 and squashes the following commits: 7abb65b [Joris Van den Bossche] Make 'indexing may change dtype' more general 53a6970 [Joris Van den Bossche] Move HTML libraries 
gotchas to html io docs
7185dd4 [Joris Van den Bossche] Keep original gotchas label for references
c9e41cc [Joris Van den Bossche] Redo updates after ix deprecation
ab7fdf0 [Joris Van den Bossche] restore file name
f5e0af0 [sinhrks] DOC: Merge FAQ and gotcha
---
 doc/source/advanced.rst  | 133 ++++++++++++++++
 doc/source/basics.rst    |   2 +-
 doc/source/ecosystem.rst |   7 +-
 doc/source/faq.rst       | 115 --------------
 doc/source/gotchas.rst   | 323 ++++++++++-----------------------------
 doc/source/install.rst   |  11 +-
 doc/source/io.rst        |  82 +++++++++-
 7 files changed, 299 insertions(+), 374 deletions(-)
 delete mode 100644 doc/source/faq.rst

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 9247652388c5b..21ae9f1eb8409 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -844,3 +844,136 @@ Of course if you need integer based selection, then use ``iloc``
 .. ipython:: python

    dfir.iloc[0:5]
+
+Miscellaneous indexing FAQ
+--------------------------
+
+Integer indexing
+~~~~~~~~~~~~~~~~
+
+Label-based indexing with integer axis labels is a thorny topic. It has been
+discussed heavily on mailing lists and among various members of the scientific
+Python community. In pandas, our general viewpoint is that labels matter more
+than integer locations. Therefore, with an integer axis index *only*
+label-based indexing is possible with the standard tools like ``.loc``. The
+following code will generate exceptions:
+
+.. code-block:: python
+
+    s = pd.Series(range(5))
+    s[-1]
+    df = pd.DataFrame(np.random.randn(5, 4))
+    df
+    df.loc[-2:]
+
+This deliberate decision was made to prevent ambiguities and subtle bugs (many
+users reported finding bugs when the API change was made to stop "falling back"
+on position-based indexing).
+
+Non-monotonic indexes require exact matches
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If the index of a ``Series`` or ``DataFrame`` is monotonically increasing or decreasing, then the bounds
+of a label-based slice can be outside the range of the index, much like slice indexing a
+normal Python ``list``. Monotonicity of an index can be tested with the ``is_monotonic_increasing`` and
+``is_monotonic_decreasing`` attributes.
+
+.. ipython:: python
+
+    df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=range(5))
+    df.index.is_monotonic_increasing
+
+    # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4:
+    df.loc[0:4, :]
+
+    # slice bounds are outside the index, so an empty DataFrame is returned
+    df.loc[13:15, :]
+
+On the other hand, if the index is not monotonic, then both slice bounds must be
+*unique* members of the index.
+
+.. ipython:: python
+
+    df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=range(6))
+    df.index.is_monotonic_increasing
+
+    # OK because 2 and 4 are in the index
+    df.loc[2:4, :]
+
+.. code-block:: python
+
+    # 0 is not in the index
+    In [9]: df.loc[0:4, :]
+    KeyError: 0
+
+    # 3 is not a unique label
+    In [11]: df.loc[2:3, :]
+    KeyError: 'Cannot get right slice bound for non-unique label: 3'
+
+
+Endpoints are inclusive
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Compared with standard Python sequence slicing in which the slice endpoint is
+not inclusive, label-based slicing in pandas **is inclusive**. The primary
+reason for this is that it is often not possible to easily determine the
+"successor" or next element after a particular label in an index. For example,
+consider the following Series:
+
+.. ipython:: python
+
+    s = pd.Series(np.random.randn(6), index=list('abcdef'))
+    s
+
+Suppose we wished to slice from ``c`` to ``e``, using integers this would be
+
+.. ipython:: python
+
+    s[2:5]
+
+However, if you only had ``c`` and ``e``, determining the next element in the
+index can be somewhat complicated. For example, the following does not work:
+
+::
+
+    s.loc['c':'e'+1]
+
+A very common use case is to limit a time series to start and end at two
+specific dates. To enable this, we made the design decision to make label-based
+slicing include both endpoints:
+
+.. ipython:: python
+
+    s.loc['c':'e']
+
+This is most definitely a "practicality beats purity" sort of thing, but it is
+something to watch out for if you expect label-based slicing to behave exactly
+in the way that standard Python integer slicing works.
+
+
+Indexing potentially changes underlying Series dtype
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The different indexing operations can potentially change the dtype of a ``Series``.
+
+.. ipython:: python
+
+    series1 = pd.Series([1, 2, 3])
+    series1.dtype
+    res = series1[[0,4]]
+    res.dtype
+    res
+
+.. ipython:: python
+
+    series2 = pd.Series([True])
+    series2.dtype
+    res = series2.reindex_like(series1)
+    res.dtype
+    res
+
+This is because the (re)indexing operations above silently insert ``NaNs`` and the ``dtype``
+changes accordingly. This can cause some issues when using ``numpy`` ``ufuncs``
+such as ``numpy.logical_and``.
+
+See `this old issue `__ for a more
+detailed discussion.
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 6f711323695c8..f5f7c73223595 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1889,7 +1889,7 @@ gotchas
 Performing selection operations on ``integer`` type data can easily upcast the data to ``floating``.
 The dtype of the input data will be preserved in cases where ``nans`` are not introduced (starting in 0.11.0)
-See also :ref:`integer na gotchas `
+See also :ref:`Support for integer ``NA`` `
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 087b265ee83f2..5a7d6a11d293d 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -93,12 +93,13 @@ targets the IPython Notebook environment.

 `Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use.

-`Pandas-Qt `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Visualizing Data in Qt applications
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Spun off from the main pandas library, the `Pandas-Qt `__
+Spun off from the main pandas library, the `qtpandas `__
 library enables DataFrame visualization and manipulation in PyQt4 and PySide applications.

+
 .. _ecosystem.ide:

 IDE
diff --git a/doc/source/faq.rst b/doc/source/faq.rst
deleted file mode 100644
index b96660be97d71..0000000000000
--- a/doc/source/faq.rst
+++ /dev/null
@@ -1,115 +0,0 @@
-.. currentmodule:: pandas
-.. 
_faq: - -******************************** -Frequently Asked Questions (FAQ) -******************************** - -.. ipython:: python - :suppress: - - import numpy as np - np.random.seed(123456) - np.set_printoptions(precision=4, suppress=True) - import pandas as pd - pd.options.display.max_rows = 15 - import matplotlib - matplotlib.style.use('ggplot') - import matplotlib.pyplot as plt - plt.close('all') - -.. _df-memory-usage: - -DataFrame memory usage ----------------------- -As of pandas version 0.15.0, the memory usage of a dataframe (including -the index) is shown when accessing the ``info`` method of a dataframe. A -configuration option, ``display.memory_usage`` (see :ref:`options`), -specifies if the dataframe's memory usage will be displayed when -invoking the ``df.info()`` method. - -For example, the memory usage of the dataframe below is shown -when calling ``df.info()``: - -.. ipython:: python - - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] - n = 5000 - data = dict([ (t, np.random.randint(100, size=n).astype(t)) - for t in dtypes]) - df = pd.DataFrame(data) - df['categorical'] = df['object'].astype('category') - - df.info() - -The ``+`` symbol indicates that the true memory usage could be higher, because -pandas does not count the memory used by values in columns with -``dtype=object``. - -.. versionadded:: 0.17.1 - -Passing ``memory_usage='deep'`` will enable a more accurate memory usage report, -that accounts for the full usage of the contained objects. This is optional -as it can be expensive to do this deeper introspection. - -.. ipython:: python - - df.info(memory_usage='deep') - -By default the display option is set to ``True`` but can be explicitly -overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. - -The memory usage of each column can be found by calling the ``memory_usage`` -method. This returns a Series with an index represented by column names -and memory usage of each column shown in bytes. For the dataframe above, -the memory usage of each column and the total memory usage of the -dataframe can be found with the memory_usage method: - -.. ipython:: python - - df.memory_usage() - - # total memory usage of dataframe - df.memory_usage().sum() - -By default the memory usage of the dataframe's index is shown in the -returned Series, the memory usage of the index can be suppressed by passing -the ``index=False`` argument: - -.. ipython:: python - - df.memory_usage(index=False) - -The memory usage displayed by the ``info`` method utilizes the -``memory_usage`` method to determine the memory usage of a dataframe -while also formatting the output in human-readable units (base-2 -representation; i.e., 1KB = 1024 bytes). - -See also :ref:`Categorical Memory Usage `. - -Byte-Ordering Issues --------------------- -Occasionally you may have to deal with data that were created on a machine with -a different byte order than the one on which you are running Python. To deal -with this issue you should convert the underlying NumPy array to the native -system byte order *before* passing it to Series/DataFrame/Panel constructors -using something similar to the following: - -.. ipython:: python - - x = np.array(list(range(10)), '>i4') # big endian - newx = x.byteswap().newbyteorder() # force native byteorder - s = pd.Series(newx) - -See `the NumPy documentation on byte order -`__ for more -details. 
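
(A quick sketch of guarding against this in practice; the ``'>i4'`` dtype and
the byte-order check below are illustrative assumptions, not part of the
original FAQ text:)

.. code-block:: python

    import sys
    import numpy as np
    import pandas as pd

    x = np.arange(5, dtype='>i4')              # explicitly big-endian data
    native = {'little': '<', 'big': '>'}[sys.byteorder]
    if x.dtype.byteorder not in (native, '=', '|'):
        x = x.byteswap().newbyteorder()        # force native byte order first
    s = pd.Series(x)
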
- - -Visualizing Data in Qt applications ------------------------------------ - -There is no support for such visualization in pandas. However, the external -package `qtpandas `_ does -provide this functionality. diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index e582d4f42894b..11827fe2776cf 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -1,18 +1,92 @@ .. currentmodule:: pandas .. _gotchas: +******************************** +Frequently Asked Questions (FAQ) +******************************** + .. ipython:: python :suppress: import numpy as np + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) import pandas as pd - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 + import matplotlib + matplotlib.style.use('ggplot') + import matplotlib.pyplot as plt + plt.close('all') + +.. _df-memory-usage: + +DataFrame memory usage +---------------------- +As of pandas version 0.15.0, the memory usage of a dataframe (including +the index) is shown when accessing the ``info`` method of a dataframe. A +configuration option, ``display.memory_usage`` (see :ref:`options`), +specifies if the dataframe's memory usage will be displayed when +invoking the ``df.info()`` method. + +For example, the memory usage of the dataframe below is shown +when calling ``df.info()``: + +.. ipython:: python + + dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', + 'complex128', 'object', 'bool'] + n = 5000 + data = dict([ (t, np.random.randint(100, size=n).astype(t)) + for t in dtypes]) + df = pd.DataFrame(data) + df['categorical'] = df['object'].astype('category') + + df.info() + +The ``+`` symbol indicates that the true memory usage could be higher, because +pandas does not count the memory used by values in columns with +``dtype=object``. + +.. versionadded:: 0.17.1 + +Passing ``memory_usage='deep'`` will enable a more accurate memory usage report, +that accounts for the full usage of the contained objects. This is optional +as it can be expensive to do this deeper introspection. + +.. ipython:: python + + df.info(memory_usage='deep') +By default the display option is set to ``True`` but can be explicitly +overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. -******************* -Caveats and Gotchas -******************* +The memory usage of each column can be found by calling the ``memory_usage`` +method. This returns a Series with an index represented by column names +and memory usage of each column shown in bytes. For the dataframe above, +the memory usage of each column and the total memory usage of the +dataframe can be found with the memory_usage method: + +.. ipython:: python + + df.memory_usage() + + # total memory usage of dataframe + df.memory_usage().sum() + +By default the memory usage of the dataframe's index is shown in the +returned Series, the memory usage of the index can be suppressed by passing +the ``index=False`` argument: + +.. ipython:: python + + df.memory_usage(index=False) + +The memory usage displayed by the ``info`` method utilizes the +``memory_usage`` method to determine the memory usage of a dataframe +while also formatting the output in human-readable units (base-2 +representation; i.e., 1KB = 1024 bytes). + +See also :ref:`Categorical Memory Usage `. .. 
_gotchas.truth: @@ -214,175 +288,6 @@ and traded integer ``NA`` capability for a much simpler approach of using a special value in float and object arrays to denote ``NA``, and promoting integer arrays to floating when NAs must be introduced. -Integer indexing ----------------- - -Label-based indexing with integer axis labels is a thorny topic. It has been -discussed heavily on mailing lists and among various members of the scientific -Python community. In pandas, our general viewpoint is that labels matter more -than integer locations. Therefore, with an integer axis index *only* -label-based indexing is possible with the standard tools like ``.loc``. The -following code will generate exceptions: - -.. code-block:: python - - s = pd.Series(range(5)) - s[-1] - df = pd.DataFrame(np.random.randn(5, 4)) - df - df.loc[-2:] - -This deliberate decision was made to prevent ambiguities and subtle bugs (many -users reported finding bugs when the API change was made to stop "falling back" -on position-based indexing). - -Label-based slicing conventions -------------------------------- - -Non-monotonic indexes require exact matches -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If the index of a ``Series`` or ``DataFrame`` is monotonically increasing or decreasing, then the bounds -of a label-based slice can be outside the range of the index, much like slice indexing a -normal Python ``list``. Monotonicity of an index can be tested with the ``is_monotonic_increasing`` and -``is_monotonic_decreasing`` attributes. - -.. ipython:: python - - df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5))) - df.index.is_monotonic_increasing - - # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4: - df.loc[0:4, :] - - # slice is are outside the index, so empty DataFrame is returned - df.loc[13:15, :] - -On the other hand, if the index is not monotonic, then both slice bounds must be -*unique* members of the index. - -.. ipython:: python - - df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6))) - df.index.is_monotonic_increasing - - # OK because 2 and 4 are in the index - df.loc[2:4, :] - -.. code-block:: python - - # 0 is not in the index - In [9]: df.loc[0:4, :] - KeyError: 0 - - # 3 is not a unique label - In [11]: df.loc[2:3, :] - KeyError: 'Cannot get right slice bound for non-unique label: 3' - - -Endpoints are inclusive -~~~~~~~~~~~~~~~~~~~~~~~ - -Compared with standard Python sequence slicing in which the slice endpoint is -not inclusive, label-based slicing in pandas **is inclusive**. The primary -reason for this is that it is often not possible to easily determine the -"successor" or next element after a particular label in an index. For example, -consider the following Series: - -.. ipython:: python - - s = pd.Series(np.random.randn(6), index=list('abcdef')) - s - -Suppose we wished to slice from ``c`` to ``e``, using integers this would be - -.. ipython:: python - - s[2:5] - -However, if you only had ``c`` and ``e``, determining the next element in the -index can be somewhat complicated. For example, the following does not work: - -:: - - s.loc['c':'e'+1] - -A very common use case is to limit a time series to start and end at two -specific dates. To enable this, we made the design design to make label-based -slicing include both endpoints: - -.. 
ipython:: python - - s.loc['c':'e'] - -This is most definitely a "practicality beats purity" sort of thing, but it is -something to watch out for if you expect label-based slicing to behave exactly -in the way that standard Python integer slicing works. - -Miscellaneous indexing gotchas ------------------------------- - -Reindex potentially changes underlying Series dtype -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The use of ``reindex_like`` can potentially change the dtype of a ``Series``. - -.. ipython:: python - - series = pd.Series([1, 2, 3]) - x = pd.Series([True]) - x.dtype - x = pd.Series([True]).reindex_like(series) - x.dtype - -This is because ``reindex_like`` silently inserts ``NaNs`` and the ``dtype`` -changes accordingly. This can cause some issues when using ``numpy`` ``ufuncs`` -such as ``numpy.logical_and``. - -See the `this old issue `__ for a more -detailed discussion. - -Parsing Dates from Text Files ------------------------------ - -When parsing multiple text file columns into a single date column, the new date -column is prepended to the data and then `index_col` specification is indexed off -of the new set of columns rather than the original ones: - -.. ipython:: python - :suppress: - - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") - - with open('tmp.csv', 'w') as fh: - fh.write(data) - -.. ipython:: python - - print(open('tmp.csv').read()) - - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, - parse_dates=date_spec, - keep_date_col=True, - index_col=0) - - # index_col=0 refers to the combined column "nominal" and not the original - # first column of 'KORD' strings - - df - -.. ipython:: python - :suppress: - - import os - os.remove('tmp.csv') - Differences with NumPy ---------------------- @@ -403,78 +308,6 @@ where the data copying occurs. See `this link `__ for more information. -.. _html-gotchas: - -HTML Table Parsing ------------------- -There are some versioning issues surrounding the libraries that are used to -parse HTML tables in the top-level pandas io function ``read_html``. - -**Issues with** |lxml|_ - - * Benefits - - * |lxml|_ is very fast - - * |lxml|_ requires Cython to install correctly. - - * Drawbacks - - * |lxml|_ does *not* make any guarantees about the results of its parse - *unless* it is given |svm|_. - - * In light of the above, we have chosen to allow you, the user, to use the - |lxml|_ backend, but **this backend will use** |html5lib|_ if |lxml|_ - fails to parse - - * It is therefore *highly recommended* that you install both - |BeautifulSoup4|_ and |html5lib|_, so that you will still get a valid - result (provided everything else is valid) even if |lxml|_ fails. - -**Issues with** |BeautifulSoup4|_ **using** |lxml|_ **as a backend** - - * The above issues hold here as well since |BeautifulSoup4|_ is essentially - just a wrapper around a parser backend. - -**Issues with** |BeautifulSoup4|_ **using** |html5lib|_ **as a backend** - - * Benefits - - * |html5lib|_ is far more lenient than |lxml|_ and consequently deals - with *real-life markup* in a much saner way rather than just, e.g., - dropping an element without notifying you. - - * |html5lib|_ *generates valid HTML5 markup from invalid markup - automatically*. 
This is extremely important for parsing HTML tables, - since it guarantees a valid document. However, that does NOT mean that - it is "correct", since the process of fixing markup does not have a - single definition. - - * |html5lib|_ is pure Python and requires no additional build steps beyond - its own installation. - - * Drawbacks - - * The biggest drawback to using |html5lib|_ is that it is slow as - molasses. However consider the fact that many tables on the web are not - big enough for the parsing algorithm runtime to matter. It is more - likely that the bottleneck will be in the process of reading the raw - text from the URL over the web, i.e., IO (input-output). For very large - tables, this might not be true. - - -.. |svm| replace:: **strictly valid markup** -.. _svm: http://validator.w3.org/docs/help.html#validation_basics - -.. |html5lib| replace:: **html5lib** -.. _html5lib: https://github.com/html5lib/html5lib-python - -.. |BeautifulSoup4| replace:: **BeautifulSoup4** -.. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup - -.. |lxml| replace:: **lxml** -.. _lxml: http://lxml.de - Byte-Ordering Issues -------------------- diff --git a/doc/source/install.rst b/doc/source/install.rst index 4787b3356ee9f..158a6e5562b7a 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -285,7 +285,7 @@ Optional Dependencies okay.) * `BeautifulSoup4`_ and `lxml`_ * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ - * Only `lxml`_, although see :ref:`HTML reading gotchas ` + * Only `lxml`_, although see :ref:`HTML Table Parsing ` for reasons as to why you should probably **not** take this approach. .. warning:: @@ -294,14 +294,12 @@ Optional Dependencies `lxml`_ or `html5lib`_ or both. :func:`~pandas.read_html` will **not** work with *only* `BeautifulSoup4`_ installed. - * You are highly encouraged to read :ref:`HTML reading gotchas - `. It explains issues surrounding the installation and - usage of the above three libraries + * You are highly encouraged to read :ref:`HTML Table Parsing gotchas `. + It explains issues surrounding the installation and + usage of the above three libraries. * You may need to install an older version of `BeautifulSoup4`_: Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit Ubuntu/Debian - * Additionally, if you're using `Anaconda`_ you should definitely - read :ref:`the gotchas about HTML parsing libraries ` .. note:: @@ -318,7 +316,6 @@ Optional Dependencies .. _html5lib: https://github.com/html5lib/html5lib-python .. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup .. _lxml: http://lxml.de -.. _Anaconda: https://store.continuum.io/cshop/anaconda .. note:: diff --git a/doc/source/io.rst b/doc/source/io.rst index d4bbb3d114b52..4c78758a0e2d2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2043,9 +2043,8 @@ Reading HTML Content .. warning:: - We **highly encourage** you to read the :ref:`HTML parsing gotchas - ` regarding the issues surrounding the - BeautifulSoup4/html5lib/lxml parsers. + We **highly encourage** you to read the :ref:`HTML Table Parsing gotchas` + below regarding the issues surrounding the BeautifulSoup4/html5lib/lxml parsers. .. versionadded:: 0.12.0 @@ -2347,6 +2346,83 @@ Not escaped: Some browsers may not show a difference in the rendering of the previous two HTML tables. + +.. 
_io.html.gotchas: + +HTML Table Parsing Gotchas +'''''''''''''''''''''''''' + +There are some versioning issues surrounding the libraries that are used to +parse HTML tables in the top-level pandas io function ``read_html``. + +**Issues with** |lxml|_ + + * Benefits + + * |lxml|_ is very fast + + * |lxml|_ requires Cython to install correctly. + + * Drawbacks + + * |lxml|_ does *not* make any guarantees about the results of its parse + *unless* it is given |svm|_. + + * In light of the above, we have chosen to allow you, the user, to use the + |lxml|_ backend, but **this backend will use** |html5lib|_ if |lxml|_ + fails to parse + + * It is therefore *highly recommended* that you install both + |BeautifulSoup4|_ and |html5lib|_, so that you will still get a valid + result (provided everything else is valid) even if |lxml|_ fails. + +**Issues with** |BeautifulSoup4|_ **using** |lxml|_ **as a backend** + + * The above issues hold here as well since |BeautifulSoup4|_ is essentially + just a wrapper around a parser backend. + +**Issues with** |BeautifulSoup4|_ **using** |html5lib|_ **as a backend** + + * Benefits + + * |html5lib|_ is far more lenient than |lxml|_ and consequently deals + with *real-life markup* in a much saner way rather than just, e.g., + dropping an element without notifying you. + + * |html5lib|_ *generates valid HTML5 markup from invalid markup + automatically*. This is extremely important for parsing HTML tables, + since it guarantees a valid document. However, that does NOT mean that + it is "correct", since the process of fixing markup does not have a + single definition. + + * |html5lib|_ is pure Python and requires no additional build steps beyond + its own installation. + + * Drawbacks + + * The biggest drawback to using |html5lib|_ is that it is slow as + molasses. However consider the fact that many tables on the web are not + big enough for the parsing algorithm runtime to matter. It is more + likely that the bottleneck will be in the process of reading the raw + text from the URL over the web, i.e., IO (input-output). For very large + tables, this might not be true. + + +.. |svm| replace:: **strictly valid markup** +.. _svm: http://validator.w3.org/docs/help.html#validation_basics + +.. |html5lib| replace:: **html5lib** +.. _html5lib: https://github.com/html5lib/html5lib-python + +.. |BeautifulSoup4| replace:: **BeautifulSoup4** +.. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup + +.. |lxml| replace:: **lxml** +.. _lxml: http://lxml.de + + + + .. 
_io.excel: Excel files From 5e233a255f3992ab64231dae0c087d12f83c1f8a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Jan 2017 21:31:03 -0500 Subject: [PATCH 082/338] TST: incorrect locale specification for datetime format (#15233) closes #15215 --- pandas/tseries/tests/test_timeseries.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 4d286fc62764c..b5daf1ac0ec68 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1,4 +1,5 @@ # pylint: disable-msg=E1101,W0612 +import locale import calendar import operator import sys @@ -5369,7 +5370,12 @@ def test_to_datetime_format_integer(self): assert_series_equal(result, expected) def test_to_datetime_format_microsecond(self): - val = '01-Apr-2011 00:00:01.978' + + # these are locale dependent + lang, _ = locale.getlocale() + month_abbr = calendar.month_abbr[4] + val = '01-{}-2011 00:00:01.978'.format(month_abbr) + format = '%d-%b-%Y %H:%M:%S.%f' result = to_datetime(val, format=format) exp = datetime.strptime(val, format) From 971d70e953a10fec3dae378d130e4fde1715434e Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Thu, 26 Jan 2017 06:37:53 -0600 Subject: [PATCH 083/338] BUG/API: Return a Series from DataFrame.asof with no match (#15232) * BUG/API: Return a Series from DataFrame.asof no match * doc fixups --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/generic.py | 6 +++++- pandas/tests/frame/test_asof.py | 22 ++++++++++++++++++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 86fa919fec304..07c90b1e64f29 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -327,7 +327,7 @@ Other API Changes - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - + - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) .. _whatsnew_0200.deprecations: Deprecations diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2bb492055bcd0..869062bd231fe 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3783,7 +3783,8 @@ def asof(self, where, subset=None): .. versionadded:: 0.19.0 For DataFrame - If there is no good value, NaN is returned. 
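
A minimal sketch of the no-match behavior this change documents (the frame
mirrors the test added below):

.. code-block:: python

    import numpy as np
    import pandas as pd

    rng = pd.date_range('1/1/1990', periods=10, freq='53s')
    df = pd.DataFrame({'A': np.arange(10), 'B': np.arange(10)}, index=rng)

    # 'where' falls before the earliest index entry, so nothing matches;
    # the result is now a NaN-filled Series named after the lookup value
    df.asof('1989-12-31')
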
+        If there is no good value, NaN is returned for a Series, or
+        a Series of NaN values for a DataFrame
 
         Parameters
         ----------
@@ -3839,6 +3840,9 @@ def asof(self, where, subset=None):
             start = start.ordinal
 
         if where < start:
+            if not is_series:
+                from pandas import Series
+                return Series(index=self.columns, name=where)
             return np.nan
 
         # It's always much faster to use a *while* loop here for
diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py
index 47d56c39757db..f68219120b48e 100644
--- a/pandas/tests/frame/test_asof.py
+++ b/pandas/tests/frame/test_asof.py
@@ -3,9 +3,10 @@
 import nose
 import numpy as np
 
-from pandas import DataFrame, date_range
+from pandas import (DataFrame, date_range, Timestamp, Series,
+                    to_datetime)
 
-from pandas.util.testing import assert_frame_equal
+from pandas.util.testing import assert_frame_equal, assert_series_equal
 import pandas.util.testing as tm
 
 from .common import TestData
@@ -67,6 +68,23 @@ def test_subset(self):
 
         assert_frame_equal(result, expected)
 
+    def test_missing(self):
+        # GH 15118
+        # no match found - `where` value before earliest date in index
+        N = 10
+        rng = date_range('1/1/1990', periods=N, freq='53s')
+        df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
+                       index=rng)
+        result = df.asof('1989-12-31')
+
+        expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'))
+        assert_series_equal(result, expected)
+
+        result = df.asof(to_datetime(['1989-12-31']))
+        expected = DataFrame(index=to_datetime(['1989-12-31']),
+                             columns=['A', 'B'], dtype='float64')
+        assert_frame_equal(result, expected)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

From 8b82ee442964df861b7a228a2fd9369e2a8114fb Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Thu, 26 Jan 2017 09:57:22 -0500
Subject: [PATCH 084/338] BUG: TimedeltaIndex raising ValueError when boolean
 indexing (#14946)

closes #14946

Author: Matt Roeschke

Closes #15221 from mroeschke/fix_14946 and squashes the following commits:

b8ac04e [Matt Roeschke] BUG: TimedelaIndex raising ValueError when boolean indexing (#14946)
---
 doc/source/whatsnew/v0.20.0.txt        |  1 +
 pandas/tests/indexing/test_indexing.py | 19 +++++++++++++++++++
 pandas/tseries/tdi.py                  |  5 ++++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 07c90b1e64f29..2dc15f2fe0781 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -380,6 +380,7 @@ Bug Fixes
 
 - Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
 - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
+- Bug in ``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`)
 - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
 - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. 
(:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e44471efb9871..a6eb3c8a891e7 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -5498,6 +5498,25 @@ def test_none_coercion_mixed_dtypes(self): tm.assert_frame_equal(start_dataframe, exp) +class TestTimedeltaIndexing(tm.TestCase): + + def test_boolean_indexing(self): + # GH 14946 + df = pd.DataFrame({'x': range(10)}) + df.index = pd.to_timedelta(range(10), unit='s') + conditions = [df['x'] > 3, df['x'] == 3, df['x'] < 3] + expected_data = [[0, 1, 2, 3, 10, 10, 10, 10, 10, 10], + [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], + [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]] + for cond, data in zip(conditions, expected_data): + result = df.copy() + result.loc[cond, 'x'] = 10 + expected = pd.DataFrame(data, + index=pd.to_timedelta(range(10), unit='s'), + columns=['x']) + tm.assert_frame_equal(expected, result) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 2e050445f788a..c62e3fc40d4af 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -14,7 +14,7 @@ _ensure_int64) from pandas.types.missing import isnull from pandas.types.generic import ABCSeries -from pandas.core.common import _maybe_box, _values_from_object +from pandas.core.common import _maybe_box, _values_from_object, is_bool_indexer from pandas.core.index import Index, Int64Index import pandas.compat as compat @@ -673,6 +673,9 @@ def get_loc(self, key, method=None, tolerance=None): loc : int """ + if is_bool_indexer(key): + raise TypeError + if isnull(key): key = tslib.NaT From 4b762d19cc77003da75b20960d06d9bd4fe9d1be Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 27 Jan 2017 09:10:14 -0500 Subject: [PATCH 085/338] ENH: support MultiIndex and tuple hashing closes #15227 Author: Jeff Reback Author: Mike Graham Closes #15224 from jreback/mi_hash2 and squashes the following commits: 8b1d3f9 [Jeff Reback] not correctly hashing categorical in a MI 48a2402 [Jeff Reback] support for mixed type arrays 58f682d [Jeff Reback] memory optimization 0c13df7 [Mike Graham] Steal the algorithm used to combine hashes from tupleobject.c e8dd607 [Jeff Reback] add hash_tuples 44e9c7d [Mike Graham] wipSteal the algorithm used to combine hashes from tupleobject.c e507c4a [Jeff Reback] ENH: support MultiIndex and tuple hashing --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/hashing.py | 158 +++++++++++++++++++++++------ pandas/tools/tests/test_hashing.py | 74 ++++++++++---- 3 files changed, 178 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2dc15f2fe0781..626ed0b1bac61 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -145,6 +145,7 @@ Other enhancements - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). 
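
A minimal usage sketch of the hashing entry points this patch adds (the input
values mirror the new tests):

.. code-block:: python

    import pandas as pd
    from pandas.tools.hashing import hash_pandas_object, hash_tuples

    df = pd.DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]})

    # one uint64 hash per row, mixing in the index by default
    hash_pandas_object(df)

    # hash a MultiIndex / list of tuples without materializing tuple objects
    hash_tuples([(1, 'one'), (1, 'two'), (2, 'one')])
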
+- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 6d2186fdab34c..800e0b8815443 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -1,18 +1,49 @@ """ data hash pandas / numpy objects """ +import itertools import numpy as np -from pandas import _hash, Series, factorize, Categorical, Index +from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex +import pandas.core.algorithms as algos from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype) + is_datetime64_dtype, is_timedelta64_dtype, + is_list_like) # 16 byte long hashing key _default_hash_key = '0123456789123456' +def _combine_hash_arrays(arrays, num_items): + """ + Parameters + ---------- + arrays : generator + num_items : int + + Should be the same as CPython's tupleobject.c + """ + try: + first = next(arrays) + except StopIteration: + return np.array([], dtype=np.uint64) + + arrays = itertools.chain([first], arrays) + + mult = np.uint64(1000003) + out = np.zeros_like(first) + np.uint64(0x345678) + for i, a in enumerate(arrays): + inverse_i = num_items - i + out ^= a + out *= mult + mult += np.uint64(82520 + inverse_i + inverse_i) + assert i + 1 == num_items, 'Fed in wrong num_items' + out += np.uint64(97531) + return out + + def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, categorize=True): """ @@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, if hash_key is None: hash_key = _default_hash_key - def adder(h, hashed_to_add): - h = np.multiply(h, np.uint(3), h) - return np.add(h, hashed_to_add, h) + if isinstance(obj, MultiIndex): + return Series(hash_tuples(obj, encoding, hash_key), + dtype='uint64', copy=False) if isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64') - h = Series(h, index=obj, dtype='uint64') + categorize).astype('uint64', copy=False) + h = Series(h, index=obj, dtype='uint64', copy=False) elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64') + categorize).astype('uint64', copy=False) if index: - h = adder(h, hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values) - h = Series(h, index=obj.index, dtype='uint64') + index_iter = (hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values + for _ in [None]) + arrays = itertools.chain([h], index_iter) + h = _combine_hash_arrays(arrays, 2) + + h = Series(h, index=obj.index, dtype='uint64', copy=False) + elif isinstance(obj, ABCDataFrame): - cols = obj.iteritems() - first_series = next(cols)[1] - h = hash_array(first_series.values, encoding, - hash_key, categorize).astype('uint64') - for _, col in cols: - h = adder(h, hash_array(col.values, encoding, hash_key, - categorize)) + hashes = (hash_array(series.values) for _, series in obj.iteritems()) + num_items = len(obj.columns) if index: - h = adder(h, hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values) + 
index_hash_generator = (hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values # noqa + for _ in [None]) + num_items += 1 + hashes = itertools.chain(hashes, index_hash_generator) + h = _combine_hash_arrays(hashes, num_items) - h = Series(h, index=obj.index, dtype='uint64') + h = Series(h, index=obj.index, dtype='uint64', copy=False) else: raise TypeError("Unexpected type for hashing %s" % type(obj)) return h +def hash_tuples(vals, encoding='utf8', hash_key=None): + """ + Hash an MultiIndex / list-of-tuples efficiently + + .. versionadded:: 0.20.0 + + Parameters + ---------- + vals : MultiIndex, list-of-tuples, or single tuple + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + ndarray of hashed values array + """ + + is_tuple = False + if isinstance(vals, tuple): + vals = [vals] + is_tuple = True + elif not is_list_like(vals): + raise TypeError("must be convertible to a list-of-tuples") + + if not isinstance(vals, MultiIndex): + vals = MultiIndex.from_tuples(vals) + + # create a list-of-ndarrays + def get_level_values(num): + unique = vals.levels[num] # .values + labels = vals.labels[num] + filled = algos.take_1d(unique._values, labels, + fill_value=unique._na_value) + return filled + + vals = [get_level_values(level) + for level in range(vals.nlevels)] + + # hash the list-of-ndarrays + hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) + for l in vals) + h = _combine_hash_arrays(hashes, len(vals)) + if is_tuple: + h = h[0] + + return h + + def _hash_categorical(c, encoding, hash_key): """ Hash a Categorical by hashing its categories, and then mapping the codes @@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key): """ cat_hashed = hash_array(c.categories.values, encoding, hash_key, categorize=False).astype(np.uint64, copy=False) - return c.rename_categories(cat_hashed).astype(np.uint64) + return c.rename_categories(cat_hashed).astype(np.uint64, copy=False) def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): @@ -108,7 +191,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): Parameters ---------- - vals : ndarray + vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key @@ -124,6 +207,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ + if not hasattr(vals, 'dtype'): + raise TypeError("must pass a ndarray-like") + if hash_key is None: hash_key = _default_hash_key @@ -142,9 +228,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # manage it. 
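    # (sketch of the dispatch below: each fixed-width dtype is reinterpreted
    # as unsigned 64-bit integers -- bools widen, datetime64/timedelta64
    # expose their int64 payload, and other numerics of <= 8 bytes view
    # their raw bytes -- so the final bit-mixing step can treat every
    # input uniformly as uint64)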
if is_bool_array(vals): vals = vals.astype('u8') - elif ((is_datetime64_dtype(vals) or - is_timedelta64_dtype(vals) or - is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8): + elif (is_datetime64_dtype(vals) or + is_timedelta64_dtype(vals)): + vals = vals.view('i8').astype('u8', copy=False) + elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, @@ -156,7 +243,12 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) - vals = _hash.hash_object_array(vals, hash_key, encoding) + try: + vals = _hash.hash_object_array(vals, hash_key, encoding) + except TypeError: + # we have mixed types + vals = _hash.hash_object_array(vals.astype(str).astype(object), + hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 7913706f5658b..fb1f187ddd5c0 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -1,8 +1,8 @@ import numpy as np import pandas as pd -from pandas import DataFrame, Series, Index -from pandas.tools.hashing import hash_array, hash_pandas_object +from pandas import DataFrame, Series, Index, MultiIndex +from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object import pandas.util.testing as tm @@ -36,6 +36,18 @@ def test_hash_array(self): a = s.values tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) + def test_hash_array_mixed(self): + result1 = hash_array(np.array([3, 4, 'All'])) + result2 = hash_array(np.array(['3', '4', 'All'])) + result3 = hash_array(np.array([3, 4, 'All'], dtype=object)) + tm.assert_numpy_array_equal(result1, result2) + tm.assert_numpy_array_equal(result1, result3) + + def test_hash_array_errors(self): + + for val in [5, 'foo', pd.Timestamp('20130101')]: + self.assertRaises(TypeError, hash_array, val) + def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) @@ -53,7 +65,29 @@ def check_not_equal_with_index(self, obj): if not isinstance(obj, Index): a = hash_pandas_object(obj, index=True) b = hash_pandas_object(obj, index=False) - self.assertFalse((a == b).all()) + if len(obj): + self.assertFalse((a == b).all()) + + def test_hash_tuples(self): + tups = [(1, 'one'), (1, 'two'), (2, 'one')] + result = hash_tuples(tups) + expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values + self.assert_numpy_array_equal(result, expected) + + result = hash_tuples(tups[0]) + self.assertEqual(result, expected[0]) + + def test_hash_tuples_err(self): + + for val in [5, 'foo', pd.Timestamp('20130101')]: + self.assertRaises(TypeError, hash_tuples, val) + + def test_multiindex_unique(self): + mi = MultiIndex.from_tuples([(118, 472), (236, 118), + (51, 204), (102, 51)]) + self.assertTrue(mi.is_unique) + result = hash_pandas_object(mi) + self.assertTrue(result.is_unique) def test_hash_pandas_object(self): @@ -65,14 +99,27 @@ def test_hash_pandas_object(self): Series(['a', np.nan, 'c']), Series(['a', None, 'c']), Series([True, False, True]), + Series(), Index([1, 2, 3]), Index([True, False, True]), DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), + DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), - 
tm.makeTimedeltaIndex()]: + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range('20130101', + periods=3, tz='US/Eastern')), + MultiIndex.from_product( + [range(5), + ['foo', 'bar', 'baz'], + pd.date_range('20130101', periods=2)]), + MultiIndex.from_product( + [pd.CategoricalIndex(list('aabc')), + range(3)])]: self.check_equal(obj) self.check_not_equal_with_index(obj) @@ -107,7 +154,7 @@ def test_categorical_consistency(self): tm.assert_series_equal(h1, h2) tm.assert_series_equal(h1, h3) - def test_errors(self): + def test_pandas_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: def f(): @@ -131,23 +178,6 @@ def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') self.assertRaises(ValueError, f) - def test_unsupported_objects(self): - - # mixed objects are not supported - obj = Series(['1', 2, 3]) - - def f(): - hash_pandas_object(obj) - self.assertRaises(TypeError, f) - - # MultiIndex are represented as tuples - obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( - [('a', 1), ('a', 2), ('b', 1)])) - - def f(): - hash_pandas_object(obj) - self.assertRaises(TypeError, f) - def test_alread_encoded(self): # if already encoded then ok From fb1f384f7aa076240a6f966c4b080ef8877d5855 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 27 Jan 2017 09:27:16 -0500 Subject: [PATCH 086/338] PERF: correctly report memory used by Index's Author: Jeff Reback Closes #15237 from jreback/memory and squashes the following commits: d77c002 [Jeff Reback] PERF: correctly report memory used by Index's --- doc/source/whatsnew/v0.20.0.txt | 37 ++++++++++++++++++++++++ pandas/core/base.py | 1 + pandas/index.pyx | 9 ++++++ pandas/indexes/base.py | 8 +++++ pandas/indexes/multi.py | 8 ++++- pandas/src/hashtable_class_helper.pxi.in | 19 ++++++++++++ pandas/tests/indexes/common.py | 20 +++++++++++++ pandas/tests/test_categorical.py | 8 +++-- 8 files changed, 106 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 626ed0b1bac61..afe2758db9ab1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -315,6 +315,43 @@ New Behavior: In [5]: df['a']['2011-12-31 23:59:59'] Out[5]: 1 +.. _whatsnew_0200.api_breaking.memory_usage: + +Memory Usage for Index is more Accurate +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, showing ``.memory_usage()`` on a pandas structure that has an index, would only include actual index values and not include structures that facilitated fast indexing. This will generally be different for ``Index`` and ``MultiIndex`` and less-so for other index types. (:issue:`15237`) + +Previous Behavior: + +.. code-block:: ipython + + In [8]: index = Index(['foo', 'bar', 'baz']) + + In [9]: index.memory_usage(deep=True) + Out[9]: 180 + + In [10]: index.get_loc('foo') + Out[10]: 0 + + In [11]: index.memory_usage(deep=True) + Out[11]: 180 + +New Behavior: + +.. code-block:: ipython + + In [8]: index = Index(['foo', 'bar', 'baz']) + + In [9]: index.memory_usage(deep=True) + Out[9]: 180 + + In [10]: index.get_loc('foo') + Out[10]: 0 + + In [11]: index.memory_usage(deep=True) + Out[11]: 260 + .. 
_whatsnew_0200.api: Other API Changes diff --git a/pandas/core/base.py b/pandas/core/base.py index 77272f7721b32..e7a79c3291a92 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1067,6 +1067,7 @@ def memory_usage(self, deep=False): v = self.values.nbytes if deep and is_object_dtype(self): v += lib.memory_usage_of_objects(self.values) + return v def factorize(self, sort=False, na_sentinel=-1): diff --git a/pandas/index.pyx b/pandas/index.pyx index d575defe17422..0c975d1775a03 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -203,6 +203,15 @@ cdef class IndexEngine: return result + def sizeof(self, deep=False): + """ return the sizeof our mapping """ + if not self.is_mapping_populated: + return 0 + return self.mapping.sizeof(deep=deep) + + def __sizeof__(self): + return self.sizeof() + property is_unique: def __get__(self): diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 2009a064afd73..bc2dce4e97e5b 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -536,6 +536,14 @@ def get_values(self): """ return the underlying data as an ndarray """ return self.values + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + # ops compat def tolist(self): """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d8991a0982db2..00ead012a916a 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -446,13 +446,19 @@ def _nbytes(self, deep=False): return the number of bytes in the underlying data deeply introspect the level data if deep=True + include the engine hashtable + *this is in internal routine* """ level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels)) label_nbytes = sum((i.nbytes for i in self.labels)) names_nbytes = sum((getsizeof(i) for i in self.names)) - return level_nbytes + label_nbytes + names_nbytes + result = level_nbytes + label_nbytes + names_nbytes + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result def _format_attrs(self): """ diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 1d3c4b2cb5889..93f9411dbc8a1 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -203,6 +203,7 @@ cdef class ObjectVector: cdef class HashTable: + pass {{py: @@ -237,6 +238,12 @@ cdef class {{name}}HashTable(HashTable): k = kh_get_{{dtype}}(self.table, key) return k != self.table.n_buckets + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + cpdef get_item(self, {{dtype}}_t val): cdef khiter_t k k = kh_get_{{dtype}}(self.table, val) @@ -464,6 +471,12 @@ cdef class StringHashTable(HashTable): kh_destroy_str(self.table) self.table = NULL + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(char *) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + cpdef get_item(self, object val): cdef: khiter_t k @@ -714,6 +727,12 @@ cdef class PyObjectHashTable(HashTable): k = kh_get_pymap(self.table, key) return k != self.table.n_buckets + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(size_t) + 
# vals + sizeof(uint32_t)) # flags + cpdef get_item(self, object val): cdef khiter_t k if val != val or val is None: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 63e9fe580d73d..5a482acf403cd 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -366,6 +366,26 @@ def test_compat(self): for ind in self.indices.values(): self.assertEqual(ind.tolist(), list(ind)) + def test_memory_usage(self): + for name, index in compat.iteritems(self.indices): + result = index.memory_usage() + if len(index): + index.get_loc(index[0]) + result2 = index.memory_usage() + result3 = index.memory_usage(deep=True) + + # RangeIndex doesn't use a hashtable engine + if not isinstance(index, RangeIndex): + self.assertTrue(result2 > result) + + if index.inferred_type == 'object': + self.assertTrue(result3 > result2) + + else: + + # we report 0 for no-length + self.assertEqual(result, 0) + def test_argsort(self): for k, ind in self.indices.items(): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2321467d04c79..745914d3e7ef5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1555,11 +1555,13 @@ def test_nbytes(self): def test_memory_usage(self): cat = pd.Categorical([1, 2, 3]) - self.assertEqual(cat.nbytes, cat.memory_usage()) - self.assertEqual(cat.nbytes, cat.memory_usage(deep=True)) + + # .categories is an index, so we include the hashtable + self.assertTrue(cat.nbytes > 0 and cat.nbytes <= cat.memory_usage()) + self.assertTrue(cat.nbytes > 0 and + cat.nbytes <= cat.memory_usage(deep=True)) cat = pd.Categorical(['foo', 'foo', 'bar']) - self.assertEqual(cat.nbytes, cat.memory_usage()) self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes) # sys.getsizeof will call the .memory_usage with From feaa7cdb81d59df3c6dac156b87887626ca2944b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sat, 28 Jan 2017 13:26:32 -0500 Subject: [PATCH 087/338] ERR: Timestamp replace compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #15240 Author: Sébastien de Menten Author: sdementen Author: Jeff Reback Closes #15248 from sdementen/timestamp-replace-compatibility and squashes the following commits: e70e75a [Jeff Reback] use tzinfo=object to accept None as a default parameter 52b6c2c [Sébastien de Menten] improve description of whatsnew item 9455ee9 [sdementen] fix for issue #15240 1a40777 [Sébastien de Menten] update what's new with bug fix 5263054 [Sébastien de Menten] fix typos + update doc d5c7b0a [Sébastien de Menten] update exception type raised when wrong argument to Timestamp.replace 2ac141a [sdementen] fix for issue #15240 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tseries/tests/test_timezones.py | 3 +- pandas/tslib.pyx | 75 +++++++++++++------------- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index afe2758db9ab1..f87fad051fad2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -416,6 +416,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``Timestamp.replace`` now raises ``TypeError`` when incorrect argument names are given; previously this raised ``ValueError`` (:issue:`15240`) - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in 
``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index db8cda5c76479..64787b6e4e79a 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -826,7 +826,6 @@ def test_date_range_span_dst_transition(self): def test_convert_datetime_list(self): dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern'), name='foo') - dr2 = DatetimeIndex(list(dr), name='foo') self.assert_index_equal(dr, dr2) self.assertEqual(dr.tz, dr2.tz) @@ -1198,7 +1197,7 @@ def test_replace(self): # error def f(): dt.replace(foo=5) - self.assertRaises(ValueError, f) + self.assertRaises(TypeError, f) def f(): dt.replace(hour=0.1) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 7c65ab5422309..dd2f5174a516b 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -650,18 +650,25 @@ class Timestamp(_Timestamp): astimezone = tz_convert - def replace(self, **kwds): + def replace(self, year=None, month=None, day=None, + hour=None, minute=None, second=None, microsecond=None, + nanosecond=None, tzinfo=object, fold=0): """ implements datetime.replace, handles nanoseconds Parameters ---------- - kwargs: key-value dict - - accepted keywords are: - year, month, day, hour, minute, second, microsecond, nanosecond, tzinfo - - values must be integer, or for tzinfo, a tz-convertible + year : int, optional + month : int, optional + day : int, optional + hour : int, optional + minute : int, optional + second : int, optional + microsecond : int, optional + nanosecond: int, optional + tzinfo : tz-convertible, optional + fold : int, optional, default is 0 + added in 3.6, NotImplemented Returns ------- @@ -671,14 +678,14 @@ class Timestamp(_Timestamp): cdef: pandas_datetimestruct dts int64_t value - object tzinfo, result, k, v + object _tzinfo, result, k, v _TSObject ts # set to naive if needed - tzinfo = self.tzinfo + _tzinfo = self.tzinfo value = self.value - if tzinfo is not None: - value = tz_convert_single(value, 'UTC', tzinfo) + if _tzinfo is not None: + value = tz_convert_single(value, 'UTC', _tzinfo) # setup components pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) @@ -692,27 +699,24 @@ class Timestamp(_Timestamp): "{v} for {k}".format(v=type(v), k=k)) return v - for k, v in kwds.items(): - if k == 'year': - dts.year = validate(k, v) - elif k == 'month': - dts.month = validate(k, v) - elif k == 'day': - dts.day = validate(k, v) - elif k == 'hour': - dts.hour = validate(k, v) - elif k == 'minute': - dts.min = validate(k, v) - elif k == 'second': - dts.sec = validate(k, v) - elif k == 'microsecond': - dts.us = validate(k, v) - elif k == 'nanosecond': - dts.ps = validate(k, v) * 1000 - elif k == 'tzinfo': - tzinfo = v - else: - raise ValueError("invalid name {} passed".format(k)) + if year is not None: + dts.year = validate('year', year) + if month is not None: + dts.month = validate('month', month) + if day is not None: + dts.day = validate('day', day) + if hour is not None: + dts.hour = validate('hour', hour) + if minute is not None: + dts.min = validate('minute', minute) + if second is not None: + dts.sec = validate('second', second) + if microsecond is not None: + dts.us = validate('microsecond', microsecond) + if nanosecond is not None: + dts.ps = validate('nanosecond', nanosecond) * 1000 + if tzinfo is not object: + _tzinfo = tzinfo # reconstruct & check bounds value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) @@ 
-720,11 +724,10 @@ class Timestamp(_Timestamp): _check_dts_bounds(&dts) # set tz if needed - if tzinfo is not None: - value = tz_convert_single(value, tzinfo, 'UTC') - - result = create_timestamp_from_ts(value, dts, tzinfo, self.freq) + if _tzinfo is not None: + value = tz_convert_single(value, _tzinfo, 'UTC') + result = create_timestamp_from_ts(value, dts, _tzinfo, self.freq) return result def isoformat(self, sep='T'): From 5e3db2a8b8e4130eb7495683f3aa33ca26224b62 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 29 Jan 2017 13:52:55 -0800 Subject: [PATCH 088/338] TST: MultiIndex Slicing maintains first level index (#12697) (#15255) --- pandas/tests/indexing/test_indexing.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a6eb3c8a891e7..f7fa07916ca74 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -5392,6 +5392,23 @@ def test_maybe_numeric_slice(self): expected = [1] self.assertEqual(result, expected) + def test_multiindex_slice_first_level(self): + # GH 12697 + freq = ['a', 'b', 'c', 'd'] + idx = pd.MultiIndex.from_product([freq, np.arange(500)]) + df = pd.DataFrame(list(range(2000)), index=idx, columns=['Test']) + df_slice = df.loc[pd.IndexSlice[:, 30:70], :] + result = df_slice.loc['a'] + expected = pd.DataFrame(list(range(30, 71)), + columns=['Test'], + index=range(30, 71)) + tm.assert_frame_equal(result, expected) + result = df_slice.loc['d'] + expected = pd.DataFrame(list(range(1530, 1571)), + columns=['Test'], + index=range(30, 71)) + tm.assert_frame_equal(result, expected) + class TestSeriesNoneCoercion(tm.TestCase): EXPECTED_RESULTS = [ From 97fd856bdc838f0a725ae640dbbbb5040b77743d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 30 Jan 2017 09:13:54 -0500 Subject: [PATCH 089/338] CLN: remove build some warnings CLN: from algos_rank_helper.pxi.in, algos_groupby_helper.pxi.in CLN: hashtable warns CLN: parser warnings closes #15190 Author: Jeff Reback Closes #15259 from jreback/warn and squashes the following commits: 12cc061 [Jeff Reback] CLN: remove some warnings from algos_rank_helper.pxi.in CLN: and algos_groupby_helper.pxi.in CLN: hashtable warns CLN: parser warnings --- pandas/parser.pyx | 47 ++++++++------------- pandas/src/algos_groupby_helper.pxi.in | 53 ++++++++++++++++++++---- pandas/src/algos_rank_helper.pxi.in | 8 +--- pandas/src/hashtable_class_helper.pxi.in | 2 + pandas/src/hashtable_func_helper.pxi.in | 13 ++++-- pandas/src/parser/io.c | 2 +- pandas/src/parser/tokenizer.c | 20 ++++----- 7 files changed, 86 insertions(+), 59 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index b6e5ad0d73b7c..23aee860b3108 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -716,11 +716,10 @@ cdef class TextReader: # header is now a list of lists, so field_count should use header[0] cdef: - size_t i, start, data_line, field_count, passed_count, hr, unnamed_count # noqa + Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa char *word object name - int status - Py_ssize_t size + int status, hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -1416,8 +1415,7 @@ cdef _string_box_factorize(parser_t *parser, int col, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 - Py_ssize_t i - size_t lines + Py_ssize_t i, lines coliter_t it const char *word = NULL ndarray[object] result @@ -1470,8 +1468,7 @@ cdef 
_string_box_utf8(parser_t *parser, int col, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 - Py_ssize_t i - size_t lines + Py_ssize_t i, lines coliter_t it const char *word = NULL ndarray[object] result @@ -1525,8 +1522,7 @@ cdef _string_box_decode(parser_t *parser, int col, char *encoding): cdef: int error, na_count = 0 - Py_ssize_t i, size - size_t lines + Py_ssize_t i, size, lines coliter_t it const char *word = NULL ndarray[object] result @@ -1586,8 +1582,7 @@ cdef _categorical_convert(parser_t *parser, int col, "Convert column data into codes, categories" cdef: int error, na_count = 0 - Py_ssize_t i, size - size_t lines + Py_ssize_t i, size, lines coliter_t it const char *word = NULL @@ -1691,7 +1686,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 - size_t i, lines + Py_ssize_t i, lines coliter_t it const char *word = NULL char *p_end @@ -1738,8 +1733,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int *na_count) nogil: cdef: int error, - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL char *p_end @@ -1801,7 +1795,7 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error - size_t i, lines + Py_ssize_t i, lines coliter_t it uint64_t *data ndarray result @@ -1837,8 +1831,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, uint64_t *data, uint_state *state) nogil: cdef: int error - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -1873,7 +1866,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 - size_t i, lines + Py_ssize_t i, lines coliter_t it int64_t *data ndarray result @@ -1902,8 +1895,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int64_t *data, int *na_count) nogil: cdef: int error - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -1939,7 +1931,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count - size_t lines = line_end - line_start + Py_ssize_t lines = line_end - line_start uint8_t *data cnp.ndarray[cnp.uint8_t, ndim=1] result @@ -1963,8 +1955,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, uint8_t *data, int *na_count) nogil: cdef: int error - size_t lines = line_end - line_start - size_t i + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -2004,7 +1995,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, const kh_str_t *false_hashset): cdef: int error, na_count = 0 - size_t i, lines + Py_ssize_t i, lines coliter_t it const char *word = NULL uint8_t *data @@ -2033,8 +2024,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int *na_count) nogil: cdef: int error = 0 - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -2249,8 +2239,7 @@ cdef _apply_converter(object f, parser_t *parser, int col, char* c_encoding): cdef: 
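        # (the recurring change in this cleanup: counters such as `lines`
        # move from size_t to the signed Py_ssize_t, since mixing unsigned
        # sizes with signed loop indices is what typically triggers the
        # signed/unsigned comparison warnings being removed here)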
int error - Py_ssize_t i - size_t lines + Py_ssize_t i, lines coliter_t it const char *word = NULL char *errors = "strict" @@ -2341,7 +2330,7 @@ def _to_structured_array(dict columns, object names, object usecols): cdef _fill_structured_column(char *dst, char* src, int elsize, int stride, int length, bint incref): cdef: - size_t i + Py_ssize_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index fda1e51bd2b1f..9552b4299fe6a 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -361,7 +361,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -407,7 +411,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -478,7 +486,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -492,7 +504,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, 0] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val @@ -541,8 +557,11 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: - + {{endif}} nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -556,7 +575,11 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, 0] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, 0] += 1 if val < minx[lab, 0]: minx[lab, 0] = val @@ -596,14 +619,19 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, continue for j in range(K): val = values[i, j] + + # val = nan + {{if name == 'int64'}} + if is_datetimelike and val == {{nan_val}}: + out[i, j] = {{nan_val}} + else: + {{else}} if val == val: + {{endif}} if val < accum[lab, j]: min_val = val accum[lab, j] = min_val out[i, j] = accum[lab, j] - # val = nan - elif is_datetimelike: - out[i, j] = {{nan_val}} @cython.boundscheck(False) @@ -633,14 +661,18 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, continue for j in range(K): val = values[i, j] + + {{if name == 'int64'}} + if is_datetimelike and val == {{nan_val}}: + out[i, j] = {{nan_val}} + else: + {{else}} if val == val: + {{endif}} if val > accum[lab, j]: max_val = val accum[lab, j] = max_val out[i, j] = accum[lab, j] - # val = nan - elif is_datetimelike: - out[i, j] = {{nan_val}} {{endfor}} @@ -738,7 +770,12 @@ def group_cumsum(numeric[:, :] out, continue for j in range(K): val = values[i, j] - if val == val: + + if numeric == float32_t or numeric == float64_t: + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + else: accum[lab, j] += val out[i, j] = accum[lab, j] diff --git 
a/pandas/src/algos_rank_helper.pxi.in b/pandas/src/algos_rank_helper.pxi.in index 7e7f819c7515f..aafffbf60f638 100644 --- a/pandas/src/algos_rank_helper.pxi.in +++ b/pandas/src/algos_rank_helper.pxi.in @@ -175,11 +175,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, count += 1.0 - {{if dtype == 'float64'}} if i == n - 1 or sorted_data[i + 1] != val: - {{else}} - if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: - {{endif}} if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -345,10 +341,8 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', {{if dtype == 'object'}} if j == k - 1 or are_diff(values[i, j + 1], val): - {{elif dtype == 'float64'}} - if j == k - 1 or values[i, j + 1] != val: {{else}} - if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if j == k - 1 or values[i, j + 1] != val: {{endif}} if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 93f9411dbc8a1..74c38dfdb393e 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -386,9 +386,11 @@ cdef class {{name}}HashTable(HashTable): val = values[i] # specific for groupby + {{if dtype != 'uint64'}} if val < 0: labels[i] = -1 continue + {{endif}} k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in index c292256767315..fa373905ef08a 100644 --- a/pandas/src/hashtable_func_helper.pxi.in +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -59,7 +59,12 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] + + {{if dtype == 'float64'}} if val == val or not dropna: + {{else}} + if True: + {{endif}} k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 @@ -85,7 +90,7 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna): int64_t[:] result_counts {{endif}} - int k + Py_ssize_t k table = kh_init_{{ttype}}() {{if dtype == 'object'}} @@ -133,11 +138,11 @@ def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'): {{endif}} cdef: - int ret = 0, k + int ret = 0 {{if dtype != 'object'}} {{dtype}}_t value {{endif}} - Py_ssize_t i, n = len(values) + Py_ssize_t k, i, n = len(values) kh_{{ttype}}_t * table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') @@ -230,7 +235,7 @@ def mode_{{dtype}}({{ctype}}[:] values): cdef: int count, max_count = 2 int j = -1 # so you can do += - int k + Py_ssize_t k kh_{{table_type}}_t *table ndarray[{{ctype}}] modes diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c index 562d6033ce3eb..4381ef19e991b 100644 --- a/pandas/src/parser/io.c +++ b/pandas/src/parser/io.c @@ -215,7 +215,7 @@ void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, retval = src->memmap + src->position; - if (src->position + nbytes > src->last_pos) { + if (src->position + (off_t)nbytes > src->last_pos) { // fewer than nbytes remaining *bytes_read = src->last_pos - src->position; } else { diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index b6428c8b76743..916f06d357473 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -622,7 +622,7 @@ static int parser_buffer_bytes(parser_t 
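/* The (int) casts added in the hunks below reconcile the parser's int-typed counters (self->lines, self->words_cap, self->stream_cap, ...) with size_t-typed arguments such as nbytes, nrows and line_limit, silencing signed/unsigned comparison warnings. */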
*self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ goto linelimit; \ } @@ -637,7 +637,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ goto linelimit; \ } @@ -1072,7 +1072,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { --i; buf--; // let's try this character again (HACK!) if (line_limit > 0 && - self->lines == start_lines + line_limit) { + self->lines == start_lines + (int)line_limit) { goto linelimit; } } @@ -1160,7 +1160,7 @@ static int parser_handle_eof(parser_t *self) { int parser_consume_rows(parser_t *self, size_t nrows) { int i, offset, word_deletions, char_count; - if (nrows > self->lines) { + if ((int)nrows > self->lines) { nrows = self->lines; } @@ -1197,7 +1197,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ - for (i = 0; i < self->lines - nrows + 1; ++i) { + for (i = 0; i < self->lines - (int)nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; self->line_fields[i] = self->line_fields[offset]; @@ -1224,7 +1224,7 @@ int parser_trim_buffers(parser_t *self) { /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; - if (new_cap < self->words_cap) { + if ((int)new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { @@ -1247,7 +1247,7 @@ int parser_trim_buffers(parser_t *self) { ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { + if ((int)new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "safe_realloc\n")); @@ -1275,7 +1275,7 @@ int parser_trim_buffers(parser_t *self) { /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; - if (new_cap < self->lines_cap) { + if ((int)new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int)); if (newptr == NULL) { @@ -1328,7 +1328,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { (int)nrows, self->datapos, self->datalen)); while (1) { - if (!all && self->lines - start_lines >= nrows) break; + if (!all && self->lines - start_lines >= (int)nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize); @@ -1986,7 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, return 0; } - if (number > int_max) { + if (number > (uint64_t)int_max) { state->seen_uint = 1; } From d5851578d5f53ebfc23caab02fea430101615648 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 30 Jan 2017 12:39:51 -0500 Subject: [PATCH 090/338] CLN: remove build warnings from tslib.pyx, saslib.pyx, some from lib.pyx (#15264) --- pandas/io/sas/saslib.pyx | 2 +- pandas/lib.pyx | 94 +++++++++++++++++++----------------- pandas/src/datetime_helper.h | 4 -- pandas/tslib.pyx | 
19 +++----- setup.py | 2 + 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx index a66d62ea41581..4396180da44cb 100644 --- a/pandas/io/sas/saslib.pyx +++ b/pandas/io/sas/saslib.pyx @@ -311,7 +311,7 @@ cdef class Parser(object): self.current_page_subheaders_count =\ self.parser._current_page_subheaders_count - cdef bint readline(self): + cdef readline(self): cdef: int offset, bit_offset, align_correction diff --git a/pandas/lib.pyx b/pandas/lib.pyx index fce6a3d03287e..b4724bc3dd59b 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -3,6 +3,7 @@ cimport numpy as np cimport cython import numpy as np import sys +cdef bint PY3 = (sys.version_info[0] >= 3) from numpy cimport * @@ -42,7 +43,6 @@ cdef extern from "Python.h": Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step, Py_ssize_t *slicelength) except -1 - cimport cpython isnan = np.isnan @@ -1568,62 +1568,65 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): int64_t cur_blkno Py_ssize_t i, start, stop, n, diff + object blkno list group_order dict group_slices int64_t[:] res_view n = blknos.shape[0] - if n > 0: - start = 0 - cur_blkno = blknos[start] + if n == 0: + return - if group == False: - for i in range(1, n): - if blknos[i] != cur_blkno: - yield cur_blkno, slice(start, i) + start = 0 + cur_blkno = blknos[start] - start = i - cur_blkno = blknos[i] + if group == False: + for i in range(1, n): + if blknos[i] != cur_blkno: + yield cur_blkno, slice(start, i) - yield cur_blkno, slice(start, n) - else: - group_order = [] - group_dict = {} - - for i in range(1, n): - if blknos[i] != cur_blkno: - if cur_blkno not in group_dict: - group_order.append(cur_blkno) - group_dict[cur_blkno] = [(start, i)] - else: - group_dict[cur_blkno].append((start, i)) - - start = i - cur_blkno = blknos[i] - - if cur_blkno not in group_dict: - group_order.append(cur_blkno) - group_dict[cur_blkno] = [(start, n)] - else: - group_dict[cur_blkno].append((start, n)) + start = i + cur_blkno = blknos[i] - for blkno in group_order: - slices = group_dict[blkno] - if len(slices) == 1: - yield blkno, slice(slices[0][0], slices[0][1]) + yield cur_blkno, slice(start, n) + else: + group_order = [] + group_dict = {} + + for i in range(1, n): + if blknos[i] != cur_blkno: + if cur_blkno not in group_dict: + group_order.append(cur_blkno) + group_dict[cur_blkno] = [(start, i)] else: - tot_len = sum(stop - start for start, stop in slices) - result = np.empty(tot_len, dtype=np.int64) - res_view = result + group_dict[cur_blkno].append((start, i)) - i = 0 - for start, stop in slices: - for diff in range(start, stop): - res_view[i] = diff - i += 1 + start = i + cur_blkno = blknos[i] - yield blkno, result + if cur_blkno not in group_dict: + group_order.append(cur_blkno) + group_dict[cur_blkno] = [(start, n)] + else: + group_dict[cur_blkno].append((start, n)) + + for blkno in group_order: + slices = group_dict[blkno] + if len(slices) == 1: + yield blkno, slice(slices[0][0], slices[0][1]) + else: + tot_len = sum([stop - start for start, stop in slices]) + result = np.empty(tot_len, dtype=np.int64) + res_view = result + + i = 0 + for start, stop in slices: + for diff in range(start, stop): + res_view[i] = diff + i += 1 + + yield blkno, result @cython.boundscheck(False) @@ -1670,7 +1673,7 @@ cpdef slice_canonize(slice s): Convert slice to canonical bounded form. 
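Note: `start`, `stop` and `step` are given explicit defaults (0, 0, 1) in the `cdef` declaration below so the C compiler does not warn that they may be used uninitialized.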
""" cdef: - Py_ssize_t start, stop, step, length + Py_ssize_t start = 0, stop = 0, step = 1, length if s.step is None: step = 1 @@ -1727,6 +1730,7 @@ cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) + return start, stop, step, length diff --git a/pandas/src/datetime_helper.h b/pandas/src/datetime_helper.h index 2b24028ff3d8c..bef4b4266c824 100644 --- a/pandas/src/datetime_helper.h +++ b/pandas/src/datetime_helper.h @@ -15,10 +15,6 @@ The full license is in the LICENSE file, distributed with this software. #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" -#if PY_MAJOR_VERSION >= 3 -#define PyInt_AS_LONG PyLong_AsLong -#endif - npy_int64 get_long_attr(PyObject *o, const char *attr) { npy_int64 long_val; PyObject *value = PyObject_GetAttrString(o, attr); diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index dd2f5174a516b..fc6e689a35d81 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -8,10 +8,8 @@ from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, from datetime cimport get_datetime64_value, get_timedelta64_value import numpy as np -# GH3363 -from sys import version_info -cdef bint PY2 = version_info[0] == 2 -cdef bint PY3 = not PY2 +import sys +cdef bint PY3 = (sys.version_info[0] >= 3) from cpython cimport ( PyTypeObject, @@ -24,11 +22,8 @@ from cpython cimport ( PyUnicode_AsUTF8String, ) - -# Cython < 0.17 doesn't have this in cpython cdef extern from "Python.h": cdef PyTypeObject *Py_TYPE(object) - int PySlice_Check(object) cdef extern from "datetime_helper.h": double total_seconds(object) @@ -2121,8 +2116,8 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace( hour=0, minute=0, second=0, microsecond=0) _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] -_MONTH_NUMBERS = dict((k, i) for i, k in enumerate(_MONTHS)) -_MONTH_ALIASES = dict((k + 1, v) for k, v in enumerate(_MONTHS)) +_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)} +_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} cpdef object _get_rule_month(object source, object default='DEC'): @@ -5529,8 +5524,8 @@ class TimeRE(dict): 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone - for tz in tz_names), + 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone + for tz in tz_names], 'Z'), '%': '%'}) base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) @@ -5553,7 +5548,7 @@ class TimeRE(dict): break else: return '' - regex = '|'.join(re_escape(stuff) for stuff in to_convert) + regex = '|'.join([re_escape(stuff) for stuff in to_convert]) regex = '(?P<%s>%s' % (directive, regex) return '%s)' % regex diff --git a/setup.py b/setup.py index 1c9972defabbe..0c4dd33a70482 100755 --- a/setup.py +++ b/setup.py @@ -462,6 +462,7 @@ def pxd(name): tseries_depends = ['pandas/src/datetime/np_datetime.h', 'pandas/src/datetime/np_datetime_strings.h', + 'pandas/src/datetime_helper.h', 'pandas/src/period_helper.h', 'pandas/src/datetime.pxd'] @@ -584,6 +585,7 @@ def pxd(name): ujson_ext = Extension('pandas.json', depends=['pandas/src/ujson/lib/ultrajson.h', + 'pandas/src/datetime_helper.h', 'pandas/src/numpy_helper.h'], sources=['pandas/src/ujson/python/ujson.c', 'pandas/src/ujson/python/objToJSON.c', From cfb916ebb690405d30810cdf9062f90d2ef98821 Mon Sep 17 00:00:00 2001 From: 
David Hoffman Date: Wed, 1 Feb 2017 15:42:58 -0500 Subject: [PATCH 091/338] BUG: Fix overflow error in cartesian_product When the numbers in `X` are large, it can cause an overflow error on Windows machines, where the native `int` is 32-bit. Switching to np.intp alleviates this problem. Other fixes would include switching to np.uint32 or np.uint64. closes #15234 Author: David Hoffman Closes #15265 from david-hoffman/patch-1 and squashes the following commits: c9c8d5e [David Hoffman] Update v0.19.2.txt d54583e [David Hoffman] Remove `test_large_input` because it's too big 47a6c6c [David Hoffman] Update test so that it will actually run on "normal" machine 7aeee85 [David Hoffman] Added tests for large numbers b196878 [David Hoffman] Fix overflow error in cartesian_product --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/util.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f87fad051fad2..34048b8cc372d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -444,6 +444,7 @@ Bug Fixes - Bug in compat for passing long integers to ``Timestamp.replace`` (:issue:`15030`) - Bug in ``.loc`` that would not return the correct dtype for scalar access for a DataFrame (:issue:`11617`) - Bug in ``GroupBy.get_group()`` failing with a categorical grouper (:issue:`15155`) +- Bug in ``pandas.tools.utils.cartesian_product()`` with large input can cause overflow on windows (:issue:`15265`) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 381e29283d417..8ec074fbf5950 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -58,7 +58,7 @@ def cartesian_product(X): if len(X) == 0: return [] - lenX = np.fromiter((len(x) for x in X), dtype=int) + lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) From ab09d2d696361843e20d5f3e2e94db9b6545b974 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Feb 2017 17:55:40 -0500 Subject: [PATCH 092/338] COMPAT: xarray 0.8.2 test compat w.r.t.
CategoricalIndex idempotency (#15285) closes #15282 --- ci/install_travis.sh | 2 +- ci/requirements-2.7.run | 2 +- ci/requirements-3.5.pip | 1 + ci/requirements-3.5.run | 1 - pandas/tests/test_generic.py | 19 +++++++++++-------- 5 files changed, 14 insertions(+), 11 deletions(-) create mode 100644 ci/requirements-3.5.pip diff --git a/ci/install_travis.sh b/ci/install_travis.sh index ded428c677f17..52b52d787aade 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -143,7 +143,7 @@ else echo "[pip installs]" REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip" if [ -e ${REQ} ]; then - pip install --upgrade -r $REQ + pip install -r $REQ fi # may have addtl installation instructions for this build diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index 2bfb8a3777fdf..b5fc919297c76 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -20,4 +20,4 @@ html5lib=1.0b2 beautiful-soup=4.2.1 statsmodels jinja2=2.8 -xarray +xarray=0.8.0 diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip new file mode 100644 index 0000000000000..0d9e44cf39fa4 --- /dev/null +++ b/ci/requirements-3.5.pip @@ -0,0 +1 @@ +xarray==0.9.1 diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index e15ca6079b4fe..ef354195c8f23 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -16,6 +16,5 @@ bottleneck sqlalchemy pymysql psycopg2 -xarray s3fs beautifulsoup4 diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index f7b7ae8c66382..0ca8ba47b8a8f 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -7,6 +7,7 @@ from numpy import nan import pandas as pd +from distutils.version import LooseVersion from pandas.types.common import is_scalar from pandas import (Index, Series, DataFrame, Panel, isnull, date_range, period_range, Panel4D) @@ -870,6 +871,7 @@ def test_describe_none(self): def test_to_xarray(self): tm._skip_if_no_xarray() + import xarray from xarray import DataArray s = Series([]) @@ -895,15 +897,16 @@ def testit(index, check_index_type=True, check_categorical=True): check_index_type=check_index_type, check_categorical=check_categorical) - for index in [tm.makeFloatIndex, tm.makeIntIndex, - tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeDateIndex, tm.makePeriodIndex, - tm.makeTimedeltaIndex]: - testit(index) + l = [tm.makeFloatIndex, tm.makeIntIndex, + tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex, + tm.makeTimedeltaIndex] + + if LooseVersion(xarray.__version__) >= '0.8.0': + l.append(tm.makeCategoricalIndex) - # not idempotent - testit(tm.makeCategoricalIndex, check_index_type=False, - check_categorical=False) + for index in l: + testit(index) s = Series(range(6)) s.index.name = 'foo' From a2e86ade8249ad2ecc79aaeb37603fd7122370f5 Mon Sep 17 00:00:00 2001 From: Michael Lamparski Date: Thu, 2 Feb 2017 15:26:12 -0500 Subject: [PATCH 093/338] BUG: Support empty dict-likes in replace() closes #15289 Author: Michael Lamparski Closes #15294 from ExpHP/bugfix-15289 and squashes the following commits: f349e0a [Michael Lamparski] BUG: Support empty dict-likes in replace() --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/generic.py | 6 +++--- pandas/tests/frame/test_replace.py | 10 ++++++++++ pandas/tests/series/test_replace.py | 6 ++++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 34048b8cc372d..d76a78c68fb73 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++
b/doc/source/whatsnew/v0.20.0.txt @@ -512,3 +512,4 @@ Bug Fixes - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) +- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 869062bd231fe..8074b167ff176 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -48,7 +48,7 @@ from pandas.tseries.frequencies import to_offset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.compat import (map, zip, lrange, string_types, +from pandas.compat import (map, zip, lzip, lrange, string_types, isidentifier, set_function_name) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg @@ -3509,7 +3509,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, regex = True items = list(compat.iteritems(to_replace)) - keys, values = zip(*items) + keys, values = lzip(*items) or ([], []) are_mappings = [is_dict_like(v) for v in values] @@ -3523,7 +3523,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, value_dict = {} for k, v in items: - keys, values = zip(*v.items()) + keys, values = lzip(*v.items()) or ([], []) if set(keys) & set(values): raise ValueError("Replacement not allowed with " "overlapping keys and values") diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index adc7af225588c..f46215105b375 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1055,3 +1055,13 @@ def test_replace_datetimetz(self): Timestamp('20130103', tz='US/Eastern')], 'B': [0, np.nan, 2]}) assert_frame_equal(result, expected) + + def test_replace_with_empty_dictlike(self): + # GH 15289 + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + assert_frame_equal(df, df.replace({})) + assert_frame_equal(df, df.replace(Series([]))) + + assert_frame_equal(df, df.replace({'b': {}})) + assert_frame_equal(df, df.replace(Series({'b': {}}))) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index d80328ea3863a..aa16f2cca9475 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -223,3 +223,9 @@ def test_replace2(self): self.assertTrue((ser[:5] == -1).all()) self.assertTrue((ser[6:10] == -1).all()) self.assertTrue((ser[20:30] == -1).all()) + + def test_replace_with_empty_dictlike(self): + # GH 15289 + s = pd.Series(list('abcd')) + tm.assert_series_equal(s, s.replace(dict())) + tm.assert_series_equal(s, s.replace(pd.Series([]))) From 951bbbb9ffd694fb5fe26de80fba0c254cc5bfa8 Mon Sep 17 00:00:00 2001 From: TrigonaMinima Date: Thu, 2 Feb 2017 20:13:54 -0500 Subject: [PATCH 094/338] TST: DatetimeIndex compiled together in test_datetime.py xref #14854 Author: TrigonaMinima Closes #15266 from TrigonaMinima/issue-14854-datetime and squashes the following commits: 6ee2bd9 [TrigonaMinima] TST: Splitting test_datetime.py into smaller chunks (gh14854) 415a748 [TrigonaMinima] TST: Moving DatetimeIndex related tests from test_timeseries.py and flake8 fixes c43c7de [TrigonaMinima] TST: proper naming of files 458d141 [TrigonaMinima] TST: splitting test_datetime.py 1ff0819 [TrigonaMinima] TST: fix flake8 errors - test_datetime.py (GH14854) 9311161 [TrigonaMinima] TST: reorg of DatetimeIndex tests from tseries/tests/test_base.py to 
test_datetime.py (GH14854) 54421a5 [TrigonaMinima] TST: reorg of DatetimeIndex tests from test_datetimelike.py to test_datetime.py (GH14854) f83814b [TrigonaMinima] TST: reorg of DatetimeIndex tests from test_timeseries.py to test_datetime.py --- pandas/tests/indexes/datetimes/__init__.py | 0 pandas/tests/indexes/datetimes/test_astype.py | 122 ++ .../indexes/datetimes/test_construction.py | 425 +++++ .../tests/indexes/datetimes/test_datetime.py | 836 +++++++++ .../tests/indexes/datetimes/test_indexing.py | 244 +++ pandas/tests/indexes/datetimes/test_misc.py | 333 ++++ .../tests/indexes/datetimes/test_missing.py | 51 + pandas/tests/indexes/datetimes/test_ops.py | 1073 ++++++++++++ pandas/tests/indexes/datetimes/test_setops.py | 168 ++ pandas/tests/indexes/test_datetimelike.py | 669 +------- pandas/tseries/tests/test_base.py | 897 +--------- pandas/tseries/tests/test_timeseries.py | 1521 +---------------- setup.py | 1 + 13 files changed, 3259 insertions(+), 3081 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/__init__.py create mode 100644 pandas/tests/indexes/datetimes/test_astype.py create mode 100644 pandas/tests/indexes/datetimes/test_construction.py create mode 100644 pandas/tests/indexes/datetimes/test_datetime.py create mode 100644 pandas/tests/indexes/datetimes/test_indexing.py create mode 100644 pandas/tests/indexes/datetimes/test_misc.py create mode 100644 pandas/tests/indexes/datetimes/test_missing.py create mode 100644 pandas/tests/indexes/datetimes/test_ops.py create mode 100644 pandas/tests/indexes/datetimes/test_setops.py diff --git a/pandas/tests/indexes/datetimes/__init__.py b/pandas/tests/indexes/datetimes/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py new file mode 100644 index 0000000000000..f64d18a69a093 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -0,0 +1,122 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import (DatetimeIndex, date_range, Series, NaT, Index, Timestamp, + Int64Index) + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_astype(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + + result = idx.astype(object) + expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object) + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index([1463356800000000000] + + [-9223372036854775808] * 3, dtype=np.int64) + tm.assert_index_equal(result, expected) + + rng = date_range('1/1/2000', periods=10) + result = rng.astype('i8') + self.assert_index_equal(result, Index(rng.asi8)) + self.assert_numpy_array_equal(result.values, rng.asi8) + + def test_astype_with_tz(self): + + # with tz + rng = date_range('1/1/2000', periods=10, tz='US/Eastern') + result = rng.astype('datetime64[ns]') + expected = (date_range('1/1/2000', periods=10, + tz='US/Eastern') + .tz_convert('UTC').tz_localize(None)) + tm.assert_index_equal(result, expected) + + # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex + result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) + expected = pd.Series( + ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(pd.date_range('2012-01-01', periods=3, + tz='US/Eastern')).astype(str) + expected = Series(['2012-01-01 00:00:00-05:00', + '2012-01-02 
00:00:00-05:00', + '2012-01-03 00:00:00-05:00'], + dtype=object) + tm.assert_series_equal(result, expected) + + def test_astype_str_compat(self): + # GH 13149, GH 13209 + # verify that we are returing NaT as a string (and not unicode) + + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + result = idx.astype(str) + expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object) + tm.assert_index_equal(result, expected) + + def test_astype_str(self): + # test astype string - #10442 + result = date_range('2012-01-01', periods=4, + name='test_name').astype(str) + expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', + '2012-01-04'], name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with tz and name + result = date_range('2012-01-01', periods=3, name='test_name', + tz='US/Eastern').astype(str) + expected = Index(['2012-01-01 00:00:00-05:00', + '2012-01-02 00:00:00-05:00', + '2012-01-03 00:00:00-05:00'], + name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and name + result = date_range('1/1/2011', periods=3, freq='H', + name='test_name').astype(str) + expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', + '2011-01-01 02:00:00'], + name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and timezone + result = date_range('3/6/2012 00:00', periods=2, freq='H', + tz='Europe/London', name='test_name').astype(str) + expected = Index(['2012-03-06 00:00:00+00:00', + '2012-03-06 01:00:00+00:00'], + dtype=object, name='test_name') + tm.assert_index_equal(result, expected) + + def test_astype_datetime64(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + + result = idx.astype('datetime64[ns]') + tm.assert_index_equal(result, idx) + self.assertFalse(result is idx) + + result = idx.astype('datetime64[ns]', copy=False) + tm.assert_index_equal(result, idx) + self.assertTrue(result is idx) + + idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST') + result = idx_tz.astype('datetime64[ns]') + expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]') + tm.assert_index_equal(result, expected) + + def test_astype_raises(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + + self.assertRaises(ValueError, idx.astype, float) + self.assertRaises(ValueError, idx.astype, 'timedelta64') + self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') + self.assertRaises(ValueError, idx.astype, 'datetime64') + self.assertRaises(ValueError, idx.astype, 'datetime64[D]') diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py new file mode 100644 index 0000000000000..ae4eb6ee397b6 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -0,0 +1,425 @@ +import numpy as np +from datetime import timedelta + +import pandas as pd +import pandas.util.testing as tm +from pandas.tslib import OutOfBoundsDatetime +from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range) + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_construction_with_alt(self): + + i = pd.date_range('20130101', periods=5, freq='H', tz='US/Eastern') + i2 = DatetimeIndex(i, dtype=i.dtype) + self.assert_index_equal(i, i2) + + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz=i.dtype.tz) + self.assert_index_equal(i, i2) + 
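# the i2 variants below rebuild the index from its tz-naive i8 values, passing dtype and/or tz explicitly; each round trip should reproduce the original tz-aware index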
+ i2 = DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype) + self.assert_index_equal(i, i2) + + i2 = DatetimeIndex( + i.tz_localize(None).asi8, dtype=i.dtype, tz=i.dtype.tz) + self.assert_index_equal(i, i2) + + # localize into the provided tz + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz='UTC') + expected = i.tz_localize(None).tz_localize('UTC') + self.assert_index_equal(i2, expected) + + # incompat tz/dtype + self.assertRaises(ValueError, lambda: DatetimeIndex( + i.tz_localize(None).asi8, dtype=i.dtype, tz='US/Pacific')) + + def test_construction_index_with_mixed_timezones(self): + # GH 11488 + # no tz results in DatetimeIndex + result = Index([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNone(result.tz) + + # same tz results in DatetimeIndex + result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], + name='idx') + exp = DatetimeIndex( + [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') + ], tz='Asia/Tokyo', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNotNone(result.tz) + self.assertEqual(result.tz, exp.tz) + + # same tz results in DatetimeIndex (DST) + result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), + Timestamp('2011-08-01 10:00', tz='US/Eastern')], + name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-08-01 10:00')], + tz='US/Eastern', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNotNone(result.tz) + self.assertEqual(result.tz, exp.tz) + + # different tz results in Index(dtype=object) + result = Index([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + name='idx') + exp = Index([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + dtype='object', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertFalse(isinstance(result, DatetimeIndex)) + + result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + name='idx') + exp = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + dtype='object', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertFalse(isinstance(result, DatetimeIndex)) + + # length = 1 + result = Index([Timestamp('2011-01-01')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNone(result.tz) + + # length = 1 with tz + result = Index( + [Timestamp('2011-01-01 10:00', tz='Asia/Tokyo')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00')], tz='Asia/Tokyo', + name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNotNone(result.tz) + self.assertEqual(result.tz, exp.tz) + + def test_construction_index_with_mixed_timezones_with_NaT(self): + # GH 11488 + result = Index([pd.NaT, Timestamp('2011-01-01'), + pd.NaT, Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'), + pd.NaT, 
Timestamp('2011-01-02')], name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNone(result.tz) + + # same tz results in DatetimeIndex + result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + pd.NaT, Timestamp('2011-01-02 10:00', + tz='Asia/Tokyo')], + name='idx') + exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00')], + tz='Asia/Tokyo', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNotNone(result.tz) + self.assertEqual(result.tz, exp.tz) + + # same tz results in DatetimeIndex (DST) + result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), + pd.NaT, + Timestamp('2011-08-01 10:00', tz='US/Eastern')], + name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT, + Timestamp('2011-08-01 10:00')], + tz='US/Eastern', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNotNone(result.tz) + self.assertEqual(result.tz, exp.tz) + + # different tz results in Index(dtype=object) + result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', + tz='US/Eastern')], + name='idx') + exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + dtype='object', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertFalse(isinstance(result, DatetimeIndex)) + + result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + pd.NaT, Timestamp('2011-01-02 10:00', + tz='US/Eastern')], name='idx') + exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + dtype='object', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertFalse(isinstance(result, DatetimeIndex)) + + # all NaT + result = Index([pd.NaT, pd.NaT], name='idx') + exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNone(result.tz) + + # all NaT with tz + result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') + exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + self.assertIsNotNone(result.tz) + self.assertEqual(result.tz, exp.tz) + + def test_construction_dti_with_mixed_timezones(self): + # GH 11488 (not changed, added explicit tests) + + # no tz results in DatetimeIndex + result = DatetimeIndex( + [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex( + [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + + # same tz results in DatetimeIndex + result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + Timestamp('2011-01-02 10:00', + tz='Asia/Tokyo')], + name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00')], + tz='Asia/Tokyo', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + + # same tz results in DatetimeIndex (DST) + result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='US/Eastern'), + 
Timestamp('2011-08-01 10:00', + tz='US/Eastern')], + name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-08-01 10:00')], + tz='US/Eastern', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + + # different tz coerces tz-naive to tz-awareIndex(dtype=object) + result = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00', + tz='US/Eastern')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 05:00'), + Timestamp('2011-01-02 10:00')], + tz='US/Eastern', name='idx') + self.assert_index_equal(result, exp, exact=True) + self.assertTrue(isinstance(result, DatetimeIndex)) + + # tz mismatch affecting to tz-aware raises TypeError/ValueError + + with tm.assertRaises(ValueError): + DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + name='idx') + + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): + DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='Asia/Tokyo', name='idx') + + with tm.assertRaises(ValueError): + DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), + Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='US/Eastern', name='idx') + + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): + # passing tz should results in DatetimeIndex, then mismatch raises + # TypeError + Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='Asia/Tokyo', name='idx') + + def test_construction_base_constructor(self): + arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + def test_construction_outofbounds(self): + # GH 13663 + dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1)] + exp = Index(dates, dtype=object) + # coerces to object + tm.assert_index_equal(Index(dates), exp) + + with tm.assertRaises(OutOfBoundsDatetime): + # can't create DatetimeIndex + DatetimeIndex(dates) + + def test_construction_with_ndarray(self): + # GH 5152 + dates = [datetime(2013, 10, 7), + datetime(2013, 10, 8), + datetime(2013, 10, 9)] + data = DatetimeIndex(dates, freq=pd.tseries.frequencies.BDay()).values + result = DatetimeIndex(data, freq=pd.tseries.frequencies.BDay()) + expected = DatetimeIndex(['2013-10-07', + '2013-10-08', + '2013-10-09'], + freq='B') + tm.assert_index_equal(result, expected) + + def test_constructor_coverage(self): + rng = date_range('1/1/2000', periods=10.5) + exp = date_range('1/1/2000', periods=10) + tm.assert_index_equal(rng, exp) + + self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', + periods='foo', freq='D') + + self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', + end='1/10/2000') + + self.assertRaises(ValueError, DatetimeIndex, '1/1/2000') + + # generator expression + gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) + result = DatetimeIndex(gen) + expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) + for i in range(10)]) + tm.assert_index_equal(result, expected) + + # NumPy string array + strings = 
np.array(['2000-01-01', '2000-01-02', '2000-01-03']) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype('O')) + tm.assert_index_equal(result, expected) + + from_ints = DatetimeIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # string with NaT + strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype('O')) + tm.assert_index_equal(result, expected) + + from_ints = DatetimeIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # non-conforming + self.assertRaises(ValueError, DatetimeIndex, + ['2000-01-01', '2000-01-02', '2000-01-04'], freq='D') + + self.assertRaises(ValueError, DatetimeIndex, start='2011-01-01', + freq='b') + self.assertRaises(ValueError, DatetimeIndex, end='2011-01-01', + freq='B') + self.assertRaises(ValueError, DatetimeIndex, periods=10, freq='D') + + def test_constructor_datetime64_tzformat(self): + # GH 6572 + tm._skip_if_no_pytz() + import pytz + # ISO 8601 format results in pytz.FixedOffset + for freq in ['AS', 'W-SUN']: + idx = date_range('2013-01-01T00:00:00-05:00', + '2016-01-01T23:59:59-05:00', freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(-300)) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='America/Lima') + self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range('2013-01-01T00:00:00+09:00', + '2016-01-01T23:59:59+09:00', freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(540)) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='Asia/Tokyo') + self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + tm._skip_if_no_dateutil() + + # Non ISO 8601 format results in dateutil.tz.tzoffset + for freq in ['AS', 'W-SUN']: + idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', + freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(-300)) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='America/Lima') + self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range('2013/1/1 0:00:00+9:00', + '2016/1/1 23:59:59+09:00', freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(540)) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='Asia/Tokyo') + self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + def test_constructor_dtype(self): + + # passing a dtype with a tz should localize + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], + dtype='datetime64[ns, US/Eastern]') + expected = DatetimeIndex(['2013-01-01', '2013-01-02'] + ).tz_localize('US/Eastern') + tm.assert_index_equal(idx, expected) + + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], + tz='US/Eastern') + tm.assert_index_equal(idx, expected) + + # if we already have a tz and its not the same, then raise + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], + dtype='datetime64[ns, US/Eastern]') + + self.assertRaises(ValueError, + lambda: DatetimeIndex(idx, + 
dtype='datetime64[ns]')) + + # this is effectively trying to convert tz's + self.assertRaises(TypeError, + lambda: DatetimeIndex(idx, + dtype='datetime64[ns, CET]')) + self.assertRaises(ValueError, + lambda: DatetimeIndex( + idx, tz='CET', + dtype='datetime64[ns, US/Eastern]')) + result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') + tm.assert_index_equal(idx, result) + + def test_constructor_name(self): + idx = DatetimeIndex(start='2000-01-01', periods=1, freq='A', + name='TEST') + self.assertEqual(idx.name, 'TEST') + + def test_000constructor_resolution(self): + # 2252 + t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1) + idx = DatetimeIndex([t1]) + + self.assertEqual(idx.nanosecond[0], t1.nanosecond) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py new file mode 100644 index 0000000000000..a69406804cd97 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -0,0 +1,836 @@ +import numpy as np +from datetime import date, timedelta, time + +import pandas as pd +import pandas.util.testing as tm +from pandas.compat import lrange +from pandas.compat.numpy import np_datetime64_compat +from pandas import (DatetimeIndex, Index, date_range, Series, DataFrame, + Timestamp, datetime, offsets, _np_version_under1p8) + +from pandas.util.testing import assert_series_equal, assert_almost_equal + +randn = np.random.randn + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_get_loc(self): + idx = pd.date_range('2000-01-01', periods=3) + + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(idx[1], method), 1) + self.assertEqual(idx.get_loc(idx[1].to_pydatetime(), method), 1) + self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + if method is not None: + self.assertEqual(idx.get_loc(idx[1], method, + tolerance=pd.Timedelta('0 days')), + 1) + + self.assertEqual(idx.get_loc('2000-01-01', method='nearest'), 0) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest'), 1) + + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance='1 day'), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance=pd.Timedelta('1D')), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance=np.timedelta64(1, 'D')), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance=timedelta(1)), 1) + with tm.assertRaisesRegexp(ValueError, 'must be convertible'): + idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') + with tm.assertRaises(KeyError): + idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') + + self.assertEqual(idx.get_loc('2000', method='nearest'), slice(0, 3)) + self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 3)) + + self.assertEqual(idx.get_loc('1999', method='nearest'), 0) + self.assertEqual(idx.get_loc('2001', method='nearest'), 2) + + with tm.assertRaises(KeyError): + idx.get_loc('1999', method='pad') + with tm.assertRaises(KeyError): + idx.get_loc('2001', method='backfill') + + with tm.assertRaises(KeyError): + idx.get_loc('foobar') + with tm.assertRaises(TypeError): + idx.get_loc(slice(2)) + + idx = pd.to_datetime(['2000-01-01', '2000-01-04']) + self.assertEqual(idx.get_loc('2000-01-02', method='nearest'), 0) + self.assertEqual(idx.get_loc('2000-01-03', method='nearest'), 1) + self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 2)) + + # time indexing + idx = pd.date_range('2000-01-01', 
periods=24, freq='H') + tm.assert_numpy_array_equal(idx.get_loc(time(12)), + np.array([12]), check_dtype=False) + tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), + np.array([]), check_dtype=False) + with tm.assertRaises(NotImplementedError): + idx.get_loc(time(12, 30), method='pad') + + def test_get_indexer(self): + idx = pd.date_range('2000-01-01', periods=3) + exp = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) + + target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', + '1 day 1 hour']) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), + np.array([-1, 0, 1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), + np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), + np.array([0, 1, 1], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', + tolerance=pd.Timedelta('1 hour')), + np.array([0, -1, 1], dtype=np.intp)) + with tm.assertRaises(ValueError): + idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') + + def test_roundtrip_pickle_with_tz(self): + + # GH 8367 + # round-trip of timezone + index = date_range('20130101', periods=3, tz='US/Eastern', name='foo') + unpickled = self.round_trip_pickle(index) + self.assert_index_equal(index, unpickled) + + def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): + # GH7774 + index = date_range('20130101', periods=3, tz='US/Eastern') + self.assertEqual(str(index.reindex([])[0].tz), 'US/Eastern') + self.assertEqual(str(index.reindex(np.array([]))[0].tz), 'US/Eastern') + + def test_time_loc(self): # GH8667 + from datetime import time + from pandas.index import _SIZE_CUTOFF + + ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) + key = time(15, 11, 30) + start = key.hour * 3600 + key.minute * 60 + key.second + step = 24 * 3600 + + for n in ns: + idx = pd.date_range('2014-11-26', periods=n, freq='S') + ts = pd.Series(np.random.randn(n), index=idx) + i = np.arange(start, n, step) + + tm.assert_numpy_array_equal(ts.index.get_loc(key), i, + check_dtype=False) + tm.assert_series_equal(ts[key], ts.iloc[i]) + + left, right = ts.copy(), ts.copy() + left[key] *= -10 + right.iloc[i] *= -10 + tm.assert_series_equal(left, right) + + def test_time_overflow_for_32bit_machines(self): + # GH8943. On some machines NumPy defaults to np.int32 (for example, + # 32-bit Linux machines). In the function _generate_regular_range + # found in tseries/index.py, `periods` gets multiplied by `strides` + # (which has value 1e9) and since the max value for np.int32 is ~2e9, + # and since those machines won't promote np.int32 to np.int64, we get + # overflow. 
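# np.int_ resolves to the platform's default integer (np.int32 on 32-bit systems), so np.int_(1000) below exercises exactly the dtype that used to overflow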
+ periods = np.int_(1000) + + idx1 = pd.date_range(start='2000', periods=periods, freq='S') + self.assertEqual(len(idx1), periods) + + idx2 = pd.date_range(end='2000', periods=periods, freq='S') + self.assertEqual(len(idx2), periods) + + def test_nat(self): + self.assertIs(DatetimeIndex([np.nan])[0], pd.NaT) + + def test_ufunc_coercions(self): + idx = date_range('2011-01-01', periods=3, freq='2D', name='x') + + delta = np.timedelta64(1, 'D') + for result in [idx + delta, np.add(idx, delta)]: + tm.assertIsInstance(result, DatetimeIndex) + exp = date_range('2011-01-02', periods=3, freq='2D', name='x') + tm.assert_index_equal(result, exp) + self.assertEqual(result.freq, '2D') + + for result in [idx - delta, np.subtract(idx, delta)]: + tm.assertIsInstance(result, DatetimeIndex) + exp = date_range('2010-12-31', periods=3, freq='2D', name='x') + tm.assert_index_equal(result, exp) + self.assertEqual(result.freq, '2D') + + delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'), + np.timedelta64(3, 'D')]) + for result in [idx + delta, np.add(idx, delta)]: + tm.assertIsInstance(result, DatetimeIndex) + exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'], + freq='3D', name='x') + tm.assert_index_equal(result, exp) + self.assertEqual(result.freq, '3D') + + for result in [idx - delta, np.subtract(idx, delta)]: + tm.assertIsInstance(result, DatetimeIndex) + exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'], + freq='D', name='x') + tm.assert_index_equal(result, exp) + self.assertEqual(result.freq, 'D') + + def test_week_of_month_frequency(self): + # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise + d1 = date(2002, 9, 1) + d2 = date(2013, 10, 27) + d3 = date(2012, 9, 30) + idx1 = DatetimeIndex([d1, d2]) + idx2 = DatetimeIndex([d3]) + result_append = idx1.append(idx2) + expected = DatetimeIndex([d1, d2, d3]) + tm.assert_index_equal(result_append, expected) + result_union = idx1.union(idx2) + expected = DatetimeIndex([d1, d3, d2]) + tm.assert_index_equal(result_union, expected) + + # GH 5115 + result = date_range("2013-1-1", periods=4, freq='WOM-1SAT') + dates = ['2013-01-05', '2013-02-02', '2013-03-02', '2013-04-06'] + expected = DatetimeIndex(dates, freq='WOM-1SAT') + tm.assert_index_equal(result, expected) + + def test_hash_error(self): + index = date_range('20010101', periods=10) + with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % + type(index).__name__): + hash(index) + + def test_stringified_slice_with_tz(self): + # GH2658 + import datetime + start = datetime.datetime.now() + idx = DatetimeIndex(start=start, freq="1d", periods=10) + df = DataFrame(lrange(10), index=idx) + df["2013-01-14 23:44:34.437768-05:00":] # no exception here + + def test_append_join_nondatetimeindex(self): + rng = date_range('1/1/2000', periods=10) + idx = Index(['a', 'b', 'c', 'd']) + + result = rng.append(idx) + tm.assertIsInstance(result[0], Timestamp) + + # it works + rng.join(idx, how='outer') + + def test_to_period_nofreq(self): + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.to_period) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], + freq='infer') + self.assertEqual(idx.freqstr, 'D') + expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', + '2000-01-03'], freq='D') + tm.assert_index_equal(idx.to_period(), expected) + + # GH 7606 + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + self.assertEqual(idx.freqstr, None) + tm.assert_index_equal(idx.to_period(), expected) + 
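# A minimal sketch of the NaT semantics the comparison tests below rely on:
#   idx = pd.DatetimeIndex(['2014-01-01', pd.NaT])
#   idx == idx  ->  array([ True, False])   # NaT never compares equal, even to itself
#   idx != idx  ->  array([False,  True])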
+ def test_comparisons_coverage(self): + rng = date_range('1/1/2000', periods=10) + + # raise TypeError for now + self.assertRaises(TypeError, rng.__lt__, rng[3].value) + + result = rng == list(rng) + exp = rng == rng + self.assert_numpy_array_equal(result, exp) + + def test_comparisons_nat(self): + + fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) + fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) + + didx1 = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, + '2014-05-01', '2014-07-01']) + didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT, + '2014-06-01', '2014-07-01']) + darr = np.array([np_datetime64_compat('2014-02-01 00:00Z'), + np_datetime64_compat('2014-03-01 00:00Z'), + np_datetime64_compat('nat'), np.datetime64('nat'), + np_datetime64_compat('2014-06-01 00:00Z'), + np_datetime64_compat('2014-07-01 00:00Z')]) + + if _np_version_under1p8: + # cannot test array because np.datetime64('nat') returns today's date + cases = [(fidx1, fidx2), (didx1, didx2)] + else: + cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] + + # Check pd.NaT is handled the same as np.nan + with tm.assert_produces_warning(None): + for idx1, idx2 in cases: + + result = idx1 < idx2 + expected = np.array([True, False, False, False, True, False]) + self.assert_numpy_array_equal(result, expected) + + result = idx2 > idx1 + expected = np.array([True, False, False, False, True, False]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= idx2 + expected = np.array([True, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx2 >= idx1 + expected = np.array([True, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 == idx2 + expected = np.array([False, False, False, False, False, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 != idx2 + expected = np.array([True, True, True, True, True, False]) + self.assert_numpy_array_equal(result, expected) + + with tm.assert_produces_warning(None): + for idx1, val in [(fidx1, np.nan), (didx1, pd.NaT)]: + result = idx1 < val + expected = np.array([False, False, False, False, False, False]) + self.assert_numpy_array_equal(result, expected) + result = idx1 > val + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= val + self.assert_numpy_array_equal(result, expected) + result = idx1 >= val + self.assert_numpy_array_equal(result, expected) + + result = idx1 == val + self.assert_numpy_array_equal(result, expected) + + result = idx1 != val + expected = np.array([True, True, True, True, True, True]) + self.assert_numpy_array_equal(result, expected) + + # Check pd.NaT is handled the same as np.nan + with tm.assert_produces_warning(None): + for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]: + result = idx1 < val + expected = np.array([True, False, False, False, False, False]) + self.assert_numpy_array_equal(result, expected) + result = idx1 > val + expected = np.array([False, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= val + expected = np.array([True, False, True, False, False, False]) + self.assert_numpy_array_equal(result, expected) + result = idx1 >= val + expected = np.array([False, False, True, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 == val + expected = np.array([False, False, True, False, False, False]) + self.assert_numpy_array_equal(result, expected) + +
result = idx1 != val + expected = np.array([True, True, False, True, True, True]) + self.assert_numpy_array_equal(result, expected) + + def test_map(self): + rng = date_range('1/1/2000', periods=10) + + f = lambda x: x.strftime('%Y%m%d') + result = rng.map(f) + exp = Index([f(x) for x in rng], dtype='= -1') + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_fill_value_with_timezone(self): + idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + name='xxx', tz='US/Eastern') + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', tz='US/Eastern') + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], + name='xxx', tz='US/Eastern') + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', tz='US/Eastern') + tm.assert_index_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + idx.take(np.array([1, -5])) + + def test_map_bug_1677(self): + index = DatetimeIndex(['2012-04-25 09:30:00.393000']) + f = index.asof + + result = index.map(f) + expected = Index([f(index[0])]) + tm.assert_index_equal(result, expected) + + def test_groupby_function_tuple_1677(self): + df = DataFrame(np.random.rand(100), + index=date_range("1/1/2000", periods=100)) + monthly_group = df.groupby(lambda x: (x.year, x.month)) + + result = monthly_group.mean() + tm.assertIsInstance(result.index[0], tuple) + + def test_append_numpy_bug_1681(self): + # another datetime64 bug + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + a = DataFrame() + c = DataFrame({'A': 'foo', 'B': dr}, index=dr) + + result = a.append(c) + self.assertTrue((result['B'] == dr).all()) + + def test_isin(self): + index = tm.makeDateIndex(4) + result = index.isin(index) + self.assertTrue(result.all()) + + result = index.isin(list(index)) + self.assertTrue(result.all()) + + assert_almost_equal(index.isin([index[2], 5]), + np.array([False, False, True, False])) + + def test_time(self): + rng = pd.date_range('1/1/2000', freq='12min', periods=10) + result = pd.Index(rng).time + expected = [t.time() for t in rng] + self.assertTrue((result == expected).all()) + + def test_date(self): + rng = pd.date_range('1/1/2000', freq='12H', periods=10) + result = pd.Index(rng).date + expected = [t.date() for t in rng] + self.assertTrue((result == expected).all()) + + def test_does_not_convert_mixed_integer(self): + df = tm.makeCustomDataframe(10, 10, + data_gen_f=lambda *args, **kwargs: randn(), + r_idx_type='i', c_idx_type='dt') + cols = df.columns.join(df.index, how='outer') + joined = cols.join(df.columns) + self.assertEqual(cols.dtype, np.dtype('O')) + self.assertEqual(cols.dtype, joined.dtype) + tm.assert_numpy_array_equal(cols.values, joined.values) + + def 
test_slice_keeps_name(self): + # GH4226 + st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') + et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') + dr = pd.date_range(st, et, freq='H', name='timebucket') + self.assertEqual(dr[1:].name, dr.name) + + def test_join_self(self): + index = date_range('1/1/2000', periods=10) + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = index.join(index, how=kind) + self.assertIs(index, joined) + + def assert_index_parameters(self, index): + assert index.freq == '40960N' + assert index.inferred_freq == '40960N' + + def test_ns_index(self): + nsamples = 400 + ns = int(1e9 / 24414) + dtstart = np.datetime64('2012-09-20T00:00:00') + + dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, 'ns') + freq = ns * offsets.Nano() + index = pd.DatetimeIndex(dt, freq=freq, name='time') + self.assert_index_parameters(index) + + new_index = pd.DatetimeIndex(start=index[0], end=index[-1], + freq=index.freq) + self.assert_index_parameters(new_index) + + def test_join_with_period_index(self): + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=lambda *args: np.random.randint(2), + c_idx_type='p', r_idx_type='dt') + s = df.iloc[:5, 0] + joins = 'left', 'right', 'inner', 'outer' + + for join in joins: + with tm.assertRaisesRegexp(ValueError, 'can only call with other ' + 'PeriodIndex-ed objects'): + df.columns.join(s.index, how=join) + + def test_factorize(self): + idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', + '2014-03', '2014-03']) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + # tz must be preserved + idx1 = idx1.tz_localize('Asia/Tokyo') + exp_idx = exp_idx.tz_localize('Asia/Tokyo') + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01']) + + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) + exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) + arr, idx = idx2.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) + exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01']) + arr, idx = idx2.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + # freq must be preserved + idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo') + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + + def test_factorize_tz(self): + # GH 13750 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) + idx = base.repeat(5) + + exp_arr = np.arange(100, dtype=np.intp).repeat(5) + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(res, base) + + def test_factorize_dst(self): + # GH 13750 + idx = pd.date_range('2016-11-06', freq='H', periods=12, + tz='US/Eastern') + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() 
+ self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + + idx = pd.date_range('2016-06-13', freq='H', periods=12, + tz='US/Eastern') + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), + date_range('2014-01-01', periods=20, freq='MS')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) + assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], + SLC[13:8:-1]) + assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp( + '2014-10-01'):-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], + SLC[13:8:-1]) + assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], + SLC[13:8:-1]) + + assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), + date_range('2014-01-01', periods=20, freq='MS')) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + + def test_slice_bounds_empty(self): + # GH 14354 + empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') + + right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') + exp = Timestamp('2015-01-02 23:59:59.999999999') + self.assertEqual(right, exp) + + left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') + exp = Timestamp('2015-01-02 00:00:00') + self.assertEqual(left, exp) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py new file mode 100644 index 0000000000000..5b6bcffe71856 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -0,0 +1,244 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +import pandas.compat as compat +from pandas import notnull, Index, DatetimeIndex, datetime, date_range + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_where_other(self): + + # other is ndarray or Index + i = pd.date_range('20130101', periods=3, tz='US/Eastern') + + for arr in [np.nan, pd.NaT]: + result = i.where(notnull(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2), i2.values) + tm.assert_index_equal(result, i2) + + def test_where_tz(self): + i = pd.date_range('20130101', periods=3, tz='US/Eastern') + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2)) + expected 
= i2 + tm.assert_index_equal(result, expected) + + def test_insert(self): + idx = DatetimeIndex( + ['2000-01-04', '2000-01-01', '2000-01-02'], name='idx') + + result = idx.insert(2, datetime(2000, 1, 5)) + exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05', + '2000-01-02'], name='idx') + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, 'inserted') + expected = Index([datetime(2000, 1, 4), 'inserted', + datetime(2000, 1, 1), + datetime(2000, 1, 2)], name='idx') + self.assertNotIsInstance(result, DatetimeIndex) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + + idx = date_range('1/1/2000', periods=3, freq='M', name='idx') + + # preserve freq + expected_0 = DatetimeIndex(['1999-12-31', '2000-01-31', '2000-02-29', + '2000-03-31'], name='idx', freq='M') + expected_3 = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', + '2000-04-30'], name='idx', freq='M') + + # reset freq to None + expected_1_nofreq = DatetimeIndex(['2000-01-31', '2000-01-31', + '2000-02-29', + '2000-03-31'], name='idx', + freq=None) + expected_3_nofreq = DatetimeIndex(['2000-01-31', '2000-02-29', + '2000-03-31', + '2000-01-02'], name='idx', + freq=None) + + cases = [(0, datetime(1999, 12, 31), expected_0), + (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq)] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + # reset freq to None + result = idx.insert(3, datetime(2000, 1, 2)) + expected = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', + '2000-01-02'], name='idx', freq=None) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertTrue(result.freq is None) + + # GH 7299 + tm._skip_if_no_pytz() + import pytz + + idx = date_range('1/1/2000', periods=3, freq='D', tz='Asia/Tokyo', + name='idx') + with tm.assertRaises(ValueError): + result = idx.insert(3, pd.Timestamp('2000-01-04')) + with tm.assertRaises(ValueError): + result = idx.insert(3, datetime(2000, 1, 4)) + with tm.assertRaises(ValueError): + result = idx.insert(3, pd.Timestamp('2000-01-04', tz='US/Eastern')) + with tm.assertRaises(ValueError): + result = idx.insert(3, + datetime(2000, 1, 4, + tzinfo=pytz.timezone('US/Eastern'))) + + for tz in ['US/Pacific', 'Asia/Singapore']: + idx = date_range('1/1/2000 09:00', periods=6, freq='H', tz=tz, + name='idx') + # preserve freq + expected = date_range('1/1/2000 09:00', periods=7, freq='H', tz=tz, + name='idx') + for d in [pd.Timestamp('2000-01-01 15:00', tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15))]: + + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) + + expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 10:00', + '2000-01-01 11:00', + '2000-01-01 12:00', '2000-01-01 13:00', + '2000-01-01 14:00', + '2000-01-01 10:00'], name='idx', + tz=tz, freq=None) + # reset freq to None + for d in [pd.Timestamp('2000-01-01 10:00', tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10))]: + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, 
expected.name) + self.assertTrue(result.freq is None) + self.assertEqual(result.tz, expected.tz) + + def test_delete(self): + idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') + + # preserve freq + expected_0 = date_range(start='2000-02-01', periods=4, freq='M', + name='idx') + expected_4 = date_range(start='2000-01-01', periods=4, freq='M', + name='idx') + + # reset freq to None + expected_1 = DatetimeIndex(['2000-01-31', '2000-03-31', '2000-04-30', + '2000-05-31'], freq=None, name='idx') + + cases = {0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + with tm.assertRaises((IndexError, ValueError)): + # either, depending on numpy version + result = idx.delete(5) + + for tz in [None, 'Asia/Tokyo', 'US/Pacific']: + idx = date_range(start='2000-01-01 09:00', periods=10, freq='H', + name='idx', tz=tz) + + expected = date_range(start='2000-01-01 10:00', periods=9, + freq='H', name='idx', tz=tz) + result = idx.delete(0) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freqstr, 'H') + self.assertEqual(result.tz, expected.tz) + + expected = date_range(start='2000-01-01 09:00', periods=9, + freq='H', name='idx', tz=tz) + result = idx.delete(-1) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freqstr, 'H') + self.assertEqual(result.tz, expected.tz) + + def test_delete_slice(self): + idx = date_range(start='2000-01-01', periods=10, freq='D', name='idx') + + # preserve freq + expected_0_2 = date_range(start='2000-01-04', periods=7, freq='D', + name='idx') + expected_7_9 = date_range(start='2000-01-01', periods=7, freq='D', + name='idx') + + # reset freq to None + expected_3_5 = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', + '2000-01-07', '2000-01-08', '2000-01-09', + '2000-01-10'], freq=None, name='idx') + + cases = {(0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + for tz in [None, 'Asia/Tokyo', 'US/Pacific']: + ts = pd.Series(1, index=pd.date_range( + '2000-01-01 09:00', periods=10, freq='H', name='idx', tz=tz)) + # preserve freq + result = ts.drop(ts.index[:5]).index + expected = pd.date_range('2000-01-01 14:00', periods=5, freq='H', + name='idx', tz=tz) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) + + # reset freq to None + result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index + expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 11:00', + '2000-01-01 13:00', + '2000-01-01 15:00', '2000-01-01 17:00'], + freq=None, name='idx', tz=tz) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) diff --git 
a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py new file mode 100644 index 0000000000000..3dfe95fa77b85 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -0,0 +1,333 @@ +import numpy as np + +import pandas.lib as lib +import pandas.util.testing as tm +from pandas import Float64Index, date_range, Timestamp +from pandas import (Index, DatetimeIndex, datetime, offsets, to_datetime, + Series, DataFrame) + + +class TestDateTimeIndexToJulianDate(tm.TestCase): + + def test_1700(self): + r1 = Float64Index([2345897.5, 2345898.5, 2345899.5, 2345900.5, + 2345901.5]) + r2 = date_range(start=Timestamp('1710-10-01'), periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + r1 = Float64Index([2451601.5, 2451602.5, 2451603.5, 2451604.5, + 2451605.5]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + r1 = Float64Index( + [2451601.5, 2451601.5416666666666666, 2451601.5833333333333333, + 2451601.625, 2451601.6666666666666666]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='H').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + r1 = Float64Index( + [2451601.5, 2451601.5006944444444444, 2451601.5013888888888888, + 2451601.5020833333333333, 2451601.5027777777777777]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='T').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_second(self): + r1 = Float64Index( + [2451601.5, 2451601.500011574074074, 2451601.5000231481481481, + 2451601.5000347222222222, 2451601.5000462962962962]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='S').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + +class TestTimeSeries(tm.TestCase): + _multiprocess_can_split_ = True + + def test_pass_datetimeindex_to_index(self): + # Bugs in #1396 + rng = date_range('1/1/2000', '3/1/2000') + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pydatetime(), dtype=object) + + self.assert_numpy_array_equal(idx.values, expected.values) + + +class TestDatetime64(tm.TestCase): + + def test_datetimeindex_accessors(self): + dti = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), periods=365) + + self.assertEqual(dti.year[0], 1998) + self.assertEqual(dti.month[0], 1) + self.assertEqual(dti.day[0], 1) + self.assertEqual(dti.hour[0], 0) + self.assertEqual(dti.minute[0], 0) + self.assertEqual(dti.second[0], 0) + self.assertEqual(dti.microsecond[0], 0) + self.assertEqual(dti.dayofweek[0], 3) + + self.assertEqual(dti.dayofyear[0], 1) + self.assertEqual(dti.dayofyear[120], 121) + + self.assertEqual(dti.weekofyear[0], 1) + self.assertEqual(dti.weekofyear[120], 18) + + self.assertEqual(dti.quarter[0], 1) + self.assertEqual(dti.quarter[120], 2) + + self.assertEqual(dti.days_in_month[0], 31) + self.assertEqual(dti.days_in_month[90], 30) + + self.assertEqual(dti.is_month_start[0], True) + self.assertEqual(dti.is_month_start[1], False) + self.assertEqual(dti.is_month_start[31], True) + self.assertEqual(dti.is_quarter_start[0], True) + self.assertEqual(dti.is_quarter_start[90], True) + self.assertEqual(dti.is_year_start[0], True) + self.assertEqual(dti.is_year_start[364], False) + 
self.assertEqual(dti.is_month_end[0], False) + self.assertEqual(dti.is_month_end[30], True) + self.assertEqual(dti.is_month_end[31], False) + self.assertEqual(dti.is_month_end[364], True) + self.assertEqual(dti.is_quarter_end[0], False) + self.assertEqual(dti.is_quarter_end[30], False) + self.assertEqual(dti.is_quarter_end[89], True) + self.assertEqual(dti.is_quarter_end[364], True) + self.assertEqual(dti.is_year_end[0], False) + self.assertEqual(dti.is_year_end[364], True) + + # GH 11128 + self.assertEqual(dti.weekday_name[4], u'Monday') + self.assertEqual(dti.weekday_name[5], u'Tuesday') + self.assertEqual(dti.weekday_name[6], u'Wednesday') + self.assertEqual(dti.weekday_name[7], u'Thursday') + self.assertEqual(dti.weekday_name[8], u'Friday') + self.assertEqual(dti.weekday_name[9], u'Saturday') + self.assertEqual(dti.weekday_name[10], u'Sunday') + + self.assertEqual(Timestamp('2016-04-04').weekday_name, u'Monday') + self.assertEqual(Timestamp('2016-04-05').weekday_name, u'Tuesday') + self.assertEqual(Timestamp('2016-04-06').weekday_name, u'Wednesday') + self.assertEqual(Timestamp('2016-04-07').weekday_name, u'Thursday') + self.assertEqual(Timestamp('2016-04-08').weekday_name, u'Friday') + self.assertEqual(Timestamp('2016-04-09').weekday_name, u'Saturday') + self.assertEqual(Timestamp('2016-04-10').weekday_name, u'Sunday') + + self.assertEqual(len(dti.year), 365) + self.assertEqual(len(dti.month), 365) + self.assertEqual(len(dti.day), 365) + self.assertEqual(len(dti.hour), 365) + self.assertEqual(len(dti.minute), 365) + self.assertEqual(len(dti.second), 365) + self.assertEqual(len(dti.microsecond), 365) + self.assertEqual(len(dti.dayofweek), 365) + self.assertEqual(len(dti.dayofyear), 365) + self.assertEqual(len(dti.weekofyear), 365) + self.assertEqual(len(dti.quarter), 365) + self.assertEqual(len(dti.is_month_start), 365) + self.assertEqual(len(dti.is_month_end), 365) + self.assertEqual(len(dti.is_quarter_start), 365) + self.assertEqual(len(dti.is_quarter_end), 365) + self.assertEqual(len(dti.is_year_start), 365) + self.assertEqual(len(dti.is_year_end), 365) + self.assertEqual(len(dti.weekday_name), 365) + + dti = DatetimeIndex(freq='BQ-FEB', start=datetime(1998, 1, 1), + periods=4) + + self.assertEqual(sum(dti.is_quarter_start), 0) + self.assertEqual(sum(dti.is_quarter_end), 4) + self.assertEqual(sum(dti.is_year_start), 0) + self.assertEqual(sum(dti.is_year_end), 1) + + # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, + # CBD requires np >= 1.7 + bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu') + dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) + self.assertRaises(ValueError, lambda: dti.is_month_start) + + dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + + self.assertEqual(dti.is_month_start[0], 1) + + tests = [ + (Timestamp('2013-06-01', freq='M').is_month_start, 1), + (Timestamp('2013-06-01', freq='BM').is_month_start, 0), + (Timestamp('2013-06-03', freq='M').is_month_start, 0), + (Timestamp('2013-06-03', freq='BM').is_month_start, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1), + (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1), + (Timestamp('2013-03-31', 
freq='QS-FEB').is_quarter_end, 0), + (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0), + (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1), + (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1), + (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1), + (Timestamp('2013-06-30', freq='BQ').is_month_end, 0), + (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0), + (Timestamp('2013-06-30', freq='BQ').is_year_end, 0), + (Timestamp('2013-06-28', freq='BQ').is_month_end, 1), + (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1), + (Timestamp('2013-06-28', freq='BQ').is_year_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0), + (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1), + (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1), + (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1), + (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1), + (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1), + (Timestamp('2012-02-01').days_in_month, 29), + (Timestamp('2013-02-01').days_in_month, 28)] + + for ts, value in tests: + self.assertEqual(ts, value) + + def test_datetimeindex_diff(self): + dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), + periods=100) + dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), + periods=98) + self.assertEqual(len(dti1.difference(dti2)), 2) + + def test_nanosecond_field(self): + dti = DatetimeIndex(np.arange(10)) + + self.assert_numpy_array_equal(dti.nanosecond, + np.arange(10, dtype=np.int32)) + + def test_datetimeindex_constructor(self): + arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] + self.assertRaises(Exception, DatetimeIndex, arr) + + arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] + idx1 = DatetimeIndex(arr) + + arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] + idx2 = DatetimeIndex(arr) + + arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', + '2005-01-04'] + idx3 = DatetimeIndex(arr) + + arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', + '2005-01-04'], dtype='O') + idx4 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) + idx5 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' + ]) + idx6 = DatetimeIndex(arr) + + idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) + idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, + yearfirst=True) + tm.assert_index_equal(idx7, idx8) + + for other in [idx2, idx3, idx4, idx5, idx6]: + self.assertTrue((idx1.values == other.values).all()) + + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + idx = DatetimeIndex(start=sdate, freq='1B', periods=20) + self.assertEqual(len(idx), 20) + self.assertEqual(idx[0], sdate + 0 * offsets.BDay()) + self.assertEqual(idx.freq, 'B') + + idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) + self.assertEqual(len(idx), 20) + self.assertEqual(idx[-1], edate) + self.assertEqual(idx.freq, '5D') + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=offsets.Week(weekday=6)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=offsets.QuarterBegin(startingMonth=1)) + 
self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=offsets.BQuarterEnd(startingMonth=12)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + def test_dayfirst(self): + # GH 5917 + arr = ['10/02/2014', '11/02/2014', '12/02/2014'] + expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), + datetime(2014, 2, 12)]) + idx1 = DatetimeIndex(arr, dayfirst=True) + idx2 = DatetimeIndex(np.array(arr), dayfirst=True) + idx3 = to_datetime(arr, dayfirst=True) + idx4 = to_datetime(np.array(arr), dayfirst=True) + idx5 = DatetimeIndex(Index(arr), dayfirst=True) + idx6 = DatetimeIndex(Series(arr), dayfirst=True) + tm.assert_index_equal(expected, idx1) + tm.assert_index_equal(expected, idx2) + tm.assert_index_equal(expected, idx3) + tm.assert_index_equal(expected, idx4) + tm.assert_index_equal(expected, idx5) + tm.assert_index_equal(expected, idx6) + + def test_dti_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') + idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.reindex(idx2) + tm.assert_index_equal(df.index, idx2) + + # 11314 + # with tz + index = date_range(datetime(2015, 10, 1), + datetime(2015, 10, 1, 23), + freq='H', tz='US/Eastern') + df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) + new_index = date_range(datetime(2015, 10, 2), + datetime(2015, 10, 2, 23), + freq='H', tz='US/Eastern') + + # TODO: unused? + result = df.set_index(new_index) # noqa + + self.assertEqual(new_index.freq, index.freq) + + def test_datetimeindex_union_join_empty(self): + dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') + empty = Index([]) + + result = dti.union(empty) + tm.assertIsInstance(result, DatetimeIndex) + self.assertIs(result, result) + + result = dti.join(empty) + tm.assertIsInstance(result, DatetimeIndex) diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py new file mode 100644 index 0000000000000..5c408d5300cdc --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -0,0 +1,51 @@ +import pandas as pd +import pandas.util.testing as tm + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_fillna_datetime64(self): + # GH 11343 + for tz in ['US/Eastern', 'Asia/Tokyo']: + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00']) + + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00']) + self.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + + # tz mismatch + exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), + pd.Timestamp('2011-01-01 10:00', tz=tz), + pd.Timestamp('2011-01-01 11:00')], dtype=object) + self.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + + # object + exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', + pd.Timestamp('2011-01-01 11:00')], dtype=object) + self.assert_index_equal(idx.fillna('x'), exp) + + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], tz=tz) + + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], tz=tz) + self.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + + 
exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), + pd.Timestamp('2011-01-01 10:00'), + pd.Timestamp('2011-01-01 11:00', tz=tz)], + dtype=object) + self.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + + # object + exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), + 'x', + pd.Timestamp('2011-01-01 11:00', tz=tz)], + dtype=object) + self.assert_index_equal(idx.fillna('x'), exp) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py new file mode 100644 index 0000000000000..c25cd6a3fa90e --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -0,0 +1,1073 @@ +import warnings +import numpy as np +from datetime import timedelta + +import pandas as pd +import pandas.tslib as tslib +import pandas.util.testing as tm +from pandas.core.common import PerformanceWarning +from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, + date_range, TimedeltaIndex, _np_version_under1p10, Index, + datetime, Float64Index) + +from pandas.tests.test_base import Ops + + +class TestDatetimeIndexOps(Ops): + tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', + 'dateutil/US/Pacific'] + + def setUp(self): + super(TestDatetimeIndexOps, self).setUp() + mask = lambda x: (isinstance(x, DatetimeIndex) or + isinstance(x, PeriodIndex)) + self.is_valid_objs = [o for o in self.objs if mask(o)] + self.not_valid_objs = [o for o in self.objs if not mask(o)] + + def test_ops_properties(self): + self.check_ops_properties( + ['year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', + 'week', 'dayofweek', 'dayofyear', 'quarter']) + self.check_ops_properties(['date', 'time', 'microsecond', 'nanosecond', + 'is_month_start', 'is_month_end', + 'is_quarter_start', + 'is_quarter_end', 'is_year_start', + 'is_year_end', 'weekday_name'], + lambda x: isinstance(x, DatetimeIndex)) + + def test_ops_properties_basic(self): + + # sanity check that the behavior didn't change + # GH7206 + for op in ['year', 'day', 'second', 'weekday']: + self.assertRaises(TypeError, lambda x: getattr(self.dt_series, op)) + + # attribute access should still work! 
+ s = Series(dict(year=2000, month=1, day=10)) + self.assertEqual(s.year, 2000) + self.assertEqual(s.month, 1) + self.assertEqual(s.day, 10) + self.assertRaises(AttributeError, lambda: s.weekday) + + def test_asobject_tolist(self): + idx = pd.date_range(start='2013-01-01', periods=4, freq='M', + name='idx') + expected_list = [Timestamp('2013-01-31'), + Timestamp('2013-02-28'), + Timestamp('2013-03-31'), + Timestamp('2013-04-30')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + + self.assertEqual(result.dtype, object) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = pd.date_range(start='2013-01-01', periods=4, freq='M', + name='idx', tz='Asia/Tokyo') + expected_list = [Timestamp('2013-01-31', tz='Asia/Tokyo'), + Timestamp('2013-02-28', tz='Asia/Tokyo'), + Timestamp('2013-03-31', tz='Asia/Tokyo'), + Timestamp('2013-04-30', tz='Asia/Tokyo')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), + pd.NaT, datetime(2013, 1, 4)], name='idx') + expected_list = [Timestamp('2013-01-01'), + Timestamp('2013-01-02'), pd.NaT, + Timestamp('2013-01-04')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + def test_minmax(self): + for tz in self.tz: + # monotonic + idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], tz=tz) + self.assertTrue(idx1.is_monotonic) + + # non-monotonic + idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', + '2011-01-02', pd.NaT], tz=tz) + self.assertFalse(idx2.is_monotonic) + + for idx in [idx1, idx2]: + self.assertEqual(idx.min(), Timestamp('2011-01-01', tz=tz)) + self.assertEqual(idx.max(), Timestamp('2011-01-03', tz=tz)) + self.assertEqual(idx.argmin(), 0) + self.assertEqual(idx.argmax(), 2) + + for op in ['min', 'max']: + # Return NaT + obj = DatetimeIndex([]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = DatetimeIndex([pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + def test_numpy_minmax(self): + dr = pd.date_range(start='2016-01-15', end='2016-01-20') + + self.assertEqual(np.min(dr), + Timestamp('2016-01-15 00:00:00', freq='D')) + self.assertEqual(np.max(dr), + Timestamp('2016-01-20 00:00:00', freq='D')) + + errmsg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, errmsg, np.min, dr, out=0) + tm.assertRaisesRegexp(ValueError, errmsg, np.max, dr, out=0) + + self.assertEqual(np.argmin(dr), 0) + self.assertEqual(np.argmax(dr), 5) + + if not _np_version_under1p10: + errmsg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, errmsg, np.argmin, dr, out=0) + tm.assertRaisesRegexp(ValueError, errmsg, np.argmax, dr, out=0) + + def test_round(self): + for tz in self.tz: + rng = pd.date_range(start='2016-01-01', 
periods=5, + freq='30Min', tz=tz) + elt = rng[1] + + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), + ]) + expected_elt = expected_rng[1] + + tm.assert_index_equal(rng.round(freq='H'), expected_rng) + self.assertEqual(elt.round(freq='H'), expected_elt) + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): + rng.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') + + msg = " is a non-fixed frequency" + tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') + tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') + + def test_repeat_range(self): + rng = date_range('1/1/2000', '1/1/2001') + + result = rng.repeat(5) + self.assertIsNone(result.freq) + self.assertEqual(len(result), 5 * len(rng)) + + for tz in self.tz: + index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-02', '2001-01-02'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-03', '2001-01-03'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], + tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', + 'NaT', 'NaT', 'NaT', + '2003-01-01', '2003-01-01', '2003-01-01'], + tz=tz) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + def test_repeat(self): + reps = 2 + msg = "the 'axis' parameter is not supported" + + for tz in self.tz: + rng = pd.date_range(start='2016-01-01', periods=2, + freq='30Min', tz=tz) + + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + ]) + + res = rng.repeat(reps) + tm.assert_index_equal(res, expected_rng) + self.assertIsNone(res.freq) + + tm.assert_index_equal(np.repeat(rng, reps), expected_rng) + tm.assertRaisesRegexp(ValueError, msg, np.repeat, + rng, reps, axis=1) + + def test_representation(self): + + idx = [] + idx.append(DatetimeIndex([], freq='D')) + idx.append(DatetimeIndex(['2011-01-01'], freq='D')) + idx.append(DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')) + idx.append(DatetimeIndex( + ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D')) + idx.append(DatetimeIndex( + ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' + ], freq='H', tz='Asia/Tokyo')) + idx.append(DatetimeIndex( + ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern')) + idx.append(DatetimeIndex( + ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='UTC')) + + exp = [] + exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""") + exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', " + "freq='D')") + exp.append("DatetimeIndex(['2011-01-01', '2011-01-02'], " + "dtype='datetime64[ns]', freq='D')") + 
exp.append("DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='datetime64[ns]', freq='D')") + exp.append("DatetimeIndex(['2011-01-01 09:00:00+09:00', " + "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" + ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')") + exp.append("DatetimeIndex(['2011-01-01 09:00:00-05:00', " + "'2011-01-01 10:00:00-05:00', 'NaT'], " + "dtype='datetime64[ns, US/Eastern]', freq=None)") + exp.append("DatetimeIndex(['2011-01-01 09:00:00+00:00', " + "'2011-01-01 10:00:00+00:00', 'NaT'], " + "dtype='datetime64[ns, UTC]', freq=None)""") + + with pd.option_context('display.width', 300): + for indx, expected in zip(idx, exp): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(indx, func)() + self.assertEqual(result, expected) + + def test_representation_to_series(self): + idx1 = DatetimeIndex([], freq='D') + idx2 = DatetimeIndex(['2011-01-01'], freq='D') + idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = DatetimeIndex( + ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo') + idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='US/Eastern') + idx7 = DatetimeIndex(['2011-01-01 09:00', '2011-01-02 10:15']) + + exp1 = """Series([], dtype: datetime64[ns])""" + + exp2 = """0 2011-01-01 +dtype: datetime64[ns]""" + + exp3 = """0 2011-01-01 +1 2011-01-02 +dtype: datetime64[ns]""" + + exp4 = """0 2011-01-01 +1 2011-01-02 +2 2011-01-03 +dtype: datetime64[ns]""" + + exp5 = """0 2011-01-01 09:00:00+09:00 +1 2011-01-01 10:00:00+09:00 +2 2011-01-01 11:00:00+09:00 +dtype: datetime64[ns, Asia/Tokyo]""" + + exp6 = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 NaT +dtype: datetime64[ns, US/Eastern]""" + + exp7 = """0 2011-01-01 09:00:00 +1 2011-01-02 10:15:00 +dtype: datetime64[ns]""" + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, idx2, idx3, idx4, + idx5, idx6, idx7], + [exp1, exp2, exp3, exp4, + exp5, exp6, exp7]): + result = repr(Series(idx)) + self.assertEqual(result, expected) + + def test_summary(self): + # GH9116 + idx1 = DatetimeIndex([], freq='D') + idx2 = DatetimeIndex(['2011-01-01'], freq='D') + idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = DatetimeIndex( + ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], + freq='H', tz='Asia/Tokyo') + idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='US/Eastern') + + exp1 = """DatetimeIndex: 0 entries +Freq: D""" + + exp2 = """DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01 +Freq: D""" + + exp3 = """DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02 +Freq: D""" + + exp4 = """DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03 +Freq: D""" + + exp5 = ("DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " + "to 2011-01-01 11:00:00+09:00\n" + "Freq: H") + + exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], + [exp1, exp2, exp3, exp4, exp5, exp6]): + result = idx.summary() + self.assertEqual(result, expected) + + def test_resolution(self): + for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', + 'S', 'L', 'U'], + ['day', 'day', 'day', 'day', 'hour', + 'minute', 'second', 'millisecond', + 'microsecond']): + for tz in self.tz: + idx = 
pd.date_range(start='2013-04-01', periods=30, freq=freq, + tz=tz) + self.assertEqual(idx.resolution, expected) + + def test_union(self): + for tz in self.tz: + # union + rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + + rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + + rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + for rng, other, expected in [(rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3)]: + + result_union = rng.union(other) + tm.assert_index_equal(result_union, expected) + + def test_add_iadd(self): + for tz in self.tz: + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + result = rng + delta + expected = pd.date_range('2000-01-01 02:00', + '2000-02-01 02:00', tz=tz) + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + # int + rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, + tz=tz) + result = rng + 1 + expected = pd.date_range('2000-01-01 10:00', freq='H', periods=10, + tz=tz) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + idx = DatetimeIndex(['2011-01-01', '2011-01-02']) + msg = "cannot add a datelike to a DatetimeIndex" + with tm.assertRaisesRegexp(TypeError, msg): + idx + Timestamp('2011-01-01') + + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp('2011-01-01') + idx + + def test_add_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now raises + # TypeError (GH14164) + + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + + with tm.assertRaises(TypeError): + dti + dti + + with tm.assertRaises(TypeError): + dti_tz + dti_tz + + with tm.assertRaises(TypeError): + dti_tz + dti + + with tm.assertRaises(TypeError): + dti + dti_tz + + def test_difference(self): + for tz in self.tz: + # diff + rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) + + rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + for rng, other, expected in [(rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3)]: + result_diff = rng.difference(other) + tm.assert_index_equal(result_diff, expected) + + def test_sub_isub(self): + for tz in self.tz: + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + expected = pd.date_range('1999-12-31 22:00', + '2000-01-31 22:00', tz=tz) + + result = rng - delta + 
tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + # int + rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, + tz=tz) + result = rng - 1 + expected = pd.date_range('2000-01-01 08:00', freq='H', periods=10, + tz=tz) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + def test_sub_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now changed to + # return subtraction -> TimeDeltaIndex (GH ...) + + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') + expected = TimedeltaIndex([0, 0, 0]) + + result = dti - dti + tm.assert_index_equal(result, expected) + + result = dti_tz - dti_tz + tm.assert_index_equal(result, expected) + + with tm.assertRaises(TypeError): + dti_tz - dti + + with tm.assertRaises(TypeError): + dti - dti_tz + + with tm.assertRaises(TypeError): + dti_tz - dti_tz2 + + # isub + dti -= dti + tm.assert_index_equal(dti, expected) + + # different length raises ValueError + dti1 = date_range('20130101', periods=3) + dti2 = date_range('20130101', periods=4) + with tm.assertRaises(ValueError): + dti1 - dti2 + + # NaN propagation + dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) + dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) + expected = TimedeltaIndex(['1 days', np.nan, np.nan]) + result = dti2 - dti1 + tm.assert_index_equal(result, expected) + + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'D']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) + + with tm.assertRaises(TypeError): + idx - p + + with tm.assertRaises(TypeError): + p - idx + + def test_comp_nat(self): + left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')]) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + + def test_value_counts_unique(self): + # GH 7735 + for tz in self.tz: + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + tz=tz) + + exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, + tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, + tz=tz) + tm.assert_index_equal(idx.unique(), expected) + + idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', + '2013-01-01 09:00', '2013-01-01 08:00', + '2013-01-01 
08:00', pd.NaT], tz=tz) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + tz=tz) + expected = Series([3, 2], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], tz=tz) + expected = Series([3, 2, 1], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), + expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + def test_nonunique_contains(self): + # GH 9512 + for idx in map(DatetimeIndex, + ([0, 1, 0], [0, 0, -1], [0, -1, -1], + ['2015', '2015', '2016'], ['2015', '2015', '2014'])): + tm.assertIn(idx[0], idx) + + def test_order(self): + # with freq + idx1 = DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], freq='D', name='idx') + idx2 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H', + tz='Asia/Tokyo', name='tzidx') + + for idx in [idx1, idx2]: + ordered = idx.sort_values() + self.assert_index_equal(ordered, idx) + self.assertEqual(ordered.freq, idx.freq) + + ordered = idx.sort_values(ascending=False) + expected = idx[::-1] + self.assert_index_equal(ordered, expected) + self.assertEqual(ordered.freq, expected.freq) + self.assertEqual(ordered.freq.n, -1) + + ordered, indexer = idx.sort_values(return_indexer=True) + self.assert_index_equal(ordered, idx) + self.assert_numpy_array_equal(indexer, + np.array([0, 1, 2]), + check_dtype=False) + self.assertEqual(ordered.freq, idx.freq) + + ordered, indexer = idx.sort_values(return_indexer=True, + ascending=False) + expected = idx[::-1] + self.assert_index_equal(ordered, expected) + self.assert_numpy_array_equal(indexer, + np.array([2, 1, 0]), + check_dtype=False) + self.assertEqual(ordered.freq, expected.freq) + self.assertEqual(ordered.freq.n, -1) + + # without freq + for tz in self.tz: + idx1 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + tz=tz, name='idx1') + exp1 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + tz=tz, name='idx1') + + idx2 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + tz=tz, name='idx2') + + exp2 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + tz=tz, name='idx2') + + idx3 = DatetimeIndex([pd.NaT, '2011-01-03', '2011-01-05', + '2011-01-02', pd.NaT], tz=tz, name='idx3') + exp3 = DatetimeIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', + '2011-01-05'], tz=tz, name='idx3') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: + ordered = idx.sort_values() + self.assert_index_equal(ordered, expected) + self.assertIsNone(ordered.freq) + + ordered = idx.sort_values(ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True) + self.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True, + ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) + + def test_getitem(self): + idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx2 = 
pd.date_range('2011-01-01', '2011-01-31', freq='D', + tz='Asia/Tokyo', name='idx') + + for idx in [idx1, idx2]: + result = idx[0] + self.assertEqual(result, Timestamp('2011-01-01', tz=idx.tz)) + + result = idx[0:5] + expected = pd.date_range('2011-01-01', '2011-01-05', freq='D', + tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx[0:10:2] + expected = pd.date_range('2011-01-01', '2011-01-09', freq='2D', + tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx[-20:-5:3] + expected = pd.date_range('2011-01-12', '2011-01-24', freq='3D', + tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx[4::-1] + expected = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-03', + '2011-01-02', '2011-01-01'], + freq='-1D', tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + def test_drop_duplicates_metadata(self): + # GH 10115 + idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + result = idx.drop_duplicates() + self.assert_index_equal(idx, result) + self.assertEqual(idx.freq, result.freq) + + idx_dup = idx.append(idx) + self.assertIsNone(idx_dup.freq) # freq is reset + result = idx_dup.drop_duplicates() + self.assert_index_equal(idx, result) + self.assertIsNone(result.freq) + + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + + def test_take(self): + # GH 10295 + idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', + tz='Asia/Tokyo', name='idx') + + for idx in [idx1, idx2]: + result = idx.take([0]) + self.assertEqual(result, Timestamp('2011-01-01', tz=idx.tz)) + + result = idx.take([0, 1, 2]) + expected = pd.date_range('2011-01-01', '2011-01-03', freq='D', + tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx.take([0, 2, 4]) + expected = pd.date_range('2011-01-01', '2011-01-05', freq='2D', + tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx.take([7, 4, 1]) + expected = pd.date_range('2011-01-08', '2011-01-02', freq='-3D', + tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx.take([3, 2, 5]) + expected = DatetimeIndex(['2011-01-04', '2011-01-03', + '2011-01-06'], + freq=None, tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertIsNone(result.freq) + + result = idx.take([-3, 2, 5]) + expected = DatetimeIndex(['2011-01-29', '2011-01-03', + '2011-01-06'], + freq=None, 
tz=idx.tz, name='idx') + self.assert_index_equal(result, expected) + self.assertIsNone(result.freq) + + def test_take_invalid_kwargs(self): + idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + indices = [1, 6, 5, 9, 10, 13, 15, 3] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + tm.assertRaisesRegexp(TypeError, msg, idx.take, + indices, foo=2) + + msg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, idx.take, + indices, out=indices) + + msg = "the 'mode' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, idx.take, + indices, mode='clip') + + def test_infer_freq(self): + # GH 11018 + for freq in ['A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D', + '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S', + '-3S']: + idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10) + result = pd.DatetimeIndex(idx.asi8, freq='infer') + tm.assert_index_equal(idx, result) + self.assertEqual(result.freq, freq) + + def test_nat_new(self): + idx = pd.date_range('2011-01-01', freq='D', periods=5, name='x') + result = idx._nat_new() + exp = pd.DatetimeIndex([pd.NaT] * 5, name='x') + tm.assert_index_equal(result, exp) + + result = idx._nat_new(box=False) + exp = np.array([tslib.iNaT] * 5, dtype=np.int64) + tm.assert_numpy_array_equal(result, exp) + + def test_shift(self): + # GH 9903 + for tz in self.tz: + idx = pd.DatetimeIndex([], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + tm.assert_index_equal(idx.shift(3, freq='H'), idx) + + idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00'], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00', + '2011-01-01 15:00'], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(3, freq='H'), exp) + exp = pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00', + '2011-01-01 09:00'], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + + def test_nat(self): + self.assertIs(pd.DatetimeIndex._na_value, pd.NaT) + self.assertIs(pd.DatetimeIndex([])._na_value, pd.NaT) + + for tz in [None, 'US/Eastern', 'UTC']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.intp)) + + idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.intp)) + + def test_equals(self): + # GH 13107 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], + tz='US/Pacific') + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.equals(list(idx2))) +
self.assertFalse(idx.equals(pd.Series(idx2))) + + # same internal, different tz + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + self.assertFalse(idx.equals(idx3)) + self.assertFalse(idx.equals(idx3.copy())) + self.assertFalse(idx.equals(idx3.asobject)) + self.assertFalse(idx.asobject.equals(idx3)) + self.assertFalse(idx.equals(list(idx3))) + self.assertFalse(idx.equals(pd.Series(idx3))) + + +class TestDateTimeIndexToJulianDate(tm.TestCase): + + def test_1700(self): + r1 = Float64Index([2345897.5, 2345898.5, 2345899.5, 2345900.5, + 2345901.5]) + r2 = date_range(start=Timestamp('1710-10-01'), periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + r1 = Float64Index([2451601.5, 2451602.5, 2451603.5, 2451604.5, + 2451605.5]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + r1 = Float64Index( + [2451601.5, 2451601.5416666666666666, 2451601.5833333333333333, + 2451601.625, 2451601.6666666666666666]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='H').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + r1 = Float64Index( + [2451601.5, 2451601.5006944444444444, 2451601.5013888888888888, + 2451601.5020833333333333, 2451601.5027777777777777]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='T').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_second(self): + r1 = Float64Index( + [2451601.5, 2451601.500011574074074, 2451601.5000231481481481, + 2451601.5000347222222222, 2451601.5000462962962962]) + r2 = date_range(start=Timestamp('2000-02-27'), periods=5, + freq='S').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + # GH 10699 + def test_datetime64_with_DateOffset(self): + for klass, assert_func in zip([Series, DatetimeIndex], + [self.assert_series_equal, + tm.assert_index_equal]): + s = klass(date_range('2000-01-01', '2000-01-31'), name='a') + result = s + pd.DateOffset(years=1) + result2 = pd.DateOffset(years=1) + s + exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') + assert_func(result, exp) + assert_func(result2, exp) + + result = s - pd.DateOffset(years=1) + exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') + assert_func(result, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.Day() + result2 = pd.offsets.Day() + s + exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), + Timestamp('2000-02-16', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.MonthEnd() + result2 = pd.offsets.MonthEnd() + s + exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), + Timestamp('2000-02-29', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + # array of offsets - valid for Series only + if klass is Series: + with tm.assert_produces_warning(PerformanceWarning): + s = 
klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.MonthEnd()]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') + ]) + assert_func(result, exp) + + # same offset + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.DateOffset(years=1)]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) + assert_func(result, exp) + + s = klass([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-03-31'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31'), + Timestamp('2000-05-15'), + Timestamp('2001-06-15')]) + + # DateOffset relativedelta fastpath + relative_kwargs = [('years', 2), ('months', 5), ('days', 3), + ('hours', 5), ('minutes', 10), ('seconds', 2), + ('microseconds', 5)] + for i, kwd in enumerate(relative_kwargs): + op = pd.DateOffset(**dict([kwd])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + + # assert these are equal on a piecewise basis + offsets = ['YearBegin', ('YearBegin', {'month': 5}), 'YearEnd', + ('YearEnd', {'month': 5}), 'MonthBegin', 'MonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', + 'Week', ('Week', { + 'weekday': 3 + }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', + 'CustomBusinessDay', 'CDay', 'CBMonthEnd', + 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', + 'BusinessHour', 'BYearBegin', 'BYearEnd', + 'BQuarterBegin', ('LastWeekOfMonth', { + 'weekday': 2 + }), ('FY5253Quarter', {'qtr_with_extra_week': 1, + 'startingMonth': 1, + 'weekday': 2, + 'variation': 'nearest'}), + ('FY5253', {'weekday': 0, + 'startingMonth': 2, + 'variation': + 'nearest'}), ('WeekOfMonth', {'weekday': 2, + 'week': 2}), + 'Easter', ('DateOffset', {'day': 4}), + ('DateOffset', {'month': 5})] + + with warnings.catch_warnings(record=True): + for normalize in (True, False): + for do in offsets: + if isinstance(do, tuple): + do, kwargs = do + else: + do = do + kwargs = {} + + for n in [0, 5]: + if (do in ['WeekOfMonth', 'LastWeekOfMonth', + 'FY5253Quarter', 'FY5253'] and n == 0): + continue + op = getattr(pd.offsets, do)(n, + normalize=normalize, + **kwargs) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + assert_func(klass([op + x for x in s]), op + s) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py new file mode 100644 index 0000000000000..ba6beb03c7f24 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -0,0 +1,168 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import (DatetimeIndex, date_range, Series, bdate_range, DataFrame, + Int64Index) + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_union(self): + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = Int64Index(np.arange(10, 30, 2)) + result = i1.union(i2) + expected = Int64Index(np.arange(0, 30, 2)) + tm.assert_index_equal(result, expected) + + def test_union_coverage(self): + idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) + ordered = DatetimeIndex(idx.sort_values(), freq='infer') + result = ordered.union(idx) + tm.assert_index_equal(result, ordered) + + result = ordered[:0].union(ordered) + tm.assert_index_equal(result, ordered) + 
self.assertEqual(result.freq, ordered.freq) + + def test_union_bug_1730(self): + rng_a = date_range('1/1/2012', periods=4, freq='3H') + rng_b = date_range('1/1/2012', periods=4, freq='4H') + + result = rng_a.union(rng_b) + exp = DatetimeIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + tm.assert_index_equal(result, exp) + + def test_union_bug_1745(self): + left = DatetimeIndex(['2012-05-11 15:19:49.695000']) + right = DatetimeIndex(['2012-05-29 13:04:21.322000', + '2012-05-11 15:27:24.873000', + '2012-05-11 15:31:05.350000']) + + result = left.union(right) + exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + tm.assert_index_equal(result, exp) + + def test_union_bug_4564(self): + from pandas import DateOffset + left = date_range("2013-01-01", "2013-02-01") + right = left + DateOffset(minutes=15) + + result = left.union(right) + exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + tm.assert_index_equal(result, exp) + + def test_union_freq_both_none(self): + # GH11086 + expected = bdate_range('20150101', periods=10) + expected.freq = None + + result = expected.union(expected) + tm.assert_index_equal(result, expected) + self.assertIsNone(result.freq) + + def test_union_dataframe_index(self): + rng1 = date_range('1/1/1999', '1/1/2012', freq='MS') + s1 = Series(np.random.randn(len(rng1)), rng1) + + rng2 = date_range('1/1/1980', '12/1/2001', freq='MS') + s2 = Series(np.random.randn(len(rng2)), rng2) + df = DataFrame({'s1': s1, 's2': s2}) + + exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS') + tm.assert_index_equal(df.index, exp) + + def test_union_with_DatetimeIndex(self): + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = DatetimeIndex(start='2012-01-03 00:00:00', periods=10, freq='D') + i1.union(i2) # Works + i2.union(i1) # Fails with "AttributeError: can't set attribute" + + def test_intersection(self): + # GH 4690 (with tz) + for tz in [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: + base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') + + # if target has the same name, it is preserved + rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') + expected2 = date_range('6/1/2000', '6/20/2000', freq='D', + name='idx') + + # if target name is different, it will be reset + rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') + expected3 = date_range('6/1/2000', '6/20/2000', freq='D', + name=None) + + rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx') + expected4 = DatetimeIndex([], name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), + (rng4, expected4)]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) + + # non-monotonic + base = DatetimeIndex(['2011-01-05', '2011-01-04', + '2011-01-02', '2011-01-03'], + tz=tz, name='idx') + + rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', + '2011-02-02', '2011-02-03'], + tz=tz, name='idx') + expected2 = DatetimeIndex( + ['2011-01-04', '2011-01-02'], tz=tz, name='idx') + + rng3 = DatetimeIndex(['2011-01-04', '2011-01-02', + '2011-02-02', '2011-02-03'], + tz=tz, name='other') + expected3 = DatetimeIndex( + ['2011-01-04', '2011-01-02'], tz=tz, name=None) + + # GH 7880 + rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz, + name='idx') + expected4 = DatetimeIndex([], tz=tz, name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), + (rng4, expected4)]: + 
result = base.intersection(rng) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertIsNone(result.freq) + self.assertEqual(result.tz, expected.tz) + + # empty same freq GH2129 + rng = date_range('6/1/2000', '6/15/2000', freq='T') + result = rng[0:0].intersection(rng) + self.assertEqual(len(result), 0) + + result = rng.intersection(rng[0:0]) + self.assertEqual(len(result), 0) + + def test_intersection_bug_1708(self): + from pandas import DateOffset + index_1 = date_range('1/1/2012', periods=4, freq='12H') + index_2 = index_1 + DateOffset(hours=1) + + result = index_1 & index_2 + self.assertEqual(len(result), 0) + + def test_difference_freq(self): + # GH14323: difference of DatetimeIndex should not preserve frequency + + index = date_range("20160920", "20160925", freq="D") + other = date_range("20160921", "20160924", freq="D") + expected = DatetimeIndex(["20160920", "20160925"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = date_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = DatetimeIndex(["20160920", "20160921"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 2cd73ec8d254a..32e4029a57fe9 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -1,18 +1,15 @@ # -*- coding: utf-8 -*- -from datetime import datetime, timedelta, time, date - import numpy as np +from datetime import timedelta +import pandas as pd +import pandas.util.testing as tm from pandas import (DatetimeIndex, Float64Index, Index, Int64Index, NaT, Period, PeriodIndex, Series, Timedelta, TimedeltaIndex, date_range, period_range, timedelta_range, notnull) -import pandas.util.testing as tm - -import pandas as pd -from pandas.tslib import Timestamp, OutOfBoundsDatetime from .common import Base @@ -88,553 +85,9 @@ def test_shift(self): '2013-01-11'], freq='D') self.assert_index_equal(result, expected) - def test_construction_with_alt(self): - - i = pd.date_range('20130101', periods=5, freq='H', tz='US/Eastern') - i2 = DatetimeIndex(i, dtype=i.dtype) - self.assert_index_equal(i, i2) - - i2 = DatetimeIndex(i.tz_localize(None).asi8, tz=i.dtype.tz) - self.assert_index_equal(i, i2) - - i2 = DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype) - self.assert_index_equal(i, i2) - - i2 = DatetimeIndex( - i.tz_localize(None).asi8, dtype=i.dtype, tz=i.dtype.tz) - self.assert_index_equal(i, i2) - - # localize into the provided tz - i2 = DatetimeIndex(i.tz_localize(None).asi8, tz='UTC') - expected = i.tz_localize(None).tz_localize('UTC') - self.assert_index_equal(i2, expected) - - # incompat tz/dtype - self.assertRaises(ValueError, lambda: DatetimeIndex( - i.tz_localize(None).asi8, dtype=i.dtype, tz='US/Pacific')) - def test_pickle_compat_construction(self): pass - def test_construction_index_with_mixed_timezones(self): - # GH 11488 - # no tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01'), - Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01'), - Timestamp('2011-01-02')], name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNone(result.tz) - - # same tz results in DatetimeIndex - result = 
Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNotNone(result.tz) - self.assertEqual(result.tz, exp.tz) - - # same tz results in DatetimeIndex (DST) - result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - Timestamp('2011-08-01 10:00', tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNotNone(result.tz) - self.assertEqual(result.tz, exp.tz) - - # different tz results in Index(dtype=object) - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - exp = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertFalse(isinstance(result, DatetimeIndex)) - - result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - exp = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertFalse(isinstance(result, DatetimeIndex)) - - # length = 1 - result = Index([Timestamp('2011-01-01')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNone(result.tz) - - # length = 1 with tz - result = Index( - [Timestamp('2011-01-01 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00')], tz='Asia/Tokyo', - name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNotNone(result.tz) - self.assertEqual(result.tz, exp.tz) - - def test_construction_index_with_mixed_timezones_with_NaT(self): - # GH 11488 - result = Index([pd.NaT, Timestamp('2011-01-01'), - pd.NaT, Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'), - pd.NaT, Timestamp('2011-01-02')], name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNone(result.tz) - - # same tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNotNone(result.tz) - self.assertEqual(result.tz, exp.tz) - - # same tz results in DatetimeIndex (DST) - result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - pd.NaT, - Timestamp('2011-08-01 10:00', tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT, - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') - 
self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNotNone(result.tz) - self.assertEqual(result.tz, exp.tz) - - # different tz results in Index(dtype=object) - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - name='idx') - exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertFalse(isinstance(result, DatetimeIndex)) - - result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], name='idx') - exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertFalse(isinstance(result, DatetimeIndex)) - - # all NaT - result = Index([pd.NaT, pd.NaT], name='idx') - exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNone(result.tz) - - # all NaT with tz - result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - self.assertIsNotNone(result.tz) - self.assertEqual(result.tz, exp.tz) - - def test_construction_dti_with_mixed_timezones(self): - # GH 11488 (not changed, added explicit tests) - - # no tz results in DatetimeIndex - result = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - - # same tz results in DatetimeIndex - result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', - tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - - # same tz results in DatetimeIndex (DST) - result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - Timestamp('2011-08-01 10:00', - tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - - # different tz coerces tz-naive to tz-awareIndex(dtype=object) - result = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', - tz='US/Eastern')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 05:00'), - Timestamp('2011-01-02 10:00')], - tz='US/Eastern', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - - # tz mismatch affecting to tz-aware raises TypeError/ValueError - - with tm.assertRaises(ValueError): - DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - - with tm.assertRaisesRegexp(TypeError, 'data is already 
tz-aware'): - DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - - with tm.assertRaises(ValueError): - DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='US/Eastern', name='idx') - - with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): - # passing tz should results in DatetimeIndex, then mismatch raises - # TypeError - Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - - def test_construction_base_constructor(self): - arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] - tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.DatetimeIndex(np.array(arr))) - - arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] - tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.DatetimeIndex(np.array(arr))) - - def test_construction_outofbounds(self): - # GH 13663 - dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1)] - exp = Index(dates, dtype=object) - # coerces to object - tm.assert_index_equal(Index(dates), exp) - - with tm.assertRaises(OutOfBoundsDatetime): - # can't create DatetimeIndex - DatetimeIndex(dates) - - def test_construction_with_ndarray(self): - # GH 5152 - dates = [datetime(2013, 10, 7), - datetime(2013, 10, 8), - datetime(2013, 10, 9)] - data = DatetimeIndex(dates, freq=pd.tseries.frequencies.BDay()).values - result = DatetimeIndex(data, freq=pd.tseries.frequencies.BDay()) - expected = DatetimeIndex(['2013-10-07', - '2013-10-08', - '2013-10-09'], - freq='B') - tm.assert_index_equal(result, expected) - - def test_astype(self): - # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - - result = idx.astype(object) - expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object) - tm.assert_index_equal(result, expected) - - result = idx.astype(int) - expected = Int64Index([1463356800000000000] + - [-9223372036854775808] * 3, dtype=np.int64) - tm.assert_index_equal(result, expected) - - rng = date_range('1/1/2000', periods=10) - result = rng.astype('i8') - self.assert_index_equal(result, Index(rng.asi8)) - self.assert_numpy_array_equal(result.values, rng.asi8) - - def test_astype_with_tz(self): - - # with tz - rng = date_range('1/1/2000', periods=10, tz='US/Eastern') - result = rng.astype('datetime64[ns]') - expected = (date_range('1/1/2000', periods=10, - tz='US/Eastern') - .tz_convert('UTC').tz_localize(None)) - tm.assert_index_equal(result, expected) - - # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex - result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) - expected = pd.Series( - ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) - tm.assert_series_equal(result, expected) - - result = Series(pd.date_range('2012-01-01', periods=3, - tz='US/Eastern')).astype(str) - expected = Series(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - dtype=object) - tm.assert_series_equal(result, expected) - - def test_astype_str_compat(self): - # GH 13149, GH 13209 - # verify that we are returing NaT as a string (and not unicode) - - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - result = idx.astype(str) - expected = Index(['2016-05-16', 'NaT', 
'NaT', 'NaT'], dtype=object) - tm.assert_index_equal(result, expected) - - def test_astype_str(self): - # test astype string - #10442 - result = date_range('2012-01-01', periods=4, - name='test_name').astype(str) - expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', - '2012-01-04'], name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with tz and name - result = date_range('2012-01-01', periods=3, name='test_name', - tz='US/Eastern').astype(str) - expected = Index(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with freqH and name - result = date_range('1/1/2011', periods=3, freq='H', - name='test_name').astype(str) - expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', - '2011-01-01 02:00:00'], - name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with freqH and timezone - result = date_range('3/6/2012 00:00', periods=2, freq='H', - tz='Europe/London', name='test_name').astype(str) - expected = Index(['2012-03-06 00:00:00+00:00', - '2012-03-06 01:00:00+00:00'], - dtype=object, name='test_name') - tm.assert_index_equal(result, expected) - - def test_astype_datetime64(self): - # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - - result = idx.astype('datetime64[ns]') - tm.assert_index_equal(result, idx) - self.assertFalse(result is idx) - - result = idx.astype('datetime64[ns]', copy=False) - tm.assert_index_equal(result, idx) - self.assertTrue(result is idx) - - idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST') - result = idx_tz.astype('datetime64[ns]') - expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]') - tm.assert_index_equal(result, expected) - - def test_astype_raises(self): - # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - - self.assertRaises(ValueError, idx.astype, float) - self.assertRaises(ValueError, idx.astype, 'timedelta64') - self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') - self.assertRaises(ValueError, idx.astype, 'datetime64') - self.assertRaises(ValueError, idx.astype, 'datetime64[D]') - - def test_where_other(self): - - # other is ndarray or Index - i = pd.date_range('20130101', periods=3, tz='US/Eastern') - - for arr in [np.nan, pd.NaT]: - result = i.where(notnull(i), other=np.nan) - expected = i - tm.assert_index_equal(result, expected) - - i2 = i.copy() - i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) - result = i.where(notnull(i2), i2) - tm.assert_index_equal(result, i2) - - i2 = i.copy() - i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) - result = i.where(notnull(i2), i2.values) - tm.assert_index_equal(result, i2) - - def test_where_tz(self): - i = pd.date_range('20130101', periods=3, tz='US/Eastern') - result = i.where(notnull(i)) - expected = i - tm.assert_index_equal(result, expected) - - i2 = i.copy() - i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) - result = i.where(notnull(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_get_loc(self): - idx = pd.date_range('2000-01-01', periods=3) - - for method in [None, 'pad', 'backfill', 'nearest']: - self.assertEqual(idx.get_loc(idx[1], method), 1) - self.assertEqual(idx.get_loc(idx[1].to_pydatetime(), method), 1) - self.assertEqual(idx.get_loc(str(idx[1]), method), 1) - if method is not None: - 
self.assertEqual(idx.get_loc(idx[1], method, - tolerance=pd.Timedelta('0 days')), - 1) - - self.assertEqual(idx.get_loc('2000-01-01', method='nearest'), 0) - self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest'), 1) - - self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', - tolerance='1 day'), 1) - self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', - tolerance=pd.Timedelta('1D')), 1) - self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', - tolerance=np.timedelta64(1, 'D')), 1) - self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', - tolerance=timedelta(1)), 1) - with tm.assertRaisesRegexp(ValueError, 'must be convertible'): - idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') - with tm.assertRaises(KeyError): - idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') - - self.assertEqual(idx.get_loc('2000', method='nearest'), slice(0, 3)) - self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 3)) - - self.assertEqual(idx.get_loc('1999', method='nearest'), 0) - self.assertEqual(idx.get_loc('2001', method='nearest'), 2) - - with tm.assertRaises(KeyError): - idx.get_loc('1999', method='pad') - with tm.assertRaises(KeyError): - idx.get_loc('2001', method='backfill') - - with tm.assertRaises(KeyError): - idx.get_loc('foobar') - with tm.assertRaises(TypeError): - idx.get_loc(slice(2)) - - idx = pd.to_datetime(['2000-01-01', '2000-01-04']) - self.assertEqual(idx.get_loc('2000-01-02', method='nearest'), 0) - self.assertEqual(idx.get_loc('2000-01-03', method='nearest'), 1) - self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 2)) - - # time indexing - idx = pd.date_range('2000-01-01', periods=24, freq='H') - tm.assert_numpy_array_equal(idx.get_loc(time(12)), - np.array([12]), check_dtype=False) - tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), - np.array([]), check_dtype=False) - with tm.assertRaises(NotImplementedError): - idx.get_loc(time(12, 30), method='pad') - - def test_get_indexer(self): - idx = pd.date_range('2000-01-01', periods=3) - exp = np.array([0, 1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) - - target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', - '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 hour')), - np.array([0, -1, 1], dtype=np.intp)) - with tm.assertRaises(ValueError): - idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') - - def test_roundtrip_pickle_with_tz(self): - - # GH 8367 - # round-trip of timezone - index = date_range('20130101', periods=3, tz='US/Eastern', name='foo') - unpickled = self.round_trip_pickle(index) - self.assert_index_equal(index, unpickled) - - def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): - # GH7774 - index = date_range('20130101', periods=3, tz='US/Eastern') - self.assertEqual(str(index.reindex([])[0].tz), 'US/Eastern') - self.assertEqual(str(index.reindex(np.array([]))[0].tz), 'US/Eastern') - - def test_time_loc(self): # GH8667 - from datetime import time - from pandas.index import _SIZE_CUTOFF - - ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) - key = time(15, 11, 30) - start = 
key.hour * 3600 + key.minute * 60 + key.second - step = 24 * 3600 - - for n in ns: - idx = pd.date_range('2014-11-26', periods=n, freq='S') - ts = pd.Series(np.random.randn(n), index=idx) - i = np.arange(start, n, step) - - tm.assert_numpy_array_equal(ts.index.get_loc(key), i, - check_dtype=False) - tm.assert_series_equal(ts[key], ts.iloc[i]) - - left, right = ts.copy(), ts.copy() - left[key] *= -10 - right.iloc[i] *= -10 - tm.assert_series_equal(left, right) - - def test_time_overflow_for_32bit_machines(self): - # GH8943. On some machines NumPy defaults to np.int32 (for example, - # 32-bit Linux machines). In the function _generate_regular_range - # found in tseries/index.py, `periods` gets multiplied by `strides` - # (which has value 1e9) and since the max value for np.int32 is ~2e9, - # and since those machines won't promote np.int32 to np.int64, we get - # overflow. - periods = np.int_(1000) - - idx1 = pd.date_range(start='2000', periods=periods, freq='S') - self.assertEqual(len(idx1), periods) - - idx2 = pd.date_range(end='2000', periods=periods, freq='S') - self.assertEqual(len(idx2), periods) - def test_intersection(self): first = self.index second = self.index[5:] @@ -665,122 +118,6 @@ def test_union(self): result = first.union(case) self.assertTrue(tm.equalContents(result, everything)) - def test_nat(self): - self.assertIs(DatetimeIndex([np.nan])[0], pd.NaT) - - def test_ufunc_coercions(self): - idx = date_range('2011-01-01', periods=3, freq='2D', name='x') - - delta = np.timedelta64(1, 'D') - for result in [idx + delta, np.add(idx, delta)]: - tm.assertIsInstance(result, DatetimeIndex) - exp = date_range('2011-01-02', periods=3, freq='2D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '2D') - - for result in [idx - delta, np.subtract(idx, delta)]: - tm.assertIsInstance(result, DatetimeIndex) - exp = date_range('2010-12-31', periods=3, freq='2D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '2D') - - delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'), - np.timedelta64(3, 'D')]) - for result in [idx + delta, np.add(idx, delta)]: - tm.assertIsInstance(result, DatetimeIndex) - exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'], - freq='3D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '3D') - - for result in [idx - delta, np.subtract(idx, delta)]: - tm.assertIsInstance(result, DatetimeIndex) - exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'], - freq='D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, 'D') - - def test_fillna_datetime64(self): - # GH 11343 - for tz in ['US/Eastern', 'Asia/Tokyo']: - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00']) - - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00']) - self.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) - - # tz mismatch - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 10:00', tz=tz), - pd.Timestamp('2011-01-01 11:00')], dtype=object) - self.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) - - # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', - pd.Timestamp('2011-01-01 11:00')], dtype=object) - self.assert_index_equal(idx.fillna('x'), exp) - - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], tz=tz) - - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - 
'2011-01-01 11:00'], tz=tz) - self.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) - - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - pd.Timestamp('2011-01-01 10:00'), - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - self.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) - - # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - 'x', - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - self.assert_index_equal(idx.fillna('x'), exp) - - def test_difference_freq(self): - # GH14323: difference of DatetimeIndex should not preserve frequency - - index = date_range("20160920", "20160925", freq="D") - other = date_range("20160921", "20160924", freq="D") - expected = DatetimeIndex(["20160920", "20160925"], freq=None) - idx_diff = index.difference(other) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) - - other = date_range("20160922", "20160925", freq="D") - idx_diff = index.difference(other) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) - - def test_week_of_month_frequency(self): - # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise - d1 = date(2002, 9, 1) - d2 = date(2013, 10, 27) - d3 = date(2012, 9, 30) - idx1 = DatetimeIndex([d1, d2]) - idx2 = DatetimeIndex([d3]) - result_append = idx1.append(idx2) - expected = DatetimeIndex([d1, d2, d3]) - tm.assert_index_equal(result_append, expected) - result_union = idx1.union(idx2) - expected = DatetimeIndex([d1, d3, d2]) - tm.assert_index_equal(result_union, expected) - - # GH 5115 - result = date_range("2013-1-1", periods=4, freq='WOM-1SAT') - dates = ['2013-01-05', '2013-02-02', '2013-03-02', '2013-04-06'] - expected = DatetimeIndex(dates, freq='WOM-1SAT') - tm.assert_index_equal(result, expected) - class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index bca50237081e1..4f2ac3ff0d87e 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1,5 +1,5 @@ from __future__ import print_function -from datetime import datetime, timedelta +from datetime import timedelta import numpy as np import pandas as pd from pandas import (Series, Index, Int64Index, Timestamp, Period, @@ -14,901 +14,6 @@ from pandas.tests.test_base import Ops -class TestDatetimeIndexOps(Ops): - tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', - 'dateutil/US/Pacific'] - - def setUp(self): - super(TestDatetimeIndexOps, self).setUp() - mask = lambda x: (isinstance(x, DatetimeIndex) or - isinstance(x, PeriodIndex)) - self.is_valid_objs = [o for o in self.objs if mask(o)] - self.not_valid_objs = [o for o in self.objs if not mask(o)] - - def test_ops_properties(self): - self.check_ops_properties( - ['year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', - 'week', 'dayofweek', 'dayofyear', 'quarter']) - self.check_ops_properties(['date', 'time', 'microsecond', 'nanosecond', - 'is_month_start', 'is_month_end', - 'is_quarter_start', - 'is_quarter_end', 'is_year_start', - 'is_year_end', 'weekday_name'], - lambda x: isinstance(x, DatetimeIndex)) - - def test_ops_properties_basic(self): - - # sanity check that the behavior didn't change - # GH7206 - for op in ['year', 'day', 'second', 'weekday']: - self.assertRaises(TypeError, lambda x: 
getattr(self.dt_series, op)) - - # attribute access should still work! - s = Series(dict(year=2000, month=1, day=10)) - self.assertEqual(s.year, 2000) - self.assertEqual(s.month, 1) - self.assertEqual(s.day, 10) - self.assertRaises(AttributeError, lambda: s.weekday) - - def test_asobject_tolist(self): - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx') - expected_list = [Timestamp('2013-01-31'), - Timestamp('2013-02-28'), - Timestamp('2013-03-31'), - Timestamp('2013-04-30')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - - self.assertEqual(result.dtype, object) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx', tz='Asia/Tokyo') - expected_list = [Timestamp('2013-01-31', tz='Asia/Tokyo'), - Timestamp('2013-02-28', tz='Asia/Tokyo'), - Timestamp('2013-03-31', tz='Asia/Tokyo'), - Timestamp('2013-04-30', tz='Asia/Tokyo')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), - pd.NaT, datetime(2013, 1, 4)], name='idx') - expected_list = [Timestamp('2013-01-01'), - Timestamp('2013-01-02'), pd.NaT, - Timestamp('2013-01-04')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - def test_minmax(self): - for tz in self.tz: - # monotonic - idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], tz=tz) - self.assertTrue(idx1.is_monotonic) - - # non-monotonic - idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', - '2011-01-02', pd.NaT], tz=tz) - self.assertFalse(idx2.is_monotonic) - - for idx in [idx1, idx2]: - self.assertEqual(idx.min(), Timestamp('2011-01-01', tz=tz)) - self.assertEqual(idx.max(), Timestamp('2011-01-03', tz=tz)) - self.assertEqual(idx.argmin(), 0) - self.assertEqual(idx.argmax(), 2) - - for op in ['min', 'max']: - # Return NaT - obj = DatetimeIndex([]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - obj = DatetimeIndex([pd.NaT]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - def test_numpy_minmax(self): - dr = pd.date_range(start='2016-01-15', end='2016-01-20') - - self.assertEqual(np.min(dr), - Timestamp('2016-01-15 00:00:00', freq='D')) - self.assertEqual(np.max(dr), - Timestamp('2016-01-20 00:00:00', freq='D')) - - errmsg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, errmsg, np.min, dr, out=0) - tm.assertRaisesRegexp(ValueError, errmsg, np.max, dr, out=0) - - self.assertEqual(np.argmin(dr), 0) - self.assertEqual(np.argmax(dr), 5) - - if not _np_version_under1p10: - errmsg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, errmsg, np.argmin, dr, out=0) - tm.assertRaisesRegexp(ValueError, errmsg, np.argmax, dr, out=0) - - def test_round(self): 
- for tz in self.tz: - rng = pd.date_range(start='2016-01-01', periods=5, - freq='30Min', tz=tz) - elt = rng[1] - - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - ]) - expected_elt = expected_rng[1] - - tm.assert_index_equal(rng.round(freq='H'), expected_rng) - self.assertEqual(elt.round(freq='H'), expected_elt) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with tm.assertRaisesRegexp(ValueError, msg): - rng.round(freq='foo') - with tm.assertRaisesRegexp(ValueError, msg): - elt.round(freq='foo') - - msg = " is a non-fixed frequency" - tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') - - def test_repeat_range(self): - rng = date_range('1/1/2000', '1/1/2001') - - result = rng.repeat(5) - self.assertIsNone(result.freq) - self.assertEqual(len(result), 5 * len(rng)) - - for tz in self.tz: - index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-02', '2001-01-02'], tz=tz) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - self.assertIsNone(res.freq) - - index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-03', '2001-01-03'], tz=tz) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - self.assertIsNone(res.freq) - - index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], - tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', - 'NaT', 'NaT', 'NaT', - '2003-01-01', '2003-01-01', '2003-01-01'], - tz=tz) - for res in [index.repeat(3), np.repeat(index, 3)]: - tm.assert_index_equal(res, exp) - self.assertIsNone(res.freq) - - def test_repeat(self): - reps = 2 - msg = "the 'axis' parameter is not supported" - - for tz in self.tz: - rng = pd.date_range(start='2016-01-01', periods=2, - freq='30Min', tz=tz) - - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - ]) - - res = rng.repeat(reps) - tm.assert_index_equal(res, expected_rng) - self.assertIsNone(res.freq) - - tm.assert_index_equal(np.repeat(rng, reps), expected_rng) - tm.assertRaisesRegexp(ValueError, msg, np.repeat, - rng, reps, axis=1) - - def test_representation(self): - - idx = [] - idx.append(DatetimeIndex([], freq='D')) - idx.append(DatetimeIndex(['2011-01-01'], freq='D')) - idx.append(DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')) - idx.append(DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D')) - idx.append(DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], freq='H', tz='Asia/Tokyo')) - idx.append(DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern')) - idx.append(DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='UTC')) - - exp = [] - exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""") - exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', " - "freq='D')") - exp.append("DatetimeIndex(['2011-01-01', '2011-01-02'], " - 
"dtype='datetime64[ns]', freq='D')") - exp.append("DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='datetime64[ns]', freq='D')") - exp.append("DatetimeIndex(['2011-01-01 09:00:00+09:00', " - "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" - ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')") - exp.append("DatetimeIndex(['2011-01-01 09:00:00-05:00', " - "'2011-01-01 10:00:00-05:00', 'NaT'], " - "dtype='datetime64[ns, US/Eastern]', freq=None)") - exp.append("DatetimeIndex(['2011-01-01 09:00:00+00:00', " - "'2011-01-01 10:00:00+00:00', 'NaT'], " - "dtype='datetime64[ns, UTC]', freq=None)""") - - with pd.option_context('display.width', 300): - for indx, expected in zip(idx, exp): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(indx, func)() - self.assertEqual(result, expected) - - def test_representation_to_series(self): - idx1 = DatetimeIndex([], freq='D') - idx2 = DatetimeIndex(['2011-01-01'], freq='D') - idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo') - idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], - tz='US/Eastern') - idx7 = DatetimeIndex(['2011-01-01 09:00', '2011-01-02 10:15']) - - exp1 = """Series([], dtype: datetime64[ns])""" - - exp2 = """0 2011-01-01 -dtype: datetime64[ns]""" - - exp3 = """0 2011-01-01 -1 2011-01-02 -dtype: datetime64[ns]""" - - exp4 = """0 2011-01-01 -1 2011-01-02 -2 2011-01-03 -dtype: datetime64[ns]""" - - exp5 = """0 2011-01-01 09:00:00+09:00 -1 2011-01-01 10:00:00+09:00 -2 2011-01-01 11:00:00+09:00 -dtype: datetime64[ns, Asia/Tokyo]""" - - exp6 = """0 2011-01-01 09:00:00-05:00 -1 2011-01-01 10:00:00-05:00 -2 NaT -dtype: datetime64[ns, US/Eastern]""" - - exp7 = """0 2011-01-01 09:00:00 -1 2011-01-02 10:15:00 -dtype: datetime64[ns]""" - - with pd.option_context('display.width', 300): - for idx, expected in zip([idx1, idx2, idx3, idx4, - idx5, idx6, idx7], - [exp1, exp2, exp3, exp4, - exp5, exp6, exp7]): - result = repr(Series(idx)) - self.assertEqual(result, expected) - - def test_summary(self): - # GH9116 - idx1 = DatetimeIndex([], freq='D') - idx2 = DatetimeIndex(['2011-01-01'], freq='D') - idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], - freq='H', tz='Asia/Tokyo') - idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], - tz='US/Eastern') - - exp1 = """DatetimeIndex: 0 entries -Freq: D""" - - exp2 = """DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01 -Freq: D""" - - exp3 = """DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02 -Freq: D""" - - exp4 = """DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03 -Freq: D""" - - exp5 = ("DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " - "to 2011-01-01 11:00:00+09:00\n" - "Freq: H") - - exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], - [exp1, exp2, exp3, exp4, exp5, exp6]): - result = idx.summary() - self.assertEqual(result, expected) - - def test_resolution(self): - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', - 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', 'hour', - 'minute', 'second', 'millisecond', - 'microsecond']): - for tz in 
self.tz: - idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) - self.assertEqual(idx.resolution, expected) - - def test_union(self): - for tz in self.tz: - # union - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) - - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) - - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: - - result_union = rng.union(other) - tm.assert_index_equal(result_union, expected) - - def test_add_iadd(self): - for tz in self.tz: - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - result = rng + delta - expected = pd.date_range('2000-01-01 02:00', - '2000-02-01 02:00', tz=tz) - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - # int - rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, - tz=tz) - result = rng + 1 - expected = pd.date_range('2000-01-01 10:00', freq='H', periods=10, - tz=tz) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - idx = DatetimeIndex(['2011-01-01', '2011-01-02']) - msg = "cannot add a datelike to a DatetimeIndex" - with tm.assertRaisesRegexp(TypeError, msg): - idx + Timestamp('2011-01-01') - - with tm.assertRaisesRegexp(TypeError, msg): - Timestamp('2011-01-01') + idx - - def test_add_dti_dti(self): - # previously performed setop (deprecated in 0.16.0), now raises - # TypeError (GH14164) - - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - - with tm.assertRaises(TypeError): - dti + dti - - with tm.assertRaises(TypeError): - dti_tz + dti_tz - - with tm.assertRaises(TypeError): - dti_tz + dti - - with tm.assertRaises(TypeError): - dti + dti_tz - - def test_difference(self): - for tz in self.tz: - # diff - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) - - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: - result_diff = rng.difference(other) - tm.assert_index_equal(result_diff, expected) - - def test_sub_isub(self): - for tz in self.tz: - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('1999-12-31 22:00', - '2000-01-31 22:00', tz=tz) - - result = rng - delta - 
tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - # int - rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, - tz=tz) - result = rng - 1 - expected = pd.date_range('2000-01-01 08:00', freq='H', periods=10, - tz=tz) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - - def test_sub_dti_dti(self): - # previously performed setop (deprecated in 0.16.0), now changed to - # return subtraction -> TimeDeltaIndex (GH ...) - - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') - expected = TimedeltaIndex([0, 0, 0]) - - result = dti - dti - tm.assert_index_equal(result, expected) - - result = dti_tz - dti_tz - tm.assert_index_equal(result, expected) - - with tm.assertRaises(TypeError): - dti_tz - dti - - with tm.assertRaises(TypeError): - dti - dti_tz - - with tm.assertRaises(TypeError): - dti_tz - dti_tz2 - - # isub - dti -= dti - tm.assert_index_equal(dti, expected) - - # different length raises ValueError - dti1 = date_range('20130101', periods=3) - dti2 = date_range('20130101', periods=4) - with tm.assertRaises(ValueError): - dti1 - dti2 - - # NaN propagation - dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) - dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) - expected = TimedeltaIndex(['1 days', np.nan, np.nan]) - result = dti2 - dti1 - tm.assert_index_equal(result, expected) - - def test_sub_period(self): - # GH 13078 - # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - - for freq in [None, 'D']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) - - with tm.assertRaises(TypeError): - idx - p - - with tm.assertRaises(TypeError): - p - idx - - def test_comp_nat(self): - left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, - pd.Timestamp('2011-01-03')]) - right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) - - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r - expected = np.array([False, False, True]) - tm.assert_numpy_array_equal(result, expected) - - result = l != r - expected = np.array([True, True, False]) - tm.assert_numpy_array_equal(result, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) - - expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) - - def test_value_counts_unique(self): - # GH 7735 - for tz in self.tz: - idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), - tz=tz) - - exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, - tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, - tz=tz) - tm.assert_index_equal(idx.unique(), expected) - - idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 09:00', '2013-01-01 08:00', - '2013-01-01 
08:00', pd.NaT], tz=tz) - - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], - tz=tz) - expected = Series([3, 2], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', - pd.NaT], tz=tz) - expected = Series([3, 2, 1], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), - expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - - def test_nonunique_contains(self): - # GH 9512 - for idx in map(DatetimeIndex, - ([0, 1, 0], [0, 0, -1], [0, -1, -1], - ['2015', '2015', '2016'], ['2015', '2015', '2014'])): - tm.assertIn(idx[0], idx) - - def test_order(self): - # with freq - idx1 = DatetimeIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], freq='D', name='idx') - idx2 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], freq='H', - tz='Asia/Tokyo', name='tzidx') - - for idx in [idx1, idx2]: - ordered = idx.sort_values() - self.assert_index_equal(ordered, idx) - self.assertEqual(ordered.freq, idx.freq) - - ordered = idx.sort_values(ascending=False) - expected = idx[::-1] - self.assert_index_equal(ordered, expected) - self.assertEqual(ordered.freq, expected.freq) - self.assertEqual(ordered.freq.n, -1) - - ordered, indexer = idx.sort_values(return_indexer=True) - self.assert_index_equal(ordered, idx) - self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2]), - check_dtype=False) - self.assertEqual(ordered.freq, idx.freq) - - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) - expected = idx[::-1] - self.assert_index_equal(ordered, expected) - self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0]), - check_dtype=False) - self.assertEqual(ordered.freq, expected.freq) - self.assertEqual(ordered.freq.n, -1) - - # without freq - for tz in self.tz: - idx1 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - tz=tz, name='idx1') - exp1 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], - tz=tz, name='idx1') - - idx2 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - tz=tz, name='idx2') - - exp2 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], - tz=tz, name='idx2') - - idx3 = DatetimeIndex([pd.NaT, '2011-01-03', '2011-01-05', - '2011-01-02', pd.NaT], tz=tz, name='idx3') - exp3 = DatetimeIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - '2011-01-05'], tz=tz, name='idx3') - - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: - ordered = idx.sort_values() - self.assert_index_equal(ordered, expected) - self.assertIsNone(ordered.freq) - - ordered = idx.sort_values(ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True) - self.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - self.assert_numpy_array_equal(indexer, exp, check_dtype=False) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0]) - self.assert_numpy_array_equal(indexer, exp, check_dtype=False) - self.assertIsNone(ordered.freq) - - def test_getitem(self): - idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - idx2 = 
pd.date_range('2011-01-01', '2011-01-31', freq='D', - tz='Asia/Tokyo', name='idx') - - for idx in [idx1, idx2]: - result = idx[0] - self.assertEqual(result, Timestamp('2011-01-01', tz=idx.tz)) - - result = idx[0:5] - expected = pd.date_range('2011-01-01', '2011-01-05', freq='D', - tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx[0:10:2] - expected = pd.date_range('2011-01-01', '2011-01-09', freq='2D', - tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx[-20:-5:3] - expected = pd.date_range('2011-01-12', '2011-01-24', freq='3D', - tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx[4::-1] - expected = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-03', - '2011-01-02', '2011-01-01'], - freq='-1D', tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - def test_drop_duplicates_metadata(self): - # GH 10115 - idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - result = idx.drop_duplicates() - self.assert_index_equal(idx, result) - self.assertEqual(idx.freq, result.freq) - - idx_dup = idx.append(idx) - self.assertIsNone(idx_dup.freq) # freq is reset - result = idx_dup.drop_duplicates() - self.assert_index_equal(idx, result) - self.assertIsNone(result.freq) - - def test_drop_duplicates(self): - # to check Index/Series compat - base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - idx = base.append(base[:5]) - - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) - - res = idx.drop_duplicates(keep='last') - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) - - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - - def test_take(self): - # GH 10295 - idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', - tz='Asia/Tokyo', name='idx') - - for idx in [idx1, idx2]: - result = idx.take([0]) - self.assertEqual(result, Timestamp('2011-01-01', tz=idx.tz)) - - result = idx.take([0, 1, 2]) - expected = pd.date_range('2011-01-01', '2011-01-03', freq='D', - tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx.take([0, 2, 4]) - expected = pd.date_range('2011-01-01', '2011-01-05', freq='2D', - tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx.take([7, 4, 1]) - expected = pd.date_range('2011-01-08', '2011-01-02', freq='-3D', - tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx.take([3, 2, 5]) - expected = DatetimeIndex(['2011-01-04', '2011-01-03', - '2011-01-06'], - freq=None, tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertIsNone(result.freq) - - result = idx.take([-3, 2, 5]) - expected = DatetimeIndex(['2011-01-29', '2011-01-03', - '2011-01-06'], - freq=None, 
tz=idx.tz, name='idx') - self.assert_index_equal(result, expected) - self.assertIsNone(result.freq) - - def test_take_invalid_kwargs(self): - idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - indices = [1, 6, 5, 9, 10, 13, 15, 3] - - msg = r"take\(\) got an unexpected keyword argument 'foo'" - tm.assertRaisesRegexp(TypeError, msg, idx.take, - indices, foo=2) - - msg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, idx.take, - indices, out=indices) - - msg = "the 'mode' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, idx.take, - indices, mode='clip') - - def test_infer_freq(self): - # GH 11018 - for freq in ['A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D', - '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S', - '-3S']: - idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10) - result = pd.DatetimeIndex(idx.asi8, freq='infer') - tm.assert_index_equal(idx, result) - self.assertEqual(result.freq, freq) - - def test_nat_new(self): - idx = pd.date_range('2011-01-01', freq='D', periods=5, name='x') - result = idx._nat_new() - exp = pd.DatetimeIndex([pd.NaT] * 5, name='x') - tm.assert_index_equal(result, exp) - - result = idx._nat_new(box=False) - exp = np.array([tslib.iNaT] * 5, dtype=np.int64) - tm.assert_numpy_array_equal(result, exp) - - def test_shift(self): - # GH 9903 - for tz in self.tz: - idx = pd.DatetimeIndex([], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) - - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00' - '2011-01-01 12:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00' - '2011-01-01 15:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00' - '2011-01-01 09:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) - - def test_nat(self): - self.assertIs(pd.DatetimeIndex._na_value, pd.NaT) - self.assertIs(pd.DatetimeIndex([])._na_value, pd.NaT) - - for tz in [None, 'US/Eastern', 'UTC']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) - self.assertTrue(idx._can_hold_na) - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) - self.assertFalse(idx.hasnans) - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) - - idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) - self.assertTrue(idx._can_hold_na) - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - self.assertTrue(idx.hasnans) - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) - - def test_equals(self): - # GH 13107 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) - self.assertTrue(idx.equals(idx)) - self.assertTrue(idx.equals(idx.copy())) - self.assertTrue(idx.equals(idx.asobject)) - self.assertTrue(idx.asobject.equals(idx)) - self.assertTrue(idx.asobject.equals(idx.asobject)) - self.assertFalse(idx.equals(list(idx))) - self.assertFalse(idx.equals(pd.Series(idx))) - - idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], - tz='US/Pacific') - self.assertFalse(idx.equals(idx2)) - self.assertFalse(idx.equals(idx2.copy())) - self.assertFalse(idx.equals(idx2.asobject)) - self.assertFalse(idx.asobject.equals(idx2)) - self.assertFalse(idx.equals(list(idx2))) - 
self.assertFalse(idx.equals(pd.Series(idx2))) - - # same internal, different tz - idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') - tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) - self.assertFalse(idx.equals(idx3)) - self.assertFalse(idx.equals(idx3.copy())) - self.assertFalse(idx.equals(idx3.asobject)) - self.assertFalse(idx.asobject.equals(idx3)) - self.assertFalse(idx.equals(list(idx3))) - self.assertFalse(idx.equals(pd.Series(idx3))) - - class TestTimedeltaIndexOps(Ops): def setUp(self): super(TestTimedeltaIndexOps, self).setUp() diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index b5daf1ac0ec68..ff6cc4bb9853c 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3,7 +3,6 @@ import calendar import operator import sys -import warnings from datetime import datetime, time, timedelta from numpy.random import rand @@ -23,11 +22,9 @@ import pandas.util.testing as tm from pandas import ( Index, Series, DataFrame, isnull, date_range, Timestamp, Period, - DatetimeIndex, Int64Index, to_datetime, bdate_range, Float64Index, - NaT, timedelta_range, Timedelta, _np_version_under1p8, concat) + DatetimeIndex, to_datetime, bdate_range, Float64Index, + NaT, timedelta_range, Timedelta, concat) from pandas.compat import range, long, StringIO, lrange, lmap, zip, product -from pandas.compat.numpy import np_datetime64_compat -from pandas.core.common import PerformanceWarning from pandas.tslib import iNaT from pandas.util.testing import ( assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -323,15 +320,6 @@ def test_dti_slicing(self): # don't carry freq through irregular slicing self.assertIsNone(dti2.freq) - def test_pass_datetimeindex_to_index(self): - # Bugs in #1396 - rng = date_range('1/1/2000', '3/1/2000') - idx = Index(rng, dtype=object) - - expected = Index(rng.to_pydatetime(), dtype=object) - - self.assert_numpy_array_equal(idx.values, expected.values) - def test_contiguous_boolean_preserve_freq(self): rng = date_range('1/1/2000', '3/1/2000', freq='B') @@ -2718,1247 +2706,6 @@ def test_dataframe_dtypes(self): to_datetime(df) -class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True - - def test_hash_error(self): - index = date_range('20010101', periods=10) - with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % - type(index).__name__): - hash(index) - - def test_stringified_slice_with_tz(self): - # GH2658 - import datetime - start = datetime.datetime.now() - idx = DatetimeIndex(start=start, freq="1d", periods=10) - df = DataFrame(lrange(10), index=idx) - df["2013-01-14 23:44:34.437768-05:00":] # no exception here - - def test_append_join_nondatetimeindex(self): - rng = date_range('1/1/2000', periods=10) - idx = Index(['a', 'b', 'c', 'd']) - - result = rng.append(idx) - tm.assertIsInstance(result[0], Timestamp) - - # it works - rng.join(idx, how='outer') - - def test_to_period_nofreq(self): - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) - self.assertRaises(ValueError, idx.to_period) - - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], - freq='infer') - self.assertEqual(idx.freqstr, 'D') - expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', - '2000-01-03'], freq='D') - tm.assert_index_equal(idx.to_period(), expected) - - # GH 7606 - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) - self.assertEqual(idx.freqstr, None) - tm.assert_index_equal(idx.to_period(), expected) - - def 
test_000constructor_resolution(self): - # 2252 - t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1) - idx = DatetimeIndex([t1]) - - self.assertEqual(idx.nanosecond[0], t1.nanosecond) - - def test_constructor_coverage(self): - rng = date_range('1/1/2000', periods=10.5) - exp = date_range('1/1/2000', periods=10) - tm.assert_index_equal(rng, exp) - - self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', - periods='foo', freq='D') - - self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', - end='1/10/2000') - - self.assertRaises(ValueError, DatetimeIndex, '1/1/2000') - - # generator expression - gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) - result = DatetimeIndex(gen) - expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) - for i in range(10)]) - tm.assert_index_equal(result, expected) - - # NumPy string array - strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) - result = DatetimeIndex(strings) - expected = DatetimeIndex(strings.astype('O')) - tm.assert_index_equal(result, expected) - - from_ints = DatetimeIndex(expected.asi8) - tm.assert_index_equal(from_ints, expected) - - # string with NaT - strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) - result = DatetimeIndex(strings) - expected = DatetimeIndex(strings.astype('O')) - tm.assert_index_equal(result, expected) - - from_ints = DatetimeIndex(expected.asi8) - tm.assert_index_equal(from_ints, expected) - - # non-conforming - self.assertRaises(ValueError, DatetimeIndex, - ['2000-01-01', '2000-01-02', '2000-01-04'], freq='D') - - self.assertRaises(ValueError, DatetimeIndex, start='2011-01-01', - freq='b') - self.assertRaises(ValueError, DatetimeIndex, end='2011-01-01', - freq='B') - self.assertRaises(ValueError, DatetimeIndex, periods=10, freq='D') - - def test_constructor_datetime64_tzformat(self): - # GH 6572 - tm._skip_if_no_pytz() - import pytz - # ISO 8601 format results in pytz.FixedOffset - for freq in ['AS', 'W-SUN']: - idx = date_range('2013-01-01T00:00:00-05:00', - '2016-01-01T23:59:59-05:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) - tm.assert_index_equal(idx, expected) - # Unable to use `US/Eastern` because of DST - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') - self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - - idx = date_range('2013-01-01T00:00:00+09:00', - '2016-01-01T23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) - tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') - self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - - tm._skip_if_no_dateutil() - - # Non ISO 8601 format results in dateutil.tz.tzoffset - for freq in ['AS', 'W-SUN']: - idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', - freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) - tm.assert_index_equal(idx, expected) - # Unable to use `US/Eastern` because of DST - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') - self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - - idx = date_range('2013/1/1 0:00:00+9:00', - '2016/1/1 23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', 
'2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) - tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') - self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - - def test_constructor_dtype(self): - - # passing a dtype with a tz should localize - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - dtype='datetime64[ns, US/Eastern]') - expected = DatetimeIndex(['2013-01-01', '2013-01-02'] - ).tz_localize('US/Eastern') - tm.assert_index_equal(idx, expected) - - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - tz='US/Eastern') - tm.assert_index_equal(idx, expected) - - # if we already have a tz and its not the same, then raise - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - dtype='datetime64[ns, US/Eastern]') - - self.assertRaises(ValueError, - lambda: DatetimeIndex(idx, - dtype='datetime64[ns]')) - - # this is effectively trying to convert tz's - self.assertRaises(TypeError, - lambda: DatetimeIndex(idx, - dtype='datetime64[ns, CET]')) - self.assertRaises(ValueError, - lambda: DatetimeIndex( - idx, tz='CET', - dtype='datetime64[ns, US/Eastern]')) - result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') - tm.assert_index_equal(idx, result) - - def test_constructor_name(self): - idx = DatetimeIndex(start='2000-01-01', periods=1, freq='A', - name='TEST') - self.assertEqual(idx.name, 'TEST') - - def test_comparisons_coverage(self): - rng = date_range('1/1/2000', periods=10) - - # raise TypeError for now - self.assertRaises(TypeError, rng.__lt__, rng[3].value) - - result = rng == list(rng) - exp = rng == rng - self.assert_numpy_array_equal(result, exp) - - def test_comparisons_nat(self): - - fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) - fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) - - didx1 = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, - '2014-05-01', '2014-07-01']) - didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT, - '2014-06-01', '2014-07-01']) - darr = np.array([np_datetime64_compat('2014-02-01 00:00Z'), - np_datetime64_compat('2014-03-01 00:00Z'), - np_datetime64_compat('nat'), np.datetime64('nat'), - np_datetime64_compat('2014-06-01 00:00Z'), - np_datetime64_compat('2014-07-01 00:00Z')]) - - if _np_version_under1p8: - # cannot test array because np.datetime('nat') returns today's date - cases = [(fidx1, fidx2), (didx1, didx2)] - else: - cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] - - # Check pd.NaT is handles as the same as np.nan - with tm.assert_produces_warning(None): - for idx1, idx2 in cases: - - result = idx1 < idx2 - expected = np.array([True, False, False, False, True, False]) - self.assert_numpy_array_equal(result, expected) - - result = idx2 > idx1 - expected = np.array([True, False, False, False, True, False]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 <= idx2 - expected = np.array([True, False, False, False, True, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx2 >= idx1 - expected = np.array([True, False, False, False, True, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 == idx2 - expected = np.array([False, False, False, False, False, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 != idx2 - expected = np.array([True, True, True, True, True, False]) - self.assert_numpy_array_equal(result, expected) - - with tm.assert_produces_warning(None): - for idx1, val in [(fidx1, np.nan), 
(didx1, pd.NaT)]: - result = idx1 < val - expected = np.array([False, False, False, False, False, False]) - self.assert_numpy_array_equal(result, expected) - result = idx1 > val - self.assert_numpy_array_equal(result, expected) - - result = idx1 <= val - self.assert_numpy_array_equal(result, expected) - result = idx1 >= val - self.assert_numpy_array_equal(result, expected) - - result = idx1 == val - self.assert_numpy_array_equal(result, expected) - - result = idx1 != val - expected = np.array([True, True, True, True, True, True]) - self.assert_numpy_array_equal(result, expected) - - # Check pd.NaT is handles as the same as np.nan - with tm.assert_produces_warning(None): - for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]: - result = idx1 < val - expected = np.array([True, False, False, False, False, False]) - self.assert_numpy_array_equal(result, expected) - result = idx1 > val - expected = np.array([False, False, False, False, True, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 <= val - expected = np.array([True, False, True, False, False, False]) - self.assert_numpy_array_equal(result, expected) - result = idx1 >= val - expected = np.array([False, False, True, False, True, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 == val - expected = np.array([False, False, True, False, False, False]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 != val - expected = np.array([True, True, False, True, True, True]) - self.assert_numpy_array_equal(result, expected) - - def test_map(self): - rng = date_range('1/1/2000', periods=10) - - f = lambda x: x.strftime('%Y%m%d') - result = rng.map(f) - exp = Index([f(x) for x in rng], dtype='<U8') - tm.assert_index_equal(result, exp) - - def test_take_fill_value(self): - # GH 12631 - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx') - result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx') - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') - tm.assert_index_equal(result, expected) - - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with tm.assertRaises(IndexError): - idx.take(np.array([1, -5])) - - def test_take_fill_value_with_timezone(self): - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx', tz='US/Eastern') - result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', tz='US/Eastern') - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') - tm.assert_index_equal(result, expected) - - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with tm.assertRaises(IndexError): - idx.take(np.array([1, -5])) - - def test_map_bug_1677(self): - index = DatetimeIndex(['2012-04-25 09:30:00.393000']) - f = index.asof - - result = index.map(f) - expected = Index([f(index[0])]) - tm.assert_index_equal(result, expected) - - def test_groupby_function_tuple_1677(self): - df = DataFrame(np.random.rand(100), - index=date_range("1/1/2000", periods=100)) - monthly_group = df.groupby(lambda x: (x.year, 
x.month)) - - result = monthly_group.mean() - tm.assertIsInstance(result.index[0], tuple) - - def test_append_numpy_bug_1681(self): - # another datetime64 bug - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') - a = DataFrame() - c = DataFrame({'A': 'foo', 'B': dr}, index=dr) - - result = a.append(c) - self.assertTrue((result['B'] == dr).all()) - - def test_isin(self): - index = tm.makeDateIndex(4) - result = index.isin(index) - self.assertTrue(result.all()) - - result = index.isin(list(index)) - self.assertTrue(result.all()) - - assert_almost_equal(index.isin([index[2], 5]), - np.array([False, False, True, False])) - - def test_union(self): - i1 = Int64Index(np.arange(0, 20, 2)) - i2 = Int64Index(np.arange(10, 30, 2)) - result = i1.union(i2) - expected = Int64Index(np.arange(0, 30, 2)) - tm.assert_index_equal(result, expected) - - def test_union_with_DatetimeIndex(self): - i1 = Int64Index(np.arange(0, 20, 2)) - i2 = DatetimeIndex(start='2012-01-03 00:00:00', periods=10, freq='D') - i1.union(i2) # Works - i2.union(i1) # Fails with "AttributeError: can't set attribute" - - def test_time(self): - rng = pd.date_range('1/1/2000', freq='12min', periods=10) - result = pd.Index(rng).time - expected = [t.time() for t in rng] - self.assertTrue((result == expected).all()) - - def test_date(self): - rng = pd.date_range('1/1/2000', freq='12H', periods=10) - result = pd.Index(rng).date - expected = [t.date() for t in rng] - self.assertTrue((result == expected).all()) - - def test_does_not_convert_mixed_integer(self): - df = tm.makeCustomDataframe(10, 10, - data_gen_f=lambda *args, **kwargs: randn(), - r_idx_type='i', c_idx_type='dt') - cols = df.columns.join(df.index, how='outer') - joined = cols.join(df.columns) - self.assertEqual(cols.dtype, np.dtype('O')) - self.assertEqual(cols.dtype, joined.dtype) - tm.assert_numpy_array_equal(cols.values, joined.values) - - def test_slice_keeps_name(self): - # GH4226 - st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') - et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') - dr = pd.date_range(st, et, freq='H', name='timebucket') - self.assertEqual(dr[1:].name, dr.name) - - def test_join_self(self): - index = date_range('1/1/2000', periods=10) - kinds = 'outer', 'inner', 'left', 'right' - for kind in kinds: - joined = index.join(index, how=kind) - self.assertIs(index, joined) - - def assert_index_parameters(self, index): - assert index.freq == '40960N' - assert index.inferred_freq == '40960N' - - def test_ns_index(self): - nsamples = 400 - ns = int(1e9 / 24414) - dtstart = np.datetime64('2012-09-20T00:00:00') - - dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, 'ns') - freq = ns * offsets.Nano() - index = pd.DatetimeIndex(dt, freq=freq, name='time') - self.assert_index_parameters(index) - - new_index = pd.DatetimeIndex(start=index[0], end=index[-1], - freq=index.freq) - self.assert_index_parameters(new_index) - - def test_join_with_period_index(self): - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=lambda *args: np.random.randint(2), - c_idx_type='p', r_idx_type='dt') - s = df.iloc[:5, 0] - joins = 'left', 'right', 'inner', 'outer' - - for join in joins: - with tm.assertRaisesRegexp(ValueError, 'can only call with other ' - 'PeriodIndex-ed objects'): - df.columns.join(s.index, how=join) - - def test_factorize(self): - idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', - '2014-03', '2014-03']) - - exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) - exp_idx = DatetimeIndex(['2014-01', '2014-02', 
'2014-03']) - - arr, idx = idx1.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, exp_idx) - - arr, idx = idx1.factorize(sort=True) - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, exp_idx) - - # tz must be preserved - idx1 = idx1.tz_localize('Asia/Tokyo') - exp_idx = exp_idx.tz_localize('Asia/Tokyo') - - arr, idx = idx1.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, exp_idx) - - idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01', - '2014-03', '2014-01']) - - exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) - exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) - arr, idx = idx2.factorize(sort=True) - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, exp_idx) - - exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) - exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01']) - arr, idx = idx2.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, exp_idx) - - # freq must be preserved - idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo') - exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) - arr, idx = idx3.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, idx3) - - def test_factorize_tz(self): - # GH 13750 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) - idx = base.repeat(5) - - exp_arr = np.arange(100, dtype=np.intp).repeat(5) - - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(res, base) - - def test_factorize_dst(self): - # GH 13750 - idx = pd.date_range('2016-11-06', freq='H', periods=12, - tz='US/Eastern') - - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) - - idx = pd.date_range('2016-06-13', freq='H', periods=12, - tz='US/Eastern') - - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) - - def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) - SLC = pd.IndexSlice - - def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(ts[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - - assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) - - assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) - - assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp( - '2014-10-01'):-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], - SLC[13:8:-1]) - - assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) - - def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts[::0]) - self.assertRaisesRegexp(ValueError, 
'slice step cannot be zero', - lambda: ts.loc[::0]) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - - def test_slice_bounds_empty(self): - # GH 14354 - empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') - - right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') - exp = Timestamp('2015-01-02 23:59:59.999999999') - self.assertEqual(right, exp) - - left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') - exp = Timestamp('2015-01-02 00:00:00') - self.assertEqual(left, exp) - - class TestDatetime64(tm.TestCase): """ Also test support for datetime64[ns] in Series / DataFrame @@ -3969,152 +2716,6 @@ def setUp(self): end=datetime(2005, 1, 10), freq='Min') self.series = Series(rand(len(dti)), dti) - def test_datetimeindex_accessors(self): - dti = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), periods=365) - - self.assertEqual(dti.year[0], 1998) - self.assertEqual(dti.month[0], 1) - self.assertEqual(dti.day[0], 1) - self.assertEqual(dti.hour[0], 0) - self.assertEqual(dti.minute[0], 0) - self.assertEqual(dti.second[0], 0) - self.assertEqual(dti.microsecond[0], 0) - self.assertEqual(dti.dayofweek[0], 3) - - self.assertEqual(dti.dayofyear[0], 1) - self.assertEqual(dti.dayofyear[120], 121) - - self.assertEqual(dti.weekofyear[0], 1) - self.assertEqual(dti.weekofyear[120], 18) - - self.assertEqual(dti.quarter[0], 1) - self.assertEqual(dti.quarter[120], 2) - - self.assertEqual(dti.days_in_month[0], 31) - self.assertEqual(dti.days_in_month[90], 30) - - self.assertEqual(dti.is_month_start[0], True) - self.assertEqual(dti.is_month_start[1], False) - self.assertEqual(dti.is_month_start[31], True) - self.assertEqual(dti.is_quarter_start[0], True) - self.assertEqual(dti.is_quarter_start[90], True) - self.assertEqual(dti.is_year_start[0], True) - self.assertEqual(dti.is_year_start[364], False) - self.assertEqual(dti.is_month_end[0], False) - self.assertEqual(dti.is_month_end[30], True) - self.assertEqual(dti.is_month_end[31], False) - self.assertEqual(dti.is_month_end[364], True) - self.assertEqual(dti.is_quarter_end[0], False) - self.assertEqual(dti.is_quarter_end[30], False) - self.assertEqual(dti.is_quarter_end[89], True) - self.assertEqual(dti.is_quarter_end[364], True) - self.assertEqual(dti.is_year_end[0], False) - self.assertEqual(dti.is_year_end[364], True) - - # GH 11128 - self.assertEqual(dti.weekday_name[4], u'Monday') - self.assertEqual(dti.weekday_name[5], u'Tuesday') - self.assertEqual(dti.weekday_name[6], u'Wednesday') - self.assertEqual(dti.weekday_name[7], u'Thursday') - self.assertEqual(dti.weekday_name[8], u'Friday') - self.assertEqual(dti.weekday_name[9], u'Saturday') - self.assertEqual(dti.weekday_name[10], u'Sunday') - - self.assertEqual(Timestamp('2016-04-04').weekday_name, u'Monday') - self.assertEqual(Timestamp('2016-04-05').weekday_name, u'Tuesday') - self.assertEqual(Timestamp('2016-04-06').weekday_name, u'Wednesday') - self.assertEqual(Timestamp('2016-04-07').weekday_name, u'Thursday') - self.assertEqual(Timestamp('2016-04-08').weekday_name, u'Friday') - self.assertEqual(Timestamp('2016-04-09').weekday_name, u'Saturday') - self.assertEqual(Timestamp('2016-04-10').weekday_name, u'Sunday') - - self.assertEqual(len(dti.year), 365) - self.assertEqual(len(dti.month), 365) - self.assertEqual(len(dti.day), 365) - self.assertEqual(len(dti.hour), 365) - self.assertEqual(len(dti.minute), 365) - self.assertEqual(len(dti.second), 365) - self.assertEqual(len(dti.microsecond), 365) - 
self.assertEqual(len(dti.dayofweek), 365) - self.assertEqual(len(dti.dayofyear), 365) - self.assertEqual(len(dti.weekofyear), 365) - self.assertEqual(len(dti.quarter), 365) - self.assertEqual(len(dti.is_month_start), 365) - self.assertEqual(len(dti.is_month_end), 365) - self.assertEqual(len(dti.is_quarter_start), 365) - self.assertEqual(len(dti.is_quarter_end), 365) - self.assertEqual(len(dti.is_year_start), 365) - self.assertEqual(len(dti.is_year_end), 365) - self.assertEqual(len(dti.weekday_name), 365) - - dti = DatetimeIndex(freq='BQ-FEB', start=datetime(1998, 1, 1), - periods=4) - - self.assertEqual(sum(dti.is_quarter_start), 0) - self.assertEqual(sum(dti.is_quarter_end), 4) - self.assertEqual(sum(dti.is_year_start), 0) - self.assertEqual(sum(dti.is_year_end), 1) - - # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, - # CBD requires np >= 1.7 - bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu') - dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) - self.assertRaises(ValueError, lambda: dti.is_month_start) - - dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) - - self.assertEqual(dti.is_month_start[0], 1) - - tests = [ - (Timestamp('2013-06-01', freq='M').is_month_start, 1), - (Timestamp('2013-06-01', freq='BM').is_month_start, 0), - (Timestamp('2013-06-03', freq='M').is_month_start, 0), - (Timestamp('2013-06-03', freq='BM').is_month_start, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1), - (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1), - (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0), - (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0), - (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1), - (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1), - (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1), - (Timestamp('2013-06-30', freq='BQ').is_month_end, 0), - (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0), - (Timestamp('2013-06-30', freq='BQ').is_year_end, 0), - (Timestamp('2013-06-28', freq='BQ').is_month_end, 1), - (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1), - (Timestamp('2013-06-28', freq='BQ').is_year_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0), - (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1), - (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1), - (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1), - (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1), - (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1), - (Timestamp('2012-02-01').days_in_month, 29), - (Timestamp('2013-02-01').days_in_month, 28)] - - for ts, value in tests: - self.assertEqual(ts, value) - - def test_nanosecond_field(self): - dti = DatetimeIndex(np.arange(10)) - - self.assert_numpy_array_equal(dti.nanosecond, - np.arange(10, dtype=np.int32)) - - def test_datetimeindex_diff(self): - dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=100) - dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=98) - 
self.assertEqual(len(dti1.difference(dti2)), 2) - def test_fancy_getitem(self): dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), end=datetime(2010, 1, 1)) @@ -4143,87 +2744,6 @@ def test_fancy_setitem(self): s['1/2/2009':'2009-06-05'] = -3 self.assertTrue((s[48:54] == -3).all()) - def test_datetimeindex_constructor(self): - arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] - self.assertRaises(Exception, DatetimeIndex, arr) - - arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] - idx1 = DatetimeIndex(arr) - - arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] - idx2 = DatetimeIndex(arr) - - arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', - '2005-01-04'] - idx3 = DatetimeIndex(arr) - - arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', - '2005-01-04'], dtype='O') - idx4 = DatetimeIndex(arr) - - arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) - idx5 = DatetimeIndex(arr) - - arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' - ]) - idx6 = DatetimeIndex(arr) - - idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) - idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, - yearfirst=True) - tm.assert_index_equal(idx7, idx8) - - for other in [idx2, idx3, idx4, idx5, idx6]: - self.assertTrue((idx1.values == other.values).all()) - - sdate = datetime(1999, 12, 25) - edate = datetime(2000, 1, 1) - idx = DatetimeIndex(start=sdate, freq='1B', periods=20) - self.assertEqual(len(idx), 20) - self.assertEqual(idx[0], sdate + 0 * offsets.BDay()) - self.assertEqual(idx.freq, 'B') - - idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) - self.assertEqual(len(idx), 20) - self.assertEqual(idx[-1], edate) - self.assertEqual(idx.freq, '5D') - - idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') - idx2 = DatetimeIndex(start=sdate, end=edate, - freq=offsets.Week(weekday=6)) - self.assertEqual(len(idx1), len(idx2)) - self.assertEqual(idx1.offset, idx2.offset) - - idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') - idx2 = DatetimeIndex(start=sdate, end=edate, - freq=offsets.QuarterBegin(startingMonth=1)) - self.assertEqual(len(idx1), len(idx2)) - self.assertEqual(idx1.offset, idx2.offset) - - idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') - idx2 = DatetimeIndex(start=sdate, end=edate, - freq=offsets.BQuarterEnd(startingMonth=12)) - self.assertEqual(len(idx1), len(idx2)) - self.assertEqual(idx1.offset, idx2.offset) - - def test_dayfirst(self): - # GH 5917 - arr = ['10/02/2014', '11/02/2014', '12/02/2014'] - expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), - datetime(2014, 2, 12)]) - idx1 = DatetimeIndex(arr, dayfirst=True) - idx2 = DatetimeIndex(np.array(arr), dayfirst=True) - idx3 = to_datetime(arr, dayfirst=True) - idx4 = to_datetime(np.array(arr), dayfirst=True) - idx5 = DatetimeIndex(Index(arr), dayfirst=True) - idx6 = DatetimeIndex(Series(arr), dayfirst=True) - tm.assert_index_equal(expected, idx1) - tm.assert_index_equal(expected, idx2) - tm.assert_index_equal(expected, idx3) - tm.assert_index_equal(expected, idx4) - tm.assert_index_equal(expected, idx5) - tm.assert_index_equal(expected, idx6) - def test_dti_snap(self): dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') @@ -4255,43 +2775,6 @@ def test_dti_reset_index_round_trip(self): self.assertEqual(df.index[0], stamp) self.assertEqual(df.reset_index()['Date'][0], stamp) - def test_dti_set_index_reindex(self): - 
# GH 6631 - df = DataFrame(np.random.random(6)) - idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') - idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') - - df = df.set_index(idx1) - tm.assert_index_equal(df.index, idx1) - df = df.reindex(idx2) - tm.assert_index_equal(df.index, idx2) - - # 11314 - # with tz - index = date_range(datetime(2015, 10, 1), - datetime(2015, 10, 1, 23), - freq='H', tz='US/Eastern') - df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) - new_index = date_range(datetime(2015, 10, 2), - datetime(2015, 10, 2, 23), - freq='H', tz='US/Eastern') - - # TODO: unused? - result = df.set_index(new_index) # noqa - - self.assertEqual(new_index.freq, index.freq) - - def test_datetimeindex_union_join_empty(self): - dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') - empty = Index([]) - - result = dti.union(empty) - tm.assertIsInstance(result, DatetimeIndex) - self.assertIs(result, result) - - result = dti.join(empty) - tm.assertIsInstance(result, DatetimeIndex) - def test_series_set_value(self): # #1561 diff --git a/setup.py b/setup.py index 0c4dd33a70482..2ba4331aa1561 100755 --- a/setup.py +++ b/setup.py @@ -640,6 +640,7 @@ def pxd(name): 'pandas.tests', 'pandas.tests.frame', 'pandas.tests.indexes', + 'pandas.tests.indexes.datetimes', 'pandas.tests.groupby', 'pandas.tests.series', 'pandas.tests.formats', From 84a7adf28bd2cf3989d2236fc52df22dc4d7452d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Feb 2017 20:18:13 -0500 Subject: [PATCH 095/338] TST: create the pandas/tests/scalar directory structure --- pandas/tests/scalar/__init__.py | 0 pandas/tests/scalar/test_timedelta.py | 1 + pandas/tests/scalar/test_timestamp.py | 1 + setup.py | 1 + 4 files changed, 3 insertions(+) create mode 100644 pandas/tests/scalar/__init__.py create mode 100644 pandas/tests/scalar/test_timedelta.py create mode 100644 pandas/tests/scalar/test_timestamp.py diff --git a/pandas/tests/scalar/__init__.py b/pandas/tests/scalar/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py new file mode 100644 index 0000000000000..fab790c4bf948 --- /dev/null +++ b/pandas/tests/scalar/test_timedelta.py @@ -0,0 +1 @@ +""" test the scalar Timedelta """ diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py new file mode 100644 index 0000000000000..2159c59de72ce --- /dev/null +++ b/pandas/tests/scalar/test_timestamp.py @@ -0,0 +1 @@ +""" test the scalar Timestamp """ diff --git a/setup.py b/setup.py index 2ba4331aa1561..93a044bc3cc7d 100755 --- a/setup.py +++ b/setup.py @@ -644,6 +644,7 @@ def pxd(name): 'pandas.tests.groupby', 'pandas.tests.series', 'pandas.tests.formats', + 'pandas.tests.scalar', 'pandas.tests.types', 'pandas.tests.test_msgpack', 'pandas.tests.plotting', From 8f124babc6f6814eedc64494e6319460ca4560c9 Mon Sep 17 00:00:00 2001 From: Kacawi Date: Fri, 3 Feb 2017 14:50:25 +0100 Subject: [PATCH 096/338] Added a tutorial for pandas dataframes (#15295) --- doc/source/tutorials.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index c1c1c81915c46..2489b787560d0 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -177,3 +177,4 @@ Various Tutorials - `Intro to pandas data structures, by Greg Reda `_ - `Pandas and Python: Top 10, by Manish Amde `_ - `Pandas Tutorial, by Mikhail Semeniuk `_ +- `Pandas DataFrames Tutorial, by 
Karlijn Willems `_ From 6ff9083343a751cb6a6269d67aca45e236da4ee5 Mon Sep 17 00:00:00 2001 From: TrigonaMinima Date: Sat, 4 Feb 2017 11:02:30 -0500 Subject: [PATCH 097/338] TST: Timestamp and Timeseries tests reorg (gh14854) xref partial on #14854 Author: TrigonaMinima Closes #15301 from TrigonaMinima/gh14854-timestamp and squashes the following commits: d8e3f4d [TrigonaMinima] splitting test_timeseries.py further 4072d93 [TrigonaMinima] TST: tseries/tests/test_timeseries.py tests moved to appropriate places. dbfd2ba [TrigonaMinima] TST: Timestamp tests compiled (gh14854) --- .../indexes/datetimes/test_construction.py | 69 +- .../indexes/datetimes/test_date_range.py | 112 + .../tests/indexes/datetimes/test_datetime.py | 68 - pandas/tests/indexes/datetimes/test_misc.py | 214 +- pandas/tests/indexes/test_timedelta.py | 43 + pandas/tests/scalar/test_timestamp.py | 1577 +++++++ pandas/tests/series/test_missing.py | 103 + pandas/tests/series/test_timeseries.py | 3147 ++++++++++++- pandas/tseries/tests/test_timeseries.py | 4184 ----------------- pandas/tseries/tests/test_tslib.py | 869 +--- 10 files changed, 5183 insertions(+), 5203 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/test_date_range.py create mode 100644 pandas/tests/indexes/test_timedelta.py delete mode 100644 pandas/tseries/tests/test_timeseries.py diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index ae4eb6ee397b6..f8eca0f0d91d0 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -4,7 +4,8 @@ import pandas as pd import pandas.util.testing as tm from pandas.tslib import OutOfBoundsDatetime -from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range) +from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range, + to_datetime) class TestDatetimeIndex(tm.TestCase): @@ -423,3 +424,69 @@ def test_000constructor_resolution(self): idx = DatetimeIndex([t1]) self.assertEqual(idx.nanosecond[0], t1.nanosecond) + + +class TestTimeSeries(tm.TestCase): + _multiprocess_can_split_ = True + + def test_dti_constructor_preserve_dti_freq(self): + rng = date_range('1/1/2000', '1/2/2000', freq='5min') + + rng2 = DatetimeIndex(rng) + self.assertEqual(rng.freq, rng2.freq) + + def test_dti_constructor_years_only(self): + # GH 6961 + for tz in [None, 'UTC', 'Asia/Tokyo', 'dateutil/US/Pacific']: + rng1 = date_range('2014', '2015', freq='M', tz=tz) + expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz) + + rng2 = date_range('2014', '2015', freq='MS', tz=tz) + expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', + tz=tz) + + rng3 = date_range('2014', '2020', freq='A', tz=tz) + expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz) + + rng4 = date_range('2014', '2020', freq='AS', tz=tz) + expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', + tz=tz) + + for rng, expected in [(rng1, expected1), (rng2, expected2), + (rng3, expected3), (rng4, expected4)]: + tm.assert_index_equal(rng, expected) + + def test_dti_constructor_small_int(self): + # GH 13721 + exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', + '1970-01-01 00:00:00.00000001', + '1970-01-01 00:00:00.00000002']) + + for dtype in [np.int64, np.int32, np.int16, np.int8]: + arr = np.array([0, 10, 20], dtype=dtype) + tm.assert_index_equal(DatetimeIndex(arr), exp) + + def test_dti_constructor_numpy_timeunits(self): + # GH 9114 + base = pd.to_datetime(['2000-01-01T00:00', 
'2000-01-02T00:00', 'NaT']) + + for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', + 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: + values = base.values.astype(dtype) + + tm.assert_index_equal(DatetimeIndex(values), base) + tm.assert_index_equal(to_datetime(values), base) + + def test_constructor_int64_nocopy(self): + # #1624 + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr) + + arr[50:100] = -1 + self.assertTrue((index.asi8[50:100] == -1).all()) + + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr, copy=True) + + arr[50:100] = -1 + self.assertTrue((index.asi8[50:100] != -1).all()) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py new file mode 100644 index 0000000000000..b3d6c41573ab8 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -0,0 +1,112 @@ +from datetime import datetime, timedelta, time + +import pandas as pd +import pandas.util.testing as tm +from pandas import date_range, offsets, DatetimeIndex, Timestamp + +from pandas.tests.series.common import TestData + + +class TestTimeSeries(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def test_date_range_gen_error(self): + rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') + self.assertEqual(len(rng), 4) + + def test_date_range_negative_freq(self): + # GH 11018 + rng = date_range('2011-12-31', freq='-2A', periods=3) + exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31', + '2007-12-31'], freq='-2A') + tm.assert_index_equal(rng, exp) + self.assertEqual(rng.freq, '-2A') + + rng = date_range('2011-01-31', freq='-2M', periods=3) + exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30', + '2010-09-30'], freq='-2M') + tm.assert_index_equal(rng, exp) + self.assertEqual(rng.freq, '-2M') + + def test_date_range_bms_bug(self): + # #1645 + rng = date_range('1/1/2000', periods=10, freq='BMS') + + ex_first = Timestamp('2000-01-03') + self.assertEqual(rng[0], ex_first) + + def test_date_range_normalize(self): + snap = datetime.today() + n = 50 + + rng = date_range(snap, periods=n, normalize=False, freq='2D') + + offset = timedelta(2) + values = DatetimeIndex([snap + i * offset for i in range(n)]) + + tm.assert_index_equal(rng, values) + + rng = date_range('1/1/2000 08:15', periods=n, normalize=False, + freq='B') + the_time = time(8, 15) + for val in rng: + self.assertEqual(val.time(), the_time) + + def test_date_range_fy5252(self): + dr = date_range(start="2013-01-01", periods=2, freq=offsets.FY5253( + startingMonth=1, weekday=3, variation="nearest")) + self.assertEqual(dr[0], Timestamp('2013-01-31')) + self.assertEqual(dr[1], Timestamp('2014-01-30')) + + def test_date_range_ambiguous_arguments(self): + # #2538 + start = datetime(2011, 1, 1, 5, 3, 40) + end = datetime(2011, 1, 1, 8, 9, 40) + + self.assertRaises(ValueError, date_range, start, end, freq='s', + periods=10) + + def test_date_range_businesshour(self): + idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', + '2014-07-04 11:00', + '2014-07-04 12:00', '2014-07-04 13:00', + '2014-07-04 14:00', + '2014-07-04 15:00', '2014-07-04 16:00'], + freq='BH') + rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH') + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex( + ['2014-07-04 16:00', '2014-07-07 09:00'], freq='BH') + rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH') + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', + '2014-07-04 
11:00', + '2014-07-04 12:00', '2014-07-04 13:00', + '2014-07-04 14:00', + '2014-07-04 15:00', '2014-07-04 16:00', + '2014-07-07 09:00', '2014-07-07 10:00', + '2014-07-07 11:00', + '2014-07-07 12:00', '2014-07-07 13:00', + '2014-07-07 14:00', + '2014-07-07 15:00', '2014-07-07 16:00', + '2014-07-08 09:00', '2014-07-08 10:00', + '2014-07-08 11:00', + '2014-07-08 12:00', '2014-07-08 13:00', + '2014-07-08 14:00', + '2014-07-08 15:00', '2014-07-08 16:00'], + freq='BH') + rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH') + tm.assert_index_equal(idx, rng) + + def test_range_misspecified(self): + # GH #1095 + + self.assertRaises(ValueError, date_range, '1/1/2000') + self.assertRaises(ValueError, date_range, end='1/1/2000') + self.assertRaises(ValueError, date_range, periods=10) + + self.assertRaises(ValueError, date_range, '1/1/2000', freq='H') + self.assertRaises(ValueError, date_range, end='1/1/2000', freq='H') + self.assertRaises(ValueError, date_range, periods=10, freq='H') diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a69406804cd97..f92fca6ecfa14 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -457,74 +457,6 @@ def test_sort_values(self): self.assert_numpy_array_equal(dexer, np.array([0, 2, 1], dtype=np.intp)) - def test_round(self): - - # round - dt = Timestamp('20130101 09:10:11') - result = dt.round('D') - expected = Timestamp('20130101') - self.assertEqual(result, expected) - - dt = Timestamp('20130101 19:10:11') - result = dt.round('D') - expected = Timestamp('20130102') - self.assertEqual(result, expected) - - dt = Timestamp('20130201 12:00:00') - result = dt.round('D') - expected = Timestamp('20130202') - self.assertEqual(result, expected) - - dt = Timestamp('20130104 12:00:00') - result = dt.round('D') - expected = Timestamp('20130105') - self.assertEqual(result, expected) - - dt = Timestamp('20130104 12:32:00') - result = dt.round('30Min') - expected = Timestamp('20130104 12:30:00') - self.assertEqual(result, expected) - - dti = date_range('20130101 09:10:11', periods=5) - result = dti.round('D') - expected = date_range('20130101', periods=5) - tm.assert_index_equal(result, expected) - - # floor - dt = Timestamp('20130101 09:10:11') - result = dt.floor('D') - expected = Timestamp('20130101') - self.assertEqual(result, expected) - - # ceil - dt = Timestamp('20130101 09:10:11') - result = dt.ceil('D') - expected = Timestamp('20130102') - self.assertEqual(result, expected) - - # round with tz - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('D') - expected = Timestamp('20130101', tz='US/Eastern') - self.assertEqual(result, expected) - - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('s') - self.assertEqual(result, dt) - - dti = date_range('20130101 09:10:11', - periods=5).tz_localize('UTC').tz_convert('US/Eastern') - result = dti.round('D') - expected = date_range('20130101', periods=5).tz_localize('US/Eastern') - tm.assert_index_equal(result, expected) - - result = dti.round('s') - tm.assert_index_equal(result, dti) - - # invalid - for freq in ['Y', 'M', 'foobar']: - self.assertRaises(ValueError, lambda: dti.round(freq)) - def test_take(self): dates = [datetime(2010, 1, 1, 14), datetime(2010, 1, 1, 15), datetime(2010, 1, 1, 17), datetime(2010, 1, 1, 21)] diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 
3dfe95fa77b85..4685df580190b 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,10 +1,12 @@ import numpy as np +import pandas as pd import pandas.lib as lib import pandas.util.testing as tm -from pandas import Float64Index, date_range, Timestamp from pandas import (Index, DatetimeIndex, datetime, offsets, to_datetime, - Series, DataFrame) + Series, DataFrame, Float64Index, date_range, Timestamp) + +from pandas.util.testing import assert_series_equal class TestDateTimeIndexToJulianDate(tm.TestCase): @@ -65,6 +67,196 @@ def test_pass_datetimeindex_to_index(self): self.assert_numpy_array_equal(idx.values, expected.values) + def test_range_edges(self): + # GH 13672 + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'), + end=Timestamp('1970-01-01 00:00:00.000000004'), + freq='N') + exp = DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000004'), + end=Timestamp('1970-01-01 00:00:00.000000001'), + freq='N') + exp = DatetimeIndex([]) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'), + end=Timestamp('1970-01-01 00:00:00.000000001'), + freq='N') + exp = DatetimeIndex(['1970-01-01 00:00:00.000000001']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000001'), + end=Timestamp('1970-01-01 00:00:00.000004'), + freq='U') + exp = DatetimeIndex(['1970-01-01 00:00:00.000001', + '1970-01-01 00:00:00.000002', + '1970-01-01 00:00:00.000003', + '1970-01-01 00:00:00.000004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.001'), + end=Timestamp('1970-01-01 00:00:00.004'), + freq='L') + exp = DatetimeIndex(['1970-01-01 00:00:00.001', + '1970-01-01 00:00:00.002', + '1970-01-01 00:00:00.003', + '1970-01-01 00:00:00.004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:01'), + end=Timestamp('1970-01-01 00:00:04'), freq='S') + exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02', + '1970-01-01 00:00:03', '1970-01-01 00:00:04']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:01'), + end=Timestamp('1970-01-01 00:04'), freq='T') + exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02', + '1970-01-01 00:03', '1970-01-01 00:04']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 01:00'), + end=Timestamp('1970-01-01 04:00'), freq='H') + exp = DatetimeIndex(['1970-01-01 01:00', '1970-01-01 02:00', + '1970-01-01 03:00', '1970-01-01 04:00']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01'), + end=Timestamp('1970-01-04'), freq='D') + exp = DatetimeIndex(['1970-01-01', '1970-01-02', + '1970-01-03', '1970-01-04']) + tm.assert_index_equal(idx, exp) + + def test_datetimeindex_integers_shift(self): + rng = date_range('1/1/2000', periods=20) + + result = rng + 5 + expected = rng.shift(5) + tm.assert_index_equal(result, expected) + + result = rng - 5 + expected = rng.shift(-5) + tm.assert_index_equal(result, expected) + + def test_datetimeindex_repr_short(self): + dr = date_range(start='1/1/2012', periods=1) + repr(dr) + + dr = date_range(start='1/1/2012', periods=2) + repr(dr) + + dr = date_range(start='1/1/2012', periods=3) + 
repr(dr) + + def test_getitem_setitem_datetimeindex(self): + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04:00:00"] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00"] = 0 + result["1990-01-01 04:00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04:00:00" + rb = "1990-01-01 07:00:00" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + # repeat all the above with naive datetimes + result = ts[datetime(1990, 1, 1, 4)] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4)] = 0 + result[datetime(1990, 1, 1, 4)] = ts[4] + assert_series_equal(result, ts) + + result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] + assert_series_equal(result, ts) + + lb = datetime(1990, 1, 1, 4) + rb = datetime(1990, 1, 1, 7) + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts[ts.index[4]] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + # also test partial date slicing + result = ts["1990-01-02"] + expected = ts[24:48] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-02"] = 0 + result["1990-01-02"] = ts[24:48] + assert_series_equal(result, ts) + + def test_normalize(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D') + tm.assert_index_equal(result, expected) + + rng_ns = pd.DatetimeIndex(np.array([1380585623454345752, + 1380585612343234312]).astype( + "datetime64[ns]")) + rng_ns_normalized = rng_ns.normalize() + expected = pd.DatetimeIndex(np.array([1380585600000000000, + 1380585600000000000]).astype( + "datetime64[ns]")) + tm.assert_index_equal(rng_ns_normalized, expected) + + self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + + def test_series_ctor_plus_datetimeindex(self): + rng = date_range('20090415', '20090519', freq='B') + data = dict((k, 1) for k in rng) + + result = Series(data, index=rng) + self.assertIs(result.index, rng) + class TestDatetime64(tm.TestCase): @@ -331,3 +523,21 @@ def test_datetimeindex_union_join_empty(self): result = dti.join(empty) tm.assertIsInstance(result, DatetimeIndex) + + +class TestTimeSeriesDuplicates(tm.TestCase): + _multiprocess_can_split_ = True + + def test_recreate_from_data(self): + freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', + 'C'] + + for f in freqs: + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) + idx = DatetimeIndex(org, freq=f) + 
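+ # round-trip: rebuilding an index from an existing one must preserve values and freq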
tm.assert_index_equal(idx, org) + + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, + tz='US/Pacific', periods=1) + idx = DatetimeIndex(org, freq=f, tz='US/Pacific') + tm.assert_index_equal(idx, org) diff --git a/pandas/tests/indexes/test_timedelta.py b/pandas/tests/indexes/test_timedelta.py new file mode 100644 index 0000000000000..be01ad03a0660 --- /dev/null +++ b/pandas/tests/indexes/test_timedelta.py @@ -0,0 +1,43 @@ +import numpy as np +from datetime import timedelta + +import pandas as pd +import pandas.util.testing as tm +from pandas import (timedelta_range, date_range, Series, Timedelta, + DatetimeIndex) + + +class TestSlicing(tm.TestCase): + + def test_timedelta(self): + # this is valid too + index = date_range('1/1/2000', periods=50, freq='B') + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + self.assertTrue(tm.equalContents(index, back)) + self.assertEqual(shifted.freq, index.freq) + self.assertEqual(shifted.freq, back.freq) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) + + # GH4134, buggy with timedeltas + rng = date_range('2013', '2014') + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) + + +class TestTimeSeries(tm.TestCase): + _multiprocess_can_split_ = True + + def test_series_box_timedelta(self): + rng = timedelta_range('1 day 1 s', periods=5, freq='h') + s = Series(rng) + tm.assertIsInstance(s[1], Timedelta) + tm.assertIsInstance(s.iat[2], Timedelta) diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 2159c59de72ce..94369ebbd0a19 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -1 +1,1578 @@ """ test the scalar Timestamp """ + +import sys +import operator +import calendar +import numpy as np +from datetime import datetime, timedelta +from distutils.version import LooseVersion + +import pandas as pd +import pandas.util.testing as tm +import pandas._period as period +from pandas.tseries import offsets, frequencies +from pandas.tslib import get_timezone, iNaT +from pandas.compat import lrange, long +from pandas.util.testing import assert_series_equal +from pandas.compat.numpy import np_datetime64_compat +from pandas import (Timestamp, date_range, Period, Timedelta, tslib, compat, + Series, NaT, isnull, DataFrame, DatetimeIndex) +from pandas.tseries.frequencies import (RESO_DAY, RESO_HR, RESO_MIN, RESO_US, + RESO_MS, RESO_SEC) + +randn = np.random.randn + + +class TestTimestamp(tm.TestCase): + + def test_constructor(self): + base_str = '2014-07-01 09:00' + base_dt = datetime(2014, 7, 1, 9) + base_expected = 1404205200000000000 + + # confirm base representation is correct + import calendar + self.assertEqual(calendar.timegm(base_dt.timetuple()) * 1000000000, + base_expected) + + tests = [(base_str, base_dt, base_expected), + ('2014-07-01 10:00', datetime(2014, 7, 1, 10), + base_expected + 3600 * 1000000000), + ('2014-07-01 09:00:00.000008000', + datetime(2014, 7, 1, 9, 0, 0, 8), + base_expected + 8000), + ('2014-07-01 09:00:00.000000005', + Timestamp('2014-07-01 09:00:00.000000005'), + base_expected + 5)] + + tm._skip_if_no_pytz() + tm._skip_if_no_dateutil() + import pytz + import dateutil + timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), + 
('US/Eastern', -4), ('dateutil/US/Pacific', -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5)] + + for date_str, date, expected in tests: + for result in [Timestamp(date_str), Timestamp(date)]: + # only with timestring + self.assertEqual(result.value, expected) + self.assertEqual(tslib.pydt_to_i8(result), expected) + + # re-creation shouldn't affect to internal value + result = Timestamp(result) + self.assertEqual(result.value, expected) + self.assertEqual(tslib.pydt_to_i8(result), expected) + + # with timezone + for tz, offset in timezones: + for result in [Timestamp(date_str, tz=tz), Timestamp(date, + tz=tz)]: + expected_tz = expected - offset * 3600 * 1000000000 + self.assertEqual(result.value, expected_tz) + self.assertEqual(tslib.pydt_to_i8(result), expected_tz) + + # should preserve tz + result = Timestamp(result) + self.assertEqual(result.value, expected_tz) + self.assertEqual(tslib.pydt_to_i8(result), expected_tz) + + # should convert to UTC + result = Timestamp(result, tz='UTC') + expected_utc = expected - offset * 3600 * 1000000000 + self.assertEqual(result.value, expected_utc) + self.assertEqual(tslib.pydt_to_i8(result), expected_utc) + + def test_constructor_with_stringoffset(self): + # GH 7833 + base_str = '2014-07-01 11:00:00+02:00' + base_dt = datetime(2014, 7, 1, 9) + base_expected = 1404205200000000000 + + # confirm base representation is correct + import calendar + self.assertEqual(calendar.timegm(base_dt.timetuple()) * 1000000000, + base_expected) + + tests = [(base_str, base_expected), + ('2014-07-01 12:00:00+02:00', + base_expected + 3600 * 1000000000), + ('2014-07-01 11:00:00.000008000+02:00', base_expected + 8000), + ('2014-07-01 11:00:00.000000005+02:00', base_expected + 5)] + + tm._skip_if_no_pytz() + tm._skip_if_no_dateutil() + import pytz + import dateutil + timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), + ('US/Eastern', -4), ('dateutil/US/Pacific', -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5)] + + for date_str, expected in tests: + for result in [Timestamp(date_str)]: + # only with timestring + self.assertEqual(result.value, expected) + self.assertEqual(tslib.pydt_to_i8(result), expected) + + # re-creation shouldn't affect to internal value + result = Timestamp(result) + self.assertEqual(result.value, expected) + self.assertEqual(tslib.pydt_to_i8(result), expected) + + # with timezone + for tz, offset in timezones: + result = Timestamp(date_str, tz=tz) + expected_tz = expected + self.assertEqual(result.value, expected_tz) + self.assertEqual(tslib.pydt_to_i8(result), expected_tz) + + # should preserve tz + result = Timestamp(result) + self.assertEqual(result.value, expected_tz) + self.assertEqual(tslib.pydt_to_i8(result), expected_tz) + + # should convert to UTC + result = Timestamp(result, tz='UTC') + expected_utc = expected + self.assertEqual(result.value, expected_utc) + self.assertEqual(tslib.pydt_to_i8(result), expected_utc) + + # This should be 2013-11-01 05:00 in UTC + # converted to Chicago tz + result = Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago') + self.assertEqual(result.value, Timestamp('2013-11-01 05:00').value) + expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa + self.assertEqual(repr(result), expected) + self.assertEqual(result, eval(repr(result))) + + # This should be 2013-11-01 05:00 in UTC + # converted to Tokyo tz (+09:00) + result = Timestamp('2013-11-01 00:00:00-0500', tz='Asia/Tokyo') + 
self.assertEqual(result.value, Timestamp('2013-11-01 05:00').value) + expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" + self.assertEqual(repr(result), expected) + self.assertEqual(result, eval(repr(result))) + + # GH11708 + # This should be 2015-11-18 10:00 in UTC + # converted to Asia/Katmandu + result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu") + self.assertEqual(result.value, Timestamp("2015-11-18 10:00").value) + expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')" + self.assertEqual(repr(result), expected) + self.assertEqual(result, eval(repr(result))) + + # This should be 2015-11-18 10:00 in UTC + # converted to Asia/Kolkata + result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata") + self.assertEqual(result.value, Timestamp("2015-11-18 10:00").value) + expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')" + self.assertEqual(repr(result), expected) + self.assertEqual(result, eval(repr(result))) + + def test_constructor_invalid(self): + with tm.assertRaisesRegexp(TypeError, 'Cannot convert input'): + Timestamp(slice(2)) + with tm.assertRaisesRegexp(ValueError, 'Cannot convert Period'): + Timestamp(Period('1000-01-01')) + + def test_constructor_positional(self): + # GH 10758 + with tm.assertRaises(TypeError): + Timestamp(2000, 1) + with tm.assertRaises(ValueError): + Timestamp(2000, 0, 1) + with tm.assertRaises(ValueError): + Timestamp(2000, 13, 1) + with tm.assertRaises(ValueError): + Timestamp(2000, 1, 0) + with tm.assertRaises(ValueError): + Timestamp(2000, 1, 32) + + # GH 11630 + self.assertEqual( + repr(Timestamp(2015, 11, 12)), + repr(Timestamp('20151112'))) + + self.assertEqual( + repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)), + repr(Timestamp('2015-11-12 01:02:03.999999'))) + + self.assertIs(Timestamp(None), pd.NaT) + + def test_constructor_keyword(self): + # GH 10758 + with tm.assertRaises(TypeError): + Timestamp(year=2000, month=1) + with tm.assertRaises(ValueError): + Timestamp(year=2000, month=0, day=1) + with tm.assertRaises(ValueError): + Timestamp(year=2000, month=13, day=1) + with tm.assertRaises(ValueError): + Timestamp(year=2000, month=1, day=0) + with tm.assertRaises(ValueError): + Timestamp(year=2000, month=1, day=32) + + self.assertEqual( + repr(Timestamp(year=2015, month=11, day=12)), + repr(Timestamp('20151112'))) + + self.assertEqual( + repr(Timestamp(year=2015, month=11, day=12, + hour=1, minute=2, second=3, microsecond=999999)), + repr(Timestamp('2015-11-12 01:02:03.999999'))) + + def test_constructor_fromordinal(self): + base = datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal(), freq='D') + self.assertEqual(base, ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') + self.assertEqual(pd.Timestamp('2000-01-01', tz='US/Eastern'), ts) + self.assertEqual(base.toordinal(), ts.toordinal()) + + def test_constructor_offset_depr(self): + # GH 12160 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp('2011-01-01', offset='D') + self.assertEqual(ts.freq, 'D') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.assertEqual(ts.offset, 'D') + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp('2011-01-01', offset='D', freq='D') + + def test_constructor_offset_depr_fromordinal(self): + # GH 12160 + base = datetime(2000, 1, 1) + + with 
tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp.fromordinal(base.toordinal(), offset='D') + self.assertEqual(pd.Timestamp('2000-01-01'), ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') + + def test_conversion(self): + # GH 9255 + ts = Timestamp('2000-01-01') + + result = ts.to_pydatetime() + expected = datetime(2000, 1, 1) + self.assertEqual(result, expected) + self.assertEqual(type(result), type(expected)) + + result = ts.to_datetime64() + expected = np.datetime64(ts.value, 'ns') + self.assertEqual(result, expected) + self.assertEqual(type(result), type(expected)) + self.assertEqual(result.dtype, expected.dtype) + + def test_repr(self): + tm._skip_if_no_pytz() + tm._skip_if_no_dateutil() + + dates = ['2014-03-07', '2014-01-01 09:00', + '2014-01-01 00:00:00.000000001'] + + # dateutil zone change (only matters for repr) + import dateutil + if (dateutil.__version__ >= LooseVersion('2.3') and + (dateutil.__version__ <= LooseVersion('2.4.0') or + dateutil.__version__ >= LooseVersion('2.6.0'))): + timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific'] + else: + timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/America/Los_Angeles'] + + freqs = ['D', 'M', 'S', 'N'] + + for date in dates: + for tz in timezones: + for freq in freqs: + + # avoid to match with timezone name + freq_repr = "'{0}'".format(freq) + if tz.startswith('dateutil'): + tz_repr = tz.replace('dateutil', '') + else: + tz_repr = tz + + date_only = Timestamp(date) + self.assertIn(date, repr(date_only)) + self.assertNotIn(tz_repr, repr(date_only)) + self.assertNotIn(freq_repr, repr(date_only)) + self.assertEqual(date_only, eval(repr(date_only))) + + date_tz = Timestamp(date, tz=tz) + self.assertIn(date, repr(date_tz)) + self.assertIn(tz_repr, repr(date_tz)) + self.assertNotIn(freq_repr, repr(date_tz)) + self.assertEqual(date_tz, eval(repr(date_tz))) + + date_freq = Timestamp(date, freq=freq) + self.assertIn(date, repr(date_freq)) + self.assertNotIn(tz_repr, repr(date_freq)) + self.assertIn(freq_repr, repr(date_freq)) + self.assertEqual(date_freq, eval(repr(date_freq))) + + date_tz_freq = Timestamp(date, tz=tz, freq=freq) + self.assertIn(date, repr(date_tz_freq)) + self.assertIn(tz_repr, repr(date_tz_freq)) + self.assertIn(freq_repr, repr(date_tz_freq)) + self.assertEqual(date_tz_freq, eval(repr(date_tz_freq))) + + # this can cause the tz field to be populated, but it's redundant to + # information in the datestring + tm._skip_if_no_pytz() + import pytz # noqa + date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) + self.assertIn('2014-03-13 00:00:00-0400', repr(date_with_utc_offset)) + self.assertNotIn('tzoffset', repr(date_with_utc_offset)) + self.assertIn('pytz.FixedOffset(-240)', repr(date_with_utc_offset)) + expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'", + 'pytz.FixedOffset(-240)') + self.assertEqual(date_with_utc_offset, eval(expr)) + + def test_bounds_with_different_units(self): + out_of_bounds_dates = ('1677-09-21', '2262-04-12', ) + + time_units = ('D', 'h', 'm', 's', 'ms', 'us') + + for date_string in out_of_bounds_dates: + for unit in time_units: + self.assertRaises(ValueError, Timestamp, np.datetime64( + date_string, dtype='M8[%s]' % unit)) + + in_bounds_dates = ('1677-09-23', '2262-04-11', ) + + for date_string in 
in_bounds_dates: + for unit in time_units: + Timestamp(np.datetime64(date_string, dtype='M8[%s]' % unit)) + + def test_tz(self): + t = '2014-02-01 09:00' + ts = Timestamp(t) + local = ts.tz_localize('Asia/Tokyo') + self.assertEqual(local.hour, 9) + self.assertEqual(local, Timestamp(t, tz='Asia/Tokyo')) + conv = local.tz_convert('US/Eastern') + self.assertEqual(conv, Timestamp('2014-01-31 19:00', tz='US/Eastern')) + self.assertEqual(conv.hour, 19) + + # preserves nanosecond + ts = Timestamp(t) + offsets.Nano(5) + local = ts.tz_localize('Asia/Tokyo') + self.assertEqual(local.hour, 9) + self.assertEqual(local.nanosecond, 5) + conv = local.tz_convert('US/Eastern') + self.assertEqual(conv.nanosecond, 5) + self.assertEqual(conv.hour, 19) + + def test_tz_localize_ambiguous(self): + + ts = Timestamp('2014-11-02 01:00') + ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) + ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False) + + rng = date_range('2014-11-02', periods=3, freq='H', tz='US/Eastern') + self.assertEqual(rng[1], ts_dst) + self.assertEqual(rng[2], ts_no_dst) + self.assertRaises(ValueError, ts.tz_localize, 'US/Eastern', + ambiguous='infer') + + # GH 8025 + with tm.assertRaisesRegexp(TypeError, + 'Cannot localize tz-aware Timestamp, use ' + 'tz_convert for conversions'): + Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo') + + with tm.assertRaisesRegexp(TypeError, + 'Cannot convert tz-naive Timestamp, use ' + 'tz_localize to localize'): + Timestamp('2011-01-01').tz_convert('Asia/Tokyo') + + def test_tz_localize_nonexistent(self): + # See issue 13057 + from pytz.exceptions import NonExistentTimeError + times = ['2015-03-08 02:00', '2015-03-08 02:30', + '2015-03-29 02:00', '2015-03-29 02:30'] + timezones = ['US/Eastern', 'US/Pacific', + 'Europe/Paris', 'Europe/Belgrade'] + for t, tz in zip(times, timezones): + ts = Timestamp(t) + self.assertRaises(NonExistentTimeError, ts.tz_localize, + tz) + self.assertRaises(NonExistentTimeError, ts.tz_localize, + tz, errors='raise') + self.assertIs(ts.tz_localize(tz, errors='coerce'), + pd.NaT) + + def test_tz_localize_errors_ambiguous(self): + # See issue 13057 + from pytz.exceptions import AmbiguousTimeError + ts = pd.Timestamp('2015-11-1 01:00') + self.assertRaises(AmbiguousTimeError, + ts.tz_localize, 'US/Pacific', errors='coerce') + + def test_tz_localize_roundtrip(self): + for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: + for t in ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']: + ts = Timestamp(t) + localized = ts.tz_localize(tz) + self.assertEqual(localized, Timestamp(t, tz=tz)) + + with tm.assertRaises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + self.assertEqual(reset, ts) + self.assertTrue(reset.tzinfo is None) + + def test_tz_convert_roundtrip(self): + for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: + for t in ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']: + ts = Timestamp(t, tz='UTC') + converted = ts.tz_convert(tz) + + reset = converted.tz_convert(None) + self.assertEqual(reset, Timestamp(t)) + self.assertTrue(reset.tzinfo is None) + self.assertEqual(reset, + converted.tz_convert('UTC').tz_localize(None)) + + def test_barely_oob_dts(self): + one_us = np.timedelta64(1).astype('timedelta64[us]') + + # By definition we can't go out of bounds in [ns], so we + # convert the datetime64s to [us] so we can go out of bounds + min_ts_us = 
np.datetime64(Timestamp.min).astype('M8[us]') + max_ts_us = np.datetime64(Timestamp.max).astype('M8[us]') + + # No error for the min/max datetimes + Timestamp(min_ts_us) + Timestamp(max_ts_us) + + # One us less than the minimum is an error + self.assertRaises(ValueError, Timestamp, min_ts_us - one_us) + + # One us more than the maximum is an error + self.assertRaises(ValueError, Timestamp, max_ts_us + one_us) + + def test_utc_z_designator(self): + self.assertEqual(get_timezone( + Timestamp('2014-11-02 01:00Z').tzinfo), 'UTC') + + def test_now(self): + # #9000 + ts_from_string = Timestamp('now') + ts_from_method = Timestamp.now() + ts_datetime = datetime.now() + + ts_from_string_tz = Timestamp('now', tz='US/Eastern') + ts_from_method_tz = Timestamp.now(tz='US/Eastern') + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + self.assertTrue(abs(ts_from_method - ts_from_string) < delta) + self.assertTrue(abs(ts_datetime - ts_from_method) < delta) + self.assertTrue(abs(ts_from_method_tz - ts_from_string_tz) < delta) + self.assertTrue(abs(ts_from_string_tz.tz_localize(None) - + ts_from_method_tz.tz_localize(None)) < delta) + + def test_today(self): + + ts_from_string = Timestamp('today') + ts_from_method = Timestamp.today() + ts_datetime = datetime.today() + + ts_from_string_tz = Timestamp('today', tz='US/Eastern') + ts_from_method_tz = Timestamp.today(tz='US/Eastern') + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + self.assertTrue(abs(ts_from_method - ts_from_string) < delta) + self.assertTrue(abs(ts_datetime - ts_from_method) < delta) + self.assertTrue(abs(ts_from_method_tz - ts_from_string_tz) < delta) + self.assertTrue(abs(ts_from_string_tz.tz_localize(None) - + ts_from_method_tz.tz_localize(None)) < delta) + + def test_asm8(self): + np.random.seed(7960929) + ns = [Timestamp.min.value, Timestamp.max.value, 1000, ] + for n in ns: + self.assertEqual(Timestamp(n).asm8.view('i8'), + np.datetime64(n, 'ns').view('i8'), n) + self.assertEqual(Timestamp('nat').asm8.view('i8'), + np.datetime64('nat', 'ns').view('i8')) + + def test_fields(self): + def check(value, equal): + # check that we are int/long like + self.assertTrue(isinstance(value, (int, compat.long))) + self.assertEqual(value, equal) + + # GH 10050 + ts = Timestamp('2015-05-10 09:06:03.000100001') + check(ts.year, 2015) + check(ts.month, 5) + check(ts.day, 10) + check(ts.hour, 9) + check(ts.minute, 6) + check(ts.second, 3) + self.assertRaises(AttributeError, lambda: ts.millisecond) + check(ts.microsecond, 100) + check(ts.nanosecond, 1) + check(ts.dayofweek, 6) + check(ts.quarter, 2) + check(ts.dayofyear, 130) + check(ts.week, 19) + check(ts.daysinmonth, 31) + check(ts.days_in_month, 31) + + def test_nat_fields(self): + # GH 10050 + ts = Timestamp('NaT') + self.assertTrue(np.isnan(ts.year)) + self.assertTrue(np.isnan(ts.month)) + self.assertTrue(np.isnan(ts.day)) + self.assertTrue(np.isnan(ts.hour)) + self.assertTrue(np.isnan(ts.minute)) + self.assertTrue(np.isnan(ts.second)) + self.assertTrue(np.isnan(ts.microsecond)) + self.assertTrue(np.isnan(ts.nanosecond)) + self.assertTrue(np.isnan(ts.dayofweek)) + self.assertTrue(np.isnan(ts.quarter)) + self.assertTrue(np.isnan(ts.dayofyear)) + self.assertTrue(np.isnan(ts.week)) + self.assertTrue(np.isnan(ts.daysinmonth)) + self.assertTrue(np.isnan(ts.days_in_month)) + + def test_pprint(self): + # GH12622 + import pprint + nested_obj = {'foo': 1, + 'bar': [{'w': {'a': 
Timestamp('2011-01-01')}}] * 10} + result = pprint.pformat(nested_obj, width=50) + expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], + 'foo': 1}""" + self.assertEqual(result, expected) + + def test_to_datetime_depr(self): + # see gh-8254 + ts = Timestamp('2011-01-01') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = datetime(2011, 1, 1) + result = ts.to_datetime() + self.assertEqual(result, expected) + + def test_to_pydatetime_nonzero_nano(self): + ts = Timestamp('2011-01-01 9:00:00.123456789') + + # Warn the user of data loss (nanoseconds). + with tm.assert_produces_warning(UserWarning, + check_stacklevel=False): + expected = datetime(2011, 1, 1, 9, 0, 0, 123456) + result = ts.to_pydatetime() + self.assertEqual(result, expected) + + def test_round(self): + + # round + dt = Timestamp('20130101 09:10:11') + result = dt.round('D') + expected = Timestamp('20130101') + self.assertEqual(result, expected) + + dt = Timestamp('20130101 19:10:11') + result = dt.round('D') + expected = Timestamp('20130102') + self.assertEqual(result, expected) + + dt = Timestamp('20130201 12:00:00') + result = dt.round('D') + expected = Timestamp('20130202') + self.assertEqual(result, expected) + + dt = Timestamp('20130104 12:00:00') + result = dt.round('D') + expected = Timestamp('20130105') + self.assertEqual(result, expected) + + dt = Timestamp('20130104 12:32:00') + result = dt.round('30Min') + expected = Timestamp('20130104 12:30:00') + self.assertEqual(result, expected) + + dti = date_range('20130101 09:10:11', periods=5) + result = dti.round('D') + expected = date_range('20130101', periods=5) + tm.assert_index_equal(result, expected) + + # floor + dt = Timestamp('20130101 09:10:11') + result = dt.floor('D') + expected = Timestamp('20130101') + self.assertEqual(result, expected) + + # ceil + dt = Timestamp('20130101 09:10:11') + result = dt.ceil('D') + expected = Timestamp('20130102') + self.assertEqual(result, expected) + + # round with tz + dt = Timestamp('20130101 09:10:11', tz='US/Eastern') + result = dt.round('D') + expected = Timestamp('20130101', tz='US/Eastern') + self.assertEqual(result, expected) + + dt = Timestamp('20130101 09:10:11', tz='US/Eastern') + result = dt.round('s') + self.assertEqual(result, dt) + + dti = date_range('20130101 09:10:11', + periods=5).tz_localize('UTC').tz_convert('US/Eastern') + result = dti.round('D') + expected = date_range('20130101', periods=5).tz_localize('US/Eastern') + tm.assert_index_equal(result, expected) + + result = dti.round('s') + tm.assert_index_equal(result, dti) + + # invalid + for freq in ['Y', 'M', 'foobar']: + self.assertRaises(ValueError, lambda: dti.round(freq)) + + def test_class_ops_pytz(self): + tm._skip_if_no_pytz() + from pytz import timezone + + def compare(x, y): + self.assertEqual(int(Timestamp(x).value / 1e9), + int(Timestamp(y).value / 1e9)) + + compare(Timestamp.now(), datetime.now()) + compare(Timestamp.now('UTC'), datetime.now(timezone('UTC'))) + compare(Timestamp.utcnow(), datetime.utcnow()) + compare(Timestamp.today(), datetime.today()) + 
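+ # compare() truncates to whole seconds, so these pairs only need to agree at 1s resolution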
current_time = calendar.timegm(datetime.now().utctimetuple()) + compare(Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time)) + compare(Timestamp.fromtimestamp(current_time), + datetime.fromtimestamp(current_time)) + + date_component = datetime.utcnow() + time_component = (date_component + timedelta(minutes=10)).time() + compare(Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component)) + + def test_class_ops_dateutil(self): + tm._skip_if_no_dateutil() + from dateutil.tz import tzutc + + def compare(x, y): + self.assertEqual(int(np.round(Timestamp(x).value / 1e9)), + int(np.round(Timestamp(y).value / 1e9))) + + compare(Timestamp.now(), datetime.now()) + compare(Timestamp.now('UTC'), datetime.now(tzutc())) + compare(Timestamp.utcnow(), datetime.utcnow()) + compare(Timestamp.today(), datetime.today()) + current_time = calendar.timegm(datetime.now().utctimetuple()) + compare(Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time)) + compare(Timestamp.fromtimestamp(current_time), + datetime.fromtimestamp(current_time)) + + date_component = datetime.utcnow() + time_component = (date_component + timedelta(minutes=10)).time() + compare(Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component)) + + def test_basics_nanos(self): + val = np.int64(946684800000000000).view('M8[ns]') + stamp = Timestamp(val.view('i8') + 500) + self.assertEqual(stamp.year, 2000) + self.assertEqual(stamp.month, 1) + self.assertEqual(stamp.microsecond, 0) + self.assertEqual(stamp.nanosecond, 500) + + # GH 14415 + val = np.iinfo(np.int64).min + 80000000000000 + stamp = Timestamp(val) + self.assertEqual(stamp.year, 1677) + self.assertEqual(stamp.month, 9) + self.assertEqual(stamp.day, 21) + self.assertEqual(stamp.microsecond, 145224) + self.assertEqual(stamp.nanosecond, 192) + + def test_unit(self): + + def check(val, unit=None, h=1, s=1, us=0): + stamp = Timestamp(val, unit=unit) + self.assertEqual(stamp.year, 2000) + self.assertEqual(stamp.month, 1) + self.assertEqual(stamp.day, 1) + self.assertEqual(stamp.hour, h) + if unit != 'D': + self.assertEqual(stamp.minute, 1) + self.assertEqual(stamp.second, s) + self.assertEqual(stamp.microsecond, us) + else: + self.assertEqual(stamp.minute, 0) + self.assertEqual(stamp.second, 0) + self.assertEqual(stamp.microsecond, 0) + self.assertEqual(stamp.nanosecond, 0) + + ts = Timestamp('20000101 01:01:01') + val = ts.value + days = (ts - Timestamp('1970-01-01')).days + + check(val) + check(val / long(1000), unit='us') + check(val / long(1000000), unit='ms') + check(val / long(1000000000), unit='s') + check(days, unit='D', h=0) + + # using truediv, so these are like floats + if compat.PY3: + check((val + 500000) / long(1000000000), unit='s', us=500) + check((val + 500000000) / long(1000000000), unit='s', us=500000) + check((val + 500000) / long(1000000), unit='ms', us=500) + + # get chopped in py2 + else: + check((val + 500000) / long(1000000000), unit='s') + check((val + 500000000) / long(1000000000), unit='s') + check((val + 500000) / long(1000000), unit='ms') + + # ok + check((val + 500000) / long(1000), unit='us', us=500) + check((val + 500000000) / long(1000000), unit='ms', us=500000) + + # floats + check(val / 1000.0 + 5, unit='us', us=5) + check(val / 1000.0 + 5000, unit='us', us=5000) + check(val / 1000000.0 + 0.5, unit='ms', us=500) + check(val / 1000000.0 + 0.005, unit='ms', us=5) + check(val / 1000000000.0 + 0.5, unit='s', us=500000) 
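+ # fractional day counts resolve intraday: 0.5 of a day surfaces as 12:00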
+ check(days + 0.5, unit='D', h=12) + + # nan + result = Timestamp(np.nan) + self.assertIs(result, NaT) + + result = Timestamp(None) + self.assertIs(result, NaT) + + result = Timestamp(iNaT) + self.assertIs(result, NaT) + + result = Timestamp(NaT) + self.assertIs(result, NaT) + + result = Timestamp('NaT') + self.assertIs(result, NaT) + + self.assertTrue(isnull(Timestamp('nat'))) + + def test_roundtrip(self): + + # test value to string and back conversions + # further test accessors + base = Timestamp('20140101 00:00:00') + + result = Timestamp(base.value + pd.Timedelta('5ms').value) + self.assertEqual(result, Timestamp(str(base) + ".005000")) + self.assertEqual(result.microsecond, 5000) + + result = Timestamp(base.value + pd.Timedelta('5us').value) + self.assertEqual(result, Timestamp(str(base) + ".000005")) + self.assertEqual(result.microsecond, 5) + + result = Timestamp(base.value + pd.Timedelta('5ns').value) + self.assertEqual(result, Timestamp(str(base) + ".000000005")) + self.assertEqual(result.nanosecond, 5) + self.assertEqual(result.microsecond, 0) + + result = Timestamp(base.value + pd.Timedelta('6ms 5us').value) + self.assertEqual(result, Timestamp(str(base) + ".006005")) + self.assertEqual(result.microsecond, 5 + 6 * 1000) + + result = Timestamp(base.value + pd.Timedelta('200ms 5us').value) + self.assertEqual(result, Timestamp(str(base) + ".200005")) + self.assertEqual(result.microsecond, 5 + 200 * 1000) + + def test_comparison(self): + # 5-18-2012 00:00:00.000 + stamp = long(1337299200000000000) + + val = Timestamp(stamp) + + self.assertEqual(val, val) + self.assertFalse(val != val) + self.assertFalse(val < val) + self.assertTrue(val <= val) + self.assertFalse(val > val) + self.assertTrue(val >= val) + + other = datetime(2012, 5, 18) + self.assertEqual(val, other) + self.assertFalse(val != other) + self.assertFalse(val < other) + self.assertTrue(val <= other) + self.assertFalse(val > other) + self.assertTrue(val >= other) + + other = Timestamp(stamp + 100) + + self.assertNotEqual(val, other) + self.assertNotEqual(other, val) + self.assertTrue(val < other) + self.assertTrue(val <= other) + self.assertTrue(other > val) + self.assertTrue(other >= val) + + def test_compare_invalid(self): + + # GH 8058 + val = Timestamp('20130101 12:01:02') + self.assertFalse(val == 'foo') + self.assertFalse(val == 10.0) + self.assertFalse(val == 1) + self.assertFalse(val == long(1)) + self.assertFalse(val == []) + self.assertFalse(val == {'foo': 1}) + self.assertFalse(val == np.float64(1)) + self.assertFalse(val == np.int64(1)) + + self.assertTrue(val != 'foo') + self.assertTrue(val != 10.0) + self.assertTrue(val != 1) + self.assertTrue(val != long(1)) + self.assertTrue(val != []) + self.assertTrue(val != {'foo': 1}) + self.assertTrue(val != np.float64(1)) + self.assertTrue(val != np.int64(1)) + + # ops testing + df = DataFrame(randn(5, 2)) + a = df[0] + b = Series(randn(5)) + b.name = Timestamp('2000-01-01') + tm.assert_series_equal(a / b, 1 / (b / a)) + + def test_cant_compare_tz_naive_w_aware(self): + tm._skip_if_no_pytz() + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz='utc') + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + 
self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + else: + self.assertFalse(a == b.to_pydatetime()) + self.assertFalse(a.to_pydatetime() == b) + + def test_cant_compare_tz_naive_w_aware_explicit_pytz(self): + tm._skip_if_no_pytz() + from pytz import utc + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz=utc) + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + else: + self.assertFalse(a == b.to_pydatetime()) + self.assertFalse(a.to_pydatetime() == b) + + def test_cant_compare_tz_naive_w_aware_dateutil(self): + tm._skip_if_no_dateutil() + from dateutil.tz import tzutc + utc = tzutc() + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz=utc) + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + else: + self.assertFalse(a == b.to_pydatetime()) + self.assertFalse(a.to_pydatetime() == b) + + def test_delta_preserve_nanos(self): + val = Timestamp(long(1337299200000000123)) + result = val + timedelta(1) + self.assertEqual(result.nanosecond, val.nanosecond) + + def test_frequency_misc(self): + self.assertEqual(frequencies.get_freq_group('T'), + frequencies.FreqGroup.FR_MIN) + + code, stride = frequencies.get_freq_code(offsets.Hour()) + self.assertEqual(code, frequencies.FreqGroup.FR_HR) + + code, stride = frequencies.get_freq_code((5, 'T')) + self.assertEqual(code, frequencies.FreqGroup.FR_MIN) + self.assertEqual(stride, 5) + + offset = offsets.Hour() + result = frequencies.to_offset(offset) + self.assertEqual(result, offset) + + result = frequencies.to_offset((5, 'T')) + expected = offsets.Minute(5) + self.assertEqual(result, expected) + + self.assertRaises(ValueError, frequencies.get_freq_code, (5, 'baz')) + + self.assertRaises(ValueError, frequencies.to_offset, '100foo') + + self.assertRaises(ValueError, frequencies.to_offset, ('', '')) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frequencies.get_standard_freq(offsets.Hour()) + self.assertEqual(result, 'H') + + def test_hash_equivalent(self): + d = {datetime(2011, 1, 1): 5} + stamp = Timestamp(datetime(2011, 1, 1)) + self.assertEqual(d[stamp], 5) + + def test_timestamp_compare_scalars(self): + # case where ndim == 0 + lhs = np.datetime64(datetime(2013, 12, 6)) + rhs = Timestamp('now') + nat = Timestamp('nat') + + ops = {'gt': 'lt', + 'lt': 'gt', + 'ge': 'le', + 'le': 'ge', + 'eq': 'eq', + 'ne': 'ne'} + + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + expected = left_f(lhs, rhs) + + result = right_f(rhs, lhs) + self.assertEqual(result, expected) 
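+ # NaT flows through the same reflected-operator path as ordinary scalars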
+ + expected = left_f(rhs, nat) + result = right_f(nat, rhs) + self.assertEqual(result, expected) + + def test_timestamp_compare_series(self): + # make sure we can compare Timestamps on the right AND left hand side + # GH4982 + s = Series(date_range('20010101', periods=10), name='dates') + s_nat = s.copy(deep=True) + + s[0] = pd.Timestamp('nat') + s[3] = pd.Timestamp('nat') + + ops = {'lt': 'gt', 'le': 'ge', 'eq': 'eq', 'ne': 'ne'} + + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + + # no nats + expected = left_f(s, Timestamp('20010109')) + result = right_f(Timestamp('20010109'), s) + tm.assert_series_equal(result, expected) + + # nats + expected = left_f(s, Timestamp('nat')) + result = right_f(Timestamp('nat'), s) + tm.assert_series_equal(result, expected) + + # compare to timestamp with series containing nats + expected = left_f(s_nat, Timestamp('20010109')) + result = right_f(Timestamp('20010109'), s_nat) + tm.assert_series_equal(result, expected) + + # compare to nat with series containing nats + expected = left_f(s_nat, Timestamp('nat')) + result = right_f(Timestamp('nat'), s_nat) + tm.assert_series_equal(result, expected) + + def test_is_leap_year(self): + # GH 13727 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + dt = Timestamp('2000-01-01 00:00:00', tz=tz) + self.assertTrue(dt.is_leap_year) + self.assertIsInstance(dt.is_leap_year, bool) + + dt = Timestamp('1999-01-01 00:00:00', tz=tz) + self.assertFalse(dt.is_leap_year) + + dt = Timestamp('2004-01-01 00:00:00', tz=tz) + self.assertTrue(dt.is_leap_year) + + dt = Timestamp('2100-01-01 00:00:00', tz=tz) + self.assertFalse(dt.is_leap_year) + + self.assertFalse(pd.NaT.is_leap_year) + self.assertIsInstance(pd.NaT.is_leap_year, bool) + + def test_round_nat(self): + # GH14940 + ts = Timestamp('nat') + for method in ["round", "floor", "ceil"]: + round_method = getattr(ts, method) + for freq in ["s", "5s", "min", "5min", "h", "5h"]: + self.assertIs(round_method(freq), ts) + + +class TestTimestampNsOperations(tm.TestCase): + def setUp(self): + self.timestamp = Timestamp(datetime.utcnow()) + + def assert_ns_timedelta(self, modified_timestamp, expected_value): + value = self.timestamp.value + modified_value = modified_timestamp.value + + self.assertEqual(modified_value - value, expected_value) + + def test_timedelta_ns_arithmetic(self): + self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'ns'), + -123) + + def test_timedelta_ns_based_arithmetic(self): + self.assert_ns_timedelta(self.timestamp + np.timedelta64( + 1234567898, 'ns'), 1234567898) + + def test_timedelta_us_arithmetic(self): + self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'us'), + -123000) + + def test_timedelta_ms_arithmetic(self): + time = self.timestamp + np.timedelta64(-123, 'ms') + self.assert_ns_timedelta(time, -123000000) + + def test_nanosecond_string_parsing(self): + ts = Timestamp('2013-05-01 07:15:45.123456789') + # GH 7878 + expected_repr = '2013-05-01 07:15:45.123456789' + expected_value = 1367392545123456789 + self.assertEqual(ts.value, expected_value) + self.assertIn(expected_repr, repr(ts)) + + ts = Timestamp('2013-05-01 07:15:45.123456789+09:00', tz='Asia/Tokyo') + self.assertEqual(ts.value, expected_value - 9 * 3600 * 1000000000) + self.assertIn(expected_repr, repr(ts)) + + ts = Timestamp('2013-05-01 07:15:45.123456789', tz='UTC') + self.assertEqual(ts.value, expected_value) + self.assertIn(expected_repr, repr(ts)) + + ts = Timestamp('2013-05-01 
07:15:45.123456789', tz='US/Eastern') + self.assertEqual(ts.value, expected_value + 4 * 3600 * 1000000000) + self.assertIn(expected_repr, repr(ts)) + + # GH 10041 + ts = Timestamp('20130501T071545.123456789') + self.assertEqual(ts.value, expected_value) + self.assertIn(expected_repr, repr(ts)) + + def test_nanosecond_timestamp(self): + # GH 7610 + expected = 1293840000000000005 + t = Timestamp('2011-01-01') + offsets.Nano(5) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 5) + + t = Timestamp(t) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 5) + + t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000005Z')) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 5) + + expected = 1293840000000000010 + t = t + offsets.Nano(5) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 10) + + t = Timestamp(t) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 10) + + t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000010Z')) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 10) + + def test_nat_arithmetic(self): + # GH 6873 + i = 2 + f = 1.5 + + for (left, right) in [(pd.NaT, i), (pd.NaT, f), (pd.NaT, np.nan)]: + self.assertIs(left / right, pd.NaT) + self.assertIs(left * right, pd.NaT) + self.assertIs(right * left, pd.NaT) + with tm.assertRaises(TypeError): + right / left + + # Timestamp / datetime + t = Timestamp('2014-01-01') + dt = datetime(2014, 1, 1) + for (left, right) in [(pd.NaT, pd.NaT), (pd.NaT, t), (pd.NaT, dt)]: + # NaT __add__ or __sub__ Timestamp-like (or inverse) returns NaT + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(left - right, pd.NaT) + self.assertIs(right - left, pd.NaT) + + # timedelta-like + # offsets are tested in test_offsets.py + + delta = timedelta(3600) + td = Timedelta('5s') + + for (left, right) in [(pd.NaT, delta), (pd.NaT, td)]: + # NaT + timedelta-like returns NaT + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(right - left, pd.NaT) + self.assertIs(left - right, pd.NaT) + + # GH 11718 + tm._skip_if_no_pytz() + import pytz + + t_utc = Timestamp('2014-01-01', tz='UTC') + t_tz = Timestamp('2014-01-01', tz='US/Eastern') + dt_tz = pytz.timezone('Asia/Tokyo').localize(dt) + + for (left, right) in [(pd.NaT, t_utc), (pd.NaT, t_tz), + (pd.NaT, dt_tz)]: + # NaT __add__ or __sub__ Timestamp-like (or inverse) returns NaT + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(left - right, pd.NaT) + self.assertIs(right - left, pd.NaT) + + # int addition / subtraction + for (left, right) in [(pd.NaT, 2), (pd.NaT, 0), (pd.NaT, -3)]: + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(left - right, pd.NaT) + self.assertIs(right - left, pd.NaT) + + def test_nat_arithmetic_index(self): + # GH 11718 + + # datetime + tm._skip_if_no_pytz() + + dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], name='x') + exp = pd.DatetimeIndex([pd.NaT, 
pd.NaT], name='x') + self.assert_index_equal(dti + pd.NaT, exp) + self.assert_index_equal(pd.NaT + dti, exp) + + dti_tz = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], + tz='US/Eastern', name='x') + exp = pd.DatetimeIndex([pd.NaT, pd.NaT], name='x', tz='US/Eastern') + self.assert_index_equal(dti_tz + pd.NaT, exp) + self.assert_index_equal(pd.NaT + dti_tz, exp) + + exp = pd.TimedeltaIndex([pd.NaT, pd.NaT], name='x') + for (left, right) in [(pd.NaT, dti), (pd.NaT, dti_tz)]: + self.assert_index_equal(left - right, exp) + self.assert_index_equal(right - left, exp) + + # timedelta + tdi = pd.TimedeltaIndex(['1 day', '2 day'], name='x') + exp = pd.DatetimeIndex([pd.NaT, pd.NaT], name='x') + for (left, right) in [(pd.NaT, tdi)]: + self.assert_index_equal(left + right, exp) + self.assert_index_equal(right + left, exp) + self.assert_index_equal(left - right, exp) + self.assert_index_equal(right - left, exp) + + +class TestTimestampOps(tm.TestCase): + def test_timestamp_and_datetime(self): + self.assertEqual((Timestamp(datetime( + 2013, 10, 13)) - datetime(2013, 10, 12)).days, 1) + self.assertEqual((datetime(2013, 10, 12) - + Timestamp(datetime(2013, 10, 13))).days, -1) + + def test_timestamp_and_series(self): + timestamp_series = Series(date_range('2014-03-17', periods=2, freq='D', + tz='US/Eastern')) + first_timestamp = timestamp_series[0] + + delta_series = Series([np.timedelta64(0, 'D'), np.timedelta64(1, 'D')]) + assert_series_equal(timestamp_series - first_timestamp, delta_series) + assert_series_equal(first_timestamp - timestamp_series, -delta_series) + + def test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time + # objects + datetime_instance = datetime(2014, 3, 4) + timedelta_instance = timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports + # addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, + freq='D')[0] + + self.assertEqual(type(timestamp_instance + 1), Timestamp) + self.assertEqual(type(timestamp_instance - 1), Timestamp) + + # Timestamp + datetime not supported, though subtraction is supported + # and yields timedelta more tests in tseries/base/tests/test_base.py + self.assertEqual( + type(timestamp_instance - datetime_instance), Timedelta) + self.assertEqual( + type(timestamp_instance + timedelta_instance), Timestamp) + self.assertEqual( + type(timestamp_instance - timedelta_instance), Timestamp) + + # Timestamp +/- datetime64 not supported, so not tested (could possibly + # assert error raised?) 
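+ # timedelta64, by contrast, is supported and keeps the result a Timestamp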
+ timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual( + type(timestamp_instance + timedelta64_instance), Timestamp) + self.assertEqual( + type(timestamp_instance - timedelta64_instance), Timestamp) + + def test_addition_subtraction_preserve_frequency(self): + timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] + timedelta_instance = timedelta(days=1) + original_freq = timestamp_instance.freq + self.assertEqual((timestamp_instance + 1).freq, original_freq) + self.assertEqual((timestamp_instance - 1).freq, original_freq) + self.assertEqual( + (timestamp_instance + timedelta_instance).freq, original_freq) + self.assertEqual( + (timestamp_instance - timedelta_instance).freq, original_freq) + + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual( + (timestamp_instance + timedelta64_instance).freq, original_freq) + self.assertEqual( + (timestamp_instance - timedelta64_instance).freq, original_freq) + + def test_resolution(self): + + for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', + 'S', 'L', 'U'], + [RESO_DAY, RESO_DAY, + RESO_DAY, RESO_DAY, + RESO_HR, RESO_MIN, + RESO_SEC, RESO_MS, + RESO_US]): + for tz in [None, 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Eastern']: + idx = date_range(start='2013-04-01', periods=30, freq=freq, + tz=tz) + result = period.resolution(idx.asi8, idx.tz) + self.assertEqual(result, expected) + + +class TestTimestampToJulianDate(tm.TestCase): + def test_compare_1700(self): + r = Timestamp('1700-06-23').to_julian_date() + self.assertEqual(r, 2342145.5) + + def test_compare_2000(self): + r = Timestamp('2000-04-12').to_julian_date() + self.assertEqual(r, 2451646.5) + + def test_compare_2100(self): + r = Timestamp('2100-08-12').to_julian_date() + self.assertEqual(r, 2488292.5) + + def test_compare_hour01(self): + r = Timestamp('2000-08-12T01:00:00').to_julian_date() + self.assertEqual(r, 2451768.5416666666666666) + + def test_compare_hour13(self): + r = Timestamp('2000-08-12T13:00:00').to_julian_date() + self.assertEqual(r, 2451769.0416666666666666) + + +class TestTimeSeries(tm.TestCase): + _multiprocess_can_split_ = True + + def test_timestamp_to_datetime(self): + tm._skip_if_no_pytz() + rng = date_range('20090415', '20090519', tz='US/Eastern') + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEqual(stamp, dtval) + self.assertEqual(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_to_datetime_dateutil(self): + tm._skip_if_no_dateutil() + rng = date_range('20090415', '20090519', tz='dateutil/US/Eastern') + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEqual(stamp, dtval) + self.assertEqual(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_to_datetime_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + rng = date_range('20090415', '20090519', + tz=pytz.timezone('US/Eastern')) + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEqual(stamp, dtval) + self.assertEqual(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_to_datetime_explicit_dateutil(self): + tm._skip_if_windows_python_3() + tm._skip_if_no_dateutil() + from pandas.tslib import _dateutil_gettz as gettz + rng = date_range('20090415', '20090519', tz=gettz('US/Eastern')) + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEqual(stamp, dtval) + self.assertEqual(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_fields(self): + # extra fields from DatetimeIndex like quarter and week + idx = tm.makeDateIndex(100) + + fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', + 'days_in_month', 
'is_month_start', 'is_month_end', + 'is_quarter_start', 'is_quarter_end', 'is_year_start', + 'is_year_end', 'weekday_name'] + for f in fields: + expected = getattr(idx, f)[-1] + result = getattr(Timestamp(idx[-1]), f) + self.assertEqual(result, expected) + + self.assertEqual(idx.freq, Timestamp(idx[-1], idx.freq).freq) + self.assertEqual(idx.freqstr, Timestamp(idx[-1], idx.freq).freqstr) + + def test_timestamp_date_out_of_range(self): + self.assertRaises(ValueError, Timestamp, '1676-01-01') + self.assertRaises(ValueError, Timestamp, '2263-01-01') + + # 1475 + self.assertRaises(ValueError, DatetimeIndex, ['1400-01-01']) + self.assertRaises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) + + def test_timestamp_repr(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + self.assertIn(iso8601, result) + + def test_timestamp_from_ordinal(self): + + # GH 3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + self.assertEqual(ts.to_pydatetime(), dt) + + # with a tzinfo + stamp = Timestamp('2011-4-16', tz='US/Eastern') + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') + self.assertEqual(ts.to_pydatetime(), dt_tz) + + def test_timestamp_compare_with_early_datetime(self): + # e.g. datetime.min + stamp = Timestamp('2012-01-01') + + self.assertFalse(stamp == datetime.min) + self.assertFalse(stamp == datetime(1600, 1, 1)) + self.assertFalse(stamp == datetime(2700, 1, 1)) + self.assertNotEqual(stamp, datetime.min) + self.assertNotEqual(stamp, datetime(1600, 1, 1)) + self.assertNotEqual(stamp, datetime(2700, 1, 1)) + self.assertTrue(stamp > datetime(1600, 1, 1)) + self.assertTrue(stamp >= datetime(1600, 1, 1)) + self.assertTrue(stamp < datetime(2700, 1, 1)) + self.assertTrue(stamp <= datetime(2700, 1, 1)) + + def test_timestamp_equality(self): + + # GH 11034 + s = Series([Timestamp('2000-01-29 01:59:00'), 'NaT']) + result = s != s + assert_series_equal(result, Series([False, True])) + result = s != s[0] + assert_series_equal(result, Series([False, True])) + result = s != s[1] + assert_series_equal(result, Series([True, True])) + + result = s == s + assert_series_equal(result, Series([True, False])) + result = s == s[0] + assert_series_equal(result, Series([True, False])) + result = s == s[1] + assert_series_equal(result, Series([False, False])) + + def test_series_box_timestamp(self): + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng) + + tm.assertIsInstance(s[5], Timestamp) + + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng, index=rng) + tm.assertIsInstance(s[5], Timestamp) + + tm.assertIsInstance(s.iat[5], Timestamp) + + def test_frame_setitem_timestamp(self): + # 2155 + columns = DatetimeIndex(start='1/1/2012', end='2/1/2012', + freq=offsets.BDay()) + index = lrange(10) + data = DataFrame(columns=columns, index=index) + t = datetime(2012, 11, 1) + ts = Timestamp(t) + data[ts] = np.nan # works + + def test_to_html_timestamp(self): + rng = date_range('2000-01-01', periods=10) + df = DataFrame(np.random.randn(10, 4), index=rng) + + result = df.to_html() + self.assertIn('2000-01-01', result) + + def test_series_map_box_timestamps(self): + # #2689, #2627 + s = Series(date_range('1/1/2000', periods=10)) + + def f(x): + return (x.hour, x.day, x.month) + + # it works! 
+ s.map(f) + s.apply(f) + DataFrame(s).applymap(f) + + def test_dti_slicing(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + dti2 = dti[[1, 3, 5]] + + v1 = dti2[0] + v2 = dti2[1] + v3 = dti2[2] + + self.assertEqual(v1, Timestamp('2/28/2005')) + self.assertEqual(v2, Timestamp('4/30/2005')) + self.assertEqual(v3, Timestamp('6/30/2005')) + + # don't carry freq through irregular slicing + self.assertIsNone(dti2.freq) + + def test_woy_boundary(self): + # make sure weeks at year boundaries are correct + d = datetime(2013, 12, 31) + result = Timestamp(d).week + expected = 1 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2008, 12, 28) + result = Timestamp(d).week + expected = 52 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2009, 12, 31) + result = Timestamp(d).week + expected = 53 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2010, 1, 1) + result = Timestamp(d).week + expected = 53 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2010, 1, 3) + result = Timestamp(d).week + expected = 53 # ISO standard + self.assertEqual(result, expected) + + result = np.array([Timestamp(datetime(*args)).week + for args in [(2000, 1, 1), (2000, 1, 2), ( + 2005, 1, 1), (2005, 1, 2)]]) + self.assertTrue((result == [52, 52, 53, 53]).all()) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 3c82e4ed82969..91da36161e188 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -34,6 +34,11 @@ def _skip_if_no_akima(): raise nose.SkipTest('scipy.interpolate.Akima1DInterpolator missing') +def _simple_ts(start, end, freq='D'): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + class TestSeriesMissingData(TestData, tm.TestCase): _multiprocess_can_split_ = True @@ -530,6 +535,79 @@ def test_fill_value_when_combine_const(self): res = s.add(2, fill_value=0) assert_series_equal(res, exp) + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = s[:2].reindex(index).fillna(method='pad') + expected[-3:] = np.nan + assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method='bfill', limit=5) + + expected = s[-2:].reindex(index).fillna(method='backfill') + expected[:3] = np.nan + assert_series_equal(result, expected) + + def test_sparse_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + ss = s[:2].reindex(index).to_sparse() + result = ss.fillna(method='pad', limit=5) + expected = ss.fillna(method='pad', limit=5) + expected = expected.to_dense() + expected[-3:] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + ss = s[-2:].reindex(index).to_sparse() + result = ss.fillna(method='backfill', limit=5) + expected = ss.fillna(method='backfill') + expected = expected.to_dense() + expected[:3] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + def test_sparse_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + s = s.to_sparse() + + result = s[:2].reindex(index, method='pad', limit=5) + expected = s[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected[-3:] = np.nan + expected = expected.to_sparse() + 
assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method='backfill', limit=5) + expected = s[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected[:3] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index, method='pad', limit=5) + + expected = s[:2].reindex(index).fillna(method='pad') + expected[-3:] = np.nan + assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method='backfill', limit=5) + + expected = s[-2:].reindex(index).fillna(method='backfill') + expected[:3] = np.nan + assert_series_equal(result, expected) + class TestSeriesInterpolateData(TestData, tm.TestCase): @@ -932,6 +1010,31 @@ def test_interp_timedelta64(self): index=pd.to_timedelta([1, 2, 4])) assert_series_equal(result, expected) + def test_series_interpolate_method_values(self): + # #1646 + ts = _simple_ts('1/1/2000', '1/20/2000') + ts[::2] = np.nan + + result = ts.interpolate(method='values') + exp = ts.interpolate() + assert_series_equal(result, exp) + + def test_series_interpolate_intraday(self): + # #1698 + index = pd.date_range('1/1/2012', periods=4, freq='12D') + ts = pd.Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(days=1)).sort_values() + + exp = ts.reindex(new_index).interpolate(method='time') + + index = pd.date_range('1/1/2012', periods=4, freq='12H') + ts = pd.Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() + result = ts.reindex(new_index).interpolate(method='time') + + self.assert_numpy_array_equal(result.values, exp.values) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index df7ab24430746..073b8bfeee131 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1,22 +1,48 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -from datetime import datetime - +import sys +import nose +import locale +import calendar import numpy as np +from numpy.random import rand +from datetime import datetime, timedelta, time -from pandas import Index, Series, date_range, NaT +import pandas as pd +import pandas.index as _index +import pandas.tseries.tools as tools +import pandas.core.common as com +import pandas.util.testing as tm +from pandas.tslib import iNaT +from pandas.compat import lrange, lmap, StringIO, product +from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.index import DatetimeIndex from pandas.tseries.offsets import BDay, BMonthEnd -from pandas.tseries.tdi import TimedeltaIndex - -from pandas.util.testing import assert_series_equal, assert_almost_equal -import pandas.util.testing as tm +from pandas.types.common import is_datetime64_ns_dtype +from pandas import (Index, Series, date_range, NaT, concat, DataFrame, + Timestamp, lib, isnull, to_datetime, offsets, Timedelta, + tslib, bdate_range, Period, timedelta_range, compat) +from pandas.util.testing import (assert_series_equal, assert_almost_equal, + slow, assert_frame_equal, _skip_if_has_locale) from pandas.tests.series.common import TestData +randn = np.random.randn + + +def _simple_ts(start, end, freq='D'): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) 
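+# shared module-level helper: a random-valued Series over a fixed-frequency
+# date_range, used by several tests below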
-class TestSeriesTimeSeries(TestData, tm.TestCase): + +def assert_range_equal(left, right): + assert (left.equals(right)) + assert (left.freq == right.freq) + assert (left.tz == right.tz) + + +class TestTimeSeries(TestData, tm.TestCase): _multiprocess_can_split_ = True def test_shift(self): @@ -204,86 +230,6 @@ def test_truncate(self): before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset) - def test_getitem_setitem_datetimeindex(self): - from pandas import date_range - - N = 50 - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(N), index=rng) - - result = ts["1990-01-01 04:00:00"] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts.copy() - result["1990-01-01 04:00:00"] = 0 - result["1990-01-01 04:00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 - result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] - assert_series_equal(result, ts) - - lb = "1990-01-01 04:00:00" - rb = "1990-01-01 07:00:00" - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - # repeat all the above with naive datetimes - result = ts[datetime(1990, 1, 1, 4)] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts.copy() - result[datetime(1990, 1, 1, 4)] = 0 - result[datetime(1990, 1, 1, 4)] = ts[4] - assert_series_equal(result, ts) - - result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] - assert_series_equal(result, ts) - - lb = datetime(1990, 1, 1, 4) - rb = datetime(1990, 1, 1, 7) - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts[ts.index[4]] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts[ts.index[4:8]] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] - assert_series_equal(result, ts) - - # also test partial date slicing - result = ts["1990-01-02"] - expected = ts[24:48] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-02"] = 0 - result["1990-01-02"] = ts[24:48] - assert_series_equal(result, ts) - def test_getitem_setitem_datetime_tz_pytz(self): tm._skip_if_no_pytz() from pytz import timezone as tz @@ -567,8 +513,3031 @@ def test_empty_series_ops(self): assert_series_equal(a, b + a) self.assertRaises(TypeError, lambda x, y: x - y, b, a) + def test_is_(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + self.assertTrue(dti.is_(dti)) + self.assertTrue(dti.is_(dti.view())) + self.assertFalse(dti.is_(dti.copy())) + + def test_contiguous_boolean_preserve_freq(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + mask = np.zeros(len(rng), dtype=bool) + mask[10:20] = True + + masked = rng[mask] + expected = rng[10:20] + self.assertIsNotNone(expected.freq) + assert_range_equal(masked, expected) + + mask[22] = True + masked = rng[mask] + self.assertIsNone(masked.freq) + + def test_getitem_median_slice_bug(self): + index = date_range('20090415', '20090519', 
freq='2B') + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + result = s[indexer] + expected = s[indexer[0]] + assert_series_equal(result, expected) + + def test_ctor_str_intraday(self): + rng = DatetimeIndex(['1-1-2000 00:00:01']) + self.assertEqual(rng[0].second, 1) + + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index, method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_sparse_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index, method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index, method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + def test_sparse_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + def test_pad_require_monotonicity(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + # neither monotonic increasing or decreasing + rng2 = rng[[1, 0, 2]] + + self.assertRaises(ValueError, rng2.get_indexer, rng, method='pad') + + def test_frame_ctor_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + dates = np.asarray(rng) + + df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) + self.assertTrue(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))) + + def test_frame_add_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + df = DataFrame(index=np.arange(len(rng))) + + df['A'] = rng + self.assertTrue(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) + 
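+ # smoke test: repr() on a frame of pre-1900 datetimes must not raise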
+ def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({'year': date_range('1/1/1700', periods=50, + freq='A-DEC')}) + # it works! + repr(df) + + def test_frame_add_datetime64_col_other_units(self): + n = 100 + + units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] + + ns_dtype = np.dtype('M8[ns]') + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + ex_vals = to_datetime(vals.astype('O')).values + + self.assertEqual(df[unit].dtype, ns_dtype) + self.assertTrue((df[unit].values == ex_vals).all()) + + # Test insertion into existing datetime64 column + df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + tmp = df.copy() + + tmp['dates'] = vals + ex_vals = to_datetime(vals.astype('O')).values + + self.assertTrue((tmp['dates'].values == ex_vals).all()) + + def test_to_datetime_unit(self): + + epoch = 1370745748 + s = Series([epoch + t for t in range(20)]) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in range(20)]) + assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)]).astype(float) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in range(20)]) + assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)] + [iNaT]) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in range(20)] + [NaT]) + assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in range(20)] + [NaT]) + assert_series_equal(result, expected) + + # GH13834 + s = Series([epoch + t for t in np.arange(0, 2, .25)] + + [iNaT]).astype(float) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in np.arange(0, 2, .25)] + [NaT]) + assert_series_equal(result, expected) + + s = concat([Series([epoch + t for t in range(20)] + ).astype(float), Series([np.nan])], + ignore_index=True) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in range(20)] + [NaT]) + assert_series_equal(result, expected) + + result = to_datetime([1, 2, 'NaT', pd.NaT, np.nan], unit='D') + expected = DatetimeIndex([Timestamp('1970-01-02'), + Timestamp('1970-01-03')] + ['NaT'] * 3) + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + to_datetime([1, 2, 'foo'], unit='D') + with self.assertRaises(ValueError): + to_datetime([1, 2, 111111111], unit='D') + + # coerce we can process + expected = DatetimeIndex([Timestamp('1970-01-02'), + Timestamp('1970-01-03')] + ['NaT'] * 1) + result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce') + tm.assert_index_equal(result, expected) + + result = to_datetime([1, 2, 111111111], unit='D', errors='coerce') + tm.assert_index_equal(result, expected) + + def test_series_ctor_datetime64(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + dates = np.asarray(rng) + + series = 
Series(dates) + self.assertTrue(np.issubdtype(series.dtype, np.dtype('M8[ns]'))) + + def test_index_cast_datetime64_other_units(self): + arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') + + idx = Index(arr) + + self.assertTrue((idx.values == tslib.cast_to_nanoseconds(arr)).all()) + + def test_reindex_series_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + series = Series(rng) + + result = series.reindex(lrange(15)) + self.assertTrue(np.issubdtype(result.dtype, np.dtype('M8[ns]'))) + + mask = result.isnull() + self.assertTrue(mask[-5:].all()) + self.assertFalse(mask[:-5].any()) + + def test_reindex_frame_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) + + result = df.reindex(lrange(15)) + self.assertTrue(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))) + + mask = com.isnull(result)['B'] + self.assertTrue(mask[-5:].all()) + self.assertFalse(mask[:-5].any()) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + + result = repr(series) + expected = ('0 1970-01-01 00:00:00.000000\n' + '1 1970-01-01 00:00:00.000001\n' + '2 1970-01-01 00:00:00.000002\n' + '3 NaT\n' + 'dtype: datetime64[ns]') + self.assertEqual(result, expected) + + def test_fillna_nat(self): + series = Series([0, 1, 2, iNaT], dtype='M8[ns]') + + filled = series.fillna(method='pad') + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.values[3] = expected.values[2] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='pad') + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + series = Series([iNaT, 0, 1, 2], dtype='M8[ns]') + + filled = series.fillna(method='bfill') + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='bfill') + filled2 = df.fillna(value=series[1]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + def test_string_na_nat_conversion(self): + # GH #999, #858 + + from pandas.compat import parse_date + + strings = np.array(['1/1/2000', '1/2/2000', np.nan, + '1/4/2000, 12:34:56'], dtype=object) + + expected = np.empty(4, dtype='M8[ns]') + for i, val in enumerate(strings): + if com.isnull(val): + expected[i] = iNaT + else: + expected[i] = parse_date(val) + + result = tslib.array_to_datetime(strings) + assert_almost_equal(result, expected) + + result2 = to_datetime(strings) + tm.assertIsInstance(result2, DatetimeIndex) + tm.assert_numpy_array_equal(result, result2.values) + + malformed = np.array(['1/100/2000', np.nan], dtype=object) + + # GH 10636, default is now 'raise' + self.assertRaises(ValueError, + lambda: to_datetime(malformed, errors='raise')) + + result = to_datetime(malformed, errors='ignore') + tm.assert_numpy_array_equal(result, malformed) + + self.assertRaises(ValueError, to_datetime, malformed, errors='raise') + + idx = ['a', 'b', 'c', 'd', 'e'] + series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, + '1/5/2000'], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000'), np.nan, + to_datetime('1/3/2000'), np.nan, + 
to_datetime('1/5/2000')], index=idx, name='foo') + + result = to_datetime(series) + dresult = to_datetime(dseries) + + expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) + for i in range(5): + x = series[i] + if isnull(x): + expected[i] = iNaT + else: + expected[i] = to_datetime(x) + + assert_series_equal(result, expected, check_names=False) + self.assertEqual(result.name, 'foo') + + assert_series_equal(dresult, expected, check_names=False) + self.assertEqual(dresult.name, 'foo') + + def test_nat_vector_field_access(self): + idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000']) + + fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', + 'days_in_month', 'is_leap_year'] + + for field in fields: + result = getattr(idx, field) + expected = [getattr(x, field) for x in idx] + self.assert_numpy_array_equal(result, np.array(expected)) + + s = pd.Series(idx) + + for field in fields: + result = getattr(s.dt, field) + expected = [getattr(x, field) for x in idx] + self.assert_series_equal(result, pd.Series(expected)) + + def test_nat_scalar_field_access(self): + fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', + 'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name'] + for field in fields: + result = getattr(NaT, field) + self.assertTrue(np.isnan(result)) + + def test_NaT_methods(self): + # GH 9513 + raise_methods = ['astimezone', 'combine', 'ctime', 'dst', + 'fromordinal', 'fromtimestamp', 'isocalendar', + 'strftime', 'strptime', 'time', 'timestamp', + 'timetuple', 'timetz', 'toordinal', 'tzname', + 'utcfromtimestamp', 'utcnow', 'utcoffset', + 'utctimetuple'] + nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today'] + nan_methods = ['weekday', 'isoweekday'] + + for method in raise_methods: + if hasattr(NaT, method): + self.assertRaises(ValueError, getattr(NaT, method)) + + for method in nan_methods: + if hasattr(NaT, method): + self.assertTrue(np.isnan(getattr(NaT, method)())) + + for method in nat_methods: + if hasattr(NaT, method): + # see gh-8254 + exp_warning = None + if method == 'to_datetime': + exp_warning = FutureWarning + with tm.assert_produces_warning( + exp_warning, check_stacklevel=False): + self.assertIs(getattr(NaT, method)(), NaT) + + # GH 12300 + self.assertEqual(NaT.isoformat(), 'NaT') + + def test_index_convert_to_datetime_array(self): + tm._skip_if_no_pytz() + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assertIsInstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assertIsInstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') + rng_utc = date_range('20090415', '20090519', tz='utc') + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assertIsInstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assertIsInstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', + tz=pytz.timezone('US/Eastern')) + rng_utc = date_range('20090415', '20090519', tz=pytz.utc) + + 
_check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assertIsInstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assertIsInstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', + tz='dateutil/US/Eastern') + rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_reasonable_keyerror(self): + # GH #1062 + index = DatetimeIndex(['1/3/2000']) + try: + index.get_loc('1/1/2000') + except KeyError as e: + self.assertIn('2000', str(e)) + + def test_reindex_with_datetimes(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = ts[5:10] + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, expected) + + def test_asfreq_keep_index_name(self): + # GH #9854 + index_name = 'bar' + index = pd.date_range('20130101', periods=20, name=index_name) + df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index) + + self.assertEqual(index_name, df.index.name) + self.assertEqual(index_name, df.asfreq('10D').index.name) + + def test_promote_datetime_date(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + ts_slice = ts[5:] + ts2 = ts_slice.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts + ts2 + result2 = ts2 + ts + expected = ts + ts[5:] + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # test asfreq + result = ts2.asfreq('4H', method='ffill') + expected = ts[5:].asfreq('4H', method='ffill') + assert_series_equal(result, expected) + + result = rng.get_indexer(ts2.index) + expected = rng.get_indexer(ts_slice.index) + self.assert_numpy_array_equal(result, expected) + + def test_asfreq_normalize(self): + rng = date_range('1/1/2000 09:30', periods=20) + norm = date_range('1/1/2000', periods=20) + vals = np.random.randn(20) + ts = Series(vals, index=rng) + + result = ts.asfreq('D', normalize=True) + norm = date_range('1/1/2000', periods=20) + expected = Series(vals, index=norm) + + assert_series_equal(result, expected) + + vals = np.random.randn(20, 3) + ts = DataFrame(vals, index=rng) + + result = ts.asfreq('D', normalize=True) + expected = DataFrame(vals, index=norm) + + assert_frame_equal(result, expected) + + def test_first_subset(self): + ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') + result = ts.first('10d') + self.assertEqual(len(result), 20) + + ts = _simple_ts('1/1/2000', '1/1/2010') + result = ts.first('10d') + self.assertEqual(len(result), 10) + + result = ts.first('3M') + expected = ts[:'3/31/2000'] + assert_series_equal(result, expected) + + result = ts.first('21D') + expected = ts[:21] + assert_series_equal(result, expected) + + result = ts[:0].first('3M') + assert_series_equal(result, ts[:0]) + + def test_last_subset(self): + ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') + result = ts.last('10d') + self.assertEqual(len(result), 20) + + ts = _simple_ts('1/1/2000', '1/1/2010') + result = ts.last('10d') + self.assertEqual(len(result), 10) + + result = ts.last('21D') + expected = 
ts['12/12/2009':] + assert_series_equal(result, expected) + + result = ts.last('21D') + expected = ts[-21:] + assert_series_equal(result, expected) + + result = ts[:0].last('3M') + assert_series_equal(result, ts[:0]) + + def test_format_pre_1900_dates(self): + rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC') + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_at_time(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + rs = ts.at_time(rng[1]) + self.assertTrue((rs.index.hour == rng[1].hour).all()) + self.assertTrue((rs.index.minute == rng[1].minute).all()) + self.assertTrue((rs.index.second == rng[1].second).all()) + + result = ts.at_time('9:30') + expected = ts.at_time(time(9, 30)) + assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts[time(9, 30)] + result_df = df.loc[time(9, 30)] + expected = ts[(rng.hour == 9) & (rng.minute == 30)] + exp_df = df[(rng.hour == 9) & (rng.minute == 30)] + + # expected.index = date_range('1/1/2000', '1/4/2000') + + assert_series_equal(result, expected) + tm.assert_frame_equal(result_df, exp_df) + + chunk = df.loc['1/4/2000':] + result = chunk.loc[time(9, 30)] + expected = result_df[-1:] + tm.assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range('1/1/2000', '1/31/2000') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.at_time(time(0, 0)) + assert_series_equal(result, ts) + + # time doesn't exist + rng = date_range('1/1/2012', freq='23Min', periods=384) + ts = Series(np.random.randn(len(rng)), rng) + rs = ts.at_time('16:00') + self.assertEqual(len(rs), 0) + + def test_at_time_frame(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + rs = ts.at_time(rng[1]) + self.assertTrue((rs.index.hour == rng[1].hour).all()) + self.assertTrue((rs.index.minute == rng[1].minute).all()) + self.assertTrue((rs.index.second == rng[1].second).all()) + + result = ts.at_time('9:30') + expected = ts.at_time(time(9, 30)) + assert_frame_equal(result, expected) + + result = ts.loc[time(9, 30)] + expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)] + + assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range('1/1/2000', '1/31/2000') + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts.at_time(time(0, 0)) + assert_frame_equal(result, ts) + + # time doesn't exist + rng = date_range('1/1/2012', freq='23Min', periods=384) + ts = DataFrame(np.random.randn(len(rng), 2), rng) + rs = ts.at_time('16:00') + self.assertEqual(len(rs), 0) + + def test_between_time(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue(t >= stime) + else: + self.assertTrue(t > stime) + + if inc_end: + self.assertTrue(t <= etime) + else: + self.assertTrue(t < etime) + + result = ts.between_time('00:00', '01:00') + expected = ts.between_time(stime, etime) + assert_series_equal(result, expected) + + # across midnight + rng = 
date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue((t >= stime) or (t <= etime)) + else: + self.assertTrue((t > stime) or (t <= etime)) + + if inc_end: + self.assertTrue((t <= etime) or (t >= stime)) + else: + self.assertTrue((t < etime) or (t >= stime)) + + def test_between_time_frame(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue(t >= stime) + else: + self.assertTrue(t > stime) + + if inc_end: + self.assertTrue(t <= etime) + else: + self.assertTrue(t < etime) + + result = ts.between_time('00:00', '01:00') + expected = ts.between_time(stime, etime) + assert_frame_equal(result, expected) + + # across midnight + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue((t >= stime) or (t <= etime)) + else: + self.assertTrue((t > stime) or (t <= etime)) + + if inc_end: + self.assertTrue((t <= etime) or (t >= stime)) + else: + self.assertTrue((t < etime) or (t >= stime)) + + def test_between_time_types(self): + # GH11818 + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + self.assertRaises(ValueError, rng.indexer_between_time, + datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + frame = DataFrame({'A': 0}, index=rng) + self.assertRaises(ValueError, frame.between_time, + datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + series = Series(0, index=rng) + self.assertRaises(ValueError, series.between_time, + datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + def test_between_time_formats(self): + # GH11818 + _skip_if_has_locale() + + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + + strings = [("2:00", "2:30"), ("0200", "0230"), ("2:00am", "2:30am"), + ("0200am", "0230am"), ("2:00:00", "2:30:00"), + ("020000", "023000"), ("2:00:00am", "2:30:00am"), + ("020000am", "023000am")] + expected_length = 28 + + for time_string in strings: + self.assertEqual(len(ts.between_time(*time_string)), + expected_length, + "%s - %s" % time_string) + + def test_to_period(self): + from pandas.tseries.period import period_range + + ts = _simple_ts('1/1/2000', '1/1/2001') + + pts = ts.to_period() + exp = ts.copy() + exp.index 
= period_range('1/1/2000', '1/1/2001') + assert_series_equal(pts, exp) + + pts = ts.to_period('M') + exp.index = exp.index.asfreq('M') + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + assert_series_equal(pts, exp) + + # GH 7606 without freq + idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04']) + exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04'], freq='D') + + s = Series(np.random.randn(4), index=idx) + expected = s.copy() + expected.index = exp_idx + assert_series_equal(s.to_period(), expected) + + df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) + expected = df.copy() + expected.index = exp_idx + assert_frame_equal(df.to_period(), expected) + + expected = df.copy() + expected.columns = exp_idx + assert_frame_equal(df.to_period(axis=1), expected) + + def create_dt64_based_index(self): + data = [Timestamp('2007-01-01 10:11:12.123456Z'), + Timestamp('2007-01-01 10:11:13.789123Z')] + index = DatetimeIndex(data) + return index + + def test_to_period_millisecond(self): + index = self.create_dt64_based_index() + + period = index.to_period(freq='L') + self.assertEqual(2, len(period)) + self.assertEqual(period[0], Period('2007-01-01 10:11:12.123Z', 'L')) + self.assertEqual(period[1], Period('2007-01-01 10:11:13.789Z', 'L')) + + def test_to_period_microsecond(self): + index = self.create_dt64_based_index() + + period = index.to_period(freq='U') + self.assertEqual(2, len(period)) + self.assertEqual(period[0], Period('2007-01-01 10:11:12.123456Z', 'U')) + self.assertEqual(period[1], Period('2007-01-01 10:11:13.789123Z', 'U')) + + def test_to_period_tz_pytz(self): + tm._skip_if_no_pytz() + from dateutil.tz import tzlocal + from pytz import utc as UTC + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz='US/Eastern') + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=UTC) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + tm.assert_index_equal(ts.to_period(), xp) + + def test_to_period_tz_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + from dateutil.tz import tzlocal + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz=pytz.timezone('US/Eastern')) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + def test_to_period_tz_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + from dateutil.tz import tzlocal + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz='dateutil/US/Eastern') + + result = ts.to_period()[0] + expected = ts[0].to_period() + + 
self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + def test_frame_to_period(self): + K = 5 + from pandas.tseries.period import period_range + + dr = date_range('1/1/2000', '1/1/2001') + pr = period_range('1/1/2000', '1/1/2001') + df = DataFrame(randn(len(dr), K), index=dr) + df['mix'] = 'a' + + pts = df.to_period() + exp = df.copy() + exp.index = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M') + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + + df = df.T + pts = df.to_period(axis=1) + exp = df.copy() + exp.columns = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M', axis=1) + tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) + + self.assertRaises(ValueError, df.to_period, axis=2) + + def test_compat_replace(self): + # https://github.com/statsmodels/statsmodels/issues/3349 + # replace should take ints/longs for compat + + for f in [compat.long, int]: + result = date_range(Timestamp('1960-04-01 00:00:00', + freq='QS-JAN'), + periods=f(76), + freq='QS-JAN') + self.assertEqual(len(result), 76) + + def test_astype_object(self): + # NumPy 1.6.1 weak ns support + rng = date_range('1/1/2000', periods=20) + + casted = rng.astype('O') + exp_values = list(rng) + + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + self.assertEqual(casted.tolist(), exp_values) + + def test_catch_infinite_loop(self): + offset = offsets.DateOffset(minute=5) + # blow up, don't loop forever + self.assertRaises(Exception, date_range, datetime(2011, 11, 11), + datetime(2011, 11, 12), freq=offset) + + def test_append_concat(self): + rng = date_range('5/8/2012 1:45', periods=10, freq='5T') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + + result = ts.append(ts) + result_df = df.append(df) + ex_index = DatetimeIndex(np.tile(rng.values, 2)) + tm.assert_index_equal(result.index, ex_index) + tm.assert_index_equal(result_df.index, ex_index) + + appended = rng.append(rng) + tm.assert_index_equal(appended, ex_index) + + appended = rng.append([rng, rng]) + ex_index = DatetimeIndex(np.tile(rng.values, 3)) + tm.assert_index_equal(appended, ex_index) + + # different index names + rng1 = rng.copy() + rng2 = rng.copy() + rng1.name = 'foo' + rng2.name = 'bar' + self.assertEqual(rng1.append(rng1).name, 'foo') + self.assertIsNone(rng1.append(rng2).name) + + def test_append_concat_tz(self): + # GH 2938 + tm._skip_if_no_pytz() + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz='US/Eastern') + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz='US/Eastern') + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz='US/Eastern') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + 
tm.assert_index_equal(appended, rng3) + + def test_append_concat_tz_explicit_pytz(self): + # GH 2938 + tm._skip_if_no_pytz() + from pytz import timezone as timezone + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz=timezone('US/Eastern')) + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz=timezone('US/Eastern')) + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz=timezone('US/Eastern')) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_concat_tz_dateutil(self): + # GH 2938 + tm._skip_if_no_dateutil() + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz='dateutil/US/Eastern') + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz='dateutil/US/Eastern') + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz='dateutil/US/Eastern') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_set_dataframe_column_ns_dtype(self): + x = DataFrame([datetime.now(), datetime.now()]) + self.assertEqual(x[0].dtype, np.dtype('M8[ns]')) + + def test_groupby_count_dateparseerror(self): + dr = date_range(start='1/1/2012', freq='5min', periods=10) + + # BAD Example, datetimes first + s = Series(np.arange(10), index=[dr, lrange(10)]) + grouped = s.groupby(lambda x: x[1] % 2 == 0) + result = grouped.count() + + s = Series(np.arange(10), index=[lrange(10), dr]) + grouped = s.groupby(lambda x: x[0] % 2 == 0) + expected = grouped.count() + + assert_series_equal(result, expected) + + def test_frame_datetime64_handling_groupby(self): + # it works! + df = DataFrame([(3, np.datetime64('2012-07-03')), + (3, np.datetime64('2012-07-04'))], + columns=['a', 'date']) + result = df.groupby('a').first() + self.assertEqual(result['date'][3], Timestamp('2012-07-03')) + + def test_frame_dict_constructor_datetime64_1680(self): + dr = date_range('1/1/2012', periods=10) + s = Series(dr, index=dr) + + # it works! + DataFrame({'a': 'foo', 'b': s}, index=dr) + DataFrame({'a': 'foo', 'b': s.values}, index=dr) + + def test_frame_datetime64_mixed_index_ctor_1681(self): + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + ts = Series(dr) + + # it works! 
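+ # ts carries a default integer index, so aligning it to the W-FRI
+ # DatetimeIndex leaves column 'B' entirely NaN, as asserted below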
+ d = DataFrame({'A': 'foo', 'B': ts}, index=dr) + self.assertTrue(d['B'].isnull().all()) + + def test_frame_timeseries_to_records(self): + index = date_range('1/1/2000', periods=10) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['a', 'b', 'c']) + + result = df.to_records() + result['index'].dtype == 'M8[ns]' + + result = df.to_records(index=False) + + def test_to_csv_numpy_16_bug(self): + frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) + + buf = StringIO() + frame.to_csv(buf) + + result = buf.getvalue() + self.assertIn('2000-01-01', result) + + def test_series_map_box_timedelta(self): + # GH 11349 + s = Series(timedelta_range('1 day 1 s', periods=5, freq='h')) + + def f(x): + return x.total_seconds() + + s.map(f) + s.apply(f) + DataFrame(s).applymap(f) + + def test_concat_datetime_datetime64_frame(self): + # #2624 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 'hi']) + + df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + + ind = date_range(start="2000/1/1", freq="D", periods=10) + df1 = DataFrame({'date': ind, 'test': lrange(10)}) + + # it works! + pd.concat([df1, df2_obj]) + + def test_asfreq_resample_set_correct_freq(self): + # GH5613 + # we test if .asfreq() and .resample() set the correct value for .freq + df = pd.DataFrame({'date': ["2012-01-01", "2012-01-02", "2012-01-03"], + 'col': [1, 2, 3]}) + df = df.set_index(pd.to_datetime(df.date)) + + # testing the settings before calling .asfreq() and .resample() + self.assertEqual(df.index.freq, None) + self.assertEqual(df.index.inferred_freq, 'D') + + # does .asfreq() set .freq correctly? + self.assertEqual(df.asfreq('D').index.freq, 'D') + + # does .resample() set .freq correctly? + self.assertEqual(df.resample('D').asfreq().index.freq, 'D') + + def test_pickle(self): + + # GH4606 + p = self.round_trip_pickle(NaT) + self.assertTrue(p is NaT) + + idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06']) + idx_p = self.round_trip_pickle(idx) + self.assertTrue(idx_p[0] == idx[0]) + self.assertTrue(idx_p[1] is NaT) + self.assertTrue(idx_p[2] == idx[2]) + + # GH11002 + # don't infer freq + idx = date_range('1750-1-1', '2050-1-1', freq='7D') + idx_p = self.round_trip_pickle(idx) + tm.assert_index_equal(idx, idx_p) + + +class TestTimeSeriesDuplicates(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), + datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 3), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 4), + datetime(2000, 1, 4), datetime(2000, 1, 5)] + + self.dups = Series(np.random.randn(len(dates)), index=dates) + + def test_constructor(self): + tm.assertIsInstance(self.dups, Series) + tm.assertIsInstance(self.dups.index, DatetimeIndex) + + def test_is_unique_monotonic(self): + self.assertFalse(self.dups.index.is_unique) + + def test_index_unique(self): + uniques = self.dups.index.unique() + expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 5)]) + self.assertEqual(uniques.dtype, 'M8[ns]') # sanity + tm.assert_index_equal(uniques, expected) + self.assertEqual(self.dups.index.nunique(), 4) + + # #2563 + self.assertTrue(isinstance(uniques, DatetimeIndex)) + + dups_local = self.dups.index.tz_localize('US/Eastern') + dups_local.name = 'foo' + result = dups_local.unique() + expected = DatetimeIndex(expected, name='foo') + expected = expected.tz_localize('US/Eastern') + self.assertTrue(result.tz is 
not None) + self.assertEqual(result.name, 'foo') + tm.assert_index_equal(result, expected) + + # NaT, note this is excluded + arr = [1370745748 + t for t in range(20)] + [iNaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + + arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) + for t in range(20)] + [NaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + + def test_index_dupes_contains(self): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + self.assertTrue(d in ix) + + def test_duplicate_dates_indexing(self): + ts = self.dups + + uniques = ts.index.unique() + for date in uniques: + result = ts[date] + + mask = ts.index == date + total = (ts.index == date).sum() + expected = ts[mask] + if total > 1: + assert_series_equal(result, expected) + else: + assert_almost_equal(result, expected[0]) + + cp = ts.copy() + cp[date] = 0 + expected = Series(np.where(mask, 0, ts), index=ts.index) + assert_series_equal(cp, expected) + + self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) + + # new index + ts[datetime(2000, 1, 6)] = 0 + self.assertEqual(ts[datetime(2000, 1, 6)], 0) + + def test_range_slice(self): + idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', + '1/4/2000']) + + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts['1/2/2000':] + expected = ts[1:] + assert_series_equal(result, expected) + + result = ts['1/2/2000':'1/3/2000'] + expected = ts[1:4] + assert_series_equal(result, expected) + + def test_groupby_average_dup_values(self): + result = self.dups.groupby(level=0).mean() + expected = self.dups.groupby(self.dups.index).mean() + assert_series_equal(result, expected) + + def test_indexing_over_size_cutoff(self): + import datetime + # #1821 + + old_cutoff = _index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + # create large list of non periodic datetime + dates = [] + sec = datetime.timedelta(seconds=1) + half_sec = datetime.timedelta(microseconds=500000) + d = datetime.datetime(2011, 12, 5, 20, 30) + n = 1100 + for i in range(n): + dates.append(d) + dates.append(d + sec) + dates.append(d + sec + half_sec) + dates.append(d + sec + sec + half_sec) + d += 3 * sec + + # duplicate some values in the list + duplicate_positions = np.random.randint(0, len(dates) - 1, 20) + for p in duplicate_positions: + dates[p + 1] = dates[p] + + df = DataFrame(np.random.randn(len(dates), 4), + index=dates, + columns=list('ABCD')) + + pos = n * 3 + timestamp = df.index[pos] + self.assertIn(timestamp, df.index) + + # it works! + df.loc[timestamp] + self.assertTrue(len(df.loc[[timestamp]]) > 0) + finally: + _index._SIZE_CUTOFF = old_cutoff + + def test_indexing_unordered(self): + # GH 2437 + rng = date_range(start='2011-01-01', end='2011-01-15') + ts = Series(randn(len(rng)), index=rng) + ts2 = concat([ts[0:4], ts[-4:], ts[4:-4]]) + + for t in ts.index: + # TODO: unused? 
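+ # str(t) is exercised for smoke coverage only; the value is unused (noqa)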
+ s = str(t) # noqa + + expected = ts[t] + result = ts2[t] + self.assertTrue(expected == result) + + # GH 3448 (ranges) + def compare(slobj): + result = ts2[slobj].copy() + result = result.sort_index() + expected = ts[slobj] + assert_series_equal(result, expected) + + compare(slice('2011-01-01', '2011-01-15')) + compare(slice('2010-12-30', '2011-01-15')) + compare(slice('2011-01-01', '2011-01-16')) + + # partial ranges + compare(slice('2011-01-01', '2011-01-6')) + compare(slice('2011-01-06', '2011-01-8')) + compare(slice('2011-01-06', '2011-01-12')) + + # single values + result = ts2['2011'].sort_index() + expected = ts['2011'] + assert_series_equal(result, expected) + + # diff freq + rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') + ts = Series(np.arange(len(rng)), index=rng) + ts = ts.take(np.random.permutation(20)) + + result = ts['2005'] + for t in result.index: + self.assertTrue(t.year == 2005) + + def test_indexing(self): + + idx = date_range("2001-1-1", periods=20, freq='M') + ts = Series(np.random.rand(len(idx)), index=idx) + + # getting + + # GH 3070, make sure semantics work on Series/Frame + expected = ts['2001'] + expected.name = 'A' + + df = DataFrame(dict(A=ts)) + result = df['2001']['A'] + assert_series_equal(expected, result) + + # setting + ts['2001'] = 1 + expected = ts['2001'] + expected.name = 'A' + + df.loc['2001', 'A'] = 1 + + result = df['2001']['A'] + assert_series_equal(expected, result) + + # GH3546 (not including times on the last day) + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', + freq='H') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected, ts) + + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', + freq='S') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected, ts) + + idx = [Timestamp('2013-05-31 00:00'), + Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013'] + assert_series_equal(expected, ts) + + # GH14826, indexing with a seconds resolution string / datetime object + df = DataFrame(randn(5, 5), + columns=['open', 'high', 'low', 'close', 'volume'], + index=date_range('2012-01-02 18:01:00', + periods=5, tz='US/Central', freq='s')) + expected = df.loc[[df.index[2]]] + + # this is a single date, so will raise + self.assertRaises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) + self.assertRaises(KeyError, df.__getitem__, df.index[2], ) + + +class TestDatetime64(tm.TestCase): + """ + Also test support for datetime64[ns] in Series / DataFrame + """ + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005, 1, 1), + end=datetime(2005, 1, 10), freq='Min') + self.series = Series(rand(len(dti)), dti) + + def test_fancy_getitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + + self.assertEqual(s[48], 48) + self.assertEqual(s['1/2/2009'], 48) + self.assertEqual(s['2009-1-2'], 48) + self.assertEqual(s[datetime(2009, 1, 2)], 48) + self.assertEqual(s[lib.Timestamp(datetime(2009, 1, 2))], 48) + self.assertRaises(KeyError, s.__getitem__, '2009-1-3') + + assert_series_equal(s['3/6/2009':'2009-06-05'], + s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) + + def test_fancy_setitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + s[48] = -1 + 
self.assertEqual(s[48], -1) + s['1/2/2009'] = -2 + self.assertEqual(s[48], -2) + s['1/2/2009':'2009-06-05'] = -3 + self.assertTrue((s[48:54] == -3).all()) + + def test_dti_snap(self): + dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', + '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') + + res = dti.snap(freq='W-MON') + exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') + exp = exp.repeat([3, 4]) + self.assertTrue((res == exp).all()) + + res = dti.snap(freq='B') + + exp = date_range('1/1/2002', '1/7/2002', freq='b') + exp = exp.repeat([1, 1, 1, 2, 2]) + self.assertTrue((res == exp).all()) + + def test_dti_reset_index_round_trip(self): + dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') + d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) + d2 = d1.reset_index() + self.assertEqual(d2.dtypes[0], np.dtype('M8[ns]')) + d3 = d2.set_index('index') + assert_frame_equal(d1, d3, check_names=False) + + # #2329 + stamp = datetime(2012, 11, 22) + df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) + df = df.set_index('Date') + + self.assertEqual(df.index[0], stamp) + self.assertEqual(df.reset_index()['Date'][0], stamp) + + def test_series_set_value(self): + # #1561 + + dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] + index = DatetimeIndex(dates) + + s = Series().set_value(dates[0], 1.) + s2 = s.set_value(dates[1], np.nan) + + exp = Series([1., np.nan], index=index) + + assert_series_equal(s2, exp) + + # s = Series(index[:1], index[:1]) + # s2 = s.set_value(dates[1], index[1]) + # self.assertEqual(s2.values.dtype, 'M8[ns]') + + @slow + def test_slice_locs_indexerror(self): + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) + for i in range(100000)] + s = Series(lrange(100000), times) + s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] + + def test_slicing_datetimes(self): + + # GH 7523 + + # unique + df = DataFrame(np.arange(4., dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) + for i in [1, 2, 3, 4]]) + result = df.loc[datetime(2001, 1, 1, 10):] + assert_frame_equal(result, df) + result = df.loc[:datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11):] + expected = df.iloc[1:] + assert_frame_equal(result, expected) + result = df.loc['20010101 11':] + assert_frame_equal(result, expected) + + # duplicates + df = pd.DataFrame(np.arange(5., dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) + for i in [1, 2, 2, 3, 4]]) + + result = df.loc[datetime(2001, 1, 1, 10):] + assert_frame_equal(result, df) + result = df.loc[:datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11):] + expected = df.iloc[1:] + assert_frame_equal(result, expected) + result = df.loc['20010101 11':] + assert_frame_equal(result, expected) + + def test_frame_datetime64_duplicated(self): + dates = date_range('2010-07-01', end='2010-08-05') + + tst = DataFrame({'symbol': 'AAA', 'date': dates}) + result = tst.duplicated(['date', 'symbol']) + self.assertTrue((-result).all()) + + tst = DataFrame({'date': dates}) + result = tst.duplicated() + self.assertTrue((-result).all()) + + +class TestSeriesDatetime64(tm.TestCase): + def setUp(self): + self.series = Series(date_range('1/1/2000', periods=10)) + + def test_auto_conversion(self): + series = 
Series(list(date_range('1/1/2000', periods=10))) + self.assertEqual(series.dtype, 'M8[ns]') + + def test_constructor_cant_cast_datetime64(self): + msg = "Cannot cast datetime64 to " + with tm.assertRaisesRegexp(TypeError, msg): + Series(date_range('1/1/2000', periods=10), dtype=float) + + with tm.assertRaisesRegexp(TypeError, msg): + Series(date_range('1/1/2000', periods=10), dtype=int) + + def test_constructor_cast_object(self): + s = Series(date_range('1/1/2000', periods=10), dtype=object) + exp = Series(date_range('1/1/2000', periods=10)) + tm.assert_series_equal(s, exp) + + def test_series_comparison_scalars(self): + val = datetime(2000, 1, 4) + result = self.series > val + expected = Series([x > val for x in self.series]) + self.assert_series_equal(result, expected) + + val = self.series[5] + result = self.series > val + expected = Series([x > val for x in self.series]) + self.assert_series_equal(result, expected) + + def test_between(self): + left, right = self.series[[2, 7]] + + result = self.series.between(left, right) + expected = (self.series >= left) & (self.series <= right) + assert_series_equal(result, expected) + + # --------------------------------------------------------------------- + # NaT support + + def test_NaT_scalar(self): + series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + + val = series[3] + self.assertTrue(com.isnull(val)) + + series[2] = val + self.assertTrue(com.isnull(series[2])) + + def test_NaT_cast(self): + # GH10747 + result = Series([np.nan]).astype('M8[ns]') + expected = Series([NaT]) + assert_series_equal(result, expected) + + def test_set_none_nan(self): + self.series[3] = None + self.assertIs(self.series[3], NaT) + + self.series[3:5] = None + self.assertIs(self.series[4], NaT) + + self.series[5] = np.nan + self.assertIs(self.series[5], NaT) + + self.series[5:7] = np.nan + self.assertIs(self.series[6], NaT) + + def test_intercept_astype_object(self): + + # this test no longer makes sense as series is by default already + # M8[ns] + expected = self.series.astype('object') + + df = DataFrame({'a': self.series, + 'b': np.random.randn(len(self.series))}) + exp_dtypes = pd.Series([np.dtype('datetime64[ns]'), + np.dtype('float64')], index=['a', 'b']) + tm.assert_series_equal(df.dtypes, exp_dtypes) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)}) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + def test_nat_operations(self): + # GH 8617 + s = Series([0, pd.NaT], dtype='m8[ns]') + exp = s[0] + self.assertEqual(s.median(), exp) + self.assertEqual(s.min(), exp) + self.assertEqual(s.max(), exp) + + def test_round_nat(self): + # GH14940 + s = Series([pd.NaT]) + expected = Series(pd.NaT) + for method in ["round", "floor", "ceil"]: + round_method = getattr(s.dt, method) + for freq in ["s", "5s", "min", "5min", "h", "5h"]: + assert_series_equal(round_method(freq), expected) + + +class TestDaysInMonth(tm.TestCase): + # tests for issue #10154 + def test_day_not_in_month_coerce(self): + self.assertTrue(isnull(to_datetime('2015-02-29', errors='coerce'))) + self.assertTrue(isnull(to_datetime('2015-02-29', format="%Y-%m-%d", + errors='coerce'))) + self.assertTrue(isnull(to_datetime('2015-02-32', format="%Y-%m-%d", + errors='coerce'))) + self.assertTrue(isnull(to_datetime('2015-04-31', format="%Y-%m-%d", + errors='coerce'))) + + def test_day_not_in_month_raise(self): + self.assertRaises(ValueError, 
to_datetime, '2015-02-29',
+                          errors='raise')
+        self.assertRaises(ValueError, to_datetime, '2015-02-29',
+                          errors='raise', format="%Y-%m-%d")
+        self.assertRaises(ValueError, to_datetime, '2015-02-32',
+                          errors='raise', format="%Y-%m-%d")
+        self.assertRaises(ValueError, to_datetime, '2015-04-31',
+                          errors='raise', format="%Y-%m-%d")
+
+    def test_day_not_in_month_ignore(self):
+        self.assertEqual(to_datetime(
+            '2015-02-29', errors='ignore'), '2015-02-29')
+        self.assertEqual(to_datetime(
+            '2015-02-29', errors='ignore', format="%Y-%m-%d"), '2015-02-29')
+        self.assertEqual(to_datetime(
+            '2015-02-32', errors='ignore', format="%Y-%m-%d"), '2015-02-32')
+        self.assertEqual(to_datetime(
+            '2015-04-31', errors='ignore', format="%Y-%m-%d"), '2015-04-31')
+
+
+class TestGuessDatetimeFormat(tm.TestCase):
+
+    def test_guess_datetime_format_with_parseable_formats(self):
+        tm._skip_if_not_us_locale()
+        dt_string_to_format = (('20111230', '%Y%m%d'),
+                               ('2011-12-30', '%Y-%m-%d'),
+                               ('30-12-2011', '%d-%m-%Y'),
+                               ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'),
+                               ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'),
+                               ('2011-12-30 00:00:00.000000',
+                                '%Y-%m-%d %H:%M:%S.%f'), )
+
+        for dt_string, dt_format in dt_string_to_format:
+            self.assertEqual(
+                tools._guess_datetime_format(dt_string),
+                dt_format
+            )
+
+    def test_guess_datetime_format_with_dayfirst(self):
+        ambiguous_string = '01/01/2011'
+        self.assertEqual(
+            tools._guess_datetime_format(ambiguous_string, dayfirst=True),
+            '%d/%m/%Y'
+        )
+        self.assertEqual(
+            tools._guess_datetime_format(ambiguous_string, dayfirst=False),
+            '%m/%d/%Y'
+        )
+
+    def test_guess_datetime_format_with_locale_specific_formats(self):
+        # The month names will vary depending on the locale, in which
+        # case these won't be parsed properly (dateutil can't parse them)
+        _skip_if_has_locale()
+
+        dt_string_to_format = (('30/Dec/2011', '%d/%b/%Y'),
+                               ('30/December/2011', '%d/%B/%Y'),
+                               ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'), )
+
+        for dt_string, dt_format in dt_string_to_format:
+            self.assertEqual(
+                tools._guess_datetime_format(dt_string),
+                dt_format
+            )
+
+    def test_guess_datetime_format_invalid_inputs(self):
+        # A datetime string must include a year, month and a day for it
+        # to be guessable, in addition to being a string that looks like
+        # a datetime
+        invalid_dts = [
+            '2013',
+            '01/2013',
+            '12:00:00',
+            '1/1/1/1',
+            'this_is_not_a_datetime',
+            '51a',
+            9,
+            datetime(2011, 1, 1),
+        ]
+
+        for invalid_dt in invalid_dts:
+            self.assertTrue(tools._guess_datetime_format(invalid_dt) is None)
+
+    def test_guess_datetime_format_nopadding(self):
+        # GH 11142
+        dt_string_to_format = (('2011-1-1', '%Y-%m-%d'),
+                               ('30-1-2011', '%d-%m-%Y'),
+                               ('1/1/2011', '%m/%d/%Y'),
+                               ('2011-1-1 00:00:00', '%Y-%m-%d %H:%M:%S'),
+                               ('2011-1-1 0:0:0', '%Y-%m-%d %H:%M:%S'),
+                               ('2011-1-3T00:00:0', '%Y-%m-%dT%H:%M:%S'))
+
+        for dt_string, dt_format in dt_string_to_format:
+            self.assertEqual(
+                tools._guess_datetime_format(dt_string),
+                dt_format
+            )
+
+    def test_guess_datetime_format_for_array(self):
+        tm._skip_if_not_us_locale()
+        expected_format = '%Y-%m-%d %H:%M:%S.%f'
+        dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format)
+
+        test_arrays = [
+            np.array([dt_string, dt_string, dt_string], dtype='O'),
+            np.array([np.nan, np.nan, dt_string], dtype='O'),
+            np.array([dt_string, 'random_string'], dtype='O'),
+        ]
+
+        for test_array in test_arrays:
+            self.assertEqual(
+                tools._guess_datetime_format_for_array(test_array),
+                expected_format
+            )
+
+        format_for_string_of_nans = 
tools._guess_datetime_format_for_array( + np.array( + [np.nan, np.nan, np.nan], dtype='O')) + self.assertTrue(format_for_string_of_nans is None) + + +class TestToDatetimeInferFormat(tm.TestCase): + + def test_to_datetime_infer_datetime_format_consistent_format(self): + s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) + + test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', + '%Y-%m-%dT%H:%M:%S.%f'] + + for test_format in test_formats: + s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) + + with_format = pd.to_datetime(s_as_dt_strings, format=test_format) + no_infer = pd.to_datetime(s_as_dt_strings, + infer_datetime_format=False) + yes_infer = pd.to_datetime(s_as_dt_strings, + infer_datetime_format=True) + + # Whether the format is explicitly passed, it is inferred, or + # it is not inferred, the results should all be the same + self.assert_series_equal(with_format, no_infer) + self.assert_series_equal(no_infer, yes_infer) + + def test_to_datetime_infer_datetime_format_inconsistent_format(self): + s = pd.Series(np.array(['01/01/2011 00:00:00', + '01-02-2011 00:00:00', + '2011-01-03T00:00:00'])) + + # When the format is inconsistent, infer_datetime_format should just + # fallback to the default parsing + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) + + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + def test_to_datetime_infer_datetime_format_series_with_nans(self): + s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, + '01/03/2011 00:00:00', np.nan])) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): + s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', + '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) + + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + def test_to_datetime_iso8601_noleading_0s(self): + # GH 11871 + s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) + expected = pd.Series([pd.Timestamp('2014-01-01'), + pd.Timestamp('2014-02-02'), + pd.Timestamp('2015-03-03')]) + tm.assert_series_equal(pd.to_datetime(s), expected) + tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) + + +class TimeConversionFormats(tm.TestCase): + def test_to_datetime_format(self): + values = ['1/1/2000', '1/2/2000', '1/3/2000'] + + results1 = [Timestamp('20000101'), Timestamp('20000201'), + Timestamp('20000301')] + results2 = [Timestamp('20000101'), Timestamp('20000102'), + Timestamp('20000103')] + for vals, expecteds in [(values, (Index(results1), Index(results2))), + (Series(values), + (Series(results1), Series(results2))), + (values[0], (results1[0], results2[0])), + (values[1], (results1[1], results2[1])), + (values[2], (results1[2], results2[2]))]: + + for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): + result = to_datetime(vals, format=fmt) + expected = expecteds[i] + + if isinstance(expected, Series): + assert_series_equal(result, Series(expected)) + elif isinstance(expected, Timestamp): + self.assertEqual(result, expected) + else: + tm.assert_index_equal(result, expected) + + def test_to_datetime_format_YYYYMMDD(self): + s = Series([19801222, 19801222] + [19810105] * 5) + expected = 
Series([Timestamp(x) for x in s.apply(str)])
+
+        result = to_datetime(s, format='%Y%m%d')
+        assert_series_equal(result, expected)
+
+        result = to_datetime(s.apply(str), format='%Y%m%d')
+        assert_series_equal(result, expected)
+
+        # with NaT
+        expected = Series([Timestamp("19801222"), Timestamp("19801222")] +
+                          [Timestamp("19810105")] * 5)
+        expected[2] = np.nan
+        s[2] = np.nan
+
+        result = to_datetime(s, format='%Y%m%d')
+        assert_series_equal(result, expected)
+
+        # string with NaT
+        s = s.apply(str)
+        s[2] = 'nat'
+        result = to_datetime(s, format='%Y%m%d')
+        assert_series_equal(result, expected)
+
+        # coercion
+        # GH 7930
+        s = Series([20121231, 20141231, 99991231])
+        result = pd.to_datetime(s, format='%Y%m%d', errors='ignore')
+        expected = Series([datetime(2012, 12, 31),
+                           datetime(2014, 12, 31), datetime(9999, 12, 31)],
+                          dtype=object)
+        self.assert_series_equal(result, expected)
+
+        result = pd.to_datetime(s, format='%Y%m%d', errors='coerce')
+        expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]')
+        assert_series_equal(result, expected)
+
+    # GH 10178
+    def test_to_datetime_format_integer(self):
+        s = Series([2000, 2001, 2002])
+        expected = Series([Timestamp(x) for x in s.apply(str)])
+
+        result = to_datetime(s, format='%Y')
+        assert_series_equal(result, expected)
+
+        s = Series([200001, 200105, 200206])
+        expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str)
+                           ])
+
+        result = to_datetime(s, format='%Y%m')
+        assert_series_equal(result, expected)
+
+    def test_to_datetime_format_microsecond(self):
+
+        # these are locale dependent
+        lang, _ = locale.getlocale()
+        month_abbr = calendar.month_abbr[4]
+        val = '01-{}-2011 00:00:01.978'.format(month_abbr)
+
+        format = '%d-%b-%Y %H:%M:%S.%f'
+        result = to_datetime(val, format=format)
+        exp = datetime.strptime(val, format)
+        self.assertEqual(result, exp)
+
+    def test_to_datetime_format_time(self):
+        data = [
+            ['01/10/2010 15:20', '%m/%d/%Y %H:%M',
+             Timestamp('2010-01-10 15:20')],
+            ['01/10/2010 05:43', '%m/%d/%Y %I:%M',
+             Timestamp('2010-01-10 05:43')],
+            ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S',
+             Timestamp('2010-01-10 13:56:01')]  # ,
+            # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p',
+            #  Timestamp('2010-01-10 20:14')],
+            # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p',
+            #  Timestamp('2010-01-10 07:40')],
+            # ['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p',
+            #  Timestamp('2010-01-10 09:12:56')]
+        ]
+        for s, format, dt in data:
+            self.assertEqual(to_datetime(s, format=format), dt)
+
+    def test_to_datetime_with_non_exact(self):
+        # GH 10834
+        _skip_if_has_locale()
+
+        # 8904
+        # exact kw
+        if sys.version_info < (2, 7):
+            raise nose.SkipTest('on python version < 2.7')
+
+        s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00',
+                    '19MAY11 00:00:00Z'])
+        result = to_datetime(s, format='%d%b%y', exact=False)
+        expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False),
+                               format='%d%b%y')
+        assert_series_equal(result, expected)
+
+    def test_parse_nanoseconds_with_formula(self):
+
+        # GH8989
+        # truncating the nanoseconds when a format was provided
+        for v in ["2012-01-01 09:00:00.000000001",
+                  "2012-01-01 09:00:00.000001",
+                  "2012-01-01 09:00:00.001",
+                  "2012-01-01 09:00:00.001000",
+                  "2012-01-01 09:00:00.001000000", ]:
+            expected = pd.to_datetime(v)
+            result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
+            self.assertEqual(result, expected)
+
+    def test_to_datetime_format_weeks(self):
+        data = [
+            ['2009324', '%Y%W%w', Timestamp('2009-08-13')],
+            ['2013020', '%Y%U%w', Timestamp('2013-01-13')]
+        ]
+        
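+        # %W numbers weeks with Monday as the first day and %U numbers
+        # weeks with Sunday as the first day, while %w is the weekday
+        # (0 = Sunday); e.g. '2009324' parses as year 2009, week 32,
+        # weekday 4 (a Thursday), which is 2009-08-13.
+        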
for s, format, dt in data: + self.assertEqual(to_datetime(s, format=format), dt) + + +class TestSlicing(tm.TestCase): + def test_slice_year(self): + dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + result = s['2005'] + expected = s[s.index.year == 2005] + assert_series_equal(result, expected) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + result = df.loc['2005'] + expected = df[df.index.year == 2005] + assert_frame_equal(result, expected) + + rng = date_range('1/1/2000', '1/1/2010') + + result = rng.get_loc('2009') + expected = slice(3288, 3653) + self.assertEqual(result, expected) + + def test_slice_quarter(self): + dti = DatetimeIndex(freq='D', start=datetime(2000, 6, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + self.assertEqual(len(s['2001Q1']), 90) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEqual(len(df.loc['1Q01']), 90) + + def test_slice_month(self): + dti = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(dti)), index=dti) + self.assertEqual(len(s['2005-11']), 30) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEqual(len(df.loc['2005-11']), 30) + + assert_series_equal(s['2005-11'], s['11-2005']) + + def test_partial_slice(self): + rng = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-05':'2006-02'] + expected = s['20050501':'20060228'] + assert_series_equal(result, expected) + + result = s['2005-05':] + expected = s['20050501':] + assert_series_equal(result, expected) + + result = s[:'2006-02'] + expected = s[:'20060228'] + assert_series_equal(result, expected) + + result = s['2005-1-1'] + self.assertEqual(result, s.iloc[0]) + + self.assertRaises(Exception, s.__getitem__, '2004-12-31') + + def test_partial_slice_daily(self): + rng = DatetimeIndex(freq='H', start=datetime(2005, 1, 31), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-31'] + assert_series_equal(result, s.iloc[:24]) + + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00') + + def test_partial_slice_hourly(self): + rng = DatetimeIndex(freq='T', start=datetime(2005, 1, 1, 20, 0, 0), + periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-1'] + assert_series_equal(result, s.iloc[:60 * 4]) + + result = s['2005-1-1 20'] + assert_series_equal(result, s.iloc[:60]) + + self.assertEqual(s['2005-1-1 20:00'], s.iloc[0]) + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:15') + + def test_partial_slice_minutely(self): + rng = DatetimeIndex(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), + periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-1 23:59'] + assert_series_equal(result, s.iloc[:60]) + + result = s['2005-1-1'] + assert_series_equal(result, s.iloc[:60]) + + self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.iloc[0]) + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') + + def test_partial_slice_second_precision(self): + rng = DatetimeIndex(start=datetime(2005, 1, 1, 0, 0, 59, + microsecond=999990), + periods=20, freq='US') + s = Series(np.arange(20), rng) + + assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10]) + assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) + + assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) + assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) + + 
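+        # The minute- and second-level strings above are less precise than
+        # the microsecond-resolution index, so they resolve to slices; the
+        # full-precision Timestamp below is an exact key, and a partial
+        # string that matches no entries raises KeyError.
+        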
self.assertEqual(s[Timestamp('2005-1-1 00:00:59.999990')], s.iloc[0]) + self.assertRaisesRegexp(KeyError, '2005-1-1 00:00:00', + lambda: s['2005-1-1 00:00:00']) + + def test_partial_slicing_dataframe(self): + # GH14856 + # Test various combinations of string slicing resolution vs. + # index resolution + # - If string resolution is less precise than index resolution, + # string is considered a slice + # - If string resolution is equal to or more precise than index + # resolution, string is considered an exact match + formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] + resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] + for rnum, resolution in enumerate(resolutions[2:], 2): + # we check only 'day', 'hour', 'minute' and 'second' + unit = Timedelta("1 " + resolution) + middate = datetime(2012, 1, 1, 0, 0, 0) + index = DatetimeIndex([middate - unit, + middate, middate + unit]) + values = [1, 2, 3] + df = DataFrame({'a': values}, index, dtype=np.int64) + self.assertEqual(df.index.resolution, resolution) + + # Timestamp with the same resolution as index + # Should be exact match for Series (return scalar) + # and raise KeyError for Frame + for timestamp, expected in zip(index, values): + ts_string = timestamp.strftime(formats[rnum]) + # make ts_string as precise as index + result = df['a'][ts_string] + self.assertIsInstance(result, np.int64) + self.assertEqual(result, expected) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + # Timestamp with resolution less precise than index + for fmt in formats[:rnum]: + for element, theslice in [[0, slice(None, 1)], + [1, slice(1, None)]]: + ts_string = index[element].strftime(fmt) + + # Series should return slice + result = df['a'][ts_string] + expected = df['a'][theslice] + assert_series_equal(result, expected) + + # Frame should return slice as well + result = df[ts_string] + expected = df[theslice] + assert_frame_equal(result, expected) + + # Timestamp with resolution more precise than index + # Compatible with existing key + # Should return scalar for Series + # and raise KeyError for Frame + for fmt in formats[rnum + 1:]: + ts_string = index[1].strftime(fmt) + result = df['a'][ts_string] + self.assertIsInstance(result, np.int64) + self.assertEqual(result, 2) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + # Not compatible with existing key + # Should raise KeyError + for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: + ts = index[1] + Timedelta("1 " + res) + ts_string = ts.strftime(fmt) + self.assertRaises(KeyError, df['a'].__getitem__, ts_string) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + def test_partial_slicing_with_multiindex(self): + + # GH 4758 + # partial string indexing with a multi-index buggy + df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"], + 'val': [1, 2, 3, 4]}, + index=date_range("2013-06-19 09:30:00", + periods=4, freq='5T')) + df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) + + expected = DataFrame([ + [1] + ], index=Index(['ABC'], name='TICKER'), columns=['val']) + result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] + assert_frame_equal(result, expected) + + expected = df_multi.loc[ + (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] + result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] + assert_series_equal(result, expected) + + # this is a KeyError as we don't do partial string selection on + # multi-levels + def f(): + 
df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] + + self.assertRaises(KeyError, f) + + # GH 4294 + # partial slice on a series mi + s = pd.DataFrame(randn(1000, 1000), index=pd.date_range( + '2000-1-1', periods=1000)).stack() + + s2 = s[:-1].copy() + expected = s2['2000-1-4'] + result = s2[pd.Timestamp('2000-1-4')] + assert_series_equal(result, expected) + + result = s[pd.Timestamp('2000-1-4')] + expected = s['2000-1-4'] + assert_series_equal(result, expected) + + df2 = pd.DataFrame(s) + expected = df2.xs('2000-1-4') + result = df2.loc[pd.Timestamp('2000-1-4')] + assert_frame_equal(result, expected) + + def test_shift(self): + ts = Series(np.random.randn(5), + index=date_range('1/1/2000', periods=5, freq='H')) + + result = ts.shift(1, freq='5T') + exp_index = ts.index.shift(1, freq='5T') + tm.assert_index_equal(result.index, exp_index) + + # GH #1063, multiple of same base + result = ts.shift(1, freq='4H') + exp_index = ts.index + offsets.Hour(4) + tm.assert_index_equal(result.index, exp_index) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.shift, 1) + + def test_setops_preserve_freq(self): + for tz in [None, 'Asia/Tokyo', 'US/Eastern']: + rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz) + + result = rng[:50].union(rng[50:100]) + self.assertEqual(result.name, rng.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].union(rng[30:100]) + self.assertEqual(result.name, rng.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].union(rng[60:100]) + self.assertEqual(result.name, rng.name) + self.assertIsNone(result.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].intersection(rng[25:75]) + self.assertEqual(result.name, rng.name) + self.assertEqual(result.freqstr, 'D') + self.assertEqual(result.tz, rng.tz) + + nofreq = DatetimeIndex(list(rng[25:75]), name='other') + result = rng[:50].union(nofreq) + self.assertIsNone(result.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].intersection(nofreq) + self.assertIsNone(result.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + def test_min_max(self): + rng = date_range('1/1/2000', '12/31/2000') + rng2 = rng.take(np.random.permutation(len(rng))) + + the_min = rng2.min() + the_max = rng2.max() + tm.assertIsInstance(the_min, Timestamp) + tm.assertIsInstance(the_max, Timestamp) + self.assertEqual(the_min, rng[0]) + self.assertEqual(the_max, rng[-1]) + + self.assertEqual(rng.min(), rng[0]) + self.assertEqual(rng.max(), rng[-1]) + + def test_min_max_series(self): + rng = date_range('1/1/2000', periods=10, freq='4h') + lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] + df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) + + result = df.TS.max() + exp = Timestamp(df.TS.iat[-1]) + self.assertTrue(isinstance(result, Timestamp)) + self.assertEqual(result, exp) + + result = df.TS.min() + exp = Timestamp(df.TS.iat[0]) + self.assertTrue(isinstance(result, Timestamp)) + self.assertEqual(result, exp) + + def test_from_M8_structured(self): + dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] + arr = np.array(dates, + dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) + df = DataFrame(arr) + + self.assertEqual(df['Date'][0], dates[0][0]) + self.assertEqual(df['Forecasting'][0], dates[0][1]) + + s = Series(arr['Date']) + 
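+        # Constructing a Series from the structured array's 'M8[us]' field
+        # upcasts the values to datetime64[ns], so scalar access boxes them
+        # as Timestamp.
+        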
self.assertIsInstance(s[0], Timestamp)
+        self.assertEqual(s[0], dates[0][0])
+
+        s = Series.from_array(arr['Date'], Index([0]))
+        self.assertEqual(s[0], dates[0][0])
+
+    def test_get_level_values_box(self):
+        from pandas import MultiIndex
+
+        dates = date_range('1/1/2000', periods=4)
+        levels = [dates, [0, 1]]
+        labels = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]]
+
+        index = MultiIndex(levels=levels, labels=labels)
+
+        self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp))
+
+    def test_frame_apply_dont_convert_datetime64(self):
+        from pandas.tseries.offsets import BDay
+        df = DataFrame({'x1': [datetime(1996, 1, 1)]})
+
+        df = df.applymap(lambda x: x + BDay())
+        df = df.applymap(lambda x: x + BDay())
+
+        self.assertTrue(df.x1.dtype == 'M8[ns]')
+
+    def test_partial_slice_doesnt_require_monotonicity(self):
+        # For historical reasons.
+        s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10))
+
+        nonmonotonic = s[[3, 5, 4]]
+        expected = nonmonotonic.iloc[:0]
+        timestamp = pd.Timestamp('2014-01-10')
+
+        assert_series_equal(nonmonotonic['2014-01-10':], expected)
+        self.assertRaisesRegexp(KeyError,
+                                r"Timestamp\('2014-01-10 00:00:00'\)",
+                                lambda: nonmonotonic[timestamp:])
+
+        assert_series_equal(nonmonotonic.loc['2014-01-10':], expected)
+        self.assertRaisesRegexp(KeyError,
+                                r"Timestamp\('2014-01-10 00:00:00'\)",
+                                lambda: nonmonotonic.loc[timestamp:])
+
+
+class TestToDatetime(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_to_datetime_dt64s(self):
+        in_bound_dts = [
+            np.datetime64('2000-01-01'),
+            np.datetime64('2000-01-02'),
+        ]
+
+        for dt in in_bound_dts:
+            self.assertEqual(pd.to_datetime(dt), Timestamp(dt))
+
+        oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ]
+
+        for dt in oob_dts:
+            self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise')
+            self.assertRaises(ValueError, tslib.Timestamp, dt)
+            self.assertIs(pd.to_datetime(dt, errors='coerce'), NaT)
+
+    def test_to_datetime_array_of_dt64s(self):
+        dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ]
+
+        # Assuming all datetimes are in bounds, to_datetime() returns
+        # an array that is equal to Timestamp() parsing
+        self.assert_numpy_array_equal(
+            pd.to_datetime(dts, box=False),
+            np.array([Timestamp(x).asm8 for x in dts])
+        )
+
+        # A list of datetimes where the last one is out of bounds
+        dts_with_oob = dts + [np.datetime64('9999-01-01')]
+
+        self.assertRaises(ValueError, pd.to_datetime, dts_with_oob,
+                          errors='raise')
+
+        self.assert_numpy_array_equal(
+            pd.to_datetime(dts_with_oob, box=False, errors='coerce'),
+            np.array(
+                [
+                    Timestamp(dts_with_oob[0]).asm8,
+                    Timestamp(dts_with_oob[1]).asm8,
+                    iNaT,
+                ],
+                dtype='M8'
+            )
+        )
+
+        # With errors='ignore', out of bounds datetime64s
+        # are converted to their .item(), which depending on the version of
+        # numpy is either a python datetime.datetime or datetime.date
+        self.assert_numpy_array_equal(
+            pd.to_datetime(dts_with_oob, box=False, errors='ignore'),
+            np.array(
+                [dt.item() for dt in dts_with_oob],
+                dtype='O'
+            )
+        )
+
+    def test_to_datetime_tz(self):
+
+        # xref 8260
+        # uniform returns a DatetimeIndex
+        arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
+               pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]
+        result = pd.to_datetime(arr)
+        expected = DatetimeIndex(
+            ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific')
+        tm.assert_index_equal(result, expected)
+
+        # mixed tzs will raise
+        arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'),
+               
pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')]
+        self.assertRaises(ValueError, lambda: pd.to_datetime(arr))
+
+    def test_to_datetime_tz_pytz(self):
+
+        # xref 8260
+        tm._skip_if_no_pytz()
+        import pytz
+
+        us_eastern = pytz.timezone('US/Eastern')
+        arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1,
+                                                     hour=3, minute=0)),
+                        us_eastern.localize(datetime(year=2000, month=6, day=1,
+                                                     hour=3, minute=0))],
+                       dtype=object)
+        result = pd.to_datetime(arr, utc=True)
+        expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
+                                  '2000-06-01 07:00:00+00:00'],
+                                 dtype='datetime64[ns, UTC]', freq=None)
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_utc_is_true(self):
+        # See gh-11934
+        start = pd.Timestamp('2014-01-01', tz='utc')
+        end = pd.Timestamp('2014-01-03', tz='utc')
+        date_range = pd.bdate_range(start, end)
+
+        result = pd.to_datetime(date_range, utc=True)
+        expected = pd.DatetimeIndex(data=date_range)
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_tz_psycopg2(self):
+
+        # xref 8260
+        try:
+            import psycopg2
+        except ImportError:
+            raise nose.SkipTest("no psycopg2 installed")
+
+        # misc cases
+        tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)
+        tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None)
+        arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1),
+                        datetime(2000, 6, 1, 3, 0, tzinfo=tz2)],
+                       dtype=object)
+
+        result = pd.to_datetime(arr, errors='coerce', utc=True)
+        expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
+                                  '2000-06-01 07:00:00+00:00'],
+                                 dtype='datetime64[ns, UTC]', freq=None)
+        tm.assert_index_equal(result, expected)
+
+        # dtype coercion
+        i = pd.DatetimeIndex([
+            '2000-01-01 08:00:00+00:00'
+        ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
+        self.assertTrue(is_datetime64_ns_dtype(i))
+
+        # tz coercion
+        result = pd.to_datetime(i, errors='coerce')
+        tm.assert_index_equal(result, i)
+
+        result = pd.to_datetime(i, errors='coerce', utc=True)
+        expected = pd.DatetimeIndex(['2000-01-01 13:00:00'],
+                                    dtype='datetime64[ns, UTC]')
+        tm.assert_index_equal(result, expected)
+
+    def test_datetime_bool(self):
+        # GH13176
+        with self.assertRaises(TypeError):
+            to_datetime(False)
+        self.assertTrue(to_datetime(False, errors="coerce") is tslib.NaT)
+        self.assertEqual(to_datetime(False, errors="ignore"), False)
+        with self.assertRaises(TypeError):
+            to_datetime(True)
+        self.assertTrue(to_datetime(True, errors="coerce") is tslib.NaT)
+        self.assertEqual(to_datetime(True, errors="ignore"), True)
+        with self.assertRaises(TypeError):
+            to_datetime([False, datetime.today()])
+        with self.assertRaises(TypeError):
+            to_datetime(['20130101', True])
+        tm.assert_index_equal(to_datetime([0, False, tslib.NaT, 0.0],
+                                          errors="coerce"),
+                              DatetimeIndex([to_datetime(0), tslib.NaT,
+                                             tslib.NaT, to_datetime(0)]))
+
+    def test_datetime_invalid_datatype(self):
+        # GH13176
+
+        with self.assertRaises(TypeError):
+            pd.to_datetime(bool)
+        with self.assertRaises(TypeError):
+            pd.to_datetime(pd.to_datetime)
+
+    def test_unit(self):
+        # GH 11758
+        # test proper behavior with errors
+
+        with self.assertRaises(ValueError):
+            to_datetime([1], unit='D', format='%Y%m%d')
+
+        values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
+                  'NaT', '']
+        result = to_datetime(values, unit='D', errors='ignore')
+        expected = Index([11111111, Timestamp('1970-01-02'),
+                          Timestamp('1970-01-02'), pd.NaT,
+                          pd.NaT, pd.NaT, pd.NaT, pd.NaT],
+                         dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, unit='D',
+                             
errors='coerce') + expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', + 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(tslib.OutOfBoundsDatetime): + to_datetime(values, unit='D', errors='raise') + + values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] + + result = to_datetime(values, errors='ignore', unit='s') + expected = Index([1420043460000, pd.NaT, pd.NaT, + pd.NaT, pd.NaT], dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, errors='coerce', unit='s') + expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(tslib.OutOfBoundsDatetime): + to_datetime(values, errors='raise', unit='s') + + # if we have a string, then we raise a ValueError + # and NOT an OutOfBoundsDatetime + for val in ['foo', Timestamp('20130101')]: + try: + to_datetime(val, errors='raise', unit='s') + except tslib.OutOfBoundsDatetime: + raise AssertionError("incorrect exception raised") + except ValueError: + pass + + def test_unit_consistency(self): + + # consistency of conversions + expected = Timestamp('1970-05-09 14:25:11') + result = pd.to_datetime(11111111, unit='s', errors='raise') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='coerce') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='ignore') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + def test_unit_with_numeric(self): + + # GH 13180 + # coercions from floats/ints are ok + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr1 = [1.434692e+18, 1.432766e+18] + arr2 = np.array(arr1).astype('int64') + for errors in ['ignore', 'raise', 'coerce']: + result = pd.to_datetime(arr1, errors=errors) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(arr2, errors=errors) + tm.assert_index_equal(result, expected) + + # but we want to make sure that we are coercing + # if we have ints/strings + expected = DatetimeIndex(['NaT', + '2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr = ['foo', 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20', + 'NaT', + 'NaT']) + arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + def test_unit_mixed(self): + + # mixed integers/datetimes + expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) + arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + expected = DatetimeIndex(['NaT', + 'NaT', + '2013-01-01']) + arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + def test_dataframe(self): + + df = DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5], + 'hour': [6, 7], + 'minute': [58, 59], + 'second': [10, 11], + 'ms': [1, 1], + 'us': [2, 2], + 'ns': [3, 3]}) + + result = to_datetime({'year': df['year'], + 
'month': df['month'], + 'day': df['day']}) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:0:00')]) + assert_series_equal(result, expected) + + # dict-like + result = to_datetime(df[['year', 'month', 'day']].to_dict()) + assert_series_equal(result, expected) + + # dict but with constructable + df2 = df[['year', 'month', 'day']].to_dict() + df2['month'] = 2 + result = to_datetime(df2) + expected2 = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160205 00:0:00')]) + assert_series_equal(result, expected2) + + # unit mappings + units = [{'year': 'years', + 'month': 'months', + 'day': 'days', + 'hour': 'hours', + 'minute': 'minutes', + 'second': 'seconds'}, + {'year': 'year', + 'month': 'month', + 'day': 'day', + 'hour': 'hour', + 'minute': 'minute', + 'second': 'second'}, + ] + + for d in units: + result = to_datetime(df[list(d.keys())].rename(columns=d)) + expected = Series([Timestamp('20150204 06:58:10'), + Timestamp('20160305 07:59:11')]) + assert_series_equal(result, expected) + + d = {'year': 'year', + 'month': 'month', + 'day': 'day', + 'hour': 'hour', + 'minute': 'minute', + 'second': 'second', + 'ms': 'ms', + 'us': 'us', + 'ns': 'ns'} + + result = to_datetime(df.rename(columns=d)) + expected = Series([Timestamp('20150204 06:58:10.001002003'), + Timestamp('20160305 07:59:11.001002003')]) + assert_series_equal(result, expected) + + # coerce back to int + result = to_datetime(df.astype(str)) + assert_series_equal(result, expected) + + # passing coerce + df2 = DataFrame({'year': [2015, 2016], + 'month': [2, 20], + 'day': [4, 5]}) + with self.assertRaises(ValueError): + to_datetime(df2) + result = to_datetime(df2, errors='coerce') + expected = Series([Timestamp('20150204 00:00:00'), + pd.NaT]) + assert_series_equal(result, expected) + + # extra columns + with self.assertRaises(ValueError): + df2 = df.copy() + df2['foo'] = 1 + to_datetime(df2) + + # not enough + for c in [['year'], + ['year', 'month'], + ['year', 'month', 'second'], + ['month', 'day'], + ['year', 'day', 'second']]: + with self.assertRaises(ValueError): + to_datetime(df[c]) + + # duplicates + df2 = DataFrame({'year': [2015, 2016], + 'month': [2, 20], + 'day': [4, 5]}) + df2.columns = ['year', 'year', 'day'] + with self.assertRaises(ValueError): + to_datetime(df2) + + df2 = DataFrame({'year': [2015, 2016], + 'month': [2, 20], + 'day': [4, 5], + 'hour': [4, 5]}) + df2.columns = ['year', 'month', 'day', 'day'] + with self.assertRaises(ValueError): + to_datetime(df2) + + def test_dataframe_dtypes(self): + # #13451 + df = DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]}) + + # int16 + result = to_datetime(df.astype('int16')) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # mixed dtypes + df['month'] = df['month'].astype('int8') + df['day'] = df['day'].astype('int8') + result = to_datetime(df) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # float + df = DataFrame({'year': [2000, 2001], + 'month': [1.5, 1], + 'day': [1, 1]}) + with self.assertRaises(ValueError): + to_datetime(df) + + def test_index_to_datetime(self): + idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = idx.to_datetime() + expected = DatetimeIndex(pd.to_datetime(idx.values)) + tm.assert_index_equal(result, expected) + + with 
tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            today = datetime.today()
+            idx = Index([today], dtype=object)
+            result = idx.to_datetime()
+            expected = DatetimeIndex([today])
+            tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_iso8601(self):
+        result = to_datetime(["2012-01-01 00:00:00"])
+        exp = Timestamp("2012-01-01 00:00:00")
+        self.assertEqual(result[0], exp)
+
+        result = to_datetime(['20121001'])  # bad iso 8601
+        exp = Timestamp('2012-10-01')
+        self.assertEqual(result[0], exp)
+
+    def test_to_datetime_default(self):
+        rs = to_datetime('2001')
+        xp = datetime(2001, 1, 1)
+        self.assertEqual(rs, xp)
+
+        # dayfirst is essentially broken
+
+        # to_datetime('01-13-2012', dayfirst=True)
+        # self.assertRaises(ValueError, to_datetime('01-13-2012',
+        #                   dayfirst=True))
+
+    def test_to_datetime_on_datetime64_series(self):
+        # #2699
+        s = Series(date_range('1/1/2000', periods=10))
+
+        result = to_datetime(s)
+        self.assertEqual(result[0], s[0])
+
+    def test_to_datetime_with_space_in_series(self):
+        # GH 6428
+        s = Series(['10/18/2006', '10/18/2008', ' '])
+        tm.assertRaises(ValueError, lambda: to_datetime(s, errors='raise'))
+        result_coerce = to_datetime(s, errors='coerce')
+        expected_coerce = Series([datetime(2006, 10, 18),
+                                  datetime(2008, 10, 18),
+                                  pd.NaT])
+        tm.assert_series_equal(result_coerce, expected_coerce)
+        result_ignore = to_datetime(s, errors='ignore')
+        tm.assert_series_equal(result_ignore, s)
+
+    def test_to_datetime_with_apply(self):
+        # this is only locale tested with US/None locales
+        _skip_if_has_locale()
+
+        # GH 5195
+        # with a format and coerce a single item to_datetime fails
+        td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3])
+        expected = pd.to_datetime(td, format='%b %y')
+        result = td.apply(pd.to_datetime, format='%b %y')
+        assert_series_equal(result, expected)
+
+        td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3])
+        self.assertRaises(ValueError,
+                          lambda: pd.to_datetime(td, format='%b %y',
+                                                 errors='raise'))
+        self.assertRaises(ValueError,
+                          lambda: td.apply(pd.to_datetime, format='%b %y',
+                                           errors='raise'))
+        expected = pd.to_datetime(td, format='%b %y', errors='coerce')
+
+        result = td.apply(
+            lambda x: pd.to_datetime(x, format='%b %y', errors='coerce'))
+        assert_series_equal(result, expected)
+
+    def test_to_datetime_types(self):
+
+        # empty string
+        result = to_datetime('')
+        self.assertIs(result, NaT)
+
+        result = to_datetime(['', ''])
+        self.assertTrue(isnull(result).all())
+
+        # ints
+        result = Timestamp(0)
+        expected = to_datetime(0)
+        self.assertEqual(result, expected)
+
+        # GH 3888 (strings)
+        expected = to_datetime(['2012'])[0]
+        result = to_datetime('2012')
+        self.assertEqual(result, expected)
+
+        # array = ['2012','20120101','20120101 12:01:01']
+        array = ['20120101', '20120101 12:01:01']
+        expected = list(to_datetime(array))
+        result = lmap(Timestamp, array)
+        tm.assert_almost_equal(result, expected)
+
+        # currently fails ###
+        # result = Timestamp('2012')
+        # expected = to_datetime('2012')
+        # self.assertEqual(result, expected)
+
+    def test_to_datetime_unprocessable_input(self):
+        # GH 4928
+        self.assert_numpy_array_equal(
+            to_datetime([1, '1'], errors='ignore'),
+            np.array([1, '1'], dtype='O')
+        )
+        self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise')
+
+    def test_to_datetime_other_datetime64_units(self):
+        # 5/25/2012
+        scalar = np.int64(1337904000000000).view('M8[us]')
+        as_obj = scalar.astype('O')
+
+        index = DatetimeIndex([scalar])
+        self.assertEqual(index[0], scalar.astype('O'))
+
+        
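+        # Timestamp accepts the microsecond-resolution scalar directly and
+        # normalizes it to nanosecond precision, so it compares equal to
+        # the boxed datetime object.
+        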
value = Timestamp(scalar) + self.assertEqual(value, as_obj) + + def test_to_datetime_list_of_integers(self): + rng = date_range('1/1/2000', periods=20) + rng = DatetimeIndex(rng.values) + + ints = list(rng.asi8) + + result = DatetimeIndex(ints) + + tm.assert_index_equal(rng, result) + + def test_to_datetime_freq(self): + xp = bdate_range('2000-1-1', periods=10, tz='UTC') + rs = xp.to_datetime() + self.assertEqual(xp.freq, rs.freq) + self.assertEqual(xp.tzinfo, rs.tzinfo) + if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py deleted file mode 100644 index ff6cc4bb9853c..0000000000000 --- a/pandas/tseries/tests/test_timeseries.py +++ /dev/null @@ -1,4184 +0,0 @@ -# pylint: disable-msg=E1101,W0612 -import locale -import calendar -import operator -import sys -from datetime import datetime, time, timedelta -from numpy.random import rand - -import nose -import numpy as np -import pandas.index as _index -import pandas.lib as lib -import pandas.tslib as tslib - -from pandas.types.common import is_datetime64_ns_dtype -import pandas as pd -import pandas.compat as compat -import pandas.core.common as com -import pandas.tseries.frequencies as frequencies -import pandas.tseries.offsets as offsets -import pandas.tseries.tools as tools -import pandas.util.testing as tm -from pandas import ( - Index, Series, DataFrame, isnull, date_range, Timestamp, Period, - DatetimeIndex, to_datetime, bdate_range, Float64Index, - NaT, timedelta_range, Timedelta, concat) -from pandas.compat import range, long, StringIO, lrange, lmap, zip, product -from pandas.tslib import iNaT -from pandas.util.testing import ( - assert_frame_equal, assert_series_equal, assert_almost_equal, - _skip_if_has_locale, slow) - -randn = np.random.randn - - -class TestTimeSeriesDuplicates(tm.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), - datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 3), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 4), - datetime(2000, 1, 4), datetime(2000, 1, 5)] - - self.dups = Series(np.random.randn(len(dates)), index=dates) - - def test_constructor(self): - tm.assertIsInstance(self.dups, Series) - tm.assertIsInstance(self.dups.index, DatetimeIndex) - - def test_is_unique_monotonic(self): - self.assertFalse(self.dups.index.is_unique) - - def test_index_unique(self): - uniques = self.dups.index.unique() - expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 5)]) - self.assertEqual(uniques.dtype, 'M8[ns]') # sanity - tm.assert_index_equal(uniques, expected) - self.assertEqual(self.dups.index.nunique(), 4) - - # #2563 - self.assertTrue(isinstance(uniques, DatetimeIndex)) - - dups_local = self.dups.index.tz_localize('US/Eastern') - dups_local.name = 'foo' - result = dups_local.unique() - expected = DatetimeIndex(expected, name='foo') - expected = expected.tz_localize('US/Eastern') - self.assertTrue(result.tz is not None) - self.assertEqual(result.name, 'foo') - tm.assert_index_equal(result, expected) - - # NaT, note this is excluded - arr = [1370745748 + t for t in range(20)] + [iNaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - self.assertEqual(idx.nunique(), 20) - self.assertEqual(idx.nunique(dropna=False), 21) - - arr = [Timestamp('2013-06-09 
02:42:28') + timedelta(seconds=t) - for t in range(20)] + [NaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - self.assertEqual(idx.nunique(), 20) - self.assertEqual(idx.nunique(dropna=False), 21) - - def test_index_dupes_contains(self): - d = datetime(2011, 12, 5, 20, 30) - ix = DatetimeIndex([d, d]) - self.assertTrue(d in ix) - - def test_duplicate_dates_indexing(self): - ts = self.dups - - uniques = ts.index.unique() - for date in uniques: - result = ts[date] - - mask = ts.index == date - total = (ts.index == date).sum() - expected = ts[mask] - if total > 1: - assert_series_equal(result, expected) - else: - assert_almost_equal(result, expected[0]) - - cp = ts.copy() - cp[date] = 0 - expected = Series(np.where(mask, 0, ts), index=ts.index) - assert_series_equal(cp, expected) - - self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) - - # new index - ts[datetime(2000, 1, 6)] = 0 - self.assertEqual(ts[datetime(2000, 1, 6)], 0) - - def test_range_slice(self): - idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', - '1/4/2000']) - - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts['1/2/2000':] - expected = ts[1:] - assert_series_equal(result, expected) - - result = ts['1/2/2000':'1/3/2000'] - expected = ts[1:4] - assert_series_equal(result, expected) - - def test_groupby_average_dup_values(self): - result = self.dups.groupby(level=0).mean() - expected = self.dups.groupby(self.dups.index).mean() - assert_series_equal(result, expected) - - def test_indexing_over_size_cutoff(self): - import datetime - # #1821 - - old_cutoff = _index._SIZE_CUTOFF - try: - _index._SIZE_CUTOFF = 1000 - - # create large list of non periodic datetime - dates = [] - sec = datetime.timedelta(seconds=1) - half_sec = datetime.timedelta(microseconds=500000) - d = datetime.datetime(2011, 12, 5, 20, 30) - n = 1100 - for i in range(n): - dates.append(d) - dates.append(d + sec) - dates.append(d + sec + half_sec) - dates.append(d + sec + sec + half_sec) - d += 3 * sec - - # duplicate some values in the list - duplicate_positions = np.random.randint(0, len(dates) - 1, 20) - for p in duplicate_positions: - dates[p + 1] = dates[p] - - df = DataFrame(np.random.randn(len(dates), 4), - index=dates, - columns=list('ABCD')) - - pos = n * 3 - timestamp = df.index[pos] - self.assertIn(timestamp, df.index) - - # it works! - df.loc[timestamp] - self.assertTrue(len(df.loc[[timestamp]]) > 0) - finally: - _index._SIZE_CUTOFF = old_cutoff - - def test_indexing_unordered(self): - # GH 2437 - rng = date_range(start='2011-01-01', end='2011-01-15') - ts = Series(randn(len(rng)), index=rng) - ts2 = concat([ts[0:4], ts[-4:], ts[4:-4]]) - - for t in ts.index: - # TODO: unused? 
- s = str(t) # noqa - - expected = ts[t] - result = ts2[t] - self.assertTrue(expected == result) - - # GH 3448 (ranges) - def compare(slobj): - result = ts2[slobj].copy() - result = result.sort_index() - expected = ts[slobj] - assert_series_equal(result, expected) - - compare(slice('2011-01-01', '2011-01-15')) - compare(slice('2010-12-30', '2011-01-15')) - compare(slice('2011-01-01', '2011-01-16')) - - # partial ranges - compare(slice('2011-01-01', '2011-01-6')) - compare(slice('2011-01-06', '2011-01-8')) - compare(slice('2011-01-06', '2011-01-12')) - - # single values - result = ts2['2011'].sort_index() - expected = ts['2011'] - assert_series_equal(result, expected) - - # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') - ts = Series(np.arange(len(rng)), index=rng) - ts = ts.take(np.random.permutation(20)) - - result = ts['2005'] - for t in result.index: - self.assertTrue(t.year == 2005) - - def test_indexing(self): - - idx = date_range("2001-1-1", periods=20, freq='M') - ts = Series(np.random.rand(len(idx)), index=idx) - - # getting - - # GH 3070, make sure semantics work on Series/Frame - expected = ts['2001'] - expected.name = 'A' - - df = DataFrame(dict(A=ts)) - result = df['2001']['A'] - assert_series_equal(expected, result) - - # setting - ts['2001'] = 1 - expected = ts['2001'] - expected.name = 'A' - - df.loc['2001', 'A'] = 1 - - result = df['2001']['A'] - assert_series_equal(expected, result) - - # GH3546 (not including times on the last day) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', - freq='H') - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013-05'] - assert_series_equal(expected, ts) - - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', - freq='S') - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013-05'] - assert_series_equal(expected, ts) - - idx = [Timestamp('2013-05-31 00:00'), - Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013'] - assert_series_equal(expected, ts) - - # GH14826, indexing with a seconds resolution string / datetime object - df = DataFrame(randn(5, 5), - columns=['open', 'high', 'low', 'close', 'volume'], - index=date_range('2012-01-02 18:01:00', - periods=5, tz='US/Central', freq='s')) - expected = df.loc[[df.index[2]]] - - # this is a single date, so will raise - self.assertRaises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) - self.assertRaises(KeyError, df.__getitem__, df.index[2], ) - - def test_recreate_from_data(self): - freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', - 'C'] - - for f in freqs: - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) - idx = DatetimeIndex(org, freq=f) - tm.assert_index_equal(idx, org) - - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, - tz='US/Pacific', periods=1) - idx = DatetimeIndex(org, freq=f, tz='US/Pacific') - tm.assert_index_equal(idx, org) - - -def assert_range_equal(left, right): - assert (left.equals(right)) - assert (left.freq == right.freq) - assert (left.tz == right.tz) - - -class TestTimeSeries(tm.TestCase): - _multiprocess_can_split_ = True - - def test_is_(self): - dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') - self.assertTrue(dti.is_(dti)) - self.assertTrue(dti.is_(dti.view())) - self.assertFalse(dti.is_(dti.copy())) - - def test_dti_slicing(self): - dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') - dti2 = dti[[1, 3, 5]] - - v1 = dti2[0] - v2 = dti2[1] - v3 = 
dti2[2] - - self.assertEqual(v1, Timestamp('2/28/2005')) - self.assertEqual(v2, Timestamp('4/30/2005')) - self.assertEqual(v3, Timestamp('6/30/2005')) - - # don't carry freq through irregular slicing - self.assertIsNone(dti2.freq) - - def test_contiguous_boolean_preserve_freq(self): - rng = date_range('1/1/2000', '3/1/2000', freq='B') - - mask = np.zeros(len(rng), dtype=bool) - mask[10:20] = True - - masked = rng[mask] - expected = rng[10:20] - self.assertIsNotNone(expected.freq) - assert_range_equal(masked, expected) - - mask[22] = True - masked = rng[mask] - self.assertIsNone(masked.freq) - - def test_getitem_median_slice_bug(self): - index = date_range('20090415', '20090519', freq='2B') - s = Series(np.random.randn(13), index=index) - - indexer = [slice(6, 7, None)] - result = s[indexer] - expected = s[indexer[0]] - assert_series_equal(result, expected) - - def test_series_box_timestamp(self): - rng = date_range('20090415', '20090519', freq='B') - s = Series(rng) - - tm.assertIsInstance(s[5], Timestamp) - - rng = date_range('20090415', '20090519', freq='B') - s = Series(rng, index=rng) - tm.assertIsInstance(s[5], Timestamp) - - tm.assertIsInstance(s.iat[5], Timestamp) - - def test_series_box_timedelta(self): - rng = timedelta_range('1 day 1 s', periods=5, freq='h') - s = Series(rng) - tm.assertIsInstance(s[1], Timedelta) - tm.assertIsInstance(s.iat[2], Timedelta) - - def test_date_range_ambiguous_arguments(self): - # #2538 - start = datetime(2011, 1, 1, 5, 3, 40) - end = datetime(2011, 1, 1, 8, 9, 40) - - self.assertRaises(ValueError, date_range, start, end, freq='s', - periods=10) - - def test_timestamp_to_datetime(self): - tm._skip_if_no_pytz() - rng = date_range('20090415', '20090519', tz='US/Eastern') - - stamp = rng[0] - dtval = stamp.to_pydatetime() - self.assertEqual(stamp, dtval) - self.assertEqual(stamp.tzinfo, dtval.tzinfo) - - def test_timestamp_to_datetime_dateutil(self): - tm._skip_if_no_pytz() - rng = date_range('20090415', '20090519', tz='dateutil/US/Eastern') - - stamp = rng[0] - dtval = stamp.to_pydatetime() - self.assertEqual(stamp, dtval) - self.assertEqual(stamp.tzinfo, dtval.tzinfo) - - def test_timestamp_to_datetime_explicit_pytz(self): - tm._skip_if_no_pytz() - import pytz - rng = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) - - stamp = rng[0] - dtval = stamp.to_pydatetime() - self.assertEqual(stamp, dtval) - self.assertEqual(stamp.tzinfo, dtval.tzinfo) - - def test_timestamp_to_datetime_explicit_dateutil(self): - tm._skip_if_windows_python_3() - tm._skip_if_no_dateutil() - from pandas.tslib import _dateutil_gettz as gettz - rng = date_range('20090415', '20090519', tz=gettz('US/Eastern')) - - stamp = rng[0] - dtval = stamp.to_pydatetime() - self.assertEqual(stamp, dtval) - self.assertEqual(stamp.tzinfo, dtval.tzinfo) - - def test_index_convert_to_datetime_array(self): - tm._skip_if_no_pytz() - - def _check_rng(rng): - converted = rng.to_pydatetime() - tm.assertIsInstance(converted, np.ndarray) - for x, stamp in zip(converted, rng): - tm.assertIsInstance(x, datetime) - self.assertEqual(x, stamp.to_pydatetime()) - self.assertEqual(x.tzinfo, stamp.tzinfo) - - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') - rng_utc = date_range('20090415', '20090519', tz='utc') - - _check_rng(rng) - _check_rng(rng_eastern) - _check_rng(rng_utc) - - def test_index_convert_to_datetime_array_explicit_pytz(self): - tm._skip_if_no_pytz() - import pytz - - def _check_rng(rng): - converted = 
rng.to_pydatetime() - tm.assertIsInstance(converted, np.ndarray) - for x, stamp in zip(converted, rng): - tm.assertIsInstance(x, datetime) - self.assertEqual(x, stamp.to_pydatetime()) - self.assertEqual(x.tzinfo, stamp.tzinfo) - - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) - rng_utc = date_range('20090415', '20090519', tz=pytz.utc) - - _check_rng(rng) - _check_rng(rng_eastern) - _check_rng(rng_utc) - - def test_index_convert_to_datetime_array_dateutil(self): - tm._skip_if_no_dateutil() - import dateutil - - def _check_rng(rng): - converted = rng.to_pydatetime() - tm.assertIsInstance(converted, np.ndarray) - for x, stamp in zip(converted, rng): - tm.assertIsInstance(x, datetime) - self.assertEqual(x, stamp.to_pydatetime()) - self.assertEqual(x.tzinfo, stamp.tzinfo) - - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz='dateutil/US/Eastern') - rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) - - _check_rng(rng) - _check_rng(rng_eastern) - _check_rng(rng_utc) - - def test_ctor_str_intraday(self): - rng = DatetimeIndex(['1-1-2000 00:00:01']) - self.assertEqual(rng[0].second, 1) - - def test_series_ctor_plus_datetimeindex(self): - rng = date_range('20090415', '20090519', freq='B') - data = dict((k, 1) for k in rng) - - result = Series(data, index=rng) - self.assertIs(result.index, rng) - - def test_series_pad_backfill_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - result = s[:2].reindex(index, method='pad', limit=5) - - expected = s[:2].reindex(index).fillna(method='pad') - expected[-3:] = np.nan - assert_series_equal(result, expected) - - result = s[-2:].reindex(index, method='backfill', limit=5) - - expected = s[-2:].reindex(index).fillna(method='backfill') - expected[:3] = np.nan - assert_series_equal(result, expected) - - def test_series_fillna_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - result = s[:2].reindex(index) - result = result.fillna(method='pad', limit=5) - - expected = s[:2].reindex(index).fillna(method='pad') - expected[-3:] = np.nan - assert_series_equal(result, expected) - - result = s[-2:].reindex(index) - result = result.fillna(method='bfill', limit=5) - - expected = s[-2:].reindex(index).fillna(method='backfill') - expected[:3] = np.nan - assert_series_equal(result, expected) - - def test_frame_pad_backfill_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - - result = df[:2].reindex(index, method='pad', limit=5) - - expected = df[:2].reindex(index).fillna(method='pad') - expected.values[-3:] = np.nan - tm.assert_frame_equal(result, expected) - - result = df[-2:].reindex(index, method='backfill', limit=5) - - expected = df[-2:].reindex(index).fillna(method='backfill') - expected.values[:3] = np.nan - tm.assert_frame_equal(result, expected) - - def test_frame_fillna_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - - result = df[:2].reindex(index) - result = result.fillna(method='pad', limit=5) - - expected = df[:2].reindex(index).fillna(method='pad') - expected.values[-3:] = np.nan - tm.assert_frame_equal(result, expected) - - result = df[-2:].reindex(index) - result = result.fillna(method='backfill', limit=5) - - expected = df[-2:].reindex(index).fillna(method='backfill') - expected.values[:3] = np.nan - tm.assert_frame_equal(result, expected) - - def 
-        # 2155
-        columns = DatetimeIndex(start='1/1/2012', end='2/1/2012',
-                                freq=offsets.BDay())
-        index = lrange(10)
-        data = DataFrame(columns=columns, index=index)
-        t = datetime(2012, 11, 1)
-        ts = Timestamp(t)
-        data[ts] = np.nan  # works
-
-    def test_sparse_series_fillna_limit(self):
-        index = np.arange(10)
-        s = Series(np.random.randn(10), index=index)
-
-        ss = s[:2].reindex(index).to_sparse()
-        result = ss.fillna(method='pad', limit=5)
-        expected = ss.fillna(method='pad', limit=5)
-        expected = expected.to_dense()
-        expected[-3:] = np.nan
-        expected = expected.to_sparse()
-        assert_series_equal(result, expected)
-
-        ss = s[-2:].reindex(index).to_sparse()
-        result = ss.fillna(method='backfill', limit=5)
-        expected = ss.fillna(method='backfill')
-        expected = expected.to_dense()
-        expected[:3] = np.nan
-        expected = expected.to_sparse()
-        assert_series_equal(result, expected)
-
-    def test_sparse_series_pad_backfill_limit(self):
-        index = np.arange(10)
-        s = Series(np.random.randn(10), index=index)
-        s = s.to_sparse()
-
-        result = s[:2].reindex(index, method='pad', limit=5)
-        expected = s[:2].reindex(index).fillna(method='pad')
-        expected = expected.to_dense()
-        expected[-3:] = np.nan
-        expected = expected.to_sparse()
-        assert_series_equal(result, expected)
-
-        result = s[-2:].reindex(index, method='backfill', limit=5)
-        expected = s[-2:].reindex(index).fillna(method='backfill')
-        expected = expected.to_dense()
-        expected[:3] = np.nan
-        expected = expected.to_sparse()
-        assert_series_equal(result, expected)
-
-    def test_sparse_frame_pad_backfill_limit(self):
-        index = np.arange(10)
-        df = DataFrame(np.random.randn(10, 4), index=index)
-        sdf = df.to_sparse()
-
-        result = sdf[:2].reindex(index, method='pad', limit=5)
-
-        expected = sdf[:2].reindex(index).fillna(method='pad')
-        expected = expected.to_dense()
-        expected.values[-3:] = np.nan
-        expected = expected.to_sparse()
-        tm.assert_frame_equal(result, expected)
-
-        result = sdf[-2:].reindex(index, method='backfill', limit=5)
-
-        expected = sdf[-2:].reindex(index).fillna(method='backfill')
-        expected = expected.to_dense()
-        expected.values[:3] = np.nan
-        expected = expected.to_sparse()
-        tm.assert_frame_equal(result, expected)
-
-    def test_sparse_frame_fillna_limit(self):
-        index = np.arange(10)
-        df = DataFrame(np.random.randn(10, 4), index=index)
-        sdf = df.to_sparse()
-
-        result = sdf[:2].reindex(index)
-        result = result.fillna(method='pad', limit=5)
-
-        expected = sdf[:2].reindex(index).fillna(method='pad')
-        expected = expected.to_dense()
-        expected.values[-3:] = np.nan
-        expected = expected.to_sparse()
-        tm.assert_frame_equal(result, expected)
-
-        result = sdf[-2:].reindex(index)
-        result = result.fillna(method='backfill', limit=5)
-
-        expected = sdf[-2:].reindex(index).fillna(method='backfill')
-        expected = expected.to_dense()
-        expected.values[:3] = np.nan
-        expected = expected.to_sparse()
-        tm.assert_frame_equal(result, expected)
-
-    def test_pad_require_monotonicity(self):
-        rng = date_range('1/1/2000', '3/1/2000', freq='B')
-
-        # neither monotonic increasing or decreasing
-        rng2 = rng[[1, 0, 2]]
-
-        self.assertRaises(ValueError, rng2.get_indexer, rng, method='pad')
-
-    def test_frame_ctor_datetime64_column(self):
-        rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
-        dates = np.asarray(rng)
-
-        df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
-        self.assertTrue(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]')))
-
-    def test_frame_add_datetime64_column(self):
-        rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
-        df = DataFrame(index=np.arange(len(rng)))
-
-        df['A'] = rng
-        self.assertTrue(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]')))
-
-    def test_frame_datetime64_pre1900_repr(self):
-        df = DataFrame({'year': date_range('1/1/1700', periods=50,
-                                           freq='A-DEC')})
-        # it works!
-        repr(df)
-
-    def test_frame_add_datetime64_col_other_units(self):
-        n = 100
-
-        units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']
-
-        ns_dtype = np.dtype('M8[ns]')
-
-        for unit in units:
-            dtype = np.dtype('M8[%s]' % unit)
-            vals = np.arange(n, dtype=np.int64).view(dtype)
-
-            df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
-            df[unit] = vals
-
-            ex_vals = to_datetime(vals.astype('O')).values
-
-            self.assertEqual(df[unit].dtype, ns_dtype)
-            self.assertTrue((df[unit].values == ex_vals).all())
-
-        # Test insertion into existing datetime64 column
-        df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
-        df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)
-
-        for unit in units:
-            dtype = np.dtype('M8[%s]' % unit)
-            vals = np.arange(n, dtype=np.int64).view(dtype)
-
-            tmp = df.copy()
-
-            tmp['dates'] = vals
-            ex_vals = to_datetime(vals.astype('O')).values
-
-            self.assertTrue((tmp['dates'].values == ex_vals).all())
-
-    def test_to_datetime_unit(self):
-
-        epoch = 1370745748
-        s = Series([epoch + t for t in range(20)])
-        result = to_datetime(s, unit='s')
-        expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
-            seconds=t) for t in range(20)])
-        assert_series_equal(result, expected)
-
-        s = Series([epoch + t for t in range(20)]).astype(float)
-        result = to_datetime(s, unit='s')
-        expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
-            seconds=t) for t in range(20)])
-        assert_series_equal(result, expected)
-
-        s = Series([epoch + t for t in range(20)] + [iNaT])
-        result = to_datetime(s, unit='s')
-        expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
-            seconds=t) for t in range(20)] + [NaT])
-        assert_series_equal(result, expected)
-
-        s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float)
-        result = to_datetime(s, unit='s')
-        expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
-            seconds=t) for t in range(20)] + [NaT])
-        assert_series_equal(result, expected)
-
-        # GH13834
-        s = Series([epoch + t for t in np.arange(0, 2, .25)] +
-                   [iNaT]).astype(float)
-        result = to_datetime(s, unit='s')
-        expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
-            seconds=t) for t in np.arange(0, 2, .25)] + [NaT])
-        assert_series_equal(result, expected)
-
-        s = concat([Series([epoch + t for t in range(20)]
-                           ).astype(float), Series([np.nan])],
-                   ignore_index=True)
-        result = to_datetime(s, unit='s')
-        expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
-            seconds=t) for t in range(20)] + [NaT])
-        assert_series_equal(result, expected)
-
-        result = to_datetime([1, 2, 'NaT', pd.NaT, np.nan], unit='D')
-        expected = DatetimeIndex([Timestamp('1970-01-02'),
-                                  Timestamp('1970-01-03')] + ['NaT'] * 3)
-        tm.assert_index_equal(result, expected)
-
-        with self.assertRaises(ValueError):
-            to_datetime([1, 2, 'foo'], unit='D')
-        with self.assertRaises(ValueError):
-            to_datetime([1, 2, 111111111], unit='D')
-
-        # coerce we can process
-        expected = DatetimeIndex([Timestamp('1970-01-02'),
-                                  Timestamp('1970-01-03')] + ['NaT'] * 1)
-        result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce')
-        tm.assert_index_equal(result, expected)
-
-        result = to_datetime([1, 2, 111111111], unit='D', errors='coerce')
-        tm.assert_index_equal(result, expected)
-
-    def test_series_ctor_datetime64(self):
-        rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
-        dates = np.asarray(rng)
-
-        series = Series(dates)
-        self.assertTrue(np.issubdtype(series.dtype, np.dtype('M8[ns]')))
-
-    def test_index_cast_datetime64_other_units(self):
-        arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]')
-
-        idx = Index(arr)
-
-        self.assertTrue((idx.values == tslib.cast_to_nanoseconds(arr)).all())
-
-    def test_reindex_series_add_nat(self):
-        rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
-        series = Series(rng)
-
-        result = series.reindex(lrange(15))
-        self.assertTrue(np.issubdtype(result.dtype, np.dtype('M8[ns]')))
-
-        mask = result.isnull()
-        self.assertTrue(mask[-5:].all())
-        self.assertFalse(mask[:-5].any())
-
-    def test_reindex_frame_add_nat(self):
-        rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
-        df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng})
-
-        result = df.reindex(lrange(15))
-        self.assertTrue(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]')))
-
-        mask = com.isnull(result)['B']
-        self.assertTrue(mask[-5:].all())
-        self.assertFalse(mask[:-5].any())
-
-    def test_series_repr_nat(self):
-        series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')
-
-        result = repr(series)
-        expected = ('0   1970-01-01 00:00:00.000000\n'
-                    '1   1970-01-01 00:00:00.000001\n'
-                    '2   1970-01-01 00:00:00.000002\n'
-                    '3                          NaT\n'
-                    'dtype: datetime64[ns]')
-        self.assertEqual(result, expected)
-
-    def test_fillna_nat(self):
-        series = Series([0, 1, 2, iNaT], dtype='M8[ns]')
-
-        filled = series.fillna(method='pad')
-        filled2 = series.fillna(value=series.values[2])
-
-        expected = series.copy()
-        expected.values[3] = expected.values[2]
-
-        assert_series_equal(filled, expected)
-        assert_series_equal(filled2, expected)
-
-        df = DataFrame({'A': series})
-        filled = df.fillna(method='pad')
-        filled2 = df.fillna(value=series.values[2])
-        expected = DataFrame({'A': expected})
-        assert_frame_equal(filled, expected)
-        assert_frame_equal(filled2, expected)
-
-        series = Series([iNaT, 0, 1, 2], dtype='M8[ns]')
-
-        filled = series.fillna(method='bfill')
-        filled2 = series.fillna(value=series[1])
-
-        expected = series.copy()
-        expected[0] = expected[1]
-
-        assert_series_equal(filled, expected)
-        assert_series_equal(filled2, expected)
-
-        df = DataFrame({'A': series})
-        filled = df.fillna(method='bfill')
-        filled2 = df.fillna(value=series[1])
-        expected = DataFrame({'A': expected})
-        assert_frame_equal(filled, expected)
-        assert_frame_equal(filled2, expected)
-
-    def test_string_na_nat_conversion(self):
-        # GH #999, #858
-
-        from pandas.compat import parse_date
-
-        strings = np.array(['1/1/2000', '1/2/2000', np.nan,
-                            '1/4/2000, 12:34:56'], dtype=object)
-
-        expected = np.empty(4, dtype='M8[ns]')
-        for i, val in enumerate(strings):
-            if com.isnull(val):
-                expected[i] = iNaT
-            else:
-                expected[i] = parse_date(val)
-
-        result = tslib.array_to_datetime(strings)
-        assert_almost_equal(result, expected)
-
-        result2 = to_datetime(strings)
-        tm.assertIsInstance(result2, DatetimeIndex)
-        tm.assert_numpy_array_equal(result, result2.values)
-
-        malformed = np.array(['1/100/2000', np.nan], dtype=object)
-
-        # GH 10636, default is now 'raise'
-        self.assertRaises(ValueError,
-                          lambda: to_datetime(malformed, errors='raise'))
-
-        result = to_datetime(malformed, errors='ignore')
-        tm.assert_numpy_array_equal(result, malformed)
-
-        self.assertRaises(ValueError, to_datetime, malformed, errors='raise')
-
-        idx = ['a', 'b', 'c', 'd', 'e']
-        series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan,
-                         '1/5/2000'], index=idx, name='foo')
-        dseries = Series([to_datetime('1/1/2000'), np.nan,
-                          to_datetime('1/3/2000'), np.nan,
-                          to_datetime('1/5/2000')], index=idx, name='foo')
-
-        result = to_datetime(series)
-        dresult = to_datetime(dseries)
-
-        expected = Series(np.empty(5, dtype='M8[ns]'), index=idx)
-        for i in range(5):
-            x = series[i]
-            if isnull(x):
-                expected[i] = iNaT
-            else:
-                expected[i] = to_datetime(x)
-
-        assert_series_equal(result, expected, check_names=False)
-        self.assertEqual(result.name, 'foo')
-
-        assert_series_equal(dresult, expected, check_names=False)
-        self.assertEqual(dresult.name, 'foo')
-
-    def test_to_datetime_iso8601(self):
-        result = to_datetime(["2012-01-01 00:00:00"])
-        exp = Timestamp("2012-01-01 00:00:00")
-        self.assertEqual(result[0], exp)
-
-        result = to_datetime(['20121001'])  # bad iso 8601
-        exp = Timestamp('2012-10-01')
-        self.assertEqual(result[0], exp)
-
-    def test_to_datetime_default(self):
-        rs = to_datetime('2001')
-        xp = datetime(2001, 1, 1)
-        self.assertTrue(rs, xp)
-
-        # dayfirst is essentially broken
-
-        # to_datetime('01-13-2012', dayfirst=True)
-        # self.assertRaises(ValueError, to_datetime('01-13-2012',
-        #                   dayfirst=True))
-
-    def test_to_datetime_on_datetime64_series(self):
-        # #2699
-        s = Series(date_range('1/1/2000', periods=10))
-
-        result = to_datetime(s)
-        self.assertEqual(result[0], s[0])
-
-    def test_to_datetime_with_space_in_series(self):
-        # GH 6428
-        s = Series(['10/18/2006', '10/18/2008', ' '])
-        tm.assertRaises(ValueError, lambda: to_datetime(s, errors='raise'))
-        result_coerce = to_datetime(s, errors='coerce')
-        expected_coerce = Series([datetime(2006, 10, 18),
-                                  datetime(2008, 10, 18),
-                                  pd.NaT])
-        tm.assert_series_equal(result_coerce, expected_coerce)
-        result_ignore = to_datetime(s, errors='ignore')
-        tm.assert_series_equal(result_ignore, s)
-
-    def test_to_datetime_with_apply(self):
-        # this is only locale tested with US/None locales
-        _skip_if_has_locale()
-
-        # GH 5195
-        # with a format and coerce a single item to_datetime fails
-        td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3])
-        expected = pd.to_datetime(td, format='%b %y')
-        result = td.apply(pd.to_datetime, format='%b %y')
-        assert_series_equal(result, expected)
-
-        td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3])
-        self.assertRaises(ValueError,
-                          lambda: pd.to_datetime(td, format='%b %y',
-                                                 errors='raise'))
-        self.assertRaises(ValueError,
-                          lambda: td.apply(pd.to_datetime, format='%b %y',
-                                           errors='raise'))
-        expected = pd.to_datetime(td, format='%b %y', errors='coerce')
-
-        result = td.apply(
-            lambda x: pd.to_datetime(x, format='%b %y', errors='coerce'))
-        assert_series_equal(result, expected)
-
-    def test_nat_vector_field_access(self):
-        idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000'])
-
-        fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute',
-                  'second', 'microsecond', 'nanosecond', 'week', 'dayofyear',
-                  'days_in_month', 'is_leap_year']
-
-        for field in fields:
-            result = getattr(idx, field)
-            expected = [getattr(x, field) for x in idx]
-            self.assert_numpy_array_equal(result, np.array(expected))
-
-        s = pd.Series(idx)
-
-        for field in fields:
-            result = getattr(s.dt, field)
-            expected = [getattr(x, field) for x in idx]
-            self.assert_series_equal(result, pd.Series(expected))
-
-    def test_nat_scalar_field_access(self):
-        fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute',
-                  'second', 'microsecond', 'nanosecond', 'week', 'dayofyear',
-                  'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name']
-        for field in fields:
-            result = getattr(NaT, field)
-            self.assertTrue(np.isnan(result))
-
-    def test_NaT_methods(self):
-        # GH 9513
-        raise_methods = ['astimezone', 'combine', 'ctime', 'dst',
-                         'fromordinal', 'fromtimestamp', 'isocalendar',
-                         'strftime', 'strptime', 'time', 'timestamp',
-                         'timetuple', 'timetz', 'toordinal', 'tzname',
-                         'utcfromtimestamp', 'utcnow', 'utcoffset',
-                         'utctimetuple']
-        nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today']
-        nan_methods = ['weekday', 'isoweekday']
-
-        for method in raise_methods:
-            if hasattr(NaT, method):
-                self.assertRaises(ValueError, getattr(NaT, method))
-
-        for method in nan_methods:
-            if hasattr(NaT, method):
-                self.assertTrue(np.isnan(getattr(NaT, method)()))
-
-        for method in nat_methods:
-            if hasattr(NaT, method):
-                # see gh-8254
-                exp_warning = None
-                if method == 'to_datetime':
-                    exp_warning = FutureWarning
-                with tm.assert_produces_warning(
-                        exp_warning, check_stacklevel=False):
-                    self.assertIs(getattr(NaT, method)(), NaT)
-
-        # GH 12300
-        self.assertEqual(NaT.isoformat(), 'NaT')
-
-    def test_to_datetime_types(self):
-
-        # empty string
-        result = to_datetime('')
-        self.assertIs(result, NaT)
-
-        result = to_datetime(['', ''])
-        self.assertTrue(isnull(result).all())
-
-        # ints
-        result = Timestamp(0)
-        expected = to_datetime(0)
-        self.assertEqual(result, expected)
-
-        # GH 3888 (strings)
-        expected = to_datetime(['2012'])[0]
-        result = to_datetime('2012')
-        self.assertEqual(result, expected)
-
-        # array = ['2012','20120101','20120101 12:01:01']
-        array = ['20120101', '20120101 12:01:01']
-        expected = list(to_datetime(array))
-        result = lmap(Timestamp, array)
-        tm.assert_almost_equal(result, expected)
-
-        # currently fails ###
-        # result = Timestamp('2012')
-        # expected = to_datetime('2012')
-        # self.assertEqual(result, expected)
-
-    def test_to_datetime_unprocessable_input(self):
-        # GH 4928
-        self.assert_numpy_array_equal(
-            to_datetime([1, '1'], errors='ignore'),
-            np.array([1, '1'], dtype='O')
-        )
-        self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise')
-
-    def test_to_datetime_other_datetime64_units(self):
-        # 5/25/2012
-        scalar = np.int64(1337904000000000).view('M8[us]')
-        as_obj = scalar.astype('O')
-
-        index = DatetimeIndex([scalar])
-        self.assertEqual(index[0], scalar.astype('O'))
-
-        value = Timestamp(scalar)
-        self.assertEqual(value, as_obj)
-
-    def test_to_datetime_list_of_integers(self):
-        rng = date_range('1/1/2000', periods=20)
-        rng = DatetimeIndex(rng.values)
-
-        ints = list(rng.asi8)
-
-        result = DatetimeIndex(ints)
-
-        tm.assert_index_equal(rng, result)
-
-    def test_to_datetime_freq(self):
-        xp = bdate_range('2000-1-1', periods=10, tz='UTC')
-        rs = xp.to_datetime()
-        self.assertEqual(xp.freq, rs.freq)
-        self.assertEqual(xp.tzinfo, rs.tzinfo)
-
-    def test_range_edges(self):
-        # GH 13672
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'),
-                            end=Timestamp('1970-01-01 00:00:00.000000004'),
-                            freq='N')
-        exp = DatetimeIndex(['1970-01-01 00:00:00.000000001',
-                             '1970-01-01 00:00:00.000000002',
-                             '1970-01-01 00:00:00.000000003',
-                             '1970-01-01 00:00:00.000000004'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000004'),
-                            end=Timestamp('1970-01-01 00:00:00.000000001'),
-                            freq='N')
-        exp = DatetimeIndex([])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'),
-                            end=Timestamp('1970-01-01 00:00:00.000000001'),
-                            freq='N')
-        exp = DatetimeIndex(['1970-01-01 00:00:00.000000001'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000001'),
-                            end=Timestamp('1970-01-01 00:00:00.000004'),
-                            freq='U')
-        exp = DatetimeIndex(['1970-01-01 00:00:00.000001',
-                             '1970-01-01 00:00:00.000002',
-                             '1970-01-01 00:00:00.000003',
-                             '1970-01-01 00:00:00.000004'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.001'),
-                            end=Timestamp('1970-01-01 00:00:00.004'),
-                            freq='L')
-        exp = DatetimeIndex(['1970-01-01 00:00:00.001',
-                             '1970-01-01 00:00:00.002',
-                             '1970-01-01 00:00:00.003',
-                             '1970-01-01 00:00:00.004'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:01'),
-                            end=Timestamp('1970-01-01 00:00:04'), freq='S')
-        exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02',
-                             '1970-01-01 00:00:03', '1970-01-01 00:00:04'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 00:01'),
-                            end=Timestamp('1970-01-01 00:04'), freq='T')
-        exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02',
-                             '1970-01-01 00:03', '1970-01-01 00:04'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01 01:00'),
-                            end=Timestamp('1970-01-01 04:00'), freq='H')
-        exp = DatetimeIndex(['1970-01-01 01:00', '1970-01-01 02:00',
-                             '1970-01-01 03:00', '1970-01-01 04:00'])
-        tm.assert_index_equal(idx, exp)
-
-        idx = DatetimeIndex(start=Timestamp('1970-01-01'),
-                            end=Timestamp('1970-01-04'), freq='D')
-        exp = DatetimeIndex(['1970-01-01', '1970-01-02',
-                             '1970-01-03', '1970-01-04'])
-        tm.assert_index_equal(idx, exp)
-
-    def test_range_misspecified(self):
-        # GH #1095
-
-        self.assertRaises(ValueError, date_range, '1/1/2000')
-        self.assertRaises(ValueError, date_range, end='1/1/2000')
-        self.assertRaises(ValueError, date_range, periods=10)
-
-        self.assertRaises(ValueError, date_range, '1/1/2000', freq='H')
-        self.assertRaises(ValueError, date_range, end='1/1/2000', freq='H')
-        self.assertRaises(ValueError, date_range, periods=10, freq='H')
-
-    def test_reasonable_keyerror(self):
-        # GH #1062
-        index = DatetimeIndex(['1/3/2000'])
-        try:
-            index.get_loc('1/1/2000')
-        except KeyError as e:
-            self.assertIn('2000', str(e))
-
-    def test_reindex_with_datetimes(self):
-        rng = date_range('1/1/2000', periods=20)
-        ts = Series(np.random.randn(20), index=rng)
-
-        result = ts.reindex(list(ts.index[5:10]))
-        expected = ts[5:10]
-        tm.assert_series_equal(result, expected)
-
-        result = ts[list(ts.index[5:10])]
-        tm.assert_series_equal(result, expected)
-
-    def test_asfreq_keep_index_name(self):
-        # GH #9854
-        index_name = 'bar'
-        index = pd.date_range('20130101', periods=20, name=index_name)
-        df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index)
-
-        self.assertEqual(index_name, df.index.name)
-        self.assertEqual(index_name, df.asfreq('10D').index.name)
-
-    def test_promote_datetime_date(self):
-        rng = date_range('1/1/2000', periods=20)
-        ts = Series(np.random.randn(20), index=rng)
-
-        ts_slice = ts[5:]
-        ts2 = ts_slice.copy()
-        ts2.index = [x.date() for x in ts2.index]
-
-        result = ts + ts2
-        result2 = ts2 + ts
-        expected = ts + ts[5:]
-        assert_series_equal(result, expected)
-        assert_series_equal(result2, expected)
-
-        # test asfreq
-        result = ts2.asfreq('4H', method='ffill')
-        expected = ts[5:].asfreq('4H', method='ffill')
-        assert_series_equal(result, expected)
-
-        result = rng.get_indexer(ts2.index)
-        expected = rng.get_indexer(ts_slice.index)
-        self.assert_numpy_array_equal(result, expected)
-
-    def test_asfreq_normalize(self):
-        rng = date_range('1/1/2000 09:30', periods=20)
-        norm = date_range('1/1/2000', periods=20)
-        vals = np.random.randn(20)
-        ts = Series(vals, index=rng)
-
-        result = ts.asfreq('D', normalize=True)
-        norm = date_range('1/1/2000', periods=20)
-        expected = Series(vals, index=norm)
-
-        assert_series_equal(result, expected)
-
-        vals = np.random.randn(20, 3)
-        ts = DataFrame(vals, index=rng)
-
-        result = ts.asfreq('D', normalize=True)
-        expected = DataFrame(vals, index=norm)
-
-        assert_frame_equal(result, expected)
-
-    def test_date_range_gen_error(self):
-        rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min')
-        self.assertEqual(len(rng), 4)
-
-    def test_date_range_negative_freq(self):
-        # GH 11018
-        rng = date_range('2011-12-31', freq='-2A', periods=3)
-        exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31',
-                                '2007-12-31'], freq='-2A')
-        tm.assert_index_equal(rng, exp)
-        self.assertEqual(rng.freq, '-2A')
-
-        rng = date_range('2011-01-31', freq='-2M', periods=3)
-        exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30',
-                                '2010-09-30'], freq='-2M')
-        tm.assert_index_equal(rng, exp)
-        self.assertEqual(rng.freq, '-2M')
-
-    def test_date_range_bms_bug(self):
-        # #1645
-        rng = date_range('1/1/2000', periods=10, freq='BMS')
-
-        ex_first = Timestamp('2000-01-03')
-        self.assertEqual(rng[0], ex_first)
-
-    def test_date_range_businesshour(self):
-        idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00',
-                             '2014-07-04 11:00',
-                             '2014-07-04 12:00', '2014-07-04 13:00',
-                             '2014-07-04 14:00',
-                             '2014-07-04 15:00', '2014-07-04 16:00'],
-                            freq='BH')
-        rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH')
-        tm.assert_index_equal(idx, rng)
-
-        idx = DatetimeIndex(
-            ['2014-07-04 16:00', '2014-07-07 09:00'], freq='BH')
-        rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH')
-        tm.assert_index_equal(idx, rng)
-
-        idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00',
-                             '2014-07-04 11:00',
-                             '2014-07-04 12:00', '2014-07-04 13:00',
-                             '2014-07-04 14:00',
-                             '2014-07-04 15:00', '2014-07-04 16:00',
-                             '2014-07-07 09:00', '2014-07-07 10:00',
-                             '2014-07-07 11:00',
-                             '2014-07-07 12:00', '2014-07-07 13:00',
-                             '2014-07-07 14:00',
-                             '2014-07-07 15:00', '2014-07-07 16:00',
-                             '2014-07-08 09:00', '2014-07-08 10:00',
-                             '2014-07-08 11:00',
-                             '2014-07-08 12:00', '2014-07-08 13:00',
-                             '2014-07-08 14:00',
-                             '2014-07-08 15:00', '2014-07-08 16:00'],
-                            freq='BH')
-        rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH')
-        tm.assert_index_equal(idx, rng)
-
-    def test_first_subset(self):
-        ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h')
-        result = ts.first('10d')
-        self.assertEqual(len(result), 20)
-
-        ts = _simple_ts('1/1/2000', '1/1/2010')
-        result = ts.first('10d')
-        self.assertEqual(len(result), 10)
-
-        result = ts.first('3M')
-        expected = ts[:'3/31/2000']
-        assert_series_equal(result, expected)
-
-        result = ts.first('21D')
-        expected = ts[:21]
-        assert_series_equal(result, expected)
-
-        result = ts[:0].first('3M')
-        assert_series_equal(result, ts[:0])
-
-    def test_last_subset(self):
-        ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h')
-        result = ts.last('10d')
-        self.assertEqual(len(result), 20)
-
-        ts = _simple_ts('1/1/2000', '1/1/2010')
-        result = ts.last('10d')
-        self.assertEqual(len(result), 10)
-
-        result = ts.last('21D')
-        expected = ts['12/12/2009':]
-        assert_series_equal(result, expected)
-
-        result = ts.last('21D')
-        expected = ts[-21:]
-        assert_series_equal(result, expected)
-
-        result = ts[:0].last('3M')
-        assert_series_equal(result, ts[:0])
-
-    def test_format_pre_1900_dates(self):
-        rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC')
-        rng.format()
-        ts = Series(1, index=rng)
-        repr(ts)
-
-    def test_at_time(self):
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        rs = ts.at_time(rng[1])
-        self.assertTrue((rs.index.hour == rng[1].hour).all())
-        self.assertTrue((rs.index.minute == rng[1].minute).all())
-        self.assertTrue((rs.index.second == rng[1].second).all())
-
-        result = ts.at_time('9:30')
-        expected = ts.at_time(time(9, 30))
-        assert_series_equal(result, expected)
-
-        df = DataFrame(np.random.randn(len(rng), 3), index=rng)
-
-        result = ts[time(9, 30)]
-        result_df = df.loc[time(9, 30)]
-        expected = ts[(rng.hour == 9) & (rng.minute == 30)]
-        exp_df = df[(rng.hour == 9) & (rng.minute == 30)]
-
-        # expected.index = date_range('1/1/2000', '1/4/2000')
-
-        assert_series_equal(result, expected)
-        tm.assert_frame_equal(result_df, exp_df)
-
-        chunk = df.loc['1/4/2000':]
-        result = chunk.loc[time(9, 30)]
-        expected = result_df[-1:]
-        tm.assert_frame_equal(result, expected)
-
-        # midnight, everything
-        rng = date_range('1/1/2000', '1/31/2000')
-        ts = Series(np.random.randn(len(rng)), index=rng)
-
-        result = ts.at_time(time(0, 0))
-        assert_series_equal(result, ts)
-
-        # time doesn't exist
-        rng = date_range('1/1/2012', freq='23Min', periods=384)
-        ts = Series(np.random.randn(len(rng)), rng)
-        rs = ts.at_time('16:00')
-        self.assertEqual(len(rs), 0)
-
-    def test_at_time_frame(self):
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
-        rs = ts.at_time(rng[1])
-        self.assertTrue((rs.index.hour == rng[1].hour).all())
-        self.assertTrue((rs.index.minute == rng[1].minute).all())
-        self.assertTrue((rs.index.second == rng[1].second).all())
-
-        result = ts.at_time('9:30')
-        expected = ts.at_time(time(9, 30))
-        assert_frame_equal(result, expected)
-
-        result = ts.loc[time(9, 30)]
-        expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
-
-        assert_frame_equal(result, expected)
-
-        # midnight, everything
-        rng = date_range('1/1/2000', '1/31/2000')
-        ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
-
-        result = ts.at_time(time(0, 0))
-        assert_frame_equal(result, ts)
-
-        # time doesn't exist
-        rng = date_range('1/1/2012', freq='23Min', periods=384)
-        ts = DataFrame(np.random.randn(len(rng), 2), rng)
-        rs = ts.at_time('16:00')
-        self.assertEqual(len(rs), 0)
-
-    def test_between_time(self):
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        stime = time(0, 0)
-        etime = time(1, 0)
-
-        close_open = product([True, False], [True, False])
-        for inc_start, inc_end in close_open:
-            filtered = ts.between_time(stime, etime, inc_start, inc_end)
-            exp_len = 13 * 4 + 1
-            if not inc_start:
-                exp_len -= 5
-            if not inc_end:
-                exp_len -= 4
-
-            self.assertEqual(len(filtered), exp_len)
-            for rs in filtered.index:
-                t = rs.time()
-                if inc_start:
-                    self.assertTrue(t >= stime)
-                else:
-                    self.assertTrue(t > stime)
-
-                if inc_end:
-                    self.assertTrue(t <= etime)
-                else:
-                    self.assertTrue(t < etime)
-
-        result = ts.between_time('00:00', '01:00')
-        expected = ts.between_time(stime, etime)
-        assert_series_equal(result, expected)
-
-        # across midnight
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        stime = time(22, 0)
-        etime = time(9, 0)
-
-        close_open = product([True, False], [True, False])
-        for inc_start, inc_end in close_open:
-            filtered = ts.between_time(stime, etime, inc_start, inc_end)
-            exp_len = (12 * 11 + 1) * 4 + 1
-            if not inc_start:
-                exp_len -= 4
-            if not inc_end:
-                exp_len -= 4
-
-            self.assertEqual(len(filtered), exp_len)
-            for rs in filtered.index:
-                t = rs.time()
-                if inc_start:
-                    self.assertTrue((t >= stime) or (t <= etime))
-                else:
-                    self.assertTrue((t > stime) or (t <= etime))
-
-                if inc_end:
-                    self.assertTrue((t <= etime) or (t >= stime))
-                else:
-                    self.assertTrue((t < etime) or (t >= stime))
-
-    def test_between_time_frame(self):
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
-        stime = time(0, 0)
-        etime = time(1, 0)
-
-        close_open = product([True, False], [True, False])
-        for inc_start, inc_end in close_open:
-            filtered = ts.between_time(stime, etime, inc_start, inc_end)
-            exp_len = 13 * 4 + 1
-            if not inc_start:
-                exp_len -= 5
-            if not inc_end:
-                exp_len -= 4
-
-            self.assertEqual(len(filtered), exp_len)
-            for rs in filtered.index:
-                t = rs.time()
-                if inc_start:
-                    self.assertTrue(t >= stime)
-                else:
-                    self.assertTrue(t > stime)
-
-                if inc_end:
-                    self.assertTrue(t <= etime)
-                else:
-                    self.assertTrue(t < etime)
-
-        result = ts.between_time('00:00', '01:00')
-        expected = ts.between_time(stime, etime)
-        assert_frame_equal(result, expected)
-
-        # across midnight
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
-        stime = time(22, 0)
-        etime = time(9, 0)
-
-        close_open = product([True, False], [True, False])
-        for inc_start, inc_end in close_open:
-            filtered = ts.between_time(stime, etime, inc_start, inc_end)
-            exp_len = (12 * 11 + 1) * 4 + 1
-            if not inc_start:
-                exp_len -= 4
-            if not inc_end:
-                exp_len -= 4
-
-            self.assertEqual(len(filtered), exp_len)
-            for rs in filtered.index:
-                t = rs.time()
-                if inc_start:
-                    self.assertTrue((t >= stime) or (t <= etime))
-                else:
-                    self.assertTrue((t > stime) or (t <= etime))
-
-                if inc_end:
-                    self.assertTrue((t <= etime) or (t >= stime))
-                else:
-                    self.assertTrue((t < etime) or (t >= stime))
-
-    def test_between_time_types(self):
-        # GH11818
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        self.assertRaises(ValueError, rng.indexer_between_time,
-                          datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5))
-
-        frame = DataFrame({'A': 0}, index=rng)
-        self.assertRaises(ValueError, frame.between_time,
-                          datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5))
-
-        series = Series(0, index=rng)
-        self.assertRaises(ValueError, series.between_time,
-                          datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5))
-
-    def test_between_time_formats(self):
-        # GH11818
-        _skip_if_has_locale()
-
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
-        ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
-
-        strings = [("2:00", "2:30"), ("0200", "0230"), ("2:00am", "2:30am"),
-                   ("0200am", "0230am"), ("2:00:00", "2:30:00"),
-                   ("020000", "023000"), ("2:00:00am", "2:30:00am"),
-                   ("020000am", "023000am")]
-        expected_length = 28
-
-        for time_string in strings:
-            self.assertEqual(len(ts.between_time(*time_string)),
-                             expected_length,
-                             "%s - %s" % time_string)
-
-    def test_dti_constructor_preserve_dti_freq(self):
-        rng = date_range('1/1/2000', '1/2/2000', freq='5min')
-
-        rng2 = DatetimeIndex(rng)
-        self.assertEqual(rng.freq, rng2.freq)
-
-    def test_dti_constructor_years_only(self):
-        # GH 6961
-        for tz in [None, 'UTC', 'Asia/Tokyo', 'dateutil/US/Pacific']:
-            rng1 = date_range('2014', '2015', freq='M', tz=tz)
-            expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz)
-
-            rng2 = date_range('2014', '2015', freq='MS', tz=tz)
-            expected2 = date_range('2014-01-01', '2015-01-01', freq='MS',
-                                   tz=tz)
-
-            rng3 = date_range('2014', '2020', freq='A', tz=tz)
-            expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz)
-
-            rng4 = date_range('2014', '2020', freq='AS', tz=tz)
-            expected4 = date_range('2014-01-01', '2020-01-01', freq='AS',
-                                   tz=tz)
-
-            for rng, expected in [(rng1, expected1), (rng2, expected2),
-                                  (rng3, expected3), (rng4, expected4)]:
-                tm.assert_index_equal(rng, expected)
-
-    def test_dti_constructor_small_int(self):
-        # GH 13721
-        exp = DatetimeIndex(['1970-01-01 00:00:00.00000000',
-                             '1970-01-01 00:00:00.00000001',
-                             '1970-01-01 00:00:00.00000002'])
-
-        for dtype in [np.int64, np.int32, np.int16, np.int8]:
-            arr = np.array([0, 10, 20], dtype=dtype)
-            tm.assert_index_equal(DatetimeIndex(arr), exp)
-
-    def test_dti_constructor_numpy_timeunits(self):
-        # GH 9114
-        base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'])
-
-        for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]',
-                      'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']:
-            values = base.values.astype(dtype)
-
-            tm.assert_index_equal(DatetimeIndex(values), base)
-            tm.assert_index_equal(to_datetime(values), base)
-
-    def test_normalize(self):
-        rng = date_range('1/1/2000 9:30', periods=10, freq='D')
-
-        result = rng.normalize()
-        expected = date_range('1/1/2000', periods=10, freq='D')
-        tm.assert_index_equal(result, expected)
-
-        rng_ns = pd.DatetimeIndex(np.array([1380585623454345752,
-                                            1380585612343234312]).astype(
-                                                "datetime64[ns]"))
-        rng_ns_normalized = rng_ns.normalize()
-        expected = pd.DatetimeIndex(np.array([1380585600000000000,
-                                              1380585600000000000]).astype(
-                                                  "datetime64[ns]"))
-        tm.assert_index_equal(rng_ns_normalized, expected)
-
-        self.assertTrue(result.is_normalized)
-        self.assertFalse(rng.is_normalized)
-
-    def test_to_period(self):
-        from pandas.tseries.period import period_range
-
-        ts = _simple_ts('1/1/2000', '1/1/2001')
-
-        pts = ts.to_period()
-        exp = ts.copy()
-        exp.index = period_range('1/1/2000', '1/1/2001')
-        assert_series_equal(pts, exp)
-
-        pts = ts.to_period('M')
-        exp.index = exp.index.asfreq('M')
-        tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
-        assert_series_equal(pts, exp)
-
-        # GH 7606 without freq
-        idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
-                             '2011-01-04'])
-        exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03',
-                                  '2011-01-04'], freq='D')
-
-        s = Series(np.random.randn(4), index=idx)
-        expected = s.copy()
-        expected.index = exp_idx
-        assert_series_equal(s.to_period(), expected)
-
-        df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx)
-        expected = df.copy()
-        expected.index = exp_idx
-        assert_frame_equal(df.to_period(), expected)
-
-        expected = df.copy()
-        expected.columns = exp_idx
-        assert_frame_equal(df.to_period(axis=1), expected)
-
-    def create_dt64_based_index(self):
-        data = [Timestamp('2007-01-01 10:11:12.123456Z'),
-                Timestamp('2007-01-01 10:11:13.789123Z')]
-        index = DatetimeIndex(data)
-        return index
-
-    def test_to_period_millisecond(self):
-        index = self.create_dt64_based_index()
-
-        period = index.to_period(freq='L')
-        self.assertEqual(2, len(period))
-        self.assertEqual(period[0], Period('2007-01-01 10:11:12.123Z', 'L'))
-        self.assertEqual(period[1], Period('2007-01-01 10:11:13.789Z', 'L'))
-
-    def test_to_period_microsecond(self):
-        index = self.create_dt64_based_index()
-
-        period = index.to_period(freq='U')
-        self.assertEqual(2, len(period))
-        self.assertEqual(period[0], Period('2007-01-01 10:11:12.123456Z', 'U'))
-        self.assertEqual(period[1], Period('2007-01-01 10:11:13.789123Z', 'U'))
-
-    def test_to_period_tz_pytz(self):
-        tm._skip_if_no_pytz()
-        from dateutil.tz import tzlocal
-        from pytz import utc as UTC
-
-        xp = date_range('1/1/2000', '4/1/2000').to_period()
-
-        ts = date_range('1/1/2000', '4/1/2000', tz='US/Eastern')
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertEqual(result, expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=UTC)
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertEqual(result, expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal())
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertEqual(result, expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-    def test_to_period_tz_explicit_pytz(self):
-        tm._skip_if_no_pytz()
-        import pytz
-        from dateutil.tz import tzlocal
-
-        xp = date_range('1/1/2000', '4/1/2000').to_period()
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=pytz.timezone('US/Eastern'))
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertTrue(result == expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc)
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertTrue(result == expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal())
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertTrue(result == expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-    def test_to_period_tz_dateutil(self):
-        tm._skip_if_no_dateutil()
-        import dateutil
-        from dateutil.tz import tzlocal
-
-        xp = date_range('1/1/2000', '4/1/2000').to_period()
-
-        ts = date_range('1/1/2000', '4/1/2000', tz='dateutil/US/Eastern')
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertTrue(result == expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc())
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertTrue(result == expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-        ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal())
-
-        result = ts.to_period()[0]
-        expected = ts[0].to_period()
-
-        self.assertTrue(result == expected)
-        tm.assert_index_equal(ts.to_period(), xp)
-
-    def test_frame_to_period(self):
-        K = 5
-        from pandas.tseries.period import period_range
-
-        dr = date_range('1/1/2000', '1/1/2001')
-        pr = period_range('1/1/2000', '1/1/2001')
-        df = DataFrame(randn(len(dr), K), index=dr)
-        df['mix'] = 'a'
-
-        pts = df.to_period()
-        exp = df.copy()
-        exp.index = pr
-        assert_frame_equal(pts, exp)
-
-        pts = df.to_period('M')
-        tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
-
-        df = df.T
-        pts = df.to_period(axis=1)
-        exp = df.copy()
-        exp.columns = pr
-        assert_frame_equal(pts, exp)
-
-        pts = df.to_period('M', axis=1)
-        tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))
-
-        self.assertRaises(ValueError, df.to_period, axis=2)
-
-    def test_timestamp_fields(self):
-        # extra fields from DatetimeIndex like quarter and week
-        idx = tm.makeDateIndex(100)
-
-        fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter',
-                  'days_in_month', 'is_month_start', 'is_month_end',
-                  'is_quarter_start', 'is_quarter_end', 'is_year_start',
-                  'is_year_end', 'weekday_name']
-        for f in fields:
-            expected = getattr(idx, f)[-1]
-            result = getattr(Timestamp(idx[-1]), f)
-            self.assertEqual(result, expected)
-
-        self.assertEqual(idx.freq, Timestamp(idx[-1], idx.freq).freq)
-        self.assertEqual(idx.freqstr, Timestamp(idx[-1], idx.freq).freqstr)
-
-    def test_woy_boundary(self):
-        # make sure weeks at year boundaries are correct
-        d = datetime(2013, 12, 31)
-        result = Timestamp(d).week
-        expected = 1  # ISO standard
-        self.assertEqual(result, expected)
-
-        d = datetime(2008, 12, 28)
-        result = Timestamp(d).week
-        expected = 52  # ISO standard
-        self.assertEqual(result, expected)
-
-        d = datetime(2009, 12, 31)
-        result = Timestamp(d).week
-        expected = 53  # ISO standard
-        self.assertEqual(result, expected)
-
-        d = datetime(2010, 1, 1)
-        result = Timestamp(d).week
-        expected = 53  # ISO standard
-        self.assertEqual(result, expected)
-
-        d = datetime(2010, 1, 3)
-        result = Timestamp(d).week
-        expected = 53  # ISO standard
-        self.assertEqual(result, expected)
-
-        result = np.array([Timestamp(datetime(*args)).week
-                           for args in [(2000, 1, 1), (2000, 1, 2), (
-                               2005, 1, 1), (2005, 1, 2)]])
-        self.assertTrue((result == [52, 52, 53, 53]).all())
-
-    def test_timestamp_date_out_of_range(self):
-        self.assertRaises(ValueError, Timestamp, '1676-01-01')
-        self.assertRaises(ValueError, Timestamp, '2263-01-01')
-
-        # 1475
-        self.assertRaises(ValueError, DatetimeIndex, ['1400-01-01'])
-        self.assertRaises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)])
-
-    def test_compat_replace(self):
-        # https://github.com/statsmodels/statsmodels/issues/3349
-        # replace should take ints/longs for compat
-
-        for f in [compat.long, int]:
-            result = date_range(Timestamp('1960-04-01 00:00:00',
-                                          freq='QS-JAN'),
-                                periods=f(76),
-                                freq='QS-JAN')
-            self.assertEqual(len(result), 76)
-
-    def test_timestamp_repr(self):
-        # pre-1900
-        stamp = Timestamp('1850-01-01', tz='US/Eastern')
-        repr(stamp)
-
-        iso8601 = '1850-01-01 01:23:45.012345'
-        stamp = Timestamp(iso8601, tz='US/Eastern')
-        result = repr(stamp)
-        self.assertIn(iso8601, result)
-
-    def test_timestamp_from_ordinal(self):
-
-        # GH 3042
-        dt = datetime(2011, 4, 16, 0, 0)
-        ts = Timestamp.fromordinal(dt.toordinal())
-        self.assertEqual(ts.to_pydatetime(), dt)
-
-        # with a tzinfo
-        stamp = Timestamp('2011-4-16', tz='US/Eastern')
-        dt_tz = stamp.to_pydatetime()
-        ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern')
-        self.assertEqual(ts.to_pydatetime(), dt_tz)
-
-    def test_datetimeindex_integers_shift(self):
-        rng = date_range('1/1/2000', periods=20)
-
-        result = rng + 5
-        expected = rng.shift(5)
-        tm.assert_index_equal(result, expected)
-
-        result = rng - 5
-        expected = rng.shift(-5)
-        tm.assert_index_equal(result, expected)
-
-    def test_astype_object(self):
-        # NumPy 1.6.1 weak ns support
-        rng = date_range('1/1/2000', periods=20)
-
-        casted = rng.astype('O')
-        exp_values = list(rng)
-
-        tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_))
-        self.assertEqual(casted.tolist(), exp_values)
-
-    def test_catch_infinite_loop(self):
-        offset = offsets.DateOffset(minute=5)
-        # blow up, don't loop forever
-        self.assertRaises(Exception, date_range, datetime(2011, 11, 11),
-                          datetime(2011, 11, 12), freq=offset)
-
-    def test_append_concat(self):
-        rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
-        ts = Series(np.random.randn(len(rng)), rng)
-        df = DataFrame(np.random.randn(len(rng), 4), index=rng)
-
-        result = ts.append(ts)
-        result_df = df.append(df)
-        ex_index = DatetimeIndex(np.tile(rng.values, 2))
-        tm.assert_index_equal(result.index, ex_index)
-        tm.assert_index_equal(result_df.index, ex_index)
-
-        appended = rng.append(rng)
-        tm.assert_index_equal(appended, ex_index)
-
-        appended = rng.append([rng, rng])
-        ex_index = DatetimeIndex(np.tile(rng.values, 3))
-        tm.assert_index_equal(appended, ex_index)
-
-        # different index names
-        rng1 = rng.copy()
-        rng2 = rng.copy()
-        rng1.name = 'foo'
-        rng2.name = 'bar'
-        self.assertEqual(rng1.append(rng1).name, 'foo')
-        self.assertIsNone(rng1.append(rng2).name)
-
-    def test_append_concat_tz(self):
-        # GH 2938
-        tm._skip_if_no_pytz()
-
-        rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
-                         tz='US/Eastern')
-        rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
-                          tz='US/Eastern')
-        rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
-                          tz='US/Eastern')
-        ts = Series(np.random.randn(len(rng)), rng)
-        df = DataFrame(np.random.randn(len(rng), 4), index=rng)
-        ts2 = Series(np.random.randn(len(rng2)), rng2)
-        df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
-
-        result = ts.append(ts2)
-        result_df = df.append(df2)
-        tm.assert_index_equal(result.index, rng3)
-        tm.assert_index_equal(result_df.index, rng3)
-
-        appended = rng.append(rng2)
-        tm.assert_index_equal(appended, rng3)
-
-    def test_append_concat_tz_explicit_pytz(self):
-        # GH 2938
-        tm._skip_if_no_pytz()
-        from pytz import timezone as timezone
-
-        rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
-                         tz=timezone('US/Eastern'))
-        rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
-                          tz=timezone('US/Eastern'))
-        rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
-                          tz=timezone('US/Eastern'))
-        ts = Series(np.random.randn(len(rng)), rng)
-        df = DataFrame(np.random.randn(len(rng), 4), index=rng)
-        ts2 = Series(np.random.randn(len(rng2)), rng2)
-        df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
-
-        result = ts.append(ts2)
-        result_df = df.append(df2)
-        tm.assert_index_equal(result.index, rng3)
-        tm.assert_index_equal(result_df.index, rng3)
-
-        appended = rng.append(rng2)
-        tm.assert_index_equal(appended, rng3)
-
-    def test_append_concat_tz_dateutil(self):
-        # GH 2938
-        tm._skip_if_no_dateutil()
-        rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
-                         tz='dateutil/US/Eastern')
-        rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
-                          tz='dateutil/US/Eastern')
-        rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
-                          tz='dateutil/US/Eastern')
-        ts = Series(np.random.randn(len(rng)), rng)
-        df = DataFrame(np.random.randn(len(rng), 4), index=rng)
-        ts2 = Series(np.random.randn(len(rng2)), rng2)
-        df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
-
-        result = ts.append(ts2)
-        result_df = df.append(df2)
-        tm.assert_index_equal(result.index, rng3)
-        tm.assert_index_equal(result_df.index, rng3)
-
-        appended = rng.append(rng2)
-        tm.assert_index_equal(appended, rng3)
-
-    def test_set_dataframe_column_ns_dtype(self):
-        x = DataFrame([datetime.now(), datetime.now()])
-        self.assertEqual(x[0].dtype, np.dtype('M8[ns]'))
-
-    def test_groupby_count_dateparseerror(self):
-        dr = date_range(start='1/1/2012', freq='5min', periods=10)
-
-        # BAD Example, datetimes first
-        s = Series(np.arange(10), index=[dr, lrange(10)])
-        grouped = s.groupby(lambda x: x[1] % 2 == 0)
-        result = grouped.count()
-
-        s = Series(np.arange(10), index=[lrange(10), dr])
-        grouped = s.groupby(lambda x: x[0] % 2 == 0)
-        expected = grouped.count()
-
-        assert_series_equal(result, expected)
-
-    def test_datetimeindex_repr_short(self):
-        dr = date_range(start='1/1/2012', periods=1)
-        repr(dr)
-
-        dr = date_range(start='1/1/2012', periods=2)
-        repr(dr)
-
-        dr = date_range(start='1/1/2012', periods=3)
-        repr(dr)
-
-    def test_constructor_int64_nocopy(self):
-        # #1624
-        arr = np.arange(1000, dtype=np.int64)
-        index = DatetimeIndex(arr)
-
-        arr[50:100] = -1
-        self.assertTrue((index.asi8[50:100] == -1).all())
-
-        arr = np.arange(1000, dtype=np.int64)
-        index = DatetimeIndex(arr, copy=True)
-
-        arr[50:100] = -1
-        self.assertTrue((index.asi8[50:100] != -1).all())
-
-    def test_series_interpolate_method_values(self):
-        # #1646
-        ts = _simple_ts('1/1/2000', '1/20/2000')
-        ts[::2] = np.nan
-
-        result = ts.interpolate(method='values')
-        exp = ts.interpolate()
-        assert_series_equal(result, exp)
-
-    def test_frame_datetime64_handling_groupby(self):
-        # it works!
-        df = DataFrame([(3, np.datetime64('2012-07-03')),
-                        (3, np.datetime64('2012-07-04'))],
-                       columns=['a', 'date'])
-        result = df.groupby('a').first()
-        self.assertEqual(result['date'][3], Timestamp('2012-07-03'))
-
-    def test_series_interpolate_intraday(self):
-        # #1698
-        index = pd.date_range('1/1/2012', periods=4, freq='12D')
-        ts = pd.Series([0, 12, 24, 36], index)
-        new_index = index.append(index + pd.DateOffset(days=1)).sort_values()
-
-        exp = ts.reindex(new_index).interpolate(method='time')
-
-        index = pd.date_range('1/1/2012', periods=4, freq='12H')
-        ts = pd.Series([0, 12, 24, 36], index)
-        new_index = index.append(index + pd.DateOffset(hours=1)).sort_values()
-        result = ts.reindex(new_index).interpolate(method='time')
-
-        self.assert_numpy_array_equal(result.values, exp.values)
-
-    def test_frame_dict_constructor_datetime64_1680(self):
-        dr = date_range('1/1/2012', periods=10)
-        s = Series(dr, index=dr)
-
-        # it works!
-        DataFrame({'a': 'foo', 'b': s}, index=dr)
-        DataFrame({'a': 'foo', 'b': s.values}, index=dr)
-
-    def test_frame_datetime64_mixed_index_ctor_1681(self):
-        dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
-        ts = Series(dr)
-
-        # it works!
-        d = DataFrame({'A': 'foo', 'B': ts}, index=dr)
-        self.assertTrue(d['B'].isnull().all())
-
-    def test_frame_timeseries_to_records(self):
-        index = date_range('1/1/2000', periods=10)
-        df = DataFrame(np.random.randn(10, 3), index=index,
-                       columns=['a', 'b', 'c'])
-
-        result = df.to_records()
-        result['index'].dtype == 'M8[ns]'
-
-        result = df.to_records(index=False)
-
-    def test_frame_datetime64_duplicated(self):
-        dates = date_range('2010-07-01', end='2010-08-05')
-
-        tst = DataFrame({'symbol': 'AAA', 'date': dates})
-        result = tst.duplicated(['date', 'symbol'])
-        self.assertTrue((-result).all())
-
-        tst = DataFrame({'date': dates})
-        result = tst.duplicated()
-        self.assertTrue((-result).all())
-
-    def test_timestamp_compare_with_early_datetime(self):
-        # e.g. datetime.min
-        stamp = Timestamp('2012-01-01')
-
-        self.assertFalse(stamp == datetime.min)
-        self.assertFalse(stamp == datetime(1600, 1, 1))
-        self.assertFalse(stamp == datetime(2700, 1, 1))
-        self.assertNotEqual(stamp, datetime.min)
-        self.assertNotEqual(stamp, datetime(1600, 1, 1))
-        self.assertNotEqual(stamp, datetime(2700, 1, 1))
-        self.assertTrue(stamp > datetime(1600, 1, 1))
-        self.assertTrue(stamp >= datetime(1600, 1, 1))
-        self.assertTrue(stamp < datetime(2700, 1, 1))
-        self.assertTrue(stamp <= datetime(2700, 1, 1))
-
-    def test_to_html_timestamp(self):
-        rng = date_range('2000-01-01', periods=10)
-        df = DataFrame(np.random.randn(10, 4), index=rng)
-
-        result = df.to_html()
-        self.assertIn('2000-01-01', result)
-
-    def test_to_csv_numpy_16_bug(self):
-        frame = DataFrame({'a': date_range('1/1/2000', periods=10)})
-
-        buf = StringIO()
-        frame.to_csv(buf)
-
-        result = buf.getvalue()
-        self.assertIn('2000-01-01', result)
-
-    def test_series_map_box_timestamps(self):
-        # #2689, #2627
-        s = Series(date_range('1/1/2000', periods=10))
-
-        def f(x):
-            return (x.hour, x.day, x.month)
-
-        # it works!
-        s.map(f)
-        s.apply(f)
-        DataFrame(s).applymap(f)
-
-    def test_series_map_box_timedelta(self):
-        # GH 11349
-        s = Series(timedelta_range('1 day 1 s', periods=5, freq='h'))
-
-        def f(x):
-            return x.total_seconds()
-
-        s.map(f)
-        s.apply(f)
-        DataFrame(s).applymap(f)
-
-    def test_concat_datetime_datetime64_frame(self):
-        # #2624
-        rows = []
-        rows.append([datetime(2010, 1, 1), 1])
-        rows.append([datetime(2010, 1, 2), 'hi'])
-
-        df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
-
-        ind = date_range(start="2000/1/1", freq="D", periods=10)
-        df1 = DataFrame({'date': ind, 'test': lrange(10)})
-
-        # it works!
-        pd.concat([df1, df2_obj])
-
-    def test_asfreq_resample_set_correct_freq(self):
-        # GH5613
-        # we test if .asfreq() and .resample() set the correct value for .freq
-        df = pd.DataFrame({'date': ["2012-01-01", "2012-01-02", "2012-01-03"],
-                           'col': [1, 2, 3]})
-        df = df.set_index(pd.to_datetime(df.date))
-
-        # testing the settings before calling .asfreq() and .resample()
-        self.assertEqual(df.index.freq, None)
-        self.assertEqual(df.index.inferred_freq, 'D')
-
-        # does .asfreq() set .freq correctly?
-        self.assertEqual(df.asfreq('D').index.freq, 'D')
-
-        # does .resample() set .freq correctly?
-        self.assertEqual(df.resample('D').asfreq().index.freq, 'D')
-
-    def test_pickle(self):
-
-        # GH4606
-        p = self.round_trip_pickle(NaT)
-        self.assertTrue(p is NaT)
-
-        idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06'])
-        idx_p = self.round_trip_pickle(idx)
-        self.assertTrue(idx_p[0] == idx[0])
-        self.assertTrue(idx_p[1] is NaT)
-        self.assertTrue(idx_p[2] == idx[2])
-
-        # GH11002
-        # don't infer freq
-        idx = date_range('1750-1-1', '2050-1-1', freq='7D')
-        idx_p = self.round_trip_pickle(idx)
-        tm.assert_index_equal(idx, idx_p)
-
-    def test_timestamp_equality(self):
-
-        # GH 11034
-        s = Series([Timestamp('2000-01-29 01:59:00'), 'NaT'])
-        result = s != s
-        assert_series_equal(result, Series([False, True]))
-        result = s != s[0]
-        assert_series_equal(result, Series([False, True]))
-        result = s != s[1]
-        assert_series_equal(result, Series([True, True]))
-
-        result = s == s
-        assert_series_equal(result, Series([True, False]))
-        result = s == s[0]
-        assert_series_equal(result, Series([True, False]))
-        result = s == s[1]
-        assert_series_equal(result, Series([False, False]))
-
-
-def _simple_ts(start, end, freq='D'):
-    rng = date_range(start, end, freq=freq)
-    return Series(np.random.randn(len(rng)), index=rng)
-
-
-class TestToDatetime(tm.TestCase):
-    _multiprocess_can_split_ = True
-
-    def test_to_datetime_dt64s(self):
-        in_bound_dts = [
-            np.datetime64('2000-01-01'),
-            np.datetime64('2000-01-02'),
-        ]
-
-        for dt in in_bound_dts:
-            self.assertEqual(pd.to_datetime(dt), Timestamp(dt))
-
-        oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ]
-
-        for dt in oob_dts:
-            self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise')
-            self.assertRaises(ValueError, tslib.Timestamp, dt)
-            self.assertIs(pd.to_datetime(dt, errors='coerce'), NaT)
-
-    def test_to_datetime_array_of_dt64s(self):
-        dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ]
-
-        # Assuming all datetimes are in bounds, to_datetime() returns
-        # an array that is equal to Timestamp() parsing
-        self.assert_numpy_array_equal(
-            pd.to_datetime(dts, box=False),
-            np.array([Timestamp(x).asm8 for x in dts])
-        )
-
-        # A list of datetimes where the last one is out of bounds
-        dts_with_oob = dts + [np.datetime64('9999-01-01')]
-
-        self.assertRaises(ValueError, pd.to_datetime, dts_with_oob,
-                          errors='raise')
-
-        self.assert_numpy_array_equal(
-            pd.to_datetime(dts_with_oob, box=False, errors='coerce'),
-            np.array(
-                [
-                    Timestamp(dts_with_oob[0]).asm8,
-                    Timestamp(dts_with_oob[1]).asm8,
-                    iNaT,
-                ],
-                dtype='M8'
-            )
-        )
-
-        # With errors='ignore', out of bounds datetime64s
-        # are converted to their .item(), which depending on the version of
-        # numpy is either a python datetime.datetime or datetime.date
-        self.assert_numpy_array_equal(
-            pd.to_datetime(dts_with_oob, box=False, errors='ignore'),
-            np.array(
-                [dt.item() for dt in dts_with_oob],
-                dtype='O'
-            )
-        )
-
-    def test_to_datetime_tz(self):
-
-        # xref 8260
-        # uniform returns a DatetimeIndex
-        arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
-               pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]
-        result = pd.to_datetime(arr)
-        expected = DatetimeIndex(
-            ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific')
-        tm.assert_index_equal(result, expected)
-
-        # mixed tzs will raise
-        arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'),
-               pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')]
-        self.assertRaises(ValueError, lambda: pd.to_datetime(arr))
-
-    def test_to_datetime_tz_pytz(self):
-
-        # xref 8260
-        tm._skip_if_no_pytz()
-        import pytz
-
-        us_eastern = pytz.timezone('US/Eastern')
-        arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1,
-                                                     hour=3, minute=0)),
-                        us_eastern.localize(datetime(year=2000, month=6, day=1,
-                                                     hour=3, minute=0))],
-                       dtype=object)
-        result = pd.to_datetime(arr, utc=True)
-        expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
-                                  '2000-06-01 07:00:00+00:00'],
-                                 dtype='datetime64[ns, UTC]', freq=None)
-        tm.assert_index_equal(result, expected)
-
-    def test_to_datetime_utc_is_true(self):
-        # See gh-11934
-        start = pd.Timestamp('2014-01-01', tz='utc')
-        end = pd.Timestamp('2014-01-03', tz='utc')
-        date_range = pd.bdate_range(start, end)
-
-        result = pd.to_datetime(date_range, utc=True)
-        expected = pd.DatetimeIndex(data=date_range)
-        tm.assert_index_equal(result, expected)
-
-    def test_to_datetime_tz_psycopg2(self):
-
-        # xref 8260
-        try:
-            import psycopg2
-        except ImportError:
-            raise nose.SkipTest("no psycopg2 installed")
-
-        # misc cases
-        tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)
-        tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None)
-        arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1),
-                        datetime(2000, 6, 1, 3, 0, tzinfo=tz2)],
-                       dtype=object)
-
-        result = pd.to_datetime(arr, errors='coerce', utc=True)
-        expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
-                                  '2000-06-01 07:00:00+00:00'],
-                                 dtype='datetime64[ns, UTC]', freq=None)
-        tm.assert_index_equal(result, expected)
-
-        # dtype coercion
-        i = pd.DatetimeIndex([
-            '2000-01-01 08:00:00+00:00'
-        ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
-        self.assertTrue(is_datetime64_ns_dtype(i))
-
-        # tz coerceion
-        result = pd.to_datetime(i, errors='coerce')
-        tm.assert_index_equal(result, i)
-
-        result = pd.to_datetime(i, errors='coerce', utc=True)
-        expected = pd.DatetimeIndex(['2000-01-01 13:00:00'],
-                                    dtype='datetime64[ns, UTC]')
-        tm.assert_index_equal(result, expected)
-
-    def test_datetime_bool(self):
-        # GH13176
-        with self.assertRaises(TypeError):
-            to_datetime(False)
-        self.assertTrue(to_datetime(False, errors="coerce") is tslib.NaT)
-        self.assertEqual(to_datetime(False, errors="ignore"), False)
-        with self.assertRaises(TypeError):
-            to_datetime(True)
-        self.assertTrue(to_datetime(True, errors="coerce") is tslib.NaT)
-        self.assertEqual(to_datetime(True, errors="ignore"), True)
-        with self.assertRaises(TypeError):
-            to_datetime([False, datetime.today()])
-        with self.assertRaises(TypeError):
-            to_datetime(['20130101', True])
-        tm.assert_index_equal(to_datetime([0, False, tslib.NaT, 0.0],
-                                          errors="coerce"),
-                              DatetimeIndex([to_datetime(0), tslib.NaT,
-                                             tslib.NaT, to_datetime(0)]))
-
-    def test_datetime_invalid_datatype(self):
-        # GH13176
-
-        with self.assertRaises(TypeError):
-            pd.to_datetime(bool)
-        with self.assertRaises(TypeError):
-            pd.to_datetime(pd.to_datetime)
-
-    def test_unit(self):
-        # GH 11758
-        # test proper behavior with erros
-
-        with self.assertRaises(ValueError):
-            to_datetime([1], unit='D', format='%Y%m%d')
-
-        values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
-                  'NaT', '']
-        result = to_datetime(values, unit='D', errors='ignore')
-        expected = Index([11111111, Timestamp('1970-01-02'),
-                          Timestamp('1970-01-02'), pd.NaT,
-                          pd.NaT, pd.NaT, pd.NaT, pd.NaT],
-                         dtype=object)
-        tm.assert_index_equal(result, expected)
-
-        result = to_datetime(values, unit='D', errors='coerce')
-        expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
-                                  'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
-        tm.assert_index_equal(result, expected)
-
-        with self.assertRaises(tslib.OutOfBoundsDatetime):
self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise') - - values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] - - result = to_datetime(values, errors='ignore', unit='s') - expected = Index([1420043460000, pd.NaT, pd.NaT, - pd.NaT, pd.NaT], dtype=object) - tm.assert_index_equal(result, expected) - - result = to_datetime(values, errors='coerce', unit='s') - expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s') - - # if we have a string, then we raise a ValueError - # and NOT an OutOfBoundsDatetime - for val in ['foo', Timestamp('20130101')]: - try: - to_datetime(val, errors='raise', unit='s') - except tslib.OutOfBoundsDatetime: - raise AssertionError("incorrect exception raised") - except ValueError: - pass - - def test_unit_consistency(self): - - # consistency of conversions - expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='coerce') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='ignore') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - def test_unit_with_numeric(self): - - # GH 13180 - # coercions from floats/ints are ok - expected = DatetimeIndex(['2015-06-19 05:33:20', - '2015-05-27 22:33:20']) - arr1 = [1.434692e+18, 1.432766e+18] - arr2 = np.array(arr1).astype('int64') - for errors in ['ignore', 'raise', 'coerce']: - result = pd.to_datetime(arr1, errors=errors) - tm.assert_index_equal(result, expected) - - result = pd.to_datetime(arr2, errors=errors) - tm.assert_index_equal(result, expected) - - # but we want to make sure that we are coercing - # if we have ints/strings - expected = DatetimeIndex(['NaT', - '2015-06-19 05:33:20', - '2015-05-27 22:33:20']) - arr = ['foo', 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') - tm.assert_index_equal(result, expected) - - expected = DatetimeIndex(['2015-06-19 05:33:20', - '2015-05-27 22:33:20', - 'NaT', - 'NaT']) - arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] - result = pd.to_datetime(arr, errors='coerce') - tm.assert_index_equal(result, expected) - - def test_unit_mixed(self): - - # mixed integers/datetimes - expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) - arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') - tm.assert_index_equal(result, expected) - - with self.assertRaises(ValueError): - pd.to_datetime(arr, errors='raise') - - expected = DatetimeIndex(['NaT', - 'NaT', - '2013-01-01']) - arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce') - tm.assert_index_equal(result, expected) - - with self.assertRaises(ValueError): - pd.to_datetime(arr, errors='raise') - - def test_index_to_datetime(self): - idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = idx.to_datetime() - expected = DatetimeIndex(pd.to_datetime(idx.values)) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - today = datetime.today() - idx = Index([today], dtype=object) 
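- # Index.to_datetime is deprecated, hence the surrounding - # FutureWarning check; pd.to_datetime is the replacement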
- result = idx.to_datetime() - expected = DatetimeIndex([today]) - tm.assert_index_equal(result, expected) - - def test_dataframe(self): - - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [6, 7], - 'minute': [58, 59], - 'second': [10, 11], - 'ms': [1, 1], - 'us': [2, 2], - 'ns': [3, 3]}) - - result = to_datetime({'year': df['year'], - 'month': df['month'], - 'day': df['day']}) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:0:00')]) - assert_series_equal(result, expected) - - # dict-like - result = to_datetime(df[['year', 'month', 'day']].to_dict()) - assert_series_equal(result, expected) - - # dict but with constructable - df2 = df[['year', 'month', 'day']].to_dict() - df2['month'] = 2 - result = to_datetime(df2) - expected2 = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160205 00:0:00')]) - assert_series_equal(result, expected2) - - # unit mappings - units = [{'year': 'years', - 'month': 'months', - 'day': 'days', - 'hour': 'hours', - 'minute': 'minutes', - 'second': 'seconds'}, - {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second'}, - ] - - for d in units: - result = to_datetime(df[list(d.keys())].rename(columns=d)) - expected = Series([Timestamp('20150204 06:58:10'), - Timestamp('20160305 07:59:11')]) - assert_series_equal(result, expected) - - d = {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second', - 'ms': 'ms', - 'us': 'us', - 'ns': 'ns'} - - result = to_datetime(df.rename(columns=d)) - expected = Series([Timestamp('20150204 06:58:10.001002003'), - Timestamp('20160305 07:59:11.001002003')]) - assert_series_equal(result, expected) - - # coerce back to int - result = to_datetime(df.astype(str)) - assert_series_equal(result, expected) - - # passing coerce - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) - with self.assertRaises(ValueError): - to_datetime(df2) - result = to_datetime(df2, errors='coerce') - expected = Series([Timestamp('20150204 00:00:00'), - pd.NaT]) - assert_series_equal(result, expected) - - # extra columns - with self.assertRaises(ValueError): - df2 = df.copy() - df2['foo'] = 1 - to_datetime(df2) - - # not enough - for c in [['year'], - ['year', 'month'], - ['year', 'month', 'second'], - ['month', 'day'], - ['year', 'day', 'second']]: - with self.assertRaises(ValueError): - to_datetime(df[c]) - - # duplicates - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) - df2.columns = ['year', 'year', 'day'] - with self.assertRaises(ValueError): - to_datetime(df2) - - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5], - 'hour': [4, 5]}) - df2.columns = ['year', 'month', 'day', 'day'] - with self.assertRaises(ValueError): - to_datetime(df2) - - def test_dataframe_dtypes(self): - # #13451 - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) - - # int16 - result = to_datetime(df.astype('int16')) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) - assert_series_equal(result, expected) - - # mixed dtypes - df['month'] = df['month'].astype('int8') - df['day'] = df['day'].astype('int8') - result = to_datetime(df) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) - assert_series_equal(result, expected) - - # float - df = DataFrame({'year': [2000, 2001], - 'month': [1.5, 1], - 'day': [1, 1]}) - with 
self.assertRaises(ValueError): - to_datetime(df) - - -class TestDatetime64(tm.TestCase): - """ - Also test support for datetime64[ns] in Series / DataFrame - """ - - def setUp(self): - dti = DatetimeIndex(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='Min') - self.series = Series(rand(len(dti)), dti) - - def test_fancy_getitem(self): - dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) - - s = Series(np.arange(len(dti)), index=dti) - - self.assertEqual(s[48], 48) - self.assertEqual(s['1/2/2009'], 48) - self.assertEqual(s['2009-1-2'], 48) - self.assertEqual(s[datetime(2009, 1, 2)], 48) - self.assertEqual(s[lib.Timestamp(datetime(2009, 1, 2))], 48) - self.assertRaises(KeyError, s.__getitem__, '2009-1-3') - - assert_series_equal(s['3/6/2009':'2009-06-05'], - s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) - - def test_fancy_setitem(self): - dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) - - s = Series(np.arange(len(dti)), index=dti) - s[48] = -1 - self.assertEqual(s[48], -1) - s['1/2/2009'] = -2 - self.assertEqual(s[48], -2) - s['1/2/2009':'2009-06-05'] = -3 - self.assertTrue((s[48:54] == -3).all()) - - def test_dti_snap(self): - dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', - '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') - - res = dti.snap(freq='W-MON') - exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') - exp = exp.repeat([3, 4]) - self.assertTrue((res == exp).all()) - - res = dti.snap(freq='B') - - exp = date_range('1/1/2002', '1/7/2002', freq='b') - exp = exp.repeat([1, 1, 1, 2, 2]) - self.assertTrue((res == exp).all()) - - def test_dti_reset_index_round_trip(self): - dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') - d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) - d2 = d1.reset_index() - self.assertEqual(d2.dtypes[0], np.dtype('M8[ns]')) - d3 = d2.set_index('index') - assert_frame_equal(d1, d3, check_names=False) - - # #2329 - stamp = datetime(2012, 11, 22) - df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) - df = df.set_index('Date') - - self.assertEqual(df.index[0], stamp) - self.assertEqual(df.reset_index()['Date'][0], stamp) - - def test_series_set_value(self): - # #1561 - - dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] - index = DatetimeIndex(dates) - - s = Series().set_value(dates[0], 1.) 
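- # set_value returns the updated Series (a new object when the index - # has to grow), so the result is captured instead of relying on - # in-place mutation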
- s2 = s.set_value(dates[1], np.nan) - - exp = Series([1., np.nan], index=index) - - assert_series_equal(s2, exp) - - # s = Series(index[:1], index[:1]) - # s2 = s.set_value(dates[1], index[1]) - # self.assertEqual(s2.values.dtype, 'M8[ns]') - - @slow - def test_slice_locs_indexerror(self): - times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) - for i in range(100000)] - s = Series(lrange(100000), times) - s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] - - def test_slicing_datetimes(self): - - # GH 7523 - - # unique - df = DataFrame(np.arange(4., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 3, 4]]) - result = df.loc[datetime(2001, 1, 1, 10):] - assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11):] - expected = df.iloc[1:] - assert_frame_equal(result, expected) - result = df.loc['20010101 11':] - assert_frame_equal(result, expected) - - # duplicates - df = pd.DataFrame(np.arange(5., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 2, 3, 4]]) - - result = df.loc[datetime(2001, 1, 1, 10):] - assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11):] - expected = df.iloc[1:] - assert_frame_equal(result, expected) - result = df.loc['20010101 11':] - assert_frame_equal(result, expected) - - -class TestSeriesDatetime64(tm.TestCase): - def setUp(self): - self.series = Series(date_range('1/1/2000', periods=10)) - - def test_auto_conversion(self): - series = Series(list(date_range('1/1/2000', periods=10))) - self.assertEqual(series.dtype, 'M8[ns]') - - def test_constructor_cant_cast_datetime64(self): - msg = "Cannot cast datetime64 to " - with tm.assertRaisesRegexp(TypeError, msg): - Series(date_range('1/1/2000', periods=10), dtype=float) - - with tm.assertRaisesRegexp(TypeError, msg): - Series(date_range('1/1/2000', periods=10), dtype=int) - - def test_constructor_cast_object(self): - s = Series(date_range('1/1/2000', periods=10), dtype=object) - exp = Series(date_range('1/1/2000', periods=10)) - tm.assert_series_equal(s, exp) - - def test_series_comparison_scalars(self): - val = datetime(2000, 1, 4) - result = self.series > val - expected = Series([x > val for x in self.series]) - self.assert_series_equal(result, expected) - - val = self.series[5] - result = self.series > val - expected = Series([x > val for x in self.series]) - self.assert_series_equal(result, expected) - - def test_between(self): - left, right = self.series[[2, 7]] - - result = self.series.between(left, right) - expected = (self.series >= left) & (self.series <= right) - assert_series_equal(result, expected) - - # --------------------------------------------------------------------- - # NaT support - - def test_NaT_scalar(self): - series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') - - val = series[3] - self.assertTrue(com.isnull(val)) - - series[2] = val - self.assertTrue(com.isnull(series[2])) - - def test_NaT_cast(self): - # GH10747 - result = Series([np.nan]).astype('M8[ns]') - expected = Series([NaT]) - assert_series_equal(result, expected) - - def test_set_none_nan(self): - self.series[3] = None - self.assertIs(self.series[3], NaT) - - self.series[3:5] = None - 
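# None, like np.nan below, is coerced to NaT when assigned into a - # datetime64[ns] Series -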
self.assertIs(self.series[4], NaT) - - self.series[5] = np.nan - self.assertIs(self.series[5], NaT) - - self.series[5:7] = np.nan - self.assertIs(self.series[6], NaT) - - def test_intercept_astype_object(self): - - # this test no longer makes sense as series is by default already - # M8[ns] - expected = self.series.astype('object') - - df = DataFrame({'a': self.series, - 'b': np.random.randn(len(self.series))}) - exp_dtypes = pd.Series([np.dtype('datetime64[ns]'), - np.dtype('float64')], index=['a', 'b']) - tm.assert_series_equal(df.dtypes, exp_dtypes) - - result = df.values.squeeze() - self.assertTrue((result[:, 0] == expected.values).all()) - - df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)}) - - result = df.values.squeeze() - self.assertTrue((result[:, 0] == expected.values).all()) - - def test_nat_operations(self): - # GH 8617 - s = Series([0, pd.NaT], dtype='m8[ns]') - exp = s[0] - self.assertEqual(s.median(), exp) - self.assertEqual(s.min(), exp) - self.assertEqual(s.max(), exp) - - def test_round_nat(self): - # GH14940 - s = Series([pd.NaT]) - expected = Series(pd.NaT) - for method in ["round", "floor", "ceil"]: - round_method = getattr(s.dt, method) - for freq in ["s", "5s", "min", "5min", "h", "5h"]: - assert_series_equal(round_method(freq), expected) - - -class TestTimestamp(tm.TestCase): - def test_class_ops_pytz(self): - tm._skip_if_no_pytz() - from pytz import timezone - - def compare(x, y): - self.assertEqual(int(Timestamp(x).value / 1e9), - int(Timestamp(y).value / 1e9)) - - compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now('UTC'), datetime.now(timezone('UTC'))) - compare(Timestamp.utcnow(), datetime.utcnow()) - compare(Timestamp.today(), datetime.today()) - current_time = calendar.timegm(datetime.now().utctimetuple()) - compare(Timestamp.utcfromtimestamp(current_time), - datetime.utcfromtimestamp(current_time)) - compare(Timestamp.fromtimestamp(current_time), - datetime.fromtimestamp(current_time)) - - date_component = datetime.utcnow() - time_component = (date_component + timedelta(minutes=10)).time() - compare(Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component)) - - def test_class_ops_dateutil(self): - tm._skip_if_no_dateutil() - from dateutil.tz import tzutc - - def compare(x, y): - self.assertEqual(int(np.round(Timestamp(x).value / 1e9)), - int(np.round(Timestamp(y).value / 1e9))) - - compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now('UTC'), datetime.now(tzutc())) - compare(Timestamp.utcnow(), datetime.utcnow()) - compare(Timestamp.today(), datetime.today()) - current_time = calendar.timegm(datetime.now().utctimetuple()) - compare(Timestamp.utcfromtimestamp(current_time), - datetime.utcfromtimestamp(current_time)) - compare(Timestamp.fromtimestamp(current_time), - datetime.fromtimestamp(current_time)) - - date_component = datetime.utcnow() - time_component = (date_component + timedelta(minutes=10)).time() - compare(Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component)) - - def test_basics_nanos(self): - val = np.int64(946684800000000000).view('M8[ns]') - stamp = Timestamp(val.view('i8') + 500) - self.assertEqual(stamp.year, 2000) - self.assertEqual(stamp.month, 1) - self.assertEqual(stamp.microsecond, 0) - self.assertEqual(stamp.nanosecond, 500) - - # GH 14415 - val = np.iinfo(np.int64).min + 80000000000000 - stamp = Timestamp(val) - self.assertEqual(stamp.year, 1677) - self.assertEqual(stamp.month, 9) - 
self.assertEqual(stamp.day, 21) - self.assertEqual(stamp.microsecond, 145224) - self.assertEqual(stamp.nanosecond, 192) - - def test_unit(self): - - def check(val, unit=None, h=1, s=1, us=0): - stamp = Timestamp(val, unit=unit) - self.assertEqual(stamp.year, 2000) - self.assertEqual(stamp.month, 1) - self.assertEqual(stamp.day, 1) - self.assertEqual(stamp.hour, h) - if unit != 'D': - self.assertEqual(stamp.minute, 1) - self.assertEqual(stamp.second, s) - self.assertEqual(stamp.microsecond, us) - else: - self.assertEqual(stamp.minute, 0) - self.assertEqual(stamp.second, 0) - self.assertEqual(stamp.microsecond, 0) - self.assertEqual(stamp.nanosecond, 0) - - ts = Timestamp('20000101 01:01:01') - val = ts.value - days = (ts - Timestamp('1970-01-01')).days - - check(val) - check(val / long(1000), unit='us') - check(val / long(1000000), unit='ms') - check(val / long(1000000000), unit='s') - check(days, unit='D', h=0) - - # using truediv, so these are like floats - if compat.PY3: - check((val + 500000) / long(1000000000), unit='s', us=500) - check((val + 500000000) / long(1000000000), unit='s', us=500000) - check((val + 500000) / long(1000000), unit='ms', us=500) - - # get chopped in py2 - else: - check((val + 500000) / long(1000000000), unit='s') - check((val + 500000000) / long(1000000000), unit='s') - check((val + 500000) / long(1000000), unit='ms') - - # ok - check((val + 500000) / long(1000), unit='us', us=500) - check((val + 500000000) / long(1000000), unit='ms', us=500000) - - # floats - check(val / 1000.0 + 5, unit='us', us=5) - check(val / 1000.0 + 5000, unit='us', us=5000) - check(val / 1000000.0 + 0.5, unit='ms', us=500) - check(val / 1000000.0 + 0.005, unit='ms', us=5) - check(val / 1000000000.0 + 0.5, unit='s', us=500000) - check(days + 0.5, unit='D', h=12) - - # nan - result = Timestamp(np.nan) - self.assertIs(result, NaT) - - result = Timestamp(None) - self.assertIs(result, NaT) - - result = Timestamp(iNaT) - self.assertIs(result, NaT) - - result = Timestamp(NaT) - self.assertIs(result, NaT) - - result = Timestamp('NaT') - self.assertIs(result, NaT) - - self.assertTrue(isnull(Timestamp('nat'))) - - def test_roundtrip(self): - - # test value to string and back conversions - # further test accessors - base = Timestamp('20140101 00:00:00') - - result = Timestamp(base.value + pd.Timedelta('5ms').value) - self.assertEqual(result, Timestamp(str(base) + ".005000")) - self.assertEqual(result.microsecond, 5000) - - result = Timestamp(base.value + pd.Timedelta('5us').value) - self.assertEqual(result, Timestamp(str(base) + ".000005")) - self.assertEqual(result.microsecond, 5) - - result = Timestamp(base.value + pd.Timedelta('5ns').value) - self.assertEqual(result, Timestamp(str(base) + ".000000005")) - self.assertEqual(result.nanosecond, 5) - self.assertEqual(result.microsecond, 0) - - result = Timestamp(base.value + pd.Timedelta('6ms 5us').value) - self.assertEqual(result, Timestamp(str(base) + ".006005")) - self.assertEqual(result.microsecond, 5 + 6 * 1000) - - result = Timestamp(base.value + pd.Timedelta('200ms 5us').value) - self.assertEqual(result, Timestamp(str(base) + ".200005")) - self.assertEqual(result.microsecond, 5 + 200 * 1000) - - def test_comparison(self): - # 5-18-2012 00:00:00.000 - stamp = long(1337299200000000000) - - val = Timestamp(stamp) - - self.assertEqual(val, val) - self.assertFalse(val != val) - self.assertFalse(val < val) - self.assertTrue(val <= val) - self.assertFalse(val > val) - self.assertTrue(val >= val) - - other = datetime(2012, 5, 18) - 
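# 1337299200000000000 ns since the epoch is 2012-05-18 00:00:00 UTC, - # so the equivalent naive datetime must compare equal -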
self.assertEqual(val, other) - self.assertFalse(val != other) - self.assertFalse(val < other) - self.assertTrue(val <= other) - self.assertFalse(val > other) - self.assertTrue(val >= other) - - other = Timestamp(stamp + 100) - - self.assertNotEqual(val, other) - self.assertTrue(val < other) - self.assertTrue(val <= other) - self.assertTrue(other > val) - self.assertTrue(other >= val) - - def test_compare_invalid(self): - - # GH 8058 - val = Timestamp('20130101 12:01:02') - self.assertFalse(val == 'foo') - self.assertFalse(val == 10.0) - self.assertFalse(val == 1) - self.assertFalse(val == long(1)) - self.assertFalse(val == []) - self.assertFalse(val == {'foo': 1}) - self.assertFalse(val == np.float64(1)) - self.assertFalse(val == np.int64(1)) - - self.assertTrue(val != 'foo') - self.assertTrue(val != 10.0) - self.assertTrue(val != 1) - self.assertTrue(val != long(1)) - self.assertTrue(val != []) - self.assertTrue(val != {'foo': 1}) - self.assertTrue(val != np.float64(1)) - self.assertTrue(val != np.int64(1)) - - # ops testing - df = DataFrame(randn(5, 2)) - a = df[0] - b = Series(randn(5)) - b.name = Timestamp('2000-01-01') - tm.assert_series_equal(a / b, 1 / (b / a)) - - def test_cant_compare_tz_naive_w_aware(self): - tm._skip_if_no_pytz() - # #1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz='utc') - - self.assertRaises(Exception, a.__eq__, b) - self.assertRaises(Exception, a.__ne__, b) - self.assertRaises(Exception, a.__lt__, b) - self.assertRaises(Exception, a.__gt__, b) - self.assertRaises(Exception, b.__eq__, a) - self.assertRaises(Exception, b.__ne__, a) - self.assertRaises(Exception, b.__lt__, a) - self.assertRaises(Exception, b.__gt__, a) - - if sys.version_info < (3, 3): - self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) - self.assertRaises(Exception, a.to_pydatetime().__eq__, b) - else: - self.assertFalse(a == b.to_pydatetime()) - self.assertFalse(a.to_pydatetime() == b) - - def test_cant_compare_tz_naive_w_aware_explicit_pytz(self): - tm._skip_if_no_pytz() - from pytz import utc - # #1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz=utc) - - self.assertRaises(Exception, a.__eq__, b) - self.assertRaises(Exception, a.__ne__, b) - self.assertRaises(Exception, a.__lt__, b) - self.assertRaises(Exception, a.__gt__, b) - self.assertRaises(Exception, b.__eq__, a) - self.assertRaises(Exception, b.__ne__, a) - self.assertRaises(Exception, b.__lt__, a) - self.assertRaises(Exception, b.__gt__, a) - - if sys.version_info < (3, 3): - self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) - self.assertRaises(Exception, a.to_pydatetime().__eq__, b) - else: - self.assertFalse(a == b.to_pydatetime()) - self.assertFalse(a.to_pydatetime() == b) - - def test_cant_compare_tz_naive_w_aware_dateutil(self): - tm._skip_if_no_dateutil() - from dateutil.tz import tzutc - utc = tzutc() - # #1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz=utc) - - self.assertRaises(Exception, a.__eq__, b) - self.assertRaises(Exception, a.__ne__, b) - self.assertRaises(Exception, a.__lt__, b) - self.assertRaises(Exception, a.__gt__, b) - self.assertRaises(Exception, b.__eq__, a) - self.assertRaises(Exception, b.__ne__, a) - self.assertRaises(Exception, b.__lt__, a) - self.assertRaises(Exception, b.__gt__, a) - - if sys.version_info < (3, 3): - self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) - self.assertRaises(Exception, a.to_pydatetime().__eq__, b) - else: - self.assertFalse(a == b.to_pydatetime()) -
self.assertFalse(a.to_pydatetime() == b) - - def test_delta_preserve_nanos(self): - val = Timestamp(long(1337299200000000123)) - result = val + timedelta(1) - self.assertEqual(result.nanosecond, val.nanosecond) - - def test_frequency_misc(self): - self.assertEqual(frequencies.get_freq_group('T'), - frequencies.FreqGroup.FR_MIN) - - code, stride = frequencies.get_freq_code(offsets.Hour()) - self.assertEqual(code, frequencies.FreqGroup.FR_HR) - - code, stride = frequencies.get_freq_code((5, 'T')) - self.assertEqual(code, frequencies.FreqGroup.FR_MIN) - self.assertEqual(stride, 5) - - offset = offsets.Hour() - result = frequencies.to_offset(offset) - self.assertEqual(result, offset) - - result = frequencies.to_offset((5, 'T')) - expected = offsets.Minute(5) - self.assertEqual(result, expected) - - self.assertRaises(ValueError, frequencies.get_freq_code, (5, 'baz')) - - self.assertRaises(ValueError, frequencies.to_offset, '100foo') - - self.assertRaises(ValueError, frequencies.to_offset, ('', '')) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = frequencies.get_standard_freq(offsets.Hour()) - self.assertEqual(result, 'H') - - def test_hash_equivalent(self): - d = {datetime(2011, 1, 1): 5} - stamp = Timestamp(datetime(2011, 1, 1)) - self.assertEqual(d[stamp], 5) - - def test_timestamp_compare_scalars(self): - # case where ndim == 0 - lhs = np.datetime64(datetime(2013, 12, 6)) - rhs = Timestamp('now') - nat = Timestamp('nat') - - ops = {'gt': 'lt', - 'lt': 'gt', - 'ge': 'le', - 'le': 'ge', - 'eq': 'eq', - 'ne': 'ne'} - - for left, right in ops.items(): - left_f = getattr(operator, left) - right_f = getattr(operator, right) - expected = left_f(lhs, rhs) - - result = right_f(rhs, lhs) - self.assertEqual(result, expected) - - expected = left_f(rhs, nat) - result = right_f(nat, rhs) - self.assertEqual(result, expected) - - def test_timestamp_compare_series(self): - # make sure we can compare Timestamps on the right AND left hand side - # GH4982 - s = Series(date_range('20010101', periods=10), name='dates') - s_nat = s.copy(deep=True) - - s[0] = pd.Timestamp('nat') - s[3] = pd.Timestamp('nat') - - ops = {'lt': 'gt', 'le': 'ge', 'eq': 'eq', 'ne': 'ne'} - - for left, right in ops.items(): - left_f = getattr(operator, left) - right_f = getattr(operator, right) - - # no nats - expected = left_f(s, Timestamp('20010109')) - result = right_f(Timestamp('20010109'), s) - tm.assert_series_equal(result, expected) - - # nats - expected = left_f(s, Timestamp('nat')) - result = right_f(Timestamp('nat'), s) - tm.assert_series_equal(result, expected) - - # compare to timestamp with series containing nats - expected = left_f(s_nat, Timestamp('20010109')) - result = right_f(Timestamp('20010109'), s_nat) - tm.assert_series_equal(result, expected) - - # compare to nat with series containing nats - expected = left_f(s_nat, Timestamp('nat')) - result = right_f(Timestamp('nat'), s_nat) - tm.assert_series_equal(result, expected) - - def test_is_leap_year(self): - # GH 13727 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - dt = Timestamp('2000-01-01 00:00:00', tz=tz) - self.assertTrue(dt.is_leap_year) - self.assertIsInstance(dt.is_leap_year, bool) - - dt = Timestamp('1999-01-01 00:00:00', tz=tz) - self.assertFalse(dt.is_leap_year) - - dt = Timestamp('2004-01-01 00:00:00', tz=tz) - self.assertTrue(dt.is_leap_year) - - dt = Timestamp('2100-01-01 00:00:00', tz=tz) - self.assertFalse(dt.is_leap_year) - - self.assertFalse(pd.NaT.is_leap_year) - 
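# even for NaT the property is defined and returns a real bool -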
self.assertIsInstance(pd.NaT.is_leap_year, bool) - - def test_round_nat(self): - # GH14940 - ts = Timestamp('nat') - for method in ["round", "floor", "ceil"]: - round_method = getattr(ts, method) - for freq in ["s", "5s", "min", "5min", "h", "5h"]: - self.assertIs(round_method(freq), ts) - - -class TestSlicing(tm.TestCase): - def test_slice_year(self): - dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500) - - s = Series(np.arange(len(dti)), index=dti) - result = s['2005'] - expected = s[s.index.year == 2005] - assert_series_equal(result, expected) - - df = DataFrame(np.random.rand(len(dti), 5), index=dti) - result = df.loc['2005'] - expected = df[df.index.year == 2005] - assert_frame_equal(result, expected) - - rng = date_range('1/1/2000', '1/1/2010') - - result = rng.get_loc('2009') - expected = slice(3288, 3653) - self.assertEqual(result, expected) - - def test_slice_quarter(self): - dti = DatetimeIndex(freq='D', start=datetime(2000, 6, 1), periods=500) - - s = Series(np.arange(len(dti)), index=dti) - self.assertEqual(len(s['2001Q1']), 90) - - df = DataFrame(np.random.rand(len(dti), 5), index=dti) - self.assertEqual(len(df.loc['1Q01']), 90) - - def test_slice_month(self): - dti = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) - s = Series(np.arange(len(dti)), index=dti) - self.assertEqual(len(s['2005-11']), 30) - - df = DataFrame(np.random.rand(len(dti), 5), index=dti) - self.assertEqual(len(df.loc['2005-11']), 30) - - assert_series_equal(s['2005-11'], s['11-2005']) - - def test_partial_slice(self): - rng = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-05':'2006-02'] - expected = s['20050501':'20060228'] - assert_series_equal(result, expected) - - result = s['2005-05':] - expected = s['20050501':] - assert_series_equal(result, expected) - - result = s[:'2006-02'] - expected = s[:'20060228'] - assert_series_equal(result, expected) - - result = s['2005-1-1'] - self.assertEqual(result, s.iloc[0]) - - self.assertRaises(Exception, s.__getitem__, '2004-12-31') - - def test_partial_slice_daily(self): - rng = DatetimeIndex(freq='H', start=datetime(2005, 1, 31), periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-1-31'] - assert_series_equal(result, s.iloc[:24]) - - self.assertRaises(Exception, s.__getitem__, '2004-12-31 00') - - def test_partial_slice_hourly(self): - rng = DatetimeIndex(freq='T', start=datetime(2005, 1, 1, 20, 0, 0), - periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-1-1'] - assert_series_equal(result, s.iloc[:60 * 4]) - - result = s['2005-1-1 20'] - assert_series_equal(result, s.iloc[:60]) - - self.assertEqual(s['2005-1-1 20:00'], s.iloc[0]) - self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:15') - - def test_partial_slice_minutely(self): - rng = DatetimeIndex(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), - periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-1-1 23:59'] - assert_series_equal(result, s.iloc[:60]) - - result = s['2005-1-1'] - assert_series_equal(result, s.iloc[:60]) - - self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.iloc[0]) - self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') - - def test_partial_slice_second_precision(self): - rng = DatetimeIndex(start=datetime(2005, 1, 1, 0, 0, 59, - microsecond=999990), - periods=20, freq='US') - s = Series(np.arange(20), rng) - - assert_series_equal(s['2005-1-1 00:00'],
s.iloc[:10]) - assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) - - assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) - assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) - - self.assertEqual(s[Timestamp('2005-1-1 00:00:59.999990')], s.iloc[0]) - self.assertRaisesRegexp(KeyError, '2005-1-1 00:00:00', - lambda: s['2005-1-1 00:00:00']) - - def test_partial_slicing_dataframe(self): - # GH14856 - # Test various combinations of string slicing resolution vs. - # index resolution - # - If string resolution is less precise than index resolution, - # string is considered a slice - # - If string resolution is equal to or more precise than index - # resolution, string is considered an exact match - formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', - '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] - resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] - for rnum, resolution in enumerate(resolutions[2:], 2): - # we check only 'day', 'hour', 'minute' and 'second' - unit = Timedelta("1 " + resolution) - middate = datetime(2012, 1, 1, 0, 0, 0) - index = DatetimeIndex([middate - unit, - middate, middate + unit]) - values = [1, 2, 3] - df = DataFrame({'a': values}, index, dtype=np.int64) - self.assertEqual(df.index.resolution, resolution) - - # Timestamp with the same resolution as index - # Should be exact match for Series (return scalar) - # and raise KeyError for Frame - for timestamp, expected in zip(index, values): - ts_string = timestamp.strftime(formats[rnum]) - # make ts_string as precise as index - result = df['a'][ts_string] - self.assertIsInstance(result, np.int64) - self.assertEqual(result, expected) - self.assertRaises(KeyError, df.__getitem__, ts_string) - - # Timestamp with resolution less precise than index - for fmt in formats[:rnum]: - for element, theslice in [[0, slice(None, 1)], - [1, slice(1, None)]]: - ts_string = index[element].strftime(fmt) - - # Series should return slice - result = df['a'][ts_string] - expected = df['a'][theslice] - assert_series_equal(result, expected) - - # Frame should return slice as well - result = df[ts_string] - expected = df[theslice] - assert_frame_equal(result, expected) - - # Timestamp with resolution more precise than index - # Compatible with existing key - # Should return scalar for Series - # and raise KeyError for Frame - for fmt in formats[rnum + 1:]: - ts_string = index[1].strftime(fmt) - result = df['a'][ts_string] - self.assertIsInstance(result, np.int64) - self.assertEqual(result, 2) - self.assertRaises(KeyError, df.__getitem__, ts_string) - - # Not compatible with existing key - # Should raise KeyError - for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: - ts = index[1] + Timedelta("1 " + res) - ts_string = ts.strftime(fmt) - self.assertRaises(KeyError, df['a'].__getitem__, ts_string) - self.assertRaises(KeyError, df.__getitem__, ts_string) - - def test_partial_slicing_with_multiindex(self): - - # GH 4758 - # partial string indexing with a multi-index was buggy - df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], - 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"], - 'val': [1, 2, 3, 4]}, - index=date_range("2013-06-19 09:30:00", - periods=4, freq='5T')) - df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) - - expected = DataFrame([ - [1] - ], index=Index(['ABC'], name='TICKER'), columns=['val']) - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] - assert_frame_equal(result, expected) - - expected = df_multi.loc[ - (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] - result =
df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] - assert_series_equal(result, expected) - - # this is a KeyError as we don't do partial string selection on - # multi-levels - def f(): - df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] - - self.assertRaises(KeyError, f) - - # GH 4294 - # partial slice on a series mi - s = pd.DataFrame(randn(1000, 1000), index=pd.date_range( - '2000-1-1', periods=1000)).stack() - - s2 = s[:-1].copy() - expected = s2['2000-1-4'] - result = s2[pd.Timestamp('2000-1-4')] - assert_series_equal(result, expected) - - result = s[pd.Timestamp('2000-1-4')] - expected = s['2000-1-4'] - assert_series_equal(result, expected) - - df2 = pd.DataFrame(s) - expected = df2.xs('2000-1-4') - result = df2.loc[pd.Timestamp('2000-1-4')] - assert_frame_equal(result, expected) - - def test_date_range_normalize(self): - snap = datetime.today() - n = 50 - - rng = date_range(snap, periods=n, normalize=False, freq='2D') - - offset = timedelta(2) - values = DatetimeIndex([snap + i * offset for i in range(n)]) - - tm.assert_index_equal(rng, values) - - rng = date_range('1/1/2000 08:15', periods=n, normalize=False, - freq='B') - the_time = time(8, 15) - for val in rng: - self.assertEqual(val.time(), the_time) - - def test_timedelta(self): - # this is valid too - index = date_range('1/1/2000', periods=50, freq='B') - shifted = index + timedelta(1) - back = shifted + timedelta(-1) - self.assertTrue(tm.equalContents(index, back)) - self.assertEqual(shifted.freq, index.freq) - self.assertEqual(shifted.freq, back.freq) - - result = index - timedelta(1) - expected = index + timedelta(-1) - tm.assert_index_equal(result, expected) - - # GH4134, buggy with timedeltas - rng = date_range('2013', '2014') - s = Series(rng) - result1 = rng - pd.offsets.Hour(1) - result2 = DatetimeIndex(s - np.timedelta64(100000000)) - result3 = rng - np.timedelta64(100000000) - result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - tm.assert_index_equal(result1, result4) - tm.assert_index_equal(result2, result3) - - def test_shift(self): - ts = Series(np.random.randn(5), - index=date_range('1/1/2000', periods=5, freq='H')) - - result = ts.shift(1, freq='5T') - exp_index = ts.index.shift(1, freq='5T') - tm.assert_index_equal(result.index, exp_index) - - # GH #1063, multiple of same base - result = ts.shift(1, freq='4H') - exp_index = ts.index + offsets.Hour(4) - tm.assert_index_equal(result.index, exp_index) - - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) - self.assertRaises(ValueError, idx.shift, 1) - - def test_setops_preserve_freq(self): - for tz in [None, 'Asia/Tokyo', 'US/Eastern']: - rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz) - - result = rng[:50].union(rng[50:100]) - self.assertEqual(result.name, rng.name) - self.assertEqual(result.freq, rng.freq) - self.assertEqual(result.tz, rng.tz) - - result = rng[:50].union(rng[30:100]) - self.assertEqual(result.name, rng.name) - self.assertEqual(result.freq, rng.freq) - self.assertEqual(result.tz, rng.tz) - - result = rng[:50].union(rng[60:100]) - self.assertEqual(result.name, rng.name) - self.assertIsNone(result.freq) - self.assertEqual(result.tz, rng.tz) - - result = rng[:50].intersection(rng[25:75]) - self.assertEqual(result.name, rng.name) - self.assertEqual(result.freqstr, 'D') - self.assertEqual(result.tz, rng.tz) - - nofreq = DatetimeIndex(list(rng[25:75]), name='other') - result = rng[:50].union(nofreq) - self.assertIsNone(result.name) - self.assertEqual(result.freq, rng.freq) - self.assertEqual(result.tz, rng.tz) - - result = 
rng[:50].intersection(nofreq) - self.assertIsNone(result.name) - self.assertEqual(result.freq, rng.freq) - self.assertEqual(result.tz, rng.tz) - - def test_min_max(self): - rng = date_range('1/1/2000', '12/31/2000') - rng2 = rng.take(np.random.permutation(len(rng))) - - the_min = rng2.min() - the_max = rng2.max() - tm.assertIsInstance(the_min, Timestamp) - tm.assertIsInstance(the_max, Timestamp) - self.assertEqual(the_min, rng[0]) - self.assertEqual(the_max, rng[-1]) - - self.assertEqual(rng.min(), rng[0]) - self.assertEqual(rng.max(), rng[-1]) - - def test_min_max_series(self): - rng = date_range('1/1/2000', periods=10, freq='4h') - lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] - df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) - - result = df.TS.max() - exp = Timestamp(df.TS.iat[-1]) - self.assertTrue(isinstance(result, Timestamp)) - self.assertEqual(result, exp) - - result = df.TS.min() - exp = Timestamp(df.TS.iat[0]) - self.assertTrue(isinstance(result, Timestamp)) - self.assertEqual(result, exp) - - def test_from_M8_structured(self): - dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] - arr = np.array(dates, - dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) - df = DataFrame(arr) - - self.assertEqual(df['Date'][0], dates[0][0]) - self.assertEqual(df['Forecasting'][0], dates[0][1]) - - s = Series(arr['Date']) - self.assertIsInstance(s[0], Timestamp) - self.assertEqual(s[0], dates[0][0]) - - s = Series.from_array(arr['Date'], Index([0])) - self.assertEqual(s[0], dates[0][0]) - - def test_get_level_values_box(self): - from pandas import MultiIndex - - dates = date_range('1/1/2000', periods=4) - levels = [dates, [0, 1]] - labels = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] - - index = MultiIndex(levels=levels, labels=labels) - - self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp)) - - def test_frame_apply_dont_convert_datetime64(self): - from pandas.tseries.offsets import BDay - df = DataFrame({'x1': [datetime(1996, 1, 1)]}) - - df = df.applymap(lambda x: x + BDay()) - df = df.applymap(lambda x: x + BDay()) - - self.assertTrue(df.x1.dtype == 'M8[ns]') - - def test_date_range_fy5252(self): - dr = date_range(start="2013-01-01", periods=2, freq=offsets.FY5253( - startingMonth=1, weekday=3, variation="nearest")) - self.assertEqual(dr[0], Timestamp('2013-01-31')) - self.assertEqual(dr[1], Timestamp('2014-01-30')) - - def test_partial_slice_doesnt_require_monotonicity(self): - # For historical reasons.
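- # as the assertions below show, a partial string that matches nothing - # on a non-monotonic DatetimeIndex slices to empty, while the - # equivalent Timestamp key raises KeyError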
- s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10)) - - nonmonotonic = s[[3, 5, 4]] - expected = nonmonotonic.iloc[:0] - timestamp = pd.Timestamp('2014-01-10') - - assert_series_equal(nonmonotonic['2014-01-10':], expected) - self.assertRaisesRegexp(KeyError, - r"Timestamp\('2014-01-10 00:00:00'\)", - lambda: nonmonotonic[timestamp:]) - - assert_series_equal(nonmonotonic.loc['2014-01-10':], expected) - self.assertRaisesRegexp(KeyError, - r"Timestamp\('2014-01-10 00:00:00'\)", - lambda: nonmonotonic.loc[timestamp:]) - - -class TimeConversionFormats(tm.TestCase): - def test_to_datetime_format(self): - values = ['1/1/2000', '1/2/2000', '1/3/2000'] - - results1 = [Timestamp('20000101'), Timestamp('20000201'), - Timestamp('20000301')] - results2 = [Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')] - for vals, expecteds in [(values, (Index(results1), Index(results2))), - (Series(values), - (Series(results1), Series(results2))), - (values[0], (results1[0], results2[0])), - (values[1], (results1[1], results2[1])), - (values[2], (results1[2], results2[2]))]: - - for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): - result = to_datetime(vals, format=fmt) - expected = expecteds[i] - - if isinstance(expected, Series): - assert_series_equal(result, Series(expected)) - elif isinstance(expected, Timestamp): - self.assertEqual(result, expected) - else: - tm.assert_index_equal(result, expected) - - def test_to_datetime_format_YYYYMMDD(self): - s = Series([19801222, 19801222] + [19810105] * 5) - expected = Series([Timestamp(x) for x in s.apply(str)]) - - result = to_datetime(s, format='%Y%m%d') - assert_series_equal(result, expected) - - result = to_datetime(s.apply(str), format='%Y%m%d') - assert_series_equal(result, expected) - - # with NaT - expected = Series([Timestamp("19801222"), Timestamp("19801222")] + - [Timestamp("19810105")] * 5) - expected[2] = np.nan - s[2] = np.nan - - result = to_datetime(s, format='%Y%m%d') - assert_series_equal(result, expected) - - # string with NaT - s = s.apply(str) - s[2] = 'nat' - result = to_datetime(s, format='%Y%m%d') - assert_series_equal(result, expected) - - # coercion - # GH 7930 - s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') - expected = Series([datetime(2012, 12, 31), - datetime(2014, 12, 31), datetime(9999, 12, 31)], - dtype=object) - self.assert_series_equal(result, expected) - - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce') - expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') - assert_series_equal(result, expected) - - # GH 10178 - def test_to_datetime_format_integer(self): - s = Series([2000, 2001, 2002]) - expected = Series([Timestamp(x) for x in s.apply(str)]) - - result = to_datetime(s, format='%Y') - assert_series_equal(result, expected) - - s = Series([200001, 200105, 200206]) - expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) - ]) - - result = to_datetime(s, format='%Y%m') - assert_series_equal(result, expected) - - def test_to_datetime_format_microsecond(self): - - # these are locale dependent - lang, _ = locale.getlocale() - month_abbr = calendar.month_abbr[4] - val = '01-{}-2011 00:00:01.978'.format(month_abbr) - - format = '%d-%b-%Y %H:%M:%S.%f' - result = to_datetime(val, format=format) - exp = datetime.strptime(val, format) - self.assertEqual(result, exp) - - def test_to_datetime_format_time(self): - data = [ - ['01/10/2010 15:20', '%m/%d/%Y %H:%M', - Timestamp('2010-01-10 15:20')], - 
['01/10/2010 05:43', '%m/%d/%Y %I:%M', - Timestamp('2010-01-10 05:43')], - ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S', - Timestamp('2010-01-10 13:56:01')] # , - # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', - # Timestamp('2010-01-10 20:14')], - # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', - # Timestamp('2010-01-10 07:40')], - # ['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p', - # Timestamp('2010-01-10 09:12:56')] - ] - for s, format, dt in data: - self.assertEqual(to_datetime(s, format=format), dt) - - def test_to_datetime_with_non_exact(self): - # GH 10834 - _skip_if_has_locale() - - # 8904 - # exact kw - if sys.version_info < (2, 7): - raise nose.SkipTest('on python version < 2.7') - - s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', - '19MAY11 00:00:00Z']) - result = to_datetime(s, format='%d%b%y', exact=False) - expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), - format='%d%b%y') - assert_series_equal(result, expected) - - def test_parse_nanoseconds_with_formula(self): - - # GH8989 - # truncating the nanoseconds when a format was provided - for v in ["2012-01-01 09:00:00.000000001", - "2012-01-01 09:00:00.000001", - "2012-01-01 09:00:00.001", - "2012-01-01 09:00:00.001000", - "2012-01-01 09:00:00.001000000", ]: - expected = pd.to_datetime(v) - result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f") - self.assertEqual(result, expected) - - def test_to_datetime_format_weeks(self): - data = [ - ['2009324', '%Y%W%w', Timestamp('2009-08-13')], - ['2013020', '%Y%U%w', Timestamp('2013-01-13')] - ] - for s, format, dt in data: - self.assertEqual(to_datetime(s, format=format), dt) - - -class TestToDatetimeInferFormat(tm.TestCase): - - def test_to_datetime_infer_datetime_format_consistent_format(self): - s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) - - test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', - '%Y-%m-%dT%H:%M:%S.%f'] - - for test_format in test_formats: - s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) - - with_format = pd.to_datetime(s_as_dt_strings, format=test_format) - no_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=False) - yes_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=True) - - # Whether the format is explicitly passed, inferred, or not - # inferred at all, the results should all be the same - self.assert_series_equal(with_format, no_infer) - self.assert_series_equal(no_infer, yes_infer) - - def test_to_datetime_infer_datetime_format_inconsistent_format(self): - s = pd.Series(np.array(['01/01/2011 00:00:00', - '01-02-2011 00:00:00', - '2011-01-03T00:00:00'])) - - # When the format is inconsistent, infer_datetime_format should just - # fall back to the default parsing - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) - - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_infer_datetime_format_series_with_nans(self): - s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, - '01/03/2011 00:00:00', np.nan])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): - s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', - '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - -
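# inference here must skip the leading NaNs and derive the format - # from the first valid string -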
tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_iso8601_noleading_0s(self): - # GH 11871 - s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) - expected = pd.Series([pd.Timestamp('2014-01-01'), - pd.Timestamp('2014-02-02'), - pd.Timestamp('2015-03-03')]) - tm.assert_series_equal(pd.to_datetime(s), expected) - tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) - - -class TestGuessDatetimeFormat(tm.TestCase): - - def test_guess_datetime_format_with_parseable_formats(self): - tm._skip_if_not_us_locale() - dt_string_to_format = (('20111230', '%Y%m%d'), - ('2011-12-30', '%Y-%m-%d'), - ('30-12-2011', '%d-%m-%Y'), - ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'), - ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'), - ('2011-12-30 00:00:00.000000', - '%Y-%m-%d %H:%M:%S.%f'), ) - - for dt_string, dt_format in dt_string_to_format: - self.assertEqual( - tools._guess_datetime_format(dt_string), - dt_format - ) - - def test_guess_datetime_format_with_dayfirst(self): - ambiguous_string = '01/01/2011' - self.assertEqual( - tools._guess_datetime_format(ambiguous_string, dayfirst=True), - '%d/%m/%Y' - ) - self.assertEqual( - tools._guess_datetime_format(ambiguous_string, dayfirst=False), - '%m/%d/%Y' - ) - - def test_guess_datetime_format_with_locale_specific_formats(self): - # The month names will vary depending on the locale, in which - # case these won't be parsed properly (dateutil can't parse them) - _skip_if_has_locale() - - dt_string_to_format = (('30/Dec/2011', '%d/%b/%Y'), - ('30/December/2011', '%d/%B/%Y'), - ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'), ) - - for dt_string, dt_format in dt_string_to_format: - self.assertEqual( - tools._guess_datetime_format(dt_string), - dt_format - ) - - def test_guess_datetime_format_invalid_inputs(self): - # A datetime string must include a year, month and a day for it - # to be guessable, in addition to being a string that looks like - # a datetime - invalid_dts = [ - '2013', - '01/2013', - '12:00:00', - '1/1/1/1', - 'this_is_not_a_datetime', - '51a', - 9, - datetime(2011, 1, 1), - ] - - for invalid_dt in invalid_dts: - self.assertTrue(tools._guess_datetime_format(invalid_dt) is None) - - def test_guess_datetime_format_nopadding(self): - # GH 11142 - dt_string_to_format = (('2011-1-1', '%Y-%m-%d'), - ('30-1-2011', '%d-%m-%Y'), - ('1/1/2011', '%m/%d/%Y'), - ('2011-1-1 00:00:00', '%Y-%m-%d %H:%M:%S'), - ('2011-1-1 0:0:0', '%Y-%m-%d %H:%M:%S'), - ('2011-1-3T00:00:0', '%Y-%m-%dT%H:%M:%S')) - - for dt_string, dt_format in dt_string_to_format: - self.assertEqual( - tools._guess_datetime_format(dt_string), - dt_format - ) - - def test_guess_datetime_format_for_array(self): - tm._skip_if_not_us_locale() - expected_format = '%Y-%m-%d %H:%M:%S.%f' - dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) - - test_arrays = [ - np.array([dt_string, dt_string, dt_string], dtype='O'), - np.array([np.nan, np.nan, dt_string], dtype='O'), - np.array([dt_string, 'random_string'], dtype='O'), - ] - - for test_array in test_arrays: - self.assertEqual( - tools._guess_datetime_format_for_array(test_array), - expected_format - ) - - format_for_string_of_nans = tools._guess_datetime_format_for_array( - np.array( - [np.nan, np.nan, np.nan], dtype='O')) - self.assertTrue(format_for_string_of_nans is None) - - -class TestTimestampToJulianDate(tm.TestCase): - def test_compare_1700(self): - r = Timestamp('1700-06-23').to_julian_date() - self.assertEqual(r,
2342145.5) - - def test_compare_2000(self): - r = Timestamp('2000-04-12').to_julian_date() - self.assertEqual(r, 2451646.5) - - def test_compare_2100(self): - r = Timestamp('2100-08-12').to_julian_date() - self.assertEqual(r, 2488292.5) - - def test_compare_hour01(self): - r = Timestamp('2000-08-12T01:00:00').to_julian_date() - self.assertEqual(r, 2451768.5416666666666666) - - def test_compare_hour13(self): - r = Timestamp('2000-08-12T13:00:00').to_julian_date() - self.assertEqual(r, 2451769.0416666666666666) - - -class TestDateTimeIndexToJulianDate(tm.TestCase): - def test_1700(self): - r1 = Float64Index([2345897.5, 2345898.5, 2345899.5, 2345900.5, - 2345901.5]) - r2 = date_range(start=Timestamp('1710-10-01'), periods=5, - freq='D').to_julian_date() - self.assertIsInstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_2000(self): - r1 = Float64Index([2451601.5, 2451602.5, 2451603.5, 2451604.5, - 2451605.5]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='D').to_julian_date() - self.assertIsInstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_hour(self): - r1 = Float64Index( - [2451601.5, 2451601.5416666666666666, 2451601.5833333333333333, - 2451601.625, 2451601.6666666666666666]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='H').to_julian_date() - self.assertIsInstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_minute(self): - r1 = Float64Index( - [2451601.5, 2451601.5006944444444444, 2451601.5013888888888888, - 2451601.5020833333333333, 2451601.5027777777777777]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='T').to_julian_date() - self.assertIsInstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_second(self): - r1 = Float64Index( - [2451601.5, 2451601.500011574074074, 2451601.5000231481481481, - 2451601.5000347222222222, 2451601.5000462962962962]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='S').to_julian_date() - self.assertIsInstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - -class TestDaysInMonth(tm.TestCase): - # tests for issue #10154 - def test_day_not_in_month_coerce(self): - self.assertTrue(isnull(to_datetime('2015-02-29', errors='coerce'))) - self.assertTrue(isnull(to_datetime('2015-02-29', format="%Y-%m-%d", - errors='coerce'))) - self.assertTrue(isnull(to_datetime('2015-02-32', format="%Y-%m-%d", - errors='coerce'))) - self.assertTrue(isnull(to_datetime('2015-04-31', format="%Y-%m-%d", - errors='coerce'))) - - def test_day_not_in_month_raise(self): - self.assertRaises(ValueError, to_datetime, '2015-02-29', - errors='raise') - self.assertRaises(ValueError, to_datetime, '2015-02-29', - errors='raise', format="%Y-%m-%d") - self.assertRaises(ValueError, to_datetime, '2015-02-32', - errors='raise', format="%Y-%m-%d") - self.assertRaises(ValueError, to_datetime, '2015-04-31', - errors='raise', format="%Y-%m-%d") - - def test_day_not_in_month_ignore(self): - self.assertEqual(to_datetime( - '2015-02-29', errors='ignore'), '2015-02-29') - self.assertEqual(to_datetime( - '2015-02-29', errors='ignore', format="%Y-%m-%d"), '2015-02-29') - self.assertEqual(to_datetime( - '2015-02-32', errors='ignore', format="%Y-%m-%d"), '2015-02-32') - self.assertEqual(to_datetime( - '2015-04-31', errors='ignore', format="%Y-%m-%d"), '2015-04-31') - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tseries/tests/test_tslib.py 
b/pandas/tseries/tests/test_tslib.py index 58ec1561b2535..cf5dbd671d38c 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1,29 +1,18 @@ import nose -from distutils.version import LooseVersion -import numpy as np - -from pandas import tslib, lib -import pandas._period as period import datetime +import numpy as np +from distutils.version import LooseVersion import pandas as pd -from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period, - to_datetime) -from pandas.tslib import get_timezone -from pandas._period import period_asfreq, period_ordinal -from pandas.tseries.index import date_range, DatetimeIndex -from pandas.tseries.frequencies import ( - get_freq, - RESO_US, RESO_MS, RESO_SEC, RESO_HR, RESO_DAY, RESO_MIN -) -import pandas.tseries.tools as tools -import pandas.tseries.offsets as offsets import pandas.util.testing as tm -import pandas.compat as compat -from pandas.compat.numpy import (np_datetime64_compat, - np_array_datetime64_compat) - -from pandas.util.testing import assert_series_equal, _skip_if_has_locale +from pandas import tslib, lib, compat +from pandas.tseries import offsets, tools +from pandas.tseries.frequencies import get_freq +from pandas.tseries.index import date_range, DatetimeIndex +from pandas.util.testing import _skip_if_has_locale +from pandas._period import period_ordinal, period_asfreq +from pandas.compat.numpy import np_array_datetime64_compat +from pandas.core.api import Timestamp, to_datetime, Index, Series class TestTsUtil(tm.TestCase): @@ -60,589 +49,6 @@ def test_to_datetime_bijective(self): Timestamp.min.value / 1000) -class TestTimestamp(tm.TestCase): - - def test_constructor(self): - base_str = '2014-07-01 09:00' - base_dt = datetime.datetime(2014, 7, 1, 9) - base_expected = 1404205200000000000 - - # confirm base representation is correct - import calendar - self.assertEqual(calendar.timegm(base_dt.timetuple()) * 1000000000, - base_expected) - - tests = [(base_str, base_dt, base_expected), - ('2014-07-01 10:00', datetime.datetime(2014, 7, 1, 10), - base_expected + 3600 * 1000000000), - ('2014-07-01 09:00:00.000008000', - datetime.datetime(2014, 7, 1, 9, 0, 0, 8), - base_expected + 8000), - ('2014-07-01 09:00:00.000000005', - Timestamp('2014-07-01 09:00:00.000000005'), - base_expected + 5)] - - tm._skip_if_no_pytz() - tm._skip_if_no_dateutil() - import pytz - import dateutil - timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), - ('US/Eastern', -4), ('dateutil/US/Pacific', -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5)] - - for date_str, date, expected in tests: - for result in [Timestamp(date_str), Timestamp(date)]: - # only with timestring - self.assertEqual(result.value, expected) - self.assertEqual(tslib.pydt_to_i8(result), expected) - - # re-creation shouldn't affect the internal value - result = Timestamp(result) - self.assertEqual(result.value, expected) - self.assertEqual(tslib.pydt_to_i8(result), expected) - - # with timezone - for tz, offset in timezones: - for result in [Timestamp(date_str, tz=tz), Timestamp(date, - tz=tz)]: - expected_tz = expected - offset * 3600 * 1000000000 - self.assertEqual(result.value, expected_tz) - self.assertEqual(tslib.pydt_to_i8(result), expected_tz) - - # should preserve tz - result = Timestamp(result) - self.assertEqual(result.value, expected_tz) - self.assertEqual(tslib.pydt_to_i8(result), expected_tz) - - # should convert to UTC - result = Timestamp(result, tz='UTC') - expected_utc = expected - offset *
3600 * 1000000000 - self.assertEqual(result.value, expected_utc) - self.assertEqual(tslib.pydt_to_i8(result), expected_utc) - - def test_constructor_with_stringoffset(self): - # GH 7833 - base_str = '2014-07-01 11:00:00+02:00' - base_dt = datetime.datetime(2014, 7, 1, 9) - base_expected = 1404205200000000000 - - # confirm base representation is correct - import calendar - self.assertEqual(calendar.timegm(base_dt.timetuple()) * 1000000000, - base_expected) - - tests = [(base_str, base_expected), - ('2014-07-01 12:00:00+02:00', - base_expected + 3600 * 1000000000), - ('2014-07-01 11:00:00.000008000+02:00', base_expected + 8000), - ('2014-07-01 11:00:00.000000005+02:00', base_expected + 5)] - - tm._skip_if_no_pytz() - tm._skip_if_no_dateutil() - import pytz - import dateutil - timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), - ('US/Eastern', -4), ('dateutil/US/Pacific', -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5)] - - for date_str, expected in tests: - for result in [Timestamp(date_str)]: - # only with timestring - self.assertEqual(result.value, expected) - self.assertEqual(tslib.pydt_to_i8(result), expected) - - # re-creation shouldn't affect the internal value - result = Timestamp(result) - self.assertEqual(result.value, expected) - self.assertEqual(tslib.pydt_to_i8(result), expected) - - # with timezone - for tz, offset in timezones: - result = Timestamp(date_str, tz=tz) - expected_tz = expected - self.assertEqual(result.value, expected_tz) - self.assertEqual(tslib.pydt_to_i8(result), expected_tz) - - # should preserve tz - result = Timestamp(result) - self.assertEqual(result.value, expected_tz) - self.assertEqual(tslib.pydt_to_i8(result), expected_tz) - - # should convert to UTC - result = Timestamp(result, tz='UTC') - expected_utc = expected - self.assertEqual(result.value, expected_utc) - self.assertEqual(tslib.pydt_to_i8(result), expected_utc) - - # This should be 2013-11-01 05:00 in UTC - # converted to Chicago tz - result = Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago') - self.assertEqual(result.value, Timestamp('2013-11-01 05:00').value) - expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa - self.assertEqual(repr(result), expected) - self.assertEqual(result, eval(repr(result))) - - # This should be 2013-11-01 05:00 in UTC - # converted to Tokyo tz (+09:00) - result = Timestamp('2013-11-01 00:00:00-0500', tz='Asia/Tokyo') - self.assertEqual(result.value, Timestamp('2013-11-01 05:00').value) - expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" - self.assertEqual(repr(result), expected) - self.assertEqual(result, eval(repr(result))) - - # GH11708 - # This should be 2015-11-18 10:00 in UTC - # converted to Asia/Katmandu - result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu") - self.assertEqual(result.value, Timestamp("2015-11-18 10:00").value) - expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')" - self.assertEqual(repr(result), expected) - self.assertEqual(result, eval(repr(result))) - - # This should be 2015-11-18 10:00 in UTC - # converted to Asia/Kolkata - result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata") - self.assertEqual(result.value, Timestamp("2015-11-18 10:00").value) - expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')" - self.assertEqual(repr(result), expected) - self.assertEqual(result, eval(repr(result))) - - def test_constructor_invalid(self): - with tm.assertRaisesRegexp(TypeError, 'Cannot 
convert input'): - Timestamp(slice(2)) - with tm.assertRaisesRegexp(ValueError, 'Cannot convert Period'): - Timestamp(Period('1000-01-01')) - - def test_constructor_positional(self): - # GH 10758 - with tm.assertRaises(TypeError): - Timestamp(2000, 1) - with tm.assertRaises(ValueError): - Timestamp(2000, 0, 1) - with tm.assertRaises(ValueError): - Timestamp(2000, 13, 1) - with tm.assertRaises(ValueError): - Timestamp(2000, 1, 0) - with tm.assertRaises(ValueError): - Timestamp(2000, 1, 32) - - # GH 11630 - self.assertEqual( - repr(Timestamp(2015, 11, 12)), - repr(Timestamp('20151112'))) - - self.assertEqual( - repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)), - repr(Timestamp('2015-11-12 01:02:03.999999'))) - - self.assertIs(Timestamp(None), pd.NaT) - - def test_constructor_keyword(self): - # GH 10758 - with tm.assertRaises(TypeError): - Timestamp(year=2000, month=1) - with tm.assertRaises(ValueError): - Timestamp(year=2000, month=0, day=1) - with tm.assertRaises(ValueError): - Timestamp(year=2000, month=13, day=1) - with tm.assertRaises(ValueError): - Timestamp(year=2000, month=1, day=0) - with tm.assertRaises(ValueError): - Timestamp(year=2000, month=1, day=32) - - self.assertEqual( - repr(Timestamp(year=2015, month=11, day=12)), - repr(Timestamp('20151112'))) - - self.assertEqual( - repr(Timestamp(year=2015, month=11, day=12, - hour=1, minute=2, second=3, microsecond=999999)), - repr(Timestamp('2015-11-12 01:02:03.999999'))) - - def test_constructor_fromordinal(self): - base = datetime.datetime(2000, 1, 1) - - ts = Timestamp.fromordinal(base.toordinal(), freq='D') - self.assertEqual(base, ts) - self.assertEqual(ts.freq, 'D') - self.assertEqual(base.toordinal(), ts.toordinal()) - - ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') - self.assertEqual(pd.Timestamp('2000-01-01', tz='US/Eastern'), ts) - self.assertEqual(base.toordinal(), ts.toordinal()) - - def test_constructor_offset_depr(self): - # GH 12160 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - ts = Timestamp('2011-01-01', offset='D') - self.assertEqual(ts.freq, 'D') - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - self.assertEqual(ts.offset, 'D') - - msg = "Can only specify freq or offset, not both" - with tm.assertRaisesRegexp(TypeError, msg): - Timestamp('2011-01-01', offset='D', freq='D') - - def test_constructor_offset_depr_fromordinal(self): - # GH 12160 - base = datetime.datetime(2000, 1, 1) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - ts = Timestamp.fromordinal(base.toordinal(), offset='D') - self.assertEqual(pd.Timestamp('2000-01-01'), ts) - self.assertEqual(ts.freq, 'D') - self.assertEqual(base.toordinal(), ts.toordinal()) - - msg = "Can only specify freq or offset, not both" - with tm.assertRaisesRegexp(TypeError, msg): - Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') - - def test_conversion(self): - # GH 9255 - ts = Timestamp('2000-01-01') - - result = ts.to_pydatetime() - expected = datetime.datetime(2000, 1, 1) - self.assertEqual(result, expected) - self.assertEqual(type(result), type(expected)) - - result = ts.to_datetime64() - expected = np.datetime64(ts.value, 'ns') - self.assertEqual(result, expected) - self.assertEqual(type(result), type(expected)) - self.assertEqual(result.dtype, expected.dtype) - - def test_repr(self): - tm._skip_if_no_pytz() - tm._skip_if_no_dateutil() - - dates = ['2014-03-07', '2014-01-01 09:00', - '2014-01-01 00:00:00.000000001'] - - # dateutil zone change (only matters 
for repr) - import dateutil - if (dateutil.__version__ >= LooseVersion('2.3') and - (dateutil.__version__ <= LooseVersion('2.4.0') or - dateutil.__version__ >= LooseVersion('2.6.0'))): - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific'] - else: - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/America/Los_Angeles'] - - freqs = ['D', 'M', 'S', 'N'] - - for date in dates: - for tz in timezones: - for freq in freqs: - - # avoid matching the timezone name - freq_repr = "'{0}'".format(freq) - if tz.startswith('dateutil'): - tz_repr = tz.replace('dateutil', '') - else: - tz_repr = tz - - date_only = Timestamp(date) - self.assertIn(date, repr(date_only)) - self.assertNotIn(tz_repr, repr(date_only)) - self.assertNotIn(freq_repr, repr(date_only)) - self.assertEqual(date_only, eval(repr(date_only))) - - date_tz = Timestamp(date, tz=tz) - self.assertIn(date, repr(date_tz)) - self.assertIn(tz_repr, repr(date_tz)) - self.assertNotIn(freq_repr, repr(date_tz)) - self.assertEqual(date_tz, eval(repr(date_tz))) - - date_freq = Timestamp(date, freq=freq) - self.assertIn(date, repr(date_freq)) - self.assertNotIn(tz_repr, repr(date_freq)) - self.assertIn(freq_repr, repr(date_freq)) - self.assertEqual(date_freq, eval(repr(date_freq))) - - date_tz_freq = Timestamp(date, tz=tz, freq=freq) - self.assertIn(date, repr(date_tz_freq)) - self.assertIn(tz_repr, repr(date_tz_freq)) - self.assertIn(freq_repr, repr(date_tz_freq)) - self.assertEqual(date_tz_freq, eval(repr(date_tz_freq))) - - # this can cause the tz field to be populated, but it's redundant to the - # information in the datestring - tm._skip_if_no_pytz() - import pytz # noqa - date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) - self.assertIn('2014-03-13 00:00:00-0400', repr(date_with_utc_offset)) - self.assertNotIn('tzoffset', repr(date_with_utc_offset)) - self.assertIn('pytz.FixedOffset(-240)', repr(date_with_utc_offset)) - expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'", - 'pytz.FixedOffset(-240)') - self.assertEqual(date_with_utc_offset, eval(expr)) - - def test_bounds_with_different_units(self): - out_of_bounds_dates = ('1677-09-21', '2262-04-12', ) - - time_units = ('D', 'h', 'm', 's', 'ms', 'us') - - for date_string in out_of_bounds_dates: - for unit in time_units: - self.assertRaises(ValueError, Timestamp, np.datetime64( - date_string, dtype='M8[%s]' % unit)) - - in_bounds_dates = ('1677-09-23', '2262-04-11', ) - - for date_string in in_bounds_dates: - for unit in time_units: - Timestamp(np.datetime64(date_string, dtype='M8[%s]' % unit)) - - def test_tz(self): - t = '2014-02-01 09:00' - ts = Timestamp(t) - local = ts.tz_localize('Asia/Tokyo') - self.assertEqual(local.hour, 9) - self.assertEqual(local, Timestamp(t, tz='Asia/Tokyo')) - conv = local.tz_convert('US/Eastern') - self.assertEqual(conv, Timestamp('2014-01-31 19:00', tz='US/Eastern')) - self.assertEqual(conv.hour, 19) - - # preserves nanosecond - ts = Timestamp(t) + offsets.Nano(5) - local = ts.tz_localize('Asia/Tokyo') - self.assertEqual(local.hour, 9) - self.assertEqual(local.nanosecond, 5) - conv = local.tz_convert('US/Eastern') - self.assertEqual(conv.nanosecond, 5) - self.assertEqual(conv.hour, 19) - - def test_tz_localize_ambiguous(self): - - ts = Timestamp('2014-11-02 01:00') - ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) - ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False) - - rng = date_range('2014-11-02', periods=3, freq='H', tz='US/Eastern') - self.assertEqual(rng[1], ts_dst) - 
self.assertEqual(rng[2], ts_no_dst) - self.assertRaises(ValueError, ts.tz_localize, 'US/Eastern', - ambiguous='infer') - - # GH 8025 - with tm.assertRaisesRegexp(TypeError, - 'Cannot localize tz-aware Timestamp, use ' - 'tz_convert for conversions'): - Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo') - - with tm.assertRaisesRegexp(TypeError, - 'Cannot convert tz-naive Timestamp, use ' - 'tz_localize to localize'): - Timestamp('2011-01-01').tz_convert('Asia/Tokyo') - - def test_tz_localize_nonexistent(self): - # See issue 13057 - from pytz.exceptions import NonExistentTimeError - times = ['2015-03-08 02:00', '2015-03-08 02:30', - '2015-03-29 02:00', '2015-03-29 02:30'] - timezones = ['US/Eastern', 'US/Pacific', - 'Europe/Paris', 'Europe/Belgrade'] - for t, tz in zip(times, timezones): - ts = Timestamp(t) - self.assertRaises(NonExistentTimeError, ts.tz_localize, - tz) - self.assertRaises(NonExistentTimeError, ts.tz_localize, - tz, errors='raise') - self.assertIs(ts.tz_localize(tz, errors='coerce'), - pd.NaT) - - def test_tz_localize_errors_ambiguous(self): - # See issue 13057 - from pytz.exceptions import AmbiguousTimeError - ts = pd.Timestamp('2015-11-1 01:00') - self.assertRaises(AmbiguousTimeError, - ts.tz_localize, 'US/Pacific', errors='coerce') - - def test_tz_localize_roundtrip(self): - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t) - localized = ts.tz_localize(tz) - self.assertEqual(localized, Timestamp(t, tz=tz)) - - with tm.assertRaises(TypeError): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - self.assertEqual(reset, ts) - self.assertTrue(reset.tzinfo is None) - - def test_tz_convert_roundtrip(self): - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t, tz='UTC') - converted = ts.tz_convert(tz) - - reset = converted.tz_convert(None) - self.assertEqual(reset, Timestamp(t)) - self.assertTrue(reset.tzinfo is None) - self.assertEqual(reset, - converted.tz_convert('UTC').tz_localize(None)) - - def test_barely_oob_dts(self): - one_us = np.timedelta64(1).astype('timedelta64[us]') - - # By definition we can't go out of bounds in [ns], so we - # convert the datetime64s to [us] so we can go out of bounds - min_ts_us = np.datetime64(Timestamp.min).astype('M8[us]') - max_ts_us = np.datetime64(Timestamp.max).astype('M8[us]') - - # No error for the min/max datetimes - Timestamp(min_ts_us) - Timestamp(max_ts_us) - - # One us less than the minimum is an error - self.assertRaises(ValueError, Timestamp, min_ts_us - one_us) - - # One us more than the maximum is an error - self.assertRaises(ValueError, Timestamp, max_ts_us + one_us) - - def test_utc_z_designator(self): - self.assertEqual(get_timezone( - Timestamp('2014-11-02 01:00Z').tzinfo), 'UTC') - - def test_now(self): - # #9000 - ts_from_string = Timestamp('now') - ts_from_method = Timestamp.now() - ts_datetime = datetime.datetime.now() - - ts_from_string_tz = Timestamp('now', tz='US/Eastern') - ts_from_method_tz = Timestamp.now(tz='US/Eastern') - - # Check that the delta between the times is less than 1s (arbitrarily - # small) - delta = Timedelta(seconds=1) - self.assertTrue(abs(ts_from_method - ts_from_string) < delta) - self.assertTrue(abs(ts_datetime - ts_from_method) < delta) - self.assertTrue(abs(ts_from_method_tz - 
ts_from_string_tz) < delta) - self.assertTrue(abs(ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None)) < delta) - - def test_today(self): - - ts_from_string = Timestamp('today') - ts_from_method = Timestamp.today() - ts_datetime = datetime.datetime.today() - - ts_from_string_tz = Timestamp('today', tz='US/Eastern') - ts_from_method_tz = Timestamp.today(tz='US/Eastern') - - # Check that the delta between the times is less than 1s (arbitrarily - # small) - delta = Timedelta(seconds=1) - self.assertTrue(abs(ts_from_method - ts_from_string) < delta) - self.assertTrue(abs(ts_datetime - ts_from_method) < delta) - self.assertTrue(abs(ts_from_method_tz - ts_from_string_tz) < delta) - self.assertTrue(abs(ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None)) < delta) - - def test_asm8(self): - np.random.seed(7960929) - ns = [Timestamp.min.value, Timestamp.max.value, 1000, ] - for n in ns: - self.assertEqual(Timestamp(n).asm8.view('i8'), - np.datetime64(n, 'ns').view('i8'), n) - self.assertEqual(Timestamp('nat').asm8.view('i8'), - np.datetime64('nat', 'ns').view('i8')) - - def test_fields(self): - def check(value, equal): - # that we are int/long like - self.assertTrue(isinstance(value, (int, compat.long))) - self.assertEqual(value, equal) - - # GH 10050 - ts = Timestamp('2015-05-10 09:06:03.000100001') - check(ts.year, 2015) - check(ts.month, 5) - check(ts.day, 10) - check(ts.hour, 9) - check(ts.minute, 6) - check(ts.second, 3) - self.assertRaises(AttributeError, lambda: ts.millisecond) - check(ts.microsecond, 100) - check(ts.nanosecond, 1) - check(ts.dayofweek, 6) - check(ts.quarter, 2) - check(ts.dayofyear, 130) - check(ts.week, 19) - check(ts.daysinmonth, 31) - check(ts.daysinmonth, 31) - - def test_nat_fields(self): - # GH 10050 - ts = Timestamp('NaT') - self.assertTrue(np.isnan(ts.year)) - self.assertTrue(np.isnan(ts.month)) - self.assertTrue(np.isnan(ts.day)) - self.assertTrue(np.isnan(ts.hour)) - self.assertTrue(np.isnan(ts.minute)) - self.assertTrue(np.isnan(ts.second)) - self.assertTrue(np.isnan(ts.microsecond)) - self.assertTrue(np.isnan(ts.nanosecond)) - self.assertTrue(np.isnan(ts.dayofweek)) - self.assertTrue(np.isnan(ts.quarter)) - self.assertTrue(np.isnan(ts.dayofyear)) - self.assertTrue(np.isnan(ts.week)) - self.assertTrue(np.isnan(ts.daysinmonth)) - self.assertTrue(np.isnan(ts.days_in_month)) - - def test_pprint(self): - # GH12622 - import pprint - nested_obj = {'foo': 1, - 'bar': [{'w': {'a': Timestamp('2011-01-01')}}] * 10} - result = pprint.pformat(nested_obj, width=50) - expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], - 'foo': 1}""" - self.assertEqual(result, expected) - - def to_datetime_depr(self): - # see gh-8254 - ts = Timestamp('2011-01-01') - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = datetime.datetime(2011, 1, 1) - result = ts.to_datetime() - self.assertEqual(result, expected) - - def to_pydatetime_nonzero_nano(self): - ts = Timestamp('2011-01-01 9:00:00.123456789') - - # Warn the user of data loss (nanoseconds). 
- with tm.assert_produces_warning(UserWarning, - check_stacklevel=False): - expected = datetime.datetime(2011, 1, 1, 9, 0, 0, 123456) - result = ts.to_pydatetime() - self.assertEqual(result, expected) - - class TestDatetimeParsingWrappers(tm.TestCase): def test_does_not_convert_mixed_integer(self): bad_date_strings = ('-50000', '999', '123.1234', 'm', 'T') @@ -1117,181 +523,6 @@ def test_parsing_timezone_offsets(self): ) -class TestTimestampNsOperations(tm.TestCase): - def setUp(self): - self.timestamp = Timestamp(datetime.datetime.utcnow()) - - def assert_ns_timedelta(self, modified_timestamp, expected_value): - value = self.timestamp.value - modified_value = modified_timestamp.value - - self.assertEqual(modified_value - value, expected_value) - - def test_timedelta_ns_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'ns'), - -123) - - def test_timedelta_ns_based_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64( - 1234567898, 'ns'), 1234567898) - - def test_timedelta_us_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'us'), - -123000) - - def test_timedelta_ms_arithmetic(self): - time = self.timestamp + np.timedelta64(-123, 'ms') - self.assert_ns_timedelta(time, -123000000) - - def test_nanosecond_string_parsing(self): - ts = Timestamp('2013-05-01 07:15:45.123456789') - # GH 7878 - expected_repr = '2013-05-01 07:15:45.123456789' - expected_value = 1367392545123456789 - self.assertEqual(ts.value, expected_value) - self.assertIn(expected_repr, repr(ts)) - - ts = Timestamp('2013-05-01 07:15:45.123456789+09:00', tz='Asia/Tokyo') - self.assertEqual(ts.value, expected_value - 9 * 3600 * 1000000000) - self.assertIn(expected_repr, repr(ts)) - - ts = Timestamp('2013-05-01 07:15:45.123456789', tz='UTC') - self.assertEqual(ts.value, expected_value) - self.assertIn(expected_repr, repr(ts)) - - ts = Timestamp('2013-05-01 07:15:45.123456789', tz='US/Eastern') - self.assertEqual(ts.value, expected_value + 4 * 3600 * 1000000000) - self.assertIn(expected_repr, repr(ts)) - - # GH 10041 - ts = Timestamp('20130501T071545.123456789') - self.assertEqual(ts.value, expected_value) - self.assertIn(expected_repr, repr(ts)) - - def test_nanosecond_timestamp(self): - # GH 7610 - expected = 1293840000000000005 - t = Timestamp('2011-01-01') + offsets.Nano(5) - self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") - self.assertEqual(t.value, expected) - self.assertEqual(t.nanosecond, 5) - - t = Timestamp(t) - self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") - self.assertEqual(t.value, expected) - self.assertEqual(t.nanosecond, 5) - - t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000005Z')) - self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") - self.assertEqual(t.value, expected) - self.assertEqual(t.nanosecond, 5) - - expected = 1293840000000000010 - t = t + offsets.Nano(5) - self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") - self.assertEqual(t.value, expected) - self.assertEqual(t.nanosecond, 10) - - t = Timestamp(t) - self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") - self.assertEqual(t.value, expected) - self.assertEqual(t.nanosecond, 10) - - t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000010Z')) - self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") - self.assertEqual(t.value, expected) - self.assertEqual(t.nanosecond, 10) - - def test_nat_arithmetic(self): - # GH 
6873 - i = 2 - f = 1.5 - - for (left, right) in [(pd.NaT, i), (pd.NaT, f), (pd.NaT, np.nan)]: - self.assertIs(left / right, pd.NaT) - self.assertIs(left * right, pd.NaT) - self.assertIs(right * left, pd.NaT) - with tm.assertRaises(TypeError): - right / left - - # Timestamp / datetime - t = Timestamp('2014-01-01') - dt = datetime.datetime(2014, 1, 1) - for (left, right) in [(pd.NaT, pd.NaT), (pd.NaT, t), (pd.NaT, dt)]: - # NaT __add__ or __sub__ Timestamp-like (or inverse) returns NaT - self.assertIs(right + left, pd.NaT) - self.assertIs(left + right, pd.NaT) - self.assertIs(left - right, pd.NaT) - self.assertIs(right - left, pd.NaT) - - # timedelta-like - # offsets are tested in test_offsets.py - - delta = datetime.timedelta(3600) - td = Timedelta('5s') - - for (left, right) in [(pd.NaT, delta), (pd.NaT, td)]: - # NaT + timedelta-like returns NaT - self.assertIs(right + left, pd.NaT) - self.assertIs(left + right, pd.NaT) - self.assertIs(right - left, pd.NaT) - self.assertIs(left - right, pd.NaT) - - # GH 11718 - tm._skip_if_no_pytz() - import pytz - - t_utc = Timestamp('2014-01-01', tz='UTC') - t_tz = Timestamp('2014-01-01', tz='US/Eastern') - dt_tz = pytz.timezone('Asia/Tokyo').localize(dt) - - for (left, right) in [(pd.NaT, t_utc), (pd.NaT, t_tz), - (pd.NaT, dt_tz)]: - # NaT __add__ or __sub__ Timestamp-like (or inverse) returns NaT - self.assertIs(right + left, pd.NaT) - self.assertIs(left + right, pd.NaT) - self.assertIs(left - right, pd.NaT) - self.assertIs(right - left, pd.NaT) - - # int addition / subtraction - for (left, right) in [(pd.NaT, 2), (pd.NaT, 0), (pd.NaT, -3)]: - self.assertIs(right + left, pd.NaT) - self.assertIs(left + right, pd.NaT) - self.assertIs(left - right, pd.NaT) - self.assertIs(right - left, pd.NaT) - - def test_nat_arithmetic_index(self): - # GH 11718 - - # datetime - tm._skip_if_no_pytz() - - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], name='x') - exp = pd.DatetimeIndex([pd.NaT, pd.NaT], name='x') - self.assert_index_equal(dti + pd.NaT, exp) - self.assert_index_equal(pd.NaT + dti, exp) - - dti_tz = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], - tz='US/Eastern', name='x') - exp = pd.DatetimeIndex([pd.NaT, pd.NaT], name='x', tz='US/Eastern') - self.assert_index_equal(dti_tz + pd.NaT, exp) - self.assert_index_equal(pd.NaT + dti_tz, exp) - - exp = pd.TimedeltaIndex([pd.NaT, pd.NaT], name='x') - for (left, right) in [(pd.NaT, dti), (pd.NaT, dti_tz)]: - self.assert_index_equal(left - right, exp) - self.assert_index_equal(right - left, exp) - - # timedelta - tdi = pd.TimedeltaIndex(['1 day', '2 day'], name='x') - exp = pd.DatetimeIndex([pd.NaT, pd.NaT], name='x') - for (left, right) in [(pd.NaT, tdi)]: - self.assert_index_equal(left + right, exp) - self.assert_index_equal(right + left, exp) - self.assert_index_equal(left - right, exp) - self.assert_index_equal(right - left, exp) - - class TestTslib(tm.TestCase): def test_intraday_conversion_factors(self): self.assertEqual(period_asfreq( @@ -1461,86 +692,6 @@ def _check_round(freq, expected): stamp.round('foo') -class TestTimestampOps(tm.TestCase): - def test_timestamp_and_datetime(self): - self.assertEqual((Timestamp(datetime.datetime( - 2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) - self.assertEqual((datetime.datetime(2013, 10, 12) - - Timestamp(datetime.datetime(2013, 10, 13))).days, -1) - - def test_timestamp_and_series(self): - timestamp_series = Series(date_range('2014-03-17', periods=2, freq='D', - tz='US/Eastern')) - first_timestamp = timestamp_series[0] - - delta_series = 
Series([np.timedelta64(0, 'D'), np.timedelta64(1, 'D')]) - assert_series_equal(timestamp_series - first_timestamp, delta_series) - assert_series_equal(first_timestamp - timestamp_series, -delta_series) - - def test_addition_subtraction_types(self): - # Assert on the types resulting from Timestamp +/- various date/time - # objects - datetime_instance = datetime.datetime(2014, 3, 4) - timedelta_instance = datetime.timedelta(seconds=1) - # build a timestamp with a frequency, since then it supports - # addition/subtraction of integers - timestamp_instance = date_range(datetime_instance, periods=1, - freq='D')[0] - - self.assertEqual(type(timestamp_instance + 1), Timestamp) - self.assertEqual(type(timestamp_instance - 1), Timestamp) - - # Timestamp + datetime not supported, though subtraction is supported - # and yields timedelta more tests in tseries/base/tests/test_base.py - self.assertEqual( - type(timestamp_instance - datetime_instance), Timedelta) - self.assertEqual( - type(timestamp_instance + timedelta_instance), Timestamp) - self.assertEqual( - type(timestamp_instance - timedelta_instance), Timestamp) - - # Timestamp +/- datetime64 not supported, so not tested (could possibly - # assert error raised?) - timedelta64_instance = np.timedelta64(1, 'D') - self.assertEqual( - type(timestamp_instance + timedelta64_instance), Timestamp) - self.assertEqual( - type(timestamp_instance - timedelta64_instance), Timestamp) - - def test_addition_subtraction_preserve_frequency(self): - timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] - timedelta_instance = datetime.timedelta(days=1) - original_freq = timestamp_instance.freq - self.assertEqual((timestamp_instance + 1).freq, original_freq) - self.assertEqual((timestamp_instance - 1).freq, original_freq) - self.assertEqual( - (timestamp_instance + timedelta_instance).freq, original_freq) - self.assertEqual( - (timestamp_instance - timedelta_instance).freq, original_freq) - - timedelta64_instance = np.timedelta64(1, 'D') - self.assertEqual( - (timestamp_instance + timedelta64_instance).freq, original_freq) - self.assertEqual( - (timestamp_instance - timedelta64_instance).freq, original_freq) - - def test_resolution(self): - - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', - 'S', 'L', 'U'], - [RESO_DAY, RESO_DAY, - RESO_DAY, RESO_DAY, - RESO_HR, RESO_MIN, - RESO_SEC, RESO_MS, - RESO_US]): - for tz in [None, 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Eastern']: - idx = date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) - result = period.resolution(idx.asi8, idx.tz) - self.assertEqual(result, expected) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From c5342fd21fc67bbe4ef099dc24de415c493162ca Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Feb 2017 11:31:12 -0500 Subject: [PATCH 098/338] TST: making test files a bit more balanced TST: move parts of test_datetimelike.py to indexes/datetimes --- pandas/sparse/tests/test_frame.py | 44 ++ pandas/tests/frame/test_missing.py | 34 ++ pandas/tests/frame/test_timeseries.py | 58 ++- pandas/tests/indexes/datetimelike.py | 40 ++ pandas/tests/indexes/datetimes/test_astype.py | 62 +++ .../indexes/datetimes/test_construction.py | 17 + .../tests/indexes/datetimes/test_datetime.py | 8 + .../indexes/datetimes/test_datetimelike.py | 76 +++ pandas/tests/indexes/datetimes/test_misc.py | 62 ++- pandas/tests/indexes/test_datetimelike.py | 111 +--- pandas/tests/scalar/test_timestamp.py | 59 +++ 
pandas/tests/series/test_indexing.py | 129 +++++ pandas/tests/series/test_missing.py | 53 +- pandas/tests/series/test_timeseries.py | 489 ------------------ 14 files changed, 638 insertions(+), 604 deletions(-) create mode 100644 pandas/tests/indexes/datetimelike.py create mode 100644 pandas/tests/indexes/datetimes/test_datetimelike.py diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index b9e8a31393931..23bb827974c61 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -705,6 +705,50 @@ def test_fillna_fill_value(self): tm.assert_frame_equal(sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False) + def test_sparse_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index, method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index, method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + def test_sparse_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + def test_rename(self): # just check this works renamed = self.frame.rename(index=str) # noqa diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index c4f037e85edf6..a8c9c72956463 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -322,6 +322,40 @@ def test_bfill(self): assert_frame_equal(self.tsframe.bfill(), self.tsframe.fillna(method='bfill')) + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index, method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + 
tm.assert_frame_equal(result, expected) + def test_fillna_skip_certain_blocks(self): # don't try to fill boolean, int blocks diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 85967e9eda0d6..934aafc500611 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -8,7 +8,9 @@ from numpy.random import randn import numpy as np -from pandas import DataFrame, Series, Index, Timestamp, DatetimeIndex +from pandas import (DataFrame, Series, Index, + Timestamp, DatetimeIndex, + to_datetime, date_range) import pandas as pd import pandas.tseries.offsets as offsets @@ -117,6 +119,60 @@ def test_pct_change_shift_over_nas(self): edf = DataFrame({'a': expected, 'b': expected}) assert_frame_equal(chg, edf) + def test_frame_ctor_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + dates = np.asarray(rng) + + df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) + self.assertTrue(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))) + + def test_frame_add_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + df = DataFrame(index=np.arange(len(rng))) + + df['A'] = rng + self.assertTrue(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) + + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({'year': date_range('1/1/1700', periods=50, + freq='A-DEC')}) + # it works! + repr(df) + + def test_frame_add_datetime64_col_other_units(self): + n = 100 + + units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] + + ns_dtype = np.dtype('M8[ns]') + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + ex_vals = to_datetime(vals.astype('O')).values + + self.assertEqual(df[unit].dtype, ns_dtype) + self.assertTrue((df[unit].values == ex_vals).all()) + + # Test insertion into existing datetime64 column + df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + tmp = df.copy() + + tmp['dates'] = vals + ex_vals = to_datetime(vals.astype('O')).values + + self.assertTrue((tmp['dates'].values == ex_vals).all()) + def test_shift(self): # naive shift shiftedFrame = self.tsframe.shift(5) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py new file mode 100644 index 0000000000000..964511a2e9d5b --- /dev/null +++ b/pandas/tests/indexes/datetimelike.py @@ -0,0 +1,40 @@ +""" generic datetimelike tests """ + +from .common import Base +import pandas.util.testing as tm + + +class DatetimeLike(Base): + + def test_shift_identity(self): + + idx = self.create_index() + self.assert_index_equal(idx, idx.shift(0)) + + def test_str(self): + + # test the string repr + idx = self.create_index() + idx.name = 'foo' + self.assertFalse("length=%s" % len(idx) in str(idx)) + self.assertTrue("'foo'" in str(idx)) + self.assertTrue(idx.__class__.__name__ in str(idx)) + + if hasattr(idx, 'tz'): + if idx.tz is not None: + self.assertTrue(idx.tz in str(idx)) + if hasattr(idx, 'freq'): + self.assertTrue("freq='%s'" % idx.freqstr in str(idx)) + + def test_view(self): + super(DatetimeLike, self).test_view() + + i = self.create_index() + + i_view = i.view('i8') + result = self._holder(i) + tm.assert_index_equal(result, i) + + i_view = 
i.view(self._holder) + result = self._holder(i) + tm.assert_index_equal(result, i_view) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index f64d18a69a093..d452a7e1840d7 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -1,5 +1,6 @@ import numpy as np +from datetime import datetime import pandas as pd import pandas.util.testing as tm from pandas import (DatetimeIndex, date_range, Series, NaT, Index, Timestamp, @@ -120,3 +121,64 @@ def test_astype_raises(self): self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') self.assertRaises(ValueError, idx.astype, 'datetime64') self.assertRaises(ValueError, idx.astype, 'datetime64[D]') + + def test_index_convert_to_datetime_array(self): + tm._skip_if_no_pytz() + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assertIsInstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assertIsInstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') + rng_utc = date_range('20090415', '20090519', tz='utc') + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assertIsInstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assertIsInstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', + tz=pytz.timezone('US/Eastern')) + rng_utc = date_range('20090415', '20090519', tz=pytz.utc) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assertIsInstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assertIsInstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', + tz='dateutil/US/Eastern') + rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index f8eca0f0d91d0..03bc0e0c554b0 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -2,6 +2,7 @@ from datetime import timedelta import pandas as pd +from pandas import tslib import pandas.util.testing as tm from pandas.tslib import OutOfBoundsDatetime from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range, @@ -477,6 +478,22 @@ def test_dti_constructor_numpy_timeunits(self): tm.assert_index_equal(DatetimeIndex(values), base) tm.assert_index_equal(to_datetime(values), base) + def test_ctor_str_intraday(self): + rng = DatetimeIndex(['1-1-2000 00:00:01']) + self.assertEqual(rng[0].second, 1) + + def test_is_(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + self.assertTrue(dti.is_(dti)) + 
self.assertTrue(dti.is_(dti.view())) + self.assertFalse(dti.is_(dti.copy())) + + def test_index_cast_datetime64_other_units(self): + arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') + idx = Index(arr) + + self.assertTrue((idx.values == tslib.cast_to_nanoseconds(arr)).all()) + def test_constructor_int64_nocopy(self): # #1624 arr = np.arange(1000, dtype=np.int64) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index f92fca6ecfa14..628cb9df94e39 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -94,6 +94,14 @@ def test_get_indexer(self): with tm.assertRaises(ValueError): idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') + def test_reasonable_keyerror(self): + # GH #1062 + index = DatetimeIndex(['1/3/2000']) + try: + index.get_loc('1/1/2000') + except KeyError as e: + self.assertIn('2000', str(e)) + def test_roundtrip_pickle_with_tz(self): # GH 8367 diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py new file mode 100644 index 0000000000000..b32801a8bcf25 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -0,0 +1,76 @@ +""" generic tests from the Datetimelike class """ + +import numpy as np +import pandas as pd +from pandas.util import testing as tm +from pandas import Series, Index, DatetimeIndex, date_range + +from ..datetimelike import DatetimeLike + +class TestDatetimeIndex(DatetimeLike, tm.TestCase): + _holder = DatetimeIndex + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(index=tm.makeDateIndex(10)) + self.setup_indices() + + def create_index(self): + return date_range('20130101', periods=5) + + def test_shift(self): + + # test shift for datetimeIndex and non datetimeIndex + # GH8083 + + drange = self.create_index() + result = drange.shift(1) + expected = DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', + '2013-01-05', + '2013-01-06'], freq='D') + self.assert_index_equal(result, expected) + + result = drange.shift(-1) + expected = DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02', + '2013-01-03', '2013-01-04'], + freq='D') + self.assert_index_equal(result, expected) + + result = drange.shift(3, freq='2D') + expected = DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09', + '2013-01-10', + '2013-01-11'], freq='D') + self.assert_index_equal(result, expected) + + def test_pickle_compat_construction(self): + pass + + def test_intersection(self): + first = self.index + second = self.index[5:] + intersect = first.intersection(second) + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + third = Index(['a', 'b', 'c']) + result = first.intersection(third) + expected = pd.Index([], dtype=object) + self.assert_index_equal(result, expected) + + def test_union(self): + first = self.index[:5] + second = self.index[5:] + everything = self.index + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 
4685df580190b..92aad5a0b1997 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -4,8 +4,9 @@ import pandas.lib as lib import pandas.util.testing as tm from pandas import (Index, DatetimeIndex, datetime, offsets, to_datetime, - Series, DataFrame, Float64Index, date_range, Timestamp) - + Series, DataFrame, Float64Index, date_range, + Timestamp, isnull) +from pandas import tslib from pandas.util.testing import assert_series_equal @@ -143,6 +144,63 @@ def test_datetimeindex_integers_shift(self): expected = rng.shift(-5) tm.assert_index_equal(result, expected) + def test_string_na_nat_conversion(self): + # GH #999, #858 + + from pandas.compat import parse_date + + strings = np.array(['1/1/2000', '1/2/2000', np.nan, + '1/4/2000, 12:34:56'], dtype=object) + + expected = np.empty(4, dtype='M8[ns]') + for i, val in enumerate(strings): + if isnull(val): + expected[i] = tslib.iNaT + else: + expected[i] = parse_date(val) + + result = tslib.array_to_datetime(strings) + tm.assert_almost_equal(result, expected) + + result2 = to_datetime(strings) + tm.assertIsInstance(result2, DatetimeIndex) + tm.assert_numpy_array_equal(result, result2.values) + + malformed = np.array(['1/100/2000', np.nan], dtype=object) + + # GH 10636, default is now 'raise' + self.assertRaises(ValueError, + lambda: to_datetime(malformed, errors='raise')) + + result = to_datetime(malformed, errors='ignore') + tm.assert_numpy_array_equal(result, malformed) + + self.assertRaises(ValueError, to_datetime, malformed, errors='raise') + + idx = ['a', 'b', 'c', 'd', 'e'] + series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, + '1/5/2000'], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000'), np.nan, + to_datetime('1/3/2000'), np.nan, + to_datetime('1/5/2000')], index=idx, name='foo') + + result = to_datetime(series) + dresult = to_datetime(dseries) + + expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) + for i in range(5): + x = series[i] + if isnull(x): + expected[i] = tslib.iNaT + else: + expected[i] = to_datetime(x) + + assert_series_equal(result, expected, check_names=False) + self.assertEqual(result.name, 'foo') + + assert_series_equal(dresult, expected, check_names=False) + self.assertEqual(dresult.name, 'foo') + def test_datetimeindex_repr_short(self): dr = date_range(start='1/1/2012', periods=1) repr(dr) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 32e4029a57fe9..e5a4ced4ced4d 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -4,119 +4,14 @@ from datetime import timedelta import pandas as pd -import pandas.util.testing as tm +from pandas.util import testing as tm from pandas import (DatetimeIndex, Float64Index, Index, Int64Index, NaT, Period, PeriodIndex, Series, Timedelta, - TimedeltaIndex, date_range, period_range, + TimedeltaIndex, period_range, timedelta_range, notnull) -from .common import Base - - -class DatetimeLike(Base): - - def test_shift_identity(self): - - idx = self.create_index() - self.assert_index_equal(idx, idx.shift(0)) - - def test_str(self): - - # test the string repr - idx = self.create_index() - idx.name = 'foo' - self.assertFalse("length=%s" % len(idx) in str(idx)) - self.assertTrue("'foo'" in str(idx)) - self.assertTrue(idx.__class__.__name__ in str(idx)) - - if hasattr(idx, 'tz'): - if idx.tz is not None: - self.assertTrue(idx.tz in str(idx)) - if hasattr(idx, 'freq'): - self.assertTrue("freq='%s'" % idx.freqstr 
in str(idx)) - - def test_view(self): - super(DatetimeLike, self).test_view() - - i = self.create_index() - - i_view = i.view('i8') - result = self._holder(i) - tm.assert_index_equal(result, i) - - i_view = i.view(self._holder) - result = self._holder(i) - tm.assert_index_equal(result, i_view) - - -class TestDatetimeIndex(DatetimeLike, tm.TestCase): - _holder = DatetimeIndex - _multiprocess_can_split_ = True - - def setUp(self): - self.indices = dict(index=tm.makeDateIndex(10)) - self.setup_indices() - - def create_index(self): - return date_range('20130101', periods=5) - - def test_shift(self): - - # test shift for datetimeIndex and non datetimeIndex - # GH8083 - - drange = self.create_index() - result = drange.shift(1) - expected = DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', - '2013-01-06'], freq='D') - self.assert_index_equal(result, expected) - - result = drange.shift(-1) - expected = DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02', - '2013-01-03', '2013-01-04'], - freq='D') - self.assert_index_equal(result, expected) - - result = drange.shift(3, freq='2D') - expected = DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09', - '2013-01-10', - '2013-01-11'], freq='D') - self.assert_index_equal(result, expected) - - def test_pickle_compat_construction(self): - pass - - def test_intersection(self): - first = self.index - second = self.index[5:] - intersect = first.intersection(second) - self.assertTrue(tm.equalContents(intersect, second)) - - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.intersection(case) - self.assertTrue(tm.equalContents(result, second)) - - third = Index(['a', 'b', 'c']) - result = first.intersection(third) - expected = pd.Index([], dtype=object) - self.assert_index_equal(result, expected) - - def test_union(self): - first = self.index[:5] - second = self.index[5:] - everything = self.index - union = first.union(second) - self.assertTrue(tm.equalContents(union, everything)) - - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.union(case) - self.assertTrue(tm.equalContents(result, everything)) +from .datetimelike import DatetimeLike class TestPeriodIndex(DatetimeLike, tm.TestCase): diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 94369ebbd0a19..f686f1aa6dc47 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -566,6 +566,65 @@ def test_nat_fields(self): self.assertTrue(np.isnan(ts.daysinmonth)) self.assertTrue(np.isnan(ts.days_in_month)) + def test_nat_vector_field_access(self): + idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000']) + + fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', + 'days_in_month', 'is_leap_year'] + + for field in fields: + result = getattr(idx, field) + expected = [getattr(x, field) for x in idx] + self.assert_numpy_array_equal(result, np.array(expected)) + + s = pd.Series(idx) + + for field in fields: + result = getattr(s.dt, field) + expected = [getattr(x, field) for x in idx] + self.assert_series_equal(result, pd.Series(expected)) + + def test_nat_scalar_field_access(self): + fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', + 'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name'] + for field in fields: + result 
= getattr(NaT, field) + self.assertTrue(np.isnan(result)) + + def test_NaT_methods(self): + # GH 9513 + raise_methods = ['astimezone', 'combine', 'ctime', 'dst', + 'fromordinal', 'fromtimestamp', 'isocalendar', + 'strftime', 'strptime', 'time', 'timestamp', + 'timetuple', 'timetz', 'toordinal', 'tzname', + 'utcfromtimestamp', 'utcnow', 'utcoffset', + 'utctimetuple'] + nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today'] + nan_methods = ['weekday', 'isoweekday'] + + for method in raise_methods: + if hasattr(NaT, method): + self.assertRaises(ValueError, getattr(NaT, method)) + + for method in nan_methods: + if hasattr(NaT, method): + self.assertTrue(np.isnan(getattr(NaT, method)())) + + for method in nat_methods: + if hasattr(NaT, method): + # see gh-8254 + exp_warning = None + if method == 'to_datetime': + exp_warning = FutureWarning + with tm.assert_produces_warning( + exp_warning, check_stacklevel=False): + self.assertIs(getattr(NaT, method)(), NaT) + + # GH 12300 + self.assertEqual(NaT.isoformat(), 'NaT') + def test_pprint(self): # GH12622 import pprint diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index e6209a853e958..d4b6e7dd5349f 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -346,6 +346,135 @@ def test_getitem_setitem_slice_integers(self): self.assertTrue((s[:4] == 0).all()) self.assertTrue(not (s[4:] == 0).any()) + def test_getitem_setitem_datetime_tz_pytz(self): + tm._skip_if_no_pytz() + from pytz import timezone as tz + + from pandas import date_range + + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + + # comparison dates with datetime MUST be localized! 
+ date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) + result[date] = 0 + result[date] = ts[4] + assert_series_equal(result, ts) + + def test_getitem_setitem_datetime_tz_dateutil(self): + tm._skip_if_no_dateutil() + from dateutil.tz import tzutc + from pandas.tslib import _dateutil_gettz as gettz + + tz = lambda x: tzutc() if x == 'UTC' else gettz( + x) # handle special case for utc in dateutil + + from pandas import date_range + + N = 50 + + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', + tz='America/New_York') + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] + assert_series_equal(result, ts) + + def test_getitem_setitem_periodindex(self): + from pandas import period_range + + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04"] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result["1990-01-01 04"] = 0 + result["1990-01-01 04"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04":"1990-01-01 07"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04":"1990-01-01 07"] = 0 + result["1990-01-01 04":"1990-01-01 07"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04" + rb = "1990-01-01 07" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + # GH 2782 + result = ts[ts.index[4]] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + def test_getitem_median_slice_bug(self): + index = date_range('20090415', '20090519', freq='2B') + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + result = s[indexer] + expected = s[indexer[0]] + assert_series_equal(result, expected) + def test_getitem_out_of_bounds(self): # don't segfault, GH #495 self.assertRaises(IndexError, self.ts.__getitem__, len(self.ts)) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 91da36161e188..8cf0d190a95cc 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -8,11 +8,11 @@ import numpy as np import pandas as pd -from pandas import (Series, isnull, date_range, - MultiIndex, Index) -from pandas.tseries.index import Timestamp +from pandas import (Series, DataFrame, isnull, date_range, + MultiIndex, Index, Timestamp) from pandas.compat import range -from pandas.util.testing import assert_series_equal +from pandas import tslib +from pandas.util.testing import assert_series_equal, assert_frame_equal import 
pandas.util.testing as tm from .common import TestData @@ -283,6 +283,43 @@ def test_fillna_raise(self): self.assertRaises(TypeError, s.fillna, [1, 2]) self.assertRaises(TypeError, s.fillna, (1, 2)) + def test_fillna_nat(self): + series = Series([0, 1, 2, tslib.iNaT], dtype='M8[ns]') + + filled = series.fillna(method='pad') + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.values[3] = expected.values[2] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='pad') + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + series = Series([tslib.iNaT, 0, 1, 2], dtype='M8[ns]') + + filled = series.fillna(method='bfill') + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='bfill') + filled2 = df.fillna(value=series[1]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + def test_isnull_for_inf(self): s = Series(['a', np.inf, np.nan, 1.0]) with pd.option_context('mode.use_inf_as_null', True): @@ -518,6 +555,14 @@ def test_pad_nan(self): assert_series_equal(x[1:], expected[1:]) self.assertTrue(np.isnan(x[0]), np.isnan(expected[0])) + def test_pad_require_monotonicity(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + # neither monotonic increasing or decreasing + rng2 = rng[[1, 0, 2]] + + self.assertRaises(ValueError, rng2.get_indexer, rng, method='pad') + def test_dropna_preserve_name(self): self.ts[:5] = np.nan result = self.ts.dropna() diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 073b8bfeee131..571a802e37211 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -230,126 +230,6 @@ def test_truncate(self): before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset) - def test_getitem_setitem_datetime_tz_pytz(self): - tm._skip_if_no_pytz() - from pytz import timezone as tz - - from pandas import date_range - - N = 50 - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(N), index=rng) - - # also test Timestamp tz handling, GH #2789 - result = ts.copy() - result["1990-01-01 09:00:00+00:00"] = 0 - result["1990-01-01 09:00:00+00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - result["1990-01-01 03:00:00-06:00"] = 0 - result["1990-01-01 03:00:00-06:00"] = ts[4] - assert_series_equal(result, ts) - - # repeat with datetimes - result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - - # comparison dates with datetime MUST be localized! 
- date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) - result[date] = 0 - result[date] = ts[4] - assert_series_equal(result, ts) - - def test_getitem_setitem_datetime_tz_dateutil(self): - tm._skip_if_no_dateutil() - from dateutil.tz import tzutc - from pandas.tslib import _dateutil_gettz as gettz - - tz = lambda x: tzutc() if x == 'UTC' else gettz( - x) # handle special case for utc in dateutil - - from pandas import date_range - - N = 50 - - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', - tz='America/New_York') - ts = Series(np.random.randn(N), index=rng) - - # also test Timestamp tz handling, GH #2789 - result = ts.copy() - result["1990-01-01 09:00:00+00:00"] = 0 - result["1990-01-01 09:00:00+00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - result["1990-01-01 03:00:00-06:00"] = 0 - result["1990-01-01 03:00:00-06:00"] = ts[4] - assert_series_equal(result, ts) - - # repeat with datetimes - result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] - assert_series_equal(result, ts) - - def test_getitem_setitem_periodindex(self): - from pandas import period_range - - N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') - ts = Series(np.random.randn(N), index=rng) - - result = ts["1990-01-01 04"] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts.copy() - result["1990-01-01 04"] = 0 - result["1990-01-01 04"] = ts[4] - assert_series_equal(result, ts) - - result = ts["1990-01-01 04":"1990-01-01 07"] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-01 04":"1990-01-01 07"] = 0 - result["1990-01-01 04":"1990-01-01 07"] = ts[4:8] - assert_series_equal(result, ts) - - lb = "1990-01-01 04" - rb = "1990-01-01 07" - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - # GH 2782 - result = ts[ts.index[4]] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts[ts.index[4:8]] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] - assert_series_equal(result, ts) - def test_asfreq(self): ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime( 2009, 11, 30), datetime(2009, 12, 31)]) @@ -513,12 +393,6 @@ def test_empty_series_ops(self): assert_series_equal(a, b + a) self.assertRaises(TypeError, lambda x, y: x - y, b, a) - def test_is_(self): - dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') - self.assertTrue(dti.is_(dti)) - self.assertTrue(dti.is_(dti.view())) - self.assertFalse(dti.is_(dti.copy())) - def test_contiguous_boolean_preserve_freq(self): rng = date_range('1/1/2000', '3/1/2000', freq='B') @@ -534,159 +408,6 @@ def test_contiguous_boolean_preserve_freq(self): masked = rng[mask] self.assertIsNone(masked.freq) - def test_getitem_median_slice_bug(self): - index = date_range('20090415', '20090519', freq='2B') - s = Series(np.random.randn(13), index=index) - - indexer = [slice(6, 7, None)] - result = s[indexer] - expected = s[indexer[0]] - assert_series_equal(result, expected) - - def test_ctor_str_intraday(self): - rng = DatetimeIndex(['1-1-2000 00:00:01']) - self.assertEqual(rng[0].second, 1) - - 
def test_frame_pad_backfill_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - - result = df[:2].reindex(index, method='pad', limit=5) - - expected = df[:2].reindex(index).fillna(method='pad') - expected.values[-3:] = np.nan - tm.assert_frame_equal(result, expected) - - result = df[-2:].reindex(index, method='backfill', limit=5) - - expected = df[-2:].reindex(index).fillna(method='backfill') - expected.values[:3] = np.nan - tm.assert_frame_equal(result, expected) - - def test_frame_fillna_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - - result = df[:2].reindex(index) - result = result.fillna(method='pad', limit=5) - - expected = df[:2].reindex(index).fillna(method='pad') - expected.values[-3:] = np.nan - tm.assert_frame_equal(result, expected) - - result = df[-2:].reindex(index) - result = result.fillna(method='backfill', limit=5) - - expected = df[-2:].reindex(index).fillna(method='backfill') - expected.values[:3] = np.nan - tm.assert_frame_equal(result, expected) - - def test_sparse_frame_pad_backfill_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - sdf = df.to_sparse() - - result = sdf[:2].reindex(index, method='pad', limit=5) - - expected = sdf[:2].reindex(index).fillna(method='pad') - expected = expected.to_dense() - expected.values[-3:] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - result = sdf[-2:].reindex(index, method='backfill', limit=5) - - expected = sdf[-2:].reindex(index).fillna(method='backfill') - expected = expected.to_dense() - expected.values[:3] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - def test_sparse_frame_fillna_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - sdf = df.to_sparse() - - result = sdf[:2].reindex(index) - result = result.fillna(method='pad', limit=5) - - expected = sdf[:2].reindex(index).fillna(method='pad') - expected = expected.to_dense() - expected.values[-3:] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - result = sdf[-2:].reindex(index) - result = result.fillna(method='backfill', limit=5) - - expected = sdf[-2:].reindex(index).fillna(method='backfill') - expected = expected.to_dense() - expected.values[:3] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - def test_pad_require_monotonicity(self): - rng = date_range('1/1/2000', '3/1/2000', freq='B') - - # neither monotonic increasing or decreasing - rng2 = rng[[1, 0, 2]] - - self.assertRaises(ValueError, rng2.get_indexer, rng, method='pad') - - def test_frame_ctor_datetime64_column(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') - dates = np.asarray(rng) - - df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) - self.assertTrue(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))) - - def test_frame_add_datetime64_column(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') - df = DataFrame(index=np.arange(len(rng))) - - df['A'] = rng - self.assertTrue(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) - - def test_frame_datetime64_pre1900_repr(self): - df = DataFrame({'year': date_range('1/1/1700', periods=50, - freq='A-DEC')}) - # it works! 
- repr(df) - - def test_frame_add_datetime64_col_other_units(self): - n = 100 - - units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] - - ns_dtype = np.dtype('M8[ns]') - - for unit in units: - dtype = np.dtype('M8[%s]' % unit) - vals = np.arange(n, dtype=np.int64).view(dtype) - - df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) - df[unit] = vals - - ex_vals = to_datetime(vals.astype('O')).values - - self.assertEqual(df[unit].dtype, ns_dtype) - self.assertTrue((df[unit].values == ex_vals).all()) - - # Test insertion into existing datetime64 column - df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) - df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) - - for unit in units: - dtype = np.dtype('M8[%s]' % unit) - vals = np.arange(n, dtype=np.int64).view(dtype) - - tmp = df.copy() - - tmp['dates'] = vals - ex_vals = to_datetime(vals.astype('O')).values - - self.assertTrue((tmp['dates'].values == ex_vals).all()) - def test_to_datetime_unit(self): epoch = 1370745748 @@ -756,13 +477,6 @@ def test_series_ctor_datetime64(self): series = Series(dates) self.assertTrue(np.issubdtype(series.dtype, np.dtype('M8[ns]'))) - def test_index_cast_datetime64_other_units(self): - arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') - - idx = Index(arr) - - self.assertTrue((idx.values == tslib.cast_to_nanoseconds(arr)).all()) - def test_reindex_series_add_nat(self): rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') series = Series(rng) @@ -796,159 +510,6 @@ def test_series_repr_nat(self): 'dtype: datetime64[ns]') self.assertEqual(result, expected) - def test_fillna_nat(self): - series = Series([0, 1, 2, iNaT], dtype='M8[ns]') - - filled = series.fillna(method='pad') - filled2 = series.fillna(value=series.values[2]) - - expected = series.copy() - expected.values[3] = expected.values[2] - - assert_series_equal(filled, expected) - assert_series_equal(filled2, expected) - - df = DataFrame({'A': series}) - filled = df.fillna(method='pad') - filled2 = df.fillna(value=series.values[2]) - expected = DataFrame({'A': expected}) - assert_frame_equal(filled, expected) - assert_frame_equal(filled2, expected) - - series = Series([iNaT, 0, 1, 2], dtype='M8[ns]') - - filled = series.fillna(method='bfill') - filled2 = series.fillna(value=series[1]) - - expected = series.copy() - expected[0] = expected[1] - - assert_series_equal(filled, expected) - assert_series_equal(filled2, expected) - - df = DataFrame({'A': series}) - filled = df.fillna(method='bfill') - filled2 = df.fillna(value=series[1]) - expected = DataFrame({'A': expected}) - assert_frame_equal(filled, expected) - assert_frame_equal(filled2, expected) - - def test_string_na_nat_conversion(self): - # GH #999, #858 - - from pandas.compat import parse_date - - strings = np.array(['1/1/2000', '1/2/2000', np.nan, - '1/4/2000, 12:34:56'], dtype=object) - - expected = np.empty(4, dtype='M8[ns]') - for i, val in enumerate(strings): - if com.isnull(val): - expected[i] = iNaT - else: - expected[i] = parse_date(val) - - result = tslib.array_to_datetime(strings) - assert_almost_equal(result, expected) - - result2 = to_datetime(strings) - tm.assertIsInstance(result2, DatetimeIndex) - tm.assert_numpy_array_equal(result, result2.values) - - malformed = np.array(['1/100/2000', np.nan], dtype=object) - - # GH 10636, default is now 'raise' - self.assertRaises(ValueError, - lambda: to_datetime(malformed, errors='raise')) - - result = to_datetime(malformed, errors='ignore') - tm.assert_numpy_array_equal(result, malformed) - - self.assertRaises(ValueError, 
to_datetime, malformed, errors='raise') - - idx = ['a', 'b', 'c', 'd', 'e'] - series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, - '1/5/2000'], index=idx, name='foo') - dseries = Series([to_datetime('1/1/2000'), np.nan, - to_datetime('1/3/2000'), np.nan, - to_datetime('1/5/2000')], index=idx, name='foo') - - result = to_datetime(series) - dresult = to_datetime(dseries) - - expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) - for i in range(5): - x = series[i] - if isnull(x): - expected[i] = iNaT - else: - expected[i] = to_datetime(x) - - assert_series_equal(result, expected, check_names=False) - self.assertEqual(result.name, 'foo') - - assert_series_equal(dresult, expected, check_names=False) - self.assertEqual(dresult.name, 'foo') - - def test_nat_vector_field_access(self): - idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000']) - - fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', - 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', - 'days_in_month', 'is_leap_year'] - - for field in fields: - result = getattr(idx, field) - expected = [getattr(x, field) for x in idx] - self.assert_numpy_array_equal(result, np.array(expected)) - - s = pd.Series(idx) - - for field in fields: - result = getattr(s.dt, field) - expected = [getattr(x, field) for x in idx] - self.assert_series_equal(result, pd.Series(expected)) - - def test_nat_scalar_field_access(self): - fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', - 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', - 'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name'] - for field in fields: - result = getattr(NaT, field) - self.assertTrue(np.isnan(result)) - - def test_NaT_methods(self): - # GH 9513 - raise_methods = ['astimezone', 'combine', 'ctime', 'dst', - 'fromordinal', 'fromtimestamp', 'isocalendar', - 'strftime', 'strptime', 'time', 'timestamp', - 'timetuple', 'timetz', 'toordinal', 'tzname', - 'utcfromtimestamp', 'utcnow', 'utcoffset', - 'utctimetuple'] - nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today'] - nan_methods = ['weekday', 'isoweekday'] - - for method in raise_methods: - if hasattr(NaT, method): - self.assertRaises(ValueError, getattr(NaT, method)) - - for method in nan_methods: - if hasattr(NaT, method): - self.assertTrue(np.isnan(getattr(NaT, method)())) - - for method in nat_methods: - if hasattr(NaT, method): - # see gh-8254 - exp_warning = None - if method == 'to_datetime': - exp_warning = FutureWarning - with tm.assert_produces_warning( - exp_warning, check_stacklevel=False): - self.assertIs(getattr(NaT, method)(), NaT) - - # GH 12300 - self.assertEqual(NaT.isoformat(), 'NaT') - def test_index_convert_to_datetime_array(self): tm._skip_if_no_pytz() @@ -968,56 +529,6 @@ def _check_rng(rng): _check_rng(rng_eastern) _check_rng(rng_utc) - def test_index_convert_to_datetime_array_explicit_pytz(self): - tm._skip_if_no_pytz() - import pytz - - def _check_rng(rng): - converted = rng.to_pydatetime() - tm.assertIsInstance(converted, np.ndarray) - for x, stamp in zip(converted, rng): - tm.assertIsInstance(x, datetime) - self.assertEqual(x, stamp.to_pydatetime()) - self.assertEqual(x.tzinfo, stamp.tzinfo) - - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) - rng_utc = date_range('20090415', '20090519', tz=pytz.utc) - - _check_rng(rng) - _check_rng(rng_eastern) - _check_rng(rng_utc) - - def test_index_convert_to_datetime_array_dateutil(self): - tm._skip_if_no_dateutil() - import 
dateutil - - def _check_rng(rng): - converted = rng.to_pydatetime() - tm.assertIsInstance(converted, np.ndarray) - for x, stamp in zip(converted, rng): - tm.assertIsInstance(x, datetime) - self.assertEqual(x, stamp.to_pydatetime()) - self.assertEqual(x.tzinfo, stamp.tzinfo) - - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz='dateutil/US/Eastern') - rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) - - _check_rng(rng) - _check_rng(rng_eastern) - _check_rng(rng_utc) - - def test_reasonable_keyerror(self): - # GH #1062 - index = DatetimeIndex(['1/3/2000']) - try: - index.get_loc('1/1/2000') - except KeyError as e: - self.assertIn('2000', str(e)) - def test_reindex_with_datetimes(self): rng = date_range('1/1/2000', periods=20) ts = Series(np.random.randn(20), index=rng) From f98398bb945edde6aa87cce9c5368b061694aa58 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Feb 2017 12:21:28 -0500 Subject: [PATCH 099/338] TST: more test moving from series/test_timeseries.py --- pandas/tests/frame/test_alter_axes.py | 28 +- pandas/tests/frame/test_apply.py | 9 + pandas/tests/frame/test_combine_concat.py | 16 +- pandas/tests/frame/test_constructors.py | 27 + pandas/tests/frame/test_indexing.py | 15 + pandas/tests/frame/test_timeseries.py | 121 +- pandas/tests/groupby/test_groupby.py | 8 + pandas/tests/indexes/datetimes/test_astype.py | 130 +- .../indexes/datetimes/test_construction.py | 90 +- .../indexes/datetimes/test_date_range.py | 18 + .../indexes/datetimes/test_datetimelike.py | 1 + pandas/tests/indexes/datetimes/test_misc.py | 293 +-- .../indexes/datetimes/test_partial_slcing.py | 256 ++ pandas/tests/indexes/datetimes/test_setops.py | 22 +- pandas/tests/indexes/datetimes/test_tools.py | 1019 ++++++++ pandas/tests/series/test_combine_concat.py | 102 +- pandas/tests/series/test_constructors.py | 45 +- pandas/tests/series/test_dtypes.py | 25 +- pandas/tests/series/test_indexing.py | 540 +++- pandas/tests/series/test_operators.py | 13 + pandas/tests/series/test_timeseries.py | 2173 +---------------- 21 files changed, 2485 insertions(+), 2466 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/test_partial_slcing.py create mode 100644 pandas/tests/indexes/datetimes/test_tools.py diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index edeca0a664a87..cab627dec63cb 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -8,7 +8,7 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, Index, MultiIndex, - RangeIndex) + RangeIndex, date_range) import pandas as pd from pandas.util.testing import (assert_series_equal, @@ -325,6 +325,32 @@ def test_set_columns(self): with assertRaisesRegexp(ValueError, 'Length mismatch'): self.mixed_frame.columns = cols[::2] + def test_dti_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') + idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.reindex(idx2) + tm.assert_index_equal(df.index, idx2) + + # 11314 + # with tz + index = date_range(datetime(2015, 10, 1), + datetime(2015, 10, 1, 23), + freq='H', tz='US/Eastern') + df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) + new_index = date_range(datetime(2015, 10, 2), + datetime(2015, 10, 2, 23), + freq='H', tz='US/Eastern') + + # TODO: unused? 
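+        # set_index is exercised purely as a smoke test; the assertion below
+        # only verifies that the hourly freq survives on the tz-aware index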
+ result = df.set_index(new_index) # noqa + + self.assertEqual(new_index.freq, index.freq) + # Renaming def test_rename(self): diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe04d1005e003..19fa98afd2163 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -433,6 +433,15 @@ def test_applymap_box(self): 'd': ['Period', 'Period']}) tm.assert_frame_equal(res, exp) + def test_frame_apply_dont_convert_datetime64(self): + from pandas.tseries.offsets import BDay + df = DataFrame({'x1': [datetime(1996, 1, 1)]}) + + df = df.applymap(lambda x: x + BDay()) + df = df.applymap(lambda x: x + BDay()) + + self.assertTrue(df.x1.dtype == 'M8[ns]') + # See gh-12244 def test_apply_non_numpy_dtype(self): df = DataFrame({'dt': pd.date_range( diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 71b6500e7184a..1167662b69375 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -9,7 +9,7 @@ import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp +from pandas import DataFrame, Index, Series, Timestamp, date_range from pandas.compat import lrange from pandas.tests.frame.common import TestData @@ -735,3 +735,17 @@ def test_combine_first_int(self): res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) self.assertEqual(res['a'].dtype, 'int64') + + def test_concat_datetime_datetime64_frame(self): + # #2624 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 'hi']) + + df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + + ind = date_range(start="2000/1/1", freq="D", periods=10) + df1 = DataFrame({'date': ind, 'test': lrange(10)}) + + # it works! + pd.concat([df1, df2_obj]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 07cf6816330bc..fe6a12fcca28a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1920,6 +1920,33 @@ def test_from_index(self): df2 = DataFrame(Series(idx2)) tm.assert_series_equal(df2[0], Series(idx2, name=0)) + def test_frame_dict_constructor_datetime64_1680(self): + dr = date_range('1/1/2012', periods=10) + s = Series(dr, index=dr) + + # it works! + DataFrame({'a': 'foo', 'b': s}, index=dr) + DataFrame({'a': 'foo', 'b': s.values}, index=dr) + + def test_frame_datetime64_mixed_index_ctor_1681(self): + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + ts = Series(dr) + + # it works! 
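+        # ts carries a default integer index, so aligning it against the
+        # DatetimeIndex dr leaves column B entirely null, as asserted below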
+ d = DataFrame({'A': 'foo', 'B': ts}, index=dr) + self.assertTrue(d['B'].isnull().all()) + + def test_frame_timeseries_to_records(self): + index = date_range('1/1/2000', periods=10) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['a', 'b', 'c']) + + result = df.to_records() + result['index'].dtype == 'M8[ns]' + + result = df.to_records(index=False) + + if __name__ == '__main__': import nose # noqa diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index bc0a68f765903..7d68eac47766e 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -1945,6 +1945,21 @@ def test_reindex_methods(self): actual = df.reindex(target, method='nearest', tolerance=0.2) assert_frame_equal(expected, actual) + def test_reindex_frame_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) + + result = df.reindex(lrange(15)) + self.assertTrue(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))) + + mask = com.isnull(result)['B'] + self.assertTrue(mask[-5:].all()) + self.assertFalse(mask[:-5].any()) + + def test_set_dataframe_column_ns_dtype(self): + x = DataFrame([datetime.now(), datetime.now()]) + self.assertEqual(x[0].dtype, np.dtype('M8[ns]')) + def test_non_monotonic_reindex_methods(self): dr = pd.date_range('2013-08-01', periods=6, freq='B') data = np.random.randn(6, 1) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 934aafc500611..9a9f0ee67fb89 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -2,7 +2,7 @@ from __future__ import print_function -from datetime import datetime +from datetime import datetime, time from numpy import nan from numpy.random import randn @@ -20,6 +20,7 @@ assertRaisesRegexp) import pandas.util.testing as tm +from pandas.compat import product from pandas.tests.frame.common import TestData @@ -418,6 +419,96 @@ def test_first_last_valid(self): self.assertIsNone(empty.last_valid_index()) self.assertIsNone(empty.first_valid_index()) + def test_at_time_frame(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + rs = ts.at_time(rng[1]) + self.assertTrue((rs.index.hour == rng[1].hour).all()) + self.assertTrue((rs.index.minute == rng[1].minute).all()) + self.assertTrue((rs.index.second == rng[1].second).all()) + + result = ts.at_time('9:30') + expected = ts.at_time(time(9, 30)) + assert_frame_equal(result, expected) + + result = ts.loc[time(9, 30)] + expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)] + + assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range('1/1/2000', '1/31/2000') + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts.at_time(time(0, 0)) + assert_frame_equal(result, ts) + + # time doesn't exist + rng = date_range('1/1/2012', freq='23Min', periods=384) + ts = DataFrame(np.random.randn(len(rng), 2), rng) + rs = ts.at_time('16:00') + self.assertEqual(len(rs), 0) + + def test_between_time_frame(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + 
exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue(t >= stime) + else: + self.assertTrue(t > stime) + + if inc_end: + self.assertTrue(t <= etime) + else: + self.assertTrue(t < etime) + + result = ts.between_time('00:00', '01:00') + expected = ts.between_time(stime, etime) + assert_frame_equal(result, expected) + + # across midnight + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue((t >= stime) or (t <= etime)) + else: + self.assertTrue((t > stime) or (t <= etime)) + + if inc_end: + self.assertTrue((t <= etime) or (t >= stime)) + else: + self.assertTrue((t < etime) or (t >= stime)) + def test_operation_on_NaT(self): # Both NaT and Timestamp are in DataFrame. df = pd.DataFrame({'foo': [pd.NaT, pd.NaT, @@ -457,6 +548,34 @@ def test_datetime_assignment_with_NaT_and_diff_time_units(self): 'new': [1e9, None]}, dtype='datetime64[ns]') tm.assert_frame_equal(result, expected) + def test_frame_to_period(self): + K = 5 + from pandas.tseries.period import period_range + + dr = date_range('1/1/2000', '1/1/2001') + pr = period_range('1/1/2000', '1/1/2001') + df = DataFrame(randn(len(dr), K), index=dr) + df['mix'] = 'a' + + pts = df.to_period() + exp = df.copy() + exp.index = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M') + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + + df = df.T + pts = df.to_period(axis=1) + exp = df.copy() + exp.columns = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M', axis=1) + tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) + + self.assertRaises(ValueError, df.to_period, axis=2) + if __name__ == '__main__': import nose diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ffb6025163a6b..bf61f5ef83859 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4167,6 +4167,14 @@ def test_groupby_groups_datetimeindex_tz(self): result = df.groupby(level=0).sum() assert_frame_equal(result, expected) + def test_frame_datetime64_handling_groupby(self): + # it works! 
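+        # the np.datetime64 values are coerced to datetime64[ns] on
+        # construction, so first() hands back a proper Timestamp rather
+        # than a raw NumPy scalar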
+ df = DataFrame([(3, np.datetime64('2012-07-03')), + (3, np.datetime64('2012-07-04'))], + columns=['a', 'date']) + result = df.groupby('a').first() + self.assertEqual(result['date'][3], Timestamp('2012-07-03')) + def test_groupby_multi_timezone(self): # combining multiple / different timezones yields UTC diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index d452a7e1840d7..edb044a3cb2d7 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -4,7 +4,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (DatetimeIndex, date_range, Series, NaT, Index, Timestamp, - Int64Index) + Int64Index, Period) class TestDatetimeIndex(tm.TestCase): @@ -182,3 +182,131 @@ def _check_rng(rng): _check_rng(rng) _check_rng(rng_eastern) _check_rng(rng_utc) + + +class TestToPeriod(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + data = [Timestamp('2007-01-01 10:11:12.123456Z'), + Timestamp('2007-01-01 10:11:13.789123Z')] + self.index = DatetimeIndex(data) + + def test_to_period_millisecond(self): + index = self.index + + period = index.to_period(freq='L') + self.assertEqual(2, len(period)) + self.assertEqual(period[0], Period('2007-01-01 10:11:12.123Z', 'L')) + self.assertEqual(period[1], Period('2007-01-01 10:11:13.789Z', 'L')) + + def test_to_period_microsecond(self): + index = self.index + + period = index.to_period(freq='U') + self.assertEqual(2, len(period)) + self.assertEqual(period[0], Period('2007-01-01 10:11:12.123456Z', 'U')) + self.assertEqual(period[1], Period('2007-01-01 10:11:13.789123Z', 'U')) + + def test_to_period_tz_pytz(self): + tm._skip_if_no_pytz() + from dateutil.tz import tzlocal + from pytz import utc as UTC + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz='US/Eastern') + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=UTC) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + tm.assert_index_equal(ts.to_period(), xp) + + def test_to_period_tz_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + from dateutil.tz import tzlocal + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz=pytz.timezone('US/Eastern')) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + def test_to_period_tz_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + from dateutil.tz import tzlocal + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz='dateutil/US/Eastern') + + result = ts.to_period()[0] + expected 
= ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertTrue(result == expected) + tm.assert_index_equal(ts.to_period(), xp) + + def test_astype_object(self): + # NumPy 1.6.1 weak ns support + rng = date_range('1/1/2000', periods=20) + + casted = rng.astype('O') + exp_values = list(rng) + + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + self.assertEqual(casted.tolist(), exp_values) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 03bc0e0c554b0..e54ebe3d93bc6 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -2,7 +2,7 @@ from datetime import timedelta import pandas as pd -from pandas import tslib +from pandas import tslib, offsets, lib import pandas.util.testing as tm from pandas.tslib import OutOfBoundsDatetime from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range, @@ -467,17 +467,6 @@ def test_dti_constructor_small_int(self): arr = np.array([0, 10, 20], dtype=dtype) tm.assert_index_equal(DatetimeIndex(arr), exp) - def test_dti_constructor_numpy_timeunits(self): - # GH 9114 - base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT']) - - for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', - 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: - values = base.values.astype(dtype) - - tm.assert_index_equal(DatetimeIndex(values), base) - tm.assert_index_equal(to_datetime(values), base) - def test_ctor_str_intraday(self): rng = DatetimeIndex(['1-1-2000 00:00:01']) self.assertEqual(rng[0].second, 1) @@ -507,3 +496,80 @@ def test_constructor_int64_nocopy(self): arr[50:100] = -1 self.assertTrue((index.asi8[50:100] != -1).all()) + + def test_from_freq_recreate_from_data(self): + freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', + 'C'] + + for f in freqs: + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) + idx = DatetimeIndex(org, freq=f) + tm.assert_index_equal(idx, org) + + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, + tz='US/Pacific', periods=1) + idx = DatetimeIndex(org, freq=f, tz='US/Pacific') + tm.assert_index_equal(idx, org) + + def test_datetimeindex_constructor_misc(self): + arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] + self.assertRaises(Exception, DatetimeIndex, arr) + + arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] + idx1 = DatetimeIndex(arr) + + arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] + idx2 = DatetimeIndex(arr) + + arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', + '2005-01-04'] + idx3 = DatetimeIndex(arr) + + arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', + '2005-01-04'], dtype='O') + idx4 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) + idx5 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' + ]) + idx6 = DatetimeIndex(arr) + + idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) + idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, + 
yearfirst=True) + tm.assert_index_equal(idx7, idx8) + + for other in [idx2, idx3, idx4, idx5, idx6]: + self.assertTrue((idx1.values == other.values).all()) + + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + idx = DatetimeIndex(start=sdate, freq='1B', periods=20) + self.assertEqual(len(idx), 20) + self.assertEqual(idx[0], sdate + 0 * offsets.BDay()) + self.assertEqual(idx.freq, 'B') + + idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) + self.assertEqual(len(idx), 20) + self.assertEqual(idx[-1], edate) + self.assertEqual(idx.freq, '5D') + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=offsets.Week(weekday=6)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=offsets.QuarterBegin(startingMonth=1)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=offsets.BQuarterEnd(startingMonth=12)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index b3d6c41573ab8..b2161aa5c75c6 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -3,6 +3,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import date_range, offsets, DatetimeIndex, Timestamp +from pandas import compat from pandas.tests.series.common import TestData @@ -110,3 +111,20 @@ def test_range_misspecified(self): self.assertRaises(ValueError, date_range, '1/1/2000', freq='H') self.assertRaises(ValueError, date_range, end='1/1/2000', freq='H') self.assertRaises(ValueError, date_range, periods=10, freq='H') + + def test_compat_replace(self): + # https://github.com/statsmodels/statsmodels/issues/3349 + # replace should take ints/longs for compat + + for f in [compat.long, int]: + result = date_range(Timestamp('1960-04-01 00:00:00', + freq='QS-JAN'), + periods=f(76), + freq='QS-JAN') + self.assertEqual(len(result), 76) + + def test_catch_infinite_loop(self): + offset = offsets.DateOffset(minute=5) + # blow up, don't loop forever + self.assertRaises(Exception, date_range, datetime(2011, 11, 11), + datetime(2011, 11, 12), freq=offset) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index b32801a8bcf25..eea08febc86e6 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -7,6 +7,7 @@ from ..datetimelike import DatetimeLike + class TestDatetimeIndex(DatetimeLike, tm.TestCase): _holder = DatetimeIndex _multiprocess_can_split_ = True diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 92aad5a0b1997..dda2785d2b0ae 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,13 +1,9 @@ import numpy as np import pandas as pd -import pandas.lib as lib import pandas.util.testing as tm -from pandas import (Index, DatetimeIndex, datetime, offsets, to_datetime, - Series, DataFrame, Float64Index, date_range, - Timestamp, isnull) -from pandas import tslib -from pandas.util.testing import 
assert_series_equal +from pandas import (Index, DatetimeIndex, datetime, offsets, + Float64Index, date_range, Timestamp) class TestDateTimeIndexToJulianDate(tm.TestCase): @@ -144,63 +140,6 @@ def test_datetimeindex_integers_shift(self): expected = rng.shift(-5) tm.assert_index_equal(result, expected) - def test_string_na_nat_conversion(self): - # GH #999, #858 - - from pandas.compat import parse_date - - strings = np.array(['1/1/2000', '1/2/2000', np.nan, - '1/4/2000, 12:34:56'], dtype=object) - - expected = np.empty(4, dtype='M8[ns]') - for i, val in enumerate(strings): - if isnull(val): - expected[i] = tslib.iNaT - else: - expected[i] = parse_date(val) - - result = tslib.array_to_datetime(strings) - tm.assert_almost_equal(result, expected) - - result2 = to_datetime(strings) - tm.assertIsInstance(result2, DatetimeIndex) - tm.assert_numpy_array_equal(result, result2.values) - - malformed = np.array(['1/100/2000', np.nan], dtype=object) - - # GH 10636, default is now 'raise' - self.assertRaises(ValueError, - lambda: to_datetime(malformed, errors='raise')) - - result = to_datetime(malformed, errors='ignore') - tm.assert_numpy_array_equal(result, malformed) - - self.assertRaises(ValueError, to_datetime, malformed, errors='raise') - - idx = ['a', 'b', 'c', 'd', 'e'] - series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, - '1/5/2000'], index=idx, name='foo') - dseries = Series([to_datetime('1/1/2000'), np.nan, - to_datetime('1/3/2000'), np.nan, - to_datetime('1/5/2000')], index=idx, name='foo') - - result = to_datetime(series) - dresult = to_datetime(dseries) - - expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) - for i in range(5): - x = series[i] - if isnull(x): - expected[i] = tslib.iNaT - else: - expected[i] = to_datetime(x) - - assert_series_equal(result, expected, check_names=False) - self.assertEqual(result.name, 'foo') - - assert_series_equal(dresult, expected, check_names=False) - self.assertEqual(dresult.name, 'foo') - def test_datetimeindex_repr_short(self): dr = date_range(start='1/1/2012', periods=1) repr(dr) @@ -211,84 +150,6 @@ def test_datetimeindex_repr_short(self): dr = date_range(start='1/1/2012', periods=3) repr(dr) - def test_getitem_setitem_datetimeindex(self): - N = 50 - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(N), index=rng) - - result = ts["1990-01-01 04:00:00"] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts.copy() - result["1990-01-01 04:00:00"] = 0 - result["1990-01-01 04:00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 - result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] - assert_series_equal(result, ts) - - lb = "1990-01-01 04:00:00" - rb = "1990-01-01 07:00:00" - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - # repeat all the above with naive datetimes - result = ts[datetime(1990, 1, 1, 4)] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts.copy() - result[datetime(1990, 1, 1, 4)] = 0 - result[datetime(1990, 1, 1, 4)] = ts[4] - assert_series_equal(result, ts) - - result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[datetime(1990, 1, 1, 
4):datetime(1990, 1, 1, 7)] = 0 - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] - assert_series_equal(result, ts) - - lb = datetime(1990, 1, 1, 4) - rb = datetime(1990, 1, 1, 7) - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts[ts.index[4]] - expected = ts[4] - self.assertEqual(result, expected) - - result = ts[ts.index[4:8]] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] - assert_series_equal(result, ts) - - # also test partial date slicing - result = ts["1990-01-02"] - expected = ts[24:48] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-02"] = 0 - result["1990-01-02"] = ts[24:48] - assert_series_equal(result, ts) - def test_normalize(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D') @@ -308,13 +169,6 @@ def test_normalize(self): self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) - def test_series_ctor_plus_datetimeindex(self): - rng = date_range('20090415', '20090519', freq='B') - data = dict((k, 1) for k in rng) - - result = Series(data, index=rng) - self.assertIs(result.index, rng) - class TestDatetime64(tm.TestCase): @@ -451,151 +305,8 @@ def test_datetimeindex_accessors(self): for ts, value in tests: self.assertEqual(ts, value) - def test_datetimeindex_diff(self): - dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=100) - dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=98) - self.assertEqual(len(dti1.difference(dti2)), 2) - def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) self.assert_numpy_array_equal(dti.nanosecond, np.arange(10, dtype=np.int32)) - - def test_datetimeindex_constructor(self): - arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] - self.assertRaises(Exception, DatetimeIndex, arr) - - arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] - idx1 = DatetimeIndex(arr) - - arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] - idx2 = DatetimeIndex(arr) - - arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', - '2005-01-04'] - idx3 = DatetimeIndex(arr) - - arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', - '2005-01-04'], dtype='O') - idx4 = DatetimeIndex(arr) - - arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) - idx5 = DatetimeIndex(arr) - - arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' - ]) - idx6 = DatetimeIndex(arr) - - idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) - idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, - yearfirst=True) - tm.assert_index_equal(idx7, idx8) - - for other in [idx2, idx3, idx4, idx5, idx6]: - self.assertTrue((idx1.values == other.values).all()) - - sdate = datetime(1999, 12, 25) - edate = datetime(2000, 1, 1) - idx = DatetimeIndex(start=sdate, freq='1B', periods=20) - self.assertEqual(len(idx), 20) - self.assertEqual(idx[0], sdate + 0 * offsets.BDay()) - self.assertEqual(idx.freq, 'B') - - idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) - self.assertEqual(len(idx), 20) - self.assertEqual(idx[-1], edate) - self.assertEqual(idx.freq, '5D') - - idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') - idx2 = DatetimeIndex(start=sdate, end=edate, - freq=offsets.Week(weekday=6)) - self.assertEqual(len(idx1), len(idx2)) - self.assertEqual(idx1.offset, idx2.offset) - - idx1 = 
DatetimeIndex(start=sdate, end=edate, freq='QS') - idx2 = DatetimeIndex(start=sdate, end=edate, - freq=offsets.QuarterBegin(startingMonth=1)) - self.assertEqual(len(idx1), len(idx2)) - self.assertEqual(idx1.offset, idx2.offset) - - idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') - idx2 = DatetimeIndex(start=sdate, end=edate, - freq=offsets.BQuarterEnd(startingMonth=12)) - self.assertEqual(len(idx1), len(idx2)) - self.assertEqual(idx1.offset, idx2.offset) - - def test_dayfirst(self): - # GH 5917 - arr = ['10/02/2014', '11/02/2014', '12/02/2014'] - expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), - datetime(2014, 2, 12)]) - idx1 = DatetimeIndex(arr, dayfirst=True) - idx2 = DatetimeIndex(np.array(arr), dayfirst=True) - idx3 = to_datetime(arr, dayfirst=True) - idx4 = to_datetime(np.array(arr), dayfirst=True) - idx5 = DatetimeIndex(Index(arr), dayfirst=True) - idx6 = DatetimeIndex(Series(arr), dayfirst=True) - tm.assert_index_equal(expected, idx1) - tm.assert_index_equal(expected, idx2) - tm.assert_index_equal(expected, idx3) - tm.assert_index_equal(expected, idx4) - tm.assert_index_equal(expected, idx5) - tm.assert_index_equal(expected, idx6) - - def test_dti_set_index_reindex(self): - # GH 6631 - df = DataFrame(np.random.random(6)) - idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') - idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') - - df = df.set_index(idx1) - tm.assert_index_equal(df.index, idx1) - df = df.reindex(idx2) - tm.assert_index_equal(df.index, idx2) - - # 11314 - # with tz - index = date_range(datetime(2015, 10, 1), - datetime(2015, 10, 1, 23), - freq='H', tz='US/Eastern') - df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) - new_index = date_range(datetime(2015, 10, 2), - datetime(2015, 10, 2, 23), - freq='H', tz='US/Eastern') - - # TODO: unused? 
- result = df.set_index(new_index) # noqa - - self.assertEqual(new_index.freq, index.freq) - - def test_datetimeindex_union_join_empty(self): - dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') - empty = Index([]) - - result = dti.union(empty) - tm.assertIsInstance(result, DatetimeIndex) - self.assertIs(result, result) - - result = dti.join(empty) - tm.assertIsInstance(result, DatetimeIndex) - - -class TestTimeSeriesDuplicates(tm.TestCase): - _multiprocess_can_split_ = True - - def test_recreate_from_data(self): - freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', - 'C'] - - for f in freqs: - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) - idx = DatetimeIndex(org, freq=f) - tm.assert_index_equal(idx, org) - - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, - tz='US/Pacific', periods=1) - idx = DatetimeIndex(org, freq=f, tz='US/Pacific') - tm.assert_index_equal(idx, org) diff --git a/pandas/tests/indexes/datetimes/test_partial_slcing.py b/pandas/tests/indexes/datetimes/test_partial_slcing.py new file mode 100644 index 0000000000000..a960f5cf9235a --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_partial_slcing.py @@ -0,0 +1,256 @@ +""" test partial slicing on Series/Frame """ +from datetime import datetime +import numpy as np +import pandas as pd + +from pandas import (DatetimeIndex, Series, DataFrame, + date_range, Index, Timedelta, Timestamp) +from pandas.util import testing as tm + + +class TestSlicing(tm.TestCase): + + def test_slice_year(self): + dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + result = s['2005'] + expected = s[s.index.year == 2005] + tm.assert_series_equal(result, expected) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + result = df.loc['2005'] + expected = df[df.index.year == 2005] + tm.assert_frame_equal(result, expected) + + rng = date_range('1/1/2000', '1/1/2010') + + result = rng.get_loc('2009') + expected = slice(3288, 3653) + self.assertEqual(result, expected) + + def test_slice_quarter(self): + dti = DatetimeIndex(freq='D', start=datetime(2000, 6, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + self.assertEqual(len(s['2001Q1']), 90) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEqual(len(df.loc['1Q01']), 90) + + def test_slice_month(self): + dti = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(dti)), index=dti) + self.assertEqual(len(s['2005-11']), 30) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEqual(len(df.loc['2005-11']), 30) + + tm.assert_series_equal(s['2005-11'], s['11-2005']) + + def test_partial_slice(self): + rng = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-05':'2006-02'] + expected = s['20050501':'20060228'] + tm.assert_series_equal(result, expected) + + result = s['2005-05':] + expected = s['20050501':] + tm.assert_series_equal(result, expected) + + result = s[:'2006-02'] + expected = s[:'20060228'] + tm.assert_series_equal(result, expected) + + result = s['2005-1-1'] + self.assertEqual(result, s.iloc[0]) + + self.assertRaises(Exception, s.__getitem__, '2004-12-31') + + def test_partial_slice_daily(self): + rng = DatetimeIndex(freq='H', start=datetime(2005, 1, 31), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-31'] + tm.assert_series_equal(result, s.iloc[:24]) + + 
self.assertRaises(Exception, s.__getitem__, '2004-12-31 00') + + def test_partial_slice_hourly(self): + rng = DatetimeIndex(freq='T', start=datetime(2005, 1, 1, 20, 0, 0), + periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-1'] + tm.assert_series_equal(result, s.iloc[:60 * 4]) + + result = s['2005-1-1 20'] + tm.assert_series_equal(result, s.iloc[:60]) + + self.assertEqual(s['2005-1-1 20:00'], s.iloc[0]) + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:15') + + def test_partial_slice_minutely(self): + rng = DatetimeIndex(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), + periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-1 23:59'] + tm.assert_series_equal(result, s.iloc[:60]) + + result = s['2005-1-1'] + tm.assert_series_equal(result, s.iloc[:60]) + + self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.iloc[0]) + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') + + def test_partial_slice_second_precision(self): + rng = DatetimeIndex(start=datetime(2005, 1, 1, 0, 0, 59, + microsecond=999990), + periods=20, freq='US') + s = Series(np.arange(20), rng) + + tm.assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10]) + tm.assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) + + tm.assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) + tm.assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) + + self.assertEqual(s[Timestamp('2005-1-1 00:00:59.999990')], s.iloc[0]) + self.assertRaisesRegexp(KeyError, '2005-1-1 00:00:00', + lambda: s['2005-1-1 00:00:00']) + + def test_partial_slicing_dataframe(self): + # GH14856 + # Test various combinations of string slicing resolution vs. + # index resolution + # - If string resolution is less precise than index resolution, + # string is considered a slice + # - If string resolution is equal to or more precise than index + # resolution, string is considered an exact match + formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] + resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] + for rnum, resolution in enumerate(resolutions[2:], 2): + # we check only 'day', 'hour', 'minute' and 'second' + unit = Timedelta("1 " + resolution) + middate = datetime(2012, 1, 1, 0, 0, 0) + index = DatetimeIndex([middate - unit, + middate, middate + unit]) + values = [1, 2, 3] + df = DataFrame({'a': values}, index, dtype=np.int64) + self.assertEqual(df.index.resolution, resolution) + + # Timestamp with the same resolution as index + # Should be exact match for Series (return scalar) + # and raise KeyError for Frame + for timestamp, expected in zip(index, values): + ts_string = timestamp.strftime(formats[rnum]) + # make ts_string as precise as index + result = df['a'][ts_string] + self.assertIsInstance(result, np.int64) + self.assertEqual(result, expected) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + # Timestamp with resolution less precise than index + for fmt in formats[:rnum]: + for element, theslice in [[0, slice(None, 1)], + [1, slice(1, None)]]: + ts_string = index[element].strftime(fmt) + + # Series should return slice + result = df['a'][ts_string] + expected = df['a'][theslice] + tm.assert_series_equal(result, expected) + + # Frame should return slice as well + result = df[ts_string] + expected = df[theslice] + tm.assert_frame_equal(result, expected) + + # Timestamp with resolution more precise than index + # Compatible with existing key + # Should return scalar for Series + # and raise KeyError for Frame + for 
fmt in formats[rnum + 1:]: + ts_string = index[1].strftime(fmt) + result = df['a'][ts_string] + self.assertIsInstance(result, np.int64) + self.assertEqual(result, 2) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + # Not compatible with existing key + # Should raise KeyError + for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: + ts = index[1] + Timedelta("1 " + res) + ts_string = ts.strftime(fmt) + self.assertRaises(KeyError, df['a'].__getitem__, ts_string) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + def test_partial_slicing_with_multiindex(self): + + # GH 4758 + # partial string indexing with a multi-index buggy + df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"], + 'val': [1, 2, 3, 4]}, + index=date_range("2013-06-19 09:30:00", + periods=4, freq='5T')) + df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) + + expected = DataFrame([ + [1] + ], index=Index(['ABC'], name='TICKER'), columns=['val']) + result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] + tm.assert_frame_equal(result, expected) + + expected = df_multi.loc[ + (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] + result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] + tm.assert_series_equal(result, expected) + + # this is a KeyError as we don't do partial string selection on + # multi-levels + def f(): + df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] + + self.assertRaises(KeyError, f) + + # GH 4294 + # partial slice on a series mi + s = pd.DataFrame(np.random.rand(1000, 1000), index=pd.date_range( + '2000-1-1', periods=1000)).stack() + + s2 = s[:-1].copy() + expected = s2['2000-1-4'] + result = s2[pd.Timestamp('2000-1-4')] + tm.assert_series_equal(result, expected) + + result = s[pd.Timestamp('2000-1-4')] + expected = s['2000-1-4'] + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame(s) + expected = df2.xs('2000-1-4') + result = df2.loc[pd.Timestamp('2000-1-4')] + tm.assert_frame_equal(result, expected) + + def test_partial_slice_doesnt_require_monotonicity(self): + # For historical reasons. 
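+        # string slices are resolved as a date filter even on a
+        # non-monotonic index (empty here), while an explicit Timestamp
+        # bound must match an existing label and raises KeyError otherwise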
+ s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10)) + + nonmonotonic = s[[3, 5, 4]] + expected = nonmonotonic.iloc[:0] + timestamp = pd.Timestamp('2014-01-10') + + tm.assert_series_equal(nonmonotonic['2014-01-10':], expected) + self.assertRaisesRegexp(KeyError, + r"Timestamp\('2014-01-10 00:00:00'\)", + lambda: nonmonotonic[timestamp:]) + + tm.assert_series_equal(nonmonotonic.loc['2014-01-10':], expected) + self.assertRaisesRegexp(KeyError, + r"Timestamp\('2014-01-10 00:00:00'\)", + lambda: nonmonotonic.loc[timestamp:]) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index ba6beb03c7f24..229ae803aa2ff 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -1,9 +1,11 @@ +from datetime import datetime + import numpy as np import pandas as pd import pandas.util.testing as tm from pandas import (DatetimeIndex, date_range, Series, bdate_range, DataFrame, - Int64Index) + Int64Index, Index) class TestDatetimeIndex(tm.TestCase): @@ -166,3 +168,21 @@ def test_difference_freq(self): expected = DatetimeIndex(["20160920", "20160921"], freq=None) tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal('freq', idx_diff, expected) + + def test_datetimeindex_diff(self): + dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), + periods=100) + dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), + periods=98) + self.assertEqual(len(dti1.difference(dti2)), 2) + + def test_datetimeindex_union_join_empty(self): + dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') + empty = Index([]) + + result = dti.union(empty) + tm.assertIsInstance(result, DatetimeIndex) + self.assertIs(result, result) + + result = dti.join(empty) + tm.assertIsInstance(result, DatetimeIndex) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py new file mode 100644 index 0000000000000..42d135f634298 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -0,0 +1,1019 @@ +""" test to_datetime """ + +import nose + +import sys +import calendar +import locale +from datetime import datetime + +import numpy as np +from pandas.types.common import is_datetime64_ns_dtype +from pandas import (isnull, to_datetime, Timestamp, Series, DataFrame, + Index, DatetimeIndex, NaT, date_range, bdate_range) +from pandas import tslib +from pandas.compat import lmap +import pandas as pd +from pandas.tseries import tools +from pandas.util import testing as tm +from pandas.util.testing import assert_series_equal + + +class TimeConversionFormats(tm.TestCase): + + def test_to_datetime_format(self): + values = ['1/1/2000', '1/2/2000', '1/3/2000'] + + results1 = [Timestamp('20000101'), Timestamp('20000201'), + Timestamp('20000301')] + results2 = [Timestamp('20000101'), Timestamp('20000102'), + Timestamp('20000103')] + for vals, expecteds in [(values, (Index(results1), Index(results2))), + (Series(values), + (Series(results1), Series(results2))), + (values[0], (results1[0], results2[0])), + (values[1], (results1[1], results2[1])), + (values[2], (results1[2], results2[2]))]: + + for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): + result = to_datetime(vals, format=fmt) + expected = expecteds[i] + + if isinstance(expected, Series): + assert_series_equal(result, Series(expected)) + elif isinstance(expected, Timestamp): + self.assertEqual(result, expected) + else: + tm.assert_index_equal(result, expected) + + def 
test_to_datetime_format_YYYYMMDD(self): + s = Series([19801222, 19801222] + [19810105] * 5) + expected = Series([Timestamp(x) for x in s.apply(str)]) + + result = to_datetime(s, format='%Y%m%d') + assert_series_equal(result, expected) + + result = to_datetime(s.apply(str), format='%Y%m%d') + assert_series_equal(result, expected) + + # with NaT + expected = Series([Timestamp("19801222"), Timestamp("19801222")] + + [Timestamp("19810105")] * 5) + expected[2] = np.nan + s[2] = np.nan + + result = to_datetime(s, format='%Y%m%d') + assert_series_equal(result, expected) + + # string with NaT + s = s.apply(str) + s[2] = 'nat' + result = to_datetime(s, format='%Y%m%d') + assert_series_equal(result, expected) + + # coercion + # GH 7930 + s = Series([20121231, 20141231, 99991231]) + result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') + expected = Series([datetime(2012, 12, 31), + datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object) + self.assert_series_equal(result, expected) + + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce') + expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') + assert_series_equal(result, expected) + + # GH 10178 + def test_to_datetime_format_integer(self): + s = Series([2000, 2001, 2002]) + expected = Series([Timestamp(x) for x in s.apply(str)]) + + result = to_datetime(s, format='%Y') + assert_series_equal(result, expected) + + s = Series([200001, 200105, 200206]) + expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) + ]) + + result = to_datetime(s, format='%Y%m') + assert_series_equal(result, expected) + + def test_to_datetime_format_microsecond(self): + + # these are locale dependent + lang, _ = locale.getlocale() + month_abbr = calendar.month_abbr[4] + val = '01-{}-2011 00:00:01.978'.format(month_abbr) + + format = '%d-%b-%Y %H:%M:%S.%f' + result = to_datetime(val, format=format) + exp = datetime.strptime(val, format) + self.assertEqual(result, exp) + + def test_to_datetime_format_time(self): + data = [ + ['01/10/2010 15:20', '%m/%d/%Y %H:%M', + Timestamp('2010-01-10 15:20')], + ['01/10/2010 05:43', '%m/%d/%Y %I:%M', + Timestamp('2010-01-10 05:43')], + ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S', + Timestamp('2010-01-10 13:56:01')] # , + # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', + # Timestamp('2010-01-10 20:14')], + # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', + # Timestamp('2010-01-10 07:40')], + # ['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p', + # Timestamp('2010-01-10 09:12:56')] + ] + for s, format, dt in data: + self.assertEqual(to_datetime(s, format=format), dt) + + def test_to_datetime_with_non_exact(self): + # GH 10834 + tm._skip_if_has_locale() + + # 8904 + # exact kw + if sys.version_info < (2, 7): + raise nose.SkipTest('on python version < 2.7') + + s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', + '19MAY11 00:00:00Z']) + result = to_datetime(s, format='%d%b%y', exact=False) + expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), + format='%d%b%y') + assert_series_equal(result, expected) + + def test_parse_nanoseconds_with_formula(self): + + # GH8989 + # truncating the nanoseconds when a format was provided + for v in ["2012-01-01 09:00:00.000000001", + "2012-01-01 09:00:00.000001", + "2012-01-01 09:00:00.001", + "2012-01-01 09:00:00.001000", + "2012-01-01 09:00:00.001000000", ]: + expected = pd.to_datetime(v) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f") + self.assertEqual(result, expected) + + def test_to_datetime_format_weeks(self): + data = [
+ ['2009324', '%Y%W%w', Timestamp('2009-08-13')], + ['2013020', '%Y%U%w', Timestamp('2013-01-13')] + ] + for s, format, dt in data: + self.assertEqual(to_datetime(s, format=format), dt) + + +class TestToDatetime(tm.TestCase): + _multiprocess_can_split_ = True + + def test_to_datetime_dt64s(self): + in_bound_dts = [ + np.datetime64('2000-01-01'), + np.datetime64('2000-01-02'), + ] + + for dt in in_bound_dts: + self.assertEqual(pd.to_datetime(dt), Timestamp(dt)) + + oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ] + + for dt in oob_dts: + self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise') + self.assertRaises(ValueError, Timestamp, dt) + self.assertIs(pd.to_datetime(dt, errors='coerce'), NaT) + + def test_to_datetime_array_of_dt64s(self): + dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] + + # Assuming all datetimes are in bounds, to_datetime() returns + # an array that is equal to Timestamp() parsing + self.assert_numpy_array_equal( + pd.to_datetime(dts, box=False), + np.array([Timestamp(x).asm8 for x in dts]) + ) + + # A list of datetimes where the last one is out of bounds + dts_with_oob = dts + [np.datetime64('9999-01-01')] + + self.assertRaises(ValueError, pd.to_datetime, dts_with_oob, + errors='raise') + + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, errors='coerce'), + np.array( + [ + Timestamp(dts_with_oob[0]).asm8, + Timestamp(dts_with_oob[1]).asm8, + tslib.iNaT, + ], + dtype='M8' + ) + ) + + # With errors='ignore', out of bounds datetime64s + # are converted to their .item(), which depending on the version of + # numpy is either a python datetime.datetime or datetime.date + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, errors='ignore'), + np.array( + [dt.item() for dt in dts_with_oob], + dtype='O' + ) + ) + + def test_to_datetime_tz(self): + + # xref 8260 + # uniform returns a DatetimeIndex + arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), + pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')] + result = pd.to_datetime(arr) + expected = DatetimeIndex( + ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific') + tm.assert_index_equal(result, expected) + + # mixed tzs will raise + arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'), + pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')] + self.assertRaises(ValueError, lambda: pd.to_datetime(arr)) + + def test_to_datetime_tz_pytz(self): + + # xref 8260 + tm._skip_if_no_pytz() + import pytz + + us_eastern = pytz.timezone('US/Eastern') + arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1, + hour=3, minute=0)), + us_eastern.localize(datetime(year=2000, month=6, day=1, + hour=3, minute=0))], + dtype=object) + result = pd.to_datetime(arr, utc=True) + expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', + '2000-06-01 07:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + tm.assert_index_equal(result, expected) + + def test_to_datetime_utc_is_true(self): + # See gh-11934 + start = pd.Timestamp('2014-01-01', tz='utc') + end = pd.Timestamp('2014-01-03', tz='utc') + date_range = pd.bdate_range(start, end) + + result = pd.to_datetime(date_range, utc=True) + expected = pd.DatetimeIndex(data=date_range) + tm.assert_index_equal(result, expected) + + def test_to_datetime_tz_psycopg2(self): + + # xref 8260 + try: + import psycopg2 + except ImportError: + raise nose.SkipTest("no psycopg2 installed") + + # misc cases + tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, 
name=None) + tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None) + arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1), + datetime(2000, 6, 1, 3, 0, tzinfo=tz2)], + dtype=object) + + result = pd.to_datetime(arr, errors='coerce', utc=True) + expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', + '2000-06-01 07:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + tm.assert_index_equal(result, expected) + + # dtype coercion + i = pd.DatetimeIndex([ + '2000-01-01 08:00:00+00:00' + ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) + self.assertTrue(is_datetime64_ns_dtype(i)) + + # tz coercion + result = pd.to_datetime(i, errors='coerce') + tm.assert_index_equal(result, i) + + result = pd.to_datetime(i, errors='coerce', utc=True) + expected = pd.DatetimeIndex(['2000-01-01 13:00:00'], + dtype='datetime64[ns, UTC]') + tm.assert_index_equal(result, expected) + + def test_datetime_bool(self): + # GH13176 + with self.assertRaises(TypeError): + to_datetime(False) + self.assertTrue(to_datetime(False, errors="coerce") is NaT) + self.assertEqual(to_datetime(False, errors="ignore"), False) + with self.assertRaises(TypeError): + to_datetime(True) + self.assertTrue(to_datetime(True, errors="coerce") is NaT) + self.assertEqual(to_datetime(True, errors="ignore"), True) + with self.assertRaises(TypeError): + to_datetime([False, datetime.today()]) + with self.assertRaises(TypeError): + to_datetime(['20130101', True]) + tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], + errors="coerce"), + DatetimeIndex([to_datetime(0), NaT, + NaT, to_datetime(0)])) + + def test_datetime_invalid_datatype(self): + # GH13176 + + with self.assertRaises(TypeError): + pd.to_datetime(bool) + with self.assertRaises(TypeError): + pd.to_datetime(pd.to_datetime) + + +class ToDatetimeUnit(tm.TestCase): + + def test_unit(self): + # GH 11758 + # test proper behavior with errors + + with self.assertRaises(ValueError): + to_datetime([1], unit='D', format='%Y%m%d') + + values = [11111111, 1, 1.0, tslib.iNaT, NaT, np.nan, + 'NaT', ''] + result = to_datetime(values, unit='D', errors='ignore') + expected = Index([11111111, Timestamp('1970-01-02'), + Timestamp('1970-01-02'), NaT, + NaT, NaT, NaT, NaT], + dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, unit='D', errors='coerce') + expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', + 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(tslib.OutOfBoundsDatetime): + to_datetime(values, unit='D', errors='raise') + + values = [1420043460000, tslib.iNaT, NaT, np.nan, 'NaT'] + + result = to_datetime(values, errors='ignore', unit='s') + expected = Index([1420043460000, NaT, NaT, + NaT, NaT], dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, errors='coerce', unit='s') + expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(tslib.OutOfBoundsDatetime): + to_datetime(values, errors='raise', unit='s') + + # if we have a string, then we raise a ValueError + # and NOT an OutOfBoundsDatetime + for val in ['foo', Timestamp('20130101')]: + try: + to_datetime(val, errors='raise', unit='s') + except tslib.OutOfBoundsDatetime: + raise AssertionError("incorrect exception raised") + except ValueError: + pass + + def test_unit_consistency(self): + + # consistency of conversions + expected = Timestamp('1970-05-09 14:25:11') + result = pd.to_datetime(11111111,
unit='s', errors='raise') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='coerce') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='ignore') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + def test_unit_with_numeric(self): + + # GH 13180 + # coercions from floats/ints are ok + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr1 = [1.434692e+18, 1.432766e+18] + arr2 = np.array(arr1).astype('int64') + for errors in ['ignore', 'raise', 'coerce']: + result = pd.to_datetime(arr1, errors=errors) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(arr2, errors=errors) + tm.assert_index_equal(result, expected) + + # but we want to make sure that we are coercing + # if we have ints/strings + expected = DatetimeIndex(['NaT', + '2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr = ['foo', 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20', + 'NaT', + 'NaT']) + arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + def test_unit_mixed(self): + + # mixed integers/datetimes + expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) + arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + expected = DatetimeIndex(['NaT', + 'NaT', + '2013-01-01']) + arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + def test_dataframe(self): + + df = DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5], + 'hour': [6, 7], + 'minute': [58, 59], + 'second': [10, 11], + 'ms': [1, 1], + 'us': [2, 2], + 'ns': [3, 3]}) + + result = to_datetime({'year': df['year'], + 'month': df['month'], + 'day': df['day']}) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:0:00')]) + assert_series_equal(result, expected) + + # dict-like + result = to_datetime(df[['year', 'month', 'day']].to_dict()) + assert_series_equal(result, expected) + + # dict but with constructable + df2 = df[['year', 'month', 'day']].to_dict() + df2['month'] = 2 + result = to_datetime(df2) + expected2 = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160205 00:0:00')]) + assert_series_equal(result, expected2) + + # unit mappings + units = [{'year': 'years', + 'month': 'months', + 'day': 'days', + 'hour': 'hours', + 'minute': 'minutes', + 'second': 'seconds'}, + {'year': 'year', + 'month': 'month', + 'day': 'day', + 'hour': 'hour', + 'minute': 'minute', + 'second': 'second'}, + ] + + for d in units: + result = to_datetime(df[list(d.keys())].rename(columns=d)) + expected = Series([Timestamp('20150204 06:58:10'), + Timestamp('20160305 07:59:11')]) + assert_series_equal(result, expected) + + d = {'year': 'year', + 'month': 'month', + 'day': 'day', + 'hour': 'hour', + 'minute': 'minute', + 'second': 'second', + 'ms': 'ms', + 'us': 'us', + 'ns': 'ns'} + + result = 
to_datetime(df.rename(columns=d)) + expected = Series([Timestamp('20150204 06:58:10.001002003'), + Timestamp('20160305 07:59:11.001002003')]) + assert_series_equal(result, expected) + + # the same components passed as strings should parse identically + result = to_datetime(df.astype(str)) + assert_series_equal(result, expected) + + # passing coerce + df2 = DataFrame({'year': [2015, 2016], + 'month': [2, 20], + 'day': [4, 5]}) + with self.assertRaises(ValueError): + to_datetime(df2) + result = to_datetime(df2, errors='coerce') + expected = Series([Timestamp('20150204 00:00:00'), + NaT]) + assert_series_equal(result, expected) + + # extra columns + with self.assertRaises(ValueError): + df2 = df.copy() + df2['foo'] = 1 + to_datetime(df2) + + # not enough + for c in [['year'], + ['year', 'month'], + ['year', 'month', 'second'], + ['month', 'day'], + ['year', 'day', 'second']]: + with self.assertRaises(ValueError): + to_datetime(df[c]) + + # duplicates + df2 = DataFrame({'year': [2015, 2016], + 'month': [2, 20], + 'day': [4, 5]}) + df2.columns = ['year', 'year', 'day'] + with self.assertRaises(ValueError): + to_datetime(df2) + + df2 = DataFrame({'year': [2015, 2016], + 'month': [2, 20], + 'day': [4, 5], + 'hour': [4, 5]}) + df2.columns = ['year', 'month', 'day', 'day'] + with self.assertRaises(ValueError): + to_datetime(df2) + + def test_dataframe_dtypes(self): + # #13451 + df = DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]}) + + # int16 + result = to_datetime(df.astype('int16')) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # mixed dtypes + df['month'] = df['month'].astype('int8') + df['day'] = df['day'].astype('int8') + result = to_datetime(df) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # float + df = DataFrame({'year': [2000, 2001], + 'month': [1.5, 1], + 'day': [1, 1]}) + with self.assertRaises(ValueError): + to_datetime(df) + + +class ToDatetimeMisc(tm.TestCase): + + def test_index_to_datetime(self): + idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = idx.to_datetime() + expected = DatetimeIndex(pd.to_datetime(idx.values)) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + today = datetime.today() + idx = Index([today], dtype=object) + result = idx.to_datetime() + expected = DatetimeIndex([today]) + tm.assert_index_equal(result, expected) + + def test_to_datetime_iso8601(self): + result = to_datetime(["2012-01-01 00:00:00"]) + exp = Timestamp("2012-01-01 00:00:00") + self.assertEqual(result[0], exp) + + result = to_datetime(['20121001']) # bad iso 8601 + exp = Timestamp('2012-10-01') + self.assertEqual(result[0], exp) + + def test_to_datetime_default(self): + rs = to_datetime('2001') + xp = datetime(2001, 1, 1) + self.assertEqual(rs, xp) + + # dayfirst is essentially broken + + # to_datetime('01-13-2012', dayfirst=True) + # self.assertRaises(ValueError, to_datetime('01-13-2012', + # dayfirst=True)) + + def test_to_datetime_on_datetime64_series(self): + # #2699 + s = Series(date_range('1/1/2000', periods=10)) + + result = to_datetime(s) + self.assertEqual(result[0], s[0]) + + def test_to_datetime_with_space_in_series(self): + # GH 6428 + s = Series(['10/18/2006', '10/18/2008', ' ']) + tm.assertRaises(ValueError, lambda: to_datetime(s, errors='raise')) + result_coerce = to_datetime(s,
errors='coerce') + expected_coerce = Series([datetime(2006, 10, 18), + datetime(2008, 10, 18), + NaT]) + tm.assert_series_equal(result_coerce, expected_coerce) + result_ignore = to_datetime(s, errors='ignore') + tm.assert_series_equal(result_ignore, s) + + def test_to_datetime_with_apply(self): + # this is only locale tested with US/None locales + tm._skip_if_has_locale() + + # GH 5195 + # with a format and coerce a single item to_datetime fails + td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) + expected = pd.to_datetime(td, format='%b %y') + result = td.apply(pd.to_datetime, format='%b %y') + assert_series_equal(result, expected) + + td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3]) + self.assertRaises(ValueError, + lambda: pd.to_datetime(td, format='%b %y', + errors='raise')) + self.assertRaises(ValueError, + lambda: td.apply(pd.to_datetime, format='%b %y', + errors='raise')) + expected = pd.to_datetime(td, format='%b %y', errors='coerce') + + result = td.apply( + lambda x: pd.to_datetime(x, format='%b %y', errors='coerce')) + assert_series_equal(result, expected) + + def test_to_datetime_types(self): + + # empty string + result = to_datetime('') + self.assertIs(result, NaT) + + result = to_datetime(['', '']) + self.assertTrue(isnull(result).all()) + + # ints + result = Timestamp(0) + expected = to_datetime(0) + self.assertEqual(result, expected) + + # GH 3888 (strings) + expected = to_datetime(['2012'])[0] + result = to_datetime('2012') + self.assertEqual(result, expected) + + # array = ['2012','20120101','20120101 12:01:01'] + array = ['20120101', '20120101 12:01:01'] + expected = list(to_datetime(array)) + result = lmap(Timestamp, array) + tm.assert_almost_equal(result, expected) + + # currently fails ### + # result = Timestamp('2012') + # expected = to_datetime('2012') + # self.assertEqual(result, expected) + + def test_to_datetime_unprocessable_input(self): + # GH 4928 + self.assert_numpy_array_equal( + to_datetime([1, '1'], errors='ignore'), + np.array([1, '1'], dtype='O') + ) + self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise') + + def test_to_datetime_other_datetime64_units(self): + # 5/25/2012 + scalar = np.int64(1337904000000000).view('M8[us]') + as_obj = scalar.astype('O') + + index = DatetimeIndex([scalar]) + self.assertEqual(index[0], scalar.astype('O')) + + value = Timestamp(scalar) + self.assertEqual(value, as_obj) + + def test_to_datetime_list_of_integers(self): + rng = date_range('1/1/2000', periods=20) + rng = DatetimeIndex(rng.values) + + ints = list(rng.asi8) + + result = DatetimeIndex(ints) + + tm.assert_index_equal(rng, result) + + def test_to_datetime_freq(self): + xp = bdate_range('2000-1-1', periods=10, tz='UTC') + rs = xp.to_datetime() + self.assertEqual(xp.freq, rs.freq) + self.assertEqual(xp.tzinfo, rs.tzinfo) + + def test_string_na_nat_conversion(self): + # GH #999, #858 + + from pandas.compat import parse_date + + strings = np.array(['1/1/2000', '1/2/2000', np.nan, + '1/4/2000, 12:34:56'], dtype=object) + + expected = np.empty(4, dtype='M8[ns]') + for i, val in enumerate(strings): + if isnull(val): + expected[i] = tslib.iNaT + else: + expected[i] = parse_date(val) + + result = tslib.array_to_datetime(strings) + tm.assert_almost_equal(result, expected) + + result2 = to_datetime(strings) + tm.assertIsInstance(result2, DatetimeIndex) + tm.assert_numpy_array_equal(result, result2.values) + + malformed = np.array(['1/100/2000', np.nan], dtype=object) + + # GH 10636, default is now 'raise' + self.assertRaises(ValueError, + 
lambda: to_datetime(malformed, errors='raise')) + + result = to_datetime(malformed, errors='ignore') + tm.assert_numpy_array_equal(result, malformed) + + self.assertRaises(ValueError, to_datetime, malformed, errors='raise') + + idx = ['a', 'b', 'c', 'd', 'e'] + series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, + '1/5/2000'], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000'), np.nan, + to_datetime('1/3/2000'), np.nan, + to_datetime('1/5/2000')], index=idx, name='foo') + + result = to_datetime(series) + dresult = to_datetime(dseries) + + expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) + for i in range(5): + x = series[i] + if isnull(x): + expected[i] = tslib.iNaT + else: + expected[i] = to_datetime(x) + + assert_series_equal(result, expected, check_names=False) + self.assertEqual(result.name, 'foo') + + assert_series_equal(dresult, expected, check_names=False) + self.assertEqual(dresult.name, 'foo') + + def test_dti_constructor_numpy_timeunits(self): + # GH 9114 + base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT']) + + for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', + 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: + values = base.values.astype(dtype) + + tm.assert_index_equal(DatetimeIndex(values), base) + tm.assert_index_equal(to_datetime(values), base) + + def test_dayfirst(self): + # GH 5917 + arr = ['10/02/2014', '11/02/2014', '12/02/2014'] + expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), + datetime(2014, 2, 12)]) + idx1 = DatetimeIndex(arr, dayfirst=True) + idx2 = DatetimeIndex(np.array(arr), dayfirst=True) + idx3 = to_datetime(arr, dayfirst=True) + idx4 = to_datetime(np.array(arr), dayfirst=True) + idx5 = DatetimeIndex(Index(arr), dayfirst=True) + idx6 = DatetimeIndex(Series(arr), dayfirst=True) + tm.assert_index_equal(expected, idx1) + tm.assert_index_equal(expected, idx2) + tm.assert_index_equal(expected, idx3) + tm.assert_index_equal(expected, idx4) + tm.assert_index_equal(expected, idx5) + tm.assert_index_equal(expected, idx6) + + +class TestGuessDatetimeFormat(tm.TestCase): + + def test_guess_datetime_format_with_parseable_formats(self): + tm._skip_if_not_us_locale() + dt_string_to_format = (('20111230', '%Y%m%d'), + ('2011-12-30', '%Y-%m-%d'), + ('30-12-2011', '%d-%m-%Y'), + ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'), + ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'), + ('2011-12-30 00:00:00.000000', + '%Y-%m-%d %H:%M:%S.%f'), ) + + for dt_string, dt_format in dt_string_to_format: + self.assertEqual( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_with_dayfirst(self): + ambiguous_string = '01/01/2011' + self.assertEqual( + tools._guess_datetime_format(ambiguous_string, dayfirst=True), + '%d/%m/%Y' + ) + self.assertEqual( + tools._guess_datetime_format(ambiguous_string, dayfirst=False), + '%m/%d/%Y' + ) + + def test_guess_datetime_format_with_locale_specific_formats(self): + # The month names will vary depending on the locale, in which + # case these wont be parsed properly (dateutil can't parse them) + tm._skip_if_has_locale() + + dt_string_to_format = (('30/Dec/2011', '%d/%b/%Y'), + ('30/December/2011', '%d/%B/%Y'), + ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'), ) + + for dt_string, dt_format in dt_string_to_format: + self.assertEqual( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_invalid_inputs(self): + # A datetime string must include a year, month and a day for it + # to be 
guessable, in addition to being a string that looks like + # a datetime + invalid_dts = [ + '2013', + '01/2013', + '12:00:00', + '1/1/1/1', + 'this_is_not_a_datetime', + '51a', + 9, + datetime(2011, 1, 1), + ] + + for invalid_dt in invalid_dts: + self.assertTrue(tools._guess_datetime_format(invalid_dt) is None) + + def test_guess_datetime_format_nopadding(self): + # GH 11142 + dt_string_to_format = (('2011-1-1', '%Y-%m-%d'), + ('30-1-2011', '%d-%m-%Y'), + ('1/1/2011', '%m/%d/%Y'), + ('2011-1-1 00:00:00', '%Y-%m-%d %H:%M:%S'), + ('2011-1-1 0:0:0', '%Y-%m-%d %H:%M:%S'), + ('2011-1-3T00:00:0', '%Y-%m-%dT%H:%M:%S')) + + for dt_string, dt_format in dt_string_to_format: + self.assertEqual( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_for_array(self): + tm._skip_if_not_us_locale() + expected_format = '%Y-%m-%d %H:%M:%S.%f' + dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) + + test_arrays = [ + np.array([dt_string, dt_string, dt_string], dtype='O'), + np.array([np.nan, np.nan, dt_string], dtype='O'), + np.array([dt_string, 'random_string'], dtype='O'), + ] + + for test_array in test_arrays: + self.assertEqual( + tools._guess_datetime_format_for_array(test_array), + expected_format + ) + + format_for_string_of_nans = tools._guess_datetime_format_for_array( + np.array( + [np.nan, np.nan, np.nan], dtype='O')) + self.assertTrue(format_for_string_of_nans is None) + + +class TestToDatetimeInferFormat(tm.TestCase): + + def test_to_datetime_infer_datetime_format_consistent_format(self): + s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) + + test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', + '%Y-%m-%dT%H:%M:%S.%f'] + + for test_format in test_formats: + s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) + + with_format = pd.to_datetime(s_as_dt_strings, format=test_format) + no_infer = pd.to_datetime(s_as_dt_strings, + infer_datetime_format=False) + yes_infer = pd.to_datetime(s_as_dt_strings, + infer_datetime_format=True) + + # Whether the format is explicitly passed, it is inferred, or + # it is not inferred, the results should all be the same + self.assert_series_equal(with_format, no_infer) + self.assert_series_equal(no_infer, yes_infer) + + def test_to_datetime_infer_datetime_format_inconsistent_format(self): + s = pd.Series(np.array(['01/01/2011 00:00:00', + '01-02-2011 00:00:00', + '2011-01-03T00:00:00'])) + + # When the format is inconsistent, infer_datetime_format should just + # fallback to the default parsing + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) + + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + def test_to_datetime_infer_datetime_format_series_with_nans(self): + s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, + '01/03/2011 00:00:00', np.nan])) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): + s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', + '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) + + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) + + def test_to_datetime_iso8601_noleading_0s(self): + # GH 11871 + s = 
pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) + expected = pd.Series([pd.Timestamp('2014-01-01'), + pd.Timestamp('2014-02-02'), + pd.Timestamp('2015-03-03')]) + tm.assert_series_equal(pd.to_datetime(s), expected) + tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) + + +class TestDaysInMonth(tm.TestCase): + # tests for issue #10154 + def test_day_not_in_month_coerce(self): + self.assertTrue(isnull(to_datetime('2015-02-29', errors='coerce'))) + self.assertTrue(isnull(to_datetime('2015-02-29', format="%Y-%m-%d", + errors='coerce'))) + self.assertTrue(isnull(to_datetime('2015-02-32', format="%Y-%m-%d", + errors='coerce'))) + self.assertTrue(isnull(to_datetime('2015-04-31', format="%Y-%m-%d", + errors='coerce'))) + + def test_day_not_in_month_raise(self): + self.assertRaises(ValueError, to_datetime, '2015-02-29', + errors='raise') + self.assertRaises(ValueError, to_datetime, '2015-02-29', + errors='raise', format="%Y-%m-%d") + self.assertRaises(ValueError, to_datetime, '2015-02-32', + errors='raise', format="%Y-%m-%d") + self.assertRaises(ValueError, to_datetime, '2015-04-31', + errors='raise', format="%Y-%m-%d") + + def test_day_not_in_month_ignore(self): + self.assertEqual(to_datetime( + '2015-02-29', errors='ignore'), '2015-02-29') + self.assertEqual(to_datetime( + '2015-02-29', errors='ignore', format="%Y-%m-%d"), '2015-02-29') + self.assertEqual(to_datetime( + '2015-02-32', errors='ignore', format="%Y-%m-%d"), '2015-02-32') + self.assertEqual(to_datetime( + '2015-04-31', errors='ignore', format="%Y-%m-%d"), '2015-04-31') diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 23261c2ef79e2..7bcd1763537dc 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from pandas import Series, DataFrame +from pandas import Series, DataFrame, date_range, DatetimeIndex from pandas import compat from pandas.util.testing import assert_series_equal @@ -218,3 +218,103 @@ def test_combine_first_dt64(self): rs = s0.combine_first(s1) xp = Series([datetime(2010, 1, 1), '2011']) assert_series_equal(rs, xp) + + +class TestTimeseries(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_append_concat(self): + rng = date_range('5/8/2012 1:45', periods=10, freq='5T') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + + result = ts.append(ts) + result_df = df.append(df) + ex_index = DatetimeIndex(np.tile(rng.values, 2)) + tm.assert_index_equal(result.index, ex_index) + tm.assert_index_equal(result_df.index, ex_index) + + appended = rng.append(rng) + tm.assert_index_equal(appended, ex_index) + + appended = rng.append([rng, rng]) + ex_index = DatetimeIndex(np.tile(rng.values, 3)) + tm.assert_index_equal(appended, ex_index) + + # different index names + rng1 = rng.copy() + rng2 = rng.copy() + rng1.name = 'foo' + rng2.name = 'bar' + self.assertEqual(rng1.append(rng1).name, 'foo') + self.assertIsNone(rng1.append(rng2).name) + + def test_append_concat_tz(self): + # GH 2938 + tm._skip_if_no_pytz() + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz='US/Eastern') + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz='US/Eastern') + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz='US/Eastern') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) 
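+        # (rng and rng2 abut in time -- rng ends at 02:30 and rng2 starts at
+        # 02:35 on the same 5-minute grid -- so appending or concatenating
+        # the two tz-aware halves should reproduce rng3 exactly, with the
+        # US/Eastern tz preserved.)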
+ df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_concat_tz_explicit_pytz(self): + # GH 2938 + tm._skip_if_no_pytz() + from pytz import timezone as timezone + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz=timezone('US/Eastern')) + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz=timezone('US/Eastern')) + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz=timezone('US/Eastern')) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_concat_tz_dateutil(self): + # GH 2938 + tm._skip_if_no_dateutil() + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz='dateutil/US/Eastern') + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz='dateutil/US/Eastern') + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz='dateutil/US/Eastern') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 05818b013ac52..777b188b8fdd9 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -9,11 +9,12 @@ import pandas as pd from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype -from pandas import Index, Series, isnull, date_range, period_range +from pandas import (Index, Series, isnull, date_range, + period_range, NaT) from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp, DatetimeIndex -import pandas.lib as lib +from pandas import lib, tslib from pandas.compat import lrange, range, zip, OrderedDict, long from pandas import compat @@ -214,7 +215,6 @@ def test_constructor_maskedarray(self): expected = Series([True, True, False], index=index, dtype=bool) assert_series_equal(result, expected) - from pandas import tslib data = ma.masked_all((3, ), dtype='M8[ns]') result = Series(data) expected = Series([tslib.iNaT, tslib.iNaT, tslib.iNaT], dtype='M8[ns]') @@ -234,6 +234,13 @@ def test_constructor_maskedarray(self): datetime(2001, 1, 3)], index=index, dtype='M8[ns]') assert_series_equal(result, expected) + def test_series_ctor_plus_datetimeindex(self): + rng = date_range('20090415', '20090519', freq='B') + data = dict((k, 1) for k in rng) + + result = Series(data, index=rng) + self.assertIs(result.index, rng) + def test_constructor_default_index(self): s = Series([0, 1, 2]) tm.assert_index_equal(s.index, pd.Index(np.arange(3))) @@ -800,6 +807,21 @@ def f(): s = Series([pd.NaT, np.nan, '1 Day']) self.assertEqual(s.dtype, 'timedelta64[ns]') + 
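+    # Aside (illustrative sketch, hedged; not part of the original patch):
+    # in an M8[ns] Series, missing values are stored as iNaT internally and
+    # surface as NaT, and assigning np.nan into such a series coerces it to
+    # NaT -- the behavior the two tests below pin down. For example:
+    #
+    #   import numpy as np
+    #   import pandas as pd
+    #   s = pd.Series(pd.date_range('2000-01-01', periods=3))
+    #   s[1] = np.nan
+    #   assert s[1] is pd.NaT
+    #   assert s.isnull().tolist() == [False, True, False]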
def test_NaT_scalar(self): + series = Series([0, 1000, 2000, tslib.iNaT], dtype='M8[ns]') + + val = series[3] + self.assertTrue(isnull(val)) + + series[2] = val + self.assertTrue(isnull(series[2])) + + def test_NaT_cast(self): + # GH10747 + result = Series([np.nan]).astype('M8[ns]') + expected = Series([NaT]) + assert_series_equal(result, expected) + def test_constructor_name_hashable(self): for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), u"\u05D0"]: for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]: @@ -810,3 +832,20 @@ def test_constructor_name_unhashable(self): for n in [['name_list'], np.ones(2), {1: 2}]: for data in [['name_list'], np.ones(2), {1: 2}]: self.assertRaises(TypeError, Series, data, name=n) + + def test_auto_conversion(self): + series = Series(list(date_range('1/1/2000', periods=10))) + self.assertEqual(series.dtype, 'M8[ns]') + + def test_constructor_cant_cast_datetime64(self): + msg = "Cannot cast datetime64 to " + with tm.assertRaisesRegexp(TypeError, msg): + Series(date_range('1/1/2000', periods=10), dtype=float) + + with tm.assertRaisesRegexp(TypeError, msg): + Series(date_range('1/1/2000', periods=10), dtype=int) + + def test_constructor_cast_object(self): + s = Series(date_range('1/1/2000', periods=10), dtype=object) + exp = Series(date_range('1/1/2000', periods=10)) + tm.assert_series_equal(s, exp) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 1a1ff28bbb398..127a410f66fdb 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,9 +8,7 @@ from numpy import nan import numpy as np -from pandas import Series -from pandas.tseries.index import Timestamp -from pandas.tseries.tdi import Timedelta +from pandas import Series, Timestamp, Timedelta, DataFrame, date_range from pandas.compat import lrange, range, u from pandas import compat @@ -181,3 +179,24 @@ def test_arg_for_errors_in_astype(self): sr.astype(np.int8, raise_on_error=True) sr.astype(np.int8, errors='raise') + + def test_intercept_astype_object(self): + series = Series(date_range('1/1/2000', periods=10)) + + # this test no longer makes sense as series is by default already + # M8[ns] + expected = series.astype('object') + + df = DataFrame({'a': series, + 'b': np.random.randn(len(series))}) + exp_dtypes = Series([np.dtype('datetime64[ns]'), + np.dtype('float64')], index=['a', 'b']) + tm.assert_series_equal(df.dtypes, exp_dtypes) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + df = DataFrame({'a': series, 'b': ['foo'] * len(series)}) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index d4b6e7dd5349f..bdae11770de65 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -7,17 +7,21 @@ import numpy as np import pandas as pd +import pandas.index as _index from pandas.types.common import is_integer, is_scalar -from pandas import Index, Series, DataFrame, isnull, date_range -from pandas.core.index import MultiIndex +from pandas import (Index, Series, DataFrame, isnull, + date_range, NaT, MultiIndex, + Timestamp, DatetimeIndex, Timedelta) from pandas.core.indexing import IndexingError -from pandas.tseries.index import Timestamp from pandas.tseries.offsets import BDay -from pandas.tseries.tdi import Timedelta +from pandas import lib, tslib from pandas.compat import lrange, range from pandas import compat 
-from pandas.util.testing import assert_series_equal, assert_almost_equal +from pandas.util.testing import (slow, + assert_series_equal, + assert_almost_equal, + assert_frame_equal) import pandas.util.testing as tm from pandas.tests.series.common import TestData @@ -421,6 +425,84 @@ def test_getitem_setitem_datetime_tz_dateutil(self): result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] assert_series_equal(result, ts) + def test_getitem_setitem_datetimeindex(self): + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04:00:00"] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00"] = 0 + result["1990-01-01 04:00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04:00:00" + rb = "1990-01-01 07:00:00" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + # repeat all the above with naive datetimes + result = ts[datetime(1990, 1, 1, 4)] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4)] = 0 + result[datetime(1990, 1, 1, 4)] = ts[4] + assert_series_equal(result, ts) + + result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] + assert_series_equal(result, ts) + + lb = datetime(1990, 1, 1, 4) + rb = datetime(1990, 1, 1, 7) + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts[ts.index[4]] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + # also test partial date slicing + result = ts["1990-01-02"] + expected = ts[24:48] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-02"] = 0 + result["1990-01-02"] = ts[24:48] + assert_series_equal(result, ts) + def test_getitem_setitem_periodindex(self): from pandas import period_range @@ -1835,6 +1917,28 @@ def test_reindex_nan(self): # reindex coerces index.dtype to float, loc/iloc doesn't assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) + def test_reindex_series_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + series = Series(rng) + + result = series.reindex(lrange(15)) + self.assertTrue(np.issubdtype(result.dtype, np.dtype('M8[ns]'))) + + mask = result.isnull() + self.assertTrue(mask[-5:].all()) + self.assertFalse(mask[:-5].any()) + + def test_reindex_with_datetimes(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = ts[5:10] + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, 
expected) + def test_reindex_corner(self): # (don't forget to fix this) I think it's fixed self.empty.reindex(self.ts.index, method='pad') # it works @@ -2110,6 +2214,432 @@ def test_setitem_slice_into_readonly_backing_data(self): ' array was still mutated!', ) + +class TestTimeSeriesDuplicates(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), + datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 3), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 4), + datetime(2000, 1, 4), datetime(2000, 1, 5)] + + self.dups = Series(np.random.randn(len(dates)), index=dates) + + def test_constructor(self): + tm.assertIsInstance(self.dups, Series) + tm.assertIsInstance(self.dups.index, DatetimeIndex) + + def test_is_unique_monotonic(self): + self.assertFalse(self.dups.index.is_unique) + + def test_index_unique(self): + uniques = self.dups.index.unique() + expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 5)]) + self.assertEqual(uniques.dtype, 'M8[ns]') # sanity + tm.assert_index_equal(uniques, expected) + self.assertEqual(self.dups.index.nunique(), 4) + + # #2563 + self.assertTrue(isinstance(uniques, DatetimeIndex)) + + dups_local = self.dups.index.tz_localize('US/Eastern') + dups_local.name = 'foo' + result = dups_local.unique() + expected = DatetimeIndex(expected, name='foo') + expected = expected.tz_localize('US/Eastern') + self.assertTrue(result.tz is not None) + self.assertEqual(result.name, 'foo') + tm.assert_index_equal(result, expected) + + # NaT, note this is excluded + arr = [1370745748 + t for t in range(20)] + [tslib.iNaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + + arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) + for t in range(20)] + [NaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + + def test_index_dupes_contains(self): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + self.assertTrue(d in ix) + + def test_duplicate_dates_indexing(self): + ts = self.dups + + uniques = ts.index.unique() + for date in uniques: + result = ts[date] + + mask = ts.index == date + total = (ts.index == date).sum() + expected = ts[mask] + if total > 1: + assert_series_equal(result, expected) + else: + assert_almost_equal(result, expected[0]) + + cp = ts.copy() + cp[date] = 0 + expected = Series(np.where(mask, 0, ts), index=ts.index) + assert_series_equal(cp, expected) + + self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) + + # new index + ts[datetime(2000, 1, 6)] = 0 + self.assertEqual(ts[datetime(2000, 1, 6)], 0) + + def test_range_slice(self): + idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', + '1/4/2000']) + + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts['1/2/2000':] + expected = ts[1:] + assert_series_equal(result, expected) + + result = ts['1/2/2000':'1/3/2000'] + expected = ts[1:4] + assert_series_equal(result, expected) + + def test_groupby_average_dup_values(self): + result = self.dups.groupby(level=0).mean() + expected = self.dups.groupby(self.dups.index).mean() + assert_series_equal(result, expected) + + def test_indexing_over_size_cutoff(self): + import datetime + # #1821 + + old_cutoff = 
_index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + # create large list of non periodic datetime + dates = [] + sec = datetime.timedelta(seconds=1) + half_sec = datetime.timedelta(microseconds=500000) + d = datetime.datetime(2011, 12, 5, 20, 30) + n = 1100 + for i in range(n): + dates.append(d) + dates.append(d + sec) + dates.append(d + sec + half_sec) + dates.append(d + sec + sec + half_sec) + d += 3 * sec + + # duplicate some values in the list + duplicate_positions = np.random.randint(0, len(dates) - 1, 20) + for p in duplicate_positions: + dates[p + 1] = dates[p] + + df = DataFrame(np.random.randn(len(dates), 4), + index=dates, + columns=list('ABCD')) + + pos = n * 3 + timestamp = df.index[pos] + self.assertIn(timestamp, df.index) + + # it works! + df.loc[timestamp] + self.assertTrue(len(df.loc[[timestamp]]) > 0) + finally: + _index._SIZE_CUTOFF = old_cutoff + + def test_indexing_unordered(self): + # GH 2437 + rng = date_range(start='2011-01-01', end='2011-01-15') + ts = Series(np.random.rand(len(rng)), index=rng) + ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) + + for t in ts.index: + # TODO: unused? + s = str(t) # noqa + + expected = ts[t] + result = ts2[t] + self.assertTrue(expected == result) + + # GH 3448 (ranges) + def compare(slobj): + result = ts2[slobj].copy() + result = result.sort_index() + expected = ts[slobj] + assert_series_equal(result, expected) + + compare(slice('2011-01-01', '2011-01-15')) + compare(slice('2010-12-30', '2011-01-15')) + compare(slice('2011-01-01', '2011-01-16')) + + # partial ranges + compare(slice('2011-01-01', '2011-01-6')) + compare(slice('2011-01-06', '2011-01-8')) + compare(slice('2011-01-06', '2011-01-12')) + + # single values + result = ts2['2011'].sort_index() + expected = ts['2011'] + assert_series_equal(result, expected) + + # diff freq + rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') + ts = Series(np.arange(len(rng)), index=rng) + ts = ts.take(np.random.permutation(20)) + + result = ts['2005'] + for t in result.index: + self.assertTrue(t.year == 2005) + + def test_indexing(self): + + idx = date_range("2001-1-1", periods=20, freq='M') + ts = Series(np.random.rand(len(idx)), index=idx) + + # getting + + # GH 3070, make sure semantics work on Series/Frame + expected = ts['2001'] + expected.name = 'A' + + df = DataFrame(dict(A=ts)) + result = df['2001']['A'] + assert_series_equal(expected, result) + + # setting + ts['2001'] = 1 + expected = ts['2001'] + expected.name = 'A' + + df.loc['2001', 'A'] = 1 + + result = df['2001']['A'] + assert_series_equal(expected, result) + + # GH3546 (not including times on the last day) + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', + freq='H') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected, ts) + + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', + freq='S') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected, ts) + + idx = [Timestamp('2013-05-31 00:00'), + Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013'] + assert_series_equal(expected, ts) + + # GH14826, indexing with a seconds resolution string / datetime object + df = DataFrame(np.random.rand(5, 5), + columns=['open', 'high', 'low', 'close', 'volume'], + index=date_range('2012-01-02 18:01:00', + periods=5, tz='US/Central', freq='s')) + expected = df.loc[[df.index[2]]] + + # this is a single date, so will raise + 
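+        # (Why: DataFrame.__getitem__ is column-based, and a string at the
+        # same resolution as the index -- seconds here -- is treated as an
+        # exact label rather than a partial slice, so both lookups fall
+        # through to column lookup and raise KeyError. Row access at that
+        # resolution needs .loc, as in 'expected' above.)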
self.assertRaises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) + self.assertRaises(KeyError, df.__getitem__, df.index[2], ) + + +class TestDatetimeIndexing(tm.TestCase): + """ + Also test support for datetime64[ns] in Series / DataFrame + """ + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005, 1, 1), + end=datetime(2005, 1, 10), freq='Min') + self.series = Series(np.random.rand(len(dti)), dti) + + def test_fancy_getitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + + self.assertEqual(s[48], 48) + self.assertEqual(s['1/2/2009'], 48) + self.assertEqual(s['2009-1-2'], 48) + self.assertEqual(s[datetime(2009, 1, 2)], 48) + self.assertEqual(s[lib.Timestamp(datetime(2009, 1, 2))], 48) + self.assertRaises(KeyError, s.__getitem__, '2009-1-3') + + assert_series_equal(s['3/6/2009':'2009-06-05'], + s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) + + def test_fancy_setitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + s[48] = -1 + self.assertEqual(s[48], -1) + s['1/2/2009'] = -2 + self.assertEqual(s[48], -2) + s['1/2/2009':'2009-06-05'] = -3 + self.assertTrue((s[48:54] == -3).all()) + + def test_dti_snap(self): + dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', + '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') + + res = dti.snap(freq='W-MON') + exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') + exp = exp.repeat([3, 4]) + self.assertTrue((res == exp).all()) + + res = dti.snap(freq='B') + + exp = date_range('1/1/2002', '1/7/2002', freq='b') + exp = exp.repeat([1, 1, 1, 2, 2]) + self.assertTrue((res == exp).all()) + + def test_dti_reset_index_round_trip(self): + dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') + d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) + d2 = d1.reset_index() + self.assertEqual(d2.dtypes[0], np.dtype('M8[ns]')) + d3 = d2.set_index('index') + assert_frame_equal(d1, d3, check_names=False) + + # #2329 + stamp = datetime(2012, 11, 22) + df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) + df = df.set_index('Date') + + self.assertEqual(df.index[0], stamp) + self.assertEqual(df.reset_index()['Date'][0], stamp) + + def test_series_set_value(self): + # #1561 + + dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] + index = DatetimeIndex(dates) + + s = Series().set_value(dates[0], 1.) 
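+        # (set_value returns a new series when the label is absent, so s2
+        # below carries both entries while s itself keeps only the first.)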
+ s2 = s.set_value(dates[1], np.nan) + + exp = Series([1., np.nan], index=index) + + assert_series_equal(s2, exp) + + # s = Series(index[:1], index[:1]) + # s2 = s.set_value(dates[1], index[1]) + # self.assertEqual(s2.values.dtype, 'M8[ns]') + + @slow + def test_slice_locs_indexerror(self): + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) + for i in range(100000)] + s = Series(lrange(100000), times) + s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] + + def test_slicing_datetimes(self): + + # GH 7523 + + # unique + df = DataFrame(np.arange(4., dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) + for i in [1, 2, 3, 4]]) + result = df.loc[datetime(2001, 1, 1, 10):] + assert_frame_equal(result, df) + result = df.loc[:datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11):] + expected = df.iloc[1:] + assert_frame_equal(result, expected) + result = df.loc['20010101 11':] + assert_frame_equal(result, expected) + + # duplicates + df = pd.DataFrame(np.arange(5., dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) + for i in [1, 2, 2, 3, 4]]) + + result = df.loc[datetime(2001, 1, 1, 10):] + assert_frame_equal(result, df) + result = df.loc[:datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11):] + expected = df.iloc[1:] + assert_frame_equal(result, expected) + result = df.loc['20010101 11':] + assert_frame_equal(result, expected) + + def test_frame_datetime64_duplicated(self): + dates = date_range('2010-07-01', end='2010-08-05') + + tst = DataFrame({'symbol': 'AAA', 'date': dates}) + result = tst.duplicated(['date', 'symbol']) + self.assertTrue((-result).all()) + + tst = DataFrame({'date': dates}) + result = tst.duplicated() + self.assertTrue((-result).all()) + + +class TestNatIndexing(tm.TestCase): + def setUp(self): + self.series = Series(date_range('1/1/2000', periods=10)) + + # --------------------------------------------------------------------- + # NaT support + + def test_set_none_nan(self): + self.series[3] = None + self.assertIs(self.series[3], NaT) + + self.series[3:5] = None + self.assertIs(self.series[4], NaT) + + self.series[5] = np.nan + self.assertIs(self.series[5], NaT) + + self.series[5:7] = np.nan + self.assertIs(self.series[6], NaT) + + def test_nat_operations(self): + # GH 8617 + s = Series([0, pd.NaT], dtype='m8[ns]') + exp = s[0] + self.assertEqual(s.median(), exp) + self.assertEqual(s.min(), exp) + self.assertEqual(s.max(), exp) + + def test_round_nat(self): + # GH14940 + s = Series([pd.NaT]) + expected = Series(pd.NaT) + for method in ["round", "floor", "ceil"]: + round_method = getattr(s.dt, method) + for freq in ["s", "5s", "min", "5min", "h", "5h"]: + assert_series_equal(round_method(freq), expected) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index b013e1a6f1c10..7b1201b971c71 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -30,6 +30,19 @@ class TestSeriesOperators(TestData, tm.TestCase): _multiprocess_can_split_ = True + def test_series_comparison_scalars(self): + series = Series(date_range('1/1/2000', periods=10)) + + val = datetime(2000, 1, 4) 
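+        # (Comparing an M8[ns] series against a datetime scalar broadcasts
+        # element-wise to a boolean series; the list comprehensions below
+        # rebuild the same masks from scalar comparisons as a cross-check.)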
+ result = series > val + expected = Series([x > val for x in series]) + self.assert_series_equal(result, expected) + + val = series[5] + result = series > val + expected = Series([x > val for x in series]) + self.assert_series_equal(result, expected) + def test_comparisons(self): left = np.random.randn(10) right = np.random.randn(10) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 571a802e37211..9754a9d3737e3 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1,35 +1,24 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -import sys -import nose -import locale -import calendar import numpy as np -from numpy.random import rand from datetime import datetime, timedelta, time import pandas as pd -import pandas.index as _index -import pandas.tseries.tools as tools -import pandas.core.common as com import pandas.util.testing as tm from pandas.tslib import iNaT -from pandas.compat import lrange, lmap, StringIO, product +from pandas.compat import lrange, StringIO, product from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.index import DatetimeIndex from pandas.tseries.offsets import BDay, BMonthEnd -from pandas.types.common import is_datetime64_ns_dtype from pandas import (Index, Series, date_range, NaT, concat, DataFrame, - Timestamp, lib, isnull, to_datetime, offsets, Timedelta, - tslib, bdate_range, Period, timedelta_range, compat) + Timestamp, to_datetime, offsets, + timedelta_range) from pandas.util.testing import (assert_series_equal, assert_almost_equal, - slow, assert_frame_equal, _skip_if_has_locale) + assert_frame_equal, _skip_if_has_locale) from pandas.tests.series.common import TestData -randn = np.random.randn - def _simple_ts(start, end, freq='D'): rng = date_range(start, end, freq=freq) @@ -118,6 +107,22 @@ def test_shift(self): tz='CET'), name='foo') self.assertRaises(ValueError, lambda: s - s2) + def test_shift2(self): + ts = Series(np.random.randn(5), + index=date_range('1/1/2000', periods=5, freq='H')) + + result = ts.shift(1, freq='5T') + exp_index = ts.index.shift(1, freq='5T') + tm.assert_index_equal(result.index, exp_index) + + # GH #1063, multiple of same base + result = ts.shift(1, freq='4H') + exp_index = ts.index + offsets.Hour(4) + tm.assert_index_equal(result.index, exp_index) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.shift, 1) + def test_shift_dst(self): # GH 13926 dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern') @@ -477,28 +482,6 @@ def test_series_ctor_datetime64(self): series = Series(dates) self.assertTrue(np.issubdtype(series.dtype, np.dtype('M8[ns]'))) - def test_reindex_series_add_nat(self): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') - series = Series(rng) - - result = series.reindex(lrange(15)) - self.assertTrue(np.issubdtype(result.dtype, np.dtype('M8[ns]'))) - - mask = result.isnull() - self.assertTrue(mask[-5:].all()) - self.assertFalse(mask[:-5].any()) - - def test_reindex_frame_add_nat(self): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') - df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) - - result = df.reindex(lrange(15)) - self.assertTrue(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))) - - mask = com.isnull(result)['B'] - self.assertTrue(mask[-5:].all()) - self.assertFalse(mask[:-5].any()) - def test_series_repr_nat(self): series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') @@ -510,36 +493,6 @@ def 
test_series_repr_nat(self): 'dtype: datetime64[ns]') self.assertEqual(result, expected) - def test_index_convert_to_datetime_array(self): - tm._skip_if_no_pytz() - - def _check_rng(rng): - converted = rng.to_pydatetime() - tm.assertIsInstance(converted, np.ndarray) - for x, stamp in zip(converted, rng): - tm.assertIsInstance(x, datetime) - self.assertEqual(x, stamp.to_pydatetime()) - self.assertEqual(x.tzinfo, stamp.tzinfo) - - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') - rng_utc = date_range('20090415', '20090519', tz='utc') - - _check_rng(rng) - _check_rng(rng_eastern) - _check_rng(rng_utc) - - def test_reindex_with_datetimes(self): - rng = date_range('1/1/2000', periods=20) - ts = Series(np.random.randn(20), index=rng) - - result = ts.reindex(list(ts.index[5:10])) - expected = ts[5:10] - tm.assert_series_equal(result, expected) - - result = ts[list(ts.index[5:10])] - tm.assert_series_equal(result, expected) - def test_asfreq_keep_index_name(self): # GH #9854 index_name = 'bar' @@ -680,35 +633,13 @@ def test_at_time(self): rs = ts.at_time('16:00') self.assertEqual(len(rs), 0) - def test_at_time_frame(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') - ts = DataFrame(np.random.randn(len(rng), 2), index=rng) - rs = ts.at_time(rng[1]) - self.assertTrue((rs.index.hour == rng[1].hour).all()) - self.assertTrue((rs.index.minute == rng[1].minute).all()) - self.assertTrue((rs.index.second == rng[1].second).all()) - - result = ts.at_time('9:30') - expected = ts.at_time(time(9, 30)) - assert_frame_equal(result, expected) - - result = ts.loc[time(9, 30)] - expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)] - - assert_frame_equal(result, expected) - - # midnight, everything - rng = date_range('1/1/2000', '1/31/2000') - ts = DataFrame(np.random.randn(len(rng), 3), index=rng) - - result = ts.at_time(time(0, 0)) - assert_frame_equal(result, ts) + def test_between(self): + series = Series(date_range('1/1/2000', periods=10)) + left, right = series[[2, 7]] - # time doesn't exist - rng = date_range('1/1/2012', freq='23Min', periods=384) - ts = DataFrame(np.random.randn(len(rng), 2), rng) - rs = ts.at_time('16:00') - self.assertEqual(len(rs), 0) + result = series.between(left, right) + expected = (series >= left) & (series <= right) + assert_series_equal(result, expected) def test_between_time(self): rng = date_range('1/1/2000', '1/5/2000', freq='5min') @@ -770,66 +701,6 @@ def test_between_time(self): else: self.assertTrue((t < etime) or (t >= stime)) - def test_between_time_frame(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') - ts = DataFrame(np.random.randn(len(rng), 2), index=rng) - stime = time(0, 0) - etime = time(1, 0) - - close_open = product([True, False], [True, False]) - for inc_start, inc_end in close_open: - filtered = ts.between_time(stime, etime, inc_start, inc_end) - exp_len = 13 * 4 + 1 - if not inc_start: - exp_len -= 5 - if not inc_end: - exp_len -= 4 - - self.assertEqual(len(filtered), exp_len) - for rs in filtered.index: - t = rs.time() - if inc_start: - self.assertTrue(t >= stime) - else: - self.assertTrue(t > stime) - - if inc_end: - self.assertTrue(t <= etime) - else: - self.assertTrue(t < etime) - - result = ts.between_time('00:00', '01:00') - expected = ts.between_time(stime, etime) - assert_frame_equal(result, expected) - - # across midnight - rng = date_range('1/1/2000', '1/5/2000', freq='5min') - ts = DataFrame(np.random.randn(len(rng), 2), index=rng) - stime = time(22, 0) - etime 
= time(9, 0) - - close_open = product([True, False], [True, False]) - for inc_start, inc_end in close_open: - filtered = ts.between_time(stime, etime, inc_start, inc_end) - exp_len = (12 * 11 + 1) * 4 + 1 - if not inc_start: - exp_len -= 4 - if not inc_end: - exp_len -= 4 - - self.assertEqual(len(filtered), exp_len) - for rs in filtered.index: - t = rs.time() - if inc_start: - self.assertTrue((t >= stime) or (t <= etime)) - else: - self.assertTrue((t > stime) or (t <= etime)) - - if inc_end: - self.assertTrue((t <= etime) or (t >= stime)) - else: - self.assertTrue((t < etime) or (t >= stime)) - def test_between_time_types(self): # GH11818 rng = date_range('1/1/2000', '1/5/2000', freq='5min') @@ -897,275 +768,6 @@ def test_to_period(self): expected.columns = exp_idx assert_frame_equal(df.to_period(axis=1), expected) - def create_dt64_based_index(self): - data = [Timestamp('2007-01-01 10:11:12.123456Z'), - Timestamp('2007-01-01 10:11:13.789123Z')] - index = DatetimeIndex(data) - return index - - def test_to_period_millisecond(self): - index = self.create_dt64_based_index() - - period = index.to_period(freq='L') - self.assertEqual(2, len(period)) - self.assertEqual(period[0], Period('2007-01-01 10:11:12.123Z', 'L')) - self.assertEqual(period[1], Period('2007-01-01 10:11:13.789Z', 'L')) - - def test_to_period_microsecond(self): - index = self.create_dt64_based_index() - - period = index.to_period(freq='U') - self.assertEqual(2, len(period)) - self.assertEqual(period[0], Period('2007-01-01 10:11:12.123456Z', 'U')) - self.assertEqual(period[1], Period('2007-01-01 10:11:13.789123Z', 'U')) - - def test_to_period_tz_pytz(self): - tm._skip_if_no_pytz() - from dateutil.tz import tzlocal - from pytz import utc as UTC - - xp = date_range('1/1/2000', '4/1/2000').to_period() - - ts = date_range('1/1/2000', '4/1/2000', tz='US/Eastern') - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertEqual(result, expected) - tm.assert_index_equal(ts.to_period(), xp) - - ts = date_range('1/1/2000', '4/1/2000', tz=UTC) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertEqual(result, expected) - tm.assert_index_equal(ts.to_period(), xp) - - ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertEqual(result, expected) - tm.assert_index_equal(ts.to_period(), xp) - - def test_to_period_tz_explicit_pytz(self): - tm._skip_if_no_pytz() - import pytz - from dateutil.tz import tzlocal - - xp = date_range('1/1/2000', '4/1/2000').to_period() - - ts = date_range('1/1/2000', '4/1/2000', tz=pytz.timezone('US/Eastern')) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertTrue(result == expected) - tm.assert_index_equal(ts.to_period(), xp) - - ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertTrue(result == expected) - tm.assert_index_equal(ts.to_period(), xp) - - ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertTrue(result == expected) - tm.assert_index_equal(ts.to_period(), xp) - - def test_to_period_tz_dateutil(self): - tm._skip_if_no_dateutil() - import dateutil - from dateutil.tz import tzlocal - - xp = date_range('1/1/2000', '4/1/2000').to_period() - - ts = date_range('1/1/2000', '4/1/2000', tz='dateutil/US/Eastern') - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertTrue(result == expected) - 
tm.assert_index_equal(ts.to_period(), xp) - - ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc()) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertTrue(result == expected) - tm.assert_index_equal(ts.to_period(), xp) - - ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) - - result = ts.to_period()[0] - expected = ts[0].to_period() - - self.assertTrue(result == expected) - tm.assert_index_equal(ts.to_period(), xp) - - def test_frame_to_period(self): - K = 5 - from pandas.tseries.period import period_range - - dr = date_range('1/1/2000', '1/1/2001') - pr = period_range('1/1/2000', '1/1/2001') - df = DataFrame(randn(len(dr), K), index=dr) - df['mix'] = 'a' - - pts = df.to_period() - exp = df.copy() - exp.index = pr - assert_frame_equal(pts, exp) - - pts = df.to_period('M') - tm.assert_index_equal(pts.index, exp.index.asfreq('M')) - - df = df.T - pts = df.to_period(axis=1) - exp = df.copy() - exp.columns = pr - assert_frame_equal(pts, exp) - - pts = df.to_period('M', axis=1) - tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) - - self.assertRaises(ValueError, df.to_period, axis=2) - - def test_compat_replace(self): - # https://github.com/statsmodels/statsmodels/issues/3349 - # replace should take ints/longs for compat - - for f in [compat.long, int]: - result = date_range(Timestamp('1960-04-01 00:00:00', - freq='QS-JAN'), - periods=f(76), - freq='QS-JAN') - self.assertEqual(len(result), 76) - - def test_astype_object(self): - # NumPy 1.6.1 weak ns support - rng = date_range('1/1/2000', periods=20) - - casted = rng.astype('O') - exp_values = list(rng) - - tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) - self.assertEqual(casted.tolist(), exp_values) - - def test_catch_infinite_loop(self): - offset = offsets.DateOffset(minute=5) - # blow up, don't loop forever - self.assertRaises(Exception, date_range, datetime(2011, 11, 11), - datetime(2011, 11, 12), freq=offset) - - def test_append_concat(self): - rng = date_range('5/8/2012 1:45', periods=10, freq='5T') - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - - result = ts.append(ts) - result_df = df.append(df) - ex_index = DatetimeIndex(np.tile(rng.values, 2)) - tm.assert_index_equal(result.index, ex_index) - tm.assert_index_equal(result_df.index, ex_index) - - appended = rng.append(rng) - tm.assert_index_equal(appended, ex_index) - - appended = rng.append([rng, rng]) - ex_index = DatetimeIndex(np.tile(rng.values, 3)) - tm.assert_index_equal(appended, ex_index) - - # different index names - rng1 = rng.copy() - rng2 = rng.copy() - rng1.name = 'foo' - rng2.name = 'bar' - self.assertEqual(rng1.append(rng1).name, 'foo') - self.assertIsNone(rng1.append(rng2).name) - - def test_append_concat_tz(self): - # GH 2938 - tm._skip_if_no_pytz() - - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz='US/Eastern') - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz='US/Eastern') - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz='US/Eastern') - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def 
test_append_concat_tz_explicit_pytz(self): - # GH 2938 - tm._skip_if_no_pytz() - from pytz import timezone as timezone - - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz=timezone('US/Eastern')) - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz=timezone('US/Eastern')) - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz=timezone('US/Eastern')) - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def test_append_concat_tz_dateutil(self): - # GH 2938 - tm._skip_if_no_dateutil() - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz='dateutil/US/Eastern') - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz='dateutil/US/Eastern') - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz='dateutil/US/Eastern') - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def test_set_dataframe_column_ns_dtype(self): - x = DataFrame([datetime.now(), datetime.now()]) - self.assertEqual(x[0].dtype, np.dtype('M8[ns]')) - def test_groupby_count_dateparseerror(self): dr = date_range(start='1/1/2012', freq='5min', periods=10) @@ -1180,40 +782,6 @@ def test_groupby_count_dateparseerror(self): assert_series_equal(result, expected) - def test_frame_datetime64_handling_groupby(self): - # it works! - df = DataFrame([(3, np.datetime64('2012-07-03')), - (3, np.datetime64('2012-07-04'))], - columns=['a', 'date']) - result = df.groupby('a').first() - self.assertEqual(result['date'][3], Timestamp('2012-07-03')) - - def test_frame_dict_constructor_datetime64_1680(self): - dr = date_range('1/1/2012', periods=10) - s = Series(dr, index=dr) - - # it works! - DataFrame({'a': 'foo', 'b': s}, index=dr) - DataFrame({'a': 'foo', 'b': s.values}, index=dr) - - def test_frame_datetime64_mixed_index_ctor_1681(self): - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') - ts = Series(dr) - - # it works! - d = DataFrame({'A': 'foo', 'B': ts}, index=dr) - self.assertTrue(d['B'].isnull().all()) - - def test_frame_timeseries_to_records(self): - index = date_range('1/1/2000', periods=10) - df = DataFrame(np.random.randn(10, 3), index=index, - columns=['a', 'b', 'c']) - - result = df.to_records() - result['index'].dtype == 'M8[ns]' - - result = df.to_records(index=False) - def test_to_csv_numpy_16_bug(self): frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) @@ -1234,20 +802,6 @@ def f(x): s.apply(f) DataFrame(s).applymap(f) - def test_concat_datetime_datetime64_frame(self): - # #2624 - rows = [] - rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), 'hi']) - - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) - - ind = date_range(start="2000/1/1", freq="D", periods=10) - df1 = DataFrame({'date': ind, 'test': lrange(10)}) - - # it works! 
- pd.concat([df1, df2_obj]) - def test_asfreq_resample_set_correct_freq(self): # GH5613 # we test if .asfreq() and .resample() set the correct value for .freq @@ -1283,1085 +837,6 @@ def test_pickle(self): idx_p = self.round_trip_pickle(idx) tm.assert_index_equal(idx, idx_p) - -class TestTimeSeriesDuplicates(tm.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), - datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 3), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 4), - datetime(2000, 1, 4), datetime(2000, 1, 5)] - - self.dups = Series(np.random.randn(len(dates)), index=dates) - - def test_constructor(self): - tm.assertIsInstance(self.dups, Series) - tm.assertIsInstance(self.dups.index, DatetimeIndex) - - def test_is_unique_monotonic(self): - self.assertFalse(self.dups.index.is_unique) - - def test_index_unique(self): - uniques = self.dups.index.unique() - expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 5)]) - self.assertEqual(uniques.dtype, 'M8[ns]') # sanity - tm.assert_index_equal(uniques, expected) - self.assertEqual(self.dups.index.nunique(), 4) - - # #2563 - self.assertTrue(isinstance(uniques, DatetimeIndex)) - - dups_local = self.dups.index.tz_localize('US/Eastern') - dups_local.name = 'foo' - result = dups_local.unique() - expected = DatetimeIndex(expected, name='foo') - expected = expected.tz_localize('US/Eastern') - self.assertTrue(result.tz is not None) - self.assertEqual(result.name, 'foo') - tm.assert_index_equal(result, expected) - - # NaT, note this is excluded - arr = [1370745748 + t for t in range(20)] + [iNaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - self.assertEqual(idx.nunique(), 20) - self.assertEqual(idx.nunique(dropna=False), 21) - - arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) - for t in range(20)] + [NaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - self.assertEqual(idx.nunique(), 20) - self.assertEqual(idx.nunique(dropna=False), 21) - - def test_index_dupes_contains(self): - d = datetime(2011, 12, 5, 20, 30) - ix = DatetimeIndex([d, d]) - self.assertTrue(d in ix) - - def test_duplicate_dates_indexing(self): - ts = self.dups - - uniques = ts.index.unique() - for date in uniques: - result = ts[date] - - mask = ts.index == date - total = (ts.index == date).sum() - expected = ts[mask] - if total > 1: - assert_series_equal(result, expected) - else: - assert_almost_equal(result, expected[0]) - - cp = ts.copy() - cp[date] = 0 - expected = Series(np.where(mask, 0, ts), index=ts.index) - assert_series_equal(cp, expected) - - self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) - - # new index - ts[datetime(2000, 1, 6)] = 0 - self.assertEqual(ts[datetime(2000, 1, 6)], 0) - - def test_range_slice(self): - idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', - '1/4/2000']) - - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts['1/2/2000':] - expected = ts[1:] - assert_series_equal(result, expected) - - result = ts['1/2/2000':'1/3/2000'] - expected = ts[1:4] - assert_series_equal(result, expected) - - def test_groupby_average_dup_values(self): - result = self.dups.groupby(level=0).mean() - expected = self.dups.groupby(self.dups.index).mean() - assert_series_equal(result, expected) - - def test_indexing_over_size_cutoff(self): - import datetime - # #1821 - - 
old_cutoff = _index._SIZE_CUTOFF - try: - _index._SIZE_CUTOFF = 1000 - - # create large list of non periodic datetime - dates = [] - sec = datetime.timedelta(seconds=1) - half_sec = datetime.timedelta(microseconds=500000) - d = datetime.datetime(2011, 12, 5, 20, 30) - n = 1100 - for i in range(n): - dates.append(d) - dates.append(d + sec) - dates.append(d + sec + half_sec) - dates.append(d + sec + sec + half_sec) - d += 3 * sec - - # duplicate some values in the list - duplicate_positions = np.random.randint(0, len(dates) - 1, 20) - for p in duplicate_positions: - dates[p + 1] = dates[p] - - df = DataFrame(np.random.randn(len(dates), 4), - index=dates, - columns=list('ABCD')) - - pos = n * 3 - timestamp = df.index[pos] - self.assertIn(timestamp, df.index) - - # it works! - df.loc[timestamp] - self.assertTrue(len(df.loc[[timestamp]]) > 0) - finally: - _index._SIZE_CUTOFF = old_cutoff - - def test_indexing_unordered(self): - # GH 2437 - rng = date_range(start='2011-01-01', end='2011-01-15') - ts = Series(randn(len(rng)), index=rng) - ts2 = concat([ts[0:4], ts[-4:], ts[4:-4]]) - - for t in ts.index: - # TODO: unused? - s = str(t) # noqa - - expected = ts[t] - result = ts2[t] - self.assertTrue(expected == result) - - # GH 3448 (ranges) - def compare(slobj): - result = ts2[slobj].copy() - result = result.sort_index() - expected = ts[slobj] - assert_series_equal(result, expected) - - compare(slice('2011-01-01', '2011-01-15')) - compare(slice('2010-12-30', '2011-01-15')) - compare(slice('2011-01-01', '2011-01-16')) - - # partial ranges - compare(slice('2011-01-01', '2011-01-6')) - compare(slice('2011-01-06', '2011-01-8')) - compare(slice('2011-01-06', '2011-01-12')) - - # single values - result = ts2['2011'].sort_index() - expected = ts['2011'] - assert_series_equal(result, expected) - - # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') - ts = Series(np.arange(len(rng)), index=rng) - ts = ts.take(np.random.permutation(20)) - - result = ts['2005'] - for t in result.index: - self.assertTrue(t.year == 2005) - - def test_indexing(self): - - idx = date_range("2001-1-1", periods=20, freq='M') - ts = Series(np.random.rand(len(idx)), index=idx) - - # getting - - # GH 3070, make sure semantics work on Series/Frame - expected = ts['2001'] - expected.name = 'A' - - df = DataFrame(dict(A=ts)) - result = df['2001']['A'] - assert_series_equal(expected, result) - - # setting - ts['2001'] = 1 - expected = ts['2001'] - expected.name = 'A' - - df.loc['2001', 'A'] = 1 - - result = df['2001']['A'] - assert_series_equal(expected, result) - - # GH3546 (not including times on the last day) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', - freq='H') - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013-05'] - assert_series_equal(expected, ts) - - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', - freq='S') - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013-05'] - assert_series_equal(expected, ts) - - idx = [Timestamp('2013-05-31 00:00'), - Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013'] - assert_series_equal(expected, ts) - - # GH14826, indexing with a seconds resolution string / datetime object - df = DataFrame(randn(5, 5), - columns=['open', 'high', 'low', 'close', 'volume'], - index=date_range('2012-01-02 18:01:00', - periods=5, tz='US/Central', freq='s')) - expected = df.loc[[df.index[2]]] - - # this is a single date, so will raise - 
self.assertRaises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) - self.assertRaises(KeyError, df.__getitem__, df.index[2], ) - - -class TestDatetime64(tm.TestCase): - """ - Also test support for datetime64[ns] in Series / DataFrame - """ - - def setUp(self): - dti = DatetimeIndex(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='Min') - self.series = Series(rand(len(dti)), dti) - - def test_fancy_getitem(self): - dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) - - s = Series(np.arange(len(dti)), index=dti) - - self.assertEqual(s[48], 48) - self.assertEqual(s['1/2/2009'], 48) - self.assertEqual(s['2009-1-2'], 48) - self.assertEqual(s[datetime(2009, 1, 2)], 48) - self.assertEqual(s[lib.Timestamp(datetime(2009, 1, 2))], 48) - self.assertRaises(KeyError, s.__getitem__, '2009-1-3') - - assert_series_equal(s['3/6/2009':'2009-06-05'], - s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) - - def test_fancy_setitem(self): - dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) - - s = Series(np.arange(len(dti)), index=dti) - s[48] = -1 - self.assertEqual(s[48], -1) - s['1/2/2009'] = -2 - self.assertEqual(s[48], -2) - s['1/2/2009':'2009-06-05'] = -3 - self.assertTrue((s[48:54] == -3).all()) - - def test_dti_snap(self): - dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', - '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') - - res = dti.snap(freq='W-MON') - exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') - exp = exp.repeat([3, 4]) - self.assertTrue((res == exp).all()) - - res = dti.snap(freq='B') - - exp = date_range('1/1/2002', '1/7/2002', freq='b') - exp = exp.repeat([1, 1, 1, 2, 2]) - self.assertTrue((res == exp).all()) - - def test_dti_reset_index_round_trip(self): - dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') - d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) - d2 = d1.reset_index() - self.assertEqual(d2.dtypes[0], np.dtype('M8[ns]')) - d3 = d2.set_index('index') - assert_frame_equal(d1, d3, check_names=False) - - # #2329 - stamp = datetime(2012, 11, 22) - df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) - df = df.set_index('Date') - - self.assertEqual(df.index[0], stamp) - self.assertEqual(df.reset_index()['Date'][0], stamp) - - def test_series_set_value(self): - # #1561 - - dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] - index = DatetimeIndex(dates) - - s = Series().set_value(dates[0], 1.) 
- s2 = s.set_value(dates[1], np.nan) - - exp = Series([1., np.nan], index=index) - - assert_series_equal(s2, exp) - - # s = Series(index[:1], index[:1]) - # s2 = s.set_value(dates[1], index[1]) - # self.assertEqual(s2.values.dtype, 'M8[ns]') - - @slow - def test_slice_locs_indexerror(self): - times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) - for i in range(100000)] - s = Series(lrange(100000), times) - s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] - - def test_slicing_datetimes(self): - - # GH 7523 - - # unique - df = DataFrame(np.arange(4., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 3, 4]]) - result = df.loc[datetime(2001, 1, 1, 10):] - assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11):] - expected = df.iloc[1:] - assert_frame_equal(result, expected) - result = df.loc['20010101 11':] - assert_frame_equal(result, expected) - - # duplicates - df = pd.DataFrame(np.arange(5., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 2, 3, 4]]) - - result = df.loc[datetime(2001, 1, 1, 10):] - assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11):] - expected = df.iloc[1:] - assert_frame_equal(result, expected) - result = df.loc['20010101 11':] - assert_frame_equal(result, expected) - - def test_frame_datetime64_duplicated(self): - dates = date_range('2010-07-01', end='2010-08-05') - - tst = DataFrame({'symbol': 'AAA', 'date': dates}) - result = tst.duplicated(['date', 'symbol']) - self.assertTrue((-result).all()) - - tst = DataFrame({'date': dates}) - result = tst.duplicated() - self.assertTrue((-result).all()) - - -class TestSeriesDatetime64(tm.TestCase): - def setUp(self): - self.series = Series(date_range('1/1/2000', periods=10)) - - def test_auto_conversion(self): - series = Series(list(date_range('1/1/2000', periods=10))) - self.assertEqual(series.dtype, 'M8[ns]') - - def test_constructor_cant_cast_datetime64(self): - msg = "Cannot cast datetime64 to " - with tm.assertRaisesRegexp(TypeError, msg): - Series(date_range('1/1/2000', periods=10), dtype=float) - - with tm.assertRaisesRegexp(TypeError, msg): - Series(date_range('1/1/2000', periods=10), dtype=int) - - def test_constructor_cast_object(self): - s = Series(date_range('1/1/2000', periods=10), dtype=object) - exp = Series(date_range('1/1/2000', periods=10)) - tm.assert_series_equal(s, exp) - - def test_series_comparison_scalars(self): - val = datetime(2000, 1, 4) - result = self.series > val - expected = Series([x > val for x in self.series]) - self.assert_series_equal(result, expected) - - val = self.series[5] - result = self.series > val - expected = Series([x > val for x in self.series]) - self.assert_series_equal(result, expected) - - def test_between(self): - left, right = self.series[[2, 7]] - - result = self.series.between(left, right) - expected = (self.series >= left) & (self.series <= right) - assert_series_equal(result, expected) - - # --------------------------------------------------------------------- - # NaT support - - def test_NaT_scalar(self): - series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') - - val = series[3] - self.assertTrue(com.isnull(val)) - - 
series[2] = val - self.assertTrue(com.isnull(series[2])) - - def test_NaT_cast(self): - # GH10747 - result = Series([np.nan]).astype('M8[ns]') - expected = Series([NaT]) - assert_series_equal(result, expected) - - def test_set_none_nan(self): - self.series[3] = None - self.assertIs(self.series[3], NaT) - - self.series[3:5] = None - self.assertIs(self.series[4], NaT) - - self.series[5] = np.nan - self.assertIs(self.series[5], NaT) - - self.series[5:7] = np.nan - self.assertIs(self.series[6], NaT) - - def test_intercept_astype_object(self): - - # this test no longer makes sense as series is by default already - # M8[ns] - expected = self.series.astype('object') - - df = DataFrame({'a': self.series, - 'b': np.random.randn(len(self.series))}) - exp_dtypes = pd.Series([np.dtype('datetime64[ns]'), - np.dtype('float64')], index=['a', 'b']) - tm.assert_series_equal(df.dtypes, exp_dtypes) - - result = df.values.squeeze() - self.assertTrue((result[:, 0] == expected.values).all()) - - df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)}) - - result = df.values.squeeze() - self.assertTrue((result[:, 0] == expected.values).all()) - - def test_nat_operations(self): - # GH 8617 - s = Series([0, pd.NaT], dtype='m8[ns]') - exp = s[0] - self.assertEqual(s.median(), exp) - self.assertEqual(s.min(), exp) - self.assertEqual(s.max(), exp) - - def test_round_nat(self): - # GH14940 - s = Series([pd.NaT]) - expected = Series(pd.NaT) - for method in ["round", "floor", "ceil"]: - round_method = getattr(s.dt, method) - for freq in ["s", "5s", "min", "5min", "h", "5h"]: - assert_series_equal(round_method(freq), expected) - - -class TestDaysInMonth(tm.TestCase): - # tests for issue #10154 - def test_day_not_in_month_coerce(self): - self.assertTrue(isnull(to_datetime('2015-02-29', errors='coerce'))) - self.assertTrue(isnull(to_datetime('2015-02-29', format="%Y-%m-%d", - errors='coerce'))) - self.assertTrue(isnull(to_datetime('2015-02-32', format="%Y-%m-%d", - errors='coerce'))) - self.assertTrue(isnull(to_datetime('2015-04-31', format="%Y-%m-%d", - errors='coerce'))) - - def test_day_not_in_month_raise(self): - self.assertRaises(ValueError, to_datetime, '2015-02-29', - errors='raise') - self.assertRaises(ValueError, to_datetime, '2015-02-29', - errors='raise', format="%Y-%m-%d") - self.assertRaises(ValueError, to_datetime, '2015-02-32', - errors='raise', format="%Y-%m-%d") - self.assertRaises(ValueError, to_datetime, '2015-04-31', - errors='raise', format="%Y-%m-%d") - - def test_day_not_in_month_ignore(self): - self.assertEqual(to_datetime( - '2015-02-29', errors='ignore'), '2015-02-29') - self.assertEqual(to_datetime( - '2015-02-29', errors='ignore', format="%Y-%m-%d"), '2015-02-29') - self.assertEqual(to_datetime( - '2015-02-32', errors='ignore', format="%Y-%m-%d"), '2015-02-32') - self.assertEqual(to_datetime( - '2015-04-31', errors='ignore', format="%Y-%m-%d"), '2015-04-31') - - -class TestGuessDatetimeFormat(tm.TestCase): - - def test_guess_datetime_format_with_parseable_formats(self): - tm._skip_if_not_us_locale() - dt_string_to_format = (('20111230', '%Y%m%d'), - ('2011-12-30', '%Y-%m-%d'), - ('30-12-2011', '%d-%m-%Y'), - ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'), - ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'), - ('2011-12-30 00:00:00.000000', - '%Y-%m-%d %H:%M:%S.%f'), ) - - for dt_string, dt_format in dt_string_to_format: - self.assertEqual( - tools._guess_datetime_format(dt_string), - dt_format - ) - - def test_guess_datetime_format_with_dayfirst(self): - ambiguous_string = '01/01/2011' - 
self.assertEqual(
-            tools._guess_datetime_format(ambiguous_string, dayfirst=True),
-            '%d/%m/%Y'
-        )
-        self.assertEqual(
-            tools._guess_datetime_format(ambiguous_string, dayfirst=False),
-            '%m/%d/%Y'
-        )
-
-    def test_guess_datetime_format_with_locale_specific_formats(self):
-        # The month names will vary depending on the locale, in which
-        # case these won't be parsed properly (dateutil can't parse them)
-        _skip_if_has_locale()
-
-        dt_string_to_format = (('30/Dec/2011', '%d/%b/%Y'),
-                               ('30/December/2011', '%d/%B/%Y'),
-                               ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'), )
-
-        for dt_string, dt_format in dt_string_to_format:
-            self.assertEqual(
-                tools._guess_datetime_format(dt_string),
-                dt_format
-            )
-
-    def test_guess_datetime_format_invalid_inputs(self):
-        # A datetime string must include a year, month and a day for it
-        # to be guessable, in addition to being a string that looks like
-        # a datetime
-        invalid_dts = [
-            '2013',
-            '01/2013',
-            '12:00:00',
-            '1/1/1/1',
-            'this_is_not_a_datetime',
-            '51a',
-            9,
-            datetime(2011, 1, 1),
-        ]
-
-        for invalid_dt in invalid_dts:
-            self.assertTrue(tools._guess_datetime_format(invalid_dt) is None)
-
-    def test_guess_datetime_format_nopadding(self):
-        # GH 11142
-        dt_string_to_format = (('2011-1-1', '%Y-%m-%d'),
-                               ('30-1-2011', '%d-%m-%Y'),
-                               ('1/1/2011', '%m/%d/%Y'),
-                               ('2011-1-1 00:00:00', '%Y-%m-%d %H:%M:%S'),
-                               ('2011-1-1 0:0:0', '%Y-%m-%d %H:%M:%S'),
-                               ('2011-1-3T00:00:0', '%Y-%m-%dT%H:%M:%S'))
-
-        for dt_string, dt_format in dt_string_to_format:
-            self.assertEqual(
-                tools._guess_datetime_format(dt_string),
-                dt_format
-            )
-
-    def test_guess_datetime_format_for_array(self):
-        tm._skip_if_not_us_locale()
-        expected_format = '%Y-%m-%d %H:%M:%S.%f'
-        dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format)
-
-        test_arrays = [
-            np.array([dt_string, dt_string, dt_string], dtype='O'),
-            np.array([np.nan, np.nan, dt_string], dtype='O'),
-            np.array([dt_string, 'random_string'], dtype='O'),
-        ]
-
-        for test_array in test_arrays:
-            self.assertEqual(
-                tools._guess_datetime_format_for_array(test_array),
-                expected_format
-            )
-
-        format_for_string_of_nans = tools._guess_datetime_format_for_array(
-            np.array(
-                [np.nan, np.nan, np.nan], dtype='O'))
-        self.assertTrue(format_for_string_of_nans is None)
-
-
-class TestToDatetimeInferFormat(tm.TestCase):
-
-    def test_to_datetime_infer_datetime_format_consistent_format(self):
-        s = pd.Series(pd.date_range('20000101', periods=50, freq='H'))
-
-        test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f',
-                        '%Y-%m-%dT%H:%M:%S.%f']
-
-        for test_format in test_formats:
-            s_as_dt_strings = s.apply(lambda x: x.strftime(test_format))
-
-            with_format = pd.to_datetime(s_as_dt_strings, format=test_format)
-            no_infer = pd.to_datetime(s_as_dt_strings,
-                                      infer_datetime_format=False)
-            yes_infer = pd.to_datetime(s_as_dt_strings,
-                                       infer_datetime_format=True)
-
-            # Whether the format is explicitly passed, it is inferred, or
-            # it is not inferred, the results should all be the same
-            self.assert_series_equal(with_format, no_infer)
-            self.assert_series_equal(no_infer, yes_infer)
-
-    def test_to_datetime_infer_datetime_format_inconsistent_format(self):
-        s = pd.Series(np.array(['01/01/2011 00:00:00',
-                                '01-02-2011 00:00:00',
-                                '2011-01-03T00:00:00']))
-
-        # When the format is inconsistent, infer_datetime_format should just
-        # fall back to the default parsing
-        tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False),
-                               pd.to_datetime(s, infer_datetime_format=True))
-
-        s = pd.Series(np.array(['Jan/01/2011',
'Feb/01/2011', 'Mar/01/2011'])) - - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_infer_datetime_format_series_with_nans(self): - s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, - '01/03/2011 00:00:00', np.nan])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): - s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', - '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_iso8601_noleading_0s(self): - # GH 11871 - s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) - expected = pd.Series([pd.Timestamp('2014-01-01'), - pd.Timestamp('2014-02-02'), - pd.Timestamp('2015-03-03')]) - tm.assert_series_equal(pd.to_datetime(s), expected) - tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) - - -class TimeConversionFormats(tm.TestCase): - def test_to_datetime_format(self): - values = ['1/1/2000', '1/2/2000', '1/3/2000'] - - results1 = [Timestamp('20000101'), Timestamp('20000201'), - Timestamp('20000301')] - results2 = [Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')] - for vals, expecteds in [(values, (Index(results1), Index(results2))), - (Series(values), - (Series(results1), Series(results2))), - (values[0], (results1[0], results2[0])), - (values[1], (results1[1], results2[1])), - (values[2], (results1[2], results2[2]))]: - - for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): - result = to_datetime(vals, format=fmt) - expected = expecteds[i] - - if isinstance(expected, Series): - assert_series_equal(result, Series(expected)) - elif isinstance(expected, Timestamp): - self.assertEqual(result, expected) - else: - tm.assert_index_equal(result, expected) - - def test_to_datetime_format_YYYYMMDD(self): - s = Series([19801222, 19801222] + [19810105] * 5) - expected = Series([Timestamp(x) for x in s.apply(str)]) - - result = to_datetime(s, format='%Y%m%d') - assert_series_equal(result, expected) - - result = to_datetime(s.apply(str), format='%Y%m%d') - assert_series_equal(result, expected) - - # with NaT - expected = Series([Timestamp("19801222"), Timestamp("19801222")] + - [Timestamp("19810105")] * 5) - expected[2] = np.nan - s[2] = np.nan - - result = to_datetime(s, format='%Y%m%d') - assert_series_equal(result, expected) - - # string with NaT - s = s.apply(str) - s[2] = 'nat' - result = to_datetime(s, format='%Y%m%d') - assert_series_equal(result, expected) - - # coercion - # GH 7930 - s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') - expected = Series([datetime(2012, 12, 31), - datetime(2014, 12, 31), datetime(9999, 12, 31)], - dtype=object) - self.assert_series_equal(result, expected) - - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce') - expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') - assert_series_equal(result, expected) - - # GH 10178 - def test_to_datetime_format_integer(self): - s = Series([2000, 2001, 2002]) - expected = Series([Timestamp(x) for x in s.apply(str)]) - - result = to_datetime(s, format='%Y') - assert_series_equal(result, expected) - - s = Series([200001, 200105, 200206]) - expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) - ]) 
-
-        result = to_datetime(s, format='%Y%m')
-        assert_series_equal(result, expected)
-
-    def test_to_datetime_format_microsecond(self):
-
-        # these are locale dependent
-        lang, _ = locale.getlocale()
-        month_abbr = calendar.month_abbr[4]
-        val = '01-{}-2011 00:00:01.978'.format(month_abbr)
-
-        format = '%d-%b-%Y %H:%M:%S.%f'
-        result = to_datetime(val, format=format)
-        exp = datetime.strptime(val, format)
-        self.assertEqual(result, exp)
-
-    def test_to_datetime_format_time(self):
-        data = [
-            ['01/10/2010 15:20', '%m/%d/%Y %H:%M',
-             Timestamp('2010-01-10 15:20')],
-            ['01/10/2010 05:43', '%m/%d/%Y %I:%M',
-             Timestamp('2010-01-10 05:43')],
-            ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S',
-             Timestamp('2010-01-10 13:56:01')]  # ,
-            # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p',
-            #  Timestamp('2010-01-10 20:14')],
-            # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p',
-            #  Timestamp('2010-01-10 07:40')],
-            # ['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p',
-            #  Timestamp('2010-01-10 09:12:56')]
-        ]
-        for s, format, dt in data:
-            self.assertEqual(to_datetime(s, format=format), dt)
-
-    def test_to_datetime_with_non_exact(self):
-        # GH 10834
-        _skip_if_has_locale()
-
-        # 8904
-        # exact kw
-        if sys.version_info < (2, 7):
-            raise nose.SkipTest('on python version < 2.7')
-
-        s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00',
-                    '19MAY11 00:00:00Z'])
-        result = to_datetime(s, format='%d%b%y', exact=False)
-        expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False),
-                               format='%d%b%y')
-        assert_series_equal(result, expected)
-
-    def test_parse_nanoseconds_with_formula(self):
-
-        # GH8989
-        # truncating the nanoseconds when a format was provided
-        for v in ["2012-01-01 09:00:00.000000001",
-                  "2012-01-01 09:00:00.000001",
-                  "2012-01-01 09:00:00.001",
-                  "2012-01-01 09:00:00.001000",
-                  "2012-01-01 09:00:00.001000000", ]:
-            expected = pd.to_datetime(v)
-            result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
-            self.assertEqual(result, expected)
-
-    def test_to_datetime_format_weeks(self):
-        data = [
-            ['2009324', '%Y%W%w', Timestamp('2009-08-13')],
-            ['2013020', '%Y%U%w', Timestamp('2013-01-13')]
-        ]
-        for s, format, dt in data:
-            self.assertEqual(to_datetime(s, format=format), dt)
-
-
-class TestSlicing(tm.TestCase):
-    def test_slice_year(self):
-        dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500)
-
-        s = Series(np.arange(len(dti)), index=dti)
-        result = s['2005']
-        expected = s[s.index.year == 2005]
-        assert_series_equal(result, expected)
-
-        df = DataFrame(np.random.rand(len(dti), 5), index=dti)
-        result = df.loc['2005']
-        expected = df[df.index.year == 2005]
-        assert_frame_equal(result, expected)
-
-        rng = date_range('1/1/2000', '1/1/2010')
-
-        result = rng.get_loc('2009')
-        expected = slice(3288, 3653)
-        self.assertEqual(result, expected)
-
-    def test_slice_quarter(self):
-        dti = DatetimeIndex(freq='D', start=datetime(2000, 6, 1), periods=500)
-
-        s = Series(np.arange(len(dti)), index=dti)
-        self.assertEqual(len(s['2001Q1']), 90)
-
-        df = DataFrame(np.random.rand(len(dti), 5), index=dti)
-        self.assertEqual(len(df.loc['1Q01']), 90)
-
-    def test_slice_month(self):
-        dti = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500)
-        s = Series(np.arange(len(dti)), index=dti)
-        self.assertEqual(len(s['2005-11']), 30)
-
-        df = DataFrame(np.random.rand(len(dti), 5), index=dti)
-        self.assertEqual(len(df.loc['2005-11']), 30)
-
-        assert_series_equal(s['2005-11'], s['11-2005'])
-
-    def test_partial_slice(self):
-        rng = DatetimeIndex(freq='D', start=datetime(2005, 1, 1),
periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-05':'2006-02'] - expected = s['20050501':'20060228'] - assert_series_equal(result, expected) - - result = s['2005-05':] - expected = s['20050501':] - assert_series_equal(result, expected) - - result = s[:'2006-02'] - expected = s[:'20060228'] - assert_series_equal(result, expected) - - result = s['2005-1-1'] - self.assertEqual(result, s.iloc[0]) - - self.assertRaises(Exception, s.__getitem__, '2004-12-31') - - def test_partial_slice_daily(self): - rng = DatetimeIndex(freq='H', start=datetime(2005, 1, 31), periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-1-31'] - assert_series_equal(result, s.iloc[:24]) - - self.assertRaises(Exception, s.__getitem__, '2004-12-31 00') - - def test_partial_slice_hourly(self): - rng = DatetimeIndex(freq='T', start=datetime(2005, 1, 1, 20, 0, 0), - periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-1-1'] - assert_series_equal(result, s.iloc[:60 * 4]) - - result = s['2005-1-1 20'] - assert_series_equal(result, s.iloc[:60]) - - self.assertEqual(s['2005-1-1 20:00'], s.iloc[0]) - self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:15') - - def test_partial_slice_minutely(self): - rng = DatetimeIndex(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), - periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['2005-1-1 23:59'] - assert_series_equal(result, s.iloc[:60]) - - result = s['2005-1-1'] - assert_series_equal(result, s.iloc[:60]) - - self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.iloc[0]) - self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') - - def test_partial_slice_second_precision(self): - rng = DatetimeIndex(start=datetime(2005, 1, 1, 0, 0, 59, - microsecond=999990), - periods=20, freq='US') - s = Series(np.arange(20), rng) - - assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10]) - assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) - - assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) - assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) - - self.assertEqual(s[Timestamp('2005-1-1 00:00:59.999990')], s.iloc[0]) - self.assertRaisesRegexp(KeyError, '2005-1-1 00:00:00', - lambda: s['2005-1-1 00:00:00']) - - def test_partial_slicing_dataframe(self): - # GH14856 - # Test various combinations of string slicing resolution vs. 
- # index resolution - # - If string resolution is less precise than index resolution, - # string is considered a slice - # - If string resolution is equal to or more precise than index - # resolution, string is considered an exact match - formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', - '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] - resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] - for rnum, resolution in enumerate(resolutions[2:], 2): - # we check only 'day', 'hour', 'minute' and 'second' - unit = Timedelta("1 " + resolution) - middate = datetime(2012, 1, 1, 0, 0, 0) - index = DatetimeIndex([middate - unit, - middate, middate + unit]) - values = [1, 2, 3] - df = DataFrame({'a': values}, index, dtype=np.int64) - self.assertEqual(df.index.resolution, resolution) - - # Timestamp with the same resolution as index - # Should be exact match for Series (return scalar) - # and raise KeyError for Frame - for timestamp, expected in zip(index, values): - ts_string = timestamp.strftime(formats[rnum]) - # make ts_string as precise as index - result = df['a'][ts_string] - self.assertIsInstance(result, np.int64) - self.assertEqual(result, expected) - self.assertRaises(KeyError, df.__getitem__, ts_string) - - # Timestamp with resolution less precise than index - for fmt in formats[:rnum]: - for element, theslice in [[0, slice(None, 1)], - [1, slice(1, None)]]: - ts_string = index[element].strftime(fmt) - - # Series should return slice - result = df['a'][ts_string] - expected = df['a'][theslice] - assert_series_equal(result, expected) - - # Frame should return slice as well - result = df[ts_string] - expected = df[theslice] - assert_frame_equal(result, expected) - - # Timestamp with resolution more precise than index - # Compatible with existing key - # Should return scalar for Series - # and raise KeyError for Frame - for fmt in formats[rnum + 1:]: - ts_string = index[1].strftime(fmt) - result = df['a'][ts_string] - self.assertIsInstance(result, np.int64) - self.assertEqual(result, 2) - self.assertRaises(KeyError, df.__getitem__, ts_string) - - # Not compatible with existing key - # Should raise KeyError - for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: - ts = index[1] + Timedelta("1 " + res) - ts_string = ts.strftime(fmt) - self.assertRaises(KeyError, df['a'].__getitem__, ts_string) - self.assertRaises(KeyError, df.__getitem__, ts_string) - - def test_partial_slicing_with_multiindex(self): - - # GH 4758 - # partial string indexing with a multi-index buggy - df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], - 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"], - 'val': [1, 2, 3, 4]}, - index=date_range("2013-06-19 09:30:00", - periods=4, freq='5T')) - df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) - - expected = DataFrame([ - [1] - ], index=Index(['ABC'], name='TICKER'), columns=['val']) - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] - assert_frame_equal(result, expected) - - expected = df_multi.loc[ - (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] - assert_series_equal(result, expected) - - # this is a KeyError as we don't do partial string selection on - # multi-levels - def f(): - df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] - - self.assertRaises(KeyError, f) - - # GH 4294 - # partial slice on a series mi - s = pd.DataFrame(randn(1000, 1000), index=pd.date_range( - '2000-1-1', periods=1000)).stack() - - s2 = s[:-1].copy() - expected = s2['2000-1-4'] - result = 
s2[pd.Timestamp('2000-1-4')] - assert_series_equal(result, expected) - - result = s[pd.Timestamp('2000-1-4')] - expected = s['2000-1-4'] - assert_series_equal(result, expected) - - df2 = pd.DataFrame(s) - expected = df2.xs('2000-1-4') - result = df2.loc[pd.Timestamp('2000-1-4')] - assert_frame_equal(result, expected) - - def test_shift(self): - ts = Series(np.random.randn(5), - index=date_range('1/1/2000', periods=5, freq='H')) - - result = ts.shift(1, freq='5T') - exp_index = ts.index.shift(1, freq='5T') - tm.assert_index_equal(result.index, exp_index) - - # GH #1063, multiple of same base - result = ts.shift(1, freq='4H') - exp_index = ts.index + offsets.Hour(4) - tm.assert_index_equal(result.index, exp_index) - - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) - self.assertRaises(ValueError, idx.shift, 1) - def test_setops_preserve_freq(self): for tz in [None, 'Asia/Tokyo', 'US/Eastern']: rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz) @@ -2453,602 +928,8 @@ def test_get_level_values_box(self): self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp)) - def test_frame_apply_dont_convert_datetime64(self): - from pandas.tseries.offsets import BDay - df = DataFrame({'x1': [datetime(1996, 1, 1)]}) - - df = df.applymap(lambda x: x + BDay()) - df = df.applymap(lambda x: x + BDay()) - - self.assertTrue(df.x1.dtype == 'M8[ns]') - - def test_partial_slice_doesnt_require_monotonicity(self): - # For historical reasons. - s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10)) - - nonmonotonic = s[[3, 5, 4]] - expected = nonmonotonic.iloc[:0] - timestamp = pd.Timestamp('2014-01-10') - - assert_series_equal(nonmonotonic['2014-01-10':], expected) - self.assertRaisesRegexp(KeyError, - r"Timestamp\('2014-01-10 00:00:00'\)", - lambda: nonmonotonic[timestamp:]) - - assert_series_equal(nonmonotonic.loc['2014-01-10':], expected) - self.assertRaisesRegexp(KeyError, - r"Timestamp\('2014-01-10 00:00:00'\)", - lambda: nonmonotonic.loc[timestamp:]) - - -class TestToDatetime(tm.TestCase): - _multiprocess_can_split_ = True - - def test_to_datetime_dt64s(self): - in_bound_dts = [ - np.datetime64('2000-01-01'), - np.datetime64('2000-01-02'), - ] - - for dt in in_bound_dts: - self.assertEqual(pd.to_datetime(dt), Timestamp(dt)) - - oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ] - - for dt in oob_dts: - self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise') - self.assertRaises(ValueError, tslib.Timestamp, dt) - self.assertIs(pd.to_datetime(dt, errors='coerce'), NaT) - - def test_to_datetime_array_of_dt64s(self): - dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] - - # Assuming all datetimes are in bounds, to_datetime() returns - # an array that is equal to Timestamp() parsing - self.assert_numpy_array_equal( - pd.to_datetime(dts, box=False), - np.array([Timestamp(x).asm8 for x in dts]) - ) - - # A list of datetimes where the last one is out of bounds - dts_with_oob = dts + [np.datetime64('9999-01-01')] - - self.assertRaises(ValueError, pd.to_datetime, dts_with_oob, - errors='raise') - - self.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='coerce'), - np.array( - [ - Timestamp(dts_with_oob[0]).asm8, - Timestamp(dts_with_oob[1]).asm8, - iNaT, - ], - dtype='M8' - ) - ) - - # With errors='ignore', out of bounds datetime64s - # are converted to their .item(), which depending on the version of - # numpy is either a python datetime.datetime or datetime.date - self.assert_numpy_array_equal( - 
pd.to_datetime(dts_with_oob, box=False, errors='ignore'),
-            np.array(
-                [dt.item() for dt in dts_with_oob],
-                dtype='O'
-            )
-        )
-
-    def test_to_datetime_tz(self):
-
-        # xref 8260
-        # uniform returns a DatetimeIndex
-        arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
-               pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]
-        result = pd.to_datetime(arr)
-        expected = DatetimeIndex(
-            ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific')
-        tm.assert_index_equal(result, expected)
-
-        # mixed tzs will raise
-        arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'),
-               pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')]
-        self.assertRaises(ValueError, lambda: pd.to_datetime(arr))
-
-    def test_to_datetime_tz_pytz(self):
-
-        # xref 8260
-        tm._skip_if_no_pytz()
-        import pytz
-
-        us_eastern = pytz.timezone('US/Eastern')
-        arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1,
-                                                     hour=3, minute=0)),
-                        us_eastern.localize(datetime(year=2000, month=6, day=1,
-                                                     hour=3, minute=0))],
-                       dtype=object)
-        result = pd.to_datetime(arr, utc=True)
-        expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
-                                  '2000-06-01 07:00:00+00:00'],
-                                 dtype='datetime64[ns, UTC]', freq=None)
-        tm.assert_index_equal(result, expected)
-
-    def test_to_datetime_utc_is_true(self):
-        # See gh-11934
-        start = pd.Timestamp('2014-01-01', tz='utc')
-        end = pd.Timestamp('2014-01-03', tz='utc')
-        date_range = pd.bdate_range(start, end)
-
-        result = pd.to_datetime(date_range, utc=True)
-        expected = pd.DatetimeIndex(data=date_range)
-        tm.assert_index_equal(result, expected)
-
-    def test_to_datetime_tz_psycopg2(self):
-
-        # xref 8260
-        try:
-            import psycopg2
-        except ImportError:
-            raise nose.SkipTest("no psycopg2 installed")
-
-        # misc cases
-        tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)
-        tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None)
-        arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1),
-                        datetime(2000, 6, 1, 3, 0, tzinfo=tz2)],
-                       dtype=object)
-
-        result = pd.to_datetime(arr, errors='coerce', utc=True)
-        expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
-                                  '2000-06-01 07:00:00+00:00'],
-                                 dtype='datetime64[ns, UTC]', freq=None)
-        tm.assert_index_equal(result, expected)
-
-        # dtype coercion
-        i = pd.DatetimeIndex([
-            '2000-01-01 08:00:00+00:00'
-        ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
-        self.assertTrue(is_datetime64_ns_dtype(i))
-
-        # tz coercion
-        result = pd.to_datetime(i, errors='coerce')
-        tm.assert_index_equal(result, i)
-
-        result = pd.to_datetime(i, errors='coerce', utc=True)
-        expected = pd.DatetimeIndex(['2000-01-01 13:00:00'],
-                                    dtype='datetime64[ns, UTC]')
-        tm.assert_index_equal(result, expected)
-
-    def test_datetime_bool(self):
-        # GH13176
-        with self.assertRaises(TypeError):
-            to_datetime(False)
-        self.assertTrue(to_datetime(False, errors="coerce") is tslib.NaT)
-        self.assertEqual(to_datetime(False, errors="ignore"), False)
-        with self.assertRaises(TypeError):
-            to_datetime(True)
-        self.assertTrue(to_datetime(True, errors="coerce") is tslib.NaT)
-        self.assertEqual(to_datetime(True, errors="ignore"), True)
-        with self.assertRaises(TypeError):
-            to_datetime([False, datetime.today()])
-        with self.assertRaises(TypeError):
-            to_datetime(['20130101', True])
-        tm.assert_index_equal(to_datetime([0, False, tslib.NaT, 0.0],
-                                          errors="coerce"),
-                              DatetimeIndex([to_datetime(0), tslib.NaT,
-                                             tslib.NaT, to_datetime(0)]))
-
-    def test_datetime_invalid_datatype(self):
-        # GH13176
-
-        with self.assertRaises(TypeError):
-            pd.to_datetime(bool)
-        with self.assertRaises(TypeError):
-            pd.to_datetime(pd.to_datetime)
-
-    def test_unit(self):
-        # GH 11758
-        # test proper behavior with errors
-
-        with self.assertRaises(ValueError):
-            to_datetime([1], unit='D', format='%Y%m%d')
-
-        values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
-                  'NaT', '']
-        result = to_datetime(values, unit='D', errors='ignore')
-        expected = Index([11111111, Timestamp('1970-01-02'),
-                          Timestamp('1970-01-02'), pd.NaT,
-                          pd.NaT, pd.NaT, pd.NaT, pd.NaT],
-                         dtype=object)
-        tm.assert_index_equal(result, expected)
-
-        result = to_datetime(values, unit='D', errors='coerce')
-        expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
-                                  'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
-        tm.assert_index_equal(result, expected)
-
-        with self.assertRaises(tslib.OutOfBoundsDatetime):
-            to_datetime(values, unit='D', errors='raise')
-
-        values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']
-
-        result = to_datetime(values, errors='ignore', unit='s')
-        expected = Index([1420043460000, pd.NaT, pd.NaT,
-                          pd.NaT, pd.NaT], dtype=object)
-        tm.assert_index_equal(result, expected)
-
-        result = to_datetime(values, errors='coerce', unit='s')
-        expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
-        tm.assert_index_equal(result, expected)
-
-        with self.assertRaises(tslib.OutOfBoundsDatetime):
-            to_datetime(values, errors='raise', unit='s')
-
-        # if we have a string, then we raise a ValueError
-        # and NOT an OutOfBoundsDatetime
-        for val in ['foo', Timestamp('20130101')]:
-            try:
-                to_datetime(val, errors='raise', unit='s')
-            except tslib.OutOfBoundsDatetime:
-                raise AssertionError("incorrect exception raised")
-            except ValueError:
-                pass
-
-    def test_unit_consistency(self):
-
-        # consistency of conversions
-        expected = Timestamp('1970-05-09 14:25:11')
-        result = pd.to_datetime(11111111, unit='s', errors='raise')
-        self.assertEqual(result, expected)
-        self.assertIsInstance(result, Timestamp)
-
-        result = pd.to_datetime(11111111, unit='s', errors='coerce')
-        self.assertEqual(result, expected)
-        self.assertIsInstance(result, Timestamp)
-
-        result = pd.to_datetime(11111111, unit='s', errors='ignore')
-        self.assertEqual(result, expected)
-        self.assertIsInstance(result, Timestamp)
-
-    def test_unit_with_numeric(self):
-
-        # GH 13180
-        # coercions from floats/ints are ok
-        expected = DatetimeIndex(['2015-06-19 05:33:20',
-                                  '2015-05-27 22:33:20'])
-        arr1 = [1.434692e+18, 1.432766e+18]
-        arr2 = np.array(arr1).astype('int64')
-        for errors in ['ignore', 'raise', 'coerce']:
-            result = pd.to_datetime(arr1, errors=errors)
-            tm.assert_index_equal(result, expected)
-
-            result = pd.to_datetime(arr2, errors=errors)
-            tm.assert_index_equal(result, expected)
-
-        # but we want to make sure that we are coercing
-        # if we have ints/strings
-        expected = DatetimeIndex(['NaT',
-                                  '2015-06-19 05:33:20',
-                                  '2015-05-27 22:33:20'])
-        arr = ['foo', 1.434692e+18, 1.432766e+18]
-        result = pd.to_datetime(arr, errors='coerce')
-        tm.assert_index_equal(result, expected)
-
-        expected = DatetimeIndex(['2015-06-19 05:33:20',
-                                  '2015-05-27 22:33:20',
-                                  'NaT',
-                                  'NaT'])
-        arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT']
-        result = pd.to_datetime(arr, errors='coerce')
-        tm.assert_index_equal(result, expected)
-
-    def test_unit_mixed(self):
-
-        # mixed integers/datetimes
-        expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT'])
-        arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18]
-        result = pd.to_datetime(arr, errors='coerce')
-        tm.assert_index_equal(result, expected)
-
-        with
self.assertRaises(ValueError): - pd.to_datetime(arr, errors='raise') - - expected = DatetimeIndex(['NaT', - 'NaT', - '2013-01-01']) - arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce') - tm.assert_index_equal(result, expected) - - with self.assertRaises(ValueError): - pd.to_datetime(arr, errors='raise') - - def test_dataframe(self): - - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [6, 7], - 'minute': [58, 59], - 'second': [10, 11], - 'ms': [1, 1], - 'us': [2, 2], - 'ns': [3, 3]}) - - result = to_datetime({'year': df['year'], - 'month': df['month'], - 'day': df['day']}) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:0:00')]) - assert_series_equal(result, expected) - - # dict-like - result = to_datetime(df[['year', 'month', 'day']].to_dict()) - assert_series_equal(result, expected) - - # dict but with constructable - df2 = df[['year', 'month', 'day']].to_dict() - df2['month'] = 2 - result = to_datetime(df2) - expected2 = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160205 00:0:00')]) - assert_series_equal(result, expected2) - - # unit mappings - units = [{'year': 'years', - 'month': 'months', - 'day': 'days', - 'hour': 'hours', - 'minute': 'minutes', - 'second': 'seconds'}, - {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second'}, - ] - - for d in units: - result = to_datetime(df[list(d.keys())].rename(columns=d)) - expected = Series([Timestamp('20150204 06:58:10'), - Timestamp('20160305 07:59:11')]) - assert_series_equal(result, expected) - - d = {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second', - 'ms': 'ms', - 'us': 'us', - 'ns': 'ns'} - - result = to_datetime(df.rename(columns=d)) - expected = Series([Timestamp('20150204 06:58:10.001002003'), - Timestamp('20160305 07:59:11.001002003')]) - assert_series_equal(result, expected) - - # coerce back to int - result = to_datetime(df.astype(str)) - assert_series_equal(result, expected) - - # passing coerce - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) - with self.assertRaises(ValueError): - to_datetime(df2) - result = to_datetime(df2, errors='coerce') - expected = Series([Timestamp('20150204 00:00:00'), - pd.NaT]) - assert_series_equal(result, expected) - - # extra columns - with self.assertRaises(ValueError): - df2 = df.copy() - df2['foo'] = 1 - to_datetime(df2) - - # not enough - for c in [['year'], - ['year', 'month'], - ['year', 'month', 'second'], - ['month', 'day'], - ['year', 'day', 'second']]: - with self.assertRaises(ValueError): - to_datetime(df[c]) - - # duplicates - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) - df2.columns = ['year', 'year', 'day'] - with self.assertRaises(ValueError): - to_datetime(df2) - - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5], - 'hour': [4, 5]}) - df2.columns = ['year', 'month', 'day', 'day'] - with self.assertRaises(ValueError): - to_datetime(df2) - - def test_dataframe_dtypes(self): - # #13451 - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) - - # int16 - result = to_datetime(df.astype('int16')) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) - assert_series_equal(result, expected) - - # mixed dtypes - df['month'] = df['month'].astype('int8') - df['day'] = df['day'].astype('int8') - 
result = to_datetime(df) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) - assert_series_equal(result, expected) - - # float - df = DataFrame({'year': [2000, 2001], - 'month': [1.5, 1], - 'day': [1, 1]}) - with self.assertRaises(ValueError): - to_datetime(df) - - def test_index_to_datetime(self): - idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = idx.to_datetime() - expected = DatetimeIndex(pd.to_datetime(idx.values)) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - today = datetime.today() - idx = Index([today], dtype=object) - result = idx.to_datetime() - expected = DatetimeIndex([today]) - tm.assert_index_equal(result, expected) - - def test_to_datetime_iso8601(self): - result = to_datetime(["2012-01-01 00:00:00"]) - exp = Timestamp("2012-01-01 00:00:00") - self.assertEqual(result[0], exp) - - result = to_datetime(['20121001']) # bad iso 8601 - exp = Timestamp('2012-10-01') - self.assertEqual(result[0], exp) - - def test_to_datetime_default(self): - rs = to_datetime('2001') - xp = datetime(2001, 1, 1) - self.assertTrue(rs, xp) - - # dayfirst is essentially broken - - # to_datetime('01-13-2012', dayfirst=True) - # self.assertRaises(ValueError, to_datetime('01-13-2012', - # dayfirst=True)) - - def test_to_datetime_on_datetime64_series(self): - # #2699 - s = Series(date_range('1/1/2000', periods=10)) - - result = to_datetime(s) - self.assertEqual(result[0], s[0]) - - def test_to_datetime_with_space_in_series(self): - # GH 6428 - s = Series(['10/18/2006', '10/18/2008', ' ']) - tm.assertRaises(ValueError, lambda: to_datetime(s, errors='raise')) - result_coerce = to_datetime(s, errors='coerce') - expected_coerce = Series([datetime(2006, 10, 18), - datetime(2008, 10, 18), - pd.NaT]) - tm.assert_series_equal(result_coerce, expected_coerce) - result_ignore = to_datetime(s, errors='ignore') - tm.assert_series_equal(result_ignore, s) - - def test_to_datetime_with_apply(self): - # this is only locale tested with US/None locales - _skip_if_has_locale() - - # GH 5195 - # with a format and coerce a single item to_datetime fails - td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) - expected = pd.to_datetime(td, format='%b %y') - result = td.apply(pd.to_datetime, format='%b %y') - assert_series_equal(result, expected) - - td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3]) - self.assertRaises(ValueError, - lambda: pd.to_datetime(td, format='%b %y', - errors='raise')) - self.assertRaises(ValueError, - lambda: td.apply(pd.to_datetime, format='%b %y', - errors='raise')) - expected = pd.to_datetime(td, format='%b %y', errors='coerce') - - result = td.apply( - lambda x: pd.to_datetime(x, format='%b %y', errors='coerce')) - assert_series_equal(result, expected) - - def test_to_datetime_types(self): - - # empty string - result = to_datetime('') - self.assertIs(result, NaT) - - result = to_datetime(['', '']) - self.assertTrue(isnull(result).all()) - - # ints - result = Timestamp(0) - expected = to_datetime(0) - self.assertEqual(result, expected) - - # GH 3888 (strings) - expected = to_datetime(['2012'])[0] - result = to_datetime('2012') - self.assertEqual(result, expected) - - # array = ['2012','20120101','20120101 12:01:01'] - array = ['20120101', '20120101 12:01:01'] - expected = list(to_datetime(array)) - result = lmap(Timestamp, array) - tm.assert_almost_equal(result, expected) - - # 
currently fails ### - # result = Timestamp('2012') - # expected = to_datetime('2012') - # self.assertEqual(result, expected) - - def test_to_datetime_unprocessable_input(self): - # GH 4928 - self.assert_numpy_array_equal( - to_datetime([1, '1'], errors='ignore'), - np.array([1, '1'], dtype='O') - ) - self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise') - - def test_to_datetime_other_datetime64_units(self): - # 5/25/2012 - scalar = np.int64(1337904000000000).view('M8[us]') - as_obj = scalar.astype('O') - - index = DatetimeIndex([scalar]) - self.assertEqual(index[0], scalar.astype('O')) - - value = Timestamp(scalar) - self.assertEqual(value, as_obj) - - def test_to_datetime_list_of_integers(self): - rng = date_range('1/1/2000', periods=20) - rng = DatetimeIndex(rng.values) - - ints = list(rng.asi8) - - result = DatetimeIndex(ints) - - tm.assert_index_equal(rng, result) - - def test_to_datetime_freq(self): - xp = bdate_range('2000-1-1', periods=10, tz='UTC') - rs = xp.to_datetime() - self.assertEqual(xp.freq, rs.freq) - self.assertEqual(xp.tzinfo, rs.tzinfo) - if __name__ == '__main__': + import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 1aef9bcf9c9e980fff6031672d00018ac8f0fa21 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Feb 2017 13:28:49 -0500 Subject: [PATCH 100/338] ENH: .isnull and .notnull have been added as methods to Index to make this more consistent with the Series API Author: Jeff Reback Closes #15300 from jreback/null and squashes the following commits: 8c35656 [Jeff Reback] DOC: move Index.where to shared_docs e4502bf [Jeff Reback] ENH: .isnull and .notnull have been added as methods to Index to make this more consistent with the Series API --- doc/source/api.rst | 8 +++++++ doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/indexes/base.py | 38 +++++++++++++++++++++++++++++++-- pandas/indexes/category.py | 13 +---------- pandas/tests/indexes/common.py | 27 ++++++++++++++++++++++- pandas/tseries/base.py | 13 +---------- 6 files changed, 73 insertions(+), 28 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 92f290b5ee0a9..6c4a3cff5b4cf 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1356,8 +1356,16 @@ Modifying and Computations Index.unique Index.nunique Index.value_counts + +Missing Values +~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + Index.fillna Index.dropna + Index.isnull + Index.notnull Conversion ~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d76a78c68fb73..c6d757c6884d0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -125,7 +125,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. 
'0.5min' is parsed as '30s') (:issue:`8419`) - +- ``.isnull()`` and ``.notnull()`` have been added to ``Index`` object to make them more consistent with the ``Series`` API (:issue:`15300`) - ``pd.read_gbq`` method now allows query configuration preferences (:issue:`14742`) - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index bc2dce4e97e5b..dcd565ee5f0e9 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -564,8 +564,7 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) - def where(self, cond, other=None): - """ + _index_shared_docs['where'] = """ .. versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding @@ -577,6 +576,9 @@ def where(self, cond, other=None): cond : boolean same length as self other : scalar, or array-like """ + + @Appender(_index_shared_docs['where']) + def where(self, cond, other=None): if other is None: other = self._na_value values = np.where(cond, self.values, other) @@ -1662,6 +1664,38 @@ def hasnans(self): else: return False + def isnull(self): + """ + Detect missing values + + .. versionadded:: 0.20.0 + + Returns + ------- + a boolean array of whether my values are null + + See also + -------- + pandas.isnull : pandas version + """ + return self._isnan + + def notnull(self): + """ + Reverse of isnull + + .. versionadded:: 0.20.0 + + Returns + ------- + a boolean array of whether my values are not null + + See also + -------- + pandas.notnull : pandas version + """ + return ~self.isnull() + def putmask(self, mask, value): """ return a new Index of the values set with the mask diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index e3ffa40f5f94a..e2e0fd056b111 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -332,19 +332,8 @@ def _can_reindex(self, indexer): """ always allow reindexing """ pass + @Appender(_index_shared_docs['where']) def where(self, cond, other=None): - """ - .. versionadded:: 0.19.0 - - Return an Index of same shape as self and whose corresponding - entries are from self where cond is True and otherwise are from - other. 
-
-        Parameters
-        ----------
-        cond : boolean same length as self
-        other : scalar, or array-like
-        """
         if other is None:
             other = self._na_value
         values = np.where(cond, self.values, other)
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 5a482acf403cd..81ad0524807f3 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -7,7 +7,7 @@
 from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index,
                     RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex,
-                    TimedeltaIndex, PeriodIndex, notnull)
+                    TimedeltaIndex, PeriodIndex, notnull, isnull)
 from pandas.types.common import needs_i8_conversion
 from pandas.util.testing import assertRaisesRegexp
@@ -879,3 +879,28 @@ def test_fillna(self):
             expected[1] = True
             self.assert_numpy_array_equal(idx._isnan, expected)
             self.assertTrue(idx.hasnans)
+
+    def test_nulls(self):
+        # this is really a smoke test for the methods
+        # as these are adequately tested for functionality elsewhere
+
+        for name, index in self.indices.items():
+            if len(index) == 0:
+                self.assert_numpy_array_equal(
+                    index.isnull(), np.array([], dtype=bool))
+            elif isinstance(index, MultiIndex):
+                idx = index.copy()
+                msg = "isnull is not defined for MultiIndex"
+                with self.assertRaisesRegexp(NotImplementedError, msg):
+                    idx.isnull()
+            else:
+
+                if not index.hasnans:
+                    self.assert_numpy_array_equal(
+                        index.isnull(), np.zeros(len(index), dtype=bool))
+                    self.assert_numpy_array_equal(
+                        index.notnull(), np.ones(len(index), dtype=bool))
+                else:
+                    result = isnull(index)
+                    self.assert_numpy_array_equal(index.isnull(), result)
+                    self.assert_numpy_array_equal(index.notnull(), ~result)
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index a8dd2238c2063..ee9234d6c8237 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -786,19 +786,8 @@ def repeat(self, repeats, *args, **kwargs):
         return self._shallow_copy(self.asi8.repeat(repeats),
                                   freq=freq)
 
+    @Appender(_index_shared_docs['where'])
     def where(self, cond, other=None):
-        """
-        .. versionadded:: 0.19.0
-
-        Return an Index of same shape as self and whose corresponding
-        entries are from self where cond is True and otherwise are from
-        other.
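For orientation, a minimal usage sketch of the new ``Index.isnull`` / ``Index.notnull`` methods exercised by the test above (illustrative only, not part of the patch; the exact boolean-ndarray repr may differ by numpy version):

    >>> import numpy as np
    >>> import pandas as pd
    >>> idx = pd.Index([1.0, np.nan, 3.0])
    >>> idx.isnull()
    array([False,  True, False], dtype=bool)
    >>> idx.notnull()
    array([ True, False,  True], dtype=bool)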
- - Parameters - ---------- - cond : boolean same length as self - other : scalar, or array-like - """ other = _ensure_datetimelike_to_i8(other) values = _ensure_datetimelike_to_i8(self) result = np.where(cond, values, other).astype('i8') From 76becb7565602ddf677b8df88edfffb116d3ee5e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Mon, 6 Feb 2017 10:23:26 -0500 Subject: [PATCH 101/338] BUG: Fix downcast argument for DataFrame.fillna() closes #15277 Author: Albert Villanova del Moral Closes #15278 from albertvillanova/fix-15277 and squashes the following commits: 1b594a9 [Albert Villanova del Moral] Fix tab indentation 631a2dc [Albert Villanova del Moral] Add whatsnew note d691954 [Albert Villanova del Moral] BUG: Fix downcast argument for DataFrame.fillna() --- doc/source/whatsnew/v0.20.0.txt | 3 +-- pandas/core/generic.py | 2 +- pandas/tests/frame/test_missing.py | 14 ++++++++++++++ pandas/tests/series/test_missing.py | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c6d757c6884d0..16caef57673f7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -496,6 +496,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) - Incorrect dtyped ``Series`` was returned by comparison methods (e.g., ``lt``, ``gt``, ...) against a constant for an empty ``DataFrame`` (:issue:`15077`) - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) +- Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) - Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) @@ -509,7 +510,5 @@ Bug Fixes - - - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8074b167ff176..bb2664a5b8d28 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3347,7 +3347,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, if k not in result: continue obj = result[k] - obj.fillna(v, limit=limit, inplace=True) + obj.fillna(v, limit=limit, inplace=True, downcast=downcast) return result elif not is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index a8c9c72956463..eabdb79295c27 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -252,6 +252,20 @@ def test_fillna(self): result = df.fillna(value={'Date': df['Date2']}) assert_frame_equal(result, expected) + def test_fillna_downcast(self): + # GH 15277 + # infer int64 from float64 + df = pd.DataFrame({'a': [1., np.nan]}) + result = df.fillna(0, downcast='infer') + expected = pd.DataFrame({'a': [1, 0]}) + assert_frame_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + df = pd.DataFrame({'a': [1., np.nan]}) + result = df.fillna({'a': 0}, downcast='infer') + expected = pd.DataFrame({'a': [1, 0]}) + assert_frame_equal(result, expected) + def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df 
= DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 8cf0d190a95cc..8c877ade6fe98 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -273,6 +273,20 @@ def test_datetime64tz_fillna_round_issue(self): assert_series_equal(filled, expected) + def test_fillna_downcast(self): + # GH 15277 + # infer int64 from float64 + s = pd.Series([1., np.nan]) + result = s.fillna(0, downcast='infer') + expected = pd.Series([1, 0]) + assert_series_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + s = pd.Series([1., np.nan]) + result = s.fillna({1: 0}, downcast='infer') + expected = pd.Series([1, 0]) + assert_series_equal(result, expected) + def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) s.fillna(method='ffill', inplace=True) From 4a5d851c7bbd049931faf50a25addba57a782375 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Feb 2017 16:55:38 +0100 Subject: [PATCH 102/338] DOC/CI: ensure correct pandas version (GH15311) (#15317) --- ci/build_docs.sh | 2 +- ci/requirements-3.5_DOC_BUILD.sh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4dc9a203f1978..5dc649a91c4f7 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -22,8 +22,8 @@ if [ x"$DOC_BUILD" != x"" ]; then echo "Will build docs" source activate pandas - conda install -n pandas -c r r rpy2 --yes + # install sudo deps time sudo apt-get $APT_ARGS install dvipng texlive-latex-base texlive-latex-extra mv "$TRAVIS_BUILD_DIR"/doc /tmp diff --git a/ci/requirements-3.5_DOC_BUILD.sh b/ci/requirements-3.5_DOC_BUILD.sh index ca18ad976d46d..25bc63acc96d1 100644 --- a/ci/requirements-3.5_DOC_BUILD.sh +++ b/ci/requirements-3.5_DOC_BUILD.sh @@ -2,6 +2,8 @@ source activate pandas -echo "install DOC_BUILD" +echo "[install DOC_BUILD deps]" conda install -n pandas -c conda-forge feather-format + +conda install -n pandas -c r r rpy2 --yes From 5eac460fa1a5fa90ede7c8cc601b1a97a08491ec Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 6 Feb 2017 18:03:26 -0500 Subject: [PATCH 103/338] CLN: reorg pandas/io/json to sub-dirs xref #14904 Author: Jeff Reback Closes #15322 from jreback/json and squashes the following commits: 0c2da60 [Jeff Reback] DOC: whatsnew update fa3deef [Jeff Reback] CLN: reorg pandas/io/json to sub-dirs --- doc/source/whatsnew/v0.20.0.txt | 3 + pandas/io/json/__init__.py | 4 + pandas/io/{ => json}/json.py | 246 +---------------- pandas/io/json/normalize.py | 248 ++++++++++++++++++ .../{test_json_norm.py => test_normalize.py} | 3 +- setup.py | 1 + 6 files changed, 259 insertions(+), 246 deletions(-) create mode 100644 pandas/io/json/__init__.py rename pandas/io/{ => json}/json.py (73%) create mode 100644 pandas/io/json/normalize.py rename pandas/io/tests/json/{test_json_norm.py => test_normalize.py} (99%) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 16caef57673f7..1a32498d53c23 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -96,6 +96,9 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). .. _whatsnew_0200.enhancements.uint64_support: +UInt64 Support Improved +^^^^^^^^^^^^^^^^^^^^^^^ + Pandas has significantly improved support for operations involving unsigned, or purely non-negative, integers. 
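As a hedged illustration of the improved unsigned-integer handling described here (not taken from the patch; ``UInt64Index`` is the index type this release uses for such values, and the exact repr may vary):

    >>> import numpy as np
    >>> import pandas as pd
    >>> pd.Index([2**63, 2**63 + 5], dtype=np.uint64)
    UInt64Index([9223372036854775808, 9223372036854775813], dtype='uint64')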
Previously, handling these integers would result in improper rounding or data-type casting, leading to incorrect results. diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py new file mode 100644 index 0000000000000..a9390a04cc2cd --- /dev/null +++ b/pandas/io/json/__init__.py @@ -0,0 +1,4 @@ +from .json import to_json, read_json, loads, dumps # noqa +from .normalize import json_normalize # noqa + +del json, normalize # noqa diff --git a/pandas/io/json.py b/pandas/io/json/json.py similarity index 73% rename from pandas/io/json.py rename to pandas/io/json/json.py index 767a2212d92da..d29f4a371dd4d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json/json.py @@ -1,8 +1,6 @@ # pylint: disable-msg=E1101,W0613,W0603 import os -import copy -from collections import defaultdict import numpy as np import pandas.json as _json @@ -13,6 +11,7 @@ from pandas.io.common import get_filepath_or_buffer, _get_handle from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing +from .normalize import _convert_to_line_delimits loads = _json.loads dumps = _json.dumps @@ -641,246 +640,3 @@ def is_ok(col): lambda col, c: self._try_convert_to_date(c), lambda col, c: ((self.keep_default_dates and is_ok(col)) or col in convert_dates)) - -# --------------------------------------------------------------------- -# JSON normalization routines - - -def _convert_to_line_delimits(s): - """Helper function that converts json lists to line delimited json.""" - - # Determine we have a JSON list to turn to lines otherwise just return the - # json object, only lists can - if not s[0] == '[' and s[-1] == ']': - return s - s = s[1:-1] - - from pandas.lib import convert_json_to_lines - return convert_json_to_lines(s) - - -def nested_to_record(ds, prefix="", level=0): - """a simplified json_normalize - - converts a nested dict into a flat dict ("record"), unlike json_normalize, - it does not attempt to extract a subset of the data. - - Parameters - ---------- - ds : dict or list of dicts - prefix: the prefix, optional, default: "" - level: the number of levels in the jason string, optional, default: 0 - - Returns - ------- - d - dict or list of dicts, matching `ds` - - Examples - -------- - - IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), - nested=dict(e=dict(c=1,d=2),d=2))) - Out[52]: - {'dict1.c': 1, - 'dict1.d': 2, - 'flat1': 1, - 'nested.d': 2, - 'nested.e.c': 1, - 'nested.e.d': 2} - """ - singleton = False - if isinstance(ds, dict): - ds = [ds] - singleton = True - - new_ds = [] - for d in ds: - - new_d = copy.deepcopy(d) - for k, v in d.items(): - # each key gets renamed with prefix - if not isinstance(k, compat.string_types): - k = str(k) - if level == 0: - newkey = k - else: - newkey = prefix + '.' + k - - # only dicts gets recurse-flattend - # only at level>1 do we rename the rest of the keys - if not isinstance(v, dict): - if level != 0: # so we skip copying for top level, common case - v = new_d.pop(k) - new_d[newkey] = v - continue - else: - v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, level + 1)) - new_ds.append(new_d) - - if singleton: - return new_ds[0] - return new_ds - - -def json_normalize(data, record_path=None, meta=None, - meta_prefix=None, - record_prefix=None, - errors='raise'): - - """ - "Normalize" semi-structured JSON data into a flat table - - Parameters - ---------- - data : dict or list of dicts - Unserialized JSON objects - record_path : string or list of strings, default None - Path in each object to list of records. 
If not passed, data will be - assumed to be an array of records - meta : list of paths (string or list of strings), default None - Fields to use as metadata for each record in resulting table - record_prefix : string, default None - If True, prefix records with dotted (?) path, e.g. foo.bar.field if - path to records is ['foo', 'bar'] - meta_prefix : string, default None - errors : {'raise', 'ignore'}, default 'raise' - - * ignore : will ignore KeyError if keys listed in meta are not - always present - * raise : will raise KeyError if keys listed in meta are not - always present - - .. versionadded:: 0.20.0 - - Returns - ------- - frame : DataFrame - - Examples - -------- - - >>> data = [{'state': 'Florida', - ... 'shortname': 'FL', - ... 'info': { - ... 'governor': 'Rick Scott' - ... }, - ... 'counties': [{'name': 'Dade', 'population': 12345}, - ... {'name': 'Broward', 'population': 40000}, - ... {'name': 'Palm Beach', 'population': 60000}]}, - ... {'state': 'Ohio', - ... 'shortname': 'OH', - ... 'info': { - ... 'governor': 'John Kasich' - ... }, - ... 'counties': [{'name': 'Summit', 'population': 1234}, - ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> from pandas.io.json import json_normalize - >>> result = json_normalize(data, 'counties', ['state', 'shortname', - ... ['info', 'governor']]) - >>> result - name population info.governor state shortname - 0 Dade 12345 Rick Scott Florida FL - 1 Broward 40000 Rick Scott Florida FL - 2 Palm Beach 60000 Rick Scott Florida FL - 3 Summit 1234 John Kasich Ohio OH - 4 Cuyahoga 1337 John Kasich Ohio OH - - """ - def _pull_field(js, spec): - result = js - if isinstance(spec, list): - for field in spec: - result = result[field] - else: - result = result[spec] - - return result - - # A bit of a hackjob - if isinstance(data, dict): - data = [data] - - if record_path is None: - if any([isinstance(x, dict) for x in compat.itervalues(data[0])]): - # naive normalization, this is idempotent for flat records - # and potentially will inflate the data considerably for - # deeply nested structures: - # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} - # - # TODO: handle record value which are lists, at least error - # reasonably - data = nested_to_record(data) - return DataFrame(data) - elif not isinstance(record_path, list): - record_path = [record_path] - - if meta is None: - meta = [] - elif not isinstance(meta, list): - meta = [meta] - - for i, x in enumerate(meta): - if not isinstance(x, list): - meta[i] = [x] - - # Disastrously inefficient for now - records = [] - lengths = [] - - meta_vals = defaultdict(list) - meta_keys = ['.'.join(val) for val in meta] - - def _recursive_extract(data, path, seen_meta, level=0): - if len(path) > 1: - for obj in data: - for val, key in zip(meta, meta_keys): - if level + 1 == len(val): - seen_meta[key] = _pull_field(obj, val[-1]) - - _recursive_extract(obj[path[0]], path[1:], - seen_meta, level=level + 1) - else: - for obj in data: - recs = _pull_field(obj, path[0]) - - # For repeating the metadata later - lengths.append(len(recs)) - - for val, key in zip(meta, meta_keys): - if level + 1 > len(val): - meta_val = seen_meta[key] - else: - try: - meta_val = _pull_field(obj, val[level:]) - except KeyError as e: - if errors == 'ignore': - meta_val = np.nan - else: - raise \ - KeyError("Try running with " - "errors='ignore' as key " - "%s is not always present", e) - meta_vals[key].append(meta_val) - - records.extend(recs) - - _recursive_extract(data, record_path, {}, level=0) - - result = DataFrame(records) - - 
if record_prefix is not None:
-        result.rename(columns=lambda x: record_prefix + x, inplace=True)
-
-    # Data types, a problem
-    for k, v in compat.iteritems(meta_vals):
-        if meta_prefix is not None:
-            k = meta_prefix + k
-
-        if k in result:
-            raise ValueError('Conflicting metadata name %s, '
-                             'need distinguishing prefix ' % k)
-
-        result[k] = np.array(v).repeat(lengths)
-
-    return result
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
new file mode 100644
index 0000000000000..aa80954233682
--- /dev/null
+++ b/pandas/io/json/normalize.py
@@ -0,0 +1,248 @@
+# ---------------------------------------------------------------------
+# JSON normalization routines
+
+import copy
+from collections import defaultdict
+import numpy as np
+
+from pandas.lib import convert_json_to_lines
+from pandas import compat, DataFrame
+
+
+def _convert_to_line_delimits(s):
+    """Helper function that converts json lists to line delimited json."""
+
+    # Determine we have a JSON list to turn to lines, otherwise just return
+    # the json object: only lists can be converted
+    if not s[0] == '[' and s[-1] == ']':
+        return s
+    s = s[1:-1]
+
+    return convert_json_to_lines(s)
+
+
+def nested_to_record(ds, prefix="", level=0):
+    """a simplified json_normalize
+
+    converts a nested dict into a flat dict ("record"), unlike json_normalize,
+    it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+    prefix: the prefix, optional, default: ""
+    level: the number of levels in the json string, optional, default: 0
+
+    Returns
+    -------
+    d - dict or list of dicts, matching `ds`
+
+    Examples
+    --------
+
+    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
+                                  nested=dict(e=dict(c=1,d=2),d=2)))
+    Out[52]:
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    new_ds = []
+    for d in ds:
+
+        new_d = copy.deepcopy(d)
+        for k, v in d.items():
+            # each key gets renamed with prefix
+            if not isinstance(k, compat.string_types):
+                k = str(k)
+            if level == 0:
+                newkey = k
+            else:
+                newkey = prefix + '.' + k
+
+            # only dicts get recurse-flattened
+            # only at level>1 do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = new_d.pop(k)
+                    new_d[newkey] = v
+                continue
+            else:
+                v = new_d.pop(k)
+                new_d.update(nested_to_record(v, newkey, level + 1))
+        new_ds.append(new_d)
+
+    if singleton:
+        return new_ds[0]
+    return new_ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None,
+                   errors='raise'):
+
+    """
+    "Normalize" semi-structured JSON data into a flat table
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings), default None
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar']
+    meta_prefix : string, default None
+    errors : {'raise', 'ignore'}, default 'raise'
+
+        * ignore : will ignore KeyError if keys listed in meta are not
+          always present
+        * raise : will raise KeyError if keys listed in meta are not
+          always present
+
+    ..
versionadded:: 0.20.0 + + Returns + ------- + frame : DataFrame + + Examples + -------- + + >>> data = [{'state': 'Florida', + ... 'shortname': 'FL', + ... 'info': { + ... 'governor': 'Rick Scott' + ... }, + ... 'counties': [{'name': 'Dade', 'population': 12345}, + ... {'name': 'Broward', 'population': 40000}, + ... {'name': 'Palm Beach', 'population': 60000}]}, + ... {'state': 'Ohio', + ... 'shortname': 'OH', + ... 'info': { + ... 'governor': 'John Kasich' + ... }, + ... 'counties': [{'name': 'Summit', 'population': 1234}, + ... {'name': 'Cuyahoga', 'population': 1337}]}] + >>> from pandas.io.json import json_normalize + >>> result = json_normalize(data, 'counties', ['state', 'shortname', + ... ['info', 'governor']]) + >>> result + name population info.governor state shortname + 0 Dade 12345 Rick Scott Florida FL + 1 Broward 40000 Rick Scott Florida FL + 2 Palm Beach 60000 Rick Scott Florida FL + 3 Summit 1234 John Kasich Ohio OH + 4 Cuyahoga 1337 John Kasich Ohio OH + + """ + def _pull_field(js, spec): + result = js + if isinstance(spec, list): + for field in spec: + result = result[field] + else: + result = result[spec] + + return result + + # A bit of a hackjob + if isinstance(data, dict): + data = [data] + + if record_path is None: + if any([isinstance(x, dict) for x in compat.itervalues(data[0])]): + # naive normalization, this is idempotent for flat records + # and potentially will inflate the data considerably for + # deeply nested structures: + # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} + # + # TODO: handle record value which are lists, at least error + # reasonably + data = nested_to_record(data) + return DataFrame(data) + elif not isinstance(record_path, list): + record_path = [record_path] + + if meta is None: + meta = [] + elif not isinstance(meta, list): + meta = [meta] + + for i, x in enumerate(meta): + if not isinstance(x, list): + meta[i] = [x] + + # Disastrously inefficient for now + records = [] + lengths = [] + + meta_vals = defaultdict(list) + meta_keys = ['.'.join(val) for val in meta] + + def _recursive_extract(data, path, seen_meta, level=0): + if len(path) > 1: + for obj in data: + for val, key in zip(meta, meta_keys): + if level + 1 == len(val): + seen_meta[key] = _pull_field(obj, val[-1]) + + _recursive_extract(obj[path[0]], path[1:], + seen_meta, level=level + 1) + else: + for obj in data: + recs = _pull_field(obj, path[0]) + + # For repeating the metadata later + lengths.append(len(recs)) + + for val, key in zip(meta, meta_keys): + if level + 1 > len(val): + meta_val = seen_meta[key] + else: + try: + meta_val = _pull_field(obj, val[level:]) + except KeyError as e: + if errors == 'ignore': + meta_val = np.nan + else: + raise \ + KeyError("Try running with " + "errors='ignore' as key " + "%s is not always present", e) + meta_vals[key].append(meta_val) + + records.extend(recs) + + _recursive_extract(data, record_path, {}, level=0) + + result = DataFrame(records) + + if record_prefix is not None: + result.rename(columns=lambda x: record_prefix + x, inplace=True) + + # Data types, a problem + for k, v in compat.iteritems(meta_vals): + if meta_prefix is not None: + k = meta_prefix + k + + if k in result: + raise ValueError('Conflicting metadata name %s, ' + 'need distinguishing prefix ' % k) + + result[k] = np.array(v).repeat(lengths) + + return result diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_normalize.py similarity index 99% rename from pandas/io/tests/json/test_json_norm.py rename to 
pandas/io/tests/json/test_normalize.py index 36110898448ea..e5aba43648d0c 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_normalize.py @@ -7,7 +7,8 @@ import pandas.util.testing as tm from pandas import compat -from pandas.io.json import json_normalize, nested_to_record +from pandas.io.json import json_normalize +from pandas.io.json.normalize import nested_to_record def _assert_equal_data(left, right): diff --git a/setup.py b/setup.py index 93a044bc3cc7d..4d6bb76fd6b7c 100755 --- a/setup.py +++ b/setup.py @@ -631,6 +631,7 @@ def pxd(name): 'pandas.core', 'pandas.indexes', 'pandas.io', + 'pandas.io.json', 'pandas.io.sas', 'pandas.formats', 'pandas.sparse', From c3775bf0be5eb9078000a800b46290eb1bfa9486 Mon Sep 17 00:00:00 2001 From: Nicholas Ver Halen Date: Tue, 7 Feb 2017 08:47:18 -0500 Subject: [PATCH 104/338] BUG: bug in passing 'on=' keyword for groupby(..).resample() closes #15021 Author: Nicholas Ver Halen Closes #15326 from verhalenn/issue15021 and squashes the following commits: 9fc3b4f [Nicholas Ver Halen] Updated the whatsnew for issue 15021 ec1f316 [Nicholas Ver Halen] Created a test for GH 15021 b8b10b0 [Nicholas Ver Halen] Added the on arg to resample on a grouped dataframe. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tseries/resample.py | 4 ++++ pandas/tseries/tests/test_resample.py | 14 ++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1a32498d53c23..3f6c06e20b546 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -434,6 +434,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) - Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`) - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) +- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index e93e5637099c1..5692d6c5cabde 100755 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -975,6 +975,10 @@ def resample(obj, kind=None, **kwds): def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs): """ return our appropriate resampler when grouping as well """ + + # .resample uses 'on' similar to how .groupby uses 'key' + kwargs['key'] = kwargs.pop('on', None) + tg = TimeGrouper(freq=rule, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) r = resampler._get_resampler_for_grouping(groupby=groupby) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 56953541265a6..c40f930fbd094 100755 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -217,6 +217,20 @@ def test_groupby_resample_api(self): lambda x: x.resample('1D').ffill())[['val']] assert_frame_equal(result, expected) + def test_groupby_resample_on_api(self): + + # GH 15021 + # .groupby(...).resample(on=...) results in an unexpected + # keyword warning. 
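As a hedged sketch of the intended behavior (illustrative, not part of the diff; ``df``, ``'key'`` and ``'dates'`` refer to the frame constructed in the test just below): after this fix, resampling within a groupby accepts the datetime column via ``on=``, mirroring how ``groupby`` accepts ``key=``, so

    >>> result = df.groupby('key').resample('D', on='dates').mean()

should no longer emit the unexpected-keyword warning and should match ``df.set_index('dates').groupby('key').resample('D').mean()``.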
+ df = pd.DataFrame({'key': ['A', 'B'] * 5, + 'dates': pd.date_range('2016-01-01', periods=10), + 'values': np.random.randn(10)}) + + expected = df.set_index('dates').groupby('key').resample('D').mean() + + result = df.groupby('key').resample('D', on='dates').mean() + assert_frame_equal(result, expected) + def test_plot_api(self): tm._skip_if_no_mpl() From 27fedba7b16f623c213873b9cafaf2f05f1802f7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 7 Feb 2017 11:13:29 -0500 Subject: [PATCH 105/338] TST: remove __main__ from all test files (#15330) --- pandas/api/tests/test_api.py | 5 ---- pandas/computation/tests/test_compat.py | 5 ---- pandas/computation/tests/test_eval.py | 5 ---- pandas/core/frame.py | 5 ---- pandas/formats/format.py | 14 ----------- pandas/io/tests/json/test_normalize.py | 7 ------ pandas/io/tests/json/test_pandas.py | 5 ---- pandas/io/tests/json/test_ujson.py | 5 ---- pandas/io/tests/parser/test_network.py | 4 --- pandas/io/tests/parser/test_parsers.py | 5 ---- pandas/io/tests/parser/test_textreader.py | 5 ---- pandas/io/tests/parser/test_unsupported.py | 6 ----- pandas/io/tests/test_date_converters.py | 6 ----- pandas/io/tests/test_excel.py | 5 ---- pandas/io/tests/test_feather.py | 5 ---- pandas/io/tests/test_gbq.py | 4 --- pandas/io/tests/test_html.py | 4 --- pandas/io/tests/test_pickle.py | 6 ----- pandas/io/tests/test_pytables.py | 6 ----- pandas/io/tests/test_s3.py | 6 +---- pandas/io/tests/test_sql.py | 5 ---- pandas/io/tests/test_stata.py | 5 ---- pandas/sparse/tests/test_array.py | 6 ----- pandas/sparse/tests/test_combine_concat.py | 7 ------ pandas/sparse/tests/test_frame.py | 5 ---- pandas/sparse/tests/test_libsparse.py | 10 ++------ pandas/sparse/tests/test_list.py | 6 ----- pandas/sparse/tests/test_series.py | 6 ----- pandas/stats/tests/test_fama_macbeth.py | 5 ---- pandas/stats/tests/test_math.py | 4 --- pandas/stats/tests/test_ols.py | 6 ----- pandas/stats/tests/test_var.py | 5 ---- pandas/tests/formats/test_format.py | 5 ---- pandas/tests/formats/test_printing.py | 6 ----- pandas/tests/frame/test_analytics.py | 4 --- pandas/tests/frame/test_asof.py | 6 ----- pandas/tests/frame/test_constructors.py | 8 ------ pandas/tests/frame/test_indexing.py | 6 ----- pandas/tests/frame/test_misc_api.py | 6 ----- pandas/tests/frame/test_missing.py | 7 ------ pandas/tests/frame/test_operators.py | 5 ---- pandas/tests/frame/test_query_eval.py | 5 ---- pandas/tests/frame/test_timeseries.py | 6 ----- pandas/tests/frame/test_to_csv.py | 6 ----- pandas/tests/groupby/test_aggregate.py | 6 ----- pandas/tests/groupby/test_categorical.py | 7 ------ pandas/tests/groupby/test_filters.py | 7 ------ pandas/tests/groupby/test_groupby.py | 5 ---- pandas/tests/indexes/test_base.py | 6 ----- pandas/tests/indexes/test_numeric.py | 6 ----- pandas/tests/indexing/test_callable.py | 6 ----- pandas/tests/indexing/test_indexing.py | 5 ---- pandas/tests/plotting/test_boxplot_method.py | 5 ---- pandas/tests/plotting/test_datetimelike.py | 10 ++------ pandas/tests/plotting/test_frame.py | 10 ++------ pandas/tests/plotting/test_groupby.py | 11 ++------ pandas/tests/plotting/test_hist_method.py | 10 +------- pandas/tests/plotting/test_misc.py | 9 +------ pandas/tests/plotting/test_series.py | 12 +++------ pandas/tests/series/test_asof.py | 7 ------ pandas/tests/series/test_indexing.py | 6 ----- pandas/tests/series/test_missing.py | 7 ------ pandas/tests/series/test_timeseries.py | 6 ----- pandas/tests/test_algos.py | 6 ----- pandas/tests/test_base.py | 10 +------- 
pandas/tests/test_categorical.py | 7 ------ pandas/tests/test_common.py | 6 ----- pandas/tests/test_expressions.py | 10 ++------ pandas/tests/test_generic.py | 4 --- pandas/tests/test_internals.py | 5 ---- pandas/tests/test_join.py | 6 ----- pandas/tests/test_lib.py | 7 ------ pandas/tests/test_multilevel.py | 5 ---- pandas/tests/test_nanops.py | 6 ----- pandas/tests/test_panel.py | 5 ---- pandas/tests/test_panel4d.py | 5 ---- pandas/tests/test_panelnd.py | 6 ----- pandas/tests/test_reshape.py | 6 ----- pandas/tests/test_stats.py | 6 ----- pandas/tests/test_strings.py | 7 ------ pandas/tests/test_take.py | 6 ----- pandas/tests/test_testing.py | 5 ---- pandas/tests/test_util.py | 7 ------ pandas/tests/types/test_cast.py | 7 ------ pandas/tests/types/test_common.py | 6 ----- pandas/tests/types/test_concat.py | 6 ----- pandas/tests/types/test_dtypes.py | 6 ----- pandas/tests/types/test_generic.py | 6 ----- pandas/tests/types/test_inference.py | 6 ----- pandas/tests/types/test_io.py | 7 ------ pandas/tests/types/test_missing.py | 6 ----- pandas/tools/plotting.py | 25 ------------------- pandas/tools/tests/test_concat.py | 7 ------ pandas/tools/tests/test_join.py | 7 ------ pandas/tools/tests/test_merge.py | 7 ------ pandas/tools/tests/test_merge_asof.py | 6 ----- pandas/tools/tests/test_merge_ordered.py | 6 ----- pandas/tools/tests/test_pivot.py | 6 ----- pandas/tools/tests/test_tile.py | 6 ----- pandas/tools/tests/test_util.py | 5 ---- pandas/tseries/tests/test_base.py | 7 ------ pandas/tseries/tests/test_converter.py | 9 +------ pandas/tseries/tests/test_daterange.py | 6 ----- pandas/tseries/tests/test_holiday.py | 6 ----- pandas/tseries/tests/test_offsets.py | 10 +++----- pandas/tseries/tests/test_period.py | 6 ----- pandas/tseries/tests/test_resample.py | 6 ----- pandas/tseries/tests/test_timedeltas.py | 6 ----- .../tseries/tests/test_timeseries_legacy.py | 5 ---- pandas/tseries/tests/test_timezones.py | 7 ------ pandas/tseries/tests/test_tslib.py | 6 ----- pandas/tseries/tests/test_util.py | 6 ----- 112 files changed, 21 insertions(+), 703 deletions(-) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 410d70c65404f..f925fd792f9ca 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -227,8 +227,3 @@ def test_deprecation_access_obj(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): pd.datetools.monthEnd - -if __name__ == '__main__': - import nose - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/computation/tests/test_compat.py b/pandas/computation/tests/test_compat.py index 8e8924379f153..900dd2c28b4c5 100644 --- a/pandas/computation/tests/test_compat.py +++ b/pandas/computation/tests/test_compat.py @@ -61,8 +61,3 @@ def testit(): testit() else: testit() - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 3a446bfc36c21..dbac72c619a52 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1977,8 +1977,3 @@ def test_validate_bool_args(self): for value in invalid_values: with self.assertRaises(ValueError): pd.eval("2+2", inplace=value) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cf306034001db..cc81c66100a6f 
100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5762,8 +5762,3 @@ def boxplot(self, column=None, by=None, ax=None, fontsize=None, rot=0, ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs) ops.add_special_arithmetic_methods(DataFrame, **ops.frame_special_funcs) - -if __name__ == '__main__': - import nose - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index adfb54c02d926..3bac7d2821760 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2679,17 +2679,3 @@ def _binify(cols, line_width): bins.append(len(cols)) return bins - - -if __name__ == '__main__': - arr = np.array([746.03, 0.00, 5620.00, 1592.36]) - # arr = np.array([11111111.1, 1.55]) - # arr = [314200.0034, 1.4125678] - arr = np.array( - [327763.3119, 345040.9076, 364460.9915, 398226.8688, 383800.5172, - 433442.9262, 539415.0568, 568590.4108, 599502.4276, 620921.8593, - 620898.5294, 552427.1093, 555221.2193, 519639.7059, 388175.7, - 379199.5854, 614898.25, 504833.3333, 560600., 941214.2857, 1134250., - 1219550., 855736.85, 1042615.4286, 722621.3043, 698167.1818, 803750.]) - fmt = FloatArrayFormatter(arr, digits=7) - print(fmt.get_result()) diff --git a/pandas/io/tests/json/test_normalize.py b/pandas/io/tests/json/test_normalize.py index e5aba43648d0c..c60b81ffe504d 100644 --- a/pandas/io/tests/json/test_normalize.py +++ b/pandas/io/tests/json/test_normalize.py @@ -1,5 +1,3 @@ -import nose - from pandas import DataFrame import numpy as np import json @@ -283,8 +281,3 @@ def test_json_normalize_errors(self): ['general', 'trade_version']], errors='raise' ) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', - '--pdb-failure', '-s'], exit=False) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 345d181a0e53a..ee5039c38b182 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1044,8 +1044,3 @@ def roundtrip(s, encoding='latin-1'): for s in examples: roundtrip(s) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', - '--pdb-failure', '-s'], exit=False) diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 704023bd847b7..3da61b7696fdc 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -1611,8 +1611,3 @@ def test_encodeSet(self): def _clean_dict(d): return dict((str(k), v) for k, v in compat.iteritems(d)) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index d84c2ae3beb0c..e06f94c780c8b 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -182,7 +182,3 @@ def test_s3_fails(self): # It's irrelevant here that this isn't actually a table. 
with tm.assertRaises(IOError): read_csv('s3://cant_get_it/') - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index a90f546d37fc8..93b5fdcffed4c 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import os -import nose import pandas.util.testing as tm @@ -99,7 +98,3 @@ def read_table(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine return read_table(*args, **kwds) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py index 98cb09cd85480..0e91ca806e8fe 100644 --- a/pandas/io/tests/parser/test_textreader.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -10,7 +10,6 @@ import os import sys -import nose from numpy import nan import numpy as np @@ -402,7 +401,3 @@ def test_empty_csv_input(self): def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): assert(np.array_equal(v, right[k])) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 4d93df16a0279..e941c9186cd6a 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -9,8 +9,6 @@ test suite as new feature support is added to the parsers. """ -import nose - import pandas.io.parsers as parsers import pandas.util.testing as tm @@ -142,7 +140,3 @@ def test_deprecated_args(self): kwargs = {arg: non_default_val} read_csv(StringIO(data), engine=engine, **kwargs) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py index 99abbacb604fa..5b54925c65fbd 100644 --- a/pandas/io/tests/test_date_converters.py +++ b/pandas/io/tests/test_date_converters.py @@ -1,8 +1,6 @@ from pandas.compat import StringIO from datetime import date, datetime -import nose - import numpy as np from pandas import DataFrame, MultiIndex @@ -150,7 +148,3 @@ def test_parse_date_column_with_empty_string(self): [621, ' ']] expected = DataFrame(expected_data, columns=['case', 'opdate']) assert_frame_equal(result, expected) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 12aecfd50c3a6..2791e397d5b86 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -2325,8 +2325,3 @@ def check_called(func): check_called(lambda: panel.to_excel('something.test')) check_called(lambda: df.to_excel('something.xlsx')) check_called(lambda: df.to_excel('something.xls', engine='dummy')) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/test_feather.py b/pandas/io/tests/test_feather.py index b8b85d7dbbece..dcb057ec30004 100644 --- a/pandas/io/tests/test_feather.py +++ b/pandas/io/tests/test_feather.py @@ -116,8 +116,3 @@ def test_write_with_index(self): df.index = [0, 1, 2] df.columns = 
             pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
         self.check_error_on_write(df, ValueError)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 8a414dcd3ba4f..ac481a44de5e8 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -1224,7 +1224,3 @@ def test_upload_data_as_service_account_with_key_contents(self):
             project_id=_get_project_id(),
             private_key=_get_private_key_contents())
         self.assertEqual(result['NUM_ROWS'][0], test_size)
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 9ac8def3a074d..356adb92829c6 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -918,7 +918,3 @@ def test_same_ordering():
     dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
     dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
     assert_framelist_equal(dfs_lxml, dfs_bs4)
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
index a49f50b1bcb9f..b5c316b326b8d 100644
--- a/pandas/io/tests/test_pickle.py
+++ b/pandas/io/tests/test_pickle.py
@@ -283,9 +283,3 @@ def test_pickle_v0_15_2(self):
         #
         # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
         # tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   # '--with-coverage', '--cover-package=pandas.core'],
-                   exit=False)
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 40db10c42d5a7..f4f03856f94e2 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -5516,9 +5516,3 @@ def _test_sort(obj):
         return obj.reindex(major=sorted(obj.major_axis))
     else:
         raise ValueError('type not supported here')
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/io/tests/test_s3.py b/pandas/io/tests/test_s3.py
index 8058698a906ea..2983fa647445c 100644
--- a/pandas/io/tests/test_s3.py
+++ b/pandas/io/tests/test_s3.py
@@ -1,14 +1,10 @@
-import nose
 from pandas.util import testing as tm

 from pandas.io.common import _is_s3_url


 class TestS3URL(tm.TestCase):
+
     def test_is_s3_url(self):
         self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com"))
         self.assertFalse(_is_s3_url("s4://pandas/somethingelse.com"))
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index 9e639f7ef6057..4bcde764001c1 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -2658,8 +2658,3 @@ def clean_up(test_table_to_drop):
         self.assertEqual(tquery(sql_select, con=self.conn),
                          [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')])
         clean_up(table_name)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 8cfd5d98fe05f..fcb935925e61f 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -1276,8 +1276,3 @@ def test_out_of_range_float(self):
                 original.to_stata(path)
         tm.assertTrue('ColumnTooBig' in cm.exception)
         tm.assertTrue('infinity' in cm.exception)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py
index 592926f8e821d..55f292a8a231a 100644
--- a/pandas/sparse/tests/test_array.py
+++ b/pandas/sparse/tests/test_array.py
@@ -810,9 +810,3 @@ def test_ufunc_args(self):
         sparse = SparseArray([1, -1, 0, -2], fill_value=0)
         result = SparseArray([2, 0, 1, -1], fill_value=1)
         tm.assert_sp_array_equal(np.add(sparse, 1), result)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/sparse/tests/test_combine_concat.py b/pandas/sparse/tests/test_combine_concat.py
index fcdc6d9580dd5..5240d592810ad 100644
--- a/pandas/sparse/tests/test_combine_concat.py
+++ b/pandas/sparse/tests/test_combine_concat.py
@@ -1,6 +1,5 @@
 # pylint: disable-msg=E1101,W0612

-import nose  # noqa
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
@@ -356,9 +355,3 @@ def test_concat_sparse_dense(self):
             exp = pd.concat([self.dense1, self.dense3], axis=1)
             self.assertIsInstance(res, pd.SparseDataFrame)
             tm.assert_frame_equal(res, exp)
-
-
-if __name__ == '__main__':
-    import nose  # noqa
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py
index 23bb827974c61..e26c0ed1afe58 100644
--- a/pandas/sparse/tests/test_frame.py
+++ b/pandas/sparse/tests/test_frame.py
@@ -1193,8 +1193,3 @@ def test_numpy_func_call(self):
                  'std', 'min', 'max']
         for func in funcs:
             getattr(np, func)(self.frame)
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py
index c289b4a1b204f..b3aa3368e9455 100644
--- a/pandas/sparse/tests/test_libsparse.py
+++ b/pandas/sparse/tests/test_libsparse.py
@@ -1,6 +1,6 @@
 from pandas import Series

-import nose  # noqa
+import nose
 import numpy as np
 import operator
 import pandas.util.testing as tm
@@ -196,7 +196,7 @@ def _check_correct(a, b, expected):
             assert (result.equals(expected))

         def _check_length_exc(a, longer):
-            nose.tools.assert_raises(Exception, a.intersect, longer)
+            self.assertRaises(Exception, a.intersect, longer)

         def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
             xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
@@ -585,9 +585,3 @@ def f(self):
     g = make_optestf(op)
     setattr(TestSparseOperators, g.__name__, g)
     del g
-
-
-if __name__ == '__main__':
-    import nose  # noqa
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py
index b117685b6e968..458681cdc1de0 100644
--- a/pandas/sparse/tests/test_list.py
+++ b/pandas/sparse/tests/test_list.py
@@ -112,9 +112,3 @@ def test_getitem(self):
         for i in range(len(arr)):
             tm.assert_almost_equal(splist[i], arr[i])
             tm.assert_almost_equal(splist[-i], arr[-i])
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py
index 06d76bdd4dd3d..b34f5dd2cee9f 100644
--- a/pandas/sparse/tests/test_series.py
+++ b/pandas/sparse/tests/test_series.py
@@ -1366,9 +1366,3 @@ def test_numpy_func_call(self):
         for func in funcs:
             for series in ('bseries', 'zbseries'):
                 getattr(np, func)(getattr(self, series))
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py
index 706becfa730c4..0c9fcf775ad2d 100644
--- a/pandas/stats/tests/test_fama_macbeth.py
+++ b/pandas/stats/tests/test_fama_macbeth.py
@@ -66,8 +66,3 @@ def _check_stuff_works(self, result):

         # does it work?
         result.summary
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/stats/tests/test_math.py b/pandas/stats/tests/test_math.py
index bc09f33d2f467..3f89dbcd20065 100644
--- a/pandas/stats/tests/test_math.py
+++ b/pandas/stats/tests/test_math.py
@@ -57,7 +57,3 @@ def test_inv_illformed(self):
         rs = pmath.inv(singular)
         expected = np.array([[0.1, 0.2], [0.1, 0.2]])
         self.assertTrue(np.allclose(rs, expected))
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py
index 2935f986cca9f..09fa21d58ea9d 100644
--- a/pandas/stats/tests/test_ols.py
+++ b/pandas/stats/tests/test_ols.py
@@ -974,9 +974,3 @@ def testFilterWithDictRHS(self):

     def tsAssertEqual(self, ts1, ts2, **kwargs):
         self.assert_series_equal(ts1, ts2, **kwargs)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/stats/tests/test_var.py b/pandas/stats/tests/test_var.py
index 9f2c95a2d3d5c..04e2019f00a82 100644
--- a/pandas/stats/tests/test_var.py
+++ b/pandas/stats/tests/test_var.py
@@ -6,7 +6,6 @@
 from pandas.compat import range

 import nose
-import unittest

 raise nose.SkipTest('skipping this for now')

@@ -93,7 +92,3 @@ def __init__(self):
         self.res1 = VAR2(endog=data).fit(maxlag=2)
         from results import results_var
         self.res2 = results_var.MacrodataResults()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index 9eff64b40625d..7a2c5f3b7f7c1 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -4999,8 +4999,3 @@ def test_format_percentiles():
     tm.assertRaises(ValueError, fmt.format_percentiles, [-0.001, 0.1, 0.5])
     tm.assertRaises(ValueError, fmt.format_percentiles, [2, 0.1, 0.5])
    tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, 0.5, 'a'])
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py
index 3bcceca1f50a7..d1eb1faecc401 100644
--- a/pandas/tests/formats/test_printing.py
+++ b/pandas/tests/formats/test_printing.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import nose
 from pandas import compat
 import pandas.formats.printing as printing
 import pandas.formats.format as fmt
@@ -135,8 +134,3 @@ def test_ambiguous_width(self):
 #         result = printing.console_encode(u"\u05d0")
 #         expected = u"\u05d0".encode('utf-8')
 #         assert (result == expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 5d51306363053..0dbb78ec89b2e 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -2202,7 +2202,3 @@ def test_dot(self):

         with tm.assertRaisesRegexp(ValueError, 'aligned'):
             df.dot(df2)
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py
index f68219120b48e..323960d54a42c 100644
--- a/pandas/tests/frame/test_asof.py
+++ b/pandas/tests/frame/test_asof.py
@@ -1,7 +1,5 @@
 # coding=utf-8

-import nose
-
 import numpy as np

 from pandas import (DataFrame, date_range, Timestamp, Series, to_datetime)
@@ -84,7 +82,3 @@ def test_missing(self):
         expected = DataFrame(index=to_datetime(['1989-12-31']),
                              columns=['A', 'B'], dtype='float64')
         assert_frame_equal(result, expected)
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index fe6a12fcca28a..1676c57a274cd 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -7,7 +7,6 @@
 import itertools

 import nose
-
 from numpy.random import randn

 import numpy as np
@@ -1945,10 +1944,3 @@ def test_frame_timeseries_to_records(self):
         result['index'].dtype == 'M8[ns]'

         result = df.to_records(index=False)
-
-
-if __name__ == '__main__':
-    import nose  # noqa
-
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py
index 7d68eac47766e..f0e6ab4c17915 100644
--- a/pandas/tests/frame/test_indexing.py
+++ b/pandas/tests/frame/test_indexing.py
@@ -2954,9 +2954,3 @@ def test_transpose(self):
         expected = DataFrame(self.df.values.T)
         expected.index = ['A', 'B']
         assert_frame_equal(result, expected)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py
index f5719fa1d8b85..2fc14d9e4d123 100644
--- a/pandas/tests/frame/test_misc_api.py
+++ b/pandas/tests/frame/test_misc_api.py
@@ -4,7 +4,6 @@
 # pylint: disable-msg=W0612,E1101
 from copy import deepcopy
 import sys
-import nose
 from distutils.version import LooseVersion

 from pandas.compat import range, lrange
@@ -486,8 +485,3 @@ def _check_f(base, f):
         # rename
         f = lambda x: x.rename({1: 'foo'}, inplace=True)
         _check_f(d.copy(), f)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py
index eabdb79295c27..8c25f71c00684 100644
--- a/pandas/tests/frame/test_missing.py
+++ b/pandas/tests/frame/test_missing.py
@@ -711,10 +711,3 @@ def test_interp_ignore_all_good(self):
         # all good
         result = df[['B', 'D']].interpolate(downcast=None)
         assert_frame_equal(result, df[['B', 'D']])
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   # '--with-coverage', '--cover-package=pandas.core']
-                   exit=False)
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py
index f843a5c08ce05..15f98abe1445d 100644
--- a/pandas/tests/frame/test_operators.py
+++ b/pandas/tests/frame/test_operators.py
@@ -1275,8 +1275,3 @@ def test_alignment_non_pandas(self):
             align(df, val, 'index')
         with tm.assertRaises(ValueError):
             align(df, val, 'columns')
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index 36ae5dac733a5..a9a90a6f5cd40 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -1155,8 +1155,3 @@ class TestDataFrameEvalPythonPython(TestDataFrameEvalNumExprPython):
     def setUpClass(cls):
         super(TestDataFrameEvalPythonPython, cls).tearDownClass()
         cls.engine = cls.parser = 'python'
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
index 9a9f0ee67fb89..55848847f2266 100644
--- a/pandas/tests/frame/test_timeseries.py
+++ b/pandas/tests/frame/test_timeseries.py
@@ -575,9 +575,3 @@ def test_frame_to_period(self):
        tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))

         self.assertRaises(ValueError, df.to_period, axis=2)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index b585462365606..5c47b0357b4f6 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -1145,9 +1145,3 @@ def test_to_csv_quoting(self):
         df = df.set_index(['a', 'b'])
         expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
         self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py
index 6b162b71f79de..5f680a6876873 100644
--- a/pandas/tests/groupby/test_aggregate.py
+++ b/pandas/tests/groupby/test_aggregate.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
-import nose

 from datetime import datetime

@@ -487,8 +486,3 @@ def testit(label_list, shape):
     shape = (10000, 10000)
     label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)]
     testit(label_list, shape)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
-                         ], exit=False)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 81aa183426be9..82ec1832be961 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -1,9 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
-import nose

 from numpy import nan
-
 from pandas.core.index import Index, MultiIndex, CategoricalIndex
 from pandas.core.api import DataFrame, Categorical

@@ -490,8 +488,3 @@ def testit(label_list, shape):
     shape = (10000, 10000)
     label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)]
     testit(label_list, shape)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
-                         ], exit=False)
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
index 40d8039f71576..663fbd04e7e5a 100644
--- a/pandas/tests/groupby/test_filters.py
+++ b/pandas/tests/groupby/test_filters.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
-import nose
-
 from numpy import nan

@@ -641,8 +639,3 @@ def testit(label_list, shape):
     shape = (10000, 10000)
     label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)]
     testit(label_list, shape)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
-                         ], exit=False)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index bf61f5ef83859..01c81bd7904bd 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -6057,8 +6057,3 @@ def testit(label_list, shape):
     shape = (10000, 10000)
     label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)]
     testit(label_list, shape)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
-                         ], exit=False)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index a0f2a090c9a06..c574a4a1f01a7 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -11,7 +11,6 @@
 import operator
 import os

-import nose
 import numpy as np

 from pandas import (period_range, date_range, Series,
@@ -2078,8 +2077,3 @@ def test_intersect_str_dates(self):
         res = i2.intersection(i1)

         self.assertEqual(len(res), 0)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index c7acbf51a17e5..4dab7ae76a011 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -3,7 +3,6 @@
 from datetime import datetime

 from pandas.compat import range, PY3
-import nose
 import numpy as np

 from pandas import (date_range, Series, Index, Float64Index,
@@ -1144,8 +1143,3 @@ def test_join_outer(self):
         self.assert_index_equal(res, eres)
         tm.assert_numpy_array_equal(lidx, elidx)
         tm.assert_numpy_array_equal(ridx, eridx)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py
index ab225f72934ce..bcadc41b13370 100644
--- a/pandas/tests/indexing/test_callable.py
+++ b/pandas/tests/indexing/test_callable.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # pylint: disable-msg=W0612,E1101
-import nose

 import numpy as np
 import pandas as pd
@@ -268,8 +267,3 @@ def test_frame_iloc_callable_setitem(self):
         exp = df.copy()
         exp.iloc[[1, 3], [0]] = [-5, -5]
         tm.assert_frame_equal(res, exp)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index f7fa07916ca74..a9dfcf2672357 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -5532,8 +5532,3 @@ def test_boolean_indexing(self):
                              index=pd.to_timedelta(range(10), unit='s'),
                              columns=['x'])
         tm.assert_frame_equal(expected, result)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index 289d48ba6d4cc..f7fd6a8519533 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -378,8 +378,3 @@ def test_fontsize(self):
         df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]})
         self._check_ticks_props(df.boxplot("a", by="b", fontsize=16),
                                 xlabelsize=16, ylabelsize=16)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index 6486c8aa21c1b..bcc9c7ceea8b5 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -1,3 +1,5 @@
+""" Test cases for time series specific (freq conversion, etc) """
+
 from datetime import datetime, timedelta, date, time

 import nose
@@ -18,9 +20,6 @@
     _skip_if_no_scipy_gaussian_kde)


-""" Test cases for time series specific (freq conversion, etc) """
-
-
 @tm.mplskip
 class TestTSPlot(TestPlotBase):

@@ -1309,8 +1308,3 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs):
         plt.savefig(path)
     finally:
         plt.close(fig)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index fba554b03f191..81a54bd38b3f8 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -1,5 +1,7 @@
 # coding: utf-8

+""" Test cases for DataFrame.plot """
+
 import nose
 import string
 import warnings
@@ -26,9 +28,6 @@
     _ok_for_gaussian_kde)


-""" Test cases for DataFrame.plot """
-
-
 @tm.mplskip
 class TestDataFramePlots(TestPlotBase):

@@ -2726,8 +2725,3 @@ def _generate_4_axes_via_gridspec():
     ax_lr = plt.subplot(gs[1, 1])

     return gs, [ax_tl, ax_ll, ax_tr, ax_lr]
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py
index 3c682fbfbb89e..93efb3f994c38 100644
--- a/pandas/tests/plotting/test_groupby.py
+++ b/pandas/tests/plotting/test_groupby.py
@@ -1,6 +1,7 @@
 # coding: utf-8

-import nose
+""" Test cases for GroupBy.plot """
+

 from pandas import Series, DataFrame
 import pandas.util.testing as tm
@@ -10,9 +11,6 @@
 from pandas.tests.plotting.common import TestPlotBase


-""" Test cases for GroupBy.plot """
-
-
 @tm.mplskip
 class TestDataFrameGroupByPlots(TestPlotBase):

@@ -74,8 +72,3 @@ def test_plot_kwargs(self):

         res = df.groupby('z').plot.scatter(x='x', y='y')
         self.assertEqual(len(res['a'].collections), 1)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
index bde5544390b85..4f64f66bd3c4d 100644
--- a/pandas/tests/plotting/test_hist_method.py
+++ b/pandas/tests/plotting/test_hist_method.py
@@ -1,6 +1,6 @@
 # coding: utf-8

-import nose
+""" Test cases for .hist method """

 from pandas import Series, DataFrame
 import pandas.util.testing as tm
@@ -13,9 +13,6 @@
 from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works)


-""" Test cases for .hist method """
-
-
 @tm.mplskip
 class TestSeriesPlots(TestPlotBase):

@@ -418,8 +415,3 @@ def test_axis_share_xy(self):

         self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
         self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index 2650ce2879db7..c92287b2bdc42 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -1,6 +1,6 @@
 # coding: utf-8

-import nose
+""" Test cases for misc plot functions """

 from pandas import Series, DataFrame
 from pandas.compat import lmap
@@ -15,8 +15,6 @@
 from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works,
                                           _ok_for_gaussian_kde)

-""" Test cases for misc plot functions """
-

 @tm.mplskip
 class TestSeriesPlots(TestPlotBase):

@@ -298,8 +296,3 @@ def test_subplot_titles(self):
                                        title=title[:-1])
         title_list = [ax.get_title() for sublist in plot for ax in sublist]
         self.assertEqual(title_list, title[:3] + [''])
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index f668c46a15173..8c00d606059a4 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -1,6 +1,8 @@
 # coding: utf-8

-import nose
+""" Test cases for Series.plot """
+
+
 import itertools

 from datetime import datetime
@@ -20,9 +22,6 @@
     _ok_for_gaussian_kde)


-""" Test cases for Series.plot """
-
-
 @tm.mplskip
 class TestSeriesPlots(TestPlotBase):

@@ -811,8 +810,3 @@ def test_custom_business_day_freq(self):
             freq=CustomBusinessDay(holidays=['2014-05-26'])))

         _check_plot_works(s.plot)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py
index e2092feab9004..db306d2a742c1 100644
--- a/pandas/tests/series/test_asof.py
+++ b/pandas/tests/series/test_asof.py
@@ -1,9 +1,6 @@
 # coding=utf-8

-import nose
-
 import numpy as np
-
 from pandas import (offsets, Series, notnull, isnull,
                     date_range, Timestamp)

@@ -152,7 +149,3 @@ def test_errors(self):
         s = Series(np.random.randn(N), index=rng)
         with self.assertRaises(ValueError):
             s.asof(s.index[0], subset='foo')
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
index bdae11770de65..e0d83d6eeadac 100644
--- a/pandas/tests/series/test_indexing.py
+++ b/pandas/tests/series/test_indexing.py
@@ -2638,9 +2638,3 @@ def test_round_nat(self):
             round_method = getattr(s.dt, method)
             for freq in ["s", "5s", "min", "5min", "h", "5h"]:
                 assert_series_equal(round_method(freq), expected)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index 8c877ade6fe98..6821a8b9f4221 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -1092,10 +1092,3 @@ def test_series_interpolate_intraday(self):
         result = ts.reindex(new_index).interpolate(method='time')

         self.assert_numpy_array_equal(result.values, exp.values)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   # '--with-coverage', '--cover-package=pandas.core']
-                   exit=False)
diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py
index 9754a9d3737e3..bd346fb9bb0c8 100644
--- a/pandas/tests/series/test_timeseries.py
+++ b/pandas/tests/series/test_timeseries.py
@@ -927,9 +927,3 @@ def test_get_level_values_box(self):
         index = MultiIndex(levels=levels, labels=labels)

         self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp))
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 99453b9793007..40b277f3f1f8a 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1372,9 +1372,3 @@ def test_index(self):
         idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min',
                      '2 min', '2 min'], dtype='timedelta64[ns]')
         tm.assert_series_equal(algos.mode(idx), exp)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index f750936961831..1d1ef1a08859c 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -4,7 +4,7 @@
 import re
 import sys
 from datetime import datetime, timedelta
-
+import nose
 import numpy as np

 import pandas as pd
@@ -1105,11 +1105,3 @@ def f():

         self.assertRaises(AttributeError, f)
         self.assertFalse(hasattr(t, "b"))
-
-
-if __name__ == '__main__':
-    import nose
-
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   # '--with-coverage', '--cover-package=pandas.core'],
-                   exit=False)
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 745914d3e7ef5..be55d6e1976ec 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -4576,10 +4576,3 @@ def test_map(self):
         self.assertIsInstance(res, tm.SubclassedCategorical)
         exp = Categorical(['A', 'B', 'C'])
         tm.assert_categorical_equal(res, exp)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   # '--with-coverage', '--cover-package=pandas.core']
-                   exit=False)
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 09dd3f7ab517c..0239250129494 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-

-import nose
 import numpy as np

 from pandas import Series, Timestamp
@@ -196,8 +195,3 @@ def test_dict_compat():
     assert (com._dict_compat(data_datetime64) == expected)
     assert (com._dict_compat(expected) == expected)
     assert (com._dict_compat(data_unchanged) == data_unchanged)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index c037f02f20609..18b078d0a677e 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -2,12 +2,12 @@
 from __future__ import print_function
 # pylint: disable-msg=W0612,E1101

-import nose
 import re
+import operator

+import nose
 from numpy.random import randn
-import operator
 import numpy as np

 from pandas.core.api import DataFrame, Panel
@@ -439,9 +439,3 @@ def test_bool_ops_warn_on_arithmetic(self):
                 r = f(df, True)
                 e = fe(df, True)
                 tm.assert_frame_equal(r, e)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index 0ca8ba47b8a8f..5bf2eda47ea27 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -2022,7 +2022,3 @@ def test_pipe_panel(self):

         with tm.assertRaises(ValueError):
             result = wp.pipe((f, 'y'), x=1, y=1)
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index 5000d6d4510fb..2bfe31ad4260e 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -1188,8 +1188,3 @@ def assert_add_equals(val, inc, result):
                           lambda: BlockPlacement([1, 2, 4]).add(-10))
         self.assertRaises(ValueError,
                           lambda: BlockPlacement(slice(2, None, -1)).add(-1))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py
index bfdb77f3fb350..0e7dda05a0c27 100644
--- a/pandas/tests/test_join.py
+++ b/pandas/tests/test_join.py
@@ -193,9 +193,3 @@ def test_inner_join_indexer2():

     exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64)
     assert_almost_equal(ridx, exp_ridx)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index 945f8004687cd..2381c52ef14b6 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -232,10 +232,3 @@ def test_empty_like(self):
         expected = np.array([True])

         self._check_behavior(arr, expected)
-
-
-if __name__ == '__main__':
-    import nose
-
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 37bfe667b0205..d87ad8d906854 100755
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -2478,8 +2478,3 @@ def test_iloc_mi(self):
                               for r in range(5)])

         assert_frame_equal(result, expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index dd3a49de55d73..937c20d009b6b 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1000,9 +1000,3 @@ def test_nans_skipna(self):
     @property
     def prng(self):
         return np.random.RandomState(1234)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
-                         ], exit=False)
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index d79081a06dbc0..89e8fb78ad821 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -2538,8 +2538,3 @@ def test_panel_index():
                                         np.repeat([1, 2, 3], 4)],
                                        names=['time', 'panel'])
     tm.assert_index_equal(index, expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index 0769b8916a11b..aeca24964222a 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -949,8 +949,3 @@ def test_rename(self):

     def test_get_attr(self):
         assert_panel_equal(self.panel4d['l1'], self.panel4d.l1)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_panelnd.py b/pandas/tests/test_panelnd.py
index 92805f3b30ec6..6a578d85d3ee3 100644
--- a/pandas/tests/test_panelnd.py
+++ b/pandas/tests/test_panelnd.py
@@ -1,6 +1,4 @@
 # -*- coding: utf-8 -*-
-import nose
-
 from pandas.core import panelnd
 from pandas.core.panel import Panel

@@ -101,7 +99,3 @@ def test_5d_construction(self):
         # test a transpose
         # results = p5d.transpose(1,2,3,4,0)
         # expected =
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index 603674ac01bc0..b5fa945a5bb8f 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # pylint: disable-msg=W0612,E1101
-import nose

 from pandas import DataFrame, Series
 from pandas.core.sparse import SparseDataFrame
@@ -914,8 +913,3 @@ def test_multiple_id_columns(self):
         exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']]
         long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
         tm.assert_frame_equal(long_frame, exp_frame)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py
index 41d25b9662b5b..eb8ab02c29548 100644
--- a/pandas/tests/test_stats.py
+++ b/pandas/tests/test_stats.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from pandas import compat
-import nose

 from distutils.version import LooseVersion
 from numpy import nan
@@ -185,8 +184,3 @@ def test_rank_object_bug(self):
         # smoke tests
         Series([np.nan] * 32).astype(object).rank(ascending=True)
         Series([np.nan] * 32).astype(object).rank(ascending=False)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f59127c853ed1..f358946983dce 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -4,8 +4,6 @@
 from datetime import datetime, timedelta
 import re

-import nose
-
 from numpy import nan as NA
 import numpy as np
 from numpy.random import randint
@@ -2715,8 +2713,3 @@ def test_method_on_bytes(self):
         expected = Series(np.array(
             ['ad', 'be', 'cf'], 'S2').astype(object))
         tm.assert_series_equal(result, expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py
index 98b3b474f785d..bf8a3ab370625 100644
--- a/pandas/tests/test_take.py
+++ b/pandas/tests/test_take.py
@@ -2,7 +2,6 @@
 import re
 from datetime import datetime

-import nose
 import numpy as np
 from pandas.compat import long
 import pandas.core.algorithms as algos
@@ -448,8 +447,3 @@ def test_2d_datetime64(self):
         expected = arr.take(indexer, axis=1)
         expected[:, [2, 4]] = datetime(2007, 1, 1)
         tm.assert_almost_equal(result, expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index e2f295a5343bc..5e60efd153ab1 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -802,8 +802,3 @@ def f():
         with assertRaises(ValueError):
             f()
             raise ValueError
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py
index ed82604035358..e2f6a7f6cc1ed 100644
--- a/pandas/tests/test_util.py
+++ b/pandas/tests/test_util.py
@@ -1,6 +1,4 @@
 # -*- coding: utf-8 -*-
-import nose
-
 from collections import OrderedDict
 import sys
 import unittest
@@ -402,8 +400,3 @@ def test_numpy_errstate_is_default():
     from pandas.compat import numpy  # noqa
     # The errstate should be unchanged after that import.
     tm.assert_equal(np.geterr(), expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py
index 56a14a51105ca..a8579e89aeb1f 100644
--- a/pandas/tests/types/test_cast.py
+++ b/pandas/tests/types/test_cast.py
@@ -5,8 +5,6 @@

 """

-
-import nose
 from datetime import datetime

 import numpy as np
@@ -278,8 +276,3 @@ def test_period_dtype(self):
                        np.dtype('datetime64[ns]'), np.object, np.int64]:
             self.assertEqual(_find_common_type([dtype, dtype2]), np.object)
             self.assertEqual(_find_common_type([dtype2, dtype]), np.object)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py
index 4d6f50862c562..7c17c61aec440 100644
--- a/pandas/tests/types/test_common.py
+++ b/pandas/tests/types/test_common.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-

-import nose
 import numpy as np

 from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype
@@ -55,8 +54,3 @@ def test_dtype_equal():

     assert not DatetimeTZDtype.is_dtype(np.int64)
     assert not PeriodDtype.is_dtype(np.int64)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_concat.py b/pandas/tests/types/test_concat.py
index 6403dcb5a5350..8acafe0af1792 100644
--- a/pandas/tests/types/test_concat.py
+++ b/pandas/tests/types/test_concat.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-

-import nose
 import pandas as pd
 import pandas.types.concat as _concat
 import pandas.util.testing as tm
@@ -79,8 +78,3 @@ def test_get_dtype_kinds_period(self):
                      pd.Series([pd.Period('2011-02', freq='D')])]
         res = _concat.get_dtype_kinds(to_concat)
         self.assertEqual(res, set(['object']))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py
index f190c85404ff9..68105cfd7c886 100644
--- a/pandas/tests/types/test_dtypes.py
+++ b/pandas/tests/types/test_dtypes.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 from itertools import product

-import nose
 import numpy as np
 import pandas as pd
 from pandas import Series, Categorical, date_range
@@ -353,8 +352,3 @@ def test_empty(self):

     def test_not_string(self):
         # though PeriodDtype has object kind, it cannot be string
        self.assertFalse(is_string_dtype(PeriodDtype('D')))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_generic.py b/pandas/tests/types/test_generic.py
index 28600687e8062..2861252bef26a 100644
--- a/pandas/tests/types/test_generic.py
+++ b/pandas/tests/types/test_generic.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-

-import nose
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
@@ -41,8 +40,3 @@ def test_abc_types(self):
         self.assertIsInstance(self.sparse_array, gt.ABCSparseArray)
         self.assertIsInstance(self.categorical, gt.ABCCategorical)
         self.assertIsInstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py
index 5c35112d0fe19..15f9545f3476c 100644
--- a/pandas/tests/types/test_inference.py
+++ b/pandas/tests/types/test_inference.py
@@ -6,7 +6,6 @@

 """

-import nose
 import collections
 import re
 from datetime import datetime, date, timedelta, time
@@ -968,8 +967,3 @@ def test_ensure_categorical():
     values = Categorical(values)
     result = _ensure_categorical(values)
     tm.assert_categorical_equal(result, values)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_io.py b/pandas/tests/types/test_io.py
index 545edf8f1386c..ce8e23342bf5a 100644
--- a/pandas/tests/types/test_io.py
+++ b/pandas/tests/types/test_io.py
@@ -107,10 +107,3 @@ def test_convert_downcast_int64(self):
         expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8)
         result = lib.downcast_int64(arr, na_values)
         self.assert_numpy_array_equal(result, expected)
-
-
-if __name__ == '__main__':
-    import nose
-
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py
index fa2bd535bb8d5..2b09cf5ab633d 100644
--- a/pandas/tests/types/test_missing.py
+++ b/pandas/tests/types/test_missing.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-

-import nose
 import numpy as np
 from datetime import datetime
 from pandas.util import testing as tm
@@ -304,8 +303,3 @@ def test_na_value_for_dtype():

     for dtype in ['O']:
         assert np.isnan(na_value_for_dtype(np.dtype(dtype)))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 012d67d29cc3f..ee70515850b25 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -4003,28 +4003,3 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None,
         if gridsize is not None:
             kwds['gridsize'] = gridsize
         return self(kind='hexbin', x=x, y=y, C=C, **kwds)
-
-
-if __name__ == '__main__':
-    # import pandas.rpy.common as com
-    # sales = com.load_data('sanfrancisco.home.sales', package='nutshell')
-    # top10 = sales['zip'].value_counts()[:10].index
-    # sales2 = sales[sales.zip.isin(top10)]
-    # _ = scatter_plot(sales2, 'squarefeet', 'price', by='zip')
-
-    # plt.show()
-
-    import matplotlib.pyplot as plt
-
-    import pandas.tools.plotting as plots
-    import pandas.core.frame as fr
-    reload(plots)  # noqa
-    reload(fr)  # noqa
-    from pandas.core.frame import DataFrame
-
-    data = DataFrame([[3, 6, -5], [4, 8, 2], [4, 9, -6],
-                      [4, 9, -3], [2, 5, -1]],
-                     columns=['A', 'B', 'C'])
-    data.plot(kind='barh', stacked=True)
-
-    plt.show()
diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index 2be7e75573d6e..dae24c48b8238 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -1,5 +1,3 @@
-import nose
-
 import numpy as np
 from numpy.random import randn

@@ -2171,8 +2169,3 @@ def test_concat_multiindex_dfs_with_deepcopy(self):
         tm.assert_frame_equal(result_copy, expected)
         result_no_copy = pd.concat(example_dict, names=['testname'])
         tm.assert_frame_equal(result_no_copy, expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py
index 4a2b64d080b4b..605a85026d605 100644
--- a/pandas/tools/tests/test_join.py
+++ b/pandas/tools/tests/test_join.py
@@ -1,7 +1,5 @@
 # pylint: disable=E1103

-import nose
-
 from numpy.random import randn
 import numpy as np

@@ -799,8 +797,3 @@ def _join_by_hand(a, b, how='left'):
     for col, s in compat.iteritems(b_re):
         a_re[col] = s
     return a_re.reindex(columns=result_columns)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index e08074649f7e8..88856a012da6f 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -1,7 +1,5 @@
 # pylint: disable=E1103

-import nose
-
 from datetime import datetime
 from numpy.random import randn
 from numpy import nan
@@ -1370,8 +1368,3 @@ def f():
         def f():
             household.join(log_return, how='outer')
         self.assertRaises(NotImplementedError, f)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py
index ef7b25008e80a..8e7323f72a8f5 100644
--- a/pandas/tools/tests/test_merge_asof.py
+++ b/pandas/tools/tests/test_merge_asof.py
@@ -1,4 +1,3 @@
-import nose
 import os

 import pytz
@@ -938,8 +937,3 @@ def test_on_float_by_int(self):
                              columns=['symbol', 'exch', 'price', 'mpv'])

         assert_frame_equal(result, expected)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_merge_ordered.py b/pandas/tools/tests/test_merge_ordered.py
index d163468abc88e..e08cc98e50794 100644
--- a/pandas/tools/tests/test_merge_ordered.py
+++ b/pandas/tools/tests/test_merge_ordered.py
@@ -1,5 +1,3 @@
-import nose
-
 import pandas as pd
 from pandas import DataFrame, merge_ordered
 from pandas.util import testing as tm
@@ -92,7 +90,3 @@ def test_empty_sequence_concat(self):
             pd.concat([pd.DataFrame()])
             pd.concat([None, pd.DataFrame()])
             pd.concat([pd.DataFrame(), None])
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
index 9cc520a7adb05..398e57d4ad0a4 100644
--- a/pandas/tools/tests/test_pivot.py
+++ b/pandas/tools/tests/test_pivot.py
@@ -1321,9 +1321,3 @@ def test_crosstab_with_numpy_size(self):
                                 index=expected_index,
                                 columns=expected_column)
         tm.assert_frame_equal(result, expected)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
index 5c7cee862ccd3..c5261597cf35d 100644
--- a/pandas/tools/tests/test_tile.py
+++ b/pandas/tools/tests/test_tile.py
@@ -1,5 +1,4 @@
 import os
-import nose

 import numpy as np
 from pandas.compat import zip
@@ -351,8 +350,3 @@ def test_datetime_bin(self):
 def curpath():
     pth, _ = os.path.split(os.path.abspath(__file__))
     return pth
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
index 8a8960a057926..e1d057eb3c3c0 100644
--- a/pandas/tools/tests/test_util.py
+++ b/pandas/tools/tests/test_util.py
@@ -478,8 +478,3 @@ def test_downcast_limits(self):
         for dtype, downcast, min_max in dtype_downcast_min_max:
             series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
             tm.assert_equal(series.dtype, dtype)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py
index 4f2ac3ff0d87e..2ff06517f175a 100644
--- a/pandas/tseries/tests/test_base.py
+++ b/pandas/tseries/tests/test_base.py
@@ -1860,10 +1860,3 @@ def test_equals(self):
         self.assertFalse(idx.asobject.equals(idx3))
         self.assertFalse(idx.equals(list(idx3)))
         self.assertFalse(idx.equals(pd.Series(idx3)))
-
-
-if __name__ == '__main__':
-    import nose
-
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py
index f6cf11c871bba..8caed80f5a45b 100644
--- a/pandas/tseries/tests/test_converter.py
+++ b/pandas/tseries/tests/test_converter.py
@@ -1,7 +1,5 @@
 from datetime import datetime, date

-import nose
-
 import numpy as np
 from pandas import Timestamp, Period, Index
 from pandas.compat import u
@@ -12,6 +10,7 @@
 try:
     import pandas.tseries.converter as converter
 except ImportError:
+    import nose
     raise nose.SkipTest("no pandas.tseries.converter, skipping")

@@ -199,9 +198,3 @@ def test_integer_passthrough(self):
         rs = self.pc.convert([0, 1], None, self.axis)
         xp = [0, 1]
         self.assertEqual(rs, xp)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py
index 87f9f55e0189c..209e6e40d5cf0 100644
--- a/pandas/tseries/tests/test_daterange.py
+++ b/pandas/tseries/tests/test_daterange.py
@@ -1,6 +1,5 @@
 from datetime import datetime
 from pandas.compat import range
-import nose

 import numpy as np
 from pandas.core.index import Index
@@ -817,8 +816,3 @@ def test_cdaterange_weekmask_and_holidays(self):
                        holidays=['2013-05-01'])
         xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06'])
         self.assert_index_equal(xp, rng)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tseries/tests/test_holiday.py
index 62446e8e637c6..d4d273347e6e3 100644
--- a/pandas/tseries/tests/test_holiday.py
+++ b/pandas/tseries/tests/test_holiday.py
@@ -15,7 +15,6 @@
     USLaborDay, USColumbusDay, USMartinLutherKingJr, USPresidentsDay)
 from pytz import utc
-import nose


 class TestCalendar(tm.TestCase):
@@ -385,8 +384,3 @@ def test_both_offset_observance_raises(self):
             Holiday("Cyber Monday", month=11, day=1,
                     offset=[DateOffset(weekday=SA(4))],
                     observance=next_monday)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py
index 768e9212e6c42..ac488a3dfdcb2 100644
--- a/pandas/tseries/tests/test_offsets.py
+++ b/pandas/tseries/tests/test_offsets.py
@@ -2,10 +2,11 @@
 from distutils.version import LooseVersion
 from datetime import date, datetime, timedelta
 from dateutil.relativedelta import relativedelta
-from pandas.compat import range, iteritems
-from pandas import compat
+
 import nose
 from nose.tools import assert_raises
+from pandas.compat import range, iteritems
+from pandas import compat

 import numpy as np
@@ -4956,8 +4957,3 @@ def test_all_offset_classes(self):
             first = Timestamp(test_values[0], tz='US/Eastern') + offset()
             second = Timestamp(test_values[1], tz='US/Eastern')
             self.assertEqual(first, second, msg=str(offset))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py
index a707cc3eb74ce..fdc067a827a5b 100644
--- a/pandas/tseries/tests/test_period.py
+++ b/pandas/tseries/tests/test_period.py
@@ -4967,9 +4967,3 @@ def test_get_period_field_raises_on_out_of_range(self):
     def test_get_period_field_array_raises_on_out_of_range(self):
         self.assertRaises(ValueError, _period.get_period_field_arr, -1,
                           np.empty(1), 0)
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index c40f930fbd094..222ffb735921a 100755
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -3,7 +3,6 @@
 from datetime import datetime, timedelta
 from functools import partial

-import nose
 import numpy as np

 import pandas as pd
@@ -3188,8 +3187,3 @@ def test_aggregate_with_nat(self):

         # if NaT is included, 'var', 'std', 'mean', 'first','last'
         # and 'nth' doesn't work yet
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index 6efa024d81b98..13263259e0b8a 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -2,7 +2,6 @@
 from __future__ import division
 from datetime import timedelta, time

-import nose
 from distutils.version import LooseVersion

 import numpy as np
@@ -2051,8 +2050,3 @@ def test_add_overflow(self):
         result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
                   to_timedelta(['7 seconds', pd.NaT, '4 hours']))
         tm.assert_index_equal(result, exp)
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py
index d8c01c53fb2e5..5395056c93412 100644
--- a/pandas/tseries/tests/test_timeseries_legacy.py
+++ b/pandas/tseries/tests/test_timeseries_legacy.py
@@ -219,8 +219,3 @@ def test_ms_vs_MS(self):
     def test_rule_aliases(self):
         rule = to_offset('10us')
         self.assertEqual(rule, Micro(10))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py
index 64787b6e4e79a..00b60ba620c4b 100644
--- a/pandas/tseries/tests/test_timezones.py
+++ b/pandas/tseries/tests/test_timezones.py
@@ -1,7 +1,5 @@
 # pylint: disable-msg=E1101,W0612
 from datetime import datetime, timedelta, tzinfo, date
-import nose
-
 import numpy as np
 import pytz
 from distutils.version import LooseVersion
@@ -1683,8 +1681,3 @@ def test_nat(self):
         idx = idx.tz_convert('US/Eastern')
         expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT]
         self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern'))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py
index cf5dbd671d38c..20e91a6f5bc44 100644
--- a/pandas/tseries/tests/test_tslib.py
+++ b/pandas/tseries/tests/test_tslib.py
@@ -1,4 +1,3 @@
-import nose
 import datetime
 import numpy as np
 from distutils.version import LooseVersion
@@ -690,8 +689,3 @@ def _check_round(freq, expected):
         msg = pd.tseries.frequencies._INVALID_FREQ_ERROR
         with self.assertRaisesRegexp(ValueError, msg):
             stamp.round('foo')
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py
index 96da32a4a845c..3feffe924c291 100644
--- a/pandas/tseries/tests/test_util.py
+++ b/pandas/tseries/tests/test_util.py
@@ -1,5 +1,4 @@
 from pandas.compat import range
-import nose

 import numpy as np

@@ -125,8 +124,3 @@ def test_normalize_date():
     result = normalize_date(value)

     assert (result == datetime(2012, 9, 7))
-
-
-if __name__ == '__main__':
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)

From 0f10b042fd48d74d56601085a1528e9b620a8616 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 7 Feb 2017 13:50:37 -0500
Subject: [PATCH 106/338] TST/STYLE: remove multiprocess nose flags and slight
 PEP fixes

xref https://github.com/pandas-dev/pandas/pull/13856#issuecomment-278058522

Author: Jeff Reback

Closes #15333 from jreback/mcs and squashes the following commits:

2edc842 [Jeff Reback] TST/STYLE: remove multiprocess nose flags and slight PEP fixes
---
 pandas/__init__.py                            |  8 +--
 pandas/_version.py                            |  2 +-
 pandas/api/tests/test_api.py                  |  2 -
 pandas/compat/__init__.py                     | 24 +++++----
 pandas/compat/chainmap.py                     |  1 +
 pandas/compat/numpy/function.py               |  1 +
 pandas/compat/pickle_compat.py                |  9 +++-
 pandas/computation/tests/test_eval.py         | 10 ++--
 pandas/core/base.py                           |  2 +
 pandas/core/config.py                         |  1 +
 pandas/core/frame.py                          |  5 +-
 pandas/core/indexing.py                       |  1 +
 pandas/core/internals.py                      |  1 +
 pandas/core/nanops.py                         |  2 +
 pandas/core/panel.py                          |  2 +
 pandas/core/series.py                         |  1 +
 pandas/core/window.py                         |  3 +-
 pandas/formats/format.py                      | 12 +++++
 pandas/indexes/base.py                        |  6 +--
 pandas/indexes/multi.py                       |  2 +-
 pandas/io/common.py                           |  1 +
 pandas/io/json/json.py                        |  4 +-
 pandas/io/json/normalize.py                   |  1 -
 pandas/io/sas/sasreader.py                    |  1 -
 pandas/io/tests/parser/c_parser_only.py       |  9 ++--
 pandas/io/tests/parser/compression.py         |  1 +
 pandas/io/tests/parser/converters.py          |  1 +
 pandas/io/tests/parser/dtypes.py              |  1 +
 pandas/io/tests/parser/parse_dates.py         |  1 +
 pandas/io/tests/parser/python_parser_only.py  |  1 +
 pandas/io/tests/parser/test_parsers.py        |  1 +
 pandas/io/tests/parser/test_unsupported.py    |  2 +
 pandas/io/tests/test_clipboard.py             |  2 +-
 pandas/io/tests/test_feather.py               |  1 -
 pandas/io/tests/test_gbq.py                   |  3 ++
 pandas/io/tests/test_packers.py               |  4 +-
 pandas/io/tests/test_pickle.py                |  1 -
 pandas/io/tests/test_pytables.py              |  1 -
 pandas/msgpack/exceptions.py                  |  1 +
 pandas/sparse/array.py                        |  3 +-
 pandas/sparse/series.py                       |  1 +
 pandas/sparse/tests/test_arithmetics.py       |  2 -
 pandas/sparse/tests/test_array.py             |  2 +-
 pandas/sparse/tests/test_combine_concat.py    |  4 --
 pandas/sparse/tests/test_format.py            |  2 -
 pandas/sparse/tests/test_frame.py             |  2 +-
 pandas/sparse/tests/test_groupby.py           |  2 -
 pandas/sparse/tests/test_indexing.py          |  7 +--
 pandas/sparse/tests/test_libsparse.py         |  6 ---
 pandas/sparse/tests/test_list.py              |  2 -
 pandas/sparse/tests/test_pivot.py             |  2 -
 pandas/sparse/tests/test_series.py            |  2 +-
 pandas/stats/fama_macbeth.py                  |  1 +
 pandas/stats/ols.py                           |  1 +
 pandas/stats/tests/test_ols.py                |  8 ---
 pandas/tests/formats/test_format.py           | 35 +++++++------
 pandas/tests/formats/test_printing.py         |  2 -
 pandas/tests/formats/test_style.py            |  2 +-
 pandas/tests/frame/test_alter_axes.py         |  2 -
 pandas/tests/frame/test_analytics.py          |  2 -
 pandas/tests/frame/test_apply.py              |  2 -
 pandas/tests/frame/test_asof.py               |  1 -
 .../tests/frame/test_axis_select_reindex.py   |  2 -
 pandas/tests/frame/test_block_internals.py    |  2 -
 pandas/tests/frame/test_combine_concat.py     |  4 --
 pandas/tests/frame/test_constructors.py       |  4 --
 pandas/tests/frame/test_convert_to.py         |  2 -
 pandas/tests/frame/test_dtypes.py             |  4 --
 pandas/tests/frame/test_indexing.py           |  6 ---
 pandas/tests/frame/test_misc_api.py           |  4 --
 pandas/tests/frame/test_missing.py            |  2 -
 pandas/tests/frame/test_mutate_columns.py     |  2 -
 pandas/tests/frame/test_nonunique_indexes.py  |  2 -
 pandas/tests/frame/test_operators.py          |  2 -
 pandas/tests/frame/test_quantile.py           |  2 -
 pandas/tests/frame/test_query_eval.py         |  4 --
 pandas/tests/frame/test_replace.py            |  2 -
 pandas/tests/frame/test_repr_info.py          |  2 -
 pandas/tests/frame/test_reshape.py            |  2 -
 pandas/tests/frame/test_sorting.py            |  2 -
 pandas/tests/frame/test_subclass.py           |  2 -
 pandas/tests/frame/test_timeseries.py         |  6 +--
 pandas/tests/frame/test_to_csv.py             |  2 -
 pandas/tests/groupby/test_aggregate.py        |  2 -
 pandas/tests/groupby/test_categorical.py      |  2 -
 pandas/tests/groupby/test_filters.py          |  2 -
 pandas/tests/groupby/test_groupby.py          |  6 +--
 pandas/tests/indexes/datetimes/test_astype.py |  2 -
 .../indexes/datetimes/test_construction.py    |  2 -
 .../indexes/datetimes/test_date_range.py      |  1 -
 .../tests/indexes/datetimes/test_datetime.py  |  1 -
 .../indexes/datetimes/test_datetimelike.py    |  1 -
 .../tests/indexes/datetimes/test_indexing.py  |  1 -
 pandas/tests/indexes/datetimes/test_misc.py   |  1 -
 .../tests/indexes/datetimes/test_missing.py   |  1 -
 pandas/tests/indexes/datetimes/test_ops.py    |  1 -
 pandas/tests/indexes/datetimes/test_setops.py |  1 -
 pandas/tests/indexes/datetimes/test_tools.py  |  2 +-
 pandas/tests/indexes/test_base.py             |  4 +-
 pandas/tests/indexes/test_category.py         |  6 ++-
 pandas/tests/indexes/test_datetimelike.py     |  2 -
 pandas/tests/indexes/test_multi.py            | 21 ++++----
 pandas/tests/indexes/test_numeric.py          |  3 --
 pandas/tests/indexes/test_timedelta.py        |  1 -
 pandas/tests/indexing/test_callable.py        |  2 -
 pandas/tests/indexing/test_coercion.py        |  4 +-
 pandas/tests/indexing/test_indexing.py        |  2 -
 pandas/tests/indexing/test_indexing_slow.py   |  2 -
 pandas/tests/plotting/test_misc.py            |  2 +
 pandas/tests/scalar/test_timestamp.py         |  4 +-
 pandas/tests/series/test_alter_axes.py        |  2 -
 pandas/tests/series/test_analytics.py         |  2 -
 pandas/tests/series/test_apply.py             |  4 --
 pandas/tests/series/test_asof.py              |  1 -
 pandas/tests/series/test_combine_concat.py    |  4 --
 pandas/tests/series/test_constructors.py      |  2 -
 pandas/tests/series/test_datetime_values.py   |  6 +--
 pandas/tests/series/test_dtypes.py            |  2 -
 pandas/tests/series/test_indexing.py          |  4 +-
 pandas/tests/series/test_internals.py         |  2 -
 pandas/tests/series/test_io.py                |  6 ---
 pandas/tests/series/test_misc_api.py          |  2 -
 pandas/tests/series/test_missing.py           |  2 -
 pandas/tests/series/test_operators.py         |  2 -
 pandas/tests/series/test_replace.py           |  2 -
 pandas/tests/series/test_repr.py              |  2 -
 pandas/tests/series/test_sorting.py           |  2 -
 pandas/tests/series/test_subclass.py          |  4 --
 pandas/tests/series/test_timeseries.py        |  1 -
 pandas/tests/test_algos.py                    | 11 ----
 pandas/tests/test_categorical.py              | 50 +++++++++----------
 pandas/tests/test_common.py                   |  2 -
 pandas/tests/test_config.py                   |  1 -
 pandas/tests/test_expressions.py              |  2 -
 pandas/tests/test_generic.py                  |  2 -
 pandas/tests/test_internals.py                |  5 --
 pandas/tests/test_join.py                     |  1 -
 pandas/tests/test_multilevel.py               |  2 -
 pandas/tests/test_panel.py                    |  6 ---
 pandas/tests/test_panel4d.py                  |  8 ---
 pandas/tests/test_reshape.py                  |  2 -
 pandas/tests/test_stats.py                    |  1 -
 pandas/tests/test_strings.py                  |  2 -
 pandas/tests/test_take.py                     |  4 --
 pandas/tests/test_testing.py                  |  6 ---
 pandas/tests/test_util.py                     |  1 +
 pandas/tests/test_window.py                   |  2 -
 pandas/tests/types/test_cast.py               |  2 -
 pandas/tests/types/test_common.py             |  2 -
 pandas/tests/types/test_concat.py             |  2 -
 pandas/tests/types/test_dtypes.py             |  2 -
 pandas/tests/types/test_generic.py            |  2 -
 pandas/tests/types/test_inference.py          |  3 --
 pandas/tests/types/test_missing.py            |  2 -
 pandas/tools/tests/test_concat.py             |  2 -
 pandas/tools/tests/test_hashing.py            |  2 -
 pandas/tools/tests/test_join.py               |  2 -
 pandas/tools/tests/test_merge.py              |  2 -
 pandas/tools/tests/test_merge_asof.py         |  3 +-
 pandas/tools/tests/test_pivot.py              | 10 ++--
 pandas/tools/tests/test_tile.py               |  8 +--
 pandas/tseries/period.py                      |  2 +-
 pandas/tseries/tests/test_base.py             |  4 +-
 pandas/tseries/tests/test_bin_groupby.py      |  2 +-
 pandas/tseries/tests/test_converter.py        |  2 +
 pandas/tseries/tests/test_daterange.py        |  2 +
 pandas/tseries/tests/test_frequencies.py      |  1 +
 pandas/tseries/tests/test_holiday.py          |  4 ++
 pandas/tseries/tests/test_offsets.py          | 19 ++++---
 pandas/tseries/tests/test_period.py           |  3 ++
 pandas/tseries/tests/test_resample.py         | 11 ++--
 pandas/tseries/tests/test_timedeltas.py       |  3 +-
 .../tseries/tests/test_timeseries_legacy.py   |  2 -
 pandas/tseries/tests/test_timezones.py        |  4 +-
 pandas/tseries/tests/test_tslib.py            |  3 ++
 pandas/types/generic.py                       |  1 +
 pandas/util/clipboard/__init__.py             |  2 +-
 pandas/util/clipboard/clipboards.py           |  1 +
 pandas/util/clipboard/exceptions.py           |  1 +
 pandas/util/clipboard/windows.py              |  4 +-
 pandas/util/decorators.py                     |  2 +
 pandas/util/depr_module.py                    |  1 +
 pandas/util/testing.py                        |  5 +-
 183 files changed, 229 insertions(+), 418 deletions(-)

diff --git a/pandas/__init__.py b/pandas/__init__.py
index 2d91c97144e3c..9133e11beaa2b 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -15,7 +15,8 @@
         missing_dependencies.append(dependency)

 if missing_dependencies:
-    raise ImportError("Missing required dependencies {0}".format(missing_dependencies))
+    raise ImportError(
+        "Missing required dependencies {0}".format(missing_dependencies))
 del hard_dependencies, dependency, missing_dependencies

 # numpy compat
@@ -24,7 +25,8 @@
 try:
     from pandas import hashtable, tslib, lib
 except ImportError as e:  # pragma: no cover
-    module = str(e).lstrip('cannot import name ')  # hack but overkill to use re
+    # hack but overkill to use re
+    module = str(e).lstrip('cannot import name ')
     raise ImportError("C extension: {0} not built. If you want to import "
                       "pandas from the source directory, you may need to run "
                       "'python setup.py build_ext --inplace --force' to build "
@@ -61,5 +63,5 @@
 # use the closest tagged version if possible
 from ._version import get_versions
 v = get_versions()
-__version__ = v.get('closest-tag',v['version'])
+__version__ = v.get('closest-tag', v['version'])
 del get_versions, v
diff --git a/pandas/_version.py b/pandas/_version.py
index 77b2fdca59576..d764923fd7247 100644
--- a/pandas/_version.py
+++ b/pandas/_version.py
@@ -157,7 +157,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     # "stabilization", as well as "HEAD" and "master".
     tags = set([r for r in refs if re.search(r'\d', r)])
     if verbose:
-        print("discarding '%s', no digits" % ",".join(refs-tags))
+        print("discarding '%s', no digits" % ",".join(refs - tags))
     if verbose:
         print("likely tags: %s" % ",".join(sorted(tags)))
     for ref in sorted(tags):
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
index f925fd792f9ca..02165d82d4232 100644
--- a/pandas/api/tests/test_api.py
+++ b/pandas/api/tests/test_api.py
@@ -8,8 +8,6 @@
 from pandas.api import types
 from pandas.util import testing as tm

-_multiprocess_can_split_ = True
-

 class Base(object):

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 532f960468204..7ebdd9735b967 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -79,25 +79,25 @@ def signature(f):
         args = [
             p.name for p in sig.parameters.values()
             if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
-            ]
+        ]
         varargs = [
             p.name for p in sig.parameters.values()
             if p.kind == inspect.Parameter.VAR_POSITIONAL
-            ]
+        ]
         varargs = varargs[0] if varargs else None
         keywords = [
             p.name for p in sig.parameters.values()
             if p.kind == inspect.Parameter.VAR_KEYWORD
-            ]
+        ]
         keywords = keywords[0] if keywords else None
         defaults = [
             p.default for p in sig.parameters.values()
             if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
             and p.default is not p.empty
-            ] or None
-        argspec = namedtuple('Signature',['args','defaults',
-                                          'varargs','keywords'])
-        return argspec(args,defaults,varargs,keywords)
+        ] or None
+        argspec = namedtuple('Signature', ['args', 'defaults',
+                                           'varargs', 'keywords'])
+        return argspec(args, defaults, varargs, keywords)

 # have to explicitly put builtins into the namespace
 range = range
@@ -170,7 +170,7 @@ def iterkeys(obj, **kw):
         return obj.iterkeys(**kw)

     def itervalues(obj, **kw):
         return obj.itervalues(**kw)

-    next = lambda it : it.next()
+    next = lambda it: it.next()
 else:
     def iteritems(obj, **kw):
         return iter(obj.items(**kw))
@@ -183,6 +183,7 @@ def itervalues(obj, **kw):

     next = next

+
 def bind_method(cls, name, func):
     """Bind a method to class, python 2 and python 3 compatible.

@@ -307,7 +308,8 @@ def set_function_name(f, name, cls):
         f.__name__ = name
         return f

-    class ResourceWarning(Warning): pass
+    class ResourceWarning(Warning):
+        pass

 string_and_binary_types = string_types + (binary_type,)

@@ -398,14 +400,18 @@ def is_platform_little_endian():
     """ am I little endian """
     return sys.byteorder == 'little'

+
 def is_platform_windows():
     return sys.platform == 'win32' or sys.platform == 'cygwin'

+
 def is_platform_linux():
     return sys.platform == 'linux2'

+
 def is_platform_mac():
     return sys.platform == 'darwin'

+
 def is_platform_32bit():
     return struct.calcsize("P") * 8 < 64
diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py
index 9edd2ef056a52..cf1cad5694570 100644
--- a/pandas/compat/chainmap.py
+++ b/pandas/compat/chainmap.py
@@ -5,6 +5,7 @@


 class DeepChainMap(ChainMap):
+
     def __setitem__(self, key, value):
         for mapping in self.maps:
             if key in mapping:
diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py
index 895a376457f09..72e89586d0280 100644
--- a/pandas/compat/numpy/function.py
+++ b/pandas/compat/numpy/function.py
@@ -27,6 +27,7 @@


 class CompatValidator(object):
+
     def __init__(self, defaults, fname=None, method=None,
                  max_fname_arg_count=None):
         self.fname = fname
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
index 7ed9e7ff90bd8..1cdf8afd563c6 100644
--- a/pandas/compat/pickle_compat.py
+++ b/pandas/compat/pickle_compat.py
@@ -9,6 +9,7 @@
 from pandas import compat, Index
 from pandas.compat import u, string_types

+
 def load_reduce(self):
     stack = self.stack
     args = stack.pop()
@@ -34,7 +35,7 @@ def load_reduce(self):
             pass

     # try to reencode the arguments
-    if getattr(self,'encoding',None) is not None:
+    if getattr(self, 'encoding', None) is not None:
         args = tuple([arg.encode(self.encoding)
                       if isinstance(arg, string_types)
                       else arg for arg in args])
@@ -44,7 +45,7 @@ def load_reduce(self):
         except:
             pass

-    if getattr(self,'is_verbose',None):
+    if getattr(self, 'is_verbose', None):
         print(sys.exc_info())
         print(func, args)
         raise
@@ -61,6 +62,7 @@ class Unpickler(pkl.Unpickler):
 Unpickler.dispatch = copy.copy(Unpickler.dispatch)
 Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce

+
 def load_newobj(self):
     args = self.stack.pop()
     cls = self.stack[-1]
@@ -75,6 +77,8 @@ def load_newobj(self):
 Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj

 # py3 compat
+
+
 def load_newobj_ex(self):
     kwargs = self.stack.pop()
     args = self.stack.pop()
@@ -91,6 +95,7 @@ def load_newobj_ex(self):
     except:
         pass

+
 def load(fh, encoding=None, compat=False, is_verbose=False):
     """load a pickle, with a provided encoding

diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
index dbac72c619a52..aa05626af9175 100644
--- a/pandas/computation/tests/test_eval.py
+++ b/pandas/computation/tests/test_eval.py
@@ -201,7 +201,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2):
                                          binop=binop,
                                          cmp2=cmp2)
         scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or
-                                cmp2 in skip_these))
+                                                    cmp2 in skip_these))
         if scalar_with_in_notin:
             with tm.assertRaises(TypeError):
                 pd.eval(ex, engine=self.engine, parser=self.parser)
@@ -702,7 +702,6 @@ def test_float_truncation(self):
         tm.assert_frame_equal(expected, result)


-
 class TestEvalNumexprPython(TestEvalNumexprPandas):

     @classmethod
@@ -782,6 +781,7 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):

 # typecasting rules consistency with python
 # issue #12388
+
 class TestTypeCasting(object):

     def check_binop_typecasting(self, engine, parser, op, dt):
@@ -803,7 +803,8 @@ def test_binop_typecasting(self): for engine, parser in ENGINES_PARSERS: for op in ['+', '-', '*', '**', '/']: # maybe someday... numexpr has too many upcasting rules now - #for dt in chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])): + # for dt in chain(*(np.sctypes[x] for x in ['uint', 'int', + # 'float'])): for dt in [np.float32, np.float64]: yield self.check_binop_typecasting, engine, parser, op, dt @@ -1969,10 +1970,11 @@ def test_negate_lt_eq_le(): for engine, parser in product(_engines, expr._parsers): yield check_negate_lt_eq_le, engine, parser + class TestValidate(tm.TestCase): def test_validate_bool_args(self): - invalid_values = [1, "True", [1,2,3], 5.0] + invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: with self.assertRaises(ValueError): diff --git a/pandas/core/base.py b/pandas/core/base.py index e7a79c3291a92..657da859ddde2 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -230,6 +230,7 @@ def f(self, *args, **kwargs): class AccessorProperty(object): """Descriptor for implementing accessor properties like Series.str """ + def __init__(self, accessor_cls, construct_accessor): self.accessor_cls = accessor_cls self.construct_accessor = construct_accessor @@ -651,6 +652,7 @@ class GroupByMixin(object): @staticmethod def _dispatch(name, *args, **kwargs): """ dispatch to apply """ + def outer(self, *args, **kwargs): def f(x): x = self._shallow_copy(x, groupby=self._groupby) diff --git a/pandas/core/config.py b/pandas/core/config.py index 618de4e02b56f..ed63c865ebfb4 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -215,6 +215,7 @@ def __dir__(self): class CallableDynamicDoc(object): + def __init__(self, func, doc_tmpl): self.__doc_tmpl__ = doc_tmpl self.__func__ = func diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cc81c66100a6f..79bdad82af5a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5326,9 +5326,10 @@ def isin(self, values): "allowed to be passed to DataFrame.isin(), " "you passed a " "{0!r}".format(type(values).__name__)) - return DataFrame(lib.ismember(self.values.ravel(), + return DataFrame( + lib.ismember(self.values.ravel(), set(values)).reshape(self.shape), self.index, - self.columns) + self.columns) # ---------------------------------------------------------------------- # Deprecated stuff diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 40050d6d769a6..6bb2d1c479844 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -42,6 +42,7 @@ def get_indexers_list(): # the public IndexSlicerMaker class _IndexSlice(object): + def __getitem__(self, arg): return arg diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 289ce150eb46b..f0b1516d786c6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5122,6 +5122,7 @@ def trim_join_unit(join_unit, length): class JoinUnit(object): + def __init__(self, block, shape, indexers=None): # Passing shape explicitly is required for cases when block is None. 
if indexers is None: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1f76bc850cee9..0cc3a2d039b5e 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -26,6 +26,7 @@ class disallow(object): + def __init__(self, *dtypes): super(disallow, self).__init__() self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes) @@ -58,6 +59,7 @@ def _f(*args, **kwargs): class bottleneck_switch(object): + def __init__(self, zero_value=None, **kwargs): self.zero_value = zero_value self.kwargs = kwargs diff --git a/pandas/core/panel.py b/pandas/core/panel.py index a11ef53de1af9..6da10305eb4fc 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1560,6 +1560,7 @@ def f(self, other, axis=0): # legacy class WidePanel(Panel): + def __init__(self, *args, **kwargs): # deprecation, #10892 warnings.warn("WidePanel is deprecated. Please use Panel", @@ -1569,6 +1570,7 @@ def __init__(self, *args, **kwargs): class LongPanel(DataFrame): + def __init__(self, *args, **kwargs): # deprecation, #10892 warnings.warn("LongPanel is deprecated. Please use DataFrame", diff --git a/pandas/core/series.py b/pandas/core/series.py index 9845e1cd4ad47..43f16f690692a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2987,6 +2987,7 @@ def create_from_value(value, index, dtype): # backwards compatiblity class TimeSeries(Series): + def __init__(self, *args, **kwargs): # deprecation TimeSeries, #10890 warnings.warn("TimeSeries is deprecated. Please use Series", diff --git a/pandas/core/window.py b/pandas/core/window.py index bda134dd8a2a4..50de6b84d7cba 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -659,6 +659,7 @@ def f(x, name=name, *args): class _Rolling(_Window): + @property def _constructor(self): return Rolling @@ -1718,7 +1719,7 @@ def dataframe_from_int_dict(data, frame_template): def _get_center_of_mass(com, span, halflife, alpha): valid_count = len([x for x in [com, span, halflife, alpha] - if x is not None]) + if x is not None]) if valid_count > 1: raise ValueError("com, span, halflife, and alpha " "are mutually exclusive") diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 3bac7d2821760..439b96d650204 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -89,6 +89,7 @@ class CategoricalFormatter(object): + def __init__(self, categorical, buf=None, length=True, na_rep='NaN', footer=True): self.categorical = categorical @@ -142,6 +143,7 @@ def to_string(self): class SeriesFormatter(object): + def __init__(self, series, buf=None, length=True, header=True, index=True, na_rep='NaN', name=False, float_format=None, dtype=True, max_rows=None): @@ -272,6 +274,7 @@ def to_string(self): class TextAdjustment(object): + def __init__(self): self.encoding = get_option("display.encoding") @@ -287,6 +290,7 @@ def adjoin(self, space, *lists, **kwargs): class EastAsianTextAdjustment(TextAdjustment): + def __init__(self): super(EastAsianTextAdjustment, self).__init__() if get_option("display.unicode.ambiguous_as_wide"): @@ -1366,6 +1370,7 @@ def _get_level_lengths(levels, sentinel=''): class CSVFormatter(object): + def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, @@ -1950,6 +1955,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', class GenericArrayFormatter(object): + def __init__(self, values, digits=7, formatter=None, na_rep='NaN', space=12, float_format=None, justify='right', 
decimal='.', quoting=None, fixed_width=True): @@ -2151,6 +2157,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): + def _format_strings(self): formatter = self.formatter or (lambda x: '% d' % x) fmt_values = [formatter(x) for x in self.values] @@ -2158,6 +2165,7 @@ def _format_strings(self): class Datetime64Formatter(GenericArrayFormatter): + def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): super(Datetime64Formatter, self).__init__(values, **kwargs) self.nat_rep = nat_rep @@ -2183,6 +2191,7 @@ def _format_strings(self): class PeriodArrayFormatter(IntArrayFormatter): + def _format_strings(self): from pandas.tseries.period import IncompatibleFrequency try: @@ -2197,6 +2206,7 @@ def _format_strings(self): class CategoricalArrayFormatter(GenericArrayFormatter): + def __init__(self, values, *args, **kwargs): GenericArrayFormatter.__init__(self, values, *args, **kwargs) @@ -2328,6 +2338,7 @@ def _get_format_datetime64_from_values(values, date_format): class Datetime64TZFormatter(Datetime64Formatter): + def _format_strings(self): """ we by definition have a TZ """ @@ -2342,6 +2353,7 @@ def _format_strings(self): class Timedelta64Formatter(GenericArrayFormatter): + def __init__(self, values, nat_rep='NaT', box=False, **kwargs): super(Timedelta64Formatter, self).__init__(values, **kwargs) self.nat_rep = nat_rep diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index dcd565ee5f0e9..bb2941a121452 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -168,8 +168,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif isinstance(data, (np.ndarray, Index, ABCSeries)): if (is_datetime64_any_dtype(data) or - (dtype is not None and is_datetime64_any_dtype(dtype)) or - 'tz' in kwargs): + (dtype is not None and is_datetime64_any_dtype(dtype)) or + 'tz' in kwargs): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) @@ -3606,7 +3606,7 @@ def _validate_for_numeric_binop(self, other, op, opstr): typ=type(other)) ) elif isinstance(other, np.ndarray) and not other.ndim: - other = other.item() + other = other.item() if isinstance(other, (Index, ABCSeries, np.ndarray)): if len(self) != len(other): diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 00ead012a916a..d2469cf1a3eed 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1813,7 +1813,7 @@ def partial_selection(key, indexer=None): for k, l in zip(key, self.levels)] can_index_exactly = any(all_dates) if (any([l.is_all_dates - for k, l in zip(key, self.levels)]) and + for k, l in zip(key, self.levels)]) and not can_index_exactly): indexer = self.get_loc(key) diff --git a/pandas/io/common.py b/pandas/io/common.py index fd8e69675329f..c2e4301a23731 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -116,6 +116,7 @@ class BaseIterator(object): """Subclass this and provide a "__next__()" method to obtain an iterator. Useful only when the object being iterated is non-reusable (e.g. 
OK for a parser, not for an in-memory table, yes for its iterator).""" + def __iter__(self): return self diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index d29f4a371dd4d..6fc766081eefe 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -23,8 +23,8 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', default_handler=None, lines=False): if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index aa80954233682..d684441c5974d 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -89,7 +89,6 @@ def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise'): - """ "Normalize" semi-structured JSON data into a flat table diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 29e7f131fd9bc..3e4d9c9024dbd 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -6,7 +6,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, chunksize=None, iterator=False): - """ Read SAS files stored as either XPORT or SAS7BDAT format files. diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 73edda90720af..11073f3f108ba 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -18,6 +18,7 @@ class CParserTests(object): + def test_buffer_overflow(self): # see gh-9205: test certain malformed input files that cause # buffer overflows in tokenizer.c @@ -375,13 +376,13 @@ def test_internal_null_byte(self): def test_read_nrows_large(self): # gh-7626 - Read only nrows of data in for large inputs (>262144b) header_narrow = '\t'.join(['COL_HEADER_' + str(i) - for i in range(10)]) + '\n' + for i in range(10)]) + '\n' data_narrow = '\t'.join(['somedatasomedatasomedata1' - for i in range(10)]) + '\n' + for i in range(10)]) + '\n' header_wide = '\t'.join(['COL_HEADER_' + str(i) - for i in range(15)]) + '\n' + for i in range(15)]) + '\n' data_wide = '\t'.join(['somedatasomedatasomedata2' - for i in range(15)]) + '\n' + for i in range(15)]) + '\n' test_input = (header_narrow + data_narrow * 1050 + header_wide + data_wide * 2) diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index e95617faf2071..308ca6e8a5a2c 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -11,6 +11,7 @@ class CompressionTests(object): + def test_zip(self): try: import zipfile diff --git a/pandas/io/tests/parser/converters.py b/pandas/io/tests/parser/converters.py index 68231d67534ee..2ceaff9291e7e 100644 --- a/pandas/io/tests/parser/converters.py +++ b/pandas/io/tests/parser/converters.py @@ -19,6 +19,7 @@ class ConverterTests(object): + def test_converters_type_must_be_dict(self): data = """index,A,B,C,D foo,2,3,4,5 diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index abcd14e9499cb..fa95c18c4d7a9 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -16,6 +16,7 @@ class DtypeTests(object): + def test_passing_dtype(self): # see gh-6607 df = DataFrame(np.random.rand(5, 2).round(4), columns=list( diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index 
e4af1ff70a498..ad3d5f2382a49 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -25,6 +25,7 @@ class ParseDatesTests(object): + def test_separator_date_conflict(self): # Regression test for gh-4678: make sure thousands separator and # date parsing do not conflict. diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index ad62aaa275127..283ff366b5efd 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -18,6 +18,7 @@ class PythonParserTests(object): + def test_negative_skipfooter_raises(self): text = """#foo,a,b,c #foo,a,b,c diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 93b5fdcffed4c..2ae557a7d57db 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -32,6 +32,7 @@ class BaseParser(CommentTests, CompressionTests, ParseDatesTests, ParserTests, SkipRowsTests, UsecolsTests, QuotingTests, DtypeTests): + def read_csv(self, *args, **kwargs): raise NotImplementedError diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index e941c9186cd6a..999db47cf2eaf 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -18,6 +18,7 @@ class TestUnsupportedFeatures(tm.TestCase): + def test_mangle_dupe_cols_false(self): # see gh-12935 data = 'a b c\n1 2 3' @@ -111,6 +112,7 @@ def test_python_engine(self): class TestDeprecatedFeatures(tm.TestCase): + def test_deprecated_args(self): data = '1,2,3' diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index 93d14077aeacf..98a4152754b55 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -54,7 +54,7 @@ def setUpClass(cls): 'es': 'en español'.split()}) # unicode round trip test for GH 13747, GH 12529 cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], - 'b': ['øπ∆˚¬', 'œ∑´®']}) + 'b': ['øπ∆˚¬', 'œ∑´®']}) cls.data_types = list(cls.data.keys()) @classmethod diff --git a/pandas/io/tests/test_feather.py b/pandas/io/tests/test_feather.py index dcb057ec30004..218175e5ef527 100644 --- a/pandas/io/tests/test_feather.py +++ b/pandas/io/tests/test_feather.py @@ -18,7 +18,6 @@ class TestFeather(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): pass diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index ac481a44de5e8..0507f0d89661c 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -294,6 +294,7 @@ def test_get_application_default_credentials_returns_credentials(self): class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase): + def setUp(self): _setup_common() @@ -325,6 +326,7 @@ def test_should_be_able_to_get_results_from_query(self): class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase): + def setUp(self): _setup_common() @@ -356,6 +358,7 @@ def test_should_be_able_to_get_results_from_query(self): class GBQUnitTests(tm.TestCase): + def setUp(self): _setup_common() diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 6b368bb2bb5ce..8a0cfb92bd3c0 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -40,8 +40,6 @@ else: _ZLIB_INSTALLED = True -_multiprocess_can_split_ = False - def check_arbitrary(a, b): @@ -870,7 +868,7 @@ def read_msgpacks(self, version): for f in os.listdir(pth): # GH12142 0.17 
files packed in P2 can't be read in P3 if (compat.PY3 and version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): + f.split('.')[-4][-1] == '2'): continue vf = os.path.join(pth, f) try: diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index b5c316b326b8d..73a9173e85906 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -30,7 +30,6 @@ class TestPickle(): http://stackoverflow.com/questions/6689537/ nose-test-generators-inside-class """ - _multiprocess_can_split_ = True def setUp(self): from pandas.io.tests.generate_legacy_storage_files import ( diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index f4f03856f94e2..501e744ad308c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -50,7 +50,6 @@ _default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' else 'zlib') -_multiprocess_can_split_ = False # testing on windows/py3 seems to fault # for using compression diff --git a/pandas/msgpack/exceptions.py b/pandas/msgpack/exceptions.py index 40f5a8af8f583..ae0f74a6700bd 100644 --- a/pandas/msgpack/exceptions.py +++ b/pandas/msgpack/exceptions.py @@ -15,6 +15,7 @@ class UnpackValueError(UnpackException, ValueError): class ExtraData(ValueError): + def __init__(self, unpacked, extra): self.unpacked = unpacked self.extra = extra diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index da13726e88a14..c65e0dd5c9f7b 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -239,7 +239,7 @@ def _simple_new(cls, data, sp_index, fill_value): fill_value = na_value_for_dtype(data.dtype) if (is_integer_dtype(data) and is_float(fill_value) and - sp_index.ngaps > 0): + sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float data = data.astype(float) @@ -405,7 +405,6 @@ def __iter__(self): yield self._get_val_at(i) def __getitem__(self, key): - """ """ diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index d6bc892921c42..2d3a9effe6939 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -847,6 +847,7 @@ def from_coo(cls, A, dense_index=False): # backwards compatiblity class SparseTimeSeries(SparseSeries): + def __init__(self, *args, **kwargs): # deprecation TimeSeries, #10890 warnings.warn("SparseTimeSeries is deprecated. 
Please use " diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index f24244b38c42b..eb926082a7b7c 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -5,8 +5,6 @@ class TestSparseArrayArithmetics(tm.TestCase): - _multiprocess_can_split_ = True - _base = np.array _klass = pd.SparseArray diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 55f292a8a231a..70aaea5b5b1f0 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -14,7 +14,6 @@ class TestSparseArray(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.arr_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) @@ -655,6 +654,7 @@ def test_fillna_overlap(self): class TestSparseArrayAnalytics(tm.TestCase): + def test_sum(self): data = np.arange(10).astype(float) out = SparseArray(data).sum() diff --git a/pandas/sparse/tests/test_combine_concat.py b/pandas/sparse/tests/test_combine_concat.py index 5240d592810ad..81655daec6164 100644 --- a/pandas/sparse/tests/test_combine_concat.py +++ b/pandas/sparse/tests/test_combine_concat.py @@ -7,8 +7,6 @@ class TestSparseSeriesConcat(tm.TestCase): - _multiprocess_can_split_ = True - def test_concat(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) @@ -126,8 +124,6 @@ def test_concat_sparse_dense(self): class TestSparseDataFrameConcat(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.dense1 = pd.DataFrame({'A': [0., 1., 2., np.nan], diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py index 377eaa20565a2..0c0e773d19bb9 100644 --- a/pandas/sparse/tests/test_format.py +++ b/pandas/sparse/tests/test_format.py @@ -15,8 +15,6 @@ class TestSparseSeriesFormatting(tm.TestCase): - _multiprocess_can_split_ = True - @property def dtype_format_for_platform(self): return '' if use_32bit_repr else ', dtype=int32' diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index e26c0ed1afe58..e3b865492c043 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -22,7 +22,6 @@ class TestSparseDataFrame(tm.TestCase, SharedWithSparse): klass = SparseDataFrame - _multiprocess_can_split_ = True def setUp(self): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], @@ -1150,6 +1149,7 @@ def test_comparison_op_scalar(self): class TestSparseDataFrameAnalytics(tm.TestCase): + def setUp(self): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/sparse/tests/test_groupby.py index 0cb33f4ea0a56..23bea94a2aef8 100644 --- a/pandas/sparse/tests/test_groupby.py +++ b/pandas/sparse/tests/test_groupby.py @@ -6,8 +6,6 @@ class TestSparseGroupBy(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index a634c34139186..c400b68c8a7d8 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -8,8 +8,6 @@ class TestSparseSeriesIndexing(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) self.sparse = self.orig.to_sparse() @@ -431,8 +429,6 @@ def tests_indexing_with_sparse(self): class 
TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): - _multiprocess_can_split_ = True - def setUp(self): # Mi with duplicated values idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), @@ -544,8 +540,6 @@ def test_loc_slice(self): class TestSparseDataFrameIndexing(tm.TestCase): - _multiprocess_can_split_ = True - def test_getitem(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], @@ -908,6 +902,7 @@ def test_reindex_fill_value(self): class TestMultitype(tm.TestCase): + def setUp(self): self.cols = ['string', 'int', 'float', 'object'] diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index b3aa3368e9455..491005db2ae79 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -241,8 +241,6 @@ def test_intersect_identical(self): class TestSparseIndexCommon(tm.TestCase): - _multiprocess_can_split_ = True - def test_int_internal(self): idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') self.assertIsInstance(idx, IntIndex) @@ -391,8 +389,6 @@ def _check(index): class TestBlockIndex(tm.TestCase): - _multiprocess_can_split_ = True - def test_block_internal(self): idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') self.assertIsInstance(idx, BlockIndex) @@ -478,8 +474,6 @@ def test_to_block_index(self): class TestIntIndex(tm.TestCase): - _multiprocess_can_split_ = True - def test_int_internal(self): idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') self.assertIsInstance(idx, IntIndex) diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py index 458681cdc1de0..8511cd5997368 100644 --- a/pandas/sparse/tests/test_list.py +++ b/pandas/sparse/tests/test_list.py @@ -10,8 +10,6 @@ class TestSparseList(unittest.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/sparse/tests/test_pivot.py index 482a99a96194f..4ff9f20093c67 100644 --- a/pandas/sparse/tests/test_pivot.py +++ b/pandas/sparse/tests/test_pivot.py @@ -5,8 +5,6 @@ class TestPivotTable(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index b34f5dd2cee9f..db6ae14b096d3 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -56,7 +56,6 @@ def _test_data2_zero(): class TestSparseSeries(tm.TestCase, SharedWithSparse): - _multiprocess_can_split_ = True def setUp(self): arr, index = _test_data1() @@ -941,6 +940,7 @@ def test_combine_first(self): class TestSparseHandlingMultiIndexes(tm.TestCase): + def setUp(self): miindex = pd.MultiIndex.from_product( [["x", "y"], ["10", "20"]], names=['row-foo', 'row-bar']) diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py index f7d50e8e72a5c..d564f9cb6c425 100644 --- a/pandas/stats/fama_macbeth.py +++ b/pandas/stats/fama_macbeth.py @@ -9,6 +9,7 @@ # flake8: noqa + def fama_macbeth(**kwargs): """Runs Fama-MacBeth regression. diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py index b533d255bd196..96ec70d59488a 100644 --- a/pandas/stats/ols.py +++ b/pandas/stats/ols.py @@ -24,6 +24,7 @@ _FP_ERR = 1e-8 + class OLS(StringMixin): """ Runs a full sample ordinary least squares regression. 
diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 09fa21d58ea9d..b90c51366c86f 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -60,8 +60,6 @@ def _compare_moving_ols(model1, model2): class TestOLS(BaseTest): - _multiprocess_can_split_ = True - # TODO: Add tests for OLS y predict # TODO: Right now we just check for consistency between full-sample and # rolling/expanding results of the panel OLS. We should also cross-check @@ -262,8 +260,6 @@ def test_ols_object_dtype(self): class TestOLSMisc(tm.TestCase): - _multiprocess_can_split_ = True - """ For test coverage with faux data """ @@ -511,8 +507,6 @@ def test_columns_tuples_summary(self): class TestPanelOLS(BaseTest): - _multiprocess_can_split_ = True - FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat', 'p_value', 'r2', 'r2_adj', 'rmse', 'std_err', 't_stat', 'var_beta'] @@ -894,8 +888,6 @@ def _period_slice(panelModel, i): class TestOLSFilter(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): date_index = date_range(datetime(2009, 12, 11), periods=3, freq=offsets.BDay()) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 7a2c5f3b7f7c1..a9553d9ea10cb 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -113,7 +113,6 @@ def has_expanded_repr(df): class TestDataFrameFormatting(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.warn_filters = warnings.filters @@ -762,14 +761,15 @@ def test_truncate_with_different_dtypes(self): # 11594 import datetime - s = Series([datetime.datetime(2012, 1, 1)]*10 + [datetime.datetime(1012,1,2)] + [datetime.datetime(2012, 1, 3)]*10) + s = Series([datetime.datetime(2012, 1, 1)] * 10 + + [datetime.datetime(1012, 1, 2)] + [datetime.datetime(2012, 1, 3)] * 10) with pd.option_context('display.max_rows', 8): result = str(s) self.assertTrue('object' in result) # 12045 - df = DataFrame({'text': ['some words'] + [None]*9}) + df = DataFrame({'text': ['some words'] + [None] * 9}) with pd.option_context('display.max_rows', 8, 'display.max_columns', 3): result = str(df) @@ -779,7 +779,8 @@ def test_truncate_with_different_dtypes(self): def test_datetimelike_frame(self): # GH 12211 - df = DataFrame({'date' : [pd.Timestamp('20130101').tz_localize('UTC')] + [pd.NaT]*5}) + df = DataFrame( + {'date': [pd.Timestamp('20130101').tz_localize('UTC')] + [pd.NaT] * 5}) with option_context("display.max_rows", 5): result = str(df) @@ -1219,8 +1220,8 @@ def test_to_html_multiindex_odd_even_truncate(self): mi = MultiIndex.from_product([[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]], - names=['a','b','c']) - df = DataFrame({'n' : range(len(mi))}, index = mi) + names=['a', 'b', 'c']) + df = DataFrame({'n': range(len(mi))}, index=mi) result = df.to_html(max_rows=60) expected = """\ @@ -3451,8 +3452,8 @@ def test_to_latex_with_formatters(self): 'float': [1.0, 2.0, 3.0], 'object': [(1, 2), True, False], 'datetime64': [datetime(2016, 1, 1), - datetime(2016, 2, 5), - datetime(2016, 3, 3)]}) + datetime(2016, 2, 5), + datetime(2016, 3, 3)]}) formatters = {'int': lambda x: '0x%x' % x, 'float': lambda x: '[% 4.1f]' % x, @@ -3896,7 +3897,7 @@ def test_to_csv_date_format(self): def test_to_csv_multi_index(self): # see gh-6618 - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1],[2]])) + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) exp = ",1\n,2\n0,1\n" self.assertEqual(df.to_csv(), exp) @@ -3904,8 +3905,8 @@ def 
test_to_csv_multi_index(self): exp = "1\n2\n1\n" self.assertEqual(df.to_csv(index=False), exp) - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1],[2]]), - index=pd.MultiIndex.from_arrays([[1],[2]])) + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]), + index=pd.MultiIndex.from_arrays([[1], [2]])) exp = ",,1\n,,2\n1,2,1\n" self.assertEqual(df.to_csv(), exp) @@ -3913,7 +3914,8 @@ def test_to_csv_multi_index(self): exp = "1\n2\n1\n" self.assertEqual(df.to_csv(index=False), exp) - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([['foo'],['bar']])) + df = DataFrame( + [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']])) exp = ",foo\n,bar\n0,1\n" self.assertEqual(df.to_csv(), exp) @@ -3938,8 +3940,6 @@ def test_period(self): class TestSeriesFormatting(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.ts = tm.makeTimeSeries() @@ -4452,7 +4452,6 @@ def test_to_string_header(self): class TestEngFormatter(tm.TestCase): - _multiprocess_can_split_ = True def test_eng_float_formatter(self): df = DataFrame({'A': [1.41, 141., 14100, 1410000.]}) @@ -4605,9 +4604,9 @@ def test_nan(self): result = formatter(np.nan) self.assertEqual(result, u('NaN')) - df = pd.DataFrame({'a':[1.5, 10.3, 20.5], - 'b':[50.3, 60.67, 70.12], - 'c':[100.2, 101.33, 120.33]}) + df = pd.DataFrame({'a': [1.5, 10.3, 20.5], + 'b': [50.3, 60.67, 70.12], + 'c': [100.2, 101.33, 120.33]}) pt = df.pivot_table(values='a', index='b', columns='c') fmt.set_eng_float_format(accuracy=1) result = pt.to_string() diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index d1eb1faecc401..1e6794c1c9c69 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -5,8 +5,6 @@ import pandas.util.testing as tm import pandas.core.config as cf -_multiprocess_can_split_ = True - def test_adjoin(): data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py index 2fec04b9c1aa3..eaa209178b2e9 100644 --- a/pandas/tests/formats/test_style.py +++ b/pandas/tests/formats/test_style.py @@ -660,7 +660,7 @@ def test_mi_sparse_disabled(self): with pd.option_context('display.multi_sparse', False): df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays([['a', 'a'], - [0, 1]])) + [0, 1]])) result = df.style._translate() body = result['body'] for row in body: diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index cab627dec63cb..e84bb6407fafc 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -22,8 +22,6 @@ class TestDataFrameAlterAxes(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_set_index(self): idx = Index(np.arange(len(self.mixed_frame))) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 0dbb78ec89b2e..a55d2cfb2fb2b 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -25,8 +25,6 @@ class TestDataFrameAnalytics(tm.TestCase, TestData): - _multiprocess_can_split_ = True - # ---------------------------------------------------------------------= # Correlation and covariance diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 19fa98afd2163..30fde4b5b78d8 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -19,8 +19,6 @@ class TestDataFrameApply(tm.TestCase, TestData): - 
_multiprocess_can_split_ = True - def test_apply(self): with np.errstate(all='ignore'): # ufunc diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py index 323960d54a42c..8bb26d3d7474c 100644 --- a/pandas/tests/frame/test_asof.py +++ b/pandas/tests/frame/test_asof.py @@ -11,7 +11,6 @@ class TestFrameAsof(TestData, tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.N = N = 50 diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index ff6215531fc64..839ceb5368240 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -26,8 +26,6 @@ class TestDataFrameSelectReindex(tm.TestCase, TestData): # These are specific reindex-based tests; other indexing tests should go in # test_indexing - _multiprocess_can_split_ = True - def test_drop_names(self): df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=['a', 'b', 'c'], diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 33550670720c3..7b64dea8c102d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -29,8 +29,6 @@ class TestDataFrameBlockInternals(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_cast_internals(self): casted = DataFrame(self.frame._data, dtype=int) expected = DataFrame(self.frame._series, dtype=int) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 1167662b69375..eed4d6261d6e8 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -22,8 +22,6 @@ class TestDataFrameConcatCommon(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_concat_multiple_frames_dtypes(self): # GH 2759 @@ -427,8 +425,6 @@ def test_concat_axis_parameter(self): class TestDataFrameCombineFirst(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_combine_first_mixed(self): a = Series(['a', 'b'], index=lrange(2)) b = Series(lrange(2), index=lrange(2)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1676c57a274cd..66a235e1260bd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -36,8 +36,6 @@ class TestDataFrameConstructors(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_constructor(self): df = DataFrame() self.assertEqual(len(df.index), 0) @@ -1886,8 +1884,6 @@ def test_from_records_len0_with_columns(self): class TestDataFrameConstructorWithDatetimeTZ(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_from_dict(self): # 8260 diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 53083a602e183..1bc8313726d0c 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -16,8 +16,6 @@ class TestDataFrameConvertTo(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_to_dict(self): test_data = { 'A': {'1': 1, '2': 2}, diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 798982bcbdedf..f7d2c1a654cd5 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -18,8 +18,6 @@ class TestDataFrameDataTypes(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_concat_empty_dataframe_dtypes(self): df = DataFrame(columns=list("abc")) df['a'] = 
df['a'].astype(np.bool_) @@ -539,8 +537,6 @@ def test_arg_for_errors_in_astype(self): class TestDataFrameDatetimeWithTZ(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_interleave(self): # interleave with object diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index f0e6ab4c17915..c06faa75ed346 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -37,8 +37,6 @@ class TestDataFrameIndexing(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_getitem(self): # slicing sl = self.frame[:20] @@ -2841,8 +2839,6 @@ def test_type_error_multiindex(self): class TestDataFrameIndexingDatetimeWithTZ(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def setUp(self): self.idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), name='foo') @@ -2902,8 +2898,6 @@ def test_transpose(self): class TestDataFrameIndexingUInt64(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def setUp(self): self.ir = Index(np.arange(3), dtype=np.uint64) self.idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo') diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 2fc14d9e4d123..674202980807a 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -27,8 +27,6 @@ class SharedWithSparse(object): - _multiprocess_can_split_ = True - def test_copy_index_name_checking(self): # don't want to be able to modify the index stored elsewhere after # making a copy @@ -159,8 +157,6 @@ class TestDataFrameMisc(tm.TestCase, SharedWithSparse, TestData): klass = DataFrame - _multiprocess_can_split_ = True - def test_get_axis(self): f = self.frame self.assertEqual(f._get_axis_number(0), 0) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 8c25f71c00684..ef800f0dface3 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -29,8 +29,6 @@ def _skip_if_no_pchip(): class TestDataFrameMissingData(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_dropEmptyRows(self): N = len(self.frame.index) mat = random.randn(N) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 5beab1565e538..6b4c56747c981 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -21,8 +21,6 @@ class TestDataFrameMutateColumns(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_assign(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) original = df.copy() diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 835c18ffc6081..4ad88a12a2625 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -19,8 +19,6 @@ class TestDataFrameNonuniqueIndexes(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_column_dups_operations(self): def check(result, expected=None): diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 15f98abe1445d..ec73689088035 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -31,8 +31,6 @@ class TestDataFrameOperators(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_operators(self): garbage = random.random(4) colSeries = Series(garbage, index=np.array(self.frame.columns)) diff --git 
a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 22414a6ba8a53..400ead788aa7c 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -21,8 +21,6 @@ class TestDataFrameQuantile(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_quantile(self): from numpy import percentile diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index a9a90a6f5cd40..aed02b7323f85 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -90,8 +90,6 @@ def test_query_numexpr(self): class TestDataFrameEval(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_ops(self): # tst ops and reversed ops in evaluation @@ -168,8 +166,6 @@ def test_eval_resolvers_as_list(self): class TestDataFrameQueryWithMultiIndex(tm.TestCase): - _multiprocess_can_split_ = True - def check_query_with_named_multiindex(self, parser, engine): tm.skip_if_no_ne(engine) a = np.random.choice(['red', 'green'], size=10) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index f46215105b375..8b50036cd50f8 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -23,8 +23,6 @@ class TestDataFrameReplace(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_replace_inplace(self): self.tsframe['A'][:5] = nan self.tsframe['A'][-5:] = nan diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 12cd62f8b4cc0..2df297d03bcdf 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -25,8 +25,6 @@ class TestDataFrameReprInfoEtc(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_repr_empty(self): # empty foo = repr(self.empty) # noqa diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 705270b695b77..1890b33e3dbaa 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -25,8 +25,6 @@ class TestDataFrameReshape(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_pivot(self): data = { 'index': ['A', 'B', 'C', 'C', 'B', 'A'], diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index bbd8dd9b48b5c..7779afdc47b48 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -19,8 +19,6 @@ class TestDataFrameSorting(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_sort_index(self): # GH13496 diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 8bd6d3ba54371..9052a16bf973c 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -13,8 +13,6 @@ class TestDataFrameSubclassing(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 55848847f2266..862f76b4ecc05 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -27,8 +27,6 @@ class TestDataFrameTimeSeriesMethods(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_diff(self): the_diff = self.tsframe.diff(1) @@ -539,13 +537,13 @@ def test_datetime_assignment_with_NaT_and_diff_time_units(self): result = 
pd.Series(data_ns).to_frame() result['new'] = data_ns expected = pd.DataFrame({0: [1, None], - 'new': [1, None]}, dtype='datetime64[ns]') + 'new': [1, None]}, dtype='datetime64[ns]') tm.assert_frame_equal(result, expected) # OutOfBoundsDatetime error shouldn't occur data_s = np.array([1, 'nat'], dtype='datetime64[s]') result['new'] = data_s expected = pd.DataFrame({0: [1, None], - 'new': [1e9, None]}, dtype='datetime64[ns]') + 'new': [1e9, None]}, dtype='datetime64[ns]') tm.assert_frame_equal(result, expected) def test_frame_to_period(self): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 5c47b0357b4f6..471fc536a90f6 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -31,8 +31,6 @@ class TestDataFrameToCSV(tm.TestCase, TestData): - _multiprocess_can_split_ = True - def test_to_csv_from_csv1(self): with ensure_clean('__tmp_to_csv_from_csv1__') as path: diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 5f680a6876873..00ddd293f6014 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -27,8 +27,6 @@ class TestGroupByAggregate(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.ts = tm.makeTimeSeries() diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 82ec1832be961..605b327208a03 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -23,8 +23,6 @@ class TestGroupByCategorical(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.ts = tm.makeTimeSeries() diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 663fbd04e7e5a..1640858802047 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -24,8 +24,6 @@ class TestGroupByFilter(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.ts = tm.makeTimeSeries() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 01c81bd7904bd..df4707fcef3f0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -36,8 +36,6 @@ class TestGroupBy(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.ts = tm.makeTimeSeries() @@ -5908,8 +5906,8 @@ def test_group_shift_with_null_key(self): g = df.groupby(["A", "B"]) expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 - else np.nan) - for i in range(n_rows)], dtype=float, + else np.nan) + for i in range(n_rows)], dtype=float, columns=["Z"], index=None) result = g.shift(-1) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index edb044a3cb2d7..c9a695ee8db3b 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -8,7 +8,6 @@ class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_astype(self): # GH 13149, GH 13209 @@ -185,7 +184,6 @@ def _check_rng(rng): class TestToPeriod(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): data = [Timestamp('2007-01-01 10:11:12.123456Z'), diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index e54ebe3d93bc6..772d76305cff2 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -10,7 +10,6 
@@ class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_construction_with_alt(self): @@ -428,7 +427,6 @@ def test_000constructor_resolution(self): class TestTimeSeries(tm.TestCase): - _multiprocess_can_split_ = True def test_dti_constructor_preserve_dti_freq(self): rng = date_range('1/1/2000', '1/2/2000', freq='5min') diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index b2161aa5c75c6..9d5f397329c76 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -9,7 +9,6 @@ class TestTimeSeries(TestData, tm.TestCase): - _multiprocess_can_split_ = True def test_date_range_gen_error(self): rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 628cb9df94e39..2c87c48bcda11 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -14,7 +14,6 @@ class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_get_loc(self): idx = pd.date_range('2000-01-01', periods=3) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index eea08febc86e6..2b254bc8be931 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -10,7 +10,6 @@ class TestDatetimeIndex(DatetimeLike, tm.TestCase): _holder = DatetimeIndex - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(index=tm.makeDateIndex(10)) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 5b6bcffe71856..23271a8d45499 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -7,7 +7,6 @@ class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_where_other(self): diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index dda2785d2b0ae..6b0191edbda5a 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -53,7 +53,6 @@ def test_second(self): class TestTimeSeries(tm.TestCase): - _multiprocess_can_split_ = True def test_pass_datetimeindex_to_index(self): # Bugs in #1396 diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index 5c408d5300cdc..8f3752227b6d0 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -3,7 +3,6 @@ class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_fillna_datetime64(self): # GH 11343 diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index c25cd6a3fa90e..a46980a0f742a 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -955,7 +955,6 @@ def test_second(self): class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True # GH 10699 def test_datetime64_with_DateOffset(self): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 229ae803aa2ff..7777de869bb20 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ 
-9,7 +9,6 @@ class TestDatetimeIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_union(self): i1 = Int64Index(np.arange(0, 20, 2)) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 42d135f634298..841d0be605058 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -168,7 +168,6 @@ def test_to_datetime_format_weeks(self): class TestToDatetime(tm.TestCase): - _multiprocess_can_split_ = True def test_to_datetime_dt64s(self): in_bound_dts = [ @@ -989,6 +988,7 @@ def test_to_datetime_iso8601_noleading_0s(self): class TestDaysInMonth(tm.TestCase): # tests for issue #10154 + def test_day_not_in_month_coerce(self): self.assertTrue(isnull(to_datetime('2015-02-29', errors='coerce'))) self.assertTrue(isnull(to_datetime('2015-02-29', format="%Y-%m-%d", diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c574a4a1f01a7..2f5b98d145e57 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -31,7 +31,6 @@ class TestIndex(Base, tm.TestCase): _holder = Index - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(unicodeIndex=tm.makeUnicodeIndex(100), @@ -1795,7 +1794,6 @@ class TestMixedIntIndex(Base, tm.TestCase): # (GH 13514) _holder = Index - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) @@ -1993,7 +1991,7 @@ def test_dropna(self): idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days']) tm.assert_index_equal(idx.dropna(), idx) nanidx = pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', - '3 days', pd.NaT]) + '3 days', pd.NaT]) tm.assert_index_equal(nanidx.dropna(), idx) idx = pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 708f424d9bad1..6b6885c082533 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -225,6 +225,7 @@ def test_map(self): # change categories dtype ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), ordered=False) + def f(x): return {'A': 10, 'B': 20, 'C': 30}.get(x) @@ -360,7 +361,8 @@ def test_reindexing(self): expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal(expected.values, actual, check_dtype=False) + tm.assert_numpy_array_equal( + expected.values, actual, check_dtype=False) def test_reindex_dtype(self): c = CategoricalIndex(['a', 'b', 'c', 'a']) @@ -519,7 +521,7 @@ def test_ensure_copied_data(self): # GH12309 # Must be tested separately from other indexes because # self.value is not an ndarray - _base = lambda ar : ar if ar.base is None else ar.base + _base = lambda ar: ar if ar.base is None else ar.base for index in self.indices.values(): result = CategoricalIndex(index.values, copy=True) tm.assert_index_equal(index, result) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index e5a4ced4ced4d..b212a7b75904c 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -16,7 +16,6 @@ class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(index=tm.makePeriodIndex(10)) @@ -240,7 +239,6 @@ def test_difference_freq(self): class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex - 
_multiprocess_can_split_ = True def setUp(self): self.indices = dict(index=tm.makeTimedeltaIndex(10)) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 7d9ceb526b912..365236f72e80e 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -30,7 +30,6 @@ class TestMultiIndex(Base, tm.TestCase): _holder = MultiIndex - _multiprocess_can_split_ = True _compat_props = ['shape', 'ndim', 'size', 'itemsize'] def setUp(self): @@ -900,11 +899,11 @@ def test_append_mixed_dtypes(self): res = mi.append(mi) exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3], - [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], - ['a', 'b', 'c', 'a', 'b', 'c'], - dti.append(dti), - dti_tz.append(dti_tz), - pi.append(pi)]) + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ['a', 'b', 'c', 'a', 'b', 'c'], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi)]) tm.assert_index_equal(res, exp) other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'], @@ -913,11 +912,11 @@ def test_append_mixed_dtypes(self): res = mi.append(other) exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'], - [1.1, np.nan, 3.3, 'x', 'y', 'z'], - ['a', 'b', 'c', 'x', 'y', 'z'], - dti.append(pd.Index(['x', 'y', 'z'])), - dti_tz.append(pd.Index(['x', 'y', 'z'])), - pi.append(pd.Index(['x', 'y', 'z']))]) + [1.1, np.nan, 3.3, 'x', 'y', 'z'], + ['a', 'b', 'c', 'x', 'y', 'z'], + dti.append(pd.Index(['x', 'y', 'z'])), + dti_tz.append(pd.Index(['x', 'y', 'z'])), + pi.append(pd.Index(['x', 'y', 'z']))]) tm.assert_index_equal(res, exp) def test_get_level_values(self): diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 4dab7ae76a011..1bf9a10628542 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -176,7 +176,6 @@ def test_modulo(self): class TestFloat64Index(Numeric, tm.TestCase): _holder = Float64Index - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(mixed=Float64Index([1.5, 2, 3, 4, 5]), @@ -624,7 +623,6 @@ def test_ufunc_coercions(self): class TestInt64Index(NumericInt, tm.TestCase): _dtype = 'int64' _holder = Int64Index - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(index=Int64Index(np.arange(0, 20, 2))) @@ -895,7 +893,6 @@ class TestUInt64Index(NumericInt, tm.TestCase): _dtype = 'uint64' _holder = UInt64Index - _multiprocess_can_split_ = True def setUp(self): self.indices = dict(index=UInt64Index([2**63, 2**63 + 10, 2**63 + 15, diff --git a/pandas/tests/indexes/test_timedelta.py b/pandas/tests/indexes/test_timedelta.py index be01ad03a0660..e6071b8c4fa06 100644 --- a/pandas/tests/indexes/test_timedelta.py +++ b/pandas/tests/indexes/test_timedelta.py @@ -34,7 +34,6 @@ def test_timedelta(self): class TestTimeSeries(tm.TestCase): - _multiprocess_can_split_ = True def test_series_box_timedelta(self): rng = timedelta_range('1 day 1 s', periods=5, freq='h') diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index bcadc41b13370..1d70205076b86 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -8,8 +8,6 @@ class TestIndexingCallable(tm.TestCase): - _multiprocess_can_split_ = True - def test_frame_loc_ix_callable(self): # GH 11485 df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'), diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 0cfa7258461f1..b9a746cd25c7a 100644 --- a/pandas/tests/indexing/test_coercion.py +++ 
b/pandas/tests/indexing/test_coercion.py @@ -15,8 +15,6 @@ class CoercionBase(object): - _multiprocess_can_split_ = True - klasses = ['index', 'series'] dtypes = ['object', 'int64', 'float64', 'complex128', 'bool', 'datetime64', 'datetime64tz', 'timedelta64', 'period'] @@ -1187,7 +1185,7 @@ def _assert_replace_conversion(self, from_key, to_key, how): to_key in ('bool')) or # TODO_GH12747 The result must be int? - (from_key == 'bool' and to_key == 'int64')): + (from_key == 'bool' and to_key == 'int64')): # buggy on 32-bit if tm.is_platform_32bit(): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a9dfcf2672357..b06b1067b7c6b 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -99,8 +99,6 @@ def _mklbl(prefix, n): class TestIndexing(tm.TestCase): - _multiprocess_can_split_ = True - _objs = set(['series', 'frame', 'panel']) _typs = set(['ints', 'uints', 'labels', 'mixed', 'ts', 'floats', 'empty', 'ts_rev']) diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index 5d563e20087b9..42b50e37f0492 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -10,8 +10,6 @@ class TestIndexingSlow(tm.TestCase): - _multiprocess_can_split_ = True - @tm.slow def test_multiindex_get_loc(self): # GH7724, GH2646 diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c92287b2bdc42..11f00386ec592 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -18,6 +18,7 @@ @tm.mplskip class TestSeriesPlots(TestPlotBase): + def setUp(self): TestPlotBase.setUp(self) import matplotlib as mpl @@ -49,6 +50,7 @@ def test_bootstrap_plot(self): @tm.mplskip class TestDataFramePlots(TestPlotBase): + @slow def test_scatter_plot_legacy(self): tm._skip_if_no_scipy() diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index f686f1aa6dc47..0cef27d2e41fc 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -1149,6 +1149,7 @@ def test_round_nat(self): class TestTimestampNsOperations(tm.TestCase): + def setUp(self): self.timestamp = Timestamp(datetime.utcnow()) @@ -1324,6 +1325,7 @@ def test_nat_arithmetic_index(self): class TestTimestampOps(tm.TestCase): + def test_timestamp_and_datetime(self): self.assertEqual((Timestamp(datetime( 2013, 10, 13)) - datetime(2013, 10, 12)).days, 1) @@ -1404,6 +1406,7 @@ def test_resolution(self): class TestTimestampToJulianDate(tm.TestCase): + def test_compare_1700(self): r = Timestamp('1700-06-23').to_julian_date() self.assertEqual(r, 2342145.5) @@ -1426,7 +1429,6 @@ def test_compare_hour13(self): class TestTimeSeries(tm.TestCase): - _multiprocess_can_split_ = True def test_timestamp_to_datetime(self): tm._skip_if_no_pytz() diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 2ddfa27eea377..6473dbeeaa1bc 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -18,8 +18,6 @@ class TestSeriesAlterAxes(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_setindex(self): # wrong type series = self.series.copy() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 07e1be609670f..52b85c89a7009 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -30,8 +30,6 @@ class 
TestSeriesAnalytics(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_sum_zero(self): arr = np.array([]) self.assertEqual(nanops.nansum(arr), 0) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index ec7ffde344d31..16d1466bb90fe 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -15,8 +15,6 @@ class TestSeriesApply(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_apply(self): with np.errstate(all='ignore'): assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) @@ -141,8 +139,6 @@ def f(x): class TestSeriesMap(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_map(self): index, data = tm.getMixedTypeDict() diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py index db306d2a742c1..d2fd8858e7647 100644 --- a/pandas/tests/series/test_asof.py +++ b/pandas/tests/series/test_asof.py @@ -10,7 +10,6 @@ class TestSeriesAsof(TestData, tm.TestCase): - _multiprocess_can_split_ = True def test_basic(self): diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 7bcd1763537dc..d4e5d36c15c68 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -18,8 +18,6 @@ class TestSeriesCombine(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_append(self): appendedSeries = self.series.append(self.objSeries) for idx, value in compat.iteritems(appendedSeries): @@ -222,8 +220,6 @@ def test_combine_first_dt64(self): class TestTimeseries(tm.TestCase): - _multiprocess_can_split_ = True - def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 777b188b8fdd9..aef4c9269bc62 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -26,8 +26,6 @@ class TestSeriesConstructors(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_scalar_conversion(self): # Pass in scalar is disabled diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index b9f999a6c6ffe..4c697c7e52bb8 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -22,8 +22,6 @@ class TestSeriesDatetimeValues(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_dt_namespace_accessor(self): # GH 7207, 11128 @@ -168,9 +166,9 @@ def compare(s, name): cases = [Series(timedelta_range('1 day', periods=5), index=list('abcde'), name='xxx'), Series(timedelta_range('1 day 01:23:45', periods=5, - freq='s'), name='xxx'), + freq='s'), name='xxx'), Series(timedelta_range('2 days 01:23:45.012345', periods=5, - freq='ms'), name='xxx')] + freq='ms'), name='xxx')] for s in cases: for prop in ok_for_td: # we test freq below diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 127a410f66fdb..13375ab886d8d 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -20,8 +20,6 @@ class TestSeriesDtypes(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_astype(self): s = Series(np.random.randn(5), name='foo') diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index e0d83d6eeadac..a20cb8324d2a3 100644 --- 
a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -31,8 +31,6 @@ class TestSeriesIndexing(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_get(self): # GH 6383 @@ -2216,7 +2214,6 @@ def test_setitem_slice_into_readonly_backing_data(self): class TestTimeSeriesDuplicates(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), @@ -2603,6 +2600,7 @@ def test_frame_datetime64_duplicated(self): class TestNatIndexing(tm.TestCase): + def setUp(self): self.series = Series(date_range('1/1/2000', periods=10)) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index e3a0e056f4da1..a3b13ba9b993a 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -16,8 +16,6 @@ class TestSeriesInternals(tm.TestCase): - _multiprocess_can_split_ = True - def test_convert_objects(self): s = Series([1., 2, 3], index=['a', 'b', 'c']) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 48528dc54adbd..d514fbfc142f0 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -18,8 +18,6 @@ class TestSeriesToCSV(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_from_csv(self): with ensure_clean() as path: @@ -112,8 +110,6 @@ def test_to_csv_path_is_none(self): class TestSeriesIO(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_to_frame(self): self.ts.name = None rs = self.ts.to_frame() @@ -174,8 +170,6 @@ class SubclassedFrame(DataFrame): class TestSeriesToList(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_tolist(self): rs = self.ts.tolist() xp = self.ts.values.tolist() diff --git a/pandas/tests/series/test_misc_api.py b/pandas/tests/series/test_misc_api.py index b1b06cc7be8a4..2facbaf1fe31e 100644 --- a/pandas/tests/series/test_misc_api.py +++ b/pandas/tests/series/test_misc_api.py @@ -118,8 +118,6 @@ def test_to_sparse_pass_name(self): class TestSeriesMisc(TestData, SharedWithSparse, tm.TestCase): - _multiprocess_can_split_ = True - def test_tab_completion(self): # GH 9910 s = Series(list('abcd')) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 6821a8b9f4221..702fa2acb5106 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -41,8 +41,6 @@ def _simple_ts(start, end, freq='D'): class TestSeriesMissingData(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_timedelta_fillna(self): # GH 3371 s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp( diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7b1201b971c71..3d609dec7958a 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -28,8 +28,6 @@ class TestSeriesOperators(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_series_comparison_scalars(self): series = Series(date_range('1/1/2000', periods=10)) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index aa16f2cca9475..7fe31bab87537 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -11,8 +11,6 @@ class TestSeriesReplace(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_replace(self): N = 100 ser = pd.Series(np.random.randn(N)) diff --git a/pandas/tests/series/test_repr.py 
b/pandas/tests/series/test_repr.py index af52f6e712e61..99a406a71b12b 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -18,8 +18,6 @@ class TestSeriesRepr(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_multilevel_name_print(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index fb3817eb84acd..db506f12a2293 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -13,8 +13,6 @@ class TestSeriesSorting(TestData, tm.TestCase): - _multiprocess_can_split_ = True - def test_sort(self): ts = self.ts.copy() diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 5bcf258020349..3b1b8aca426e1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -8,8 +8,6 @@ class TestSeriesSubclassing(tm.TestCase): - _multiprocess_can_split_ = True - def test_indexing_sliced(self): s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd')) res = s.loc[['a', 'b']] @@ -37,8 +35,6 @@ def test_to_frame(self): class TestSparseSeriesSubclassing(tm.TestCase): - _multiprocess_can_split_ = True - def test_subclass_sparse_slice(self): # int64 s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index bd346fb9bb0c8..e0db813e60c14 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -32,7 +32,6 @@ def assert_range_equal(left, right): class TestTimeSeries(TestData, tm.TestCase): - _multiprocess_can_split_ = True def test_shift(self): shifted = self.ts.shift(1) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 40b277f3f1f8a..fab04f7fa4bf2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -20,7 +20,6 @@ class TestMatch(tm.TestCase): - _multiprocess_can_split_ = True def test_ints(self): values = np.array([0, 2, 1]) @@ -57,7 +56,6 @@ def test_strings(self): class TestSafeSort(tm.TestCase): - _multiprocess_can_split_ = True def test_basic_sort(self): values = [3, 1, 2, 0, 4] @@ -144,7 +142,6 @@ def test_exceptions(self): class TestFactorize(tm.TestCase): - _multiprocess_can_split_ = True def test_basic(self): @@ -306,7 +303,6 @@ def test_uint64_factorize(self): class TestUnique(tm.TestCase): - _multiprocess_can_split_ = True def test_ints(self): arr = np.random.randint(0, 100, size=50) @@ -389,7 +385,6 @@ def test_uint64_overflow(self): class TestIsin(tm.TestCase): - _multiprocess_can_split_ = True def test_invalid(self): @@ -472,7 +467,6 @@ def test_large(self): class TestValueCounts(tm.TestCase): - _multiprocess_can_split_ = True def test_value_counts(self): np.random.seed(1234) @@ -659,8 +653,6 @@ def test_value_counts_uint64(self): class TestDuplicated(tm.TestCase): - _multiprocess_can_split_ = True - def test_duplicated_with_nas(self): keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) @@ -896,7 +888,6 @@ def test_group_var_constant(self): class TestGroupVarFloat64(tm.TestCase, GroupVarTestMixin): __test__ = True - _multiprocess_can_split_ = True algo = algos.algos.group_var_float64 dtype = np.float64 @@ -920,7 +911,6 @@ def test_group_var_large_inputs(self): class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin): __test__ = True - _multiprocess_can_split_ = True algo = algos.algos.group_var_float32 dtype = np.float32 @@ -1068,7 
+1058,6 @@ def test_arrmap(): class TestTseriesUtil(tm.TestCase): - _multiprocess_can_split_ = True def test_combineFunc(self): pass diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index be55d6e1976ec..cc99cf0f830aa 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -26,7 +26,6 @@ class TestCategorical(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], @@ -1574,12 +1573,12 @@ def test_searchsorted(self): # https://github.com/pandas-dev/pandas/issues/14522 c1 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], - categories=['cheese', 'milk', 'apple', 'bread'], - ordered=True) + categories=['cheese', 'milk', 'apple', 'bread'], + ordered=True) s1 = pd.Series(c1) c2 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], - categories=['cheese', 'milk', 'apple', 'bread'], - ordered=False) + categories=['cheese', 'milk', 'apple', 'bread'], + ordered=False) s2 = pd.Series(c2) # Searching for single item argument, side='left' (default) @@ -1697,8 +1696,8 @@ def test_map(self): tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) def test_validate_inplace(self): - cat = Categorical(['A','B','B','C','A']) - invalid_values = [1, "True", [1,2,3], 5.0] + cat = Categorical(['A', 'B', 'B', 'C', 'A']) + invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: with self.assertRaises(ValueError): @@ -1711,19 +1710,21 @@ def test_validate_inplace(self): cat.as_unordered(inplace=value) with self.assertRaises(ValueError): - cat.set_categories(['X','Y','Z'], rename=True, inplace=value) + cat.set_categories(['X', 'Y', 'Z'], rename=True, inplace=value) with self.assertRaises(ValueError): - cat.rename_categories(['X','Y','Z'], inplace=value) + cat.rename_categories(['X', 'Y', 'Z'], inplace=value) with self.assertRaises(ValueError): - cat.reorder_categories(['X','Y','Z'], ordered=True, inplace=value) + cat.reorder_categories( + ['X', 'Y', 'Z'], ordered=True, inplace=value) with self.assertRaises(ValueError): - cat.add_categories(new_categories=['D','E','F'], inplace=value) + cat.add_categories( + new_categories=['D', 'E', 'F'], inplace=value) with self.assertRaises(ValueError): - cat.remove_categories(removals=['D','E','F'], inplace=value) + cat.remove_categories(removals=['D', 'E', 'F'], inplace=value) with self.assertRaises(ValueError): cat.remove_unused_categories(inplace=value) @@ -1733,7 +1734,6 @@ def test_validate_inplace(self): class TestCategoricalAsBlock(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) @@ -3045,13 +3045,15 @@ def test_value_counts_with_nan(self): tm.assert_series_equal(res, exp) # we don't exclude the count of None and sort by counts - exp = pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"])) + exp = pd.Series( + [3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"])) res = s.value_counts(dropna=False) tm.assert_series_equal(res, exp) # When we aren't sorting by counts, and np.nan isn't a # category, it should be last. 
- exp = pd.Series([2, 1, 3], index=pd.CategoricalIndex(["a", "b", np.nan])) + exp = pd.Series( + [2, 1, 3], index=pd.CategoricalIndex(["a", "b", np.nan])) res = s.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) @@ -3703,7 +3705,8 @@ def f(): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.loc["j":"k", df.columns[0]] = pd.Categorical(["b", "b"], categories=["a", "b"]) + df.loc["j":"k", df.columns[0]] = pd.Categorical( + ["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): @@ -4013,7 +4016,6 @@ def test_concat_append_gh7864(self): self.assert_index_equal(df['grade'].cat.categories, dfa['grade'].cat.categories) - def test_concat_preserve(self): # GH 8641 series concat not preserving category dtype @@ -4042,7 +4044,7 @@ def test_concat_preserve(self): res = pd.concat([df2, df2]) exp = DataFrame({'A': pd.concat([a, a]), 'B': pd.concat([b, b]).astype( - 'category', categories=list('cab'))}) + 'category', categories=list('cab'))}) tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): @@ -4052,18 +4054,18 @@ def test_categorical_index_preserver(self): df2 = DataFrame({'A': a, 'B': b.astype('category', categories=list('cab')) - }).set_index('B') + }).set_index('B') result = pd.concat([df2, df2]) expected = DataFrame({'A': pd.concat([a, a]), 'B': pd.concat([b, b]).astype( 'category', categories=list('cab')) - }).set_index('B') + }).set_index('B') tm.assert_frame_equal(result, expected) # wrong catgories df3 = DataFrame({'A': a, 'B': pd.Categorical(b, categories=list('abc')) - }).set_index('B') + }).set_index('B') self.assertRaises(TypeError, lambda: pd.concat([df2, df3])) def test_merge(self): @@ -4391,8 +4393,8 @@ def test_str_accessor_api_for_categorical(self): ('decode', ("UTF-8",), {}), ('encode', ("UTF-8",), {}), ('endswith', ("a",), {}), - ('extract', ("([a-z]*) ",), {"expand":False}), - ('extract', ("([a-z]*) ",), {"expand":True}), + ('extract', ("([a-z]*) ",), {"expand": False}), + ('extract', ("([a-z]*) ",), {"expand": True}), ('extractall', ("([a-z]*) ",), {}), ('find', ("a",), {}), ('findall', ("a",), {}), @@ -4550,8 +4552,6 @@ def test_concat_categorical(self): class TestCategoricalSubclassing(tm.TestCase): - _multiprocess_can_split_ = True - def test_constructor(self): sc = tm.SubclassedCategorical(['a', 'b', 'c']) self.assertIsInstance(sc, tm.SubclassedCategorical) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 0239250129494..90b1157572be1 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -7,8 +7,6 @@ import pandas.core.common as com import pandas.util.testing as tm -_multiprocess_can_split_ = True - def test_mut_exclusive(): msg = "mutually exclusive arguments: '[ab]' and '[ab]'" diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index ed8c37fd6dd20..c58aada193b15 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -5,7 +5,6 @@ class TestConfig(unittest.TestCase): - _multiprocess_can_split_ = True def __init__(self, *args): super(TestConfig, self).__init__(*args) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 18b078d0a677e..eca4a8f3c9e66 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -58,8 +58,6 @@ class TestExpressions(tm.TestCase): - _multiprocess_can_split_ = False - def setUp(self): self.frame = _frame.copy() diff --git 
a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 5bf2eda47ea27..916d7ae0b0ec4 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -33,8 +33,6 @@ class Generic(object): - _multiprocess_can_split_ = True - def setUp(self): pass diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 2bfe31ad4260e..1dfea168c067c 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -182,8 +182,6 @@ def create_mgr(descr, item_shape=None): class TestBlock(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): # self.fblock = get_float_ex() # a,c,e # self.cblock = get_complex_ex() # @@ -299,7 +297,6 @@ def test_split_block_at(self): class TestDatetimeBlock(tm.TestCase): - _multiprocess_can_split_ = True def test_try_coerce_arg(self): block = create_block('datetime', [0]) @@ -318,7 +315,6 @@ def test_try_coerce_arg(self): class TestBlockManager(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.mgr = create_mgr( @@ -1057,7 +1053,6 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, class TestBlockPlacement(tm.TestCase): - _multiprocess_can_split_ = True def test_slice_len(self): self.assertEqual(len(BlockPlacement(slice(0, 4))), 4) diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 0e7dda05a0c27..2a16d7663b0cf 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -9,7 +9,6 @@ class TestIndexer(tm.TestCase): - _multiprocess_can_split_ = True def test_outer_join_indexer(self): typemap = [('int32', _join.outer_join_indexer_int32), diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d87ad8d906854..1fe2d701f5a41 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -25,8 +25,6 @@ class TestMultiLevel(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 89e8fb78ad821..4f56419b1323a 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -52,7 +52,6 @@ def not_hashable(self): class SafeForLongAndSparse(object): - _multiprocess_can_split_ = True def test_repr(self): repr(self.panel) @@ -177,7 +176,6 @@ def wrapper(x): class SafeForSparse(object): - _multiprocess_can_split_ = True @classmethod def assert_panel_equal(cls, x, y): @@ -422,8 +420,6 @@ def test_abs(self): class CheckIndexing(object): - _multiprocess_can_split_ = True - def test_getitem(self): self.assertRaises(Exception, self.panel.__getitem__, 'ItemQ') @@ -869,7 +865,6 @@ def test_set_value(self): class TestPanel(tm.TestCase, PanelTests, CheckIndexing, SafeForLongAndSparse, SafeForSparse): - _multiprocess_can_split_ = True @classmethod def assert_panel_equal(cls, x, y): @@ -2278,7 +2273,6 @@ class TestLongPanel(tm.TestCase): """ LongPanel no longer exists, but... 
""" - _multiprocess_can_split_ = True def setUp(self): import warnings diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index aeca24964222a..96864c626ba7f 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -29,8 +29,6 @@ def add_nans(panel4d): class SafeForLongAndSparse(object): - _multiprocess_can_split_ = True - def test_repr(self): repr(self.panel4d) @@ -148,8 +146,6 @@ def wrapper(x): class SafeForSparse(object): - _multiprocess_can_split_ = True - @classmethod def assert_panel_equal(cls, x, y): assert_panel_equal(x, y) @@ -305,8 +301,6 @@ def test_abs(self): class CheckIndexing(object): - _multiprocess_can_split_ = True - def test_getitem(self): self.assertRaises(Exception, self.panel4d.__getitem__, 'ItemQ') @@ -604,8 +598,6 @@ def test_set_value(self): class TestPanel4d(tm.TestCase, CheckIndexing, SafeForSparse, SafeForLongAndSparse): - _multiprocess_can_split_ = True - @classmethod def assert_panel4d_equal(cls, x, y): assert_panel4d_equal(x, y) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index b5fa945a5bb8f..ed5ec970ba33c 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -14,8 +14,6 @@ import pandas.util.testing as tm from pandas.compat import range, u -_multiprocess_can_split_ = True - class TestMelt(tm.TestCase): diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index eb8ab02c29548..118c4147a2019 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -13,7 +13,6 @@ class TestRank(tm.TestCase): - _multiprocess_can_split_ = True s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) df = DataFrame({'A': s, 'B': s}) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f358946983dce..ce97b09b7e3ca 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -20,8 +20,6 @@ class TestStringMethods(tm.TestCase): - _multiprocess_can_split_ = True - def test_api(self): # GH 6106, GH 9322 diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index bf8a3ab370625..3aed22c140ffe 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -8,15 +8,11 @@ import pandas.util.testing as tm from pandas.tslib import iNaT -_multiprocess_can_split_ = True - class TestTake(tm.TestCase): # standard incompatible fill error fill_error = re.compile("Incompatible type for fill_value") - _multiprocess_can_split_ = True - def test_1d_with_out(self): def _test_dtype(dtype, can_hold_na, writeable=True): data = np.random.randint(0, 2, 4).astype(dtype) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 5e60efd153ab1..466e9ee5a30b8 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -18,7 +18,6 @@ class TestAssertAlmostEqual(tm.TestCase): - _multiprocess_can_split_ = True def _assert_almost_equal_both(self, a, b, **kwargs): assert_almost_equal(a, b, **kwargs) @@ -146,7 +145,6 @@ def test_assert_almost_equal_object(self): class TestUtilTesting(tm.TestCase): - _multiprocess_can_split_ = True def test_raise_with_traceback(self): with assertRaisesRegexp(LookupError, "error_text"): @@ -347,7 +345,6 @@ def test_assert_almost_equal_iterable_message(self): class TestAssertIndexEqual(unittest.TestCase): - _multiprocess_can_split_ = True def test_index_equal_message(self): @@ -495,7 +492,6 @@ def test_index_equal_metadata_message(self): class TestAssertSeriesEqual(tm.TestCase): - _multiprocess_can_split_ = True def _assert_equal(self, x, y, **kwargs): 
assert_series_equal(x, y, **kwargs) @@ -590,7 +586,6 @@ def test_series_equal_message(self): class TestAssertFrameEqual(tm.TestCase): - _multiprocess_can_split_ = True def _assert_equal(self, x, y, **kwargs): assert_frame_equal(x, y, **kwargs) @@ -701,7 +696,6 @@ def test_notisinstance(self): class TestAssertCategoricalEqual(unittest.TestCase): - _multiprocess_can_split_ = True def test_categorical_equal_message(self): diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index e2f6a7f6cc1ed..1bf9f4da45bff 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -314,6 +314,7 @@ def test_validation(self): class TestMove(tm.TestCase): + def test_cannot_create_instance_of_stolenbuffer(self): """Stolen buffers need to be created through the smart constructor ``move_into_mutable_buffer`` which has a bunch of checks in it. diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index dc23469976e35..48861fc6a9528 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -32,8 +32,6 @@ def assert_equal(left, right): class Base(tm.TestCase): - _multiprocess_can_split_ = True - _nan_locs = np.arange(20, 40) _inf_locs = np.array([]) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index a8579e89aeb1f..497130b117289 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -19,8 +19,6 @@ DatetimeTZDtype, PeriodDtype) from pandas.util import testing as tm -_multiprocess_can_split_ = True - class TestPossiblyDowncast(tm.TestCase): diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py index 7c17c61aec440..4667bbd47ad18 100644 --- a/pandas/tests/types/test_common.py +++ b/pandas/tests/types/test_common.py @@ -7,8 +7,6 @@ import pandas.util.testing as tm -_multiprocess_can_split_ = True - class TestPandasDtype(tm.TestCase): diff --git a/pandas/tests/types/test_concat.py b/pandas/tests/types/test_concat.py index 8acafe0af1792..f4faab45f4ba2 100644 --- a/pandas/tests/types/test_concat.py +++ b/pandas/tests/types/test_concat.py @@ -7,8 +7,6 @@ class TestConcatCompat(tm.TestCase): - _multiprocess_can_split_ = True - def check_concat(self, to_concat, exp): for klass in [pd.Index, pd.Series]: to_concat_klass = [klass(c) for c in to_concat] diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index 68105cfd7c886..8ef2868ae324f 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -15,8 +15,6 @@ _coerce_to_dtype) import pandas.util.testing as tm -_multiprocess_can_split_ = True - class Base(object): diff --git a/pandas/tests/types/test_generic.py b/pandas/tests/types/test_generic.py index 2861252bef26a..c7c8b0becad63 100644 --- a/pandas/tests/types/test_generic.py +++ b/pandas/tests/types/test_generic.py @@ -5,8 +5,6 @@ import pandas.util.testing as tm from pandas.types import generic as gt -_multiprocess_can_split_ = True - class TestABCClasses(tm.TestCase): tuples = [[1, 2, 2], ['red', 'blue', 'red']] diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index 15f9545f3476c..629aa63f4a0ae 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -35,8 +35,6 @@ from pandas.types.missing import isnull from pandas.util import testing as tm -_multiprocess_can_split_ = True - def test_is_sequence(): is_seq = inference.is_sequence @@ -340,7 +338,6 @@ def test_mixed_dtypes_remain_object_array(self): class 
TestTypeInference(tm.TestCase): - _multiprocess_can_split_ = True def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index 2b09cf5ab633d..cab44f1122ae1 100644 --- a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -14,8 +14,6 @@ from pandas.types.missing import (array_equivalent, isnull, notnull, na_value_for_dtype) -_multiprocess_can_split_ = True - def test_notnull(): assert notnull(1.) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index dae24c48b8238..87a0dda34a525 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -17,8 +17,6 @@ class ConcatenateBase(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.frame = DataFrame(tm.getSeriesData()) self.mixed_frame = self.frame.copy() diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index fb1f187ddd5c0..05a352f259e8b 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -8,8 +8,6 @@ class TestHashing(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.df = DataFrame( {'i32': np.array([1, 2, 3] * 3, dtype='int32'), diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index 605a85026d605..ff0a494bd7d02 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -20,8 +20,6 @@ class TestJoin(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): # aggregate multiple columns self.df = DataFrame({'key1': get_test_data(), diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 88856a012da6f..a348a901442c9 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -33,8 +33,6 @@ def get_test_data(ngroups=NGROUPS, n=N): class TestMerge(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): # aggregate multiple columns self.df = DataFrame({'key1': get_test_data(), diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index 8e7323f72a8f5..76798b3c895ea 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -11,7 +11,6 @@ class TestAsOfMerge(tm.TestCase): - _multiprocess_can_split_ = True def read_data(self, name, dedupe=False): path = os.path.join(tm.get_data_path(), name) @@ -686,7 +685,7 @@ def test_allow_exact_matches_and_tolerance3(self): # GH 13709 df1 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030', - '2016-07-15 13:30:00.030']), + '2016-07-15 13:30:00.030']), 'username': ['bob', 'charlie']}) df2 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.000', diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 398e57d4ad0a4..40b46c5413c8f 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -12,8 +12,6 @@ class TestPivotTable(tm.TestCase): - _multiprocess_can_split_ = True - def setUp(self): self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', @@ -1152,8 +1150,8 @@ def test_crosstab_normalize(self): pd.crosstab(df.a, df.b, normalize='index')) row_normal_margins = pd.DataFrame([[1.0, 0], - [0.25, 0.75], - [0.4, 0.6]], + [0.25, 0.75], + [0.4, 0.6]], index=pd.Index([1, 2, 'All'], name='a', dtype='object'), @@ -1165,8 +1163,8 @@ def test_crosstab_normalize(self): name='b')) all_normal_margins 
= pd.DataFrame([[0.2, 0, 0.2], - [0.2, 0.6, 0.8], - [0.4, 0.6, 1]], + [0.2, 0.6, 0.8], + [0.4, 0.6, 1]], index=pd.Index([1, 2, 'All'], name='a', dtype='object'), diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index c5261597cf35d..de44eadc15751 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -303,7 +303,7 @@ def test_datetime_cut(self): data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) result, bins = cut(data, 3, retbins=True) expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], ).astype("category", ordered=True) tm.assert_series_equal(result, expected) @@ -316,8 +316,8 @@ def test_datetime_cut(self): # testing for time data to be present as ndarray data = np.array([np.datetime64('2013-01-01'), - np.datetime64('2013-01-02'), - np.datetime64('2013-01-03')]) + np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')]) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) @@ -330,7 +330,7 @@ def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] expected = Series(['(2012-12-12 00:00:00, 2012-12-14 00:00:00]', - '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'], + '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'], ).astype("category", ordered=True) for conv in [Timestamp, Timestamp, np.datetime64]: diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8c75195b25ef5..98151d5b6130c 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -393,7 +393,7 @@ def __array_wrap__(self, result, context=None): left = context[1][0] right = context[1][1] if (isinstance(left, PeriodIndex) and - isinstance(right, PeriodIndex)): + isinstance(right, PeriodIndex)): name = left.name if left.name == right.name else None return Index(result, name=name) elif isinstance(left, Period) or isinstance(right, Period): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 2ff06517f175a..be3b917cb8117 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -15,6 +15,7 @@ class TestTimedeltaIndexOps(Ops): + def setUp(self): super(TestTimedeltaIndexOps, self).setUp() mask = lambda x: isinstance(x, TimedeltaIndex) @@ -490,7 +491,7 @@ def test_addition_ops(self): def test_comp_nat(self): left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, - pd.Timedelta('3 days')]) + pd.Timedelta('3 days')]) right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) for l, r in [(left, right), (left.asobject, right.asobject)]: @@ -854,6 +855,7 @@ def test_equals(self): class TestPeriodIndexOps(Ops): + def setUp(self): super(TestPeriodIndexOps, self).setUp() mask = lambda x: (isinstance(x, DatetimeIndex) or diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tseries/tests/test_bin_groupby.py index 08c0833be0cd6..51a10f4141ab5 100644 --- a/pandas/tseries/tests/test_bin_groupby.py +++ b/pandas/tseries/tests/test_bin_groupby.py @@ -46,7 +46,6 @@ def test_series_bin_grouper(): class TestBinGroupers(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): self.obj = np.random.randn(10, 1) @@ -122,6 +121,7 @@ class TestMoments(tm.TestCase): class TestReducer(tm.TestCase): + def test_int_index(self): from pandas.core.series import Series diff --git 
a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index 8caed80f5a45b..b934aaed7d41f 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -19,6 +19,7 @@ def test_timtetonum_accepts_unicode(): class TestDateTimeConverter(tm.TestCase): + def setUp(self): self.dtc = converter.DatetimeConverter() self.tc = converter.TimeFormatter(None) @@ -142,6 +143,7 @@ def _assert_less(ts1, ts2): class TestPeriodConverter(tm.TestCase): + def setUp(self): self.pc = converter.PeriodConverter() diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 209e6e40d5cf0..a64882380850b 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -73,6 +73,7 @@ def test_precision_finer_than_offset(self): class TestDateRange(tm.TestCase): + def setUp(self): self.rng = bdate_range(START, END) @@ -588,6 +589,7 @@ def test_freq_divides_end_in_nanos(self): class TestCustomDateRange(tm.TestCase): + def setUp(self): self.rng = cdate_range(START, END) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index dfb7b26371d7a..9983bf5270b29 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -477,6 +477,7 @@ def test_get_freq_code(self): class TestFrequencyInference(tm.TestCase): + def test_raise_if_period_index(self): index = PeriodIndex(start="1/1/1990", periods=20, freq="M") self.assertRaises(TypeError, frequencies.infer_freq, index) diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tseries/tests/test_holiday.py index d4d273347e6e3..2adf28a506c53 100644 --- a/pandas/tseries/tests/test_holiday.py +++ b/pandas/tseries/tests/test_holiday.py @@ -18,6 +18,7 @@ class TestCalendar(tm.TestCase): + def setUp(self): self.holiday_list = [ datetime(2012, 1, 2), @@ -54,6 +55,7 @@ def test_calendar_caching(self): # Test for issue #9552 class TestCalendar(AbstractHolidayCalendar): + def __init__(self, name=None, rules=None): super(TestCalendar, self).__init__(name=name, rules=rules) @@ -83,6 +85,7 @@ def test_rule_from_name(self): class TestHoliday(tm.TestCase): + def setUp(self): self.start_date = datetime(2011, 1, 1) self.end_date = datetime(2020, 12, 31) @@ -288,6 +291,7 @@ def test_factory(self): class TestObservanceRules(tm.TestCase): + def setUp(self): self.we = datetime(2014, 4, 9) self.th = datetime(2014, 4, 10) diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index ac488a3dfdcb2..7c5a4c3df28b2 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -38,8 +38,6 @@ import pandas.util.testing as tm from pandas.tseries.holiday import USFederalHolidayCalendar -_multiprocess_can_split_ = True - def test_monthrange(): import calendar @@ -507,7 +505,6 @@ def test_pickle_v0_15_2(self): class TestDateOffset(Base): - _multiprocess_can_split_ = True def setUp(self): self.d = Timestamp(datetime(2008, 1, 2)) @@ -547,7 +544,6 @@ def test_eq(self): class TestBusinessDay(Base): - _multiprocess_can_split_ = True _offset = BDay def setUp(self): @@ -725,7 +721,6 @@ def test_offsets_compare_equal(self): class TestBusinessHour(Base): - _multiprocess_can_split_ = True _offset = BusinessHour def setUp(self): @@ -1432,7 +1427,6 @@ def test_datetimeindex(self): class TestCustomBusinessHour(Base): - _multiprocess_can_split_ = True _offset = CustomBusinessHour def setUp(self): @@ -1693,7 +1687,6 @@ def 
test_apply_nanoseconds(self): class TestCustomBusinessDay(Base): - _multiprocess_can_split_ = True _offset = CDay def setUp(self): @@ -1931,7 +1924,6 @@ def test_pickle_compat_0_14_1(self): class CustomBusinessMonthBase(object): - _multiprocess_can_split_ = True def setUp(self): self.d = datetime(2008, 1, 1) @@ -3257,6 +3249,7 @@ def makeFY5253LastOfMonth(*args, **kwds): class TestFY5253LastOfMonth(Base): + def test_onOffset(self): offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, @@ -3342,6 +3335,7 @@ def test_apply(self): class TestFY5253NearestEndMonth(Base): + def test_get_target_month_end(self): self.assertEqual(makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT) @@ -3507,6 +3501,7 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter(Base): + def test_isAnchored(self): self.assertTrue( makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SAT, @@ -3729,6 +3724,7 @@ def test_get_weeks(self): class TestFY5253NearestEndMonthQuarter(Base): + def test_onOffset(self): offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter( @@ -3814,6 +3810,7 @@ def test_offset(self): class TestQuarterBegin(Base): + def test_repr(self): self.assertEqual(repr(QuarterBegin()), "") @@ -4168,6 +4165,7 @@ def test_onOffset(self): class TestBYearEndLagged(Base): + def test_bad_month_fail(self): self.assertRaises(Exception, BYearEnd, month=13) self.assertRaises(Exception, BYearEnd, month=0) @@ -4307,6 +4305,7 @@ def test_onOffset(self): class TestYearEndDiffMonth(Base): + def test_offset(self): tests = [] @@ -4542,6 +4541,7 @@ def test_compare_ticks(self): class TestOffsetNames(tm.TestCase): + def test_get_offset_name(self): self.assertEqual(BDay().freqstr, 'B') self.assertEqual(BDay(2).freqstr, '2B') @@ -4600,6 +4600,7 @@ def test_get_offset_legacy(): class TestParseTimeString(tm.TestCase): + def test_parse_time_string(self): (date, parsed, reso) = parse_time_string('4Q1984') (date_lower, parsed_lower, reso_lower) = parse_time_string('4q1984') @@ -4662,6 +4663,7 @@ def test_quarterly_dont_normalize(): class TestOffsetAliases(tm.TestCase): + def setUp(self): _offset_map.clear() @@ -4797,6 +4799,7 @@ def test_week_of_month_index_creation(self): class TestReprNames(tm.TestCase): + def test_str_for_named_is_name(self): # look at all the amazing combinations! 
month_prefixes = ['A', 'AS', 'BA', 'BAS', 'Q', 'BQ', 'BQS', 'QS'] diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index fdc067a827a5b..a39830b6aede6 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1652,6 +1652,7 @@ def test_is_leap_year(self): class TestPeriodIndex(tm.TestCase): + def setUp(self): pass @@ -4456,6 +4457,7 @@ def test_negone_ordinals(self): class TestComparisons(tm.TestCase): + def setUp(self): self.january1 = Period('2000-01', 'M') self.january2 = Period('2000-01', 'M') @@ -4961,6 +4963,7 @@ def test_ops_frame_period(self): class TestPeriodField(tm.TestCase): + def test_get_period_field_raises_on_out_of_range(self): self.assertRaises(ValueError, _period.get_period_field, -1, 0, 0) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 222ffb735921a..afb44887fe7d1 100755 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -49,7 +49,6 @@ def _simple_pts(start, end, freq='D'): class TestResampleAPI(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): dti = DatetimeIndex(start=datetime(2005, 1, 1), @@ -754,8 +753,8 @@ def test_resample_empty_series(self): self.assertEqual(result.index.freq, expected.index.freq) if (method == 'size' and - isinstance(result.index, PeriodIndex) and - freq in ['M', 'D']): + isinstance(result.index, PeriodIndex) and + freq in ['M', 'D']): # GH12871 - TODO: name should propagate, but currently # doesn't on lower / same frequency with PeriodIndex assert_series_equal(result, expected, check_dtype=False, @@ -839,7 +838,6 @@ def test_resample_loffset_arg_type(self): class TestDatetimeIndex(Base, tm.TestCase): - _multiprocess_can_split_ = True _index_factory = lambda x: date_range def setUp(self): @@ -990,6 +988,7 @@ def fn(x, a=1): return str(type(x)) class fn_class: + def __call__(self, x): return str(type(x)) @@ -2135,7 +2134,6 @@ def test_resample_datetime_values(self): class TestPeriodIndex(Base, tm.TestCase): - _multiprocess_can_split_ = True _index_factory = lambda x: period_range def create_series(self): @@ -2744,7 +2742,6 @@ def test_evenly_divisible_with_no_extra_bins(self): class TestTimedeltaIndex(Base, tm.TestCase): - _multiprocess_can_split_ = True _index_factory = lambda x: timedelta_range def create_series(self): @@ -2766,6 +2763,7 @@ def test_asfreq_bug(self): class TestResamplerGrouper(tm.TestCase): + def setUp(self): self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, 'B': np.arange(40)}, @@ -2960,6 +2958,7 @@ def test_median_duplicate_columns(self): class TestTimeGrouper(tm.TestCase): + def setUp(self): self.ts = Series(np.random.randn(1000), index=date_range('1/1/2000', periods=1000)) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 13263259e0b8a..170d5cdafa60b 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -24,7 +24,6 @@ class TestTimedeltas(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): pass @@ -1231,7 +1230,6 @@ def test_timedelta_arithmetic(self): class TestTimedeltaIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_pass_TimedeltaIndex_to_index(self): @@ -1907,6 +1905,7 @@ def test_factorize(self): class TestSlicing(tm.TestCase): + def test_partial_slice(self): rng = timedelta_range('1 day 10:11:12', freq='h', periods=500) s = Series(np.arange(len(rng)), index=rng) diff --git 
a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py index 5395056c93412..17cc93ac42639 100644 --- a/pandas/tseries/tests/test_timeseries_legacy.py +++ b/pandas/tseries/tests/test_timeseries_legacy.py @@ -27,8 +27,6 @@ # class TestLegacySupport(unittest.TestCase): class LegacySupport(object): - _multiprocess_can_split_ = True - @classmethod def setUpClass(cls): if compat.PY3: diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 00b60ba620c4b..38cd8079faf93 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -52,7 +52,6 @@ def dst(self, dt): class TestTimeZoneSupportPytz(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): tm._skip_if_no_pytz() @@ -899,7 +898,6 @@ def test_datetimeindex_tz_nat(self): class TestTimeZoneSupportDateutil(TestTimeZoneSupportPytz): - _multiprocess_can_split_ = True def setUp(self): tm._skip_if_no_dateutil() @@ -1142,6 +1140,7 @@ def test_tz_convert_tzlocal(self): class TestTimeZoneCacheKey(tm.TestCase): + def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): tzs = pytz.common_timezones for tz_name in tzs: @@ -1158,7 +1157,6 @@ def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): class TestTimeZones(tm.TestCase): - _multiprocess_can_split_ = True timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] def setUp(self): diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 20e91a6f5bc44..a141d445e6035 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -49,6 +49,7 @@ def test_to_datetime_bijective(self): class TestDatetimeParsingWrappers(tm.TestCase): + def test_does_not_convert_mixed_integer(self): bad_date_strings = ('-50000', '999', '123.1234', 'm', 'T') @@ -408,6 +409,7 @@ def test_parsers_iso8601(self): class TestArrayToDatetime(tm.TestCase): + def test_parsing_valid_dates(self): arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) self.assert_numpy_array_equal( @@ -523,6 +525,7 @@ def test_parsing_timezone_offsets(self): class TestTslib(tm.TestCase): + def test_intraday_conversion_factors(self): self.assertEqual(period_asfreq( 1, get_freq('D'), get_freq('H'), False), 24) diff --git a/pandas/types/generic.py b/pandas/types/generic.py index 86d266f4595e2..756fb47596700 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -53,6 +53,7 @@ def _check(cls, inst): class _ABCGeneric(type): + def __instancecheck__(cls, inst): return hasattr(inst, "_data") diff --git a/pandas/util/clipboard/__init__.py b/pandas/util/clipboard/__init__.py index 358c9b5f8035a..9e2b2faf858db 100644 --- a/pandas/util/clipboard/__init__.py +++ b/pandas/util/clipboard/__init__.py @@ -107,4 +107,4 @@ def set_clipboard(clipboard): # pandas aliases clipboard_get = paste -clipboard_set = copy \ No newline at end of file +clipboard_set = copy diff --git a/pandas/util/clipboard/clipboards.py b/pandas/util/clipboard/clipboards.py index 182a685f956e6..f73f4f191d577 100644 --- a/pandas/util/clipboard/clipboards.py +++ b/pandas/util/clipboard/clipboards.py @@ -123,6 +123,7 @@ def paste_klipper(): def init_no_clipboard(): class ClipboardUnavailable(object): + def __call__(self, *args, **kwargs): raise PyperclipException(EXCEPT_MSG) diff --git a/pandas/util/clipboard/exceptions.py b/pandas/util/clipboard/exceptions.py index 615335f3a58da..f42d263a02993 100644 --- a/pandas/util/clipboard/exceptions.py +++ 
b/pandas/util/clipboard/exceptions.py @@ -7,6 +7,7 @@ class PyperclipException(RuntimeError): class PyperclipWindowsException(PyperclipException): + def __init__(self, message): message += " (%s)" % ctypes.WinError() super(PyperclipWindowsException, self).__init__(message) diff --git a/pandas/util/clipboard/windows.py b/pandas/util/clipboard/windows.py index 956d5b9d34025..5c9be9ddaf508 100644 --- a/pandas/util/clipboard/windows.py +++ b/pandas/util/clipboard/windows.py @@ -10,6 +10,7 @@ class CheckedCall(object): + def __init__(self, f): super(CheckedCall, self).__setattr__("f", f) @@ -133,7 +134,8 @@ def copy_windows(text): count * sizeof(c_wchar)) locked_handle = safeGlobalLock(handle) - ctypes.memmove(c_wchar_p(locked_handle), c_wchar_p(text), count * sizeof(c_wchar)) + ctypes.memmove(c_wchar_p(locked_handle), + c_wchar_p(text), count * sizeof(c_wchar)) safeGlobalUnlock(handle) safeSetClipboardData(CF_UNICODETEXT, handle) diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py index e1888a3ffd62a..85d77c2f6f57c 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -125,6 +125,7 @@ def some_function(x): def some_function(x): "%s %s wrote the Raven" """ + def __init__(self, *args, **kwargs): if (args and kwargs): raise AssertionError("Only positional or keyword args are allowed") @@ -171,6 +172,7 @@ def my_dog(has='fleas'): "This docstring will have a copyright below" pass """ + def __init__(self, addendum, join='', indents=0): if indents > 0: self.addendum = indent(addendum, indents=indents) diff --git a/pandas/util/depr_module.py b/pandas/util/depr_module.py index 736d2cdaab31c..cf8b0f7960f17 100644 --- a/pandas/util/depr_module.py +++ b/pandas/util/depr_module.py @@ -16,6 +16,7 @@ class _DeprecatedModule(object): removals : objects or methods in module that will no longer be accessible once module is removed. """ + def __init__(self, deprmod, removals=None): self.deprmod = deprmod self.removals = removals diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6ea91543677a7..6b2e920a24063 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -165,7 +165,7 @@ def assert_almost_equal(left, right, check_exact=False, pass else: if (isinstance(left, np.ndarray) or - isinstance(right, np.ndarray)): + isinstance(right, np.ndarray)): obj = 'numpy array' else: obj = 'Input' @@ -1103,7 +1103,6 @@ def assert_series_equal(left, right, check_dtype=True, check_datetimelike_compat=False, check_categorical=True, obj='Series'): - """Check that left and right Series are equal. Parameters @@ -1211,7 +1210,6 @@ def assert_frame_equal(left, right, check_dtype=True, check_categorical=True, check_like=False, obj='DataFrame'): - """Check that left and right DataFrame are equal. 
Parameters @@ -2446,6 +2444,7 @@ class _AssertRaisesContextmanager(object): Handles the behind the scenes work for assertRaises and assertRaisesRegexp """ + def __init__(self, exception, regexp=None, *args, **kwargs): self.exception = exception if regexp is not None and not hasattr(regexp, "search"): From 1fc92b9c54a8b2c89f83b48cf7a4a2e706e10656 Mon Sep 17 00:00:00 2001 From: TrigonaMinima Date: Tue, 7 Feb 2017 03:56:12 +0530 Subject: [PATCH 107/338] TST/CLN: reorg more of tseries/tests xref #14854 closes #15324 --- .../indexes/datetimes/test_date_range.py | 810 ++++++- pandas/tests/indexes/datetimes/test_ops.py | 17 +- pandas/tests/indexes/datetimes/test_tools.py | 626 ++++- pandas/tests/indexes/period/__init__.py | 0 pandas/tests/indexes/period/test_period.py | 233 ++ pandas/tests/indexes/test_datetimelike.py | 465 ---- pandas/tests/indexes/test_timedelta.py | 42 - pandas/tests/indexes/timedeltas/__init__.py | 0 .../tests/indexes/timedeltas/test_astype.py | 121 + .../indexes/timedeltas/test_construction.py | 88 + .../tests/indexes/timedeltas/test_indexing.py | 110 + pandas/tests/indexes/timedeltas/test_ops.py | 1276 ++++++++++ .../timedeltas/test_partial_slicing.py | 81 + .../tests/indexes/timedeltas/test_setops.py | 76 + .../indexes/timedeltas/test_timedelta.py | 592 +++++ .../timedeltas/test_timedelta_range.py | 51 + pandas/tests/indexes/timedeltas/test_tools.py | 201 ++ pandas/tests/scalar/test_timedelta.py | 712 ++++++ pandas/tests/scalar/test_timestamp.py | 45 + pandas/tseries/tests/test_base.py | 846 +------ pandas/tseries/tests/test_daterange.py | 820 ------- pandas/tseries/tests/test_period.py | 119 +- pandas/tseries/tests/test_timedeltas.py | 2051 ----------------- pandas/tseries/tests/test_timezones.py | 70 +- pandas/tseries/tests/test_tslib.py | 694 ------ pandas/tseries/tests/test_util.py | 126 - 26 files changed, 5192 insertions(+), 5080 deletions(-) create mode 100644 pandas/tests/indexes/period/__init__.py create mode 100644 pandas/tests/indexes/period/test_period.py delete mode 100644 pandas/tests/indexes/test_datetimelike.py delete mode 100644 pandas/tests/indexes/test_timedelta.py create mode 100644 pandas/tests/indexes/timedeltas/__init__.py create mode 100644 pandas/tests/indexes/timedeltas/test_astype.py create mode 100644 pandas/tests/indexes/timedeltas/test_construction.py create mode 100644 pandas/tests/indexes/timedeltas/test_indexing.py create mode 100644 pandas/tests/indexes/timedeltas/test_ops.py create mode 100644 pandas/tests/indexes/timedeltas/test_partial_slicing.py create mode 100644 pandas/tests/indexes/timedeltas/test_setops.py create mode 100644 pandas/tests/indexes/timedeltas/test_timedelta.py create mode 100644 pandas/tests/indexes/timedeltas/test_timedelta_range.py create mode 100644 pandas/tests/indexes/timedeltas/test_tools.py delete mode 100644 pandas/tseries/tests/test_daterange.py delete mode 100644 pandas/tseries/tests/test_timedeltas.py delete mode 100644 pandas/tseries/tests/test_tslib.py delete mode 100644 pandas/tseries/tests/test_util.py diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 9d5f397329c76..8dab10269f76d 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -1,12 +1,20 @@ +import numpy as np from datetime import datetime, timedelta, time import pandas as pd import pandas.util.testing as tm -from pandas import date_range, offsets, DatetimeIndex, Timestamp from pandas import compat +from 
pandas.core import common as com +from pandas.util.testing import assertRaisesRegexp +from pandas.tseries.index import bdate_range, cdate_range +from pandas import date_range, offsets, DatetimeIndex, Timestamp, Index +from pandas.tseries.offsets import (generate_range, CDay, BDay, Minute, + BMonthEnd, DateOffset, MonthEnd) from pandas.tests.series.common import TestData +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + class TestTimeSeries(TestData, tm.TestCase): @@ -127,3 +135,803 @@ def test_catch_infinite_loop(self): # blow up, don't loop forever self.assertRaises(Exception, date_range, datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) + + +def eq_gen_range(kwargs, expected): + rng = generate_range(**kwargs) + assert (np.array_equal(list(rng), expected)) + + +class TestGenRangeGeneration(tm.TestCase): + + def test_generate(self): + rng1 = list(generate_range(START, END, offset=BDay())) + rng2 = list(generate_range(START, END, time_rule='B')) + self.assertEqual(rng1, rng2) + + def test_generate_cday(self): + rng1 = list(generate_range(START, END, offset=CDay())) + rng2 = list(generate_range(START, END, time_rule='C')) + self.assertEqual(rng1, rng2) + + def test_1(self): + eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), + [datetime(2009, 3, 25), datetime(2009, 3, 26)]) + + def test_2(self): + eq_gen_range(dict(start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3)), + [datetime(2008, 1, 1), + datetime(2008, 1, 2), + datetime(2008, 1, 3)]) + + def test_3(self): + eq_gen_range(dict(start=datetime(2008, 1, 5), + end=datetime(2008, 1, 6)), + []) + + def test_precision_finer_than_offset(self): + # GH 9907 + result1 = DatetimeIndex(start='2015-04-15 00:00:03', + end='2016-04-22 00:00:00', freq='Q') + result2 = DatetimeIndex(start='2015-04-15 00:00:03', + end='2015-06-22 00:00:04', freq='W') + expected1_list = ['2015-06-30 00:00:03', '2015-09-30 00:00:03', + '2015-12-31 00:00:03', '2016-03-31 00:00:03'] + expected2_list = ['2015-04-19 00:00:03', '2015-04-26 00:00:03', + '2015-05-03 00:00:03', '2015-05-10 00:00:03', + '2015-05-17 00:00:03', '2015-05-24 00:00:03', + '2015-05-31 00:00:03', '2015-06-07 00:00:03', + '2015-06-14 00:00:03', '2015-06-21 00:00:03'] + expected1 = DatetimeIndex(expected1_list, dtype='datetime64[ns]', + freq='Q-DEC', tz=None) + expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]', + freq='W-SUN', tz=None) + self.assert_index_equal(result1, expected1) + self.assert_index_equal(result2, expected2) + + +class TestDateRange(tm.TestCase): + def setUp(self): + self.rng = bdate_range(START, END) + + def test_constructor(self): + bdate_range(START, END, freq=BDay()) + bdate_range(START, periods=20, freq=BDay()) + bdate_range(end=START, periods=20, freq=BDay()) + self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') + self.assertRaises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') + + def test_naive_aware_conflicts(self): + naive = bdate_range(START, END, freq=BDay(), tz=None) + aware = bdate_range(START, END, freq=BDay(), + tz="Asia/Hong_Kong") + assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", naive.join, aware) + assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", aware.join, naive) + + def test_cached_range(self): + DatetimeIndex._cached_range(START, END, offset=BDay()) + DatetimeIndex._cached_range(START, periods=20, offset=BDay()) + DatetimeIndex._cached_range(end=START, periods=20, offset=BDay()) + + assertRaisesRegexp(TypeError, "offset", DatetimeIndex._cached_range, + START, END) + + 
assertRaisesRegexp(TypeError, "specify period", + DatetimeIndex._cached_range, START, + offset=BDay()) + + assertRaisesRegexp(TypeError, "specify period", + DatetimeIndex._cached_range, end=END, + offset=BDay()) + + assertRaisesRegexp(TypeError, "start or end", + DatetimeIndex._cached_range, periods=20, + offset=BDay()) + + def test_cached_range_bug(self): + rng = date_range('2010-09-01 05:00:00', periods=50, + freq=DateOffset(hours=6)) + self.assertEqual(len(rng), 50) + self.assertEqual(rng[0], datetime(2010, 9, 1, 5)) + + def test_timezone_comparaison_bug(self): + start = Timestamp('20130220 10:00', tz='US/Eastern') + try: + date_range(start, periods=2, tz='US/Eastern') + except AssertionError: + self.fail() + + def test_timezone_comparaison_assert(self): + start = Timestamp('20130220 10:00', tz='US/Eastern') + self.assertRaises(AssertionError, date_range, start, periods=2, + tz='Europe/Berlin') + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assertTrue(comp[11]) + self.assertFalse(comp[9]) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assert_index_equal(cp, self.rng) + + def test_repr(self): + # only really care that it works + repr(self.rng) + + def test_getitem(self): + smaller = self.rng[:5] + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) + + self.assertEqual(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEqual(sliced.offset, BDay() * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEqual(len(fancy_indexed), 5) + tm.assertIsInstance(fancy_indexed, DatetimeIndex) + self.assertIsNone(fancy_indexed.freq) + + # 32-bit vs. 64-bit platforms + self.assertEqual(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_numpy_array_equal(values, expected) + + def test_shift(self): + shifted = self.rng.shift(5) + self.assertEqual(shifted[0], self.rng[5]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEqual(shifted[5], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEqual(shifted[0], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=BDay()) + self.assertEqual(shifted[0], rng[0] + BDay()) + + def test_pickle_unpickle(self): + unpickled = self.round_trip_pickle(self.rng) + self.assertIsNotNone(unpickled.offset) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + tm.assertIsInstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # order does not matter + tm.assert_index_equal(right.union(left), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = self.rng.union(rng) + tm.assertIsInstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, 
DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_join = self.rng.join(rng, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + def test_union_not_cacheable(self): + rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_union = rng1.union(rng2) + self.assert_index_equal(the_union, rng) + + rng1 = rng[10:] + rng2 = rng[15:35] + the_union = rng1.union(rng2) + expected = rng[10:] + self.assert_index_equal(the_union, expected) + + def test_intersection(self): + rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_int = rng1.intersection(rng2) + expected = rng[10:25] + self.assert_index_equal(the_int, expected) + tm.assertIsInstance(the_int, DatetimeIndex) + self.assertEqual(the_int.offset, rng.offset) + + the_int = rng1.intersection(rng2.view(DatetimeIndex)) + self.assert_index_equal(the_int, expected) + + # non-overlapping + the_int = rng[:10].intersection(rng[10:]) + expected = DatetimeIndex([]) + self.assert_index_equal(the_int, expected) + + def test_intersection_bug(self): + # GH #771 + a = bdate_range('11/30/2011', '12/31/2011') + b = bdate_range('12/10/2011', '12/20/2011') + result = a.intersection(b) + self.assert_index_equal(result, b) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + + def test_summary_pytz(self): + tm._skip_if_no_pytz() + import pytz + bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_summary_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + bdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = bdate_range(end=end, periods=20) + firstDate = end - 19 * BDay() + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_date_parse_failure(self): + badly_formed_date = '2007/100/1' + + self.assertRaises(ValueError, Timestamp, badly_formed_date) + + self.assertRaises(ValueError, bdate_range, start=badly_formed_date, + periods=10) + self.assertRaises(ValueError, bdate_range, end=badly_formed_date, + periods=10) + self.assertRaises(ValueError, bdate_range, badly_formed_date, + badly_formed_date) + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) + + def test_identical(self): + t1 = self.rng.copy() + t2 = self.rng.copy() + self.assertTrue(t1.identical(t2)) + + # name + t1 = t1.rename('foo') + self.assertTrue(t1.equals(t2)) + self.assertFalse(t1.identical(t2)) + t2 = t2.rename('foo') + self.assertTrue(t1.identical(t2)) + + # freq + t2v = Index(t2.values) + self.assertTrue(t1.equals(t2v)) + self.assertFalse(t1.identical(t2v)) + + def test_daterange_bug_456(self): + # GH #456 + rng1 = bdate_range('12/5/2011', '12/5/2011') + rng2 = bdate_range('12/2/2011', '12/5/2011') + rng2.offset = BDay() + + result = rng1.union(rng2) + tm.assertIsInstance(result, DatetimeIndex) + + def test_error_with_zero_monthends(self): + self.assertRaises(ValueError, date_range, '1/1/2000', '1/1/2001', + freq=MonthEnd(0)) + + def 
test_range_bug(self): + # GH #770 + offset = DateOffset(months=3) + result = date_range("2011-1-1", "2012-1-31", freq=offset) + + start = datetime(2011, 1, 1) + exp_values = [start + i * offset for i in range(5)] + tm.assert_index_equal(result, DatetimeIndex(exp_values)) + + def test_range_tz_pytz(self): + # GH 2906 + tm._skip_if_no_pytz() + from pytz import timezone + + tz = timezone('US/Eastern') + start = tz.localize(datetime(2011, 1, 1)) + end = tz.localize(datetime(2011, 1, 3)) + + dr = date_range(start=start, periods=3) + self.assertEqual(dr.tz.zone, tz.zone) + self.assertEqual(dr[0], start) + self.assertEqual(dr[2], end) + + dr = date_range(end=end, periods=3) + self.assertEqual(dr.tz.zone, tz.zone) + self.assertEqual(dr[0], start) + self.assertEqual(dr[2], end) + + dr = date_range(start=start, end=end) + self.assertEqual(dr.tz.zone, tz.zone) + self.assertEqual(dr[0], start) + self.assertEqual(dr[2], end) + + def test_range_tz_dst_straddle_pytz(self): + + tm._skip_if_no_pytz() + from pytz import timezone + tz = timezone('US/Eastern') + dates = [(tz.localize(datetime(2014, 3, 6)), + tz.localize(datetime(2014, 3, 12))), + (tz.localize(datetime(2013, 11, 1)), + tz.localize(datetime(2013, 11, 6)))] + for (start, end) in dates: + dr = date_range(start, end, freq='D') + self.assertEqual(dr[0], start) + self.assertEqual(dr[-1], end) + self.assertEqual(np.all(dr.hour == 0), True) + + dr = date_range(start, end, freq='D', tz='US/Eastern') + self.assertEqual(dr[0], start) + self.assertEqual(dr[-1], end) + self.assertEqual(np.all(dr.hour == 0), True) + + dr = date_range(start.replace(tzinfo=None), end.replace( + tzinfo=None), freq='D', tz='US/Eastern') + self.assertEqual(dr[0], start) + self.assertEqual(dr[-1], end) + self.assertEqual(np.all(dr.hour == 0), True) + + def test_range_tz_dateutil(self): + # GH 2906 + tm._skip_if_no_dateutil() + # Use maybe_get_tz to fix filename in tz under dateutil. 
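+        # A rough sketch of what the helper below gives us (assuming
+        # tslib's resolution rules): maybe_get_tz('dateutil/US/Eastern')
+        # should behave like dateutil.tz.gettz('US/Eastern'), while a
+        # bare 'US/Eastern' string would be resolved through pytz instead.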
+ from pandas.tslib import maybe_get_tz + tz = lambda x: maybe_get_tz('dateutil/' + x) + + start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) + end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern')) + + dr = date_range(start=start, periods=3) + self.assertTrue(dr.tz == tz('US/Eastern')) + self.assertTrue(dr[0] == start) + self.assertTrue(dr[2] == end) + + dr = date_range(end=end, periods=3) + self.assertTrue(dr.tz == tz('US/Eastern')) + self.assertTrue(dr[0] == start) + self.assertTrue(dr[2] == end) + + dr = date_range(start=start, end=end) + self.assertTrue(dr.tz == tz('US/Eastern')) + self.assertTrue(dr[0] == start) + self.assertTrue(dr[2] == end) + + def test_month_range_union_tz_pytz(self): + tm._skip_if_no_pytz() + from pytz import timezone + tz = timezone('US/Eastern') + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, + freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, + freq=MonthEnd()) + + early_dr.union(late_dr) + + def test_month_range_union_tz_dateutil(self): + tm._skip_if_windows_python_3() + tm._skip_if_no_dateutil() + from pandas.tslib import _dateutil_gettz as timezone + tz = timezone('US/Eastern') + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, + freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, + freq=MonthEnd()) + + early_dr.union(late_dr) + + def test_range_closed(self): + begin = datetime(2011, 1, 1) + end = datetime(2014, 1, 1) + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq) + left = date_range(begin, end, closed="left", freq=freq) + right = date_range(begin, end, closed="right", freq=freq) + expected_left = left + expected_right = right + + if end == closed[-1]: + expected_left = closed[:-1] + if begin == closed[0]: + expected_right = closed[1:] + + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) + + def test_range_closed_with_tz_aware_start_end(self): + # GH12409, GH12684 + begin = Timestamp('2011/1/1', tz='US/Eastern') + end = Timestamp('2014/1/1', tz='US/Eastern') + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq) + left = date_range(begin, end, closed="left", freq=freq) + right = date_range(begin, end, closed="right", freq=freq) + expected_left = left + expected_right = right + + if end == closed[-1]: + expected_left = closed[:-1] + if begin == closed[0]: + expected_right = closed[1:] + + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) + + begin = Timestamp('2011/1/1') + end = Timestamp('2014/1/1') + begintz = Timestamp('2011/1/1', tz='US/Eastern') + endtz = Timestamp('2014/1/1', tz='US/Eastern') + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq, + tz='US/Eastern') + left = date_range(begin, end, closed="left", freq=freq, + tz='US/Eastern') + right = date_range(begin, end, closed="right", freq=freq, + tz='US/Eastern') + expected_left = left + expected_right = right + + if endtz == closed[-1]: + expected_left = closed[:-1] + if begintz == closed[0]: + expected_right = closed[1:] + + 
self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) + + def test_range_closed_boundary(self): + # GH 11804 + for closed in ['right', 'left', None]: + right_boundary = date_range('2015-09-12', '2015-12-01', + freq='QS-MAR', closed=closed) + left_boundary = date_range('2015-09-01', '2015-09-12', + freq='QS-MAR', closed=closed) + both_boundary = date_range('2015-09-01', '2015-12-01', + freq='QS-MAR', closed=closed) + expected_right = expected_left = expected_both = both_boundary + + if closed == 'right': + expected_left = both_boundary[1:] + if closed == 'left': + expected_right = both_boundary[:-1] + if closed is None: + expected_right = both_boundary[1:] + expected_left = both_boundary[:-1] + + self.assert_index_equal(right_boundary, expected_right) + self.assert_index_equal(left_boundary, expected_left) + self.assert_index_equal(both_boundary, expected_both) + + def test_years_only(self): + # GH 6961 + dr = date_range('2014', '2015', freq='M') + self.assertEqual(dr[0], datetime(2014, 1, 31)) + self.assertEqual(dr[-1], datetime(2014, 12, 31)) + + def test_freq_divides_end_in_nanos(self): + # GH 10885 + result_1 = date_range('2005-01-12 10:00', '2005-01-12 16:00', + freq='345min') + result_2 = date_range('2005-01-13 10:00', '2005-01-13 16:00', + freq='345min') + expected_1 = DatetimeIndex(['2005-01-12 10:00:00', + '2005-01-12 15:45:00'], + dtype='datetime64[ns]', freq='345T', + tz=None) + expected_2 = DatetimeIndex(['2005-01-13 10:00:00', + '2005-01-13 15:45:00'], + dtype='datetime64[ns]', freq='345T', + tz=None) + self.assert_index_equal(result_1, expected_1) + self.assert_index_equal(result_2, expected_2) + + +class TestCustomDateRange(tm.TestCase): + def setUp(self): + self.rng = cdate_range(START, END) + + def test_constructor(self): + cdate_range(START, END, freq=CDay()) + cdate_range(START, periods=20, freq=CDay()) + cdate_range(end=START, periods=20, freq=CDay()) + self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'C') + self.assertRaises(ValueError, cdate_range, '2011-1-1', '2012-1-1', 'C') + + def test_cached_range(self): + DatetimeIndex._cached_range(START, END, offset=CDay()) + DatetimeIndex._cached_range(START, periods=20, + offset=CDay()) + DatetimeIndex._cached_range(end=START, periods=20, + offset=CDay()) + + self.assertRaises(Exception, DatetimeIndex._cached_range, START, END) + + self.assertRaises(Exception, DatetimeIndex._cached_range, START, + freq=CDay()) + + self.assertRaises(Exception, DatetimeIndex._cached_range, end=END, + freq=CDay()) + + self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, + freq=CDay()) + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assertTrue(comp[11]) + self.assertFalse(comp[9]) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assert_index_equal(cp, self.rng) + + def test_repr(self): + # only really care that it works + repr(self.rng) + + def test_getitem(self): + smaller = self.rng[:5] + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) + self.assertEqual(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEqual(sliced.offset, CDay() * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEqual(len(fancy_indexed), 5) + tm.assertIsInstance(fancy_indexed, DatetimeIndex) + self.assertIsNone(fancy_indexed.freq) + + # 32-bit vs. 
64-bit platforms + self.assertEqual(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_numpy_array_equal(values, expected) + + def test_shift(self): + + shifted = self.rng.shift(5) + self.assertEqual(shifted[0], self.rng[5]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEqual(shifted[5], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEqual(shifted[0], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + with tm.assert_produces_warning(com.PerformanceWarning): + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=CDay()) + self.assertEqual(shifted[0], rng[0] + CDay()) + + def test_pickle_unpickle(self): + unpickled = self.round_trip_pickle(self.rng) + self.assertIsNotNone(unpickled.offset) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + tm.assertIsInstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # order does not matter + self.assert_index_equal(right.union(left), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = self.rng.union(rng) + tm.assertIsInstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_join = self.rng.join(rng, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + def test_intersection_bug(self): + # GH #771 + a = cdate_range('11/30/2011', '12/31/2011') + b = cdate_range('12/10/2011', '12/20/2011') + result = a.intersection(b) + self.assert_index_equal(result, b) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + + def test_summary_pytz(self): + tm._skip_if_no_pytz() + import pytz + cdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_summary_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + cdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = cdate_range(end=end, periods=20) + firstDate = end - 19 * CDay() + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_date_parse_failure(self): + badly_formed_date = '2007/100/1' + + self.assertRaises(ValueError, Timestamp, badly_formed_date) + + self.assertRaises(ValueError, cdate_range, start=badly_formed_date, + periods=10) + 
self.assertRaises(ValueError, cdate_range, end=badly_formed_date, + periods=10) + self.assertRaises(ValueError, cdate_range, badly_formed_date, + badly_formed_date) + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) + + def test_daterange_bug_456(self): + # GH #456 + rng1 = cdate_range('12/5/2011', '12/5/2011') + rng2 = cdate_range('12/2/2011', '12/5/2011') + rng2.offset = CDay() + + result = rng1.union(rng2) + tm.assertIsInstance(result, DatetimeIndex) + + def test_cdaterange(self): + rng = cdate_range('2013-05-01', periods=3) + xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) + self.assert_index_equal(xp, rng) + + def test_cdaterange_weekmask(self): + rng = cdate_range('2013-05-01', periods=3, + weekmask='Sun Mon Tue Wed Thu') + xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) + self.assert_index_equal(xp, rng) + + def test_cdaterange_holidays(self): + rng = cdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) + xp = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) + self.assert_index_equal(xp, rng) + + def test_cdaterange_weekmask_and_holidays(self): + rng = cdate_range('2013-05-01', periods=3, + weekmask='Sun Mon Tue Wed Thu', + holidays=['2013-05-01']) + xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) + self.assert_index_equal(xp, rng) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index a46980a0f742a..c7cdcd9318a0e 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -8,7 +8,7 @@ from pandas.core.common import PerformanceWarning from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, date_range, TimedeltaIndex, _np_version_under1p10, Index, - datetime, Float64Index) + datetime, Float64Index, offsets) from pandas.tests.test_base import Ops @@ -1070,3 +1070,18 @@ def test_datetime64_with_DateOffset(self): assert_func(klass([x + op for x in s]), s + op) assert_func(klass([x - op for x in s]), s - op) assert_func(klass([op + x for x in s]), op + s) + + +class TestTslib(tm.TestCase): + + def test_shift_months(self): + s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp( + '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( + '2000-02-29'), Timestamp('2000-12-31')]) + for years in [-1, 0, 1]: + for months in [-2, 0, 2]: + actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + + months)) + expected = DatetimeIndex([x + offsets.DateOffset( + years=years, months=months) for x in s]) + tm.assert_index_equal(actual, expected) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 841d0be605058..bf1f82b90d5d6 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1,22 +1,26 @@ """ test to_datetime """ -import nose - import sys -import calendar +import nose import locale -from datetime import datetime - +import calendar import numpy as np -from pandas.types.common import is_datetime64_ns_dtype -from pandas import (isnull, to_datetime, Timestamp, Series, DataFrame, - Index, DatetimeIndex, NaT, date_range, bdate_range) -from pandas import tslib -from pandas.compat import lmap +from datetime import datetime, date, time +from distutils.version import LooseVersion + import pandas as pd +from pandas import tslib from pandas.tseries import tools +from pandas.tseries.tools import normalize_date +from pandas.tseries.util import pivot_annual, 
isleapyear +from pandas.compat import lmap +from pandas.compat.numpy import np_array_datetime64_compat +from pandas.types.common import is_datetime64_ns_dtype from pandas.util import testing as tm -from pandas.util.testing import assert_series_equal +from pandas.util.testing import assert_series_equal, _skip_if_has_locale +from pandas import (isnull, to_datetime, Timestamp, Series, DataFrame, + Index, DatetimeIndex, NaT, date_range, bdate_range, + compat, lib) class TimeConversionFormats(tm.TestCase): @@ -1017,3 +1021,603 @@ def test_day_not_in_month_ignore(self): '2015-02-32', errors='ignore', format="%Y-%m-%d"), '2015-02-32') self.assertEqual(to_datetime( '2015-04-31', errors='ignore', format="%Y-%m-%d"), '2015-04-31') + + +class TestDatetimeParsingWrappers(tm.TestCase): + def test_does_not_convert_mixed_integer(self): + bad_date_strings = ('-50000', '999', '123.1234', 'm', 'T') + + for bad_date_string in bad_date_strings: + self.assertFalse(tslib._does_string_look_like_datetime( + bad_date_string)) + + good_date_strings = ('2012-01-01', + '01/01/2012', + 'Mon Sep 16, 2013', + '01012012', + '0101', + '1-1', ) + + for good_date_string in good_date_strings: + self.assertTrue(tslib._does_string_look_like_datetime( + good_date_string)) + + def test_parsers(self): + + # https://github.com/dateutil/dateutil/issues/217 + import dateutil + yearfirst = dateutil.__version__ >= LooseVersion('2.5.0') + + cases = {'2011-01-01': datetime(2011, 1, 1), + '2Q2005': datetime(2005, 4, 1), + '2Q05': datetime(2005, 4, 1), + '2005Q1': datetime(2005, 1, 1), + '05Q1': datetime(2005, 1, 1), + '2011Q3': datetime(2011, 7, 1), + '11Q3': datetime(2011, 7, 1), + '3Q2011': datetime(2011, 7, 1), + '3Q11': datetime(2011, 7, 1), + + # quarterly without space + '2000Q4': datetime(2000, 10, 1), + '00Q4': datetime(2000, 10, 1), + '4Q2000': datetime(2000, 10, 1), + '4Q00': datetime(2000, 10, 1), + '2000q4': datetime(2000, 10, 1), + '2000-Q4': datetime(2000, 10, 1), + '00-Q4': datetime(2000, 10, 1), + '4Q-2000': datetime(2000, 10, 1), + '4Q-00': datetime(2000, 10, 1), + '00q4': datetime(2000, 10, 1), + '2005': datetime(2005, 1, 1), + '2005-11': datetime(2005, 11, 1), + '2005 11': datetime(2005, 11, 1), + '11-2005': datetime(2005, 11, 1), + '11 2005': datetime(2005, 11, 1), + '200511': datetime(2020, 5, 11), + '20051109': datetime(2005, 11, 9), + '20051109 10:15': datetime(2005, 11, 9, 10, 15), + '20051109 08H': datetime(2005, 11, 9, 8, 0), + '2005-11-09 10:15': datetime(2005, 11, 9, 10, 15), + '2005-11-09 08H': datetime(2005, 11, 9, 8, 0), + '2005/11/09 10:15': datetime(2005, 11, 9, 10, 15), + '2005/11/09 08H': datetime(2005, 11, 9, 8, 0), + "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, + 36, 28), + "Thu Sep 25 2003": datetime(2003, 9, 25), + "Sep 25 2003": datetime(2003, 9, 25), + "January 1 2014": datetime(2014, 1, 1), + + # GH 10537 + '2014-06': datetime(2014, 6, 1), + '06-2014': datetime(2014, 6, 1), + '2014-6': datetime(2014, 6, 1), + '6-2014': datetime(2014, 6, 1), + + '20010101 12': datetime(2001, 1, 1, 12), + '20010101 1234': datetime(2001, 1, 1, 12, 34), + '20010101 123456': datetime(2001, 1, 1, 12, 34, 56), + } + + for date_str, expected in compat.iteritems(cases): + result1, _, _ = tools.parse_time_string(date_str, + yearfirst=yearfirst) + result2 = to_datetime(date_str, yearfirst=yearfirst) + result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below + result4 = to_datetime(np.array([date_str], dtype=object), + yearfirst=yearfirst) + result6 = DatetimeIndex([date_str], 
yearfirst=yearfirst)
+            # result7 is used below
+            result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst)
+            result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst)
+
+            for res in [result1, result2]:
+                self.assertEqual(res, expected)
+            for res in [result3, result4, result6, result8, result9]:
+                exp = DatetimeIndex([pd.Timestamp(expected)])
+                tm.assert_index_equal(res, exp)
+
+            # these really need to have yearfirst, but we don't support it
+            if not yearfirst:
+                result5 = Timestamp(date_str)
+                self.assertEqual(result5, expected)
+                result7 = date_range(date_str, freq='S', periods=1,
+                                     yearfirst=yearfirst)
+                self.assertEqual(result7, expected)
+
+        # NaT
+        result1, _, _ = tools.parse_time_string('NaT')
+        result2 = to_datetime('NaT')
+        result3 = Timestamp('NaT')
+        result4 = DatetimeIndex(['NaT'])[0]
+        self.assertTrue(result1 is tslib.NaT)
+        self.assertTrue(result2 is tslib.NaT)
+        self.assertTrue(result3 is tslib.NaT)
+        self.assertTrue(result4 is tslib.NaT)
+
+    def test_parsers_quarter_invalid(self):
+
+        cases = ['2Q 2005', '2Q-200A', '2Q-200', '22Q2005', '6Q-20', '2Q200.']
+        for case in cases:
+            self.assertRaises(ValueError, tools.parse_time_string, case)
+
+    def test_parsers_dayfirst_yearfirst(self):
+        tm._skip_if_no_dateutil()
+
+        # OK
+        # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+        # 2.5.2 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+        # 2.5.3 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+
+        # OK
+        # 2.5.1 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+        # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+        # 2.5.3 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+
+        # bug fix in 2.5.2
+        # 2.5.1 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-11-12 00:00:00
+        # 2.5.2 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00
+        # 2.5.3 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00
+
+        # OK
+        # 2.5.1 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+        # 2.5.2 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+        # 2.5.3 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+
+        # OK
+        # 2.5.1 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.2 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.3 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+
+        # OK
+        # 2.5.1 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+        # 2.5.2 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+        # 2.5.3 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+
+        # revert of bug in 2.5.2
+        # 2.5.1 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00
+        # 2.5.2 20/12/21 [dayfirst=1, yearfirst=1] -> month must be in 1..12
+        # 2.5.3 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00
+
+        # OK
+        # 2.5.1 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.2 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+
+        import dateutil
+        is_lt_253 = dateutil.__version__ < LooseVersion('2.5.3')
+
+        # str : dayfirst, yearfirst, expected
+        cases = {'10-11-12': [(False, False,
+                               datetime(2012, 10, 11)),
+                              (True, False,
+                               datetime(2012, 11, 10)),
+                              (False, True,
+                               datetime(2010, 11, 12)),
+                              (True, True,
+                               datetime(2010, 12, 11))],
+                 '20/12/21': [(False, False,
+                               datetime(2021, 12, 20)),
+                              (True, False,
+                               datetime(2021, 12, 20)),
+                              (False, True,
+                               datetime(2020, 12, 21)),
+                              (True, True,
+                               datetime(2020, 12, 21))]}
+
+        from dateutil.parser import parse
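+        # Reading the matrix above for '10-11-12': dayfirst=False with
+        # yearfirst=False parses month-day-year -> datetime(2012, 10, 11),
+        # while yearfirst=True reads the leading field as the year
+        # -> datetime(2010, 11, 12).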
+        for date_str, values in compat.iteritems(cases):
+            for dayfirst, yearfirst, expected in values:
+
+                # odd comparisons across versions
+                # let's just skip
+                if dayfirst and yearfirst and is_lt_253:
+                    continue
+
+                # compare with dateutil result
+                dateutil_result = parse(date_str, dayfirst=dayfirst,
+                                        yearfirst=yearfirst)
+                self.assertEqual(dateutil_result, expected)
+
+                result1, _, _ = tools.parse_time_string(date_str,
+                                                        dayfirst=dayfirst,
+                                                        yearfirst=yearfirst)
+
+                # we don't support dayfirst/yearfirst here:
+                if not dayfirst and not yearfirst:
+                    result2 = Timestamp(date_str)
+                    self.assertEqual(result2, expected)
+
+                result3 = to_datetime(date_str, dayfirst=dayfirst,
+                                      yearfirst=yearfirst)
+
+                result4 = DatetimeIndex([date_str], dayfirst=dayfirst,
+                                        yearfirst=yearfirst)[0]
+
+                self.assertEqual(result1, expected)
+                self.assertEqual(result3, expected)
+                self.assertEqual(result4, expected)
+
+    def test_parsers_timestring(self):
+        tm._skip_if_no_dateutil()
+        from dateutil.parser import parse
+
+        # must be the same as dateutil result
+        cases = {'10:15': (parse('10:15'), datetime(1, 1, 1, 10, 15)),
+                 '9:05': (parse('9:05'), datetime(1, 1, 1, 9, 5))}
+
+        for date_str, (exp_now, exp_def) in compat.iteritems(cases):
+            result1, _, _ = tools.parse_time_string(date_str)
+            result2 = to_datetime(date_str)
+            result3 = to_datetime([date_str])
+            result4 = Timestamp(date_str)
+            result5 = DatetimeIndex([date_str])[0]
+            # parse_time_string() anchors the time on the default date
+            # (year 1); the others use the current date, and this cannot
+            # be changed because time series plotting relies on it
+            self.assertEqual(result1, exp_def)
+            self.assertEqual(result2, exp_now)
+            self.assertEqual(result3, exp_now)
+            self.assertEqual(result4, exp_now)
+            self.assertEqual(result5, exp_now)
+
+    def test_parsers_time(self):
+        # GH11818
+        _skip_if_has_locale()
+        strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500",
+                   "2:15:00pm", "021500pm", time(14, 15)]
+        expected = time(14, 15)
+
+        for time_string in strings:
+            self.assertEqual(tools.to_time(time_string), expected)
+
+        new_string = "14.15"
+        self.assertRaises(ValueError, tools.to_time, new_string)
+        self.assertEqual(tools.to_time(new_string, format="%H.%M"), expected)
+
+        arg = ["14:15", "20:20"]
+        expected_arr = [time(14, 15), time(20, 20)]
+        self.assertEqual(tools.to_time(arg), expected_arr)
+        self.assertEqual(tools.to_time(arg, format="%H:%M"), expected_arr)
+        self.assertEqual(tools.to_time(arg, infer_time_format=True),
+                         expected_arr)
+        self.assertEqual(tools.to_time(arg, format="%I:%M%p", errors="coerce"),
+                         [None, None])
+
+        res = tools.to_time(arg, format="%I:%M%p", errors="ignore")
+        self.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_))
+
+        with tm.assertRaises(ValueError):
+            tools.to_time(arg, format="%I:%M%p", errors="raise")
+
+        self.assert_series_equal(tools.to_time(Series(arg, name="test")),
+                                 Series(expected_arr, name="test"))
+
+        res = tools.to_time(np.array(arg))
+        self.assertIsInstance(res, list)
+        self.assert_equal(res, expected_arr)
+
+    def test_parsers_monthfreq(self):
+        cases = {'201101': datetime(2011, 1, 1, 0, 0),
+                 '200005': datetime(2000, 5, 1, 0, 0)}
+
+        for date_str, expected in compat.iteritems(cases):
+            result1, _, _ = tools.parse_time_string(date_str, freq='M')
+            self.assertEqual(result1, expected)
+
+    def test_parsers_quarterly_with_freq(self):
+        msg = ('Incorrect quarterly string is given, quarter '
+               'must be between 1 and 4: 2013Q5')
+        with tm.assertRaisesRegexp(tslib.DateParseError, msg):
+            tools.parse_time_string('2013Q5')
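+
+        # Worked example for the fiscal-year cases below: with freq
+        # 'A-APR' the fiscal year 2013 ends in April 2013, so its Q2
+        # covers Aug-Oct 2012 and '2013Q2' parses to the quarter start,
+        # datetime(2012, 8, 1).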
+
+        # GH 5418
+        msg = ('Unable to retrieve month information from given freq: '
+               'INVLD-L-DEC-SAT')
+        with tm.assertRaisesRegexp(tslib.DateParseError, msg):
+            tools.parse_time_string('2013Q1', freq='INVLD-L-DEC-SAT')
+
+        cases = {('2013Q2', None): datetime(2013, 4, 1),
+                 ('2013Q2', 'A-APR'): datetime(2012, 8, 1),
+                 ('2013-Q2', 'A-DEC'): datetime(2013, 4, 1)}
+
+        for (date_str, freq), exp in compat.iteritems(cases):
+            result, _, _ = tools.parse_time_string(date_str, freq=freq)
+            self.assertEqual(result, exp)
+
+    def test_parsers_timezone_minute_offsets_roundtrip(self):
+        # GH11708
+        base = to_datetime("2013-01-01 00:00:00")
+        dt_strings = [
+            ('2013-01-01 05:45+0545',
+             "Asia/Katmandu",
+             "Timestamp('2013-01-01 05:45:00+0545', tz='Asia/Katmandu')"),
+            ('2013-01-01 05:30+0530',
+             "Asia/Kolkata",
+             "Timestamp('2013-01-01 05:30:00+0530', tz='Asia/Kolkata')")
+        ]
+
+        for dt_string, tz, dt_string_repr in dt_strings:
+            dt_time = to_datetime(dt_string)
+            self.assertEqual(base, dt_time)
+            converted_time = dt_time.tz_localize('UTC').tz_convert(tz)
+            self.assertEqual(dt_string_repr, repr(converted_time))
+
+    def test_parsers_iso8601(self):
+        # GH 12060
+        # test only the iso parser - flexibility to different
+        # separators and leading 0s
+        # Timestamp construction falls back to dateutil
+        cases = {'2011-01-02': datetime(2011, 1, 2),
+                 '2011-1-2': datetime(2011, 1, 2),
+                 '2011-01': datetime(2011, 1, 1),
+                 '2011-1': datetime(2011, 1, 1),
+                 '2011 01 02': datetime(2011, 1, 2),
+                 '2011.01.02': datetime(2011, 1, 2),
+                 '2011/01/02': datetime(2011, 1, 2),
+                 '2011\\01\\02': datetime(2011, 1, 2),
+                 '2013-01-01 05:30:00': datetime(2013, 1, 1, 5, 30),
+                 '2013-1-1 5:30:00': datetime(2013, 1, 1, 5, 30)}
+        for date_str, exp in compat.iteritems(cases):
+            actual = tslib._test_parse_iso8601(date_str)
+            self.assertEqual(actual, exp)
+
+        # separators must all match - YYYYMM not valid
+        invalid_cases = ['2011-01/02', '2011^11^11',
+                         '201401', '201111', '200101',
+                         # mixed separated and unseparated
+                         '2005-0101', '200501-01',
+                         '20010101 12:3456', '20010101 1234:56',
+                         # HHMMSS must have two digits in each component
+                         # if unseparated
+                         '20010101 1', '20010101 123', '20010101 12345',
+                         '20010101 12345Z',
+                         # wrong separator for HHMMSS
+                         '2001-01-01 12-34-56']
+        for date_str in invalid_cases:
+            with tm.assertRaises(ValueError):
+                tslib._test_parse_iso8601(date_str)
+                # If no ValueError is raised, report which case failed.
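+                # e.g. '2011-01-02' succeeds in the valid cases above
+                # because its separators agree, while '2011-01/02'
+                # mixes '-' and '/' and must end up here.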
+                raise Exception(date_str)
+
+
+class TestTsUtil(tm.TestCase):
+
+    def test_try_parse_dates(self):
+        from dateutil.parser import parse
+        arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object)
+
+        result = lib.try_parse_dates(arr, dayfirst=True)
+        expected = [parse(d, dayfirst=True) for d in arr]
+        self.assertTrue(np.array_equal(result, expected))
+
+
+class TestArrayToDatetime(tm.TestCase):
+    def test_parsing_valid_dates(self):
+        arr = np.array(['01-01-2013', '01-02-2013'], dtype=object)
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr),
+            np_array_datetime64_compat(
+                [
+                    '2013-01-01T00:00:00.000000000-0000',
+                    '2013-01-02T00:00:00.000000000-0000'
+                ],
+                dtype='M8[ns]'
+            )
+        )
+
+        arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object)
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr),
+            np_array_datetime64_compat(
+                [
+                    '2013-09-16T00:00:00.000000000-0000',
+                    '2013-09-17T00:00:00.000000000-0000'
+                ],
+                dtype='M8[ns]'
+            )
+        )
+
+    def test_parsing_timezone_offsets(self):
+        # All of these datetime strings with offsets are equivalent
+        # to the same datetime after the timezone offset is added
+        dt_strings = [
+            '01-01-2013 08:00:00+08:00',
+            '2013-01-01T08:00:00.000000000+0800',
+            '2012-12-31T16:00:00.000000000-0800',
+            '12-31-2012 23:00:00-01:00'
+        ]
+
+        expected_output = tslib.array_to_datetime(np.array(
+            ['01-01-2013 00:00:00'], dtype=object))
+
+        for dt_string in dt_strings:
+            self.assert_numpy_array_equal(
+                tslib.array_to_datetime(
+                    np.array([dt_string], dtype=object)
+                ),
+                expected_output
+            )
+
+    def test_number_looking_strings_not_into_datetime(self):
+        # GH 4601
+        # These strings don't look like datetimes, so they should not
+        # be converted
+        arr = np.array(['-352.737091', '183.575577'], dtype=object)
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr, errors='ignore'), arr)
+
+        arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr, errors='ignore'), arr)
+
+    def test_coercing_dates_outside_of_datetime64_ns_bounds(self):
+        invalid_dates = [
+            date(1000, 1, 1),
+            datetime(1000, 1, 1),
+            '1000-01-01',
+            'Jan 1, 1000',
+            np.datetime64('1000-01-01'),
+        ]
+
+        for invalid_date in invalid_dates:
+            self.assertRaises(ValueError,
+                              tslib.array_to_datetime,
+                              np.array(
+                                  [invalid_date], dtype='object'),
+                              errors='raise', )
+            self.assert_numpy_array_equal(
+                tslib.array_to_datetime(
+                    np.array([invalid_date], dtype='object'),
+                    errors='coerce'),
+                np.array([tslib.iNaT], dtype='M8[ns]')
+            )
+
+        arr = np.array(['1/1/1000', '1/1/2000'], dtype=object)
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr, errors='coerce'),
+            np_array_datetime64_compat(
+                [
+                    tslib.iNaT,
+                    '2000-01-01T00:00:00.000000000-0000'
+                ],
+                dtype='M8[ns]'
+            )
+        )
+
+    def test_coerce_of_invalid_datetimes(self):
+        arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object)
+
+        # Without coercing, the presence of any invalid dates prevents
+        # any values from being converted
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr, errors='ignore'), arr)
+
+        # With coercing, the invalid dates become iNaT
+        self.assert_numpy_array_equal(
+            tslib.array_to_datetime(arr, errors='coerce'),
+            np_array_datetime64_compat(
+                [
+                    '2013-01-01T00:00:00.000000000-0000',
+                    tslib.iNaT,
+                    tslib.iNaT
+                ],
+                dtype='M8[ns]'
+            )
+        )
+
+
+class TestPivotAnnual(tm.TestCase):
+    """
+    Tests for the pandas port of scikits.timeseries' pivot_annual
+    """
+
+    def test_daily(self):
+        rng = 
date_range('1/1/2000', '12/31/2004', freq='D') + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'D') + + doy = ts.index.dayofyear + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 + + for i in range(1, 367): + subset = ts[doy == i] + subset.index = [x.year for x in subset.index] + + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + self.assertEqual(result.name, i) + + # check leap days + leaps = ts[(ts.index.month == 2) & (ts.index.day == 29)] + day = leaps.index.dayofyear[0] + leaps.index = leaps.index.year + leaps.name = 60 + tm.assert_series_equal(annual[day].dropna(), leaps) + + def test_hourly(self): + rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24), + freq='H') + data_hourly = np.random.randint(100, 350, rng_hourly.size) + ts_hourly = Series(data_hourly, index=rng_hourly) + + grouped = ts_hourly.groupby(ts_hourly.index.year) + hoy = grouped.apply(lambda x: x.reset_index(drop=True)) + hoy = hoy.index.droplevel(0).values + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 + hoy += 1 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts_hourly) + + ts_hourly = ts_hourly.astype(float) + for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: + subset = ts_hourly[hoy == i] + subset.index = [x.year for x in subset.index] + + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + self.assertEqual(result.name, i) + + leaps = ts_hourly[(ts_hourly.index.month == 2) & ( + ts_hourly.index.day == 29) & (ts_hourly.index.hour == 0)] + hour = leaps.index.dayofyear[0] * 24 - 23 + leaps.index = leaps.index.year + leaps.name = 1417 + tm.assert_series_equal(annual[hour].dropna(), leaps) + + def test_weekly(self): + pass + + def test_monthly(self): + rng = date_range('1/1/2000', '12/31/2004', freq='M') + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'M') + + month = ts.index.month + for i in range(1, 13): + subset = ts[month == i] + subset.index = [x.year for x in subset.index] + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + self.assertEqual(result.name, i) + + def test_period_monthly(self): + pass + + def test_period_daily(self): + pass + + def test_period_weekly(self): + pass + + def test_isleapyear_deprecate(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2000)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertFalse(isleapyear(2001)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2004)) + + +def test_normalize_date(): + value = date(2012, 9, 7) + + result = normalize_date(value) + assert (result == datetime(2012, 9, 7)) + + value = datetime(2012, 9, 7, 12) + + result = normalize_date(value) + assert (result == datetime(2012, 9, 7)) diff --git a/pandas/tests/indexes/period/__init__.py b/pandas/tests/indexes/period/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py new file 
mode 100644 index 0000000000000..33653c92da719 --- /dev/null +++ b/pandas/tests/indexes/period/test_period.py @@ -0,0 +1,233 @@ +import numpy as np +from datetime import timedelta + +import pandas as pd +from pandas.util import testing as tm +from pandas import (PeriodIndex, period_range, notnull, DatetimeIndex, NaT, + Index, Period, Int64Index) + +from ..datetimelike import DatetimeLike + + +class TestPeriodIndex(DatetimeLike, tm.TestCase): + _holder = PeriodIndex + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(index=tm.makePeriodIndex(10)) + self.setup_indices() + + def create_index(self): + return period_range('20130101', periods=5, freq='D') + + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='D')] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.Index(np.array(arr), dtype=object)) + + def test_astype(self): + # GH 13149, GH 13209 + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + + result = idx.astype(object) + expected = Index([Period('2016-05-16', freq='D')] + + [Period(NaT, freq='D')] * 3, dtype='object') + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index([16937] + [-9223372036854775808] * 3, + dtype=np.int64) + tm.assert_index_equal(result, expected) + + idx = period_range('1990', '2009', freq='A') + result = idx.astype('i8') + self.assert_index_equal(result, Index(idx.asi8)) + self.assert_numpy_array_equal(result.values, idx.asi8) + + def test_astype_raises(self): + # GH 13149, GH 13209 + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + + self.assertRaises(ValueError, idx.astype, str) + self.assertRaises(ValueError, idx.astype, float) + self.assertRaises(ValueError, idx.astype, 'timedelta64') + self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') + + def test_shift(self): + + # test shift for PeriodIndex + # GH8083 + drange = self.create_index() + result = drange.shift(1) + expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', + '2013-01-05', '2013-01-06'], freq='D') + self.assert_index_equal(result, expected) + + def test_pickle_compat_construction(self): + pass + + def test_get_loc(self): + idx = pd.period_range('2000-01-01', periods=3) + + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(idx[1], method), 1) + self.assertEqual( + idx.get_loc(idx[1].asfreq('H', how='start'), method), 1) + self.assertEqual(idx.get_loc(idx[1].to_timestamp(), method), 1) + self.assertEqual( + idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method), 1) + self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + + idx = pd.period_range('2000-01-01', periods=5)[::2] + self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance='1 day'), 1) + self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance=pd.Timedelta('1D')), 1) + self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance=np.timedelta64(1, 'D')), 1) + 
self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance=timedelta(1)), 1) + with tm.assertRaisesRegexp(ValueError, 'must be convertible'): + idx.get_loc('2000-01-10', method='nearest', tolerance='foo') + + msg = 'Input has different freq from PeriodIndex\\(freq=D\\)' + with tm.assertRaisesRegexp(ValueError, msg): + idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour') + with tm.assertRaises(KeyError): + idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') + + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + + def test_where_other(self): + + i = self.create_index() + for arr in [np.nan, pd.NaT]: + result = i.where(notnull(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2), i2.values) + tm.assert_index_equal(result, i2) + + def test_get_indexer(self): + idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') + tm.assert_numpy_array_equal(idx.get_indexer(idx), + np.array([0, 1, 2], dtype=np.intp)) + + target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', + '2000-01-02T01'], freq='H') + tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), + np.array([-1, 0, 1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), + np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), + np.array([0, 1, 1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', + tolerance='1 hour'), + np.array([0, -1, 1], dtype=np.intp)) + + msg = 'Input has different freq from PeriodIndex\\(freq=H\\)' + with self.assertRaisesRegexp(ValueError, msg): + idx.get_indexer(target, 'nearest', tolerance='1 minute') + + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', + tolerance='1 day'), + np.array([0, 1, 1], dtype=np.intp)) + + def test_repeat(self): + # GH10183 + idx = pd.period_range('2000-01-01', periods=3, freq='D') + res = idx.repeat(3) + exp = PeriodIndex(idx.values.repeat(3), freq='D') + self.assert_index_equal(res, exp) + self.assertEqual(res.freqstr, 'D') + + def test_period_index_indexer(self): + # GH4125 + idx = pd.period_range('2002-01', '2003-12', freq='M') + df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) + self.assert_frame_equal(df, df.loc[idx]) + self.assert_frame_equal(df, df.loc[list(idx)]) + self.assert_frame_equal(df, df.loc[list(idx)]) + self.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) + self.assert_frame_equal(df, df.loc[list(idx)]) + + def test_fillna_period(self): + # GH 11343 + idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], freq='H') + + exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H') + self.assert_index_equal( + idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) + + exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), 'x', + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna('x'), exp) + + exp = pd.Index([pd.Period('2011-01-01 
09:00', freq='H'), + pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna(pd.Period('2011-01-01', freq='D')), + exp) + + def test_no_millisecond_field(self): + with self.assertRaises(AttributeError): + DatetimeIndex.millisecond + + with self.assertRaises(AttributeError): + DatetimeIndex([]).millisecond + + def test_difference_freq(self): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq='D') + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = PeriodIndex(["20160920", "20160921"], freq='D') + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py deleted file mode 100644 index b212a7b75904c..0000000000000 --- a/pandas/tests/indexes/test_datetimelike.py +++ /dev/null @@ -1,465 +0,0 @@ -# -*- coding: utf-8 -*- - -import numpy as np -from datetime import timedelta - -import pandas as pd -from pandas.util import testing as tm -from pandas import (DatetimeIndex, Float64Index, Index, Int64Index, - NaT, Period, PeriodIndex, Series, Timedelta, - TimedeltaIndex, period_range, - timedelta_range, notnull) - - -from .datetimelike import DatetimeLike - - -class TestPeriodIndex(DatetimeLike, tm.TestCase): - _holder = PeriodIndex - - def setUp(self): - self.indices = dict(index=tm.makePeriodIndex(10)) - self.setup_indices() - - def create_index(self): - return period_range('20130101', periods=5, freq='D') - - def test_construction_base_constructor(self): - # GH 13664 - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='M')] - tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) - - arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] - tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) - - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='D')] - tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) - - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.Index(np.array(arr), dtype=object)) - - def test_astype(self): - # GH 13149, GH 13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - - result = idx.astype(object) - expected = Index([Period('2016-05-16', freq='D')] + - [Period(NaT, freq='D')] * 3, dtype='object') - tm.assert_index_equal(result, expected) - - result = idx.astype(int) - expected = Int64Index([16937] + [-9223372036854775808] * 3, - dtype=np.int64) - tm.assert_index_equal(result, expected) - - idx = period_range('1990', '2009', freq='A') - result = idx.astype('i8') - self.assert_index_equal(result, Index(idx.asi8)) - self.assert_numpy_array_equal(result.values, idx.asi8) - - def test_astype_raises(self): - # GH 13149, GH 13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - - self.assertRaises(ValueError, idx.astype, str) - self.assertRaises(ValueError, idx.astype, float) - 
self.assertRaises(ValueError, idx.astype, 'timedelta64') - self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') - - def test_shift(self): - - # test shift for PeriodIndex - # GH8083 - drange = self.create_index() - result = drange.shift(1) - expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', '2013-01-06'], freq='D') - self.assert_index_equal(result, expected) - - def test_pickle_compat_construction(self): - pass - - def test_get_loc(self): - idx = pd.period_range('2000-01-01', periods=3) - - for method in [None, 'pad', 'backfill', 'nearest']: - self.assertEqual(idx.get_loc(idx[1], method), 1) - self.assertEqual( - idx.get_loc(idx[1].asfreq('H', how='start'), method), 1) - self.assertEqual(idx.get_loc(idx[1].to_timestamp(), method), 1) - self.assertEqual( - idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method), 1) - self.assertEqual(idx.get_loc(str(idx[1]), method), 1) - - idx = pd.period_range('2000-01-01', periods=5)[::2] - self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', - tolerance='1 day'), 1) - self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', - tolerance=pd.Timedelta('1D')), 1) - self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', - tolerance=np.timedelta64(1, 'D')), 1) - self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', - tolerance=timedelta(1)), 1) - with tm.assertRaisesRegexp(ValueError, 'must be convertible'): - idx.get_loc('2000-01-10', method='nearest', tolerance='foo') - - msg = 'Input has different freq from PeriodIndex\\(freq=D\\)' - with tm.assertRaisesRegexp(ValueError, msg): - idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour') - with tm.assertRaises(KeyError): - idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') - - def test_where(self): - i = self.create_index() - result = i.where(notnull(i)) - expected = i - tm.assert_index_equal(result, expected) - - i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notnull(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_other(self): - - i = self.create_index() - for arr in [np.nan, pd.NaT]: - result = i.where(notnull(i), other=np.nan) - expected = i - tm.assert_index_equal(result, expected) - - i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notnull(i2), i2) - tm.assert_index_equal(result, i2) - - i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notnull(i2), i2.values) - tm.assert_index_equal(result, i2) - - def test_get_indexer(self): - idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) - - target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', - '2000-01-02T01'], freq='H') - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', - tolerance='1 hour'), - np.array([0, -1, 1], dtype=np.intp)) - - msg = 'Input has different freq from PeriodIndex\\(freq=H\\)' - with self.assertRaisesRegexp(ValueError, msg): - idx.get_indexer(target, 'nearest', tolerance='1 minute') - - 
tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', - tolerance='1 day'), - np.array([0, 1, 1], dtype=np.intp)) - - def test_repeat(self): - # GH10183 - idx = pd.period_range('2000-01-01', periods=3, freq='D') - res = idx.repeat(3) - exp = PeriodIndex(idx.values.repeat(3), freq='D') - self.assert_index_equal(res, exp) - self.assertEqual(res.freqstr, 'D') - - def test_period_index_indexer(self): - # GH4125 - idx = pd.period_range('2002-01', '2003-12', freq='M') - df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) - self.assert_frame_equal(df, df.loc[idx]) - self.assert_frame_equal(df, df.loc[list(idx)]) - self.assert_frame_equal(df, df.loc[list(idx)]) - self.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) - self.assert_frame_equal(df, df.loc[list(idx)]) - - def test_fillna_period(self): - # GH 11343 - idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], freq='H') - - exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], freq='H') - self.assert_index_equal( - idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) - - exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), 'x', - pd.Period('2011-01-01 11:00', freq='H')], dtype=object) - self.assert_index_equal(idx.fillna('x'), exp) - - exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), - pd.Period('2011-01-01', freq='D'), - pd.Period('2011-01-01 11:00', freq='H')], dtype=object) - self.assert_index_equal(idx.fillna(pd.Period('2011-01-01', freq='D')), - exp) - - def test_no_millisecond_field(self): - with self.assertRaises(AttributeError): - DatetimeIndex.millisecond - - with self.assertRaises(AttributeError): - DatetimeIndex([]).millisecond - - def test_difference_freq(self): - # GH14323: difference of Period MUST preserve frequency - # but the ability to union results must be preserved - - index = period_range("20160920", "20160925", freq="D") - - other = period_range("20160921", "20160924", freq="D") - expected = PeriodIndex(["20160920", "20160925"], freq='D') - idx_diff = index.difference(other) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) - - other = period_range("20160922", "20160925", freq="D") - idx_diff = index.difference(other) - expected = PeriodIndex(["20160920", "20160921"], freq='D') - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) - - -class TestTimedeltaIndex(DatetimeLike, tm.TestCase): - _holder = TimedeltaIndex - - def setUp(self): - self.indices = dict(index=tm.makeTimedeltaIndex(10)) - self.setup_indices() - - def create_index(self): - return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - - def test_construction_base_constructor(self): - arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] - tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.TimedeltaIndex(np.array(arr))) - - arr = [np.nan, pd.NaT, pd.Timedelta('1 days')] - tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.TimedeltaIndex(np.array(arr))) - - def test_shift(self): - # test shift for TimedeltaIndex - # err8083 - - drange = self.create_index() - result = drange.shift(1) - expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', - '3 days 01:00:00', - '4 days 01:00:00', '5 days 01:00:00'], - freq='D') - self.assert_index_equal(result, expected) - - result = drange.shift(3, freq='2D 1s') - expected = TimedeltaIndex(['6 days 
01:00:03', '7 days 01:00:03', - '8 days 01:00:03', '9 days 01:00:03', - '10 days 01:00:03'], freq='D') - self.assert_index_equal(result, expected) - - def test_astype(self): - # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) - - result = idx.astype(object) - expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3, - dtype=object) - tm.assert_index_equal(result, expected) - - result = idx.astype(int) - expected = Int64Index([100000000000000] + [-9223372036854775808] * 3, - dtype=np.int64) - tm.assert_index_equal(result, expected) - - rng = timedelta_range('1 days', periods=10) - - result = rng.astype('i8') - self.assert_index_equal(result, Index(rng.asi8)) - self.assert_numpy_array_equal(rng.asi8, result.values) - - def test_astype_timedelta64(self): - # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) - - result = idx.astype('timedelta64') - expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64') - tm.assert_index_equal(result, expected) - - result = idx.astype('timedelta64[ns]') - tm.assert_index_equal(result, idx) - self.assertFalse(result is idx) - - result = idx.astype('timedelta64[ns]', copy=False) - tm.assert_index_equal(result, idx) - self.assertTrue(result is idx) - - def test_astype_raises(self): - # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) - - self.assertRaises(ValueError, idx.astype, float) - self.assertRaises(ValueError, idx.astype, str) - self.assertRaises(ValueError, idx.astype, 'datetime64') - self.assertRaises(ValueError, idx.astype, 'datetime64[ns]') - - def test_get_loc(self): - idx = pd.to_timedelta(['0 days', '1 days', '2 days']) - - for method in [None, 'pad', 'backfill', 'nearest']: - self.assertEqual(idx.get_loc(idx[1], method), 1) - self.assertEqual(idx.get_loc(idx[1].to_pytimedelta(), method), 1) - self.assertEqual(idx.get_loc(str(idx[1]), method), 1) - - self.assertEqual( - idx.get_loc(idx[1], 'pad', tolerance=pd.Timedelta(0)), 1) - self.assertEqual( - idx.get_loc(idx[1], 'pad', tolerance=np.timedelta64(0, 's')), 1) - self.assertEqual(idx.get_loc(idx[1], 'pad', tolerance=timedelta(0)), 1) - - with tm.assertRaisesRegexp(ValueError, 'must be convertible'): - idx.get_loc(idx[1], method='nearest', tolerance='foo') - - for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: - self.assertEqual(idx.get_loc('1 day 1 hour', method), loc) - - def test_get_indexer(self): - idx = pd.to_timedelta(['0 days', '1 days', '2 days']) - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) - - target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - - res = idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 hour')) - tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) - - def test_numeric_compat(self): - - idx = self._holder(np.arange(5, dtype='int64')) - didx = self._holder(np.arange(5, dtype='int64') ** 2) - result = idx * 1 - tm.assert_index_equal(result, idx) - - result = 1 * idx - tm.assert_index_equal(result, idx) - - result = idx / 1 - tm.assert_index_equal(result, idx) - - result = idx // 1 - tm.assert_index_equal(result, idx) - - result = idx * np.array(5, dtype='int64') - 
tm.assert_index_equal(result, - self._holder(np.arange(5, dtype='int64') * 5)) - - result = idx * np.arange(5, dtype='int64') - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='int64')) - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='float64') + 0.1) - tm.assert_index_equal(result, self._holder(np.arange( - 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) - - # invalid - self.assertRaises(TypeError, lambda: idx * idx) - self.assertRaises(ValueError, lambda: idx * self._holder(np.arange(3))) - self.assertRaises(ValueError, lambda: idx * np.array([1, 2])) - - def test_pickle_compat_construction(self): - pass - - def test_ufunc_coercions(self): - # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') - - for result in [idx * 2, np.multiply(idx, 2)]: - tm.assertIsInstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['4H', '8H', '12H', '16H', '20H'], - freq='4H', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '4H') - - for result in [idx / 2, np.divide(idx, 2)]: - tm.assertIsInstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'], - freq='H', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, 'H') - - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') - for result in [-idx, np.negative(idx)]: - tm.assertIsInstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'], - freq='-2H', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '-2H') - - idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'], - freq='H', name='x') - for result in [abs(idx), np.absolute(idx)]: - tm.assertIsInstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'], - freq=None, name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, None) - - def test_fillna_timedelta(self): - # GH 11343 - idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day']) - - exp = pd.TimedeltaIndex(['1 day', '2 day', '3 day']) - self.assert_index_equal(idx.fillna(pd.Timedelta('2 day')), exp) - - exp = pd.TimedeltaIndex(['1 day', '3 hour', '3 day']) - idx.fillna(pd.Timedelta('3 hour')) - - exp = pd.Index( - [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) - self.assert_index_equal(idx.fillna('x'), exp) - - def test_difference_freq(self): - # GH14323: Difference of TimedeltaIndex should not preserve frequency - - index = timedelta_range("0 days", "5 days", freq="D") - - other = timedelta_range("1 days", "4 days", freq="D") - expected = TimedeltaIndex(["0 days", "5 days"], freq=None) - idx_diff = index.difference(other) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) diff --git a/pandas/tests/indexes/test_timedelta.py b/pandas/tests/indexes/test_timedelta.py deleted file mode 100644 index e6071b8c4fa06..0000000000000 --- a/pandas/tests/indexes/test_timedelta.py +++ /dev/null @@ -1,42 +0,0 @@ -import numpy as np -from datetime import timedelta - -import pandas as pd -import pandas.util.testing as tm -from pandas import (timedelta_range, date_range, Series, 
Timedelta, - DatetimeIndex) - - -class TestSlicing(tm.TestCase): - - def test_timedelta(self): - # this is valid too - index = date_range('1/1/2000', periods=50, freq='B') - shifted = index + timedelta(1) - back = shifted + timedelta(-1) - self.assertTrue(tm.equalContents(index, back)) - self.assertEqual(shifted.freq, index.freq) - self.assertEqual(shifted.freq, back.freq) - - result = index - timedelta(1) - expected = index + timedelta(-1) - tm.assert_index_equal(result, expected) - - # GH4134, buggy with timedeltas - rng = date_range('2013', '2014') - s = Series(rng) - result1 = rng - pd.offsets.Hour(1) - result2 = DatetimeIndex(s - np.timedelta64(100000000)) - result3 = rng - np.timedelta64(100000000) - result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - tm.assert_index_equal(result1, result4) - tm.assert_index_equal(result2, result3) - - -class TestTimeSeries(tm.TestCase): - - def test_series_box_timedelta(self): - rng = timedelta_range('1 day 1 s', periods=5, freq='h') - s = Series(rng) - tm.assertIsInstance(s[1], Timedelta) - tm.assertIsInstance(s.iat[2], Timedelta) diff --git a/pandas/tests/indexes/timedeltas/__init__.py b/pandas/tests/indexes/timedeltas/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py new file mode 100644 index 0000000000000..88e7b1387feff --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -0,0 +1,121 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import (TimedeltaIndex, timedelta_range, Int64Index, Float64Index, + Index, Timedelta, Series) + +from ..datetimelike import DatetimeLike + + +class TestTimedeltaIndex(DatetimeLike, tm.TestCase): + _holder = TimedeltaIndex + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(index=tm.makeTimedeltaIndex(10)) + self.setup_indices() + + def create_index(self): + return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + + def test_astype(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + + result = idx.astype(object) + expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3, + dtype=object) + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index([100000000000000] + [-9223372036854775808] * 3, + dtype=np.int64) + tm.assert_index_equal(result, expected) + + rng = timedelta_range('1 days', periods=10) + + result = rng.astype('i8') + self.assert_index_equal(result, Index(rng.asi8)) + self.assert_numpy_array_equal(rng.asi8, result.values) + + def test_astype_timedelta64(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + + result = idx.astype('timedelta64') + expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64') + tm.assert_index_equal(result, expected) + + result = idx.astype('timedelta64[ns]') + tm.assert_index_equal(result, idx) + self.assertFalse(result is idx) + + result = idx.astype('timedelta64[ns]', copy=False) + tm.assert_index_equal(result, idx) + self.assertTrue(result is idx) + + def test_astype_raises(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + + self.assertRaises(ValueError, idx.astype, float) + self.assertRaises(ValueError, idx.astype, str) + self.assertRaises(ValueError, idx.astype, 'datetime64') + self.assertRaises(ValueError, idx.astype, 'datetime64[ns]') + + def test_pickle_compat_construction(self): + pass + + def 
test_shift(self): + # test shift for TimedeltaIndex + # GH8083 + + drange = self.create_index() + result = drange.shift(1) + expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', + '4 days 01:00:00', '5 days 01:00:00'], + freq='D') + self.assert_index_equal(result, expected) + + result = drange.shift(3, freq='2D 1s') + expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', + '8 days 01:00:03', '9 days 01:00:03', + '10 days 01:00:03'], freq='D') + self.assert_index_equal(result, expected) + + def test_numeric_compat(self): + + idx = self._holder(np.arange(5, dtype='int64')) + didx = self._holder(np.arange(5, dtype='int64') ** 2) + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5, dtype='int64') + tm.assert_index_equal(result, + self._holder(np.arange(5, dtype='int64') * 5)) + + result = idx * np.arange(5, dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5, dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5, dtype='float64') + 0.1) + tm.assert_index_equal(result, self._holder(np.arange( + 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) + + # invalid + self.assertRaises(TypeError, lambda: idx * idx) + self.assertRaises(ValueError, lambda: idx * self._holder(np.arange(3))) + self.assertRaises(ValueError, lambda: idx * np.array([1, 2])) diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py new file mode 100644 index 0000000000000..0810b13eb0f53 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -0,0 +1,89 @@ +import numpy as np +from datetime import timedelta + +import pandas as pd +import pandas.util.testing as tm +from pandas import TimedeltaIndex, timedelta_range, tslib, to_timedelta + +iNaT = tslib.iNaT + + +class TestTimedeltaIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_construction_base_constructor(self): + arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timedelta('1 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + + def test_constructor(self): + expected = TimedeltaIndex(['1 days', '1 days 00:00:05', '2 days', + '2 days 00:00:02', '0 days 00:00:03']) + result = TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64( + 2, 'D'), timedelta(days=2, seconds=2), pd.offsets.Second(3)]) + tm.assert_index_equal(result, expected) + + # unicode + result = TimedeltaIndex([u'1 days', '1 days, 00:00:05', np.timedelta64( + 2, 'D'), timedelta(days=2, seconds=2), pd.offsets.Second(3)]) + tm.assert_index_equal(result, expected) + + expected = TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', + '0 days 00:00:02']) + tm.assert_index_equal(TimedeltaIndex(range(3), unit='s'), expected) + expected = TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:05', + '0 days 00:00:09']) + tm.assert_index_equal(TimedeltaIndex([0, 5, 9], unit='s'), expected) + expected = TimedeltaIndex( + ['0 days 00:00:00.400', '0 days 00:00:00.450', + '0 days 00:00:01.200']) + tm.assert_index_equal(TimedeltaIndex([400, 
450, 1200], unit='ms'), + expected) + + def test_constructor_coverage(self): + rng = timedelta_range('1 days', periods=10.5) + exp = timedelta_range('1 days', periods=10) + self.assert_index_equal(rng, exp) + + self.assertRaises(ValueError, TimedeltaIndex, start='1 days', + periods='foo', freq='D') + + self.assertRaises(ValueError, TimedeltaIndex, start='1 days', + end='10 days') + + self.assertRaises(ValueError, TimedeltaIndex, '1 days') + + # generator expression + gen = (timedelta(i) for i in range(10)) + result = TimedeltaIndex(gen) + expected = TimedeltaIndex([timedelta(i) for i in range(10)]) + self.assert_index_equal(result, expected) + + # NumPy string array + strings = np.array(['1 days', '2 days', '3 days']) + result = TimedeltaIndex(strings) + expected = to_timedelta([1, 2, 3], unit='d') + self.assert_index_equal(result, expected) + + from_ints = TimedeltaIndex(expected.asi8) + self.assert_index_equal(from_ints, expected) + + # non-conforming freq + self.assertRaises(ValueError, TimedeltaIndex, + ['1 days', '2 days', '4 days'], freq='D') + + self.assertRaises(ValueError, TimedeltaIndex, periods=10, freq='D') + + def test_constructor_name(self): + idx = TimedeltaIndex(start='1 days', periods=1, freq='D', name='TEST') + self.assertEqual(idx.name, 'TEST') + + # GH10025 + idx2 = TimedeltaIndex(idx, name='something else') + self.assertEqual(idx2.name, 'something else') diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py new file mode 100644 index 0000000000000..b4a8bc79921bf --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -0,0 +1,110 @@ +from datetime import timedelta + +import pandas.util.testing as tm +from pandas import TimedeltaIndex, timedelta_range, compat, Index, Timedelta + + +class TestTimedeltaIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_insert(self): + + idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx') + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') + self.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, 'inserted') + expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), + Timedelta('2day')], name='idx') + self.assertNotIsInstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + + idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') + + # preserve freq + expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02', + '1day 00:00:03'], + name='idx', freq='s') + expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', + '1day 00:00:03', '1day 00:00:04'], + name='idx', freq='s') + + # reset freq to None + expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01', + '1day 00:00:02', '1day 00:00:03'], + name='idx', freq=None) + expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', + '1day 00:00:03', '1day 00:00:05'], + name='idx', freq=None) + + cases = [(0, Timedelta('1day'), expected_0), + (-3, Timedelta('1day'), expected_0), + (3, Timedelta('1day 00:00:04'), expected_3), + (1, Timedelta('1day 00:00:01'), expected_1_nofreq), + (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] + + for n, d, expected in cases: + result = idx.insert(n, d) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + def 
test_delete(self): + idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') + + # preserve freq + expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', + name='idx') + expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', + name='idx') + + # reset freq to None + expected_1 = TimedeltaIndex( + ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx') + + cases = {0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + with tm.assertRaises((IndexError, ValueError)): + # either depending on numpy version + result = idx.delete(5) + + def test_delete_slice(self): + idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') + + # preserve freq + expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', + name='idx') + expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', + name='idx') + + # reset freq to None + expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d', + '7 d', '8 d', '9 d', '10d'], + freq=None, name='idx') + + cases = {(0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + result = idx.delete(slice(n[0], n[-1] + 1)) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py new file mode 100644 index 0000000000000..406a5bdbf3bcd --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -0,0 +1,1276 @@ +import numpy as np +from datetime import timedelta +from distutils.version import LooseVersion + +import pandas as pd +import pandas.util.testing as tm +from pandas import to_timedelta +from pandas.util.testing import assert_series_equal, assert_frame_equal +from pandas import (Series, Timedelta, DataFrame, Timestamp, TimedeltaIndex, + timedelta_range, date_range, DatetimeIndex, Int64Index, + _np_version_under1p10, Float64Index, Index, tslib) + +from pandas.tests.test_base import Ops + + +class TestTimedeltaIndexOps(Ops): + def setUp(self): + super(TestTimedeltaIndexOps, self).setUp() + mask = lambda x: isinstance(x, TimedeltaIndex) + self.is_valid_objs = [o for o in self.objs if mask(o)] + self.not_valid_objs = [] + + def test_ops_properties(self): + self.check_ops_properties(['days', 'hours', 'minutes', 'seconds', + 'milliseconds']) + self.check_ops_properties(['microseconds', 'nanoseconds']) + + def test_asobject_tolist(self): + idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') + expected_list = [Timedelta('1 days'), Timedelta('2 days'), + Timedelta('3 days'), Timedelta('4 days')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + + self.assertEqual(result.dtype, object) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT, + timedelta(days=4)], name='idx') + expected_list = [Timedelta('1 
days'), Timedelta('2 days'), pd.NaT, + Timedelta('4 days')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + def test_minmax(self): + + # monotonic + idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) + self.assertTrue(idx1.is_monotonic) + + # non-monotonic + idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) + self.assertFalse(idx2.is_monotonic) + + for idx in [idx1, idx2]: + self.assertEqual(idx.min(), Timedelta('1 days')) + self.assertEqual(idx.max(), Timedelta('3 days')) + self.assertEqual(idx.argmin(), 0) + self.assertEqual(idx.argmax(), 2) + + for op in ['min', 'max']: + # Return NaT + obj = TimedeltaIndex([]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = TimedeltaIndex([pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + def test_numpy_minmax(self): + dr = pd.date_range(start='2016-01-15', end='2016-01-20') + td = TimedeltaIndex(np.asarray(dr)) + + self.assertEqual(np.min(td), Timedelta('16815 days')) + self.assertEqual(np.max(td), Timedelta('16820 days')) + + errmsg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, errmsg, np.min, td, out=0) + tm.assertRaisesRegexp(ValueError, errmsg, np.max, td, out=0) + + self.assertEqual(np.argmin(td), 0) + self.assertEqual(np.argmax(td), 5) + + if not _np_version_under1p10: + errmsg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, errmsg, np.argmin, td, out=0) + tm.assertRaisesRegexp(ValueError, errmsg, np.argmax, td, out=0) + + def test_round(self): + td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min') + elt = td[1] + + expected_rng = TimedeltaIndex([ + Timedelta('16801 days 00:00:00'), + Timedelta('16801 days 00:00:00'), + Timedelta('16801 days 01:00:00'), + Timedelta('16801 days 02:00:00'), + Timedelta('16801 days 02:00:00'), + ]) + expected_elt = expected_rng[1] + + tm.assert_index_equal(td.round(freq='H'), expected_rng) + self.assertEqual(elt.round(freq='H'), expected_elt) + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + td.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') + + msg = " is a non-fixed frequency" + tm.assertRaisesRegexp(ValueError, msg, td.round, freq='M') + tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') + + def test_representation(self): + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" + + exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " + "freq='D')") + + exp3 = ("TimedeltaIndex(['1 days', '2 days'], " + "dtype='timedelta64[ns]', freq='D')") + + exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], " + "dtype='timedelta64[ns]', freq='D')") + + exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " + "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)") + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, 
idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) + + def test_representation_to_series(self): + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + exp1 = """Series([], dtype: timedelta64[ns])""" + + exp2 = """0 1 days +dtype: timedelta64[ns]""" + + exp3 = """0 1 days +1 2 days +dtype: timedelta64[ns]""" + + exp4 = """0 1 days +1 2 days +2 3 days +dtype: timedelta64[ns]""" + + exp5 = """0 1 days 00:00:01 +1 2 days 00:00:00 +2 3 days 00:00:00 +dtype: timedelta64[ns]""" + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + result = repr(pd.Series(idx)) + self.assertEqual(result, expected) + + def test_summary(self): + # GH9116 + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + exp1 = """TimedeltaIndex: 0 entries +Freq: D""" + + exp2 = """TimedeltaIndex: 1 entries, 1 days to 1 days +Freq: D""" + + exp3 = """TimedeltaIndex: 2 entries, 1 days to 2 days +Freq: D""" + + exp4 = """TimedeltaIndex: 3 entries, 1 days to 3 days +Freq: D""" + + exp5 = ("TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days " + "00:00:00") + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + result = idx.summary() + self.assertEqual(result, expected) + + def test_add_iadd(self): + + # only test adding/sub offsets as + is now numeric + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = timedelta_range('1 days', '10 days') + result = rng + delta + expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', + freq='D') + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + # int + rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + result = rng + 1 + expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + def test_sub_isub(self): + # only test adding/sub offsets as - is now numeric + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = timedelta_range('1 days', '10 days') + result = rng - delta + expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + # int + rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + result = rng - 1 + expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + idx = TimedeltaIndex(['1 day', '2 day']) + msg = "cannot subtract a datelike from a TimedeltaIndex" + with tm.assertRaisesRegexp(TypeError, msg): + idx - Timestamp('2011-01-01') + + result = Timestamp('2011-01-01') + idx + expected = DatetimeIndex(['2011-01-02', 
'2011-01-03']) + tm.assert_index_equal(result, expected) + + def test_ops_compat(self): + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + rng = timedelta_range('1 days', '10 days', name='foo') + + # multiply + for offset in offsets: + self.assertRaises(TypeError, lambda: rng * offset) + + # divide + expected = Int64Index((np.arange(10) + 1) * 12, name='foo') + for offset in offsets: + result = rng / offset + tm.assert_index_equal(result, expected, exact=False) + + # divide with nats + rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + expected = Float64Index([12, np.nan, 24], name='foo') + for offset in offsets: + result = rng / offset + tm.assert_index_equal(result, expected) + + # don't allow division by NaT (maybe we could in the future) + self.assertRaises(TypeError, lambda: rng / pd.NaT) + + def test_subtraction_ops(self): + + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') + td = Timedelta('1 days') + dt = Timestamp('20130101') + + self.assertRaises(TypeError, lambda: tdi - dt) + self.assertRaises(TypeError, lambda: tdi - dti) + self.assertRaises(TypeError, lambda: td - dt) + self.assertRaises(TypeError, lambda: td - dti) + + result = dt - dti + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') + tm.assert_index_equal(result, expected) + + result = dti - dt + expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') + tm.assert_index_equal(result, expected) + + result = tdi - td + expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') + tm.assert_index_equal(result, expected, check_names=False) + + result = td - tdi + expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') + tm.assert_index_equal(result, expected, check_names=False) + + result = dti - td + expected = DatetimeIndex( + ['20121231', '20130101', '20130102'], name='bar') + tm.assert_index_equal(result, expected, check_names=False) + + result = dt - tdi + expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') + tm.assert_index_equal(result, expected) + + def test_subtraction_ops_with_tz(self): + + # check that dt/dti subtraction ops with tz are validated + dti = date_range('20130101', periods=3) + ts = Timestamp('20130101') + dt = ts.to_pydatetime() + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + ts_tz = Timestamp('20130101').tz_localize('US/Eastern') + ts_tz2 = Timestamp('20130101').tz_localize('CET') + dt_tz = ts_tz.to_pydatetime() + td = Timedelta('1 days') + + def _check(result, expected): + self.assertEqual(result, expected) + self.assertIsInstance(result, Timedelta) + + # scalars + result = ts - ts + expected = Timedelta('0 days') + _check(result, expected) + + result = dt_tz - ts_tz + expected = Timedelta('0 days') + _check(result, expected) + + result = ts_tz - dt_tz + expected = Timedelta('0 days') + _check(result, expected) + + # tz mismatches + self.assertRaises(TypeError, lambda: dt_tz - ts) + self.assertRaises(TypeError, lambda: dt_tz - dt) + self.assertRaises(TypeError, lambda: dt_tz - ts_tz2) + self.assertRaises(TypeError, lambda: dt - dt_tz) + self.assertRaises(TypeError, lambda: ts - dt_tz) + self.assertRaises(TypeError, lambda: ts_tz2 - ts) + self.assertRaises(TypeError, lambda: ts_tz2 - dt) + self.assertRaises(TypeError, lambda: ts_tz - ts_tz2) + + # with dti + self.assertRaises(TypeError, lambda: dti - ts_tz) + 
self.assertRaises(TypeError, lambda: dti_tz - ts) + self.assertRaises(TypeError, lambda: dti_tz - ts_tz2) + + result = dti_tz - dt_tz + expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + tm.assert_index_equal(result, expected) + + result = dt_tz - dti_tz + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + tm.assert_index_equal(result, expected) + + result = dti_tz - ts_tz + expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + tm.assert_index_equal(result, expected) + + result = ts_tz - dti_tz + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + tm.assert_index_equal(result, expected) + + result = td - td + expected = Timedelta('0 days') + _check(result, expected) + + result = dti_tz - td + expected = DatetimeIndex( + ['20121231', '20130101', '20130102'], tz='US/Eastern') + tm.assert_index_equal(result, expected) + + def test_dti_tdi_numeric_ops(self): + + # These are normally union/diff set-like ops + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') + + # TODO(wesm): unused? + # td = Timedelta('1 days') + # dt = Timestamp('20130101') + + result = tdi - tdi + expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') + tm.assert_index_equal(result, expected) + + result = tdi + tdi + expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') + tm.assert_index_equal(result, expected) + + result = dti - tdi # name will be reset + expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) + tm.assert_index_equal(result, expected) + + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'H']: + idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + + with tm.assertRaises(TypeError): + idx - p + + with tm.assertRaises(TypeError): + p - idx + + def test_addition_ops(self): + + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') + td = Timedelta('1 days') + dt = Timestamp('20130101') + + result = tdi + dt + expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + tm.assert_index_equal(result, expected) + + result = dt + tdi + expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + tm.assert_index_equal(result, expected) + + result = td + tdi + expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + tm.assert_index_equal(result, expected) + + result = tdi + td + expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + tm.assert_index_equal(result, expected) + + # unequal length + self.assertRaises(ValueError, lambda: tdi + dti[0:1]) + self.assertRaises(ValueError, lambda: tdi[0:1] + dti) + + # random indexes + self.assertRaises(TypeError, lambda: tdi + Int64Index([1, 2, 3])) + + # this is a union! 
+ # self.assertRaises(TypeError, lambda : Int64Index([1,2,3]) + tdi) + + result = tdi + dti # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) + + result = dti + tdi # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) + + result = dt + td + expected = Timestamp('20130102') + self.assertEqual(result, expected) + + result = td + dt + expected = Timestamp('20130102') + self.assertEqual(result, expected) + + def test_comp_nat(self): + left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')]) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + + def test_value_counts_unique(self): + # GH 7735 + + idx = timedelta_range('1 days 09:00:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) + + exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) + tm.assert_index_equal(idx.unique(), expected) + + idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', + '1 days 09:00:00', '1 days 08:00:00', + '1 days 08:00:00', pd.NaT]) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) + expected = Series([3, 2], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', + pd.NaT]) + expected = Series([3, 2, 1], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + def test_nonunique_contains(self): + # GH 9512 + for idx in map(TimedeltaIndex, ([0, 1, 0], [0, 0, -1], [0, -1, -1], + ['00:01:00', '00:01:00', '00:02:00'], + ['00:01:00', '00:01:00', '00:00:01'])): + tm.assertIn(idx[0], idx) + + def test_unknown_attribute(self): + # GH 9680 + tdi = pd.timedelta_range(start=0, periods=10, freq='1s') + ts = pd.Series(np.random.normal(size=10), index=tdi) + self.assertNotIn('foo', ts.__dict__.keys()) + self.assertRaises(AttributeError, lambda: ts.foo) + + def test_order(self): + # GH 10295 + idx1 = TimedeltaIndex(['1 day', '2 day', '3 day'], freq='D', + name='idx') + idx2 = TimedeltaIndex( + ['1 hour', '2 hour', '3 hour'], freq='H', name='idx') + + for idx in [idx1, idx2]: + ordered = idx.sort_values() + self.assert_index_equal(ordered, idx) + self.assertEqual(ordered.freq, idx.freq) + + ordered = 
idx.sort_values(ascending=False) + expected = idx[::-1] + self.assert_index_equal(ordered, expected) + self.assertEqual(ordered.freq, expected.freq) + self.assertEqual(ordered.freq.n, -1) + + ordered, indexer = idx.sort_values(return_indexer=True) + self.assert_index_equal(ordered, idx) + self.assert_numpy_array_equal(indexer, + np.array([0, 1, 2]), + check_dtype=False) + self.assertEqual(ordered.freq, idx.freq) + + ordered, indexer = idx.sort_values(return_indexer=True, + ascending=False) + self.assert_index_equal(ordered, idx[::-1]) + self.assertEqual(ordered.freq, expected.freq) + self.assertEqual(ordered.freq.n, -1) + + idx1 = TimedeltaIndex(['1 hour', '3 hour', '5 hour', + '2 hour ', '1 hour'], name='idx1') + exp1 = TimedeltaIndex(['1 hour', '1 hour', '2 hour', + '3 hour', '5 hour'], name='idx1') + + idx2 = TimedeltaIndex(['1 day', '3 day', '5 day', + '2 day', '1 day'], name='idx2') + + # TODO(wesm): unused? + # exp2 = TimedeltaIndex(['1 day', '1 day', '2 day', + # '3 day', '5 day'], name='idx2') + + # idx3 = TimedeltaIndex([pd.NaT, '3 minute', '5 minute', + # '2 minute', pd.NaT], name='idx3') + # exp3 = TimedeltaIndex([pd.NaT, pd.NaT, '2 minute', '3 minute', + # '5 minute'], name='idx3') + + for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: + ordered = idx.sort_values() + self.assert_index_equal(ordered, expected) + self.assertIsNone(ordered.freq) + + ordered = idx.sort_values(ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True) + self.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True, + ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) + + def test_getitem(self): + idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + + for idx in [idx1]: + result = idx[0] + self.assertEqual(result, pd.Timedelta('1 day')) + + result = idx[0:5] + expected = pd.timedelta_range('1 day', '5 day', freq='D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx[0:10:2] + expected = pd.timedelta_range('1 day', '9 day', freq='2D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx[-20:-5:3] + expected = pd.timedelta_range('12 day', '24 day', freq='3D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx[4::-1] + expected = TimedeltaIndex(['5 day', '4 day', '3 day', + '2 day', '1 day'], + freq='-1D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + def test_drop_duplicates_metadata(self): + # GH 10115 + idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + result = idx.drop_duplicates() + self.assert_index_equal(idx, result) + self.assertEqual(idx.freq, result.freq) + + idx_dup = idx.append(idx) + self.assertIsNone(idx_dup.freq) # freq is reset + result = idx_dup.drop_duplicates() + self.assert_index_equal(idx, result) + self.assertIsNone(result.freq) + + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.timedelta_range('1 day', '31 day', 
freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + + def test_take(self): + # GH 10295 + idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + + for idx in [idx1]: + result = idx.take([0]) + self.assertEqual(result, pd.Timedelta('1 day')) + + result = idx.take([-1]) + self.assertEqual(result, pd.Timedelta('31 day')) + + result = idx.take([0, 1, 2]) + expected = pd.timedelta_range('1 day', '3 day', freq='D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx.take([0, 2, 4]) + expected = pd.timedelta_range('1 day', '5 day', freq='2D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx.take([7, 4, 1]) + expected = pd.timedelta_range('8 day', '2 day', freq='-3D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + + result = idx.take([3, 2, 5]) + expected = TimedeltaIndex(['4 day', '3 day', '6 day'], name='idx') + self.assert_index_equal(result, expected) + self.assertIsNone(result.freq) + + result = idx.take([-3, 2, 5]) + expected = TimedeltaIndex(['29 day', '3 day', '6 day'], name='idx') + self.assert_index_equal(result, expected) + self.assertIsNone(result.freq) + + def test_take_invalid_kwargs(self): + idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + indices = [1, 6, 5, 9, 10, 13, 15, 3] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + tm.assertRaisesRegexp(TypeError, msg, idx.take, + indices, foo=2) + + msg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, idx.take, + indices, out=indices) + + msg = "the 'mode' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, idx.take, + indices, mode='clip') + + def test_infer_freq(self): + # GH 11018 + for freq in ['D', '3D', '-3D', 'H', '2H', '-2H', 'T', '2T', 'S', '-3S' + ]: + idx = pd.timedelta_range('1', freq=freq, periods=10) + result = pd.TimedeltaIndex(idx.asi8, freq='infer') + tm.assert_index_equal(idx, result) + self.assertEqual(result.freq, freq) + + def test_nat_new(self): + + idx = pd.timedelta_range('1', freq='D', periods=5, name='x') + result = idx._nat_new() + exp = pd.TimedeltaIndex([pd.NaT] * 5, name='x') + tm.assert_index_equal(result, exp) + + result = idx._nat_new(box=False) + exp = np.array([tslib.iNaT] * 5, dtype=np.int64) + tm.assert_numpy_array_equal(result, exp) + + def test_shift(self): + # GH 9903 + idx = pd.TimedeltaIndex([], name='xxx') + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + tm.assert_index_equal(idx.shift(3, freq='H'), idx) + + idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx') + tm.assert_index_equal(idx.shift(3, freq='H'), exp) + exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx') 
+ tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + + tm.assert_index_equal(idx.shift(0, freq='T'), idx) + exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '9:03:00'], + name='xxx') + tm.assert_index_equal(idx.shift(3, freq='T'), exp) + exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '8:57:00'], + name='xxx') + tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + + def test_repeat(self): + index = pd.timedelta_range('1 days', periods=2, freq='D') + exp = pd.TimedeltaIndex(['1 days', '1 days', '2 days', '2 days']) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = TimedeltaIndex(['1 days', 'NaT', '3 days']) + exp = TimedeltaIndex(['1 days', '1 days', '1 days', + 'NaT', 'NaT', 'NaT', + '3 days', '3 days', '3 days']) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + def test_nat(self): + self.assertIs(pd.TimedeltaIndex._na_value, pd.NaT) + self.assertIs(pd.TimedeltaIndex([])._na_value, pd.NaT) + + idx = pd.TimedeltaIndex(['1 days', '2 days']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.intp)) + + idx = pd.TimedeltaIndex(['1 days', 'NaT']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.intp)) + + def test_equals(self): + # GH 13107 + idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.asobject.equals(idx2.asobject)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + + +class TestTimedeltas(tm.TestCase): + _multiprocess_can_split_ = True + + def test_ops(self): + + td = Timedelta(10, unit='d') + self.assertEqual(-td, Timedelta(-10, unit='d')) + self.assertEqual(+td, Timedelta(10, unit='d')) + self.assertEqual(td - td, Timedelta(0, unit='ns')) + self.assertTrue((td - pd.NaT) is pd.NaT) + self.assertEqual(td + td, Timedelta(20, unit='d')) + self.assertTrue((td + pd.NaT) is pd.NaT) + self.assertEqual(td * 2, Timedelta(20, unit='d')) + self.assertTrue((td * pd.NaT) is pd.NaT) + self.assertEqual(td / 2, Timedelta(5, unit='d')) + self.assertEqual(abs(td), td) + self.assertEqual(abs(-td), td) + self.assertEqual(td / td, 1) + self.assertTrue((td / pd.NaT) is np.nan) + + # invert + self.assertEqual(-td, Timedelta('-10d')) + self.assertEqual(td * -1, Timedelta('-10d')) + self.assertEqual(-1 * td, Timedelta('-10d')) + self.assertEqual(abs(-td), Timedelta('10d')) + + # invalid + self.assertRaises(TypeError, lambda: Timedelta(11, unit='d') // 2) + + # invalid multiply with another timedelta + self.assertRaises(TypeError, lambda: td * td) + + # can't operate with integers + self.assertRaises(TypeError, lambda: td + 2) + self.assertRaises(TypeError, 
lambda: td - 2) + + def test_ops_offsets(self): + td = Timedelta(10, unit='d') + self.assertEqual(Timedelta(241, unit='h'), td + pd.offsets.Hour(1)) + self.assertEqual(Timedelta(241, unit='h'), pd.offsets.Hour(1) + td) + self.assertEqual(240, td / pd.offsets.Hour(1)) + self.assertEqual(1 / 240.0, pd.offsets.Hour(1) / td) + self.assertEqual(Timedelta(239, unit='h'), td - pd.offsets.Hour(1)) + self.assertEqual(Timedelta(-239, unit='h'), pd.offsets.Hour(1) - td) + + def test_ops_ndarray(self): + td = Timedelta('1 day') + + # timedelta, timedelta + other = pd.to_timedelta(['1 day']).values + expected = pd.to_timedelta(['2 days']).values + self.assert_numpy_array_equal(td + other, expected) + if LooseVersion(np.__version__) >= '1.8': + self.assert_numpy_array_equal(other + td, expected) + self.assertRaises(TypeError, lambda: td + np.array([1])) + self.assertRaises(TypeError, lambda: np.array([1]) + td) + + expected = pd.to_timedelta(['0 days']).values + self.assert_numpy_array_equal(td - other, expected) + if LooseVersion(np.__version__) >= '1.8': + self.assert_numpy_array_equal(-other + td, expected) + self.assertRaises(TypeError, lambda: td - np.array([1])) + self.assertRaises(TypeError, lambda: np.array([1]) - td) + + expected = pd.to_timedelta(['2 days']).values + self.assert_numpy_array_equal(td * np.array([2]), expected) + self.assert_numpy_array_equal(np.array([2]) * td, expected) + self.assertRaises(TypeError, lambda: td * other) + self.assertRaises(TypeError, lambda: other * td) + + self.assert_numpy_array_equal(td / other, + np.array([1], dtype=np.float64)) + if LooseVersion(np.__version__) >= '1.8': + self.assert_numpy_array_equal(other / td, + np.array([1], dtype=np.float64)) + + # timedelta, datetime + other = pd.to_datetime(['2000-01-01']).values + expected = pd.to_datetime(['2000-01-02']).values + self.assert_numpy_array_equal(td + other, expected) + if LooseVersion(np.__version__) >= '1.8': + self.assert_numpy_array_equal(other + td, expected) + + expected = pd.to_datetime(['1999-12-31']).values + self.assert_numpy_array_equal(-td + other, expected) + if LooseVersion(np.__version__) >= '1.8': + self.assert_numpy_array_equal(other - td, expected) + + def test_ops_series(self): + # regression test for GH8813 + td = Timedelta('1 day') + other = pd.Series([1, 2]) + expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) + tm.assert_series_equal(expected, td * other) + tm.assert_series_equal(expected, other * td) + + def test_ops_series_object(self): + # GH 13043 + s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), + pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], + name='xxx') + self.assertEqual(s.dtype, object) + + exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), + pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], + name='xxx') + tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) + tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) + + # object series & object series + s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), + pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], + name='xxx') + self.assertEqual(s2.dtype, object) + exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')], + name='xxx') + tm.assert_series_equal(s2 - s, exp) + tm.assert_series_equal(s - s2, -exp) + + s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], + name='xxx', dtype=object) + self.assertEqual(s.dtype, object) + + exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], + name='xxx') + tm.assert_series_equal(s + 
pd.Timedelta('00:30:00'), exp) + tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp) + + def test_ops_notimplemented(self): + class Other: + pass + + other = Other() + + td = Timedelta('1 day') + self.assertTrue(td.__add__(other) is NotImplemented) + self.assertTrue(td.__sub__(other) is NotImplemented) + self.assertTrue(td.__truediv__(other) is NotImplemented) + self.assertTrue(td.__mul__(other) is NotImplemented) + self.assertTrue(td.__floordiv__(td) is NotImplemented) + + def test_ops_error_str(self): + # GH 13624 + tdi = TimedeltaIndex(['1 day', '2 days']) + + for l, r in [(tdi, 'a'), ('a', tdi)]: + with tm.assertRaises(TypeError): + l + r + + with tm.assertRaises(TypeError): + l > r + + with tm.assertRaises(TypeError): + l == r + + with tm.assertRaises(TypeError): + l != r + + def test_timedelta_ops(self): + # GH4984 + # make sure ops return Timedelta + s = Series([Timestamp('20130101') + timedelta(seconds=i * i) + for i in range(10)]) + td = s.diff() + + result = td.mean() + expected = to_timedelta(timedelta(seconds=9)) + self.assertEqual(result, expected) + + result = td.to_frame().mean() + self.assertEqual(result[0], expected) + + result = td.quantile(.1) + expected = Timedelta(np.timedelta64(2600, 'ms')) + self.assertEqual(result, expected) + + result = td.median() + expected = to_timedelta('00:00:09') + self.assertEqual(result, expected) + + result = td.to_frame().median() + self.assertEqual(result[0], expected) + + # GH 6462 + # consistency in returned values for sum + result = td.sum() + expected = to_timedelta('00:01:21') + self.assertEqual(result, expected) + + result = td.to_frame().sum() + self.assertEqual(result[0], expected) + + # std + result = td.std() + expected = to_timedelta(Series(td.dropna().values).std()) + self.assertEqual(result, expected) + + result = td.to_frame().std() + self.assertEqual(result[0], expected) + + # invalid ops + for op in ['skew', 'kurt', 'sem', 'prod']: + self.assertRaises(TypeError, getattr(td, op)) + + # GH 10040 + # make sure NaT is properly handled by median() + s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')]) + self.assertEqual(s.diff().median(), timedelta(days=4)) + + s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), + Timestamp('2015-02-15')]) + self.assertEqual(s.diff().median(), timedelta(days=6)) + + def test_timedelta_ops_scalar(self): + # GH 6808 + base = pd.to_datetime('20130101 09:01:12.123456') + expected_add = pd.to_datetime('20130101 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10), + np.timedelta64(10, 's'), + np.timedelta64(10000000000, 'ns'), + pd.offsets.Second(10)]: + result = base + offset + self.assertEqual(result, expected_add) + + result = base - offset + self.assertEqual(result, expected_sub) + + base = pd.to_datetime('20130102 09:01:12.123456') + expected_add = pd.to_datetime('20130103 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta('1 day, 00:00:10'), + pd.to_timedelta('1 days, 00:00:10'), + timedelta(days=1, seconds=10), + np.timedelta64(1, 'D') + np.timedelta64(10, 's'), + pd.offsets.Day() + pd.offsets.Second(10)]: + result = base + offset + self.assertEqual(result, expected_add) + + result = base - offset + self.assertEqual(result, expected_sub) + + def test_timedelta_ops_with_missing_values(self): + # setup + s1 = pd.to_timedelta(Series(['00:00:01'])) + s2 = pd.to_timedelta(Series(['00:00:02'])) + sn = 
pd.to_timedelta(Series([pd.NaT]))
+        df1 = DataFrame(['00:00:01']).apply(pd.to_timedelta)
+        df2 = DataFrame(['00:00:02']).apply(pd.to_timedelta)
+        dfn = DataFrame([pd.NaT]).apply(pd.to_timedelta)
+        scalar1 = pd.to_timedelta('00:00:01')
+        scalar2 = pd.to_timedelta('00:00:02')
+        timedelta_NaT = pd.to_timedelta('NaT')
+        NA = np.nan
+
+        actual = scalar1 + scalar1
+        self.assertEqual(actual, scalar2)
+        actual = scalar2 - scalar1
+        self.assertEqual(actual, scalar1)
+
+        actual = s1 + s1
+        assert_series_equal(actual, s2)
+        actual = s2 - s1
+        assert_series_equal(actual, s1)
+
+        actual = s1 + scalar1
+        assert_series_equal(actual, s2)
+        actual = scalar1 + s1
+        assert_series_equal(actual, s2)
+        actual = s2 - scalar1
+        assert_series_equal(actual, s1)
+        actual = -scalar1 + s2
+        assert_series_equal(actual, s1)
+
+        actual = s1 + timedelta_NaT
+        assert_series_equal(actual, sn)
+        actual = timedelta_NaT + s1
+        assert_series_equal(actual, sn)
+        actual = s1 - timedelta_NaT
+        assert_series_equal(actual, sn)
+        actual = -timedelta_NaT + s1
+        assert_series_equal(actual, sn)
+
+        actual = s1 + NA
+        assert_series_equal(actual, sn)
+        actual = NA + s1
+        assert_series_equal(actual, sn)
+        actual = s1 - NA
+        assert_series_equal(actual, sn)
+        actual = -NA + s1
+        assert_series_equal(actual, sn)
+
+        actual = s1 + pd.NaT
+        assert_series_equal(actual, sn)
+        actual = s2 - pd.NaT
+        assert_series_equal(actual, sn)
+
+        actual = s1 + df1
+        assert_frame_equal(actual, df2)
+        actual = s2 - df1
+        assert_frame_equal(actual, df1)
+        actual = df1 + s1
+        assert_frame_equal(actual, df2)
+        actual = df2 - s1
+        assert_frame_equal(actual, df1)
+
+        actual = df1 + df1
+        assert_frame_equal(actual, df2)
+        actual = df2 - df1
+        assert_frame_equal(actual, df1)
+
+        actual = df1 + scalar1
+        assert_frame_equal(actual, df2)
+        actual = df2 - scalar1
+        assert_frame_equal(actual, df1)
+
+        actual = df1 + timedelta_NaT
+        assert_frame_equal(actual, dfn)
+        actual = df1 - timedelta_NaT
+        assert_frame_equal(actual, dfn)
+
+        actual = df1 + NA
+        assert_frame_equal(actual, dfn)
+        actual = df1 - NA
+        assert_frame_equal(actual, dfn)
+
+        actual = df1 + pd.NaT  # NaT is datetime, not timedelta
+        assert_frame_equal(actual, dfn)
+        actual = df1 - pd.NaT
+        assert_frame_equal(actual, dfn)
+
+    def test_compare_timedelta_series(self):
+        # regression test for GH5963
+        s = pd.Series([timedelta(days=1), timedelta(days=2)])
+        actual = s > timedelta(days=1)
+        expected = pd.Series([False, True])
+        tm.assert_series_equal(actual, expected)
+
+    def test_compare_timedelta_ndarray(self):
+        # GH11835
+        periods = [Timedelta('0 days 01:00:00'), Timedelta('0 days 01:00:00')]
+        arr = np.array(periods)
+        result = arr[0] > arr
+        expected = np.array([False, False])
+        self.assert_numpy_array_equal(result, expected)
+
+
+class TestSlicing(tm.TestCase):
+
+    def test_tdi_ops_attributes(self):
+        rng = timedelta_range('2 days', periods=5, freq='2D', name='x')
+
+        result = rng + 1
+        exp = timedelta_range('4 days', periods=5, freq='2D', name='x')
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.freq, '2D')
+
+        result = rng - 2
+        exp = timedelta_range('-2 days', periods=5, freq='2D', name='x')
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.freq, '2D')
+
+        result = rng * 2
+        exp = timedelta_range('4 days', periods=5, freq='4D', name='x')
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.freq, '4D')
+
+        result = rng / 2
+        exp = timedelta_range('1 days', periods=5, freq='D', name='x')
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.freq, 'D')
+
+        result
= -rng + exp = timedelta_range('-2 days', periods=5, freq='-2D', name='x') + tm.assert_index_equal(result, exp) + self.assertEqual(result.freq, '-2D') + + rng = pd.timedelta_range('-2 days', periods=5, freq='D', name='x') + + result = abs(rng) + exp = TimedeltaIndex(['2 days', '1 days', '0 days', '1 days', + '2 days'], name='x') + tm.assert_index_equal(result, exp) + self.assertEqual(result.freq, None) + + def test_add_overflow(self): + # see gh-14068 + msg = "too (big|large) to convert" + with tm.assertRaisesRegexp(OverflowError, msg): + to_timedelta(106580, 'D') + Timestamp('2000') + with tm.assertRaisesRegexp(OverflowError, msg): + Timestamp('2000') + to_timedelta(106580, 'D') + + _NaT = int(pd.NaT) + 1 + msg = "Overflow in int64 addition" + with tm.assertRaisesRegexp(OverflowError, msg): + to_timedelta([106580], 'D') + Timestamp('2000') + with tm.assertRaisesRegexp(OverflowError, msg): + Timestamp('2000') + to_timedelta([106580], 'D') + with tm.assertRaisesRegexp(OverflowError, msg): + to_timedelta([_NaT]) - Timedelta('1 days') + with tm.assertRaisesRegexp(OverflowError, msg): + to_timedelta(['5 days', _NaT]) - Timedelta('1 days') + with tm.assertRaisesRegexp(OverflowError, msg): + (to_timedelta([_NaT, '5 days', '1 hours']) - + to_timedelta(['7 seconds', _NaT, '4 hours'])) + + # These should not overflow! + exp = TimedeltaIndex([pd.NaT]) + result = to_timedelta([pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(['4 days', pd.NaT]) + result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) + result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + + to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + tm.assert_index_equal(result, exp) diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py new file mode 100644 index 0000000000000..0d46ee4172211 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -0,0 +1,81 @@ +import numpy as np +import pandas.util.testing as tm + +import pandas as pd +from pandas import Series, timedelta_range, Timedelta +from pandas.util.testing import assert_series_equal + + +class TestSlicing(tm.TestCase): + + def test_partial_slice(self): + rng = timedelta_range('1 day 10:11:12', freq='h', periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['5 day':'6 day'] + expected = s.iloc[86:134] + assert_series_equal(result, expected) + + result = s['5 day':] + expected = s.iloc[86:] + assert_series_equal(result, expected) + + result = s[:'6 day'] + expected = s.iloc[:134] + assert_series_equal(result, expected) + + result = s['6 days, 23:11:12'] + self.assertEqual(result, s.iloc[133]) + + self.assertRaises(KeyError, s.__getitem__, '50 days') + + def test_partial_slice_high_reso(self): + + # higher reso + rng = timedelta_range('1 day 10:11:12', freq='us', periods=2000) + s = Series(np.arange(len(rng)), index=rng) + + result = s['1 day 10:11:12':] + expected = s.iloc[0:] + assert_series_equal(result, expected) + + result = s['1 day 10:11:12.001':] + expected = s.iloc[1000:] + assert_series_equal(result, expected) + + result = s['1 days, 10:11:12.001001'] + self.assertEqual(result, s.iloc[1001]) + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(ts[l_slc], 
ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1]) + assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1]) + + assert_slices_equivalent(SLC[:Timedelta(hours=7):-1], SLC[:6:-1]) + assert_slices_equivalent(SLC[:'7 hours':-1], SLC[:6:-1]) + + assert_slices_equivalent(SLC['15 hours':'7 hours':-1], SLC[15:6:-1]) + assert_slices_equivalent(SLC[Timedelta(hours=15):Timedelta(hours=7):- + 1], SLC[15:6:-1]) + assert_slices_equivalent(SLC['15 hours':Timedelta(hours=7):-1], + SLC[15:6:-1]) + assert_slices_equivalent(SLC[Timedelta(hours=15):'7 hours':-1], + SLC[15:6:-1]) + + assert_slices_equivalent(SLC['7 hours':'15 hours':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py new file mode 100644 index 0000000000000..9000fb3beb279 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -0,0 +1,76 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import TimedeltaIndex, timedelta_range, Int64Index + + +class TestTimedeltaIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_union(self): + + i1 = timedelta_range('1day', periods=5) + i2 = timedelta_range('3day', periods=5) + result = i1.union(i2) + expected = timedelta_range('1day', periods=7) + self.assert_index_equal(result, expected) + + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = TimedeltaIndex(start='1 day', periods=10, freq='D') + i1.union(i2) # Works + i2.union(i1) # Fails with "AttributeError: can't set attribute" + + def test_union_coverage(self): + + idx = TimedeltaIndex(['3d', '1d', '2d']) + ordered = TimedeltaIndex(idx.sort_values(), freq='infer') + result = ordered.union(idx) + self.assert_index_equal(result, ordered) + + result = ordered[:0].union(ordered) + self.assert_index_equal(result, ordered) + self.assertEqual(result.freq, ordered.freq) + + def test_union_bug_1730(self): + + rng_a = timedelta_range('1 day', periods=4, freq='3H') + rng_b = timedelta_range('1 day', periods=4, freq='4H') + + result = rng_a.union(rng_b) + exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + self.assert_index_equal(result, exp) + + def test_union_bug_1745(self): + + left = TimedeltaIndex(['1 day 15:19:49.695000']) + right = TimedeltaIndex(['2 day 13:04:21.322000', + '1 day 15:27:24.873000', + '1 day 15:31:05.350000']) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + self.assert_index_equal(result, exp) + + def test_union_bug_4564(self): + + left = timedelta_range("1 day", "30d") + right = left + pd.offsets.Minute(15) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + self.assert_index_equal(result, exp) + + def test_intersection_bug_1708(self): + index_1 = timedelta_range('1 day', periods=4, freq='h') + index_2 = index_1 + pd.offsets.Hour(5) + + result = index_1 & index_2 + self.assertEqual(len(result), 0) + + index_1 = timedelta_range('1 day', periods=4, freq='h') + index_2 = index_1 + 
pd.offsets.Hour(1) + + result = index_1 & index_2 + expected = timedelta_range('1 day 01:00:00', periods=3, freq='h') + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py new file mode 100644 index 0000000000000..4c8571e4f08f9 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -0,0 +1,592 @@ +import numpy as np +from datetime import timedelta + +import pandas as pd +import pandas.util.testing as tm +from pandas import (timedelta_range, date_range, Series, Timedelta, + DatetimeIndex, TimedeltaIndex, Index, DataFrame, + Int64Index, _np_version_under1p8) +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_index_equal) + +from ..datetimelike import DatetimeLike + +randn = np.random.randn + + +class TestTimedeltaIndex(DatetimeLike, tm.TestCase): + _holder = TimedeltaIndex + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(index=tm.makeTimedeltaIndex(10)) + self.setup_indices() + + def create_index(self): + return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + + def test_shift(self): + # test shift for TimedeltaIndex + # err8083 + + drange = self.create_index() + result = drange.shift(1) + expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', + '4 days 01:00:00', '5 days 01:00:00'], + freq='D') + self.assert_index_equal(result, expected) + + result = drange.shift(3, freq='2D 1s') + expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', + '8 days 01:00:03', '9 days 01:00:03', + '10 days 01:00:03'], freq='D') + self.assert_index_equal(result, expected) + + def test_get_loc(self): + idx = pd.to_timedelta(['0 days', '1 days', '2 days']) + + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(idx[1], method), 1) + self.assertEqual(idx.get_loc(idx[1].to_pytimedelta(), method), 1) + self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + + self.assertEqual( + idx.get_loc(idx[1], 'pad', tolerance=pd.Timedelta(0)), 1) + self.assertEqual( + idx.get_loc(idx[1], 'pad', tolerance=np.timedelta64(0, 's')), 1) + self.assertEqual(idx.get_loc(idx[1], 'pad', tolerance=timedelta(0)), 1) + + with tm.assertRaisesRegexp(ValueError, 'must be convertible'): + idx.get_loc(idx[1], method='nearest', tolerance='foo') + + for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + self.assertEqual(idx.get_loc('1 day 1 hour', method), loc) + + def test_get_loc_nat(self): + tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) + + self.assertEqual(tidx.get_loc(pd.NaT), 1) + self.assertEqual(tidx.get_loc(None), 1) + self.assertEqual(tidx.get_loc(float('nan')), 1) + self.assertEqual(tidx.get_loc(np.nan), 1) + + def test_get_indexer(self): + idx = pd.to_timedelta(['0 days', '1 days', '2 days']) + tm.assert_numpy_array_equal(idx.get_indexer(idx), + np.array([0, 1, 2], dtype=np.intp)) + + target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), + np.array([-1, 0, 1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), + np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), + np.array([0, 1, 1], dtype=np.intp)) + + res = idx.get_indexer(target, 'nearest', + tolerance=pd.Timedelta('1 hour')) + tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) + + def test_numeric_compat(self): + 
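+        # Numeric ops on a TimedeltaIndex scale the underlying
+        # timedelta64[ns] values, so multiplying or dividing by 1 must
+        # round-trip unchanged; multiplying two timedelta-like indexes
+        # has no defined meaning and raises TypeError, and mismatched
+        # lengths raise ValueError, as asserted below.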
+        idx = self._holder(np.arange(5, dtype='int64'))
+        didx = self._holder(np.arange(5, dtype='int64') ** 2)
+        result = idx * 1
+        tm.assert_index_equal(result, idx)
+
+        result = 1 * idx
+        tm.assert_index_equal(result, idx)
+
+        result = idx / 1
+        tm.assert_index_equal(result, idx)
+
+        result = idx // 1
+        tm.assert_index_equal(result, idx)
+
+        result = idx * np.array(5, dtype='int64')
+        tm.assert_index_equal(result,
+                              self._holder(np.arange(5, dtype='int64') * 5))
+
+        result = idx * np.arange(5, dtype='int64')
+        tm.assert_index_equal(result, didx)
+
+        result = idx * Series(np.arange(5, dtype='int64'))
+        tm.assert_index_equal(result, didx)
+
+        result = idx * Series(np.arange(5, dtype='float64') + 0.1)
+        tm.assert_index_equal(result, self._holder(np.arange(
+            5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1)))
+
+        # invalid
+        self.assertRaises(TypeError, lambda: idx * idx)
+        self.assertRaises(ValueError,
+                          lambda: idx * self._holder(np.arange(3)))
+        self.assertRaises(ValueError, lambda: idx * np.array([1, 2]))
+
+    def test_pickle_compat_construction(self):
+        pass
+
+    def test_ufunc_coercions(self):
+        # normal ops are also tested in tseries/test_timedeltas.py
+        idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'],
+                             freq='2H', name='x')
+
+        for result in [idx * 2, np.multiply(idx, 2)]:
+            tm.assertIsInstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(['4H', '8H', '12H', '16H', '20H'],
+                                 freq='4H', name='x')
+            tm.assert_index_equal(result, exp)
+            self.assertEqual(result.freq, '4H')
+
+        for result in [idx / 2, np.divide(idx, 2)]:
+            tm.assertIsInstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'],
+                                 freq='H', name='x')
+            tm.assert_index_equal(result, exp)
+            self.assertEqual(result.freq, 'H')
+
+        idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'],
+                             freq='2H', name='x')
+        for result in [-idx, np.negative(idx)]:
+            tm.assertIsInstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'],
+                                 freq='-2H', name='x')
+            tm.assert_index_equal(result, exp)
+            self.assertEqual(result.freq, '-2H')
+
+        idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'],
+                             freq='H', name='x')
+        for result in [abs(idx), np.absolute(idx)]:
+            tm.assertIsInstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'],
+                                 freq=None, name='x')
+            tm.assert_index_equal(result, exp)
+            self.assertEqual(result.freq, None)
+
+    def test_fillna_timedelta(self):
+        # GH 11343
+        idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day'])
+
+        exp = pd.TimedeltaIndex(['1 day', '2 day', '3 day'])
+        self.assert_index_equal(idx.fillna(pd.Timedelta('2 day')), exp)
+
+        exp = pd.TimedeltaIndex(['1 day', '3 hour', '3 day'])
+        self.assert_index_equal(idx.fillna(pd.Timedelta('3 hour')), exp)
+
+        exp = pd.Index(
+            [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object)
+        self.assert_index_equal(idx.fillna('x'), exp)
+
+    def test_difference_freq(self):
+        # GH14323: Difference of TimedeltaIndex should not preserve frequency
+
+        index = timedelta_range("0 days", "5 days", freq="D")
+
+        other = timedelta_range("1 days", "4 days", freq="D")
+        expected = TimedeltaIndex(["0 days", "5 days"], freq=None)
+        idx_diff = index.difference(other)
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal('freq', idx_diff, expected)
+
+        other = timedelta_range("2 days", "5 days", freq="D")
+        idx_diff = index.difference(other)
+        expected = TimedeltaIndex(["0 days", "1 days"], freq=None)
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal('freq', idx_diff, expected)
+
+    def
test_take(self): + + tds = ['1day 02:00:00', '1 day 04:00:00', '1 day 10:00:00'] + idx = TimedeltaIndex(start='1d', end='2d', freq='H', name='idx') + expected = TimedeltaIndex(tds, freq=None, name='idx') + + taken1 = idx.take([2, 4, 10]) + taken2 = idx[[2, 4, 10]] + + for taken in [taken1, taken2]: + self.assert_index_equal(taken, expected) + tm.assertIsInstance(taken, TimedeltaIndex) + self.assertIsNone(taken.freq) + self.assertEqual(taken.name, expected.name) + + def test_take_fill_value(self): + # GH 12631 + idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days'], + name='xxx') + result = idx.take(np.array([1, 0, -1])) + expected = pd.TimedeltaIndex(['2 days', '1 days', '3 days'], + name='xxx') + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.TimedeltaIndex(['2 days', '1 days', 'NaT'], + name='xxx') + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = pd.TimedeltaIndex(['2 days', '1 days', '3 days'], + name='xxx') + tm.assert_index_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + idx.take(np.array([1, -5])) + + def test_isin(self): + + index = tm.makeTimedeltaIndex(4) + result = index.isin(index) + self.assertTrue(result.all()) + + result = index.isin(list(index)) + self.assertTrue(result.all()) + + assert_almost_equal(index.isin([index[2], 5]), + np.array([False, False, True, False])) + + def test_factorize(self): + idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', + '3 day']) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day']) + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_index_equal(idx, exp_idx) + + # freq must be preserved + idx3 = timedelta_range('1 day', periods=4, freq='s') + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_index_equal(idx, idx3) + + def test_join_self(self): + + index = timedelta_range('1 day', periods=10) + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = index.join(index, how=kind) + tm.assert_index_equal(index, joined) + + def test_slice_keeps_name(self): + + # GH4226 + dr = pd.timedelta_range('1d', '5d', freq='H', name='timebucket') + self.assertEqual(dr[1:].name, dr.name) + + def test_does_not_convert_mixed_integer(self): + df = tm.makeCustomDataframe(10, 10, + data_gen_f=lambda *args, **kwargs: randn(), + r_idx_type='i', c_idx_type='td') + str(df) + + cols = df.columns.join(df.index, how='outer') + joined = cols.join(df.columns) + self.assertEqual(cols.dtype, np.dtype('O')) + self.assertEqual(cols.dtype, joined.dtype) + tm.assert_index_equal(cols, joined) + + def test_sort_values(self): + + idx = TimedeltaIndex(['4d', '1d', '2d']) + + ordered = idx.sort_values() + self.assertTrue(ordered.is_monotonic) + + ordered = idx.sort_values(ascending=False) + self.assertTrue(ordered[::-1].is_monotonic) + + ordered, dexer = 
idx.sort_values(return_indexer=True)
+        self.assertTrue(ordered.is_monotonic)
+        self.assert_numpy_array_equal(dexer,
+                                      np.array([1, 2, 0]),
+                                      check_dtype=False)
+
+        ordered, dexer = idx.sort_values(return_indexer=True, ascending=False)
+        self.assertTrue(ordered[::-1].is_monotonic)
+        self.assert_numpy_array_equal(dexer,
+                                      np.array([0, 2, 1]),
+                                      check_dtype=False)
+
+    def test_get_duplicates(self):
+        idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day',
+                              '4day'])
+
+        result = idx.get_duplicates()
+        ex = TimedeltaIndex(['2 day', '3day'])
+        self.assert_index_equal(result, ex)
+
+    def test_argmin_argmax(self):
+        idx = TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01',
+                              '1 day 00:00:02'])
+        self.assertEqual(idx.argmin(), 1)
+        self.assertEqual(idx.argmax(), 0)
+
+    def test_misc_coverage(self):
+
+        rng = timedelta_range('1 day', periods=5)
+        result = rng.groupby(rng.days)
+        tm.assertIsInstance(list(result.values())[0][0], Timedelta)
+
+        idx = TimedeltaIndex(['3d', '1d', '2d'])
+        self.assertFalse(idx.equals(list(idx)))
+
+        non_td = Index(list('abc'))
+        self.assertFalse(idx.equals(list(non_td)))
+
+    def test_map(self):
+
+        rng = timedelta_range('1 day', periods=10)
+
+        f = lambda x: x.days
+        result = rng.map(f)
+        exp = Int64Index([f(x) for x in rng])
+        tm.assert_index_equal(result, exp)
+
+    def test_comparisons_nat(self):
+
+        tdidx1 = pd.TimedeltaIndex(['1 day', pd.NaT, '1 day 00:00:01', pd.NaT,
+                                    '1 day 00:00:01', '5 day 00:00:03'])
+        tdidx2 = pd.TimedeltaIndex(['2 day', '2 day', pd.NaT, pd.NaT,
+                                    '1 day 00:00:02', '5 days 00:00:03'])
+        tdarr = np.array([np.timedelta64(2, 'D'),
+                          np.timedelta64(2, 'D'), np.timedelta64('nat'),
+                          np.timedelta64('nat'),
+                          np.timedelta64(1, 'D') + np.timedelta64(2, 's'),
+                          np.timedelta64(5, 'D') + np.timedelta64(3, 's')])
+
+        if _np_version_under1p8:
+            # cannot test array because np.datetime64('nat') returns
+            # today's date
+            cases = [(tdidx1, tdidx2)]
+        else:
+            cases = [(tdidx1, tdidx2), (tdidx1, tdarr)]
+
+        # Check pd.NaT is handled the same as np.nan
+        for idx1, idx2 in cases:
+
+            result = idx1 < idx2
+            expected = np.array([True, False, False, False, True, False])
+            self.assert_numpy_array_equal(result, expected)
+
+            result = idx2 > idx1
+            expected = np.array([True, False, False, False, True, False])
+            self.assert_numpy_array_equal(result, expected)
+
+            result = idx1 <= idx2
+            expected = np.array([True, False, False, False, True, True])
+            self.assert_numpy_array_equal(result, expected)
+
+            result = idx2 >= idx1
+            expected = np.array([True, False, False, False, True, True])
+            self.assert_numpy_array_equal(result, expected)
+
+            result = idx1 == idx2
+            expected = np.array([False, False, False, False, False, True])
+            self.assert_numpy_array_equal(result, expected)
+
+            result = idx1 != idx2
+            expected = np.array([True, True, True, True, True, False])
+            self.assert_numpy_array_equal(result, expected)
+
+    def test_comparisons_coverage(self):
+        rng = timedelta_range('1 days', periods=10)
+
+        result = rng < rng[3]
+        exp = np.array([True, True, True] + [False] * 7)
+        self.assert_numpy_array_equal(result, exp)
+
+        # raise TypeError for now
+        self.assertRaises(TypeError, rng.__lt__, rng[3].value)
+
+        result = rng == list(rng)
+        exp = rng == rng
+        self.assert_numpy_array_equal(result, exp)
+
+    def test_total_seconds(self):
+        # GH 10939
+        # test index
+        rng = timedelta_range('1 days, 10:11:12.100123456', periods=2,
+                              freq='s')
+        expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9,
+                1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456.
/ 1e9] + tm.assert_almost_equal(rng.total_seconds(), np.array(expt)) + + # test Series + s = Series(rng) + s_expt = Series(expt, index=[0, 1]) + tm.assert_series_equal(s.dt.total_seconds(), s_expt) + + # with nat + s[1] = np.nan + s_expt = Series([1 * 86400 + 10 * 3600 + 11 * 60 + + 12 + 100123456. / 1e9, np.nan], index=[0, 1]) + tm.assert_series_equal(s.dt.total_seconds(), s_expt) + + # with both nat + s = Series([np.nan, np.nan], dtype='timedelta64[ns]') + tm.assert_series_equal(s.dt.total_seconds(), + Series([np.nan, np.nan], index=[0, 1])) + + def test_pass_TimedeltaIndex_to_index(self): + + rng = timedelta_range('1 days', '10 days') + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pytimedelta(), dtype=object) + + self.assert_numpy_array_equal(idx.values, expected.values) + + def test_pickle(self): + + rng = timedelta_range('1 days', periods=10) + rng_p = self.round_trip_pickle(rng) + tm.assert_index_equal(rng, rng_p) + + def test_hash_error(self): + index = timedelta_range('1 days', periods=10) + with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % + type(index).__name__): + hash(index) + + def test_append_join_nondatetimeindex(self): + rng = timedelta_range('1 days', periods=10) + idx = Index(['a', 'b', 'c', 'd']) + + result = rng.append(idx) + tm.assertIsInstance(result[0], Timedelta) + + # it works + rng.join(idx, how='outer') + + def test_append_numpy_bug_1681(self): + + td = timedelta_range('1 days', '10 days', freq='2D') + a = DataFrame() + c = DataFrame({'A': 'foo', 'B': td}, index=td) + str(c) + + result = a.append(c) + self.assertTrue((result['B'] == td).all()) + + def test_fields(self): + rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, + freq='s') + self.assert_numpy_array_equal(rng.days, np.array( + [1, 1], dtype='int64')) + self.assert_numpy_array_equal( + rng.seconds, + np.array([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], + dtype='int64')) + self.assert_numpy_array_equal(rng.microseconds, np.array( + [100 * 1000 + 123, 100 * 1000 + 123], dtype='int64')) + self.assert_numpy_array_equal(rng.nanoseconds, np.array( + [456, 456], dtype='int64')) + + self.assertRaises(AttributeError, lambda: rng.hours) + self.assertRaises(AttributeError, lambda: rng.minutes) + self.assertRaises(AttributeError, lambda: rng.milliseconds) + + # with nat + s = Series(rng) + s[1] = np.nan + + tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) + tm.assert_series_equal(s.dt.seconds, Series( + [10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1])) + + def test_freq_conversion(self): + + # doc example + + # series + td = Series(date_range('20130101', periods=4)) - \ + Series(date_range('20121201', periods=4)) + td[2] += timedelta(minutes=5, seconds=3) + td[3] = np.nan + + result = td / np.timedelta64(1, 'D') + expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan + ]) + assert_series_equal(result, expected) + + result = td.astype('timedelta64[D]') + expected = Series([31, 31, 31, np.nan]) + assert_series_equal(result, expected) + + result = td / np.timedelta64(1, 's') + expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, + np.nan]) + assert_series_equal(result, expected) + + result = td.astype('timedelta64[s]') + assert_series_equal(result, expected) + + # tdi + td = TimedeltaIndex(td) + + result = td / np.timedelta64(1, 'D') + expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) + assert_index_equal(result, expected) + + result = td.astype('timedelta64[D]') + expected = Index([31, 31, 31, 
np.nan]) + assert_index_equal(result, expected) + + result = td / np.timedelta64(1, 's') + expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, + np.nan]) + assert_index_equal(result, expected) + + result = td.astype('timedelta64[s]') + assert_index_equal(result, expected) + + +class TestSlicing(tm.TestCase): + + def test_timedelta(self): + # this is valid too + index = date_range('1/1/2000', periods=50, freq='B') + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + self.assertTrue(tm.equalContents(index, back)) + self.assertEqual(shifted.freq, index.freq) + self.assertEqual(shifted.freq, back.freq) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) + + # GH4134, buggy with timedeltas + rng = date_range('2013', '2014') + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) + + +class TestTimeSeries(tm.TestCase): + _multiprocess_can_split_ = True + + def test_series_box_timedelta(self): + rng = timedelta_range('1 day 1 s', periods=5, freq='h') + s = Series(rng) + tm.assertIsInstance(s[1], Timedelta) + tm.assertIsInstance(s.iat[2], Timedelta) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py new file mode 100644 index 0000000000000..8bd56b5885bba --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -0,0 +1,51 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas.tseries.offsets import Day, Second +from pandas import to_timedelta, timedelta_range +from pandas.util.testing import assert_frame_equal + + +class TestTimedeltas(tm.TestCase): + _multiprocess_can_split_ = True + + def test_timedelta_range(self): + + expected = to_timedelta(np.arange(5), unit='D') + result = timedelta_range('0 days', periods=5, freq='D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(11), unit='D') + result = timedelta_range('0 days', '10 days', freq='D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(5), unit='D') + Second(2) + Day() + result = timedelta_range('1 days, 00:00:02', '5 days, 00:00:02', + freq='D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta([1, 3, 5, 7, 9], unit='D') + Second(2) + result = timedelta_range('1 days, 00:00:02', periods=5, freq='2D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(50), unit='T') * 30 + result = timedelta_range('0 days', freq='30T', periods=50) + tm.assert_index_equal(result, expected) + + # GH 11776 + arr = np.arange(10).reshape(2, 5) + df = pd.DataFrame(np.arange(10).reshape(2, 5)) + for arg in (arr, df): + with tm.assertRaisesRegexp(TypeError, "1-d array"): + to_timedelta(arg) + for errors in ['ignore', 'raise', 'coerce']: + with tm.assertRaisesRegexp(TypeError, "1-d array"): + to_timedelta(arg, errors=errors) + + # issue10583 + df = pd.DataFrame(np.random.normal(size=(10, 4))) + df.index = pd.timedelta_range(start='0s', periods=10, freq='s') + expected = df.loc[pd.Timedelta('0s'):, :] + result = df.loc['0s':, :] + assert_frame_equal(expected, result) diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py new file mode 100644 index 
0000000000000..2442051547312 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -0,0 +1,201 @@ +from datetime import time, timedelta +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas.util.testing import assert_series_equal +from pandas import (Series, Timedelta, to_timedelta, tslib, isnull, + TimedeltaIndex) + + +class TestTimedeltas(tm.TestCase): + _multiprocess_can_split_ = True + + def test_to_timedelta(self): + def conv(v): + return v.astype('m8[ns]') + + d1 = np.timedelta64(1, 'D') + + self.assertEqual(to_timedelta('1 days 06:05:01.00003', box=False), + conv(d1 + np.timedelta64(6 * 3600 + + 5 * 60 + 1, 's') + + np.timedelta64(30, 'us'))) + self.assertEqual(to_timedelta('15.5us', box=False), + conv(np.timedelta64(15500, 'ns'))) + + # empty string + result = to_timedelta('', box=False) + self.assertEqual(result.astype('int64'), tslib.iNaT) + + result = to_timedelta(['', '']) + self.assertTrue(isnull(result).all()) + + # pass thru + result = to_timedelta(np.array([np.timedelta64(1, 's')])) + expected = pd.Index(np.array([np.timedelta64(1, 's')])) + tm.assert_index_equal(result, expected) + + # ints + result = np.timedelta64(0, 'ns') + expected = to_timedelta(0, box=False) + self.assertEqual(result, expected) + + # Series + expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) + result = to_timedelta(Series(['1d', '1days 00:00:01'])) + tm.assert_series_equal(result, expected) + + # with units + result = TimedeltaIndex([np.timedelta64(0, 'ns'), np.timedelta64( + 10, 's').astype('m8[ns]')]) + expected = to_timedelta([0, 10], unit='s') + tm.assert_index_equal(result, expected) + + # single element conversion + v = timedelta(seconds=1) + result = to_timedelta(v, box=False) + expected = np.timedelta64(timedelta(seconds=1)) + self.assertEqual(result, expected) + + v = np.timedelta64(timedelta(seconds=1)) + result = to_timedelta(v, box=False) + expected = np.timedelta64(timedelta(seconds=1)) + self.assertEqual(result, expected) + + # arrays of various dtypes + arr = np.array([1] * 5, dtype='int64') + result = to_timedelta(arr, unit='s') + expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype='int64') + result = to_timedelta(arr, unit='m') + expected = TimedeltaIndex([np.timedelta64(1, 'm')] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype='int64') + result = to_timedelta(arr, unit='h') + expected = TimedeltaIndex([np.timedelta64(1, 'h')] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype='timedelta64[s]') + result = to_timedelta(arr) + expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype='timedelta64[D]') + result = to_timedelta(arr) + expected = TimedeltaIndex([np.timedelta64(1, 'D')] * 5) + tm.assert_index_equal(result, expected) + + # Test with lists as input when box=false + expected = np.array(np.arange(3) * 1000000000, dtype='timedelta64[ns]') + result = to_timedelta(range(3), unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) + + result = to_timedelta(np.arange(3), unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) + + result = to_timedelta([0, 1, 2], unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) + + # Tests with fractional seconds as input: + expected = np.array( + [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]') + result = 
to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False)
+        tm.assert_numpy_array_equal(expected, result)
+
+    def test_to_timedelta_invalid(self):
+
+        # bad value for errors parameter
+        msg = "errors must be one of"
+        tm.assertRaisesRegexp(ValueError, msg, to_timedelta,
+                              ['foo'], errors='never')
+
+        # these will error
+        self.assertRaises(ValueError,
+                          lambda: to_timedelta([1, 2], unit='foo'))
+        self.assertRaises(ValueError, lambda: to_timedelta(1, unit='foo'))
+
+        # time not supported ATM
+        self.assertRaises(ValueError, lambda: to_timedelta(time(second=1)))
+        self.assertTrue(to_timedelta(
+            time(second=1), errors='coerce') is pd.NaT)
+
+        self.assertRaises(ValueError, lambda: to_timedelta(['foo', 'bar']))
+        tm.assert_index_equal(TimedeltaIndex([pd.NaT, pd.NaT]),
+                              to_timedelta(['foo', 'bar'], errors='coerce'))
+
+        tm.assert_index_equal(TimedeltaIndex(['1 day', pd.NaT, '1 min']),
+                              to_timedelta(['1 day', 'bar', '1 min'],
+                                           errors='coerce'))
+
+        # gh-13613: these should not error because errors='ignore'
+        invalid_data = 'apple'
+        self.assertEqual(invalid_data, to_timedelta(
+            invalid_data, errors='ignore'))
+
+        invalid_data = ['apple', '1 days']
+        tm.assert_numpy_array_equal(
+            np.array(invalid_data, dtype=object),
+            to_timedelta(invalid_data, errors='ignore'))
+
+        invalid_data = pd.Index(['apple', '1 days'])
+        tm.assert_index_equal(invalid_data, to_timedelta(
+            invalid_data, errors='ignore'))
+
+        invalid_data = Series(['apple', '1 days'])
+        tm.assert_series_equal(invalid_data, to_timedelta(
+            invalid_data, errors='ignore'))
+
+    def test_to_timedelta_via_apply(self):
+        # GH 5458
+        expected = Series([np.timedelta64(1, 's')])
+        result = Series(['00:00:01']).apply(to_timedelta)
+        tm.assert_series_equal(result, expected)
+
+        result = Series([to_timedelta('00:00:01')])
+        tm.assert_series_equal(result, expected)
+
+    def test_to_timedelta_on_missing_values(self):
+        # GH5438
+        timedelta_NaT = np.timedelta64('NaT')
+
+        actual = pd.to_timedelta(Series(['00:00:01', np.nan]))
+        expected = Series([np.timedelta64(1000000000, 'ns'),
+                           timedelta_NaT], dtype='<m8[ns]')
+        assert_series_equal(actual, expected)
+
+    def test_ops_error_str(self):
+        # GH 13624
+        td = Timedelta('1 day')
+
+        for l, r in [(td, 'a'), ('a', td)]:
+            with tm.assertRaises(TypeError):
+                l + r
+
+            with tm.assertRaises(TypeError):
+                l > r
+
+            self.assertFalse(l == r)
+            self.assertTrue(l != r)
diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py
index 0cef27d2e41fc..2abc83ca6109c 100644
--- a/pandas/tests/scalar/test_timestamp.py
+++ b/pandas/tests/scalar/test_timestamp.py
@@ -1637,3 +1637,48 @@ def test_woy_boundary(self):
         for args in [(2000, 1, 1), (2000, 1, 2), (
             2005, 1, 1), (2005, 1, 2)]])
     self.assertTrue((result == [52, 52, 53, 53]).all())
+
+
+class TestTsUtil(tm.TestCase):
+
+    def test_min_valid(self):
+        # Ensure that Timestamp.min is a valid Timestamp
+        Timestamp(Timestamp.min)
+
+    def test_max_valid(self):
+        # Ensure that Timestamp.max is a valid Timestamp
+        Timestamp(Timestamp.max)
+
+    def test_to_datetime_bijective(self):
+        # Ensure that converting to datetime and back only loses precision
+        # by going from nanoseconds to microseconds.
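+        # Timestamp has nanosecond resolution while datetime.datetime
+        # stops at microseconds; e.g. Timestamp.max is
+        # 2262-04-11 23:47:16.854775807, whose trailing nanoseconds
+        # cannot survive to_pydatetime(). The checks below therefore
+        # compare at microsecond resolution (value / 1000) and expect
+        # a UserWarning whenever nanoseconds are discarded.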
+ exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + self.assertEqual( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000, + Timestamp.max.value / 1000) + + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + self.assertEqual( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000, + Timestamp.min.value / 1000) + + +class TestTslib(tm.TestCase): + + def test_round(self): + stamp = Timestamp('2000-01-05 05:09:15.13') + + def _check_round(freq, expected): + result = stamp.round(freq=freq) + self.assertEqual(result, expected) + + for freq, expected in [('D', Timestamp('2000-01-05 00:00:00')), + ('H', Timestamp('2000-01-05 05:00:00')), + ('S', Timestamp('2000-01-05 05:09:15'))]: + _check_round(freq, expected) + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + stamp.round('foo') diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index be3b917cb8117..114cb02205d4f 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -2,10 +2,8 @@ from datetime import timedelta import numpy as np import pandas as pd -from pandas import (Series, Index, Int64Index, Timestamp, Period, - DatetimeIndex, PeriodIndex, TimedeltaIndex, - Timedelta, timedelta_range, date_range, Float64Index, - _np_version_under1p10) +from pandas import (Series, Index, Period, DatetimeIndex, PeriodIndex, + Timedelta, _np_version_under1p10) import pandas.tslib as tslib import pandas.tseries.period as period @@ -14,846 +12,6 @@ from pandas.tests.test_base import Ops -class TestTimedeltaIndexOps(Ops): - - def setUp(self): - super(TestTimedeltaIndexOps, self).setUp() - mask = lambda x: isinstance(x, TimedeltaIndex) - self.is_valid_objs = [o for o in self.objs if mask(o)] - self.not_valid_objs = [] - - def test_ops_properties(self): - self.check_ops_properties(['days', 'hours', 'minutes', 'seconds', - 'milliseconds']) - self.check_ops_properties(['microseconds', 'nanoseconds']) - - def test_asobject_tolist(self): - idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') - expected_list = [Timedelta('1 days'), Timedelta('2 days'), - Timedelta('3 days'), Timedelta('4 days')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - - self.assertEqual(result.dtype, object) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT, - timedelta(days=4)], name='idx') - expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT, - Timedelta('4 days')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - def test_minmax(self): - - # monotonic - idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) - self.assertTrue(idx1.is_monotonic) - - # non-monotonic - idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) - self.assertFalse(idx2.is_monotonic) - - for idx in [idx1, idx2]: - self.assertEqual(idx.min(), Timedelta('1 days')), - 
self.assertEqual(idx.max(), Timedelta('3 days')), - self.assertEqual(idx.argmin(), 0) - self.assertEqual(idx.argmax(), 2) - - for op in ['min', 'max']: - # Return NaT - obj = TimedeltaIndex([]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - obj = TimedeltaIndex([pd.NaT]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - def test_numpy_minmax(self): - dr = pd.date_range(start='2016-01-15', end='2016-01-20') - td = TimedeltaIndex(np.asarray(dr)) - - self.assertEqual(np.min(td), Timedelta('16815 days')) - self.assertEqual(np.max(td), Timedelta('16820 days')) - - errmsg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, errmsg, np.min, td, out=0) - tm.assertRaisesRegexp(ValueError, errmsg, np.max, td, out=0) - - self.assertEqual(np.argmin(td), 0) - self.assertEqual(np.argmax(td), 5) - - if not _np_version_under1p10: - errmsg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, errmsg, np.argmin, td, out=0) - tm.assertRaisesRegexp(ValueError, errmsg, np.argmax, td, out=0) - - def test_round(self): - td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min') - elt = td[1] - - expected_rng = TimedeltaIndex([ - Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 01:00:00'), - Timedelta('16801 days 02:00:00'), - Timedelta('16801 days 02:00:00'), - ]) - expected_elt = expected_rng[1] - - tm.assert_index_equal(td.round(freq='H'), expected_rng) - self.assertEqual(elt.round(freq='H'), expected_elt) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - td.round(freq='foo') - with tm.assertRaisesRegexp(ValueError, msg): - elt.round(freq='foo') - - msg = " is a non-fixed frequency" - tm.assertRaisesRegexp(ValueError, msg, td.round, freq='M') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') - - def test_representation(self): - idx1 = TimedeltaIndex([], freq='D') - idx2 = TimedeltaIndex(['1 days'], freq='D') - idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') - idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') - idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - - exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" - - exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " - "freq='D')") - - exp3 = ("TimedeltaIndex(['1 days', '2 days'], " - "dtype='timedelta64[ns]', freq='D')") - - exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], " - "dtype='timedelta64[ns]', freq='D')") - - exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " - "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)") - - with pd.option_context('display.width', 300): - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - self.assertEqual(result, expected) - - def test_representation_to_series(self): - idx1 = TimedeltaIndex([], freq='D') - idx2 = TimedeltaIndex(['1 days'], freq='D') - idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') - idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') - idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - - exp1 = """Series([], dtype: timedelta64[ns])""" - - exp2 = """0 1 days -dtype: timedelta64[ns]""" - - exp3 = """0 1 days -1 2 days -dtype: timedelta64[ns]""" - - exp4 = """0 1 days -1 2 days -2 3 days 
-dtype: timedelta64[ns]""" - - exp5 = """0 1 days 00:00:01 -1 2 days 00:00:00 -2 3 days 00:00:00 -dtype: timedelta64[ns]""" - - with pd.option_context('display.width', 300): - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - result = repr(pd.Series(idx)) - self.assertEqual(result, expected) - - def test_summary(self): - # GH9116 - idx1 = TimedeltaIndex([], freq='D') - idx2 = TimedeltaIndex(['1 days'], freq='D') - idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') - idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') - idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - - exp1 = """TimedeltaIndex: 0 entries -Freq: D""" - - exp2 = """TimedeltaIndex: 1 entries, 1 days to 1 days -Freq: D""" - - exp3 = """TimedeltaIndex: 2 entries, 1 days to 2 days -Freq: D""" - - exp4 = """TimedeltaIndex: 3 entries, 1 days to 3 days -Freq: D""" - - exp5 = ("TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days " - "00:00:00") - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - result = idx.summary() - self.assertEqual(result, expected) - - def test_add_iadd(self): - - # only test adding/sub offsets as + is now numeric - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = timedelta_range('1 days', '10 days') - result = rng + delta - expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', - freq='D') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - # int - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - result = rng + 1 - expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - def test_sub_isub(self): - # only test adding/sub offsets as - is now numeric - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = timedelta_range('1 days', '10 days') - result = rng - delta - expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - # int - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - result = rng - 1 - expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - - idx = TimedeltaIndex(['1 day', '2 day']) - msg = "cannot subtract a datelike from a TimedeltaIndex" - with tm.assertRaisesRegexp(TypeError, msg): - idx - Timestamp('2011-01-01') - - result = Timestamp('2011-01-01') + idx - expected = DatetimeIndex(['2011-01-02', '2011-01-03']) - tm.assert_index_equal(result, expected) - - def test_ops_compat(self): - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - rng = timedelta_range('1 days', '10 days', name='foo') - - # multiply - for offset in offsets: - self.assertRaises(TypeError, lambda: rng * offset) - - # divide - expected = Int64Index((np.arange(10) + 1) * 12, name='foo') - for offset in offsets: - result = rng / offset - tm.assert_index_equal(result, expected, exact=False) - - # divide with nats - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - expected = Float64Index([12, np.nan, 24], name='foo') - for offset in offsets: - result = rng / 
offset - tm.assert_index_equal(result, expected) - - # don't allow division by NaT (make could in the future) - self.assertRaises(TypeError, lambda: rng / pd.NaT) - - def test_subtraction_ops(self): - - # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') - - self.assertRaises(TypeError, lambda: tdi - dt) - self.assertRaises(TypeError, lambda: tdi - dti) - self.assertRaises(TypeError, lambda: td - dt) - self.assertRaises(TypeError, lambda: td - dti) - - result = dt - dti - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') - tm.assert_index_equal(result, expected) - - result = dti - dt - expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') - tm.assert_index_equal(result, expected) - - result = tdi - td - expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) - - result = td - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) - - result = dti - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], name='bar') - tm.assert_index_equal(result, expected, check_names=False) - - result = dt - tdi - expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') - tm.assert_index_equal(result, expected) - - def test_subtraction_ops_with_tz(self): - - # check that dt/dti subtraction ops with tz are validated - dti = date_range('20130101', periods=3) - ts = Timestamp('20130101') - dt = ts.to_pydatetime() - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - ts_tz = Timestamp('20130101').tz_localize('US/Eastern') - ts_tz2 = Timestamp('20130101').tz_localize('CET') - dt_tz = ts_tz.to_pydatetime() - td = Timedelta('1 days') - - def _check(result, expected): - self.assertEqual(result, expected) - self.assertIsInstance(result, Timedelta) - - # scalars - result = ts - ts - expected = Timedelta('0 days') - _check(result, expected) - - result = dt_tz - ts_tz - expected = Timedelta('0 days') - _check(result, expected) - - result = ts_tz - dt_tz - expected = Timedelta('0 days') - _check(result, expected) - - # tz mismatches - self.assertRaises(TypeError, lambda: dt_tz - ts) - self.assertRaises(TypeError, lambda: dt_tz - dt) - self.assertRaises(TypeError, lambda: dt_tz - ts_tz2) - self.assertRaises(TypeError, lambda: dt - dt_tz) - self.assertRaises(TypeError, lambda: ts - dt_tz) - self.assertRaises(TypeError, lambda: ts_tz2 - ts) - self.assertRaises(TypeError, lambda: ts_tz2 - dt) - self.assertRaises(TypeError, lambda: ts_tz - ts_tz2) - - # with dti - self.assertRaises(TypeError, lambda: dti - ts_tz) - self.assertRaises(TypeError, lambda: dti_tz - ts) - self.assertRaises(TypeError, lambda: dti_tz - ts_tz2) - - result = dti_tz - dt_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) - tm.assert_index_equal(result, expected) - - result = dt_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) - tm.assert_index_equal(result, expected) - - result = dti_tz - ts_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) - tm.assert_index_equal(result, expected) - - result = ts_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) - tm.assert_index_equal(result, expected) - - result = td - td - expected = Timedelta('0 days') - _check(result, expected) - - result = 
dti_tz - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], tz='US/Eastern') - tm.assert_index_equal(result, expected) - - def test_dti_tdi_numeric_ops(self): - - # These are normally union/diff set-like ops - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = date_range('20130101', periods=3, name='bar') - - # TODO(wesm): unused? - # td = Timedelta('1 days') - # dt = Timestamp('20130101') - - result = tdi - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') - tm.assert_index_equal(result, expected) - - result = tdi + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') - tm.assert_index_equal(result, expected) - - result = dti - tdi # name will be reset - expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) - tm.assert_index_equal(result, expected) - - def test_sub_period(self): - # GH 13078 - # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - - for freq in [None, 'H']: - idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) - - with tm.assertRaises(TypeError): - idx - p - - with tm.assertRaises(TypeError): - p - idx - - def test_addition_ops(self): - - # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') - - result = tdi + dt - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') - tm.assert_index_equal(result, expected) - - result = dt + tdi - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') - tm.assert_index_equal(result, expected) - - result = td + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') - tm.assert_index_equal(result, expected) - - result = tdi + td - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') - tm.assert_index_equal(result, expected) - - # unequal length - self.assertRaises(ValueError, lambda: tdi + dti[0:1]) - self.assertRaises(ValueError, lambda: tdi[0:1] + dti) - - # random indexes - self.assertRaises(TypeError, lambda: tdi + Int64Index([1, 2, 3])) - - # this is a union! 
- # self.assertRaises(TypeError, lambda : Int64Index([1,2,3]) + tdi) - - result = tdi + dti # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) - tm.assert_index_equal(result, expected) - - result = dti + tdi # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) - tm.assert_index_equal(result, expected) - - result = dt + td - expected = Timestamp('20130102') - self.assertEqual(result, expected) - - result = td + dt - expected = Timestamp('20130102') - self.assertEqual(result, expected) - - def test_comp_nat(self): - left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, - pd.Timedelta('3 days')]) - right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) - - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r - expected = np.array([False, False, True]) - tm.assert_numpy_array_equal(result, expected) - - result = l != r - expected = np.array([True, True, False]) - tm.assert_numpy_array_equal(result, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) - - expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) - - def test_value_counts_unique(self): - # GH 7735 - - idx = timedelta_range('1 days 09:00:00', freq='H', periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) - - exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) - tm.assert_index_equal(idx.unique(), expected) - - idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', - '1 days 09:00:00', '1 days 08:00:00', - '1 days 08:00:00', pd.NaT]) - - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) - expected = Series([3, 2], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', - pd.NaT]) - expected = Series([3, 2, 1], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - - def test_nonunique_contains(self): - # GH 9512 - for idx in map(TimedeltaIndex, ([0, 1, 0], [0, 0, -1], [0, -1, -1], - ['00:01:00', '00:01:00', '00:02:00'], - ['00:01:00', '00:01:00', '00:00:01'])): - tm.assertIn(idx[0], idx) - - def test_unknown_attribute(self): - # GH 9680 - tdi = pd.timedelta_range(start=0, periods=10, freq='1s') - ts = pd.Series(np.random.normal(size=10), index=tdi) - self.assertNotIn('foo', ts.__dict__.keys()) - self.assertRaises(AttributeError, lambda: ts.foo) - - def test_order(self): - # GH 10295 - idx1 = TimedeltaIndex(['1 day', '2 day', '3 day'], freq='D', - name='idx') - idx2 = TimedeltaIndex( - ['1 hour', '2 hour', '3 hour'], freq='H', name='idx') - - for idx in [idx1, idx2]: - ordered = idx.sort_values() - self.assert_index_equal(ordered, idx) - self.assertEqual(ordered.freq, idx.freq) - - ordered = 
idx.sort_values(ascending=False) - expected = idx[::-1] - self.assert_index_equal(ordered, expected) - self.assertEqual(ordered.freq, expected.freq) - self.assertEqual(ordered.freq.n, -1) - - ordered, indexer = idx.sort_values(return_indexer=True) - self.assert_index_equal(ordered, idx) - self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2]), - check_dtype=False) - self.assertEqual(ordered.freq, idx.freq) - - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) - self.assert_index_equal(ordered, idx[::-1]) - self.assertEqual(ordered.freq, expected.freq) - self.assertEqual(ordered.freq.n, -1) - - idx1 = TimedeltaIndex(['1 hour', '3 hour', '5 hour', - '2 hour ', '1 hour'], name='idx1') - exp1 = TimedeltaIndex(['1 hour', '1 hour', '2 hour', - '3 hour', '5 hour'], name='idx1') - - idx2 = TimedeltaIndex(['1 day', '3 day', '5 day', - '2 day', '1 day'], name='idx2') - - # TODO(wesm): unused? - # exp2 = TimedeltaIndex(['1 day', '1 day', '2 day', - # '3 day', '5 day'], name='idx2') - - # idx3 = TimedeltaIndex([pd.NaT, '3 minute', '5 minute', - # '2 minute', pd.NaT], name='idx3') - # exp3 = TimedeltaIndex([pd.NaT, pd.NaT, '2 minute', '3 minute', - # '5 minute'], name='idx3') - - for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: - ordered = idx.sort_values() - self.assert_index_equal(ordered, expected) - self.assertIsNone(ordered.freq) - - ordered = idx.sort_values(ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True) - self.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - self.assert_numpy_array_equal(indexer, exp, check_dtype=False) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0]) - self.assert_numpy_array_equal(indexer, exp, check_dtype=False) - self.assertIsNone(ordered.freq) - - def test_getitem(self): - idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') - - for idx in [idx1]: - result = idx[0] - self.assertEqual(result, pd.Timedelta('1 day')) - - result = idx[0:5] - expected = pd.timedelta_range('1 day', '5 day', freq='D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx[0:10:2] - expected = pd.timedelta_range('1 day', '9 day', freq='2D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx[-20:-5:3] - expected = pd.timedelta_range('12 day', '24 day', freq='3D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx[4::-1] - expected = TimedeltaIndex(['5 day', '4 day', '3 day', - '2 day', '1 day'], - freq='-1D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - def test_drop_duplicates_metadata(self): - # GH 10115 - idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') - result = idx.drop_duplicates() - self.assert_index_equal(idx, result) - self.assertEqual(idx.freq, result.freq) - - idx_dup = idx.append(idx) - self.assertIsNone(idx_dup.freq) # freq is reset - result = idx_dup.drop_duplicates() - self.assert_index_equal(idx, result) - self.assertIsNone(result.freq) - - def test_drop_duplicates(self): - # to check Index/Series compat - base = pd.timedelta_range('1 day', '31 day', 
freq='D', name='idx') - idx = base.append(base[:5]) - - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) - - res = idx.drop_duplicates(keep='last') - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) - - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - - def test_take(self): - # GH 10295 - idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') - - for idx in [idx1]: - result = idx.take([0]) - self.assertEqual(result, pd.Timedelta('1 day')) - - result = idx.take([-1]) - self.assertEqual(result, pd.Timedelta('31 day')) - - result = idx.take([0, 1, 2]) - expected = pd.timedelta_range('1 day', '3 day', freq='D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx.take([0, 2, 4]) - expected = pd.timedelta_range('1 day', '5 day', freq='2D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx.take([7, 4, 1]) - expected = pd.timedelta_range('8 day', '2 day', freq='-3D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - - result = idx.take([3, 2, 5]) - expected = TimedeltaIndex(['4 day', '3 day', '6 day'], name='idx') - self.assert_index_equal(result, expected) - self.assertIsNone(result.freq) - - result = idx.take([-3, 2, 5]) - expected = TimedeltaIndex(['29 day', '3 day', '6 day'], name='idx') - self.assert_index_equal(result, expected) - self.assertIsNone(result.freq) - - def test_take_invalid_kwargs(self): - idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') - indices = [1, 6, 5, 9, 10, 13, 15, 3] - - msg = r"take\(\) got an unexpected keyword argument 'foo'" - tm.assertRaisesRegexp(TypeError, msg, idx.take, - indices, foo=2) - - msg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, idx.take, - indices, out=indices) - - msg = "the 'mode' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, idx.take, - indices, mode='clip') - - def test_infer_freq(self): - # GH 11018 - for freq in ['D', '3D', '-3D', 'H', '2H', '-2H', 'T', '2T', 'S', '-3S' - ]: - idx = pd.timedelta_range('1', freq=freq, periods=10) - result = pd.TimedeltaIndex(idx.asi8, freq='infer') - tm.assert_index_equal(idx, result) - self.assertEqual(result.freq, freq) - - def test_nat_new(self): - - idx = pd.timedelta_range('1', freq='D', periods=5, name='x') - result = idx._nat_new() - exp = pd.TimedeltaIndex([pd.NaT] * 5, name='x') - tm.assert_index_equal(result, exp) - - result = idx._nat_new(box=False) - exp = np.array([tslib.iNaT] * 5, dtype=np.int64) - tm.assert_numpy_array_equal(result, exp) - - def test_shift(self): - # GH 9903 - idx = pd.TimedeltaIndex([], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) - - idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx') - tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx') 
- tm.assert_index_equal(idx.shift(-3, freq='H'), exp) - - tm.assert_index_equal(idx.shift(0, freq='T'), idx) - exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '9:03:00'], - name='xxx') - tm.assert_index_equal(idx.shift(3, freq='T'), exp) - exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '8:57:00'], - name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='T'), exp) - - def test_repeat(self): - index = pd.timedelta_range('1 days', periods=2, freq='D') - exp = pd.TimedeltaIndex(['1 days', '1 days', '2 days', '2 days']) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - self.assertIsNone(res.freq) - - index = TimedeltaIndex(['1 days', 'NaT', '3 days']) - exp = TimedeltaIndex(['1 days', '1 days', '1 days', - 'NaT', 'NaT', 'NaT', - '3 days', '3 days', '3 days']) - for res in [index.repeat(3), np.repeat(index, 3)]: - tm.assert_index_equal(res, exp) - self.assertIsNone(res.freq) - - def test_nat(self): - self.assertIs(pd.TimedeltaIndex._na_value, pd.NaT) - self.assertIs(pd.TimedeltaIndex([])._na_value, pd.NaT) - - idx = pd.TimedeltaIndex(['1 days', '2 days']) - self.assertTrue(idx._can_hold_na) - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) - self.assertFalse(idx.hasnans) - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) - - idx = pd.TimedeltaIndex(['1 days', 'NaT']) - self.assertTrue(idx._can_hold_na) - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - self.assertTrue(idx.hasnans) - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) - - def test_equals(self): - # GH 13107 - idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) - self.assertTrue(idx.equals(idx)) - self.assertTrue(idx.equals(idx.copy())) - self.assertTrue(idx.equals(idx.asobject)) - self.assertTrue(idx.asobject.equals(idx)) - self.assertTrue(idx.asobject.equals(idx.asobject)) - self.assertFalse(idx.equals(list(idx))) - self.assertFalse(idx.equals(pd.Series(idx))) - - idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) - self.assertFalse(idx.equals(idx2)) - self.assertFalse(idx.equals(idx2.copy())) - self.assertFalse(idx.equals(idx2.asobject)) - self.assertFalse(idx.asobject.equals(idx2)) - self.assertFalse(idx.asobject.equals(idx2.asobject)) - self.assertFalse(idx.equals(list(idx2))) - self.assertFalse(idx.equals(pd.Series(idx2))) - - class TestPeriodIndexOps(Ops): def setUp(self): diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py deleted file mode 100644 index a64882380850b..0000000000000 --- a/pandas/tseries/tests/test_daterange.py +++ /dev/null @@ -1,820 +0,0 @@ -from datetime import datetime -from pandas.compat import range -import numpy as np - -from pandas.core.index import Index -from pandas.tseries.index import DatetimeIndex - -from pandas import Timestamp -from pandas.tseries.offsets import (BDay, BMonthEnd, CDay, MonthEnd, - generate_range, DateOffset, Minute) -from pandas.tseries.index import cdate_range, bdate_range, date_range - -from pandas.core import common as com -from pandas.util.testing import assertRaisesRegexp -import pandas.util.testing as tm - - -def eq_gen_range(kwargs, expected): - rng = generate_range(**kwargs) - assert (np.array_equal(list(rng), expected)) - - -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - - -class TestGenRangeGeneration(tm.TestCase): - - def test_generate(self): - rng1 = list(generate_range(START, END, offset=BDay())) - rng2 = list(generate_range(START, END, time_rule='B')) - self.assertEqual(rng1, 
rng2) - - def test_generate_cday(self): - rng1 = list(generate_range(START, END, offset=CDay())) - rng2 = list(generate_range(START, END, time_rule='C')) - self.assertEqual(rng1, rng2) - - def test_1(self): - eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), - [datetime(2009, 3, 25), datetime(2009, 3, 26)]) - - def test_2(self): - eq_gen_range(dict(start=datetime(2008, 1, 1), - end=datetime(2008, 1, 3)), - [datetime(2008, 1, 1), - datetime(2008, 1, 2), - datetime(2008, 1, 3)]) - - def test_3(self): - eq_gen_range(dict(start=datetime(2008, 1, 5), - end=datetime(2008, 1, 6)), - []) - - def test_precision_finer_than_offset(self): - # GH 9907 - result1 = DatetimeIndex(start='2015-04-15 00:00:03', - end='2016-04-22 00:00:00', freq='Q') - result2 = DatetimeIndex(start='2015-04-15 00:00:03', - end='2015-06-22 00:00:04', freq='W') - expected1_list = ['2015-06-30 00:00:03', '2015-09-30 00:00:03', - '2015-12-31 00:00:03', '2016-03-31 00:00:03'] - expected2_list = ['2015-04-19 00:00:03', '2015-04-26 00:00:03', - '2015-05-03 00:00:03', '2015-05-10 00:00:03', - '2015-05-17 00:00:03', '2015-05-24 00:00:03', - '2015-05-31 00:00:03', '2015-06-07 00:00:03', - '2015-06-14 00:00:03', '2015-06-21 00:00:03'] - expected1 = DatetimeIndex(expected1_list, dtype='datetime64[ns]', - freq='Q-DEC', tz=None) - expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]', - freq='W-SUN', tz=None) - self.assert_index_equal(result1, expected1) - self.assert_index_equal(result2, expected2) - - -class TestDateRange(tm.TestCase): - - def setUp(self): - self.rng = bdate_range(START, END) - - def test_constructor(self): - bdate_range(START, END, freq=BDay()) - bdate_range(START, periods=20, freq=BDay()) - bdate_range(end=START, periods=20, freq=BDay()) - self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') - self.assertRaises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') - - def test_naive_aware_conflicts(self): - naive = bdate_range(START, END, freq=BDay(), tz=None) - aware = bdate_range(START, END, freq=BDay(), - tz="Asia/Hong_Kong") - assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", naive.join, aware) - assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", aware.join, naive) - - def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=BDay()) - DatetimeIndex._cached_range(START, periods=20, offset=BDay()) - DatetimeIndex._cached_range(end=START, periods=20, offset=BDay()) - - assertRaisesRegexp(TypeError, "offset", DatetimeIndex._cached_range, - START, END) - - assertRaisesRegexp(TypeError, "specify period", - DatetimeIndex._cached_range, START, - offset=BDay()) - - assertRaisesRegexp(TypeError, "specify period", - DatetimeIndex._cached_range, end=END, - offset=BDay()) - - assertRaisesRegexp(TypeError, "start or end", - DatetimeIndex._cached_range, periods=20, - offset=BDay()) - - def test_cached_range_bug(self): - rng = date_range('2010-09-01 05:00:00', periods=50, - freq=DateOffset(hours=6)) - self.assertEqual(len(rng), 50) - self.assertEqual(rng[0], datetime(2010, 9, 1, 5)) - - def test_timezone_comparaison_bug(self): - start = Timestamp('20130220 10:00', tz='US/Eastern') - try: - date_range(start, periods=2, tz='US/Eastern') - except AssertionError: - self.fail() - - def test_timezone_comparaison_assert(self): - start = Timestamp('20130220 10:00', tz='US/Eastern') - self.assertRaises(AssertionError, date_range, start, periods=2, - tz='Europe/Berlin') - - def test_comparison(self): - d = self.rng[10] - - comp = self.rng > d - self.assertTrue(comp[11]) - 
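A minimal sketch (not part of the patch; names are illustrative) of the vectorized comparison that test_comparison relies on, using the same START/END business-day range as these tests:

    import pandas as pd

    rng = pd.bdate_range('2009-01-01', '2010-01-01')   # START, END above
    d = rng[10]
    comp = rng > d        # element-wise comparison yields a boolean ndarray
    assert comp[11] and not comp[9]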
self.assertFalse(comp[9]) - - def test_copy(self): - cp = self.rng.copy() - repr(cp) - self.assert_index_equal(cp, self.rng) - - def test_repr(self): - # only really care that it works - repr(self.rng) - - def test_getitem(self): - smaller = self.rng[:5] - exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) - self.assert_index_equal(smaller, exp) - - self.assertEqual(smaller.offset, self.rng.offset) - - sliced = self.rng[::5] - self.assertEqual(sliced.offset, BDay() * 5) - - fancy_indexed = self.rng[[4, 3, 2, 1, 0]] - self.assertEqual(len(fancy_indexed), 5) - tm.assertIsInstance(fancy_indexed, DatetimeIndex) - self.assertIsNone(fancy_indexed.freq) - - # 32-bit vs. 64-bit platforms - self.assertEqual(self.rng[4], self.rng[np.int_(4)]) - - def test_getitem_matplotlib_hackaround(self): - values = self.rng[:, None] - expected = self.rng.values[:, None] - self.assert_numpy_array_equal(values, expected) - - def test_shift(self): - shifted = self.rng.shift(5) - self.assertEqual(shifted[0], self.rng[5]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(-5) - self.assertEqual(shifted[5], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(0) - self.assertEqual(shifted[0], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - rng = date_range(START, END, freq=BMonthEnd()) - shifted = rng.shift(1, freq=BDay()) - self.assertEqual(shifted[0], rng[0] + BDay()) - - def test_pickle_unpickle(self): - unpickled = self.round_trip_pickle(self.rng) - self.assertIsNotNone(unpickled.offset) - - def test_union(self): - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_union = left.union(right) - tm.assertIsInstance(the_union, Index) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # order does not matter - tm.assert_index_equal(right.union(left), the_union) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_union = self.rng.union(rng) - tm.assertIsInstance(the_union, DatetimeIndex) - - def test_outer_join(self): - # should just behave as union - - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_join = self.rng.join(rng, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - def test_union_not_cacheable(self): - rng = date_range('1/1/2000', periods=50, freq=Minute()) - rng1 = rng[10:] - rng2 = rng[:25] - the_union = rng1.union(rng2) - self.assert_index_equal(the_union, rng) - - rng1 = rng[10:] - rng2 = rng[15:35] - the_union = rng1.union(rng2) - expected = rng[10:] - self.assert_index_equal(the_union, expected) - - def test_intersection(self): - rng = date_range('1/1/2000', 
periods=50, freq=Minute()) - rng1 = rng[10:] - rng2 = rng[:25] - the_int = rng1.intersection(rng2) - expected = rng[10:25] - self.assert_index_equal(the_int, expected) - tm.assertIsInstance(the_int, DatetimeIndex) - self.assertEqual(the_int.offset, rng.offset) - - the_int = rng1.intersection(rng2.view(DatetimeIndex)) - self.assert_index_equal(the_int, expected) - - # non-overlapping - the_int = rng[:10].intersection(rng[10:]) - expected = DatetimeIndex([]) - self.assert_index_equal(the_int, expected) - - def test_intersection_bug(self): - # GH #771 - a = bdate_range('11/30/2011', '12/31/2011') - b = bdate_range('12/10/2011', '12/20/2011') - result = a.intersection(b) - self.assert_index_equal(result, b) - - def test_summary(self): - self.rng.summary() - self.rng[2:2].summary() - - def test_summary_pytz(self): - tm._skip_if_no_pytz() - import pytz - bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() - - def test_summary_dateutil(self): - tm._skip_if_no_dateutil() - import dateutil - bdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() - - def test_misc(self): - end = datetime(2009, 5, 13) - dr = bdate_range(end=end, periods=20) - firstDate = end - 19 * BDay() - - assert len(dr) == 20 - assert dr[0] == firstDate - assert dr[-1] == end - - def test_date_parse_failure(self): - badly_formed_date = '2007/100/1' - - self.assertRaises(ValueError, Timestamp, badly_formed_date) - - self.assertRaises(ValueError, bdate_range, start=badly_formed_date, - periods=10) - self.assertRaises(ValueError, bdate_range, end=badly_formed_date, - periods=10) - self.assertRaises(ValueError, bdate_range, badly_formed_date, - badly_formed_date) - - def test_equals(self): - self.assertFalse(self.rng.equals(list(self.rng))) - - def test_identical(self): - t1 = self.rng.copy() - t2 = self.rng.copy() - self.assertTrue(t1.identical(t2)) - - # name - t1 = t1.rename('foo') - self.assertTrue(t1.equals(t2)) - self.assertFalse(t1.identical(t2)) - t2 = t2.rename('foo') - self.assertTrue(t1.identical(t2)) - - # freq - t2v = Index(t2.values) - self.assertTrue(t1.equals(t2v)) - self.assertFalse(t1.identical(t2v)) - - def test_daterange_bug_456(self): - # GH #456 - rng1 = bdate_range('12/5/2011', '12/5/2011') - rng2 = bdate_range('12/2/2011', '12/5/2011') - rng2.offset = BDay() - - result = rng1.union(rng2) - tm.assertIsInstance(result, DatetimeIndex) - - def test_error_with_zero_monthends(self): - self.assertRaises(ValueError, date_range, '1/1/2000', '1/1/2001', - freq=MonthEnd(0)) - - def test_range_bug(self): - # GH #770 - offset = DateOffset(months=3) - result = date_range("2011-1-1", "2012-1-31", freq=offset) - - start = datetime(2011, 1, 1) - exp_values = [start + i * offset for i in range(5)] - tm.assert_index_equal(result, DatetimeIndex(exp_values)) - - def test_range_tz_pytz(self): - # GH 2906 - tm._skip_if_no_pytz() - from pytz import timezone - - tz = timezone('US/Eastern') - start = tz.localize(datetime(2011, 1, 1)) - end = tz.localize(datetime(2011, 1, 3)) - - dr = date_range(start=start, periods=3) - self.assertEqual(dr.tz.zone, tz.zone) - self.assertEqual(dr[0], start) - self.assertEqual(dr[2], end) - - dr = date_range(end=end, periods=3) - self.assertEqual(dr.tz.zone, tz.zone) - self.assertEqual(dr[0], start) - self.assertEqual(dr[2], end) - - dr = date_range(start=start, end=end) - self.assertEqual(dr.tz.zone, tz.zone) - self.assertEqual(dr[0], start) - self.assertEqual(dr[2], end) - - def test_range_tz_dst_straddle_pytz(self): - - tm._skip_if_no_pytz() - from pytz import timezone - tz = 
timezone('US/Eastern') - dates = [(tz.localize(datetime(2014, 3, 6)), - tz.localize(datetime(2014, 3, 12))), - (tz.localize(datetime(2013, 11, 1)), - tz.localize(datetime(2013, 11, 6)))] - for (start, end) in dates: - dr = date_range(start, end, freq='D') - self.assertEqual(dr[0], start) - self.assertEqual(dr[-1], end) - self.assertEqual(np.all(dr.hour == 0), True) - - dr = date_range(start, end, freq='D', tz='US/Eastern') - self.assertEqual(dr[0], start) - self.assertEqual(dr[-1], end) - self.assertEqual(np.all(dr.hour == 0), True) - - dr = date_range(start.replace(tzinfo=None), end.replace( - tzinfo=None), freq='D', tz='US/Eastern') - self.assertEqual(dr[0], start) - self.assertEqual(dr[-1], end) - self.assertEqual(np.all(dr.hour == 0), True) - - def test_range_tz_dateutil(self): - # GH 2906 - tm._skip_if_no_dateutil() - # Use maybe_get_tz to fix filename in tz under dateutil. - from pandas.tslib import maybe_get_tz - tz = lambda x: maybe_get_tz('dateutil/' + x) - - start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) - end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern')) - - dr = date_range(start=start, periods=3) - self.assertTrue(dr.tz == tz('US/Eastern')) - self.assertTrue(dr[0] == start) - self.assertTrue(dr[2] == end) - - dr = date_range(end=end, periods=3) - self.assertTrue(dr.tz == tz('US/Eastern')) - self.assertTrue(dr[0] == start) - self.assertTrue(dr[2] == end) - - dr = date_range(start=start, end=end) - self.assertTrue(dr.tz == tz('US/Eastern')) - self.assertTrue(dr[0] == start) - self.assertTrue(dr[2] == end) - - def test_month_range_union_tz_pytz(self): - tm._skip_if_no_pytz() - from pytz import timezone - tz = timezone('US/Eastern') - - early_start = datetime(2011, 1, 1) - early_end = datetime(2011, 3, 1) - - late_start = datetime(2011, 3, 1) - late_end = datetime(2011, 5, 1) - - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) - - early_dr.union(late_dr) - - def test_month_range_union_tz_dateutil(self): - tm._skip_if_windows_python_3() - tm._skip_if_no_dateutil() - from pandas.tslib import _dateutil_gettz as timezone - tz = timezone('US/Eastern') - - early_start = datetime(2011, 1, 1) - early_end = datetime(2011, 3, 1) - - late_start = datetime(2011, 3, 1) - late_end = datetime(2011, 5, 1) - - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) - - early_dr.union(late_dr) - - def test_range_closed(self): - begin = datetime(2011, 1, 1) - end = datetime(2014, 1, 1) - - for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: - closed = date_range(begin, end, closed=None, freq=freq) - left = date_range(begin, end, closed="left", freq=freq) - right = date_range(begin, end, closed="right", freq=freq) - expected_left = left - expected_right = right - - if end == closed[-1]: - expected_left = closed[:-1] - if begin == closed[0]: - expected_right = closed[1:] - - self.assert_index_equal(expected_left, left) - self.assert_index_equal(expected_right, right) - - def test_range_closed_with_tz_aware_start_end(self): - # GH12409, GH12684 - begin = Timestamp('2011/1/1', tz='US/Eastern') - end = Timestamp('2014/1/1', tz='US/Eastern') - - for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: - closed = date_range(begin, end, closed=None, freq=freq) - left = date_range(begin, end, closed="left", freq=freq) - right = date_range(begin, end, closed="right", freq=freq) - expected_left 
= left - expected_right = right - - if end == closed[-1]: - expected_left = closed[:-1] - if begin == closed[0]: - expected_right = closed[1:] - - self.assert_index_equal(expected_left, left) - self.assert_index_equal(expected_right, right) - - begin = Timestamp('2011/1/1') - end = Timestamp('2014/1/1') - begintz = Timestamp('2011/1/1', tz='US/Eastern') - endtz = Timestamp('2014/1/1', tz='US/Eastern') - - for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: - closed = date_range(begin, end, closed=None, freq=freq, - tz='US/Eastern') - left = date_range(begin, end, closed="left", freq=freq, - tz='US/Eastern') - right = date_range(begin, end, closed="right", freq=freq, - tz='US/Eastern') - expected_left = left - expected_right = right - - if endtz == closed[-1]: - expected_left = closed[:-1] - if begintz == closed[0]: - expected_right = closed[1:] - - self.assert_index_equal(expected_left, left) - self.assert_index_equal(expected_right, right) - - def test_range_closed_boundary(self): - # GH 11804 - for closed in ['right', 'left', None]: - right_boundary = date_range('2015-09-12', '2015-12-01', - freq='QS-MAR', closed=closed) - left_boundary = date_range('2015-09-01', '2015-09-12', - freq='QS-MAR', closed=closed) - both_boundary = date_range('2015-09-01', '2015-12-01', - freq='QS-MAR', closed=closed) - expected_right = expected_left = expected_both = both_boundary - - if closed == 'right': - expected_left = both_boundary[1:] - if closed == 'left': - expected_right = both_boundary[:-1] - if closed is None: - expected_right = both_boundary[1:] - expected_left = both_boundary[:-1] - - self.assert_index_equal(right_boundary, expected_right) - self.assert_index_equal(left_boundary, expected_left) - self.assert_index_equal(both_boundary, expected_both) - - def test_years_only(self): - # GH 6961 - dr = date_range('2014', '2015', freq='M') - self.assertEqual(dr[0], datetime(2014, 1, 31)) - self.assertEqual(dr[-1], datetime(2014, 12, 31)) - - def test_freq_divides_end_in_nanos(self): - # GH 10885 - result_1 = date_range('2005-01-12 10:00', '2005-01-12 16:00', - freq='345min') - result_2 = date_range('2005-01-13 10:00', '2005-01-13 16:00', - freq='345min') - expected_1 = DatetimeIndex(['2005-01-12 10:00:00', - '2005-01-12 15:45:00'], - dtype='datetime64[ns]', freq='345T', - tz=None) - expected_2 = DatetimeIndex(['2005-01-13 10:00:00', - '2005-01-13 15:45:00'], - dtype='datetime64[ns]', freq='345T', - tz=None) - self.assert_index_equal(result_1, expected_1) - self.assert_index_equal(result_2, expected_2) - - -class TestCustomDateRange(tm.TestCase): - - def setUp(self): - self.rng = cdate_range(START, END) - - def test_constructor(self): - cdate_range(START, END, freq=CDay()) - cdate_range(START, periods=20, freq=CDay()) - cdate_range(end=START, periods=20, freq=CDay()) - self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'C') - self.assertRaises(ValueError, cdate_range, '2011-1-1', '2012-1-1', 'C') - - def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=CDay()) - DatetimeIndex._cached_range(START, periods=20, - offset=CDay()) - DatetimeIndex._cached_range(end=START, periods=20, - offset=CDay()) - - self.assertRaises(Exception, DatetimeIndex._cached_range, START, END) - - self.assertRaises(Exception, DatetimeIndex._cached_range, START, - freq=CDay()) - - self.assertRaises(Exception, DatetimeIndex._cached_range, end=END, - freq=CDay()) - - self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, - freq=CDay()) - - def test_comparison(self): - d = 
self.rng[10] - - comp = self.rng > d - self.assertTrue(comp[11]) - self.assertFalse(comp[9]) - - def test_copy(self): - cp = self.rng.copy() - repr(cp) - self.assert_index_equal(cp, self.rng) - - def test_repr(self): - # only really care that it works - repr(self.rng) - - def test_getitem(self): - smaller = self.rng[:5] - exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) - self.assert_index_equal(smaller, exp) - self.assertEqual(smaller.offset, self.rng.offset) - - sliced = self.rng[::5] - self.assertEqual(sliced.offset, CDay() * 5) - - fancy_indexed = self.rng[[4, 3, 2, 1, 0]] - self.assertEqual(len(fancy_indexed), 5) - tm.assertIsInstance(fancy_indexed, DatetimeIndex) - self.assertIsNone(fancy_indexed.freq) - - # 32-bit vs. 64-bit platforms - self.assertEqual(self.rng[4], self.rng[np.int_(4)]) - - def test_getitem_matplotlib_hackaround(self): - values = self.rng[:, None] - expected = self.rng.values[:, None] - self.assert_numpy_array_equal(values, expected) - - def test_shift(self): - - shifted = self.rng.shift(5) - self.assertEqual(shifted[0], self.rng[5]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(-5) - self.assertEqual(shifted[5], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(0) - self.assertEqual(shifted[0], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - with tm.assert_produces_warning(com.PerformanceWarning): - rng = date_range(START, END, freq=BMonthEnd()) - shifted = rng.shift(1, freq=CDay()) - self.assertEqual(shifted[0], rng[0] + CDay()) - - def test_pickle_unpickle(self): - unpickled = self.round_trip_pickle(self.rng) - self.assertIsNotNone(unpickled.offset) - - def test_union(self): - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_union = left.union(right) - tm.assertIsInstance(the_union, Index) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # order does not matter - self.assert_index_equal(right.union(left), the_union) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_union = self.rng.union(rng) - tm.assertIsInstance(the_union, DatetimeIndex) - - def test_outer_join(self): - # should just behave as union - - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_join = self.rng.join(rng, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - def test_intersection_bug(self): - # GH #771 - a = cdate_range('11/30/2011', '12/31/2011') - b = cdate_range('12/10/2011', '12/20/2011') - result = a.intersection(b) - self.assert_index_equal(result, b) - - def test_summary(self): - self.rng.summary() - self.rng[2:2].summary() - - 
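A minimal sketch (not part of the patch) of the custom business-day calendar that TestCustomDateRange exercises; it assumes the CDay offset accepts the same weekmask/holidays arguments that cdate_range forwards to it:

    import pandas as pd
    from pandas.tseries.offsets import CDay

    # Sun-Thu work week with 2013-05-01 observed as a holiday, mirroring
    # test_cdaterange_weekmask_and_holidays below.
    rng = pd.date_range('2013-05-01', periods=3,
                        freq=CDay(weekmask='Sun Mon Tue Wed Thu',
                                  holidays=['2013-05-01']))
    # DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06'])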
def test_summary_pytz(self): - tm._skip_if_no_pytz() - import pytz - cdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() - - def test_summary_dateutil(self): - tm._skip_if_no_dateutil() - import dateutil - cdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() - - def test_misc(self): - end = datetime(2009, 5, 13) - dr = cdate_range(end=end, periods=20) - firstDate = end - 19 * CDay() - - assert len(dr) == 20 - assert dr[0] == firstDate - assert dr[-1] == end - - def test_date_parse_failure(self): - badly_formed_date = '2007/100/1' - - self.assertRaises(ValueError, Timestamp, badly_formed_date) - - self.assertRaises(ValueError, cdate_range, start=badly_formed_date, - periods=10) - self.assertRaises(ValueError, cdate_range, end=badly_formed_date, - periods=10) - self.assertRaises(ValueError, cdate_range, badly_formed_date, - badly_formed_date) - - def test_equals(self): - self.assertFalse(self.rng.equals(list(self.rng))) - - def test_daterange_bug_456(self): - # GH #456 - rng1 = cdate_range('12/5/2011', '12/5/2011') - rng2 = cdate_range('12/2/2011', '12/5/2011') - rng2.offset = CDay() - - result = rng1.union(rng2) - tm.assertIsInstance(result, DatetimeIndex) - - def test_cdaterange(self): - rng = cdate_range('2013-05-01', periods=3) - xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) - self.assert_index_equal(xp, rng) - - def test_cdaterange_weekmask(self): - rng = cdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu') - xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) - self.assert_index_equal(xp, rng) - - def test_cdaterange_holidays(self): - rng = cdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) - xp = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) - self.assert_index_equal(xp, rng) - - def test_cdaterange_weekmask_and_holidays(self): - rng = cdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu', - holidays=['2013-05-01']) - xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) - self.assert_index_equal(xp, rng) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index a39830b6aede6..3459da9d2b5c5 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -6,27 +6,25 @@ """ +import numpy as np +from numpy.random import randn from datetime import datetime, date, timedelta -from pandas import Timestamp, _period -from pandas.tseries.frequencies import MONTHS, DAYS, _period_code_map -from pandas.tseries.period import Period, PeriodIndex, period_range -from pandas.tseries.index import DatetimeIndex, date_range, Index -from pandas.tseries.tools import to_datetime +import pandas as pd +import pandas.util.testing as tm import pandas.tseries.period as period import pandas.tseries.offsets as offsets - -import pandas as pd -import numpy as np -from numpy.random import randn +from pandas.tseries.tools import to_datetime +from pandas.tseries.period import Period, PeriodIndex, period_range +from pandas.tseries.index import DatetimeIndex, date_range, Index +from pandas._period import period_ordinal, period_asfreq from pandas.compat import range, lrange, lmap, zip, text_type, PY3, iteritems from pandas.compat.numpy import np_datetime64_compat - -from pandas import (Series, DataFrame, +from pandas.tseries.frequencies import (MONTHS, DAYS, _period_code_map, + get_freq) +from pandas import (Series, DataFrame, Timestamp, _period, tslib, _np_version_under1p9, _np_version_under1p10, _np_version_under1p12) -from pandas import 
tslib -import pandas.util.testing as tm class TestPeriodProperties(tm.TestCase): @@ -4970,3 +4968,98 @@ def test_get_period_field_raises_on_out_of_range(self): def test_get_period_field_array_raises_on_out_of_range(self): self.assertRaises(ValueError, _period.get_period_field_arr, -1, np.empty(1), 0) + + +class TestTslib(tm.TestCase): + def test_intraday_conversion_factors(self): + self.assertEqual(period_asfreq( + 1, get_freq('D'), get_freq('H'), False), 24) + self.assertEqual(period_asfreq( + 1, get_freq('D'), get_freq('T'), False), 1440) + self.assertEqual(period_asfreq( + 1, get_freq('D'), get_freq('S'), False), 86400) + self.assertEqual(period_asfreq(1, get_freq( + 'D'), get_freq('L'), False), 86400000) + self.assertEqual(period_asfreq(1, get_freq( + 'D'), get_freq('U'), False), 86400000000) + self.assertEqual(period_asfreq(1, get_freq( + 'D'), get_freq('N'), False), 86400000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('H'), get_freq('T'), False), 60) + self.assertEqual(period_asfreq( + 1, get_freq('H'), get_freq('S'), False), 3600) + self.assertEqual(period_asfreq(1, get_freq('H'), + get_freq('L'), False), 3600000) + self.assertEqual(period_asfreq(1, get_freq( + 'H'), get_freq('U'), False), 3600000000) + self.assertEqual(period_asfreq(1, get_freq( + 'H'), get_freq('N'), False), 3600000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('T'), get_freq('S'), False), 60) + self.assertEqual(period_asfreq( + 1, get_freq('T'), get_freq('L'), False), 60000) + self.assertEqual(period_asfreq(1, get_freq( + 'T'), get_freq('U'), False), 60000000) + self.assertEqual(period_asfreq(1, get_freq( + 'T'), get_freq('N'), False), 60000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('S'), get_freq('L'), False), 1000) + self.assertEqual(period_asfreq(1, get_freq('S'), + get_freq('U'), False), 1000000) + self.assertEqual(period_asfreq(1, get_freq( + 'S'), get_freq('N'), False), 1000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('L'), get_freq('U'), False), 1000) + self.assertEqual(period_asfreq(1, get_freq('L'), + get_freq('N'), False), 1000000) + + self.assertEqual(period_asfreq( + 1, get_freq('U'), get_freq('N'), False), 1000) + + def test_period_ordinal_start_values(self): + # information for 1.1.1970 + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('A'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('M'))) + self.assertEqual(1, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('W'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('D'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('B'))) + + def test_period_ordinal_week(self): + self.assertEqual(1, period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, + get_freq('W'))) + self.assertEqual(2, period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, + get_freq('W'))) + + self.assertEqual(2284, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, + get_freq('W'))) + self.assertEqual(2285, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, + get_freq('W'))) + + def test_period_ordinal_business_day(self): + # Thursday + self.assertEqual(11415, period_ordinal(2013, 10, 3, 0, 0, 0, 0, 0, + get_freq('B'))) + # Friday + self.assertEqual(11416, period_ordinal(2013, 10, 4, 0, 0, 0, 0, 0, + get_freq('B'))) + # Saturday + self.assertEqual(11417, period_ordinal(2013, 10, 5, 0, 0, 0, 0, 0, + get_freq('B'))) + # Sunday + self.assertEqual(11417, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, + get_freq('B'))) + # Monday + self.assertEqual(11417, 
period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, + get_freq('B'))) + # Tuesday + self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, + get_freq('B'))) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py deleted file mode 100644 index 170d5cdafa60b..0000000000000 --- a/pandas/tseries/tests/test_timedeltas.py +++ /dev/null @@ -1,2051 +0,0 @@ -# pylint: disable-msg=E1101,W0612 - -from __future__ import division -from datetime import timedelta, time - -from distutils.version import LooseVersion -import numpy as np -import pandas as pd - -from pandas import (Index, Series, DataFrame, Timestamp, Timedelta, - TimedeltaIndex, isnull, date_range, - timedelta_range, Int64Index) -from pandas.compat import range -from pandas import compat, to_timedelta, tslib -from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct -from pandas.util.testing import (assert_series_equal, assert_frame_equal, - assert_almost_equal, assert_index_equal) -from pandas.tseries.offsets import Day, Second -import pandas.util.testing as tm -from numpy.random import randn -from pandas import _np_version_under1p8 - -iNaT = tslib.iNaT - - -class TestTimedeltas(tm.TestCase): - - def setUp(self): - pass - - def test_get_loc_nat(self): - tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) - - self.assertEqual(tidx.get_loc(pd.NaT), 1) - self.assertEqual(tidx.get_loc(None), 1) - self.assertEqual(tidx.get_loc(float('nan')), 1) - self.assertEqual(tidx.get_loc(np.nan), 1) - - def test_contains(self): - # Checking for any NaT-like objects - # GH 13603 - td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - for v in [pd.NaT, None, float('nan'), np.nan]: - self.assertFalse((v in td)) - - td = to_timedelta([pd.NaT]) - for v in [pd.NaT, None, float('nan'), np.nan]: - self.assertTrue((v in td)) - - def test_construction(self): - - expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') - self.assertEqual(Timedelta(10, unit='d').value, expected) - self.assertEqual(Timedelta(10.0, unit='d').value, expected) - self.assertEqual(Timedelta('10 days').value, expected) - self.assertEqual(Timedelta(days=10).value, expected) - self.assertEqual(Timedelta(days=10.0).value, expected) - - expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8') - self.assertEqual(Timedelta('10 days 00:00:10').value, expected) - self.assertEqual(Timedelta(days=10, seconds=10).value, expected) - self.assertEqual( - Timedelta(days=10, milliseconds=10 * 1000).value, expected) - self.assertEqual( - Timedelta(days=10, microseconds=10 * 1000 * 1000).value, expected) - - # test construction with np dtypes - # GH 8757 - timedelta_kwargs = {'days': 'D', - 'seconds': 's', - 'microseconds': 'us', - 'milliseconds': 'ms', - 'minutes': 'm', - 'hours': 'h', - 'weeks': 'W'} - npdtypes = [np.int64, np.int32, np.int16, np.float64, np.float32, - np.float16] - for npdtype in npdtypes: - for pykwarg, npkwarg in timedelta_kwargs.items(): - expected = np.timedelta64(1, - npkwarg).astype('m8[ns]').view('i8') - self.assertEqual( - Timedelta(**{pykwarg: npdtype(1)}).value, expected) - - # rounding cases - self.assertEqual(Timedelta(82739999850000).value, 82739999850000) - self.assertTrue('0 days 22:58:59.999850' in str(Timedelta( - 82739999850000))) - self.assertEqual(Timedelta(123072001000000).value, 123072001000000) - self.assertTrue('1 days 10:11:12.001' in str(Timedelta( - 123072001000000))) - - # string conversion with/without leading zero - # GH 9570 - 
self.assertEqual(Timedelta('0:00:00'), timedelta(hours=0)) - self.assertEqual(Timedelta('00:00:00'), timedelta(hours=0)) - self.assertEqual(Timedelta('-1:00:00'), -timedelta(hours=1)) - self.assertEqual(Timedelta('-01:00:00'), -timedelta(hours=1)) - - # more strings & abbrevs - # GH 8190 - self.assertEqual(Timedelta('1 h'), timedelta(hours=1)) - self.assertEqual(Timedelta('1 hour'), timedelta(hours=1)) - self.assertEqual(Timedelta('1 hr'), timedelta(hours=1)) - self.assertEqual(Timedelta('1 hours'), timedelta(hours=1)) - self.assertEqual(Timedelta('-1 hours'), -timedelta(hours=1)) - self.assertEqual(Timedelta('1 m'), timedelta(minutes=1)) - self.assertEqual(Timedelta('1.5 m'), timedelta(seconds=90)) - self.assertEqual(Timedelta('1 minute'), timedelta(minutes=1)) - self.assertEqual(Timedelta('1 minutes'), timedelta(minutes=1)) - self.assertEqual(Timedelta('1 s'), timedelta(seconds=1)) - self.assertEqual(Timedelta('1 second'), timedelta(seconds=1)) - self.assertEqual(Timedelta('1 seconds'), timedelta(seconds=1)) - self.assertEqual(Timedelta('1 ms'), timedelta(milliseconds=1)) - self.assertEqual(Timedelta('1 milli'), timedelta(milliseconds=1)) - self.assertEqual(Timedelta('1 millisecond'), timedelta(milliseconds=1)) - self.assertEqual(Timedelta('1 us'), timedelta(microseconds=1)) - self.assertEqual(Timedelta('1 micros'), timedelta(microseconds=1)) - self.assertEqual(Timedelta('1 microsecond'), timedelta(microseconds=1)) - self.assertEqual(Timedelta('1.5 microsecond'), - Timedelta('00:00:00.000001500')) - self.assertEqual(Timedelta('1 ns'), Timedelta('00:00:00.000000001')) - self.assertEqual(Timedelta('1 nano'), Timedelta('00:00:00.000000001')) - self.assertEqual(Timedelta('1 nanosecond'), - Timedelta('00:00:00.000000001')) - - # combos - self.assertEqual(Timedelta('10 days 1 hour'), - timedelta(days=10, hours=1)) - self.assertEqual(Timedelta('10 days 1 h'), timedelta(days=10, hours=1)) - self.assertEqual(Timedelta('10 days 1 h 1m 1s'), timedelta( - days=10, hours=1, minutes=1, seconds=1)) - self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), - - timedelta(days=10, hours=1, minutes=1, seconds=1)) - self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), - - timedelta(days=10, hours=1, minutes=1, seconds=1)) - self.assertEqual(Timedelta('-10 days 1 h 1m 1s 3us'), - - timedelta(days=10, hours=1, minutes=1, - seconds=1, microseconds=3)) - self.assertEqual(Timedelta('-10 days 1 h 1.5m 1s 3us'), - - timedelta(days=10, hours=1, minutes=1, - seconds=31, microseconds=3)) - - # currently invalid as it has a - on the hhmmdd part (only allowed on - # the days) - self.assertRaises(ValueError, - lambda: Timedelta('-10 days -1 h 1.5m 1s 3us')) - - # only leading neg signs are allowed - self.assertRaises(ValueError, - lambda: Timedelta('10 days -1 h 1.5m 1s 3us')) - - # no units specified - self.assertRaises(ValueError, lambda: Timedelta('3.1415')) - - # invalid construction - tm.assertRaisesRegexp(ValueError, "cannot construct a Timedelta", - lambda: Timedelta()) - tm.assertRaisesRegexp(ValueError, "unit abbreviation w/o a number", - lambda: Timedelta('foo')) - tm.assertRaisesRegexp(ValueError, - "cannot construct a Timedelta from the passed " - "arguments, allowed keywords are ", - lambda: Timedelta(day=10)) - - # roundtripping both for string and value - for v in ['1s', '-1s', '1us', '-1us', '1 day', '-1 day', - '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', - '1ns', '-23:59:59.999999999']: - - td = Timedelta(v) - self.assertEqual(Timedelta(td.value), td) - - # str does not normally display nanos - if 
not td.nanoseconds: - self.assertEqual(Timedelta(str(td)), td) - self.assertEqual(Timedelta(td._repr_base(format='all')), td) - - # floats - expected = np.timedelta64( - 10, 's').astype('m8[ns]').view('i8') + np.timedelta64( - 500, 'ms').astype('m8[ns]').view('i8') - self.assertEqual(Timedelta(10.5, unit='s').value, expected) - - # nat - self.assertEqual(Timedelta('').value, iNaT) - self.assertEqual(Timedelta('nat').value, iNaT) - self.assertEqual(Timedelta('NAT').value, iNaT) - self.assertEqual(Timedelta(None).value, iNaT) - self.assertEqual(Timedelta(np.nan).value, iNaT) - self.assertTrue(isnull(Timedelta('nat'))) - - # offset - self.assertEqual(to_timedelta(pd.offsets.Hour(2)), - Timedelta('0 days, 02:00:00')) - self.assertEqual(Timedelta(pd.offsets.Hour(2)), - Timedelta('0 days, 02:00:00')) - self.assertEqual(Timedelta(pd.offsets.Second(2)), - Timedelta('0 days, 00:00:02')) - - # unicode - # GH 11995 - expected = Timedelta('1H') - result = pd.Timedelta(u'1H') - self.assertEqual(result, expected) - self.assertEqual(to_timedelta(pd.offsets.Hour(2)), - Timedelta(u'0 days, 02:00:00')) - - self.assertRaises(ValueError, lambda: Timedelta(u'foo bar')) - - def test_round(self): - - t1 = Timedelta('1 days 02:34:56.789123456') - t2 = Timedelta('-1 days 02:34:56.789123456') - - for (freq, s1, s2) in [('N', t1, t2), - ('U', Timedelta('1 days 02:34:56.789123000'), - Timedelta('-1 days 02:34:56.789123000')), - ('L', Timedelta('1 days 02:34:56.789000000'), - Timedelta('-1 days 02:34:56.789000000')), - ('S', Timedelta('1 days 02:34:57'), - Timedelta('-1 days 02:34:57')), - ('2S', Timedelta('1 days 02:34:56'), - Timedelta('-1 days 02:34:56')), - ('5S', Timedelta('1 days 02:34:55'), - Timedelta('-1 days 02:34:55')), - ('T', Timedelta('1 days 02:35:00'), - Timedelta('-1 days 02:35:00')), - ('12T', Timedelta('1 days 02:36:00'), - Timedelta('-1 days 02:36:00')), - ('H', Timedelta('1 days 03:00:00'), - Timedelta('-1 days 03:00:00')), - ('d', Timedelta('1 days'), - Timedelta('-1 days'))]: - r1 = t1.round(freq) - self.assertEqual(r1, s1) - r2 = t2.round(freq) - self.assertEqual(r2, s2) - - # invalid - for freq in ['Y', 'M', 'foobar']: - self.assertRaises(ValueError, lambda: t1.round(freq)) - - t1 = timedelta_range('1 days', periods=3, freq='1 min 2 s 3 us') - t2 = -1 * t1 - t1a = timedelta_range('1 days', periods=3, freq='1 min 2 s') - t1c = pd.TimedeltaIndex([1, 1, 1], unit='D') - - # note that negative times round DOWN! 
so don't give whole numbers
-        for (freq, s1, s2) in [('N', t1, t2),
-                               ('U', t1, t2),
-                               ('L', t1a,
-                                TimedeltaIndex(['-1 days +00:00:00',
-                                                '-2 days +23:58:58',
-                                                '-2 days +23:57:56'],
-                                               dtype='timedelta64[ns]',
-                                               freq=None)
-                                ),
-                               ('S', t1a,
-                                TimedeltaIndex(['-1 days +00:00:00',
-                                                '-2 days +23:58:58',
-                                                '-2 days +23:57:56'],
-                                               dtype='timedelta64[ns]',
-                                               freq=None)
-                                ),
-                               ('12T', t1c,
-                                TimedeltaIndex(['-1 days',
-                                                '-1 days',
-                                                '-1 days'],
-                                               dtype='timedelta64[ns]',
-                                               freq=None)
-                                ),
-                               ('H', t1c,
-                                TimedeltaIndex(['-1 days',
-                                                '-1 days',
-                                                '-1 days'],
-                                               dtype='timedelta64[ns]',
-                                               freq=None)
-                                ),
-                               ('d', t1c,
-                                pd.TimedeltaIndex([-1, -1, -1], unit='D')
-                                )]:
-
-            r1 = t1.round(freq)
-            tm.assert_index_equal(r1, s1)
-            r2 = t2.round(freq)
-            tm.assert_index_equal(r2, s2)
-
-        # invalid
-        for freq in ['Y', 'M', 'foobar']:
-            self.assertRaises(ValueError, lambda: t1.round(freq))
-
-    def test_repr(self):
-
-        self.assertEqual(repr(Timedelta(10, unit='d')),
-                         "Timedelta('10 days 00:00:00')")
-        self.assertEqual(repr(Timedelta(10, unit='s')),
-                         "Timedelta('0 days 00:00:10')")
-        self.assertEqual(repr(Timedelta(10, unit='ms')),
-                         "Timedelta('0 days 00:00:00.010000')")
-        self.assertEqual(repr(Timedelta(-10, unit='ms')),
-                         "Timedelta('-1 days +23:59:59.990000')")
-
-    def test_identity(self):
-
-        td = Timedelta(10, unit='d')
-        self.assertTrue(isinstance(td, Timedelta))
-        self.assertTrue(isinstance(td, timedelta))
-
-    def test_conversion(self):
-
-        for td in [Timedelta(10, unit='d'),
-                   Timedelta('1 days, 10:11:12.012345')]:
-            pydt = td.to_pytimedelta()
-            self.assertTrue(td == Timedelta(pydt))
-            self.assertEqual(td, pydt)
-            self.assertTrue(isinstance(pydt, timedelta) and not isinstance(
-                pydt, Timedelta))
-
-            self.assertEqual(td, np.timedelta64(td.value, 'ns'))
-            td64 = td.to_timedelta64()
-            self.assertEqual(td64, np.timedelta64(td.value, 'ns'))
-            self.assertEqual(td, td64)
-            self.assertTrue(isinstance(td64, np.timedelta64))
-
-        # this is NOT equal and cannot be round-tripped (because of the nanos)
-        td = Timedelta('1 days, 10:11:12.012345678')
-        self.assertTrue(td != td.to_pytimedelta())
-
-    def test_ops(self):
-
-        td = Timedelta(10, unit='d')
-        self.assertEqual(-td, Timedelta(-10, unit='d'))
-        self.assertEqual(+td, Timedelta(10, unit='d'))
-        self.assertEqual(td - td, Timedelta(0, unit='ns'))
-        self.assertTrue((td - pd.NaT) is pd.NaT)
-        self.assertEqual(td + td, Timedelta(20, unit='d'))
-        self.assertTrue((td + pd.NaT) is pd.NaT)
-        self.assertEqual(td * 2, Timedelta(20, unit='d'))
-        self.assertTrue((td * pd.NaT) is pd.NaT)
-        self.assertEqual(td / 2, Timedelta(5, unit='d'))
-        self.assertEqual(abs(td), td)
-        self.assertEqual(abs(-td), td)
-        self.assertEqual(td / td, 1)
-        self.assertTrue((td / pd.NaT) is np.nan)
-
-        # invert
-        self.assertEqual(-td, Timedelta('-10d'))
-        self.assertEqual(td * -1, Timedelta('-10d'))
-        self.assertEqual(-1 * td, Timedelta('-10d'))
-        self.assertEqual(abs(-td), Timedelta('10d'))
-
-        # invalid
-        self.assertRaises(TypeError, lambda: Timedelta(11, unit='d') // 2)
-
-        # invalid multiply with another timedelta
-        self.assertRaises(TypeError, lambda: td * td)
-
-        # can't operate with integers
-        self.assertRaises(TypeError, lambda: td + 2)
-        self.assertRaises(TypeError, lambda: td - 2)
-
-    def test_ops_offsets(self):
-        td = Timedelta(10, unit='d')
-        self.assertEqual(Timedelta(241, unit='h'), td + pd.offsets.Hour(1))
-        self.assertEqual(Timedelta(241, unit='h'), pd.offsets.Hour(1) + td)
-        self.assertEqual(240, td / pd.offsets.Hour(1))
-        self.assertEqual(1 / 240.0, pd.offsets.Hour(1) / td)
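A minimal sketch (not part of the patch; values are illustrative) of the scalar Timedelta arithmetic that test_ops and test_ops_offsets assert:

    import numpy as np
    import pandas as pd

    td = pd.Timedelta(10, unit='d')
    assert td / pd.Timedelta(5, unit='d') == 2.0   # timedelta / timedelta is a float ratio
    assert td / np.timedelta64(1, 'D') == 10.0     # likewise for numpy timedelta64
    assert (td + pd.NaT) is pd.NaT                 # NaT propagates through arithmetic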
self.assertEqual(Timedelta(239, unit='h'), td - pd.offsets.Hour(1)) - self.assertEqual(Timedelta(-239, unit='h'), pd.offsets.Hour(1) - td) - - def test_freq_conversion(self): - - td = Timedelta('1 days 2 hours 3 ns') - result = td / np.timedelta64(1, 'D') - self.assertEqual(result, td.value / float(86400 * 1e9)) - result = td / np.timedelta64(1, 's') - self.assertEqual(result, td.value / float(1e9)) - result = td / np.timedelta64(1, 'ns') - self.assertEqual(result, td.value) - - def test_ops_ndarray(self): - td = Timedelta('1 day') - - # timedelta, timedelta - other = pd.to_timedelta(['1 day']).values - expected = pd.to_timedelta(['2 days']).values - self.assert_numpy_array_equal(td + other, expected) - if LooseVersion(np.__version__) >= '1.8': - self.assert_numpy_array_equal(other + td, expected) - self.assertRaises(TypeError, lambda: td + np.array([1])) - self.assertRaises(TypeError, lambda: np.array([1]) + td) - - expected = pd.to_timedelta(['0 days']).values - self.assert_numpy_array_equal(td - other, expected) - if LooseVersion(np.__version__) >= '1.8': - self.assert_numpy_array_equal(-other + td, expected) - self.assertRaises(TypeError, lambda: td - np.array([1])) - self.assertRaises(TypeError, lambda: np.array([1]) - td) - - expected = pd.to_timedelta(['2 days']).values - self.assert_numpy_array_equal(td * np.array([2]), expected) - self.assert_numpy_array_equal(np.array([2]) * td, expected) - self.assertRaises(TypeError, lambda: td * other) - self.assertRaises(TypeError, lambda: other * td) - - self.assert_numpy_array_equal(td / other, - np.array([1], dtype=np.float64)) - if LooseVersion(np.__version__) >= '1.8': - self.assert_numpy_array_equal(other / td, - np.array([1], dtype=np.float64)) - - # timedelta, datetime - other = pd.to_datetime(['2000-01-01']).values - expected = pd.to_datetime(['2000-01-02']).values - self.assert_numpy_array_equal(td + other, expected) - if LooseVersion(np.__version__) >= '1.8': - self.assert_numpy_array_equal(other + td, expected) - - expected = pd.to_datetime(['1999-12-31']).values - self.assert_numpy_array_equal(-td + other, expected) - if LooseVersion(np.__version__) >= '1.8': - self.assert_numpy_array_equal(other - td, expected) - - def test_ops_series(self): - # regression test for GH8813 - td = Timedelta('1 day') - other = pd.Series([1, 2]) - expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) - tm.assert_series_equal(expected, td * other) - tm.assert_series_equal(expected, other * td) - - def test_ops_series_object(self): - # GH 13043 - s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), - pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], - name='xxx') - self.assertEqual(s.dtype, object) - - exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), - pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], - name='xxx') - tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) - tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) - - # object series & object series - s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), - pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], - name='xxx') - self.assertEqual(s2.dtype, object) - exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')], - name='xxx') - tm.assert_series_equal(s2 - s, exp) - tm.assert_series_equal(s - s2, -exp) - - s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], - name='xxx', dtype=object) - self.assertEqual(s.dtype, object) - - exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], - name='xxx') - tm.assert_series_equal(s + 
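# A minimal sketch of the unit-division semantics checked in
# test_freq_conversion above: dividing a Timedelta by a np.timedelta64
# yields a plain float expressed in the divisor's unit (values mirror
# the test).
import numpy as np
import pandas as pd

td = pd.Timedelta('1 days 2 hours 3 ns')
assert td / np.timedelta64(1, 'D') == td.value / float(86400 * 1e9)
assert td / np.timedelta64(1, 's') == td.value / float(1e9)
assert td / np.timedelta64(1, 'ns') == td.value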
pd.Timedelta('00:30:00'), exp) - tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp) - - def test_compare_timedelta_series(self): - # regression test for GH5963 - s = pd.Series([timedelta(days=1), timedelta(days=2)]) - actual = s > timedelta(days=1) - expected = pd.Series([False, True]) - tm.assert_series_equal(actual, expected) - - def test_compare_timedelta_ndarray(self): - # GH11835 - periods = [Timedelta('0 days 01:00:00'), Timedelta('0 days 01:00:00')] - arr = np.array(periods) - result = arr[0] > arr - expected = np.array([False, False]) - self.assert_numpy_array_equal(result, expected) - - def test_ops_notimplemented(self): - class Other: - pass - - other = Other() - - td = Timedelta('1 day') - self.assertTrue(td.__add__(other) is NotImplemented) - self.assertTrue(td.__sub__(other) is NotImplemented) - self.assertTrue(td.__truediv__(other) is NotImplemented) - self.assertTrue(td.__mul__(other) is NotImplemented) - self.assertTrue(td.__floordiv__(td) is NotImplemented) - - def test_ops_error_str(self): - # GH 13624 - td = Timedelta('1 day') - - for l, r in [(td, 'a'), ('a', td)]: - - with tm.assertRaises(TypeError): - l + r - - with tm.assertRaises(TypeError): - l > r - - self.assertFalse(l == r) - self.assertTrue(l != r) - - def test_fields(self): - def check(value): - # that we are int/long like - self.assertTrue(isinstance(value, (int, compat.long))) - - # compat to datetime.timedelta - rng = to_timedelta('1 days, 10:11:12') - self.assertEqual(rng.days, 1) - self.assertEqual(rng.seconds, 10 * 3600 + 11 * 60 + 12) - self.assertEqual(rng.microseconds, 0) - self.assertEqual(rng.nanoseconds, 0) - - self.assertRaises(AttributeError, lambda: rng.hours) - self.assertRaises(AttributeError, lambda: rng.minutes) - self.assertRaises(AttributeError, lambda: rng.milliseconds) - - # GH 10050 - check(rng.days) - check(rng.seconds) - check(rng.microseconds) - check(rng.nanoseconds) - - td = Timedelta('-1 days, 10:11:12') - self.assertEqual(abs(td), Timedelta('13:48:48')) - self.assertTrue(str(td) == "-1 days +10:11:12") - self.assertEqual(-td, Timedelta('0 days 13:48:48')) - self.assertEqual(-Timedelta('-1 days, 10:11:12').value, 49728000000000) - self.assertEqual(Timedelta('-1 days, 10:11:12').value, -49728000000000) - - rng = to_timedelta('-1 days, 10:11:12.100123456') - self.assertEqual(rng.days, -1) - self.assertEqual(rng.seconds, 10 * 3600 + 11 * 60 + 12) - self.assertEqual(rng.microseconds, 100 * 1000 + 123) - self.assertEqual(rng.nanoseconds, 456) - self.assertRaises(AttributeError, lambda: rng.hours) - self.assertRaises(AttributeError, lambda: rng.minutes) - self.assertRaises(AttributeError, lambda: rng.milliseconds) - - # components - tup = pd.to_timedelta(-1, 'us').components - self.assertEqual(tup.days, -1) - self.assertEqual(tup.hours, 23) - self.assertEqual(tup.minutes, 59) - self.assertEqual(tup.seconds, 59) - self.assertEqual(tup.milliseconds, 999) - self.assertEqual(tup.microseconds, 999) - self.assertEqual(tup.nanoseconds, 0) - - # GH 10050 - check(tup.days) - check(tup.hours) - check(tup.minutes) - check(tup.seconds) - check(tup.milliseconds) - check(tup.microseconds) - check(tup.nanoseconds) - - tup = Timedelta('-1 days 1 us').components - self.assertEqual(tup.days, -2) - self.assertEqual(tup.hours, 23) - self.assertEqual(tup.minutes, 59) - self.assertEqual(tup.seconds, 59) - self.assertEqual(tup.milliseconds, 999) - self.assertEqual(tup.microseconds, 999) - self.assertEqual(tup.nanoseconds, 0) - - def test_timedelta_range(self): - - expected = to_timedelta(np.arange(5),
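# A standalone restatement of the component normalisation covered by
# test_fields above: only .days may be negative; the sub-day components of
# a negative Timedelta are normalised into their usual positive ranges,
# mirroring datetime.timedelta (values taken from the test itself).
import pandas as pd

tup = pd.to_timedelta(-1, 'us').components
assert (tup.days, tup.hours, tup.minutes, tup.seconds) == (-1, 23, 59, 59)
assert (tup.milliseconds, tup.microseconds, tup.nanoseconds) == (999, 999, 0)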
unit='D') - result = timedelta_range('0 days', periods=5, freq='D') - tm.assert_index_equal(result, expected) - - expected = to_timedelta(np.arange(11), unit='D') - result = timedelta_range('0 days', '10 days', freq='D') - tm.assert_index_equal(result, expected) - - expected = to_timedelta(np.arange(5), unit='D') + Second(2) + Day() - result = timedelta_range('1 days, 00:00:02', '5 days, 00:00:02', - freq='D') - tm.assert_index_equal(result, expected) - - expected = to_timedelta([1, 3, 5, 7, 9], unit='D') + Second(2) - result = timedelta_range('1 days, 00:00:02', periods=5, freq='2D') - tm.assert_index_equal(result, expected) - - expected = to_timedelta(np.arange(50), unit='T') * 30 - result = timedelta_range('0 days', freq='30T', periods=50) - tm.assert_index_equal(result, expected) - - # GH 11776 - arr = np.arange(10).reshape(2, 5) - df = pd.DataFrame(np.arange(10).reshape(2, 5)) - for arg in (arr, df): - with tm.assertRaisesRegexp(TypeError, "1-d array"): - to_timedelta(arg) - for errors in ['ignore', 'raise', 'coerce']: - with tm.assertRaisesRegexp(TypeError, "1-d array"): - to_timedelta(arg, errors=errors) - - # issue10583 - df = pd.DataFrame(np.random.normal(size=(10, 4))) - df.index = pd.timedelta_range(start='0s', periods=10, freq='s') - expected = df.loc[pd.Timedelta('0s'):, :] - result = df.loc['0s':, :] - assert_frame_equal(expected, result) - - def test_numeric_conversions(self): - self.assertEqual(ct(0), np.timedelta64(0, 'ns')) - self.assertEqual(ct(10), np.timedelta64(10, 'ns')) - self.assertEqual(ct(10, unit='ns'), np.timedelta64( - 10, 'ns').astype('m8[ns]')) - - self.assertEqual(ct(10, unit='us'), np.timedelta64( - 10, 'us').astype('m8[ns]')) - self.assertEqual(ct(10, unit='ms'), np.timedelta64( - 10, 'ms').astype('m8[ns]')) - self.assertEqual(ct(10, unit='s'), np.timedelta64( - 10, 's').astype('m8[ns]')) - self.assertEqual(ct(10, unit='d'), np.timedelta64( - 10, 'D').astype('m8[ns]')) - - def test_timedelta_conversions(self): - self.assertEqual(ct(timedelta(seconds=1)), - np.timedelta64(1, 's').astype('m8[ns]')) - self.assertEqual(ct(timedelta(microseconds=1)), - np.timedelta64(1, 'us').astype('m8[ns]')) - self.assertEqual(ct(timedelta(days=1)), - np.timedelta64(1, 'D').astype('m8[ns]')) - - def test_short_format_converters(self): - def conv(v): - return v.astype('m8[ns]') - - self.assertEqual(ct('10'), np.timedelta64(10, 'ns')) - self.assertEqual(ct('10ns'), np.timedelta64(10, 'ns')) - self.assertEqual(ct('100'), np.timedelta64(100, 'ns')) - self.assertEqual(ct('100ns'), np.timedelta64(100, 'ns')) - - self.assertEqual(ct('1000'), np.timedelta64(1000, 'ns')) - self.assertEqual(ct('1000ns'), np.timedelta64(1000, 'ns')) - self.assertEqual(ct('1000NS'), np.timedelta64(1000, 'ns')) - - self.assertEqual(ct('10us'), np.timedelta64(10000, 'ns')) - self.assertEqual(ct('100us'), np.timedelta64(100000, 'ns')) - self.assertEqual(ct('1000us'), np.timedelta64(1000000, 'ns')) - self.assertEqual(ct('1000Us'), np.timedelta64(1000000, 'ns')) - self.assertEqual(ct('1000uS'), np.timedelta64(1000000, 'ns')) - - self.assertEqual(ct('1ms'), np.timedelta64(1000000, 'ns')) - self.assertEqual(ct('10ms'), np.timedelta64(10000000, 'ns')) - self.assertEqual(ct('100ms'), np.timedelta64(100000000, 'ns')) - self.assertEqual(ct('1000ms'), np.timedelta64(1000000000, 'ns')) - - self.assertEqual(ct('-1s'), -np.timedelta64(1000000000, 'ns')) - self.assertEqual(ct('1s'), np.timedelta64(1000000000, 'ns')) - self.assertEqual(ct('10s'), np.timedelta64(10000000000, 'ns')) - self.assertEqual(ct('100s'), 
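# A minimal sketch of the timedelta_range construction patterns used above:
# as with date_range, start/end/periods plus freq pin down the index
# (expected values mirror the test assertions).
import numpy as np
import pandas as pd

idx = pd.timedelta_range('0 days', periods=5, freq='D')
assert idx.equals(pd.to_timedelta(np.arange(5), unit='D'))
assert len(pd.timedelta_range('0 days', '10 days', freq='D')) == 11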
np.timedelta64(100000000000, 'ns')) - self.assertEqual(ct('1000s'), np.timedelta64(1000000000000, 'ns')) - - self.assertEqual(ct('1d'), conv(np.timedelta64(1, 'D'))) - self.assertEqual(ct('-1d'), -conv(np.timedelta64(1, 'D'))) - self.assertEqual(ct('1D'), conv(np.timedelta64(1, 'D'))) - self.assertEqual(ct('10D'), conv(np.timedelta64(10, 'D'))) - self.assertEqual(ct('100D'), conv(np.timedelta64(100, 'D'))) - self.assertEqual(ct('1000D'), conv(np.timedelta64(1000, 'D'))) - self.assertEqual(ct('10000D'), conv(np.timedelta64(10000, 'D'))) - - # space - self.assertEqual(ct(' 10000D '), conv(np.timedelta64(10000, 'D'))) - self.assertEqual(ct(' - 10000D '), -conv(np.timedelta64(10000, 'D'))) - - # invalid - self.assertRaises(ValueError, ct, '1foo') - self.assertRaises(ValueError, ct, 'foo') - - def test_full_format_converters(self): - def conv(v): - return v.astype('m8[ns]') - - d1 = np.timedelta64(1, 'D') - - self.assertEqual(ct('1days'), conv(d1)) - self.assertEqual(ct('1days,'), conv(d1)) - self.assertEqual(ct('- 1days,'), -conv(d1)) - - self.assertEqual(ct('00:00:01'), conv(np.timedelta64(1, 's'))) - self.assertEqual(ct('06:00:01'), conv( - np.timedelta64(6 * 3600 + 1, 's'))) - self.assertEqual(ct('06:00:01.0'), conv( - np.timedelta64(6 * 3600 + 1, 's'))) - self.assertEqual(ct('06:00:01.01'), conv( - np.timedelta64(1000 * (6 * 3600 + 1) + 10, 'ms'))) - - self.assertEqual(ct('- 1days, 00:00:01'), - conv(-d1 + np.timedelta64(1, 's'))) - self.assertEqual(ct('1days, 06:00:01'), conv( - d1 + np.timedelta64(6 * 3600 + 1, 's'))) - self.assertEqual(ct('1days, 06:00:01.01'), conv( - d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, 'ms'))) - - # invalid - self.assertRaises(ValueError, ct, '- 1days, 00') - - def test_nat_converters(self): - self.assertEqual(to_timedelta( - 'nat', box=False).astype('int64'), tslib.iNaT) - self.assertEqual(to_timedelta( - 'nan', box=False).astype('int64'), tslib.iNaT) - - def test_to_timedelta(self): - def conv(v): - return v.astype('m8[ns]') - - d1 = np.timedelta64(1, 'D') - - self.assertEqual(to_timedelta('1 days 06:05:01.00003', box=False), - conv(d1 + np.timedelta64(6 * 3600 + - 5 * 60 + 1, 's') + - np.timedelta64(30, 'us'))) - self.assertEqual(to_timedelta('15.5us', box=False), - conv(np.timedelta64(15500, 'ns'))) - - # empty string - result = to_timedelta('', box=False) - self.assertEqual(result.astype('int64'), tslib.iNaT) - - result = to_timedelta(['', '']) - self.assertTrue(isnull(result).all()) - - # pass thru - result = to_timedelta(np.array([np.timedelta64(1, 's')])) - expected = pd.Index(np.array([np.timedelta64(1, 's')])) - tm.assert_index_equal(result, expected) - - # ints - result = np.timedelta64(0, 'ns') - expected = to_timedelta(0, box=False) - self.assertEqual(result, expected) - - # Series - expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(['1d', '1days 00:00:01'])) - tm.assert_series_equal(result, expected) - - # with units - result = TimedeltaIndex([np.timedelta64(0, 'ns'), np.timedelta64( - 10, 's').astype('m8[ns]')]) - expected = to_timedelta([0, 10], unit='s') - tm.assert_index_equal(result, expected) - - # single element conversion - v = timedelta(seconds=1) - result = to_timedelta(v, box=False) - expected = np.timedelta64(timedelta(seconds=1)) - self.assertEqual(result, expected) - - v = np.timedelta64(timedelta(seconds=1)) - result = to_timedelta(v, box=False) - expected = np.timedelta64(timedelta(seconds=1)) - self.assertEqual(result, expected) - - # arrays of various dtypes - arr = 
np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='s') - expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='m') - expected = TimedeltaIndex([np.timedelta64(1, 'm')] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='h') - expected = TimedeltaIndex([np.timedelta64(1, 'h')] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype='timedelta64[s]') - result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype='timedelta64[D]') - result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, 'D')] * 5) - tm.assert_index_equal(result, expected) - - # Test with lists as input when box=false - expected = np.array(np.arange(3) * 1000000000, dtype='timedelta64[ns]') - result = to_timedelta(range(3), unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) - - result = to_timedelta(np.arange(3), unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) - - result = to_timedelta([0, 1, 2], unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) - - # Tests with fractional seconds as input: - expected = np.array( - [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]') - result = to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) - - def testit(unit, transform): - - # array - result = to_timedelta(np.arange(5), unit=unit) - expected = TimedeltaIndex([np.timedelta64(i, transform(unit)) - for i in np.arange(5).tolist()]) - tm.assert_index_equal(result, expected) - - # scalar - result = to_timedelta(2, unit=unit) - expected = Timedelta(np.timedelta64(2, transform(unit)).astype( - 'timedelta64[ns]')) - self.assertEqual(result, expected) - - # validate all units - # GH 6855 - for unit in ['Y', 'M', 'W', 'D', 'y', 'w', 'd']: - testit(unit, lambda x: x.upper()) - for unit in ['days', 'day', 'Day', 'Days']: - testit(unit, lambda x: 'D') - for unit in ['h', 'm', 's', 'ms', 'us', 'ns', 'H', 'S', 'MS', 'US', - 'NS']: - testit(unit, lambda x: x.lower()) - - # offsets - - # m - testit('T', lambda x: 'm') - - # ms - testit('L', lambda x: 'ms') - - def test_to_timedelta_invalid(self): - - # bad value for errors parameter - msg = "errors must be one of" - tm.assertRaisesRegexp(ValueError, msg, to_timedelta, - ['foo'], errors='never') - - # these will error - self.assertRaises(ValueError, lambda: to_timedelta([1, 2], unit='foo')) - self.assertRaises(ValueError, lambda: to_timedelta(1, unit='foo')) - - # time not supported ATM - self.assertRaises(ValueError, lambda: to_timedelta(time(second=1))) - self.assertTrue(to_timedelta( - time(second=1), errors='coerce') is pd.NaT) - - self.assertRaises(ValueError, lambda: to_timedelta(['foo', 'bar'])) - tm.assert_index_equal(TimedeltaIndex([pd.NaT, pd.NaT]), - to_timedelta(['foo', 'bar'], errors='coerce')) - - tm.assert_index_equal(TimedeltaIndex(['1 day', pd.NaT, '1 min']), - to_timedelta(['1 day', 'bar', '1 min'], - errors='coerce')) - - # gh-13613: these should not error because errors='ignore' - invalid_data = 'apple' - self.assertEqual(invalid_data, to_timedelta( - invalid_data, errors='ignore')) - - invalid_data = ['apple', '1 days'] - tm.assert_numpy_array_equal( - np.array(invalid_data, dtype=object), - to_timedelta(invalid_data, 
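# A minimal sketch of the errors= contract exercised above and below:
# 'raise' (the default) propagates, 'coerce' maps unparseable values to NaT,
# and 'ignore' returns the input unchanged (values mirror the gh-13613 cases).
import pandas as pd

result = pd.to_timedelta(['1 day', 'bar', '1 min'], errors='coerce')
assert result.equals(pd.TimedeltaIndex(['1 day', pd.NaT, '1 min']))
assert pd.to_timedelta('apple', errors='ignore') == 'apple'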
errors='ignore')) - - invalid_data = pd.Index(['apple', '1 days']) - tm.assert_index_equal(invalid_data, to_timedelta( - invalid_data, errors='ignore')) - - invalid_data = Series(['apple', '1 days']) - tm.assert_series_equal(invalid_data, to_timedelta( - invalid_data, errors='ignore')) - - def test_to_timedelta_via_apply(self): - # GH 5458 - expected = Series([np.timedelta64(1, 's')]) - result = Series(['00:00:01']).apply(to_timedelta) - tm.assert_series_equal(result, expected) - - result = Series([to_timedelta('00:00:01')]) - tm.assert_series_equal(result, expected) - - def test_timedelta_ops(self): - # GH4984 - # make sure ops return Timedelta - s = Series([Timestamp('20130101') + timedelta(seconds=i * i) - for i in range(10)]) - td = s.diff() - - result = td.mean() - expected = to_timedelta(timedelta(seconds=9)) - self.assertEqual(result, expected) - - result = td.to_frame().mean() - self.assertEqual(result[0], expected) - - result = td.quantile(.1) - expected = Timedelta(np.timedelta64(2600, 'ms')) - self.assertEqual(result, expected) - - result = td.median() - expected = to_timedelta('00:00:09') - self.assertEqual(result, expected) - - result = td.to_frame().median() - self.assertEqual(result[0], expected) - - # GH 6462 - # consistency in returned values for sum - result = td.sum() - expected = to_timedelta('00:01:21') - self.assertEqual(result, expected) - - result = td.to_frame().sum() - self.assertEqual(result[0], expected) - - # std - result = td.std() - expected = to_timedelta(Series(td.dropna().values).std()) - self.assertEqual(result, expected) - - result = td.to_frame().std() - self.assertEqual(result[0], expected) - - # invalid ops - for op in ['skew', 'kurt', 'sem', 'prod']: - self.assertRaises(TypeError, getattr(td, op)) - - # GH 10040 - # make sure NaT is properly handled by median() - s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')]) - self.assertEqual(s.diff().median(), timedelta(days=4)) - - s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), - Timestamp('2015-02-15')]) - self.assertEqual(s.diff().median(), timedelta(days=6)) - - def test_overflow(self): - # GH 9442 - s = Series(pd.date_range('20130101', periods=100000, freq='H')) - s[0] += pd.Timedelta('1s 1ms') - - # mean - result = (s - s.min()).mean() - expected = pd.Timedelta((pd.DatetimeIndex((s - s.min())).asi8 / len(s) - ).sum()) - - # the computation is converted to float so might be some loss of - # precision - self.assertTrue(np.allclose(result.value / 1000, expected.value / - 1000)) - - # sum - self.assertRaises(ValueError, lambda: (s - s.min()).sum()) - s1 = s[0:10000] - self.assertRaises(ValueError, lambda: (s1 - s1.min()).sum()) - s2 = s[0:1000] - result = (s2 - s2.min()).sum() - - def test_overflow_on_construction(self): - # xref https://github.com/statsmodels/statsmodels/issues/3374 - value = pd.Timedelta('1day').value * 20169940 - self.assertRaises(OverflowError, pd.Timedelta, value) - - def test_timedelta_ops_scalar(self): - # GH 6808 - base = pd.to_datetime('20130101 09:01:12.123456') - expected_add = pd.to_datetime('20130101 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10), - np.timedelta64(10, 's'), - np.timedelta64(10000000000, 'ns'), - pd.offsets.Second(10)]: - result = base + offset - self.assertEqual(result, expected_add) - - result = base - offset - self.assertEqual(result, expected_sub) - - base = pd.to_datetime('20130102 09:01:12.123456') - expected_add = 
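# A minimal sketch of the reduction semantics tested in test_timedelta_ops
# above: sum/mean/median on a timedelta64[ns] Series return Timedelta
# scalars, while skew/kurt/sem/prod raise TypeError (example values are
# illustrative, chosen so the arithmetic is easy to verify by hand).
import pandas as pd

s = pd.Series(pd.to_timedelta(['1 days', '2 days', '3 days']))
assert s.sum() == pd.Timedelta('6 days')
assert s.mean() == pd.Timedelta('2 days')
assert s.median() == pd.Timedelta('2 days')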
pd.to_datetime('20130103 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta('1 day, 00:00:10'), - pd.to_timedelta('1 days, 00:00:10'), - timedelta(days=1, seconds=10), - np.timedelta64(1, 'D') + np.timedelta64(10, 's'), - pd.offsets.Day() + pd.offsets.Second(10)]: - result = base + offset - self.assertEqual(result, expected_add) - - result = base - offset - self.assertEqual(result, expected_sub) - - def test_to_timedelta_on_missing_values(self): - # GH5438 - timedelta_NaT = np.timedelta64('NaT') - - actual = pd.to_timedelta(Series(['00:00:01', np.nan])) - expected = Series([np.timedelta64(1000000000, 'ns'), - timedelta_NaT], dtype='<m8[ns]') - - result = idx2 > idx1 - expected = np.array([True, False, False, False, True, False]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 <= idx2 - expected = np.array([True, False, False, False, True, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx2 >= idx1 - expected = np.array([True, False, False, False, True, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 == idx2 - expected = np.array([False, False, False, False, False, True]) - self.assert_numpy_array_equal(result, expected) - - result = idx1 != idx2 - expected = np.array([True, True, True, True, True, False]) - self.assert_numpy_array_equal(result, expected) - - def test_ops_error_str(self): - # GH 13624 - tdi = TimedeltaIndex(['1 day', '2 days']) - - for l, r in [(tdi, 'a'), ('a', tdi)]: - with tm.assertRaises(TypeError): - l + r - - with tm.assertRaises(TypeError): - l > r - - with tm.assertRaises(TypeError): - l == r - - with tm.assertRaises(TypeError): - l != r - - def test_map(self): - - rng = timedelta_range('1 day', periods=10) - - f = lambda x: x.days - result = rng.map(f) - exp = Int64Index([f(x) for x in rng]) - tm.assert_index_equal(result, exp) - - def test_misc_coverage(self): - - rng = timedelta_range('1 day', periods=5) - result = rng.groupby(rng.days) - tm.assertIsInstance(list(result.values())[0][0], Timedelta) - - idx = TimedeltaIndex(['3d', '1d', '2d']) - self.assertFalse(idx.equals(list(idx))) - - non_td = Index(list('abc')) - self.assertFalse(idx.equals(list(non_td))) - - def test_union(self): - - i1 = timedelta_range('1day', periods=5) - i2 = timedelta_range('3day', periods=5) - result = i1.union(i2) - expected = timedelta_range('1day', periods=7) - self.assert_index_equal(result, expected) - - i1 = Int64Index(np.arange(0, 20, 2)) - i2 = TimedeltaIndex(start='1 day', periods=10, freq='D') - i1.union(i2) # Works - i2.union(i1) # Fails with "AttributeError: can't set attribute" - - def test_union_coverage(self): - - idx = TimedeltaIndex(['3d', '1d', '2d']) - ordered = TimedeltaIndex(idx.sort_values(), freq='infer') - result = ordered.union(idx) - self.assert_index_equal(result, ordered) - - result = ordered[:0].union(ordered) - self.assert_index_equal(result, ordered) - self.assertEqual(result.freq, ordered.freq) - - def test_union_bug_1730(self): - - rng_a = timedelta_range('1 day', periods=4, freq='3H') - rng_b = timedelta_range('1 day', periods=4, freq='4H') - - result = rng_a.union(rng_b) - exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) - self.assert_index_equal(result, exp) - - def test_union_bug_1745(self): - - left = TimedeltaIndex(['1 day 15:19:49.695000']) - right = TimedeltaIndex(['2 day 13:04:21.322000', - '1 day 15:27:24.873000', - '1 day 15:31:05.350000']) - - result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) |
set(list(right)))) - self.assert_index_equal(result, exp) - - def test_union_bug_4564(self): - - left = timedelta_range("1 day", "30d") - right = left + pd.offsets.Minute(15) - - result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) - self.assert_index_equal(result, exp) - - def test_intersection_bug_1708(self): - index_1 = timedelta_range('1 day', periods=4, freq='h') - index_2 = index_1 + pd.offsets.Hour(5) - - result = index_1 & index_2 - self.assertEqual(len(result), 0) - - index_1 = timedelta_range('1 day', periods=4, freq='h') - index_2 = index_1 + pd.offsets.Hour(1) - - result = index_1 & index_2 - expected = timedelta_range('1 day 01:00:00', periods=3, freq='h') - tm.assert_index_equal(result, expected) - - def test_get_duplicates(self): - idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day', - '4day']) - - result = idx.get_duplicates() - ex = TimedeltaIndex(['2 day', '3day']) - self.assert_index_equal(result, ex) - - def test_argmin_argmax(self): - idx = TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01', - '1 day 00:00:02']) - self.assertEqual(idx.argmin(), 1) - self.assertEqual(idx.argmax(), 0) - - def test_sort_values(self): - - idx = TimedeltaIndex(['4d', '1d', '2d']) - - ordered = idx.sort_values() - self.assertTrue(ordered.is_monotonic) - - ordered = idx.sort_values(ascending=False) - self.assertTrue(ordered[::-1].is_monotonic) - - ordered, dexer = idx.sort_values(return_indexer=True) - self.assertTrue(ordered.is_monotonic) - self.assert_numpy_array_equal(dexer, - np.array([1, 2, 0]), - check_dtype=False) - - ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) - self.assertTrue(ordered[::-1].is_monotonic) - self.assert_numpy_array_equal(dexer, - np.array([0, 2, 1]), - check_dtype=False) - - def test_insert(self): - - idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx') - - result = idx.insert(2, timedelta(days=5)) - exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') - self.assert_index_equal(result, exp) - - # insertion of non-datetime should coerce to object index - result = idx.insert(1, 'inserted') - expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), - Timedelta('2day')], name='idx') - self.assertNotIsInstance(result, TimedeltaIndex) - tm.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - - idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') - - # preserve freq - expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02', - '1day 00:00:03'], - name='idx', freq='s') - expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:04'], - name='idx', freq='s') - - # reset freq to None - expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01', - '1day 00:00:02', '1day 00:00:03'], - name='idx', freq=None) - expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:05'], - name='idx', freq=None) - - cases = [(0, Timedelta('1day'), expected_0), - (-3, Timedelta('1day'), expected_0), - (3, Timedelta('1day 00:00:04'), expected_3), - (1, Timedelta('1day 00:00:01'), expected_1_nofreq), - (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] - - for n, d, expected in cases: - result = idx.insert(n, d) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(result.freq, expected.freq) - - def test_delete(self): - idx = timedelta_range(start='1 Days', periods=5, freq='D', 
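# A minimal sketch of the TimedeltaIndex set operations covered above; the
# union case mirrors test_union and the empty intersection mirrors
# test_intersection_bug_1708.
import pandas as pd

i1 = pd.timedelta_range('1 day', periods=5)
i2 = pd.timedelta_range('3 day', periods=5)
assert i1.union(i2).equals(pd.timedelta_range('1 day', periods=7))

hourly = pd.timedelta_range('1 day', periods=4, freq='h')
assert len(hourly & (hourly + pd.offsets.Hour(5))) == 0  # no common points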
name='idx') - - # preserve freq - expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', - name='idx') - expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', - name='idx') - - # reset freq to None - expected_1 = TimedeltaIndex( - ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx') - - cases = {0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1} - for n, expected in compat.iteritems(cases): - result = idx.delete(n) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(result.freq, expected.freq) - - with tm.assertRaises((IndexError, ValueError)): - # either, depending on numpy version - result = idx.delete(5) - - def test_delete_slice(self): - idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') - - # preserve freq - expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', - name='idx') - expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', - name='idx') - - # reset freq to None - expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d', - '7 d', '8 d', '9 d', '10d'], - freq=None, name='idx') - - cases = {(0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5} - for n, expected in compat.iteritems(cases): - result = idx.delete(n) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(result.freq, expected.freq) - - result = idx.delete(slice(n[0], n[-1] + 1)) - self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - self.assertEqual(result.freq, expected.freq) - - def test_take(self): - - tds = ['1day 02:00:00', '1 day 04:00:00', '1 day 10:00:00'] - idx = TimedeltaIndex(start='1d', end='2d', freq='H', name='idx') - expected = TimedeltaIndex(tds, freq=None, name='idx') - - taken1 = idx.take([2, 4, 10]) - taken2 = idx[[2, 4, 10]] - - for taken in [taken1, taken2]: - self.assert_index_equal(taken, expected) - tm.assertIsInstance(taken, TimedeltaIndex) - self.assertIsNone(taken.freq) - self.assertEqual(taken.name, expected.name) - - def test_take_fill_value(self): - # GH 12631 - idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days'], - name='xxx') - result = idx.take(np.array([1, 0, -1])) - expected = pd.TimedeltaIndex(['2 days', '1 days', '3 days'], - name='xxx') - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.TimedeltaIndex(['2 days', '1 days', 'NaT'], - name='xxx') - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.TimedeltaIndex(['2 days', '1 days', '3 days'], - name='xxx') - tm.assert_index_equal(result, expected) - - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with tm.assertRaises(IndexError): - idx.take(np.array([1, -5])) - - def test_isin(self): - - index = tm.makeTimedeltaIndex(4) - result = index.isin(index) - self.assertTrue(result.all()) - - result = index.isin(list(index)) - self.assertTrue(result.all()) - - assert_almost_equal(index.isin([index[2], 5]), - np.array([False, False, True, False])) - - def test_does_not_convert_mixed_integer(self): - df = tm.makeCustomDataframe(10, 10, -
data_gen_f=lambda *args, **kwargs: randn(), - r_idx_type='i', c_idx_type='td') - str(df) - - cols = df.columns.join(df.index, how='outer') - joined = cols.join(df.columns) - self.assertEqual(cols.dtype, np.dtype('O')) - self.assertEqual(cols.dtype, joined.dtype) - tm.assert_index_equal(cols, joined) - - def test_slice_keeps_name(self): - - # GH4226 - dr = pd.timedelta_range('1d', '5d', freq='H', name='timebucket') - self.assertEqual(dr[1:].name, dr.name) - - def test_join_self(self): - - index = timedelta_range('1 day', periods=10) - kinds = 'outer', 'inner', 'left', 'right' - for kind in kinds: - joined = index.join(index, how=kind) - tm.assert_index_equal(index, joined) - - def test_factorize(self): - idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', - '3 day']) - - exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) - exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day']) - - arr, idx = idx1.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - self.assert_index_equal(idx, exp_idx) - - arr, idx = idx1.factorize(sort=True) - self.assert_numpy_array_equal(arr, exp_arr) - self.assert_index_equal(idx, exp_idx) - - # freq must be preserved - idx3 = timedelta_range('1 day', periods=4, freq='s') - exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) - arr, idx = idx3.factorize() - self.assert_numpy_array_equal(arr, exp_arr) - self.assert_index_equal(idx, idx3) - - -class TestSlicing(tm.TestCase): - - def test_partial_slice(self): - rng = timedelta_range('1 day 10:11:12', freq='h', periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s['5 day':'6 day'] - expected = s.iloc[86:134] - assert_series_equal(result, expected) - - result = s['5 day':] - expected = s.iloc[86:] - assert_series_equal(result, expected) - - result = s[:'6 day'] - expected = s.iloc[:134] - assert_series_equal(result, expected) - - result = s['6 days, 23:11:12'] - self.assertEqual(result, s.iloc[133]) - - self.assertRaises(KeyError, s.__getitem__, '50 days') - - def test_partial_slice_high_reso(self): - - # higher reso - rng = timedelta_range('1 day 10:11:12', freq='us', periods=2000) - s = Series(np.arange(len(rng)), index=rng) - - result = s['1 day 10:11:12':] - expected = s.iloc[0:] - assert_series_equal(result, expected) - - result = s['1 day 10:11:12.001':] - expected = s.iloc[1000:] - assert_series_equal(result, expected) - - result = s['1 days, 10:11:12.001001'] - self.assertEqual(result, s.iloc[1001]) - - def test_slice_with_negative_step(self): - ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) - SLC = pd.IndexSlice - - def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(ts[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - - assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1]) - assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1]) - - assert_slices_equivalent(SLC[:Timedelta(hours=7):-1], SLC[:6:-1]) - assert_slices_equivalent(SLC[:'7 hours':-1], SLC[:6:-1]) - - assert_slices_equivalent(SLC['15 hours':'7 hours':-1], SLC[15:6:-1]) - assert_slices_equivalent(SLC[Timedelta(hours=15):Timedelta(hours=7):- - 1], SLC[15:6:-1]) - assert_slices_equivalent(SLC['15 hours':Timedelta(hours=7):-1], - SLC[15:6:-1]) - assert_slices_equivalent(SLC[Timedelta(hours=15):'7 hours':-1], - SLC[15:6:-1]) - - assert_slices_equivalent(SLC['7 hours':'15 hours':-1], SLC[:0]) - - def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), timedelta_range('0', 
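# A minimal sketch of the partial-string indexing tested above: a Series
# indexed by a TimedeltaIndex accepts timedelta-like strings for slicing
# and scalar lookup (values mirror test_partial_slice).
import numpy as np
import pandas as pd

rng = pd.timedelta_range('1 day 10:11:12', freq='h', periods=500)
s = pd.Series(np.arange(len(rng)), index=rng)
assert s['5 day':'6 day'].equals(s.iloc[86:134])
assert s['6 days, 23:11:12'] == s.iloc[133]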
periods=20, freq='H')) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts[::0]) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - - def test_tdi_ops_attributes(self): - rng = timedelta_range('2 days', periods=5, freq='2D', name='x') - - result = rng + 1 - exp = timedelta_range('4 days', periods=5, freq='2D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '2D') - - result = rng - 2 - exp = timedelta_range('-2 days', periods=5, freq='2D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '2D') - - result = rng * 2 - exp = timedelta_range('4 days', periods=5, freq='4D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '4D') - - result = rng / 2 - exp = timedelta_range('1 days', periods=5, freq='D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, 'D') - - result = -rng - exp = timedelta_range('-2 days', periods=5, freq='-2D', name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, '-2D') - - rng = pd.timedelta_range('-2 days', periods=5, freq='D', name='x') - - result = abs(rng) - exp = TimedeltaIndex(['2 days', '1 days', '0 days', '1 days', - '2 days'], name='x') - tm.assert_index_equal(result, exp) - self.assertEqual(result.freq, None) - - def test_add_overflow(self): - # see gh-14068 - msg = "too (big|large) to convert" - with tm.assertRaisesRegexp(OverflowError, msg): - to_timedelta(106580, 'D') + Timestamp('2000') - with tm.assertRaisesRegexp(OverflowError, msg): - Timestamp('2000') + to_timedelta(106580, 'D') - - _NaT = int(pd.NaT) + 1 - msg = "Overflow in int64 addition" - with tm.assertRaisesRegexp(OverflowError, msg): - to_timedelta([106580], 'D') + Timestamp('2000') - with tm.assertRaisesRegexp(OverflowError, msg): - Timestamp('2000') + to_timedelta([106580], 'D') - with tm.assertRaisesRegexp(OverflowError, msg): - to_timedelta([_NaT]) - Timedelta('1 days') - with tm.assertRaisesRegexp(OverflowError, msg): - to_timedelta(['5 days', _NaT]) - Timedelta('1 days') - with tm.assertRaisesRegexp(OverflowError, msg): - (to_timedelta([_NaT, '5 days', '1 hours']) - - to_timedelta(['7 seconds', _NaT, '4 hours'])) - - # These should not overflow! 
- exp = TimedeltaIndex([pd.NaT]) - result = to_timedelta([pd.NaT]) - Timedelta('1 days') - tm.assert_index_equal(result, exp) - - exp = TimedeltaIndex(['4 days', pd.NaT]) - result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') - tm.assert_index_equal(result, exp) - - exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) - result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + - to_timedelta(['7 seconds', pd.NaT, '4 hours'])) - tm.assert_index_equal(result, exp) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 38cd8079faf93..771fb2f50c410 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -1,23 +1,20 @@ # pylint: disable-msg=E1101,W0612 -from datetime import datetime, timedelta, tzinfo, date -import numpy as np import pytz +import numpy as np from distutils.version import LooseVersion -from pandas.types.dtypes import DatetimeTZDtype -from pandas import (Index, Series, DataFrame, isnull, Timestamp) - -from pandas import DatetimeIndex, to_datetime, NaT -from pandas import tslib - -import pandas.tseries.offsets as offsets -from pandas.tseries.index import bdate_range, date_range -import pandas.tseries.tools as tools +from datetime import datetime, timedelta, tzinfo, date from pytz import NonExistentTimeError import pandas.util.testing as tm +import pandas.tseries.tools as tools +import pandas.tseries.offsets as offsets +from pandas.compat import lrange, zip +from pandas.tseries.index import bdate_range, date_range +from pandas.types.dtypes import DatetimeTZDtype +from pandas import (Index, Series, DataFrame, isnull, Timestamp, tslib, NaT, + DatetimeIndex, to_datetime) from pandas.util.testing import (assert_frame_equal, assert_series_equal, set_timezone) -from pandas.compat import lrange, zip try: import pytz # noqa @@ -1679,3 +1676,52 @@ def test_nat(self): idx = idx.tz_convert('US/Eastern') expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT] self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + + +class TestTslib(tm.TestCase): + + def test_tslib_tz_convert(self): + def compare_utc_to_local(tz_didx, utc_didx): + f = lambda x: tslib.tz_convert_single(x, 'UTC', tz_didx.tz) + result = tslib.tz_convert(tz_didx.asi8, 'UTC', tz_didx.tz) + result_single = np.vectorize(f)(tz_didx.asi8) + self.assert_numpy_array_equal(result, result_single) + + def compare_local_to_utc(tz_didx, utc_didx): + f = lambda x: tslib.tz_convert_single(x, tz_didx.tz, 'UTC') + result = tslib.tz_convert(utc_didx.asi8, tz_didx.tz, 'UTC') + result_single = np.vectorize(f)(utc_didx.asi8) + self.assert_numpy_array_equal(result, result_single) + + for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'Europe/Moscow']: + # US: 2014-03-09 - 2014-11-11 + # MOSCOW: 2014-10-26 / 2014-12-31 + tz_didx = date_range('2014-03-01', '2015-01-10', freq='H', tz=tz) + utc_didx = date_range('2014-03-01', '2015-01-10', freq='H') + compare_utc_to_local(tz_didx, utc_didx) + # local tz to UTC can differ in hourly (or higher) freqs because + # of DST + compare_local_to_utc(tz_didx, utc_didx) + + tz_didx = date_range('2000-01-01', '2020-01-01', freq='D', tz=tz) + utc_didx = date_range('2000-01-01', '2020-01-01', freq='D') + compare_utc_to_local(tz_didx, utc_didx) + compare_local_to_utc(tz_didx, utc_didx) + + tz_didx = date_range('2000-01-01', '2100-01-01', freq='A', tz=tz) + utc_didx = date_range('2000-01-01', '2100-01-01', freq='A') + compare_utc_to_local(tz_didx, utc_didx) + compare_local_to_utc(tz_didx, utc_didx) + + # Check
empty array + result = tslib.tz_convert(np.array([], dtype=np.int64), + tslib.maybe_get_tz('US/Eastern'), + tslib.maybe_get_tz('Asia/Tokyo')) + self.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) + + # Check all-NaT array + result = tslib.tz_convert(np.array([tslib.iNaT], dtype=np.int64), + tslib.maybe_get_tz('US/Eastern'), + tslib.maybe_get_tz('Asia/Tokyo')) + self.assert_numpy_array_equal(result, np.array( + [tslib.iNaT], dtype=np.int64)) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py deleted file mode 100644 index a141d445e6035..0000000000000 --- a/pandas/tseries/tests/test_tslib.py +++ /dev/null @@ -1,694 +0,0 @@ -import datetime -import numpy as np -from distutils.version import LooseVersion - -import pandas as pd -import pandas.util.testing as tm -from pandas import tslib, lib, compat -from pandas.tseries import offsets, tools -from pandas.tseries.frequencies import get_freq -from pandas.tseries.index import date_range, DatetimeIndex -from pandas.util.testing import _skip_if_has_locale -from pandas._period import period_ordinal, period_asfreq -from pandas.compat.numpy import np_array_datetime64_compat -from pandas.core.api import Timestamp, to_datetime, Index, Series - - -class TestTsUtil(tm.TestCase): - - def test_try_parse_dates(self): - from dateutil.parser import parse - arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) - - result = lib.try_parse_dates(arr, dayfirst=True) - expected = [parse(d, dayfirst=True) for d in arr] - self.assertTrue(np.array_equal(result, expected)) - - def test_min_valid(self): - # Ensure that Timestamp.min is a valid Timestamp - Timestamp(Timestamp.min) - - def test_max_valid(self): - # Ensure that Timestamp.max is a valid Timestamp - Timestamp(Timestamp.max) - - def test_to_datetime_bijective(self): - # Ensure that converting to datetime and back only loses precision - # by going from nanoseconds to microseconds. 
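# A minimal sketch of the invariant the relocated TestTslib case asserts:
# the vectorised tslib.tz_convert must agree element-wise with repeated
# tslib.tz_convert_single calls. Both are pandas-internal APIs; the calls
# mirror the test body above.
import numpy as np
from pandas import date_range, tslib

tz_didx = date_range('2014-03-01', periods=48, freq='H', tz='US/Eastern')
f = lambda x: tslib.tz_convert_single(x, 'UTC', tz_didx.tz)
result = tslib.tz_convert(tz_didx.asi8, 'UTC', tz_didx.tz)
assert (result == np.vectorize(f)(tz_didx.asi8)).all()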
- exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - self.assertEqual( - Timestamp(Timestamp.max.to_pydatetime()).value / 1000, - Timestamp.max.value / 1000) - - exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - self.assertEqual( - Timestamp(Timestamp.min.to_pydatetime()).value / 1000, - Timestamp.min.value / 1000) - - -class TestDatetimeParsingWrappers(tm.TestCase): - - def test_does_not_convert_mixed_integer(self): - bad_date_strings = ('-50000', '999', '123.1234', 'm', 'T') - - for bad_date_string in bad_date_strings: - self.assertFalse(tslib._does_string_look_like_datetime( - bad_date_string)) - - good_date_strings = ('2012-01-01', - '01/01/2012', - 'Mon Sep 16, 2013', - '01012012', - '0101', - '1-1', ) - - for good_date_string in good_date_strings: - self.assertTrue(tslib._does_string_look_like_datetime( - good_date_string)) - - def test_parsers(self): - - # https://github.com/dateutil/dateutil/issues/217 - import dateutil - yearfirst = dateutil.__version__ >= LooseVersion('2.5.0') - - cases = {'2011-01-01': datetime.datetime(2011, 1, 1), - '2Q2005': datetime.datetime(2005, 4, 1), - '2Q05': datetime.datetime(2005, 4, 1), - '2005Q1': datetime.datetime(2005, 1, 1), - '05Q1': datetime.datetime(2005, 1, 1), - '2011Q3': datetime.datetime(2011, 7, 1), - '11Q3': datetime.datetime(2011, 7, 1), - '3Q2011': datetime.datetime(2011, 7, 1), - '3Q11': datetime.datetime(2011, 7, 1), - - # quarterly without space - '2000Q4': datetime.datetime(2000, 10, 1), - '00Q4': datetime.datetime(2000, 10, 1), - '4Q2000': datetime.datetime(2000, 10, 1), - '4Q00': datetime.datetime(2000, 10, 1), - '2000q4': datetime.datetime(2000, 10, 1), - '2000-Q4': datetime.datetime(2000, 10, 1), - '00-Q4': datetime.datetime(2000, 10, 1), - '4Q-2000': datetime.datetime(2000, 10, 1), - '4Q-00': datetime.datetime(2000, 10, 1), - '00q4': datetime.datetime(2000, 10, 1), - '2005': datetime.datetime(2005, 1, 1), - '2005-11': datetime.datetime(2005, 11, 1), - '2005 11': datetime.datetime(2005, 11, 1), - '11-2005': datetime.datetime(2005, 11, 1), - '11 2005': datetime.datetime(2005, 11, 1), - '200511': datetime.datetime(2020, 5, 11), - '20051109': datetime.datetime(2005, 11, 9), - '20051109 10:15': datetime.datetime(2005, 11, 9, 10, 15), - '20051109 08H': datetime.datetime(2005, 11, 9, 8, 0), - '2005-11-09 10:15': datetime.datetime(2005, 11, 9, 10, 15), - '2005-11-09 08H': datetime.datetime(2005, 11, 9, 8, 0), - '2005/11/09 10:15': datetime.datetime(2005, 11, 9, 10, 15), - '2005/11/09 08H': datetime.datetime(2005, 11, 9, 8, 0), - "Thu Sep 25 10:36:28 2003": datetime.datetime(2003, 9, 25, 10, - 36, 28), - "Thu Sep 25 2003": datetime.datetime(2003, 9, 25), - "Sep 25 2003": datetime.datetime(2003, 9, 25), - "January 1 2014": datetime.datetime(2014, 1, 1), - - # GH 10537 - '2014-06': datetime.datetime(2014, 6, 1), - '06-2014': datetime.datetime(2014, 6, 1), - '2014-6': datetime.datetime(2014, 6, 1), - '6-2014': datetime.datetime(2014, 6, 1), - - '20010101 12': datetime.datetime(2001, 1, 1, 12), - '20010101 1234': datetime.datetime(2001, 1, 1, 12, 34), - '20010101 123456': datetime.datetime(2001, 1, 1, 12, 34, 56), - } - - for date_str, expected in compat.iteritems(cases): - result1, _, _ = tools.parse_time_string(date_str, - yearfirst=yearfirst) - result2 = to_datetime(date_str, yearfirst=yearfirst) - result3 = to_datetime([date_str], 
yearfirst=yearfirst) - # result5 is used below - result4 = to_datetime(np.array([date_str], dtype=object), - yearfirst=yearfirst) - result6 = DatetimeIndex([date_str], yearfirst=yearfirst) - # result7 is used below - result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) - result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst) - - for res in [result1, result2]: - self.assertEqual(res, expected) - for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([pd.Timestamp(expected)]) - tm.assert_index_equal(res, exp) - - # these really need to have yearfirst, but we don't support it - if not yearfirst: - result5 = Timestamp(date_str) - self.assertEqual(result5, expected) - result7 = date_range(date_str, freq='S', periods=1, - yearfirst=yearfirst) - self.assertEqual(result7, expected) - - # NaT - result1, _, _ = tools.parse_time_string('NaT') - result2 = to_datetime('NaT') - result3 = Timestamp('NaT') - result4 = DatetimeIndex(['NaT'])[0] - self.assertTrue(result1 is tslib.NaT) - self.assertTrue(result2 is tslib.NaT) - self.assertTrue(result3 is tslib.NaT) - self.assertTrue(result4 is tslib.NaT) - - def test_parsers_quarter_invalid(self): - - cases = ['2Q 2005', '2Q-200A', '2Q-200', '22Q2005', '6Q-20', '2Q200.'] - for case in cases: - self.assertRaises(ValueError, tools.parse_time_string, case) - - def test_parsers_dayfirst_yearfirst(self): - tm._skip_if_no_dateutil() - - # OK - # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 - # 2.5.2 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 - # 2.5.3 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 - - # OK - # 2.5.1 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00 - # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00 - # 2.5.3 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00 - - # bug fix in 2.5.2 - # 2.5.1 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-11-12 00:00:00 - # 2.5.2 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00 - # 2.5.3 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00 - - # OK - # 2.5.1 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00 - # 2.5.2 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00 - # 2.5.3 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00 - - # OK - # 2.5.1 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00 - # 2.5.2 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00 - # 2.5.3 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00 - - # OK - # 2.5.1 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00 - # 2.5.2 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00 - # 2.5.3 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00 - - # revert of bug in 2.5.2 - # 2.5.1 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00 - # 2.5.2 20/12/21 [dayfirst=1, yearfirst=1] -> month must be in 1..12 - # 2.5.3 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00 - - # OK - # 2.5.1 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 - # 2.5.2 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 - # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 - - import dateutil - is_lt_253 = dateutil.__version__ < LooseVersion('2.5.3') - - # str : dayfirst, yearfirst, expected - cases = {'10-11-12': [(False, False, - datetime.datetime(2012, 10, 11)), - (True, False, - datetime.datetime(2012, 11, 10)), - (False, True, - datetime.datetime(2010, 11, 12)), - (True, True, - datetime.datetime(2010, 12, 11))], - '20/12/21': [(False, False, -
datetime.datetime(2021, 12, 20)), - (True, False, - datetime.datetime(2021, 12, 20)), - (False, True, - datetime.datetime(2020, 12, 21)), - (True, True, - datetime.datetime(2020, 12, 21))]} - - from dateutil.parser import parse - for date_str, values in compat.iteritems(cases): - for dayfirst, yearfirst, expected in values: - - # odd comparisons across version - # let's just skip - if dayfirst and yearfirst and is_lt_253: - continue - - # compare with dateutil result - dateutil_result = parse(date_str, dayfirst=dayfirst, - yearfirst=yearfirst) - self.assertEqual(dateutil_result, expected) - - result1, _, _ = tools.parse_time_string(date_str, - dayfirst=dayfirst, - yearfirst=yearfirst) - - # we don't support dayfirst/yearfirst here: - if not dayfirst and not yearfirst: - result2 = Timestamp(date_str) - self.assertEqual(result2, expected) - - result3 = to_datetime(date_str, dayfirst=dayfirst, - yearfirst=yearfirst) - - result4 = DatetimeIndex([date_str], dayfirst=dayfirst, - yearfirst=yearfirst)[0] - - self.assertEqual(result1, expected) - self.assertEqual(result3, expected) - self.assertEqual(result4, expected) - - def test_parsers_timestring(self): - tm._skip_if_no_dateutil() - from dateutil.parser import parse - - # must be the same as dateutil result - cases = {'10:15': (parse('10:15'), datetime.datetime(1, 1, 1, 10, 15)), - '9:05': (parse('9:05'), datetime.datetime(1, 1, 1, 9, 5))} - - for date_str, (exp_now, exp_def) in compat.iteritems(cases): - result1, _, _ = tools.parse_time_string(date_str) - result2 = to_datetime(date_str) - result3 = to_datetime([date_str]) - result4 = Timestamp(date_str) - result5 = DatetimeIndex([date_str])[0] - # parse time string return time string based on default date - # others are not, and can't be changed because it is used in - # time series plot - self.assertEqual(result1, exp_def) - self.assertEqual(result2, exp_now) - self.assertEqual(result3, exp_now) - self.assertEqual(result4, exp_now) - self.assertEqual(result5, exp_now) - - def test_parsers_time(self): - # GH11818 - _skip_if_has_locale() - strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500", - "2:15:00pm", "021500pm", datetime.time(14, 15)] - expected = datetime.time(14, 15) - - for time_string in strings: - self.assertEqual(tools.to_time(time_string), expected) - - new_string = "14.15" - self.assertRaises(ValueError, tools.to_time, new_string) - self.assertEqual(tools.to_time(new_string, format="%H.%M"), expected) - - arg = ["14:15", "20:20"] - expected_arr = [datetime.time(14, 15), datetime.time(20, 20)] - self.assertEqual(tools.to_time(arg), expected_arr) - self.assertEqual(tools.to_time(arg, format="%H:%M"), expected_arr) - self.assertEqual(tools.to_time(arg, infer_time_format=True), - expected_arr) - self.assertEqual(tools.to_time(arg, format="%I:%M%p", errors="coerce"), - [None, None]) - - res = tools.to_time(arg, format="%I:%M%p", errors="ignore") - self.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) - - with tm.assertRaises(ValueError): - tools.to_time(arg, format="%I:%M%p", errors="raise") - - self.assert_series_equal(tools.to_time(Series(arg, name="test")), - Series(expected_arr, name="test")) - - res = tools.to_time(np.array(arg)) - self.assertIsInstance(res, list) - self.assert_equal(res, expected_arr) - - def test_parsers_monthfreq(self): - cases = {'201101': datetime.datetime(2011, 1, 1, 0, 0), - '200005': datetime.datetime(2000, 5, 1, 0, 0)} - - for date_str, expected in compat.iteritems(cases): - result1, _, _ = tools.parse_time_string(date_str, 
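# A minimal sketch of the dayfirst/yearfirst matrix tested above for the
# ambiguous string '10-11-12'; expected values mirror the cases dict
# (assumes dateutil >= 2.5.3, per the version notes in the test).
from pandas import Timestamp, to_datetime

assert to_datetime('10-11-12') == Timestamp('2012-10-11')
assert to_datetime('10-11-12', dayfirst=True) == Timestamp('2012-11-10')
assert to_datetime('10-11-12', yearfirst=True) == Timestamp('2010-11-12')
assert (to_datetime('10-11-12', dayfirst=True, yearfirst=True) ==
        Timestamp('2010-12-11'))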
freq='M') - self.assertEqual(result1, expected) - - def test_parsers_quarterly_with_freq(self): - msg = ('Incorrect quarterly string is given, quarter ' - 'must be between 1 and 4: 2013Q5') - with tm.assertRaisesRegexp(tslib.DateParseError, msg): - tools.parse_time_string('2013Q5') - - # GH 5418 - msg = ('Unable to retrieve month information from given freq: ' - 'INVLD-L-DEC-SAT') - with tm.assertRaisesRegexp(tslib.DateParseError, msg): - tools.parse_time_string('2013Q1', freq='INVLD-L-DEC-SAT') - - cases = {('2013Q2', None): datetime.datetime(2013, 4, 1), - ('2013Q2', 'A-APR'): datetime.datetime(2012, 8, 1), - ('2013-Q2', 'A-DEC'): datetime.datetime(2013, 4, 1)} - - for (date_str, freq), exp in compat.iteritems(cases): - result, _, _ = tools.parse_time_string(date_str, freq=freq) - self.assertEqual(result, exp) - - def test_parsers_timezone_minute_offsets_roundtrip(self): - # GH11708 - base = to_datetime("2013-01-01 00:00:00") - dt_strings = [ - ('2013-01-01 05:45+0545', - "Asia/Katmandu", - "Timestamp('2013-01-01 05:45:00+0545', tz='Asia/Katmandu')"), - ('2013-01-01 05:30+0530', - "Asia/Kolkata", - "Timestamp('2013-01-01 05:30:00+0530', tz='Asia/Kolkata')") - ] - - for dt_string, tz, dt_string_repr in dt_strings: - dt_time = to_datetime(dt_string) - self.assertEqual(base, dt_time) - converted_time = dt_time.tz_localize('UTC').tz_convert(tz) - self.assertEqual(dt_string_repr, repr(converted_time)) - - def test_parsers_iso8601(self): - # GH 12060 - # test only the iso parser - flexibility to different - # separators and leading 0s - # Timestamp construction falls back to dateutil - cases = {'2011-01-02': datetime.datetime(2011, 1, 2), - '2011-1-2': datetime.datetime(2011, 1, 2), - '2011-01': datetime.datetime(2011, 1, 1), - '2011-1': datetime.datetime(2011, 1, 1), - '2011 01 02': datetime.datetime(2011, 1, 2), - '2011.01.02': datetime.datetime(2011, 1, 2), - '2011/01/02': datetime.datetime(2011, 1, 2), - '2011\\01\\02': datetime.datetime(2011, 1, 2), - '2013-01-01 05:30:00': datetime.datetime(2013, 1, 1, 5, 30), - '2013-1-1 5:30:00': datetime.datetime(2013, 1, 1, 5, 30)} - for date_str, exp in compat.iteritems(cases): - actual = tslib._test_parse_iso8601(date_str) - self.assertEqual(actual, exp) - - # separators must all match - YYYYMM not valid - invalid_cases = ['2011-01/02', '2011^11^11', - '201401', '201111', '200101', - # mixed separated and unseparated - '2005-0101', '200501-01', - '20010101 12:3456', '20010101 1234:56', - # HHMMSS must have two digits in each component - # if unseparated - '20010101 1', '20010101 123', '20010101 12345', - '20010101 12345Z', - # wrong separator for HHMMSS - '2001-01-01 12-34-56'] - for date_str in invalid_cases: - with tm.assertRaises(ValueError): - tslib._test_parse_iso8601(date_str) - # If no ValueError raised, let me know which case failed.
- raise Exception(date_str) - - -class TestArrayToDatetime(tm.TestCase): - - def test_parsing_valid_dates(self): - arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr), - np_array_datetime64_compat( - [ - '2013-01-01T00:00:00.000000000-0000', - '2013-01-02T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr), - np_array_datetime64_compat( - [ - '2013-09-16T00:00:00.000000000-0000', - '2013-09-17T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - def test_number_looking_strings_not_into_datetime(self): - # #4601 - # These strings don't look like datetimes so they shouldn't be - # attempted to be converted - arr = np.array(['-352.737091', '183.575577'], dtype=object) - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - arr = np.array(['1', '2', '3', '4', '5'], dtype=object) - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - def test_coercing_dates_outside_of_datetime64_ns_bounds(self): - invalid_dates = [ - datetime.date(1000, 1, 1), - datetime.datetime(1000, 1, 1), - '1000-01-01', - 'Jan 1, 1000', - np.datetime64('1000-01-01'), - ] - - for invalid_date in invalid_dates: - self.assertRaises(ValueError, - tslib.array_to_datetime, - np.array( - [invalid_date], dtype='object'), - errors='raise', ) - self.assert_numpy_array_equal( - tslib.array_to_datetime( - np.array([invalid_date], dtype='object'), - errors='coerce'), - np.array([tslib.iNaT], dtype='M8[ns]') - ) - - arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='coerce'), - np_array_datetime64_compat( - [ - tslib.iNaT, - '2000-01-01T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - def test_coerce_of_invalid_datetimes(self): - arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) - - # Without coercing, the presence of any invalid dates prevents - # any values from being converted - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - # With coercing, the invalid dates becomes iNaT - self.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='coerce'), - np_array_datetime64_compat( - [ - '2013-01-01T00:00:00.000000000-0000', - tslib.iNaT, - tslib.iNaT - ], - dtype='M8[ns]' - ) - ) - - def test_parsing_timezone_offsets(self): - # All of these datetime strings with offsets are equivalent - # to the same datetime after the timezone offset is added - dt_strings = [ - '01-01-2013 08:00:00+08:00', - '2013-01-01T08:00:00.000000000+0800', - '2012-12-31T16:00:00.000000000-0800', - '12-31-2012 23:00:00-01:00' - ] - - expected_output = tslib.array_to_datetime(np.array( - ['01-01-2013 00:00:00'], dtype=object)) - - for dt_string in dt_strings: - self.assert_numpy_array_equal( - tslib.array_to_datetime( - np.array([dt_string], dtype=object) - ), - expected_output - ) - - -class TestTslib(tm.TestCase): - - def test_intraday_conversion_factors(self): - self.assertEqual(period_asfreq( - 1, get_freq('D'), get_freq('H'), False), 24) - self.assertEqual(period_asfreq( - 1, get_freq('D'), get_freq('T'), False), 1440) - self.assertEqual(period_asfreq( - 1, get_freq('D'), get_freq('S'), False), 86400) - self.assertEqual(period_asfreq(1, get_freq( - 'D'), get_freq('L'), False), 86400000) - self.assertEqual(period_asfreq(1, get_freq( - 
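# A minimal sketch of the intraday conversion factors asserted above:
# period_asfreq converts one period to a finer frequency by the exact unit
# ratio. period_asfreq and get_freq are pandas-internal (the imports mirror
# the header of the deleted test file); the calls mirror the test body.
from pandas._period import period_asfreq
from pandas.tseries.frequencies import get_freq

assert period_asfreq(1, get_freq('D'), get_freq('H'), False) == 24
assert period_asfreq(1, get_freq('H'), get_freq('T'), False) == 60
assert period_asfreq(1, get_freq('T'), get_freq('S'), False) == 60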
'D'), get_freq('U'), False), 86400000000) - self.assertEqual(period_asfreq(1, get_freq( - 'D'), get_freq('N'), False), 86400000000000) - - self.assertEqual(period_asfreq( - 1, get_freq('H'), get_freq('T'), False), 60) - self.assertEqual(period_asfreq( - 1, get_freq('H'), get_freq('S'), False), 3600) - self.assertEqual(period_asfreq(1, get_freq('H'), - get_freq('L'), False), 3600000) - self.assertEqual(period_asfreq(1, get_freq( - 'H'), get_freq('U'), False), 3600000000) - self.assertEqual(period_asfreq(1, get_freq( - 'H'), get_freq('N'), False), 3600000000000) - - self.assertEqual(period_asfreq( - 1, get_freq('T'), get_freq('S'), False), 60) - self.assertEqual(period_asfreq( - 1, get_freq('T'), get_freq('L'), False), 60000) - self.assertEqual(period_asfreq(1, get_freq( - 'T'), get_freq('U'), False), 60000000) - self.assertEqual(period_asfreq(1, get_freq( - 'T'), get_freq('N'), False), 60000000000) - - self.assertEqual(period_asfreq( - 1, get_freq('S'), get_freq('L'), False), 1000) - self.assertEqual(period_asfreq(1, get_freq('S'), - get_freq('U'), False), 1000000) - self.assertEqual(period_asfreq(1, get_freq( - 'S'), get_freq('N'), False), 1000000000) - - self.assertEqual(period_asfreq( - 1, get_freq('L'), get_freq('U'), False), 1000) - self.assertEqual(period_asfreq(1, get_freq('L'), - get_freq('N'), False), 1000000) - - self.assertEqual(period_asfreq( - 1, get_freq('U'), get_freq('N'), False), 1000) - - def test_period_ordinal_start_values(self): - # information for 1.1.1970 - self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, - get_freq('A'))) - self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, - get_freq('M'))) - self.assertEqual(1, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, - get_freq('W'))) - self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, - get_freq('D'))) - self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, - get_freq('B'))) - - def test_period_ordinal_week(self): - self.assertEqual(1, period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, - get_freq('W'))) - self.assertEqual(2, period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, - get_freq('W'))) - - self.assertEqual(2284, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, - get_freq('W'))) - self.assertEqual(2285, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, - get_freq('W'))) - - def test_period_ordinal_business_day(self): - # Thursday - self.assertEqual(11415, period_ordinal(2013, 10, 3, 0, 0, 0, 0, 0, - get_freq('B'))) - # Friday - self.assertEqual(11416, period_ordinal(2013, 10, 4, 0, 0, 0, 0, 0, - get_freq('B'))) - # Saturday - self.assertEqual(11417, period_ordinal(2013, 10, 5, 0, 0, 0, 0, 0, - get_freq('B'))) - # Sunday - self.assertEqual(11417, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, - get_freq('B'))) - # Monday - self.assertEqual(11417, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, - get_freq('B'))) - # Tuesday - self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, - get_freq('B'))) - - def test_tslib_tz_convert(self): - def compare_utc_to_local(tz_didx, utc_didx): - f = lambda x: tslib.tz_convert_single(x, 'UTC', tz_didx.tz) - result = tslib.tz_convert(tz_didx.asi8, 'UTC', tz_didx.tz) - result_single = np.vectorize(f)(tz_didx.asi8) - self.assert_numpy_array_equal(result, result_single) - - def compare_local_to_utc(tz_didx, utc_didx): - f = lambda x: tslib.tz_convert_single(x, tz_didx.tz, 'UTC') - result = tslib.tz_convert(utc_didx.asi8, tz_didx.tz, 'UTC') - result_single = np.vectorize(f)(utc_didx.asi8) - self.assert_numpy_array_equal(result, result_single) - - for tz in ['UTC', 'Asia/Tokyo', 
'US/Eastern', 'Europe/Moscow']: - # US: 2014-03-09 - 2014-11-11 - # MOSCOW: 2014-10-26 / 2014-12-31 - tz_didx = date_range('2014-03-01', '2015-01-10', freq='H', tz=tz) - utc_didx = date_range('2014-03-01', '2015-01-10', freq='H') - compare_utc_to_local(tz_didx, utc_didx) - # local tz to UTC can be differ in hourly (or higher) freqs because - # of DST - compare_local_to_utc(tz_didx, utc_didx) - - tz_didx = date_range('2000-01-01', '2020-01-01', freq='D', tz=tz) - utc_didx = date_range('2000-01-01', '2020-01-01', freq='D') - compare_utc_to_local(tz_didx, utc_didx) - compare_local_to_utc(tz_didx, utc_didx) - - tz_didx = date_range('2000-01-01', '2100-01-01', freq='A', tz=tz) - utc_didx = date_range('2000-01-01', '2100-01-01', freq='A') - compare_utc_to_local(tz_didx, utc_didx) - compare_local_to_utc(tz_didx, utc_didx) - - # Check empty array - result = tslib.tz_convert(np.array([], dtype=np.int64), - tslib.maybe_get_tz('US/Eastern'), - tslib.maybe_get_tz('Asia/Tokyo')) - self.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) - - # Check all-NaT array - result = tslib.tz_convert(np.array([tslib.iNaT], dtype=np.int64), - tslib.maybe_get_tz('US/Eastern'), - tslib.maybe_get_tz('Asia/Tokyo')) - self.assert_numpy_array_equal(result, np.array( - [tslib.iNaT], dtype=np.int64)) - - def test_shift_months(self): - s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp( - '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( - '2000-02-29'), Timestamp('2000-12-31')]) - for years in [-1, 0, 1]: - for months in [-2, 0, 2]: - actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + - months)) - expected = DatetimeIndex([x + offsets.DateOffset( - years=years, months=months) for x in s]) - tm.assert_index_equal(actual, expected) - - def test_round(self): - stamp = Timestamp('2000-01-05 05:09:15.13') - - def _check_round(freq, expected): - result = stamp.round(freq=freq) - self.assertEqual(result, expected) - - for freq, expected in [('D', Timestamp('2000-01-05 00:00:00')), - ('H', Timestamp('2000-01-05 05:00:00')), - ('S', Timestamp('2000-01-05 05:09:15'))]: - _check_round(freq, expected) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - stamp.round('foo') diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py deleted file mode 100644 index 3feffe924c291..0000000000000 --- a/pandas/tseries/tests/test_util.py +++ /dev/null @@ -1,126 +0,0 @@ -from pandas.compat import range - -import numpy as np - -from pandas import Series, date_range -import pandas.util.testing as tm - -from datetime import datetime, date - -from pandas.tseries.tools import normalize_date -from pandas.tseries.util import pivot_annual, isleapyear - - -class TestPivotAnnual(tm.TestCase): - """ - New pandas of scikits.timeseries pivot_annual - """ - - def test_daily(self): - rng = date_range('1/1/2000', '12/31/2004', freq='D') - ts = Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - annual = pivot_annual(ts, 'D') - - doy = ts.index.dayofyear - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 - - for i in range(1, 367): - subset = ts[doy == i] - subset.index = [x.year for x in subset.index] - - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - self.assertEqual(result.name, i) - - # check leap days - leaps = ts[(ts.index.month == 2) & 
(ts.index.day == 29)] - day = leaps.index.dayofyear[0] - leaps.index = leaps.index.year - leaps.name = 60 - tm.assert_series_equal(annual[day].dropna(), leaps) - - def test_hourly(self): - rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24), - freq='H') - data_hourly = np.random.randint(100, 350, rng_hourly.size) - ts_hourly = Series(data_hourly, index=rng_hourly) - - grouped = ts_hourly.groupby(ts_hourly.index.year) - hoy = grouped.apply(lambda x: x.reset_index(drop=True)) - hoy = hoy.index.droplevel(0).values - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 - hoy += 1 - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - annual = pivot_annual(ts_hourly) - - ts_hourly = ts_hourly.astype(float) - for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: - subset = ts_hourly[hoy == i] - subset.index = [x.year for x in subset.index] - - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - self.assertEqual(result.name, i) - - leaps = ts_hourly[(ts_hourly.index.month == 2) & ( - ts_hourly.index.day == 29) & (ts_hourly.index.hour == 0)] - hour = leaps.index.dayofyear[0] * 24 - 23 - leaps.index = leaps.index.year - leaps.name = 1417 - tm.assert_series_equal(annual[hour].dropna(), leaps) - - def test_weekly(self): - pass - - def test_monthly(self): - rng = date_range('1/1/2000', '12/31/2004', freq='M') - ts = Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - annual = pivot_annual(ts, 'M') - - month = ts.index.month - for i in range(1, 13): - subset = ts[month == i] - subset.index = [x.year for x in subset.index] - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - self.assertEqual(result.name, i) - - def test_period_monthly(self): - pass - - def test_period_daily(self): - pass - - def test_period_weekly(self): - pass - - def test_isleapyear_deprecate(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(isleapyear(2000)) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertFalse(isleapyear(2001)) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(isleapyear(2004)) - - -def test_normalize_date(): - value = date(2012, 9, 7) - - result = normalize_date(value) - assert (result == datetime(2012, 9, 7)) - - value = datetime(2012, 9, 7, 12) - - result = normalize_date(value) - assert (result == datetime(2012, 9, 7)) From c087c7ad3132d1834cc76de55adb69c6014f35cc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 7 Feb 2017 17:58:45 -0500 Subject: [PATCH 108/338] TST: more tseries/tests reorg --- .../indexes/datetimes/test_date_range.py | 444 ++---------------- pandas/tests/indexes/datetimes/test_ops.py | 195 +++++++- pandas/tests/indexes/datetimes/test_setops.py | 224 +++++++++ pandas/tests/indexes/datetimes/test_tools.py | 108 +---- pandas/tools/tests/test_pivot.py | 106 ++++- setup.py | 2 + 6 files changed, 557 insertions(+), 522 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 8dab10269f76d..80664ce246bf8 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -1,22 +1,30 @@ +""" +test date_range, bdate_range, cdate_range +construction from the convenience 
range functions +""" + import numpy as np from datetime import datetime, timedelta, time import pandas as pd import pandas.util.testing as tm from pandas import compat -from pandas.core import common as com -from pandas.util.testing import assertRaisesRegexp from pandas.tseries.index import bdate_range, cdate_range -from pandas import date_range, offsets, DatetimeIndex, Timestamp, Index -from pandas.tseries.offsets import (generate_range, CDay, BDay, Minute, - BMonthEnd, DateOffset, MonthEnd) +from pandas import date_range, offsets, DatetimeIndex, Timestamp +from pandas.tseries.offsets import (generate_range, CDay, BDay, + DateOffset, MonthEnd) from pandas.tests.series.common import TestData START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) -class TestTimeSeries(TestData, tm.TestCase): +def eq_gen_range(kwargs, expected): + rng = generate_range(**kwargs) + assert (np.array_equal(list(rng), expected)) + + +class TestDateRanges(TestData, tm.TestCase): def test_date_range_gen_error(self): rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') @@ -137,11 +145,6 @@ def test_catch_infinite_loop(self): datetime(2011, 11, 12), freq=offset) -def eq_gen_range(kwargs, expected): - rng = generate_range(**kwargs) - assert (np.array_equal(list(rng), expected)) - - class TestGenRangeGeneration(tm.TestCase): def test_generate(self): @@ -191,7 +194,8 @@ def test_precision_finer_than_offset(self): self.assert_index_equal(result2, expected2) -class TestDateRange(tm.TestCase): +class TestBusinessDateRange(tm.TestCase): + def setUp(self): self.rng = bdate_range(START, END) @@ -206,28 +210,31 @@ def test_naive_aware_conflicts(self): naive = bdate_range(START, END, freq=BDay(), tz=None) aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong") - assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", naive.join, aware) - assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", aware.join, naive) + self.assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", + naive.join, aware) + self.assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", + aware.join, naive) def test_cached_range(self): DatetimeIndex._cached_range(START, END, offset=BDay()) DatetimeIndex._cached_range(START, periods=20, offset=BDay()) DatetimeIndex._cached_range(end=START, periods=20, offset=BDay()) - assertRaisesRegexp(TypeError, "offset", DatetimeIndex._cached_range, - START, END) + self.assertRaisesRegexp(TypeError, "offset", + DatetimeIndex._cached_range, + START, END) - assertRaisesRegexp(TypeError, "specify period", - DatetimeIndex._cached_range, START, - offset=BDay()) + self.assertRaisesRegexp(TypeError, "specify period", + DatetimeIndex._cached_range, START, + offset=BDay()) - assertRaisesRegexp(TypeError, "specify period", - DatetimeIndex._cached_range, end=END, - offset=BDay()) + self.assertRaisesRegexp(TypeError, "specify period", + DatetimeIndex._cached_range, end=END, + offset=BDay()) - assertRaisesRegexp(TypeError, "start or end", - DatetimeIndex._cached_range, periods=20, - offset=BDay()) + self.assertRaisesRegexp(TypeError, "start or end", + DatetimeIndex._cached_range, periods=20, + offset=BDay()) def test_cached_range_bug(self): rng = date_range('2010-09-01 05:00:00', periods=50, @@ -236,192 +243,16 @@ def test_cached_range_bug(self): self.assertEqual(rng[0], datetime(2010, 9, 1, 5)) def test_timezone_comparaison_bug(self): + # smoke test start = Timestamp('20130220 10:00', tz='US/Eastern') - try: - date_range(start, periods=2, tz='US/Eastern') - except AssertionError: - self.fail() + result = date_range(start, 
periods=2, tz='US/Eastern') + self.assertEqual(len(result), 2) def test_timezone_comparaison_assert(self): start = Timestamp('20130220 10:00', tz='US/Eastern') self.assertRaises(AssertionError, date_range, start, periods=2, tz='Europe/Berlin') - def test_comparison(self): - d = self.rng[10] - - comp = self.rng > d - self.assertTrue(comp[11]) - self.assertFalse(comp[9]) - - def test_copy(self): - cp = self.rng.copy() - repr(cp) - self.assert_index_equal(cp, self.rng) - - def test_repr(self): - # only really care that it works - repr(self.rng) - - def test_getitem(self): - smaller = self.rng[:5] - exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) - self.assert_index_equal(smaller, exp) - - self.assertEqual(smaller.offset, self.rng.offset) - - sliced = self.rng[::5] - self.assertEqual(sliced.offset, BDay() * 5) - - fancy_indexed = self.rng[[4, 3, 2, 1, 0]] - self.assertEqual(len(fancy_indexed), 5) - tm.assertIsInstance(fancy_indexed, DatetimeIndex) - self.assertIsNone(fancy_indexed.freq) - - # 32-bit vs. 64-bit platforms - self.assertEqual(self.rng[4], self.rng[np.int_(4)]) - - def test_getitem_matplotlib_hackaround(self): - values = self.rng[:, None] - expected = self.rng.values[:, None] - self.assert_numpy_array_equal(values, expected) - - def test_shift(self): - shifted = self.rng.shift(5) - self.assertEqual(shifted[0], self.rng[5]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(-5) - self.assertEqual(shifted[5], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(0) - self.assertEqual(shifted[0], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - rng = date_range(START, END, freq=BMonthEnd()) - shifted = rng.shift(1, freq=BDay()) - self.assertEqual(shifted[0], rng[0] + BDay()) - - def test_pickle_unpickle(self): - unpickled = self.round_trip_pickle(self.rng) - self.assertIsNotNone(unpickled.offset) - - def test_union(self): - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_union = left.union(right) - tm.assertIsInstance(the_union, Index) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # order does not matter - tm.assert_index_equal(right.union(left), the_union) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_union = self.rng.union(rng) - tm.assertIsInstance(the_union, DatetimeIndex) - - def test_outer_join(self): - # should just behave as union - - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_join = self.rng.join(rng, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - def test_union_not_cacheable(self): - rng = 
date_range('1/1/2000', periods=50, freq=Minute()) - rng1 = rng[10:] - rng2 = rng[:25] - the_union = rng1.union(rng2) - self.assert_index_equal(the_union, rng) - - rng1 = rng[10:] - rng2 = rng[15:35] - the_union = rng1.union(rng2) - expected = rng[10:] - self.assert_index_equal(the_union, expected) - - def test_intersection(self): - rng = date_range('1/1/2000', periods=50, freq=Minute()) - rng1 = rng[10:] - rng2 = rng[:25] - the_int = rng1.intersection(rng2) - expected = rng[10:25] - self.assert_index_equal(the_int, expected) - tm.assertIsInstance(the_int, DatetimeIndex) - self.assertEqual(the_int.offset, rng.offset) - - the_int = rng1.intersection(rng2.view(DatetimeIndex)) - self.assert_index_equal(the_int, expected) - - # non-overlapping - the_int = rng[:10].intersection(rng[10:]) - expected = DatetimeIndex([]) - self.assert_index_equal(the_int, expected) - - def test_intersection_bug(self): - # GH #771 - a = bdate_range('11/30/2011', '12/31/2011') - b = bdate_range('12/10/2011', '12/20/2011') - result = a.intersection(b) - self.assert_index_equal(result, b) - - def test_summary(self): - self.rng.summary() - self.rng[2:2].summary() - - def test_summary_pytz(self): - tm._skip_if_no_pytz() - import pytz - bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() - - def test_summary_dateutil(self): - tm._skip_if_no_dateutil() - import dateutil - bdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() - def test_misc(self): end = datetime(2009, 5, 13) dr = bdate_range(end=end, periods=20) @@ -443,26 +274,6 @@ def test_date_parse_failure(self): self.assertRaises(ValueError, bdate_range, badly_formed_date, badly_formed_date) - def test_equals(self): - self.assertFalse(self.rng.equals(list(self.rng))) - - def test_identical(self): - t1 = self.rng.copy() - t2 = self.rng.copy() - self.assertTrue(t1.identical(t2)) - - # name - t1 = t1.rename('foo') - self.assertTrue(t1.equals(t2)) - self.assertFalse(t1.identical(t2)) - t2 = t2.rename('foo') - self.assertTrue(t1.identical(t2)) - - # freq - t2v = Index(t2.values) - self.assertTrue(t1.equals(t2v)) - self.assertFalse(t1.identical(t2v)) - def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range('12/5/2011', '12/5/2011') @@ -560,43 +371,6 @@ def test_range_tz_dateutil(self): self.assertTrue(dr[0] == start) self.assertTrue(dr[2] == end) - def test_month_range_union_tz_pytz(self): - tm._skip_if_no_pytz() - from pytz import timezone - tz = timezone('US/Eastern') - - early_start = datetime(2011, 1, 1) - early_end = datetime(2011, 3, 1) - - late_start = datetime(2011, 3, 1) - late_end = datetime(2011, 5, 1) - - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) - - early_dr.union(late_dr) - - def test_month_range_union_tz_dateutil(self): - tm._skip_if_windows_python_3() - tm._skip_if_no_dateutil() - from pandas.tslib import _dateutil_gettz as timezone - tz = timezone('US/Eastern') - - early_start = datetime(2011, 1, 1) - early_end = datetime(2011, 3, 1) - - late_start = datetime(2011, 3, 1) - late_end = datetime(2011, 5, 1) - - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) - - early_dr.union(late_dr) - def test_range_closed(self): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -735,151 +509,6 @@ def test_cached_range(self): self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, 
freq=CDay()) - def test_comparison(self): - d = self.rng[10] - - comp = self.rng > d - self.assertTrue(comp[11]) - self.assertFalse(comp[9]) - - def test_copy(self): - cp = self.rng.copy() - repr(cp) - self.assert_index_equal(cp, self.rng) - - def test_repr(self): - # only really care that it works - repr(self.rng) - - def test_getitem(self): - smaller = self.rng[:5] - exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) - self.assert_index_equal(smaller, exp) - self.assertEqual(smaller.offset, self.rng.offset) - - sliced = self.rng[::5] - self.assertEqual(sliced.offset, CDay() * 5) - - fancy_indexed = self.rng[[4, 3, 2, 1, 0]] - self.assertEqual(len(fancy_indexed), 5) - tm.assertIsInstance(fancy_indexed, DatetimeIndex) - self.assertIsNone(fancy_indexed.freq) - - # 32-bit vs. 64-bit platforms - self.assertEqual(self.rng[4], self.rng[np.int_(4)]) - - def test_getitem_matplotlib_hackaround(self): - values = self.rng[:, None] - expected = self.rng.values[:, None] - self.assert_numpy_array_equal(values, expected) - - def test_shift(self): - - shifted = self.rng.shift(5) - self.assertEqual(shifted[0], self.rng[5]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(-5) - self.assertEqual(shifted[5], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - shifted = self.rng.shift(0) - self.assertEqual(shifted[0], self.rng[0]) - self.assertEqual(shifted.offset, self.rng.offset) - - with tm.assert_produces_warning(com.PerformanceWarning): - rng = date_range(START, END, freq=BMonthEnd()) - shifted = rng.shift(1, freq=CDay()) - self.assertEqual(shifted[0], rng[0] + CDay()) - - def test_pickle_unpickle(self): - unpickled = self.round_trip_pickle(self.rng) - self.assertIsNotNone(unpickled.offset) - - def test_union(self): - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_union = left.union(right) - tm.assertIsInstance(the_union, Index) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_union = left.union(right) - tm.assertIsInstance(the_union, DatetimeIndex) - - # order does not matter - self.assert_index_equal(right.union(left), the_union) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_union = self.rng.union(rng) - tm.assertIsInstance(the_union, DatetimeIndex) - - def test_outer_join(self): - # should just behave as union - - # overlapping - left = self.rng[:10] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # non-overlapping, gap in middle - left = self.rng[:5] - right = self.rng[10:] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - # non-overlapping, no gap - left = self.rng[:5] - right = self.rng[5:10] - - the_join = left.join(right, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - - # overlapping, but different offset - rng = date_range(START, END, freq=BMonthEnd()) - - the_join = self.rng.join(rng, how='outer') - tm.assertIsInstance(the_join, DatetimeIndex) - self.assertIsNone(the_join.freq) - - def test_intersection_bug(self): - # GH #771 - a = cdate_range('11/30/2011', '12/31/2011') - b = cdate_range('12/10/2011', '12/20/2011') - result = a.intersection(b) - self.assert_index_equal(result, b) - - def test_summary(self): - 
self.rng.summary() - self.rng[2:2].summary() - - def test_summary_pytz(self): - tm._skip_if_no_pytz() - import pytz - cdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() - - def test_summary_dateutil(self): - tm._skip_if_no_dateutil() - import dateutil - cdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() - def test_misc(self): end = datetime(2009, 5, 13) dr = cdate_range(end=end, periods=20) @@ -901,9 +530,6 @@ def test_date_parse_failure(self): self.assertRaises(ValueError, cdate_range, badly_formed_date, badly_formed_date) - def test_equals(self): - self.assertFalse(self.rng.equals(list(self.rng))) - def test_daterange_bug_456(self): # GH #456 rng1 = cdate_range('12/5/2011', '12/5/2011') diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index c7cdcd9318a0e..63bf07ec041d3 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -6,13 +6,17 @@ import pandas.tslib as tslib import pandas.util.testing as tm from pandas.core.common import PerformanceWarning +from pandas.tseries.index import cdate_range from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, date_range, TimedeltaIndex, _np_version_under1p10, Index, - datetime, Float64Index, offsets) - + datetime, Float64Index, offsets, bdate_range) +from pandas.tseries.offsets import BMonthEnd, CDay, BDay from pandas.tests.test_base import Ops +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + class TestDatetimeIndexOps(Ops): tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', 'dateutil/US/Pacific'] @@ -1071,9 +1075,6 @@ def test_datetime64_with_DateOffset(self): assert_func(klass([x - op for x in s]), s - op) assert_func(klass([op + x for x in s]), op + s) - -class TestTslib(tm.TestCase): - def test_shift_months(self): s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp( '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( @@ -1085,3 +1086,187 @@ def test_shift_months(self): expected = DatetimeIndex([x + offsets.DateOffset( years=years, months=months) for x in s]) tm.assert_index_equal(actual, expected) + + +class TestBusinessDatetimeIndex(tm.TestCase): + + def setUp(self): + self.rng = bdate_range(START, END) + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assertTrue(comp[11]) + self.assertFalse(comp[9]) + + def test_pickle_unpickle(self): + unpickled = self.round_trip_pickle(self.rng) + self.assertIsNotNone(unpickled.offset) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assert_index_equal(cp, self.rng) + + def test_repr(self): + # only really care that it works + repr(self.rng) + + def test_getitem(self): + smaller = self.rng[:5] + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) + + self.assertEqual(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEqual(sliced.offset, BDay() * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEqual(len(fancy_indexed), 5) + tm.assertIsInstance(fancy_indexed, DatetimeIndex) + self.assertIsNone(fancy_indexed.freq) + + # 32-bit vs. 
64-bit platforms + self.assertEqual(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_numpy_array_equal(values, expected) + + def test_shift(self): + shifted = self.rng.shift(5) + self.assertEqual(shifted[0], self.rng[5]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEqual(shifted[5], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEqual(shifted[0], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=BDay()) + self.assertEqual(shifted[0], rng[0] + BDay()) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + + def test_summary_pytz(self): + tm._skip_if_no_pytz() + import pytz + bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_summary_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + bdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) + + def test_identical(self): + t1 = self.rng.copy() + t2 = self.rng.copy() + self.assertTrue(t1.identical(t2)) + + # name + t1 = t1.rename('foo') + self.assertTrue(t1.equals(t2)) + self.assertFalse(t1.identical(t2)) + t2 = t2.rename('foo') + self.assertTrue(t1.identical(t2)) + + # freq + t2v = Index(t2.values) + self.assertTrue(t1.equals(t2v)) + self.assertFalse(t1.identical(t2v)) + + +class TestCustomDatetimeIndex(tm.TestCase): + + def setUp(self): + self.rng = cdate_range(START, END) + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assertTrue(comp[11]) + self.assertFalse(comp[9]) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assert_index_equal(cp, self.rng) + + def test_repr(self): + # only really care that it works + repr(self.rng) + + def test_getitem(self): + smaller = self.rng[:5] + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) + self.assertEqual(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEqual(sliced.offset, CDay() * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEqual(len(fancy_indexed), 5) + tm.assertIsInstance(fancy_indexed, DatetimeIndex) + self.assertIsNone(fancy_indexed.freq) + + # 32-bit vs. 
64-bit platforms + self.assertEqual(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_numpy_array_equal(values, expected) + + def test_shift(self): + + shifted = self.rng.shift(5) + self.assertEqual(shifted[0], self.rng[5]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEqual(shifted[5], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEqual(shifted[0], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + with tm.assert_produces_warning(PerformanceWarning): + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=CDay()) + self.assertEqual(shifted[0], rng[0] + CDay()) + + def test_pickle_unpickle(self): + unpickled = self.round_trip_pickle(self.rng) + self.assertIsNotNone(unpickled.offset) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + + def test_summary_pytz(self): + tm._skip_if_no_pytz() + import pytz + cdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_summary_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + cdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 7777de869bb20..7da660a956e23 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -4,8 +4,12 @@ import pandas as pd import pandas.util.testing as tm +from pandas.tseries.index import cdate_range from pandas import (DatetimeIndex, date_range, Series, bdate_range, DataFrame, Int64Index, Index) +from pandas.tseries.offsets import Minute, BMonthEnd, MonthEnd + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) class TestDatetimeIndex(tm.TestCase): @@ -185,3 +189,223 @@ def test_datetimeindex_union_join_empty(self): result = dti.join(empty) tm.assertIsInstance(result, DatetimeIndex) + + +class TestBusinessDatetimeIndex(tm.TestCase): + + def setUp(self): + self.rng = bdate_range(START, END) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + tm.assertIsInstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # order does not matter + tm.assert_index_equal(right.union(left), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = self.rng.union(rng) + tm.assertIsInstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = 
left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_join = self.rng.join(rng, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + def test_union_not_cacheable(self): + rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_union = rng1.union(rng2) + self.assert_index_equal(the_union, rng) + + rng1 = rng[10:] + rng2 = rng[15:35] + the_union = rng1.union(rng2) + expected = rng[10:] + self.assert_index_equal(the_union, expected) + + def test_intersection(self): + rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_int = rng1.intersection(rng2) + expected = rng[10:25] + self.assert_index_equal(the_int, expected) + tm.assertIsInstance(the_int, DatetimeIndex) + self.assertEqual(the_int.offset, rng.offset) + + the_int = rng1.intersection(rng2.view(DatetimeIndex)) + self.assert_index_equal(the_int, expected) + + # non-overlapping + the_int = rng[:10].intersection(rng[10:]) + expected = DatetimeIndex([]) + self.assert_index_equal(the_int, expected) + + def test_intersection_bug(self): + # GH #771 + a = bdate_range('11/30/2011', '12/31/2011') + b = bdate_range('12/10/2011', '12/20/2011') + result = a.intersection(b) + self.assert_index_equal(result, b) + + def test_month_range_union_tz_pytz(self): + tm._skip_if_no_pytz() + from pytz import timezone + tz = timezone('US/Eastern') + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, + freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, + freq=MonthEnd()) + + early_dr.union(late_dr) + + def test_month_range_union_tz_dateutil(self): + tm._skip_if_windows_python_3() + tm._skip_if_no_dateutil() + from pandas.tslib import _dateutil_gettz as timezone + tz = timezone('US/Eastern') + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, + freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, + freq=MonthEnd()) + + early_dr.union(late_dr) + + +class TestCustomDatetimeIndex(tm.TestCase): + + def setUp(self): + self.rng = cdate_range(START, END) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + tm.assertIsInstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assertIsInstance(the_union, DatetimeIndex) + + # order does not matter + self.assert_index_equal(right.union(left), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = self.rng.union(rng) + tm.assertIsInstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = 
self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_join = self.rng.join(rng, how='outer') + tm.assertIsInstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + def test_intersection_bug(self): + # GH #771 + a = cdate_range('11/30/2011', '12/31/2011') + b = cdate_range('12/10/2011', '12/20/2011') + result = a.intersection(b) + self.assert_index_equal(result, b) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index bf1f82b90d5d6..af749963146c6 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -12,7 +12,6 @@ from pandas import tslib from pandas.tseries import tools from pandas.tseries.tools import normalize_date -from pandas.tseries.util import pivot_annual, isleapyear from pandas.compat import lmap from pandas.compat.numpy import np_array_datetime64_compat from pandas.types.common import is_datetime64_ns_dtype @@ -1382,7 +1381,7 @@ def test_parsers_iso8601(self): raise Exception(date_str) -class TestTsUtil(tm.TestCase): +class TestArrayToDatetime(tm.TestCase): def test_try_parse_dates(self): from dateutil.parser import parse @@ -1392,8 +1391,6 @@ def test_try_parse_dates(self): expected = [parse(d, dayfirst=True) for d in arr] self.assertTrue(np.array_equal(result, expected)) - -class TestArrayToDatetime(tm.TestCase): def test_parsing_valid_dates(self): arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) self.assert_numpy_array_equal( @@ -1508,109 +1505,6 @@ def test_coerce_of_invalid_datetimes(self): ) -class TestPivotAnnual(tm.TestCase): - """ - New pandas of scikits.timeseries pivot_annual - """ - - def test_daily(self): - rng = date_range('1/1/2000', '12/31/2004', freq='D') - ts = Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - annual = pivot_annual(ts, 'D') - - doy = ts.index.dayofyear - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 - - for i in range(1, 367): - subset = ts[doy == i] - subset.index = [x.year for x in subset.index] - - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - self.assertEqual(result.name, i) - - # check leap days - leaps = ts[(ts.index.month == 2) & (ts.index.day == 29)] - day = leaps.index.dayofyear[0] - leaps.index = leaps.index.year - leaps.name = 60 - tm.assert_series_equal(annual[day].dropna(), leaps) - - def test_hourly(self): - rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24), - freq='H') - data_hourly = np.random.randint(100, 350, rng_hourly.size) - ts_hourly = Series(data_hourly, index=rng_hourly) - - grouped = ts_hourly.groupby(ts_hourly.index.year) - hoy = grouped.apply(lambda x: x.reset_index(drop=True)) - hoy = hoy.index.droplevel(0).values - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 - hoy += 1 - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - annual = pivot_annual(ts_hourly) - - ts_hourly = 
ts_hourly.astype(float) - for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: - subset = ts_hourly[hoy == i] - subset.index = [x.year for x in subset.index] - - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - self.assertEqual(result.name, i) - - leaps = ts_hourly[(ts_hourly.index.month == 2) & ( - ts_hourly.index.day == 29) & (ts_hourly.index.hour == 0)] - hour = leaps.index.dayofyear[0] * 24 - 23 - leaps.index = leaps.index.year - leaps.name = 1417 - tm.assert_series_equal(annual[hour].dropna(), leaps) - - def test_weekly(self): - pass - - def test_monthly(self): - rng = date_range('1/1/2000', '12/31/2004', freq='M') - ts = Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - annual = pivot_annual(ts, 'M') - - month = ts.index.month - for i in range(1, 13): - subset = ts[month == i] - subset.index = [x.year for x in subset.index] - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - self.assertEqual(result.name, i) - - def test_period_monthly(self): - pass - - def test_period_daily(self): - pass - - def test_period_weekly(self): - pass - - def test_isleapyear_deprecate(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(isleapyear(2000)) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertFalse(isleapyear(2001)) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(isleapyear(2004)) - - def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 40b46c5413c8f..7f2bb7e724362 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -3,11 +3,12 @@ import numpy as np import pandas as pd -from pandas import DataFrame, Series, Index, MultiIndex, Grouper +from pandas import DataFrame, Series, Index, MultiIndex, Grouper, date_range from pandas.tools.merge import concat from pandas.tools.pivot import pivot_table, crosstab from pandas.compat import range, product import pandas.util.testing as tm +from pandas.tseries.util import pivot_annual, isleapyear class TestPivotTable(tm.TestCase): @@ -1319,3 +1320,106 @@ def test_crosstab_with_numpy_size(self): index=expected_index, columns=expected_column) tm.assert_frame_equal(result, expected) + + +class TestPivotAnnual(tm.TestCase): + """ + New pandas of scikits.timeseries pivot_annual + """ + + def test_daily(self): + rng = date_range('1/1/2000', '12/31/2004', freq='D') + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'D') + + doy = ts.index.dayofyear + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 + + for i in range(1, 367): + subset = ts[doy == i] + subset.index = [x.year for x in subset.index] + + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + self.assertEqual(result.name, i) + + # check leap days + leaps = ts[(ts.index.month == 2) & (ts.index.day == 29)] + day = leaps.index.dayofyear[0] + leaps.index = leaps.index.year + leaps.name = 60 + tm.assert_series_equal(annual[day].dropna(), leaps) + + def test_hourly(self): + rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24), + freq='H') + data_hourly = np.random.randint(100, 350, 
rng_hourly.size) + ts_hourly = Series(data_hourly, index=rng_hourly) + + grouped = ts_hourly.groupby(ts_hourly.index.year) + hoy = grouped.apply(lambda x: x.reset_index(drop=True)) + hoy = hoy.index.droplevel(0).values + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 + hoy += 1 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts_hourly) + + ts_hourly = ts_hourly.astype(float) + for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: + subset = ts_hourly[hoy == i] + subset.index = [x.year for x in subset.index] + + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + self.assertEqual(result.name, i) + + leaps = ts_hourly[(ts_hourly.index.month == 2) & ( + ts_hourly.index.day == 29) & (ts_hourly.index.hour == 0)] + hour = leaps.index.dayofyear[0] * 24 - 23 + leaps.index = leaps.index.year + leaps.name = 1417 + tm.assert_series_equal(annual[hour].dropna(), leaps) + + def test_weekly(self): + pass + + def test_monthly(self): + rng = date_range('1/1/2000', '12/31/2004', freq='M') + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'M') + + month = ts.index.month + for i in range(1, 13): + subset = ts[month == i] + subset.index = [x.year for x in subset.index] + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + self.assertEqual(result.name, i) + + def test_period_monthly(self): + pass + + def test_period_daily(self): + pass + + def test_period_weekly(self): + pass + + def test_isleapyear_deprecate(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2000)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertFalse(isleapyear(2001)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2004)) diff --git a/setup.py b/setup.py index 4d6bb76fd6b7c..3c2617da18eae 100755 --- a/setup.py +++ b/setup.py @@ -642,6 +642,8 @@ def pxd(name): 'pandas.tests.frame', 'pandas.tests.indexes', 'pandas.tests.indexes.datetimes', + 'pandas.tests.indexes.timedeltas', + 'pandas.tests.indexes.period', 'pandas.tests.groupby', 'pandas.tests.series', 'pandas.tests.formats', From 248c53b419b9791909251ed10fce8a57360f8dcb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 7 Feb 2017 20:24:09 -0500 Subject: [PATCH 109/338] TST/CLN: reorg groupby tests (#15336) --- pandas/tests/groupby/common.py | 52 + pandas/tests/groupby/test_aggregate.py | 336 ++++- pandas/tests/groupby/test_categorical.py | 291 ++-- pandas/tests/groupby/test_groupby.py | 1670 +--------------------- pandas/tests/groupby/test_misc.py | 101 ++ pandas/tests/groupby/test_timegrouper.py | 609 ++++++++ pandas/tests/groupby/test_transform.py | 494 +++++++ 7 files changed, 1731 insertions(+), 1822 deletions(-) create mode 100644 pandas/tests/groupby/common.py create mode 100644 pandas/tests/groupby/test_misc.py create mode 100644 pandas/tests/groupby/test_timegrouper.py create mode 100644 pandas/tests/groupby/test_transform.py diff --git a/pandas/tests/groupby/common.py b/pandas/tests/groupby/common.py new file mode 100644 index 0000000000000..8a70777d08682 --- /dev/null +++ b/pandas/tests/groupby/common.py @@ -0,0 +1,52 @@ +""" Base setup """ + +import numpy as np +from pandas.util import testing as tm 
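+# NOTE: the MixIn class defined below carries the shared frame/series
+# fixtures (df, mframe, three_group, ...); the reorganized groupby test
+# modules (e.g. test_categorical) mix it into their TestCase classes via
+# ``from .common import MixIn``.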
+from pandas import DataFrame, MultiIndex + + +class MixIn(object): + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 00ddd293f6014..a1fc97eb8d780 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -1,28 +1,25 @@ # -*- coding: utf-8 -*- -from __future__ import print_function -from datetime import datetime - - -from pandas import date_range -from pandas.core.index import MultiIndex -from pandas.core.api import DataFrame -from pandas.core.series import Series - -from pandas.util.testing import (assert_frame_equal, assert_series_equal - ) - -from pandas.core.groupby import (SpecificationError) -from pandas.compat import (lmap, OrderedDict) -from pandas.formats.printing import pprint_thing +""" +we test .agg behavior / note that .apply is tested +generally in test_groupby.py +""" -from pandas import compat +from __future__ import print_function +from datetime import datetime +from functools import partial -import pandas.core.common as com import numpy as np +from numpy import nan +import pandas as pd +from pandas import (date_range, MultiIndex, DataFrame, + Series, Index, bdate_range) +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.core.groupby import SpecificationError, DataError +from pandas.compat import OrderedDict +from pandas.formats.printing import pprint_thing import pandas.util.testing as tm -import pandas as pd class TestGroupByAggregate(tm.TestCase): @@ -452,35 +449,292 @@ def bad(x): expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') assert_frame_equal(result, expected) + def test_cythonized_aggers(self): + data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + 'B': ['A', 'B'] * 6, + 'C': np.random.randn(12)} + df = DataFrame(data) + df.loc[2:10:2, 'C'] = nan + + def _testit(name): + + op = lambda x: getattr(x, name)() + + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C': exp}) + exp.index.name = 'A' + result = op(grouped) + 
assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ['A', 'B'] + exp.name = 'C' + + result = op(grouped)['C'] + if not tm._incompat_bottleneck_version(name): + assert_series_equal(result, exp) + + _testit('count') + _testit('sum') + _testit('std') + _testit('var') + _testit('sem') + _testit('mean') + _testit('median') + _testit('prod') + _testit('min') + _testit('max') + + def test_cython_agg_boolean(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': np.random.randint(0, 2, 50).astype('bool')}) + result = frame.groupby('a')['b'].mean() + expected = frame.groupby('a')['b'].agg(np.mean) -def assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() + assert_series_equal(result, expected) + def test_cython_agg_nothing_to_agg(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + self.assertRaises(DataError, frame.groupby('a')['b'].mean) + + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean) + + def test_cython_agg_nothing_to_agg_with_dates(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25, + 'dates': pd.date_range('now', periods=50, + freq='T')}) + with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"): + frame.groupby('b').dates.mean() + + def test_cython_agg_frame_columns(self): + # #2113 + df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + + def test_cython_fail_agg(self): + dr = bdate_range('1/1/2000', periods=50) + ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + assert_series_equal(summed, expected) + + def test_agg_consistency(self): + # agg with ([]) and () not consistent + # GH 6715 + + def P1(a): + try: + return np.percentile(a.dropna(), q=1) + except: + return np.nan + + import datetime as dt + df = DataFrame({'col1': [1, 2, 3, 4], + 'col2': [10, 25, 26, 31], + 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), + dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) + + g = df.groupby('date') + + expected = g.agg([P1]) + expected.columns = expected.columns.levels[0] + + result = g.agg(P1) + assert_frame_equal(result, expected) -def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): - tups = lmap(tuple, df[keys].values) - tups = com._asarray_tuplesafe(tups) - expected = f(df.groupby(tups)[field]) - for k, v in compat.iteritems(expected): - assert (result[k] == v) + def test_wrap_agg_out(self): + grouped = self.three_group.groupby(['A', 'B']) + def func(ser): + if ser.dtype == np.object: + raise TypeError + else: + return ser.sum() -def test_decons(): - from pandas.core.groupby import decons_group_index, get_group_index + result = grouped.aggregate(func) + exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] + expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + assert_frame_equal(result, expected) + + def test_agg_multiple_functions_maintain_order(self): + # GH #610 + funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] + result = 
self.df.groupby('A')['C'].agg(funcs) + exp_cols = Index(['mean', 'max', 'min']) + + self.assert_index_equal(result.columns, exp_cols) + + def test_multiple_functions_tuples_and_non_tuples(self): + # #1359 - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) + funcs = [('foo', 'mean'), 'std'] + ex_funcs = [('foo', 'mean'), ('std', 'std')] - for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) + result = self.df.groupby('A')['C'].agg(funcs) + expected = self.df.groupby('A')['C'].agg(ex_funcs) + assert_frame_equal(result, expected) + + result = self.df.groupby('A').agg(funcs) + expected = self.df.groupby('A').agg(ex_funcs) + assert_frame_equal(result, expected) + + def test_agg_multiple_functions_too_many_lambdas(self): + grouped = self.df.groupby('A') + funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] + + self.assertRaises(SpecificationError, grouped.agg, funcs) + + def test_more_flexible_frame_multi_function(self): + from pandas import concat + + grouped = self.df.groupby('A') - shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] - testit(label_list, shape) + exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) + exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) - shape = (10000, 10000) - label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] - testit(label_list, shape) + expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) + + d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) + result = grouped.aggregate(d) + + assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + expected = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + assert_frame_equal(result, expected) + + def foo(x): + return np.mean(x) + + def bar(x): + return np.std(x, ddof=1) + + d = OrderedDict([['C', np.mean], ['D', OrderedDict( + [['foo', np.mean], ['bar', np.std]])]]) + result = grouped.aggregate(d) + + d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + expected = grouped.aggregate(d) + + assert_frame_equal(result, expected) + + def test_multi_function_flexible_mix(self): + # GH #1268 + grouped = self.df.groupby('A') + + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ + 'bar', 'std' + ]])], ['D', 'sum']]) + result = grouped.aggregate(d) + d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ + 'bar', 'std' + ]])], ['D', ['sum']]]) + result2 = grouped.aggregate(d2) + + d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ + 'bar', 'std' + ]])], ['D', {'sum': 'sum'}]]) + expected = grouped.aggregate(d3) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_agg_callables(self): + # GH 7929 + df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) + + class fn_class(object): + + def __call__(self, x): + return sum(x) + + equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(), + partial(sum), fn_class()] + + expected = df.groupby("foo").agg(sum) + for ecall in equiv_callables: + result = df.groupby('foo').agg(ecall) + assert_frame_equal(result, expected) + + def test__cython_agg_general(self): + ops = [('mean', np.mean), + ('median', 
np.median), + ('var', np.var), + ('add', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), ] + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op, ) + raise + + def test_cython_agg_empty_buckets(self): + ops = [('mean', np.mean), + ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), + ('var', lambda x: np.var(x, ddof=1)), + ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), ] + + df = pd.DataFrame([11, 12, 13]) + grps = range(0, 55, 5) + + for op, targop in ops: + result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) + expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + def test_agg_over_numpy_arrays(self): + # GH 3788 + df = pd.DataFrame([[1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])]], + columns=['category', 'arraydata']) + result = df.groupby('category').agg(sum) + + expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] + expected_index = pd.Index([1, 2], name='category') + expected_column = ['arraydata'] + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_column) + + assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 605b327208a03..8952b520f4f78 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1,67 +1,19 @@ # -*- coding: utf-8 -*- from __future__ import print_function -from numpy import nan - -from pandas.core.index import Index, MultiIndex, CategoricalIndex -from pandas.core.api import DataFrame, Categorical - -from pandas.core.series import Series - -from pandas.util.testing import (assert_frame_equal, assert_series_equal - ) +from datetime import datetime -from pandas.compat import (lmap) - -from pandas import compat - -import pandas.core.common as com import numpy as np +from numpy import nan -import pandas.util.testing as tm import pandas as pd +from pandas import (Index, MultiIndex, CategoricalIndex, + DataFrame, Categorical, Series) +from pandas.util.testing import assert_frame_equal, assert_series_equal +import pandas.util.testing as tm +from .common import MixIn -class TestGroupByCategorical(tm.TestCase): - - def setUp(self): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 
'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) +class TestGroupByCategorical(MixIn, tm.TestCase): def test_level_groupby_get_group(self): # GH15155 @@ -210,8 +162,9 @@ def test_groupby_datetime_categorical(self): def test_groupby_categorical_index(self): + s = np.random.RandomState(12345) levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=20) + codes = s.randint(0, 4, size=20) cats = Categorical.from_codes(codes, levels, ordered=True) df = DataFrame( np.repeat( @@ -264,70 +217,15 @@ def test_groupby_unstack_categorical(self): expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) tm.assert_series_equal(result, expected) - def test_groupby_categorical_unequal_len(self): + def test_groupby_bins_unequal_len(self): # GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - # The raises only happens with categorical, not with series of types - # category bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here - self.assertRaises(ValueError, lambda: series.groupby(bins).mean()) - - def test_groupby_categorical_two_columns(self): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) + def f(): + series.groupby(bins).mean() + self.assertRaises(ValueError, f) def test_groupby_multi_categorical_as_index(self): # GH13204 @@ -454,35 
+352,148 @@ def test_groupby_categorical_no_compress(self):
         exp = np.array([1, 2, 4, np.nan])
         self.assert_numpy_array_equal(result, exp)
 
+    def test_groupby_sort_categorical(self):
+        # dataframe groupby sort was being ignored # GH 8868
+        df = DataFrame([['(7.5, 10]', 10, 10],
+                        ['(7.5, 10]', 8, 20],
+                        ['(2.5, 5]', 5, 30],
+                        ['(5, 7.5]', 6, 40],
+                        ['(2.5, 5]', 4, 50],
+                        ['(0, 2.5]', 1, 60],
+                        ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
+        df['range'] = Categorical(df['range'], ordered=True)
+        index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                                  '(7.5, 10]'], name='range', ordered=True)
+        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                                columns=['foo', 'bar'], index=index)
+
+        col = 'range'
+        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+        # when categories is ordered, group is ordered by category's order
+        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+        df['range'] = Categorical(df['range'], ordered=False)
+        index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                                  '(7.5, 10]'], name='range')
+        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                                columns=['foo', 'bar'], index=index)
+
+        index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
+                                  '(0, 2.5]'],
+                                 categories=['(7.5, 10]', '(2.5, 5]',
+                                             '(5, 7.5]', '(0, 2.5]'],
+                                 name='range')
+        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                  index=index, columns=['foo', 'bar'])
+
+        col = 'range'
+        # this is an unordered categorical, but we allow this
+        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+
+    def test_groupby_sort_categorical_datetimelike(self):
+        # GH10505
+
+        # use the same data as test_groupby_sort_categorical, whose
+        # categories correspond to datetime.month
+        df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
+                               datetime(2011, 2, 1), datetime(2011, 5, 1),
+                               datetime(2011, 2, 1), datetime(2011, 1, 1),
+                               datetime(2011, 5, 1)],
+                        'foo': [10, 8, 5, 6, 4, 1, 7],
+                        'bar': [10, 20, 30, 40, 50, 60, 70]},
+                       columns=['dt', 'foo', 'bar'])
+
+        # ordered=True
+        df['dt'] = Categorical(df['dt'], ordered=True)
+        index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 7, 1)]
+        result_sort = DataFrame(
+            [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+        result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
+
+        index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
+        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                  columns=['foo', 'bar'])
+        result_nosort.index = CategoricalIndex(index, categories=index,
+                                               name='dt', ordered=True)
+
+        col = 'dt'
+        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+        # when categories is ordered, group is ordered by category's order
+        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+        # ordered = False
+        df['dt'] = Categorical(df['dt'], ordered=False)
+        index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 7, 1)]
+        result_sort = DataFrame(
+            [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+        result_sort.index = CategoricalIndex(index, name='dt')
+
+        index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
+        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                  columns=['foo', 'bar'])
+        result_nosort.index = 
CategoricalIndex(index, categories=index, + name='dt') + + col = 'dt' + assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) -def assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() - + def test_groupby_categorical_two_columns(self): -def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): - tups = lmap(tuple, df[keys].values) - tups = com._asarray_tuplesafe(tups) - expected = f(df.groupby(tups)[field]) - for k, v in compat.iteritems(expected): - assert (result[k] == v) + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') -def test_decons(): - from pandas.core.groupby import decons_group_index, get_group_index + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) + exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, + index=exp_index) + tm.assert_frame_equal(res, exp) - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) + # Grouping on two columns + groups_double_key = test.groupby(["cat", "ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), + "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" + ]) + tm.assert_frame_equal(res, exp) - for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = test[(test.cat == c) & (test.ints == i)] + assert_frame_equal(result, expected) - shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] - testit(label_list, shape) + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values, 'C2']) - shape = (10000, 10000) - label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] - testit(label_list, shape) + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product( + [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, + nan, nan, nan, nan, 4, 5], + "C3": [nan, nan, nan, nan, 10, 100, + nan, nan, nan, nan, 200, 34]}, index=idx) + tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index df4707fcef3f0..458e869130190 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1,20 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import print_function -import nose from string import ascii_lowercase from datetime import datetime from numpy import nan -from pandas.types.common import _ensure_platform_int -from pandas import date_range, bdate_range, Timestamp, isnull -from pandas.core.index import Index, MultiIndex, CategoricalIndex -from pandas.core.api 
import Categorical, DataFrame +from pandas import (date_range, bdate_range, Timestamp, + isnull, Index, MultiIndex, DataFrame, Series) from pandas.core.common import UnsupportedFunctionCall -from pandas.core.groupby import (SpecificationError, DataError, _nargsort, - _lexsort_indexer) -from pandas.core.series import Series -from pandas.core.config import option_context from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, assert_index_equal, assertRaisesRegexp) @@ -24,57 +17,16 @@ from pandas.core.panel import Panel from pandas.tools.merge import concat from collections import defaultdict -from functools import partial import pandas.core.common as com import numpy as np import pandas.core.nanops as nanops - import pandas.util.testing as tm import pandas as pd +from .common import MixIn -class TestGroupBy(tm.TestCase): - - def setUp(self): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) +class TestGroupBy(MixIn, tm.TestCase): def test_basic(self): def checkit(dtype): @@ -774,12 +726,12 @@ def max_value(group): def test_groupby_return_type(self): # GH2893, return a reduced type - df1 = DataFrame([{"val1": 1, - "val2": 20}, {"val1": 1, - "val2": 19}, {"val1": 2, - "val2": 27}, {"val1": 2, - "val2": 12} - ]) + df1 = DataFrame( + [{"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12} + ]) def func(dataf): return dataf["val2"] - dataf["val2"].mean() @@ -787,12 +739,12 @@ def func(dataf): result = df1.groupby("val1", squeeze=True).apply(func) tm.assertIsInstance(result, Series) - df2 = DataFrame([{"val1": 1, - "val2": 20}, {"val1": 1, - "val2": 19}, {"val1": 1, - "val2": 27}, {"val1": 1, - "val2": 12} - ]) + df2 = DataFrame( + [{"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 1, "val2": 27}, + {"val1": 1, "val2": 12} + ]) def func(dataf): return dataf["val2"] - dataf["val2"].mean() @@ -902,6 +854,7 @@ def test_get_group(self): lambda: g.get_group(('foo', 'bar', 'baz'))) def test_get_group_empty_bins(self): + d = pd.DataFrame([3, 1, 7, 6]) bins = [0, 5, 10, 15] g = d.groupby(pd.cut(d[0], bins)) @@ -1043,266 +996,6 @@ def test_basic_regression(self): grouped = result.groupby(groupings) grouped.mean() - def 
test_transform(self): - data = Series(np.arange(9) // 3, index=np.arange(9)) - - index = np.arange(9) - np.random.shuffle(index) - data = data.reindex(index) - - grouped = data.groupby(lambda x: x // 3) - - transformed = grouped.transform(lambda x: x * x.sum()) - self.assertEqual(transformed[7], 12) - - # GH 8046 - # make sure that we preserve the input order - - df = DataFrame( - np.arange(6, dtype='int64').reshape( - 3, 2), columns=["a", "b"], index=[0, 2, 1]) - key = [0, 0, 1] - expected = df.sort_index().groupby(key).transform( - lambda x: x - x.mean()).groupby(key).mean() - result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( - key).mean() - assert_frame_equal(result, expected) - - def demean(arr): - return arr - arr.mean() - - people = DataFrame(np.random.randn(5, 5), - columns=['a', 'b', 'c', 'd', 'e'], - index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) - key = ['one', 'two', 'one', 'two', 'one'] - result = people.groupby(key).transform(demean).groupby(key).mean() - expected = people.groupby(key).apply(demean).groupby(key).mean() - assert_frame_equal(result, expected) - - # GH 8430 - df = tm.makeTimeDataFrame() - g = df.groupby(pd.TimeGrouper('M')) - g.transform(lambda x: x - 1) - - # GH 9700 - df = DataFrame({'a': range(5, 10), 'b': range(5)}) - result = df.groupby('a').transform(max) - expected = DataFrame({'b': range(5)}) - tm.assert_frame_equal(result, expected) - - def test_transform_fast(self): - - df = DataFrame({'id': np.arange(100000) / 3, - 'val': np.random.randn(100000)}) - - grp = df.groupby('id')['val'] - - values = np.repeat(grp.mean().values, - _ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index, name='val') - - result = grp.transform(np.mean) - assert_series_equal(result, expected) - - result = grp.transform('mean') - assert_series_equal(result, expected) - - # GH 12737 - df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], - 'd': pd.date_range('2014-1-1', '2014-1-4'), - 'i': [1, 2, 3, 4]}, - columns=['grouping', 'f', 'i', 'd']) - result = df.groupby('grouping').transform('first') - - dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] - expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], - 'd': dates, - 'i': [1, 2, 2, 4]}, - columns=['f', 'i', 'd']) - assert_frame_equal(result, expected) - - # selection - result = df.groupby('grouping')[['f', 'i']].transform('first') - expected = expected[['f', 'i']] - assert_frame_equal(result, expected) - - # dup columns - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) - result = df.groupby('g').transform('first') - expected = df.drop('g', axis=1) - assert_frame_equal(result, expected) - - def test_transform_broadcast(self): - grouped = self.ts.groupby(lambda x: x.month) - result = grouped.transform(np.mean) - - self.assert_index_equal(result.index, self.ts.index) - for _, gp in grouped: - assert_fp_equal(result.reindex(gp.index), gp.mean()) - - grouped = self.tsframe.groupby(lambda x: x.month) - result = grouped.transform(np.mean) - self.assert_index_equal(result.index, self.tsframe.index) - for _, gp in grouped: - agged = gp.mean() - res = result.reindex(gp.index) - for col in self.tsframe: - assert_fp_equal(res[col], agged[col]) - - # group columns - grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis=1) - result = grouped.transform(np.mean) - self.assert_index_equal(result.index, self.tsframe.index) - self.assert_index_equal(result.columns, 
self.tsframe.columns) - for _, gp in grouped: - agged = gp.mean(1) - res = result.reindex(columns=gp.columns) - for idx in gp.index: - assert_fp_equal(res.xs(idx), agged[idx]) - - def test_transform_axis(self): - - # make sure that we are setting the axes - # correctly when on axis=0 or 1 - # in the presence of a non-monotonic indexer - # GH12713 - - base = self.tsframe.iloc[0:5] - r = len(base.index) - c = len(base.columns) - tso = DataFrame(np.random.randn(r, c), - index=base.index, - columns=base.columns, - dtype='float64') - # monotonic - ts = tso - grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: x - x.mean()) - assert_frame_equal(result, expected) - - ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - assert_frame_equal(result, expected) - - # non-monotonic - ts = tso.iloc[[1, 0] + list(range(2, len(base)))] - grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: x - x.mean()) - assert_frame_equal(result, expected) - - ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - assert_frame_equal(result, expected) - - def test_transform_dtype(self): - # GH 9807 - # Check transform dtype output is preserved - df = DataFrame([[1, 3], [2, 3]]) - result = df.groupby(1).transform('mean') - expected = DataFrame([[1.5], [1.5]]) - assert_frame_equal(result, expected) - - def test_transform_bug(self): - # GH 5712 - # transforming on a datetime column - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - result = df.groupby('A')['B'].transform( - lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name='B') - assert_series_equal(result, expected) - - def test_transform_multiple(self): - grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) - - grouped.transform(lambda x: x * 2) - grouped.transform(np.mean) - - def test_dispatch_transform(self): - df = self.tsframe[::5].reindex(self.tsframe.index) - - grouped = df.groupby(lambda x: x.month) - - filled = grouped.fillna(method='pad') - fillit = lambda x: x.fillna(method='pad') - expected = df.groupby(lambda x: x.month).transform(fillit) - assert_frame_equal(filled, expected) - - def test_transform_select_columns(self): - f = lambda x: x.mean() - result = self.df.groupby('A')['C', 'D'].transform(f) - - selection = self.df[['C', 'D']] - expected = selection.groupby(self.df['A']).transform(f) - - assert_frame_equal(result, expected) - - def test_transform_exclude_nuisance(self): - - # this also tests orderings in transform between - # series/frame to make sure it's consistent - expected = {} - grouped = self.df.groupby('A') - expected['C'] = grouped['C'].transform(np.mean) - expected['D'] = grouped['D'].transform(np.mean) - expected = DataFrame(expected) - result = self.df.groupby('A').transform(np.mean) - - assert_frame_equal(result, expected) - - def test_transform_function_aliases(self): - result = self.df.groupby('A').transform('mean') - expected = self.df.groupby('A').transform(np.mean) - assert_frame_equal(result, expected) - - result = self.df.groupby('A')['C'].transform('mean') - expected = self.df.groupby('A')['C'].transform(np.mean) - assert_series_equal(result, expected) - - def test_series_fast_transform_date(self): - # GH 13191 
- df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], - 'd': pd.date_range('2014-1-1', '2014-1-4')}) - result = df.groupby('grouping')['d'].transform('first') - dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-4')] - expected = pd.Series(dates, name='d') - assert_series_equal(result, expected) - - def test_transform_length(self): - # GH 9697 - df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) - expected = pd.Series([3.0] * 4) - - def nsum(x): - return np.nansum(x) - - results = [df.groupby('col1').transform(sum)['col2'], - df.groupby('col1')['col2'].transform(sum), - df.groupby('col1').transform(nsum)['col2'], - df.groupby('col1')['col2'].transform(nsum)] - for result in results: - assert_series_equal(result, expected, check_names=False) - - def test_transform_coercion(self): - - # 14457 - # when we are transforming be sure to not coerce - # via assignment - df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) - g = df.groupby('A') - - expected = g.transform(np.mean) - result = g.transform(lambda x: np.mean(x)) - assert_frame_equal(result, expected) - def test_with_na(self): index = Index(np.arange(10)) @@ -1330,58 +1023,6 @@ def f(x): assert_series_equal(agged, expected, check_dtype=False) self.assertTrue(issubclass(agged.dtype.type, np.dtype(dtype).type)) - def test_groupby_transform_with_int(self): - - # GH 3740, make sure that we might upcast on item-by-item transform - - # floats - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'), - C=Series( - [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=Series( - [-1, 0, 1, -1, 0, 1], dtype='float64'))) - assert_frame_equal(result, expected) - - # int case - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, - C=[1, 2, 3, 1, 2, 3], D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) - assert_frame_equal(result, expected) - - # int that needs float conversion - s = Series([2, 3, 4, 10, 5, -1]) - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - - s1 = s.iloc[0:3] - s1 = (s1 - s1.mean()) / s1.std() - s2 = s.iloc[3:6] - s2 = (s2 - s2.mean()) / s2.std() - expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) - assert_frame_equal(result, expected) - - # int downcasting - result = df.groupby('A').transform(lambda x: x * 2 / 2) - expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) - assert_frame_equal(result, expected) - - def test_groupby_transform_with_nan_group(self): - # GH 9941 - df = pd.DataFrame({'a': range(10), - 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - result = df.groupby(df.b)['a'].transform(max) - expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], - name='a') - assert_series_equal(result, expected) - def test_indices_concatenation_order(self): # GH 2808 @@ -1845,6 +1486,7 @@ def check_nunique(df, keys, as_index=True): def test_series_groupby_value_counts(self): from itertools import product + np.random.seed(1234) def rebuild_index(df): arr = list(map(df.index.get_level_values, range(df.index.nlevels))) @@ -2220,51 +1862,6 @@ def test_builtins_apply(self): # GH8155 assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) - def 
test_cythonized_aggers(self): - data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], - 'B': ['A', 'B'] * 6, - 'C': np.random.randn(12)} - df = DataFrame(data) - df.loc[2:10:2, 'C'] = nan - - def _testit(name): - - op = lambda x: getattr(x, name)() - - # single column - grouped = df.drop(['B'], axis=1).groupby('A') - exp = {} - for cat, group in grouped: - exp[cat] = op(group['C']) - exp = DataFrame({'C': exp}) - exp.index.name = 'A' - result = op(grouped) - assert_frame_equal(result, exp) - - # multiple columns - grouped = df.groupby(['A', 'B']) - expd = {} - for (cat1, cat2), group in grouped: - expd.setdefault(cat1, {})[cat2] = op(group['C']) - exp = DataFrame(expd).T.stack(dropna=False) - exp.index.names = ['A', 'B'] - exp.name = 'C' - - result = op(grouped)['C'] - if not tm._incompat_bottleneck_version(name): - assert_series_equal(result, exp) - - _testit('count') - _testit('sum') - _testit('std') - _testit('var') - _testit('sem') - _testit('mean') - _testit('median') - _testit('prod') - _testit('min') - _testit('max') - def test_max_min_non_numeric(self): # #2700 aa = DataFrame({'nn': [11, 11, 22, 22], @@ -2399,31 +1996,6 @@ def test_arg_passthru(self): result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - def test_cython_agg_boolean(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': np.random.randint(0, 2, 50).astype('bool')}) - result = frame.groupby('a')['b'].mean() - expected = frame.groupby('a')['b'].agg(np.mean) - - assert_series_equal(result, expected) - - def test_cython_agg_nothing_to_agg(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - self.assertRaises(DataError, frame.groupby('a')['b'].mean) - - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean) - - def test_cython_agg_nothing_to_agg_with_dates(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25, - 'dates': pd.date_range('now', periods=50, - freq='T')}) - with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"): - frame.groupby('b').dates.mean() - def test_groupby_timedelta_cython_count(self): df = DataFrame({'g': list('ab' * 2), 'delt': np.arange(4).astype('timedelta64[ns]')}) @@ -2433,15 +2005,6 @@ def test_groupby_timedelta_cython_count(self): result = df.groupby('g').delt.count() tm.assert_series_equal(expected, result) - def test_cython_agg_frame_columns(self): - # #2113 - df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) - - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - def test_wrap_aggregated_output_multindex(self): df = self.mframe.T df['baz', 'two'] = 'peekaboo' @@ -2616,15 +2179,6 @@ def test_grouping_labels(self): exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) - def test_cython_fail_agg(self): - dr = bdate_range('1/1/2000', periods=50) - ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) - - grouped = ts.groupby(lambda x: x.month) - summed = grouped.sum() - expected = grouped.agg(np.sum) - assert_series_equal(summed, expected) - def test_apply_series_to_frame(self): def f(piece): with np.errstate(invalid='ignore'): @@ -3051,30 +2605,6 @@ def test_grouping_ndarray(self): assert_frame_equal(result, expected, check_names=False ) # Note: no names when 
grouping by value - def test_agg_consistency(self): - # agg with ([]) and () not consistent - # GH 6715 - - def P1(a): - try: - return np.percentile(a.dropna(), q=1) - except: - return np.nan - - import datetime as dt - df = DataFrame({'col1': [1, 2, 3, 4], - 'col2': [10, 25, 26, 31], - 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), - dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) - - g = df.groupby('date') - - expected = g.agg([P1]) - expected.columns = expected.columns.levels[0] - - result = g.agg(P1) - assert_frame_equal(result, expected) - def test_apply_typecast_fail(self): df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], 'c': np.tile( @@ -3159,28 +2689,6 @@ def f(g): result = grouped.apply(f) self.assertTrue('value3' in result) - def test_transform_mixed_type(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) - - def f(group): - group['g'] = group['d'] * 2 - return group[:1] - - grouped = df.groupby('c') - result = grouped.apply(f) - - self.assertEqual(result['d'].dtype, np.float64) - - # this is by definition a mutating operation! - with option_context('mode.chained_assignment', None): - for key, group in grouped: - res = f(group) - assert_frame_equal(res, result.loc[key]) - def test_groupby_wrong_multi_labels(self): from pandas import read_csv data = """index,foo,bar,baz,spam,data @@ -3768,20 +3276,6 @@ def test_no_nonsense_name(self): result = s.groupby(self.frame['A']).agg(np.sum) self.assertIsNone(result.name) - def test_wrap_agg_out(self): - grouped = self.three_group.groupby(['A', 'B']) - - def func(ser): - if ser.dtype == np.object: - raise TypeError - else: - return ser.sum() - - result = grouped.aggregate(func) - exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] - expected = exp_grouped.groupby(['A', 'B']).aggregate(func) - assert_frame_equal(result, expected) - def test_multifunc_sum_bug(self): # GH #1065 x = DataFrame(np.arange(9).reshape(3, 3)) @@ -3839,110 +3333,6 @@ def test_getitem_numeric_column_names(self): assert_frame_equal(result2, expected) assert_frame_equal(result3, expected) - def test_agg_multiple_functions_maintain_order(self): - # GH #610 - funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] - result = self.df.groupby('A')['C'].agg(funcs) - exp_cols = Index(['mean', 'max', 'min']) - - self.assert_index_equal(result.columns, exp_cols) - - def test_multiple_functions_tuples_and_non_tuples(self): - # #1359 - - funcs = [('foo', 'mean'), 'std'] - ex_funcs = [('foo', 'mean'), ('std', 'std')] - - result = self.df.groupby('A')['C'].agg(funcs) - expected = self.df.groupby('A')['C'].agg(ex_funcs) - assert_frame_equal(result, expected) - - result = self.df.groupby('A').agg(funcs) - expected = self.df.groupby('A').agg(ex_funcs) - assert_frame_equal(result, expected) - - def test_agg_multiple_functions_too_many_lambdas(self): - grouped = self.df.groupby('A') - funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] - - self.assertRaises(SpecificationError, grouped.agg, funcs) - - def test_more_flexible_frame_multi_function(self): - from pandas import concat - - grouped = self.df.groupby('A') - - exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) - exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) - - expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) - expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - - d = 
OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) - result = grouped.aggregate(d) - - assert_frame_equal(result, expected) - - # be careful - result = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - expected = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - assert_frame_equal(result, expected) - - def foo(x): - return np.mean(x) - - def bar(x): - return np.std(x, ddof=1) - - d = OrderedDict([['C', np.mean], ['D', OrderedDict( - [['foo', np.mean], ['bar', np.std]])]]) - result = grouped.aggregate(d) - - d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) - expected = grouped.aggregate(d) - - assert_frame_equal(result, expected) - - def test_multi_function_flexible_mix(self): - # GH #1268 - grouped = self.df.groupby('A') - - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ - 'bar', 'std' - ]])], ['D', 'sum']]) - result = grouped.aggregate(d) - d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ - 'bar', 'std' - ]])], ['D', ['sum']]]) - result2 = grouped.aggregate(d2) - - d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ - 'bar', 'std' - ]])], ['D', {'sum': 'sum'}]]) - expected = grouped.aggregate(d3) - - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - - def test_agg_callables(self): - # GH 7929 - df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) - - class fn_class(object): - - def __call__(self, x): - return sum(x) - - equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(), - partial(sum), fn_class()] - - expected = df.groupby("foo").agg(sum) - for ecall in equiv_callables: - result = df.groupby('foo').agg(ecall) - assert_frame_equal(result, expected) - def test_set_group_name(self): def f(group): assert group.name is not None @@ -3980,97 +3370,6 @@ def test_no_dummy_key_names(self): ]).sum() self.assertEqual(result.index.names, (None, None)) - def test_groupby_sort_categorical(self): - # dataframe groupby sort was being ignored # GH 8868 - df = DataFrame([['(7.5, 10]', 10, 10], - ['(7.5, 10]', 8, 20], - ['(2.5, 5]', 5, 30], - ['(5, 7.5]', 6, 40], - ['(2.5, 5]', 4, 50], - ['(0, 2.5]', 1, 60], - ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) - df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range', ordered=True) - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - col = 'range' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) - - df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range') - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', - '(0, 2.5]'], - categories=['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], - name='range') - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - index=index, columns=['foo', 'bar']) - - col = 'range' - # this is an unordered categorical, but we allow this #### - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - def test_groupby_sort_categorical_datetimelike(self): - # GH10505 - - 
# use same data as test_groupby_sort_categorical, which category is - # corresponding to datetime.month - df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1), - datetime(2011, 2, 1), datetime(2011, 5, 1), - datetime(2011, 2, 1), datetime(2011, 1, 1), - datetime(2011, 5, 1)], - 'foo': [10, 8, 5, 6, 4, 1, 7], - 'bar': [10, 20, 30, 40, 50, 60, 70]}, - columns=['dt', 'foo', 'bar']) - - # ordered=True - df['dt'] = Categorical(df['dt'], ordered=True) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt', ordered=True) - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt', ordered=True) - - col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) - - # ordered = False - df['dt'] = Categorical(df['dt'], ordered=False) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt') - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt') - - col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - def test_groupby_sort_multiindex_series(self): # series multiindex groupby sort argument was not being passed through # _compress_group_index @@ -4088,169 +3387,6 @@ def test_groupby_sort_multiindex_series(self): result = mseries.groupby(level=['a', 'b'], sort=True).first() assert_series_equal(result, mseries_result.sort_index()) - def test_groupby_groups_datetimeindex(self): - # #1430 - from pandas.tseries.api import DatetimeIndex - periods = 1000 - ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange(periods), - 'low': np.arange(periods)}, index=ind) - grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) - - # it works! 
- groups = grouped.groups - tm.assertIsInstance(list(groups.keys())[0], datetime) - - # GH 11442 - index = pd.date_range('2015/01/01', periods=5, name='date') - df = pd.DataFrame({'A': [5, 6, 7, 8, 9], - 'B': [1, 2, 3, 4, 5]}, index=index) - result = df.groupby(level='date').groups - dates = ['2015-01-05', '2015-01-04', '2015-01-03', - '2015-01-02', '2015-01-01'] - expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date') - for date in dates} - tm.assert_dict_equal(result, expected) - - grouped = df.groupby(level='date') - for date in dates: - result = grouped.get_group(date) - data = [[df.loc[date, 'A'], df.loc[date, 'B']]] - expected_index = pd.DatetimeIndex([date], name='date') - expected = pd.DataFrame(data, - columns=list('AB'), - index=expected_index) - tm.assert_frame_equal(result, expected) - - def test_groupby_groups_datetimeindex_tz(self): - # GH 3950 - dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'datetime': dates, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['datetime'] = df['datetime'].apply( - lambda d: Timestamp(d, tz='US/Pacific')) - - exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 09:00:00'], - tz='US/Pacific', name='datetime') - exp_idx2 = Index(['a', 'b'] * 3, name='label') - exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], - 'value2': [1, 2, 2, 1, 1, 2]}, - index=exp_idx, columns=['value1', 'value2']) - - result = df.groupby(['datetime', 'label']).sum() - assert_frame_equal(result, expected) - - # by level - didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo') - df = DataFrame({'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2, 3, 1, 2, 3]}, - index=didx) - - exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], tz='Asia/Tokyo') - expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, - index=exp_idx, columns=['value1', 'value2']) - - result = df.groupby(level=0).sum() - assert_frame_equal(result, expected) - - def test_frame_datetime64_handling_groupby(self): - # it works! 
- df = DataFrame([(3, np.datetime64('2012-07-03')), - (3, np.datetime64('2012-07-04'))], - columns=['a', 'date']) - result = df.groupby('a').first() - self.assertEqual(result['date'][3], Timestamp('2012-07-03')) - - def test_groupby_multi_timezone(self): - - # combining multiple / different timezones yields UTC - - data = """0,2000-01-28 16:47:00,America/Chicago -1,2000-01-29 16:48:00,America/Chicago -2,2000-01-30 16:49:00,America/Los_Angeles -3,2000-01-31 16:50:00,America/Chicago -4,2000-01-01 16:50:00,America/New_York""" - - df = pd.read_csv(StringIO(data), header=None, - names=['value', 'date', 'tz']) - result = df.groupby('tz').date.apply( - lambda x: pd.to_datetime(x).dt.tz_localize(x.name)) - - expected = Series([Timestamp('2000-01-28 16:47:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-29 16:48:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-30 16:49:00-0800', - tz='America/Los_Angeles'), - Timestamp('2000-01-31 16:50:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-01 16:50:00-0500', - tz='America/New_York')], - name='date', - dtype=object) - assert_series_equal(result, expected) - - tz = 'America/Chicago' - res_values = df.groupby('tz').date.get_group(tz) - result = pd.to_datetime(res_values).dt.tz_localize(tz) - exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00', - '2000-01-31 16:50:00'], - index=[0, 1, 3], name='date') - expected = pd.to_datetime(exp_values).dt.tz_localize(tz) - assert_series_equal(result, expected) - - def test_groupby_groups_periods(self): - dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'period': [pd.Period(d, freq='H') for d in dates], - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - - exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00', - '2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 09:00:00'], - freq='H', name='period') - exp_idx2 = Index(['a', 'b'] * 3, name='label') - exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], - 'value2': [1, 2, 2, 1, 1, 2]}, - index=exp_idx, columns=['value1', 'value2']) - - result = df.groupby(['period', 'label']).sum() - assert_frame_equal(result, expected) - - # by level - didx = pd.PeriodIndex(dates, freq='H') - df = DataFrame({'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2, 3, 1, 2, 3]}, - index=didx) - - exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], freq='H') - expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, - index=exp_idx, columns=['value1', 'value2']) - - result = df.groupby(level=0).sum() - assert_frame_equal(result, expected) - def test_groupby_reindex_inside_function(self): from pandas.tseries.api import DatetimeIndex @@ -4336,33 +3472,21 @@ def test_median_empty_bins(self): def test_groupby_non_arithmetic_agg_types(self): # GH9311, GH6620 - df = pd.DataFrame([{'a': 1, - 'b': 1}, {'a': 1, - 'b': 2}, {'a': 2, - 'b': 3}, {'a': 2, - 'b': 4}]) + df = pd.DataFrame( + [{'a': 1, 'b': 1}, + {'a': 1, 'b': 2}, + {'a': 2, 'b': 3}, + {'a': 2, 'b': 4}]) dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'] - grp_exp = {'first': {'df': [{'a': 1, - 'b': 1}, {'a': 2, - 'b': 3}]}, - 'last': {'df': [{'a': 1, - 'b': 2}, {'a': 2, - 'b': 4}]}, - 'min': {'df': [{'a': 1, - 'b': 1}, {'a': 2, - 'b': 3}]}, - 
'max': {'df': [{'a': 1, - 'b': 2}, {'a': 2, - 'b': 4}]}, - 'nth': {'df': [{'a': 1, - 'b': 2}, {'a': 2, - 'b': 4}], + grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, + 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, + 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, + 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, + 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], 'args': [1]}, - 'count': {'df': [{'a': 1, - 'b': 2}, {'a': 2, - 'b': 2}], + 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], 'out_type': 'int64'}} for dtype in dtypes: @@ -4414,37 +3538,6 @@ def test_groupby_non_arithmetic_agg_intlike_precision(self): res = getattr(grpd, method)(*data['args']) self.assertEqual(res.iloc[0].b, data['expected']) - def test_groupby_first_datetime64(self): - df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = df[1].view('M8[ns]') - - self.assertTrue(issubclass(df[1].dtype.type, np.datetime64)) - - result = df.groupby(level=0).first() - got_dt = result[1].dtype - self.assertTrue(issubclass(got_dt.type, np.datetime64)) - - result = df[1].groupby(level=0).first() - got_dt = result.dtype - self.assertTrue(issubclass(got_dt.type, np.datetime64)) - - def test_groupby_max_datetime64(self): - # GH 5869 - # datetimelike dtype conversion from int - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = df.groupby('A')['A'].apply(lambda x: x.max()) - result = df.groupby('A')['A'].max() - assert_series_equal(result, expected) - - def test_groupby_datetime64_32_bit(self): - # GH 6410 / numpy 4328 - # 32-bit under 1.9-dev indexing issue - - df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2}) - result = df.groupby("A")["B"].transform(min) - expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') - assert_series_equal(result, expected) - def test_groupby_multiindex_missing_pair(self): # GH9049 df = DataFrame({'group1': ['a', 'a', 'a', 'b'], @@ -4613,381 +3706,6 @@ def test_groupby_with_small_elem(self): res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_groupby_with_timezone_selection(self): - # GH 11616 - # Test that column selection returns output in correct timezone. 
- np.random.seed(42) - df = pd.DataFrame({ - 'factor': np.random.randint(0, 3, size=60), - 'time': pd.date_range('01/01/2000 00:00', periods=60, - freq='s', tz='UTC') - }) - df1 = df.groupby('factor').max()['time'] - df2 = df.groupby('factor')['time'].max() - tm.assert_series_equal(df1, df2) - - def test_timezone_info(self): - # GH 11682 - # Timezone info lost when broadcasting scalar datetime to DataFrame - tm._skip_if_no_pytz() - import pytz - - df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]}) - self.assertEqual(df['b'][0].tzinfo, pytz.utc) - df = pd.DataFrame({'a': [1, 2, 3]}) - df['b'] = datetime.now(pytz.utc) - self.assertEqual(df['b'][0].tzinfo, pytz.utc) - - def test_groupby_with_timegrouper(self): - # GH 4161 - # TimeGrouper requires a sorted index - # also verifies that the resultant index has the correct name - import datetime as DT - df_original = DataFrame({ - 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(), - 'Quantity': [18, 3, 5, 1, 9, 3], - 'Date': [ - DT.datetime(2013, 9, 1, 13, 0), - DT.datetime(2013, 9, 1, 13, 5), - DT.datetime(2013, 10, 1, 20, 0), - DT.datetime(2013, 10, 3, 10, 0), - DT.datetime(2013, 12, 2, 12, 0), - DT.datetime(2013, 9, 2, 14, 0), - ] - }) - - # GH 6908 change target column's order - df_reordered = df_original.sort_values(by='Quantity') - - for df in [df_original, df_reordered]: - df = df.set_index(['Date']) - - expected = DataFrame( - {'Quantity': np.nan}, - index=date_range('20130901 13:00:00', - '20131205 13:00:00', freq='5D', - name='Date', closed='left')) - expected.iloc[[0, 6, 18], 0] = np.array( - [24., 6., 9.], dtype='float64') - - result1 = df.resample('5D') .sum() - assert_frame_equal(result1, expected) - - df_sorted = df.sort_index() - result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum() - assert_frame_equal(result2, expected) - - result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum() - assert_frame_equal(result3, expected) - - def test_groupby_with_timegrouper_methods(self): - # GH 3881 - # make sure API of timegrouper conforms - - import datetime as DT - df_original = pd.DataFrame({ - 'Branch': 'A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 8, 9, 3], - 'Date': [ - DT.datetime(2013, 1, 1, 13, 0), - DT.datetime(2013, 1, 1, 13, 5), - DT.datetime(2013, 10, 1, 20, 0), - DT.datetime(2013, 10, 2, 10, 0), - DT.datetime(2013, 12, 2, 12, 0), - DT.datetime(2013, 12, 2, 14, 0), - ] - }) - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) - - for df in [df_original, df_sorted]: - df = df.set_index('Date', drop=False) - g = df.groupby(pd.TimeGrouper('6M')) - self.assertTrue(g.group_keys) - self.assertTrue(isinstance(g.grouper, pd.core.groupby.BinGrouper)) - groups = g.groups - self.assertTrue(isinstance(groups, dict)) - self.assertTrue(len(groups) == 3) - - def test_timegrouper_with_reg_groups(self): - - # GH 3794 - # allow combinateion of timegrouper/reg groups - - import datetime as DT - - df_original = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - DT.datetime(2013, 1, 1, 13, 0), - DT.datetime(2013, 1, 1, 13, 5), - DT.datetime(2013, 10, 1, 20, 0), - DT.datetime(2013, 10, 2, 10, 0), - DT.datetime(2013, 10, 1, 20, 0), - DT.datetime(2013, 10, 2, 10, 0), - DT.datetime(2013, 12, 2, 12, 0), - DT.datetime(2013, 12, 2, 14, 0), - ] - }).set_index('Date') - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) - - for df in [df_original, 
df_sorted]: - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - DT.datetime(2013, 12, 31, 0, 0), - DT.datetime(2013, 12, 31, 0, 0), - DT.datetime(2013, 12, 31, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - - result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum() - assert_frame_equal(result, expected) - - expected = DataFrame({ - 'Buyer': 'Carl Mark Carl Joe'.split(), - 'Quantity': [1, 3, 9, 18], - 'Date': [ - DT.datetime(2013, 1, 1, 0, 0), - DT.datetime(2013, 1, 1, 0, 0), - DT.datetime(2013, 7, 1, 0, 0), - DT.datetime(2013, 7, 1, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum() - assert_frame_equal(result, expected) - - df_original = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - DT.datetime(2013, 10, 1, 13, 0), - DT.datetime(2013, 10, 1, 13, 5), - DT.datetime(2013, 10, 1, 20, 0), - DT.datetime(2013, 10, 2, 10, 0), - DT.datetime(2013, 10, 1, 20, 0), - DT.datetime(2013, 10, 2, 10, 0), - DT.datetime(2013, 10, 2, 12, 0), - DT.datetime(2013, 10, 2, 14, 0), - ] - }).set_index('Date') - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) - for df in [df_original, df_sorted]: - - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark Carl Joe'.split(), - 'Quantity': [6, 8, 3, 4, 10], - 'Date': [ - DT.datetime(2013, 10, 1, 0, 0), - DT.datetime(2013, 10, 1, 0, 0), - DT.datetime(2013, 10, 1, 0, 0), - DT.datetime(2013, 10, 2, 0, 0), - DT.datetime(2013, 10, 2, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - - result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum() - assert_frame_equal(result, expected) - - result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - DT.datetime(2013, 10, 31, 0, 0), - DT.datetime(2013, 10, 31, 0, 0), - DT.datetime(2013, 10, 31, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - assert_frame_equal(result, expected) - - # passing the name - df = df.reset_index() - result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' - ]).sum() - assert_frame_equal(result, expected) - - with self.assertRaises(KeyError): - df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum() - - # passing the level - df = df.set_index('Date') - result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer' - ]).sum() - assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum( - ) - assert_frame_equal(result, expected) - - with self.assertRaises(ValueError): - df.groupby([pd.Grouper(freq='1M', level='foo'), - 'Buyer']).sum() - - # multi names - df = df.copy() - df['Date'] = df.index + pd.offsets.MonthEnd(2) - result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' - ]).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - DT.datetime(2013, 11, 30, 0, 0), - DT.datetime(2013, 11, 30, 0, 0), - DT.datetime(2013, 11, 30, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - assert_frame_equal(result, expected) - - # error as we have both a level and a name! 
- with self.assertRaises(ValueError): - df.groupby([pd.Grouper(freq='1M', key='Date', - level='Date'), 'Buyer']).sum() - - # single groupers - expected = DataFrame({'Quantity': [31], - 'Date': [DT.datetime(2013, 10, 31, 0, 0) - ]}).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M')).sum() - assert_frame_equal(result, expected) - - result = df.groupby([pd.Grouper(freq='1M')]).sum() - assert_frame_equal(result, expected) - - expected = DataFrame({'Quantity': [31], - 'Date': [DT.datetime(2013, 11, 30, 0, 0) - ]}).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum() - assert_frame_equal(result, expected) - - result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum() - assert_frame_equal(result, expected) - - # GH 6764 multiple grouping with/without sort - df = DataFrame({ - 'date': pd.to_datetime([ - '20121002', '20121007', '20130130', '20130202', '20130305', - '20121002', '20121207', '20130130', '20130202', '20130305', - '20130202', '20130305' - ]), - 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], - 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, - 359, 801], - 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] - }).set_index('date') - - for freq in ['D', 'M', 'A', 'Q-APR']: - expected = df.groupby('user_id')[ - 'whole_cost'].resample( - freq).sum().dropna().reorder_levels( - ['date', 'user_id']).sort_index().astype('int64') - expected.name = 'whole_cost' - - result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), - 'user_id'])['whole_cost'].sum() - assert_series_equal(result1, expected) - - result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])[ - 'whole_cost'].sum() - assert_series_equal(result2, expected) - - def test_timegrouper_get_group(self): - # GH 6914 - - df_original = DataFrame({ - 'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(), - 'Quantity': [18, 3, 5, 1, 9, 3], - 'Date': [datetime(2013, 9, 1, 13, 0), - datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 3, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 9, 2, 14, 0), ] - }) - df_reordered = df_original.sort_values(by='Quantity') - - # single grouping - expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], - df_original.iloc[[4]]] - dt_list = ['2013-09-30', '2013-10-31', '2013-12-31'] - - for df in [df_original, df_reordered]: - grouped = df.groupby(pd.Grouper(freq='M', key='Date')) - for t, expected in zip(dt_list, expected_list): - dt = pd.Timestamp(t) - result = grouped.get_group(dt) - assert_frame_equal(result, expected) - - # multiple grouping - expected_list = [df_original.iloc[[1]], df_original.iloc[[3]], - df_original.iloc[[4]]] - g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'), - ('Joe', '2013-12-31')] - - for df in [df_original, df_reordered]: - grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')]) - for (b, t), expected in zip(g_list, expected_list): - dt = pd.Timestamp(t) - result = grouped.get_group((b, dt)) - assert_frame_equal(result, expected) - - # with index - df_original = df_original.set_index('Date') - df_reordered = df_original.sort_values(by='Quantity') - - expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], - df_original.iloc[[4]]] - - for df in [df_original, df_reordered]: - grouped = df.groupby(pd.Grouper(freq='M')) - for t, expected in zip(dt_list, expected_list): - dt = pd.Timestamp(t) - result = grouped.get_group(dt) - assert_frame_equal(result, expected) - - def test_timegrouper_apply_return_type_series(self): - # Using `apply` 
with the `TimeGrouper` should give the - # same return type as an `apply` with a `Grouper`. - # Issue #11742 - df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], - 'value': [10, 13]}) - df_dt = df.copy() - df_dt['date'] = pd.to_datetime(df_dt['date']) - - def sumfunc_series(x): - return pd.Series([x['value'].sum()], ('sum',)) - - expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series) - result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) - .apply(sumfunc_series)) - assert_frame_equal(result.reset_index(drop=True), - expected.reset_index(drop=True)) - - def test_timegrouper_apply_return_type_value(self): - # Using `apply` with the `TimeGrouper` should give the - # same return type as an `apply` with a `Grouper`. - # Issue #11742 - df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], - 'value': [10, 13]}) - df_dt = df.copy() - df_dt['date'] = pd.to_datetime(df_dt['date']) - - def sumfunc_value(x): - return x.value.sum() - - expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) - result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) - .apply(sumfunc_value)) - assert_series_equal(result.reset_index(drop=True), - expected.reset_index(drop=True)) - def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) g = df.groupby('A') @@ -5326,106 +4044,6 @@ def test_tab_completion(self): 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) self.assertEqual(results, expected) - def test_lexsort_indexer(self): - keys = [[nan] * 5 + list(range(100)) + [nan] * 5] - # orders=True, na_position='last' - result = _lexsort_indexer(keys, orders=True, na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=True, na_position='first' - result = _lexsort_indexer(keys, orders=True, na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='last' - result = _lexsort_indexer(keys, orders=False, na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='first' - result = _lexsort_indexer(keys, orders=False, na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - def test_nargsort(self): - # np.argsort(items) places NaNs last - items = [nan] * 5 + list(range(100)) + [nan] * 5 - # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype='O') - - try: - # GH 2785; due to a regression in NumPy1.6.2 - np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) - np.argsort(items2, kind='mergesort') - except TypeError: - raise nose.SkipTest('requested sort not available for type') - - # mergesort is the most difficult to get right because we want it to be - # stable. 
- - # According to numpy/core/tests/test_multiarray, """The number of - # sorted items must be greater than ~50 to check the actual algorithm - # because quick and merge sort fall over to insertion sort for small - # arrays.""" - - # mergesort, ascending=True, na_position='last' - result = _nargsort(items, kind='mergesort', ascending=True, - na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = _nargsort(items, kind='mergesort', ascending=True, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = _nargsort(items, kind='mergesort', ascending=False, - na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = _nargsort(items, kind='mergesort', ascending=False, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='last' - result = _nargsort(items2, kind='mergesort', ascending=True, - na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = _nargsort(items2, kind='mergesort', ascending=True, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = _nargsort(items2, kind='mergesort', ascending=False, - na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = _nargsort(items2, kind='mergesort', ascending=False, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - def test_datetime_count(self): - df = DataFrame({'a': [1, 2, 3] * 2, - 'dates': pd.date_range('now', periods=6, freq='T')}) - result = df.groupby('a').dates.count() - expected = Series([ - 2, 2, 2 - ], index=Index([1, 2, 3], name='a'), name='dates') - tm.assert_series_equal(result, expected) - def test_lower_int_prec_count(self): df = DataFrame({'a': np.array( [0, 1, 2, 100], np.int8), @@ -5462,179 +4080,6 @@ def __eq__(self, other): list('ab'), name='grp')) tm.assert_frame_equal(result, expected) - def test__cython_agg_general(self): - ops = [('mean', np.mean), - ('median', np.median), - ('var', np.var), - ('add', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), ] - df = DataFrame(np.random.randn(1000)) - labels = np.random.randint(0, 50, size=1000).astype(float) - - for op, targop in ops: - result = df.groupby(labels)._cython_agg_general(op) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - 
exc.args += ('operation: %s' % op, ) - raise - - def test_cython_agg_empty_buckets(self): - ops = [('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), ] - - df = pd.DataFrame([11, 12, 13]) - grps = range(0, 55, 5) - - for op, targop in ops: - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op,) - raise - - def test_cython_group_transform_algos(self): - # GH 4095 - dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, - np.uint64, np.float32, np.float64] - - ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]), - (pd.algos.group_cumsum, np.cumsum, dtypes)] - - is_datetimelike = False - for pd_op, np_op, dtypes in ops: - for dtype in dtypes: - data = np.array([[1], [2], [3], [4]], dtype=dtype) - ans = np.zeros_like(data) - labels = np.array([0, 0, 0, 0], dtype=np.int64) - pd_op(ans, data, labels, is_datetimelike) - self.assert_numpy_array_equal(np_op(data), ans[:, 0], - check_dtype=False) - - # with nans - labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) - - data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') - actual = np.zeros_like(data) - actual.fill(np.nan) - pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike) - expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') - self.assert_numpy_array_equal(actual[:, 0], expected) - - actual = np.zeros_like(data) - actual.fill(np.nan) - pd.algos.group_cumsum(actual, data, labels, is_datetimelike) - expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') - self.assert_numpy_array_equal(actual[:, 0], expected) - - # timedelta - is_datetimelike = True - data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] - actual = np.zeros_like(data, dtype='int64') - pd.algos.group_cumsum(actual, data.view('int64'), labels, - is_datetimelike) - expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( - 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), - np.timedelta64(5, 'ns')]) - self.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) - - def test_cython_transform(self): - # GH 4095 - ops = [(('cumprod', - ()), lambda x: x.cumprod()), (('cumsum', ()), - lambda x: x.cumsum()), - (('shift', (-1, )), - lambda x: x.shift(-1)), (('shift', - (1, )), lambda x: x.shift())] - - s = Series(np.random.randn(1000)) - s_missing = s.copy() - s_missing.iloc[2:10] = np.nan - labels = np.random.randint(0, 50, size=1000).astype(float) - - # series - for (op, args), targop in ops: - for data in [s, s_missing]: - # print(data.head()) - expected = data.groupby(labels).transform(targop) - - tm.assert_series_equal(expected, - data.groupby(labels).transform(op, - *args)) - tm.assert_series_equal(expected, getattr( - data.groupby(labels), op)(*args)) - - strings = list('qwertyuiopasdfghjklz') - strings_missing = strings[:] - strings_missing[5] = np.nan - df = DataFrame({'float': s, - 'float_missing': s_missing, - 'int': [1, 1, 1, 1, 2] * 200, - 'datetime': pd.date_range('1990-1-1', periods=1000), - 'timedelta': pd.timedelta_range(1, freq='s', - periods=1000), - 'string': strings * 50, - 'string_missing': strings_missing * 50}) - df['cat'] = df['string'].astype('category') - - df2 = 
df.copy() - df2.index = pd.MultiIndex.from_product([range(100), range(10)]) - - # DataFrame - Single and MultiIndex, - # group by values, index level, columns - for df in [df, df2]: - for gb_target in [dict(by=labels), dict(level=0), dict(by='string') - ]: # dict(by='string_missing')]: - # dict(by=['int','string'])]: - - gb = df.groupby(**gb_target) - # whitelisted methods set the selection before applying - # bit a of hack to make sure the cythonized shift - # is equivalent to pre 0.17.1 behavior - if op == 'shift': - gb._set_group_selection() - - for (op, args), targop in ops: - if op != 'shift' and 'int' not in gb_target: - # numeric apply fastpath promotes dtype so have - # to apply seperately and concat - i = gb[['int']].apply(targop) - f = gb[['float', 'float_missing']].apply(targop) - expected = pd.concat([f, i], axis=1) - else: - expected = gb.apply(targop) - - expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, - gb.transform(op, *args).sort_index( - axis=1)) - tm.assert_frame_equal(expected, getattr(gb, op)(*args)) - # individual columns - for c in df: - if c not in ['float', 'int', 'float_missing' - ] and op != 'shift': - self.assertRaises(DataError, gb[c].transform, op) - self.assertRaises(DataError, getattr(gb[c], op)) - else: - expected = gb[c].apply(targop) - expected.name = c - tm.assert_series_equal(expected, - gb[c].transform(op, *args)) - tm.assert_series_equal(expected, - getattr(gb[c], op)(*args)) - def test_groupby_cumprod(self): # GH 4095 df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) @@ -5784,27 +4229,6 @@ def test_func(x): tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) - def test_first_last_max_min_on_time_data(self): - # GH 10295 - # Verify that NaT is not in the result of max, min, first and last on - # Dataframe with datetime or timedelta values. - from datetime import timedelta as td - df_test = DataFrame( - {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11', - '2015-07-23 12:12', nan], - 'td': [nan, td(days=1), td(days=2), td(days=3), nan]}) - df_test.dt = pd.to_datetime(df_test.dt) - df_test['group'] = 'A' - df_ref = df_test[df_test.dt.notnull()] - - grouped_test = df_test.groupby('group') - grouped_ref = df_ref.groupby('group') - - assert_frame_equal(grouped_ref.max(), grouped_test.max()) - assert_frame_equal(grouped_ref.min(), grouped_test.min()) - assert_frame_equal(grouped_ref.first(), grouped_test.first()) - assert_frame_equal(grouped_ref.last(), grouped_test.last()) - def test_groupby_preserves_sort(self): # Test to ensure that groupby always preserves sort order of original # object. 
Issue #8588 and #9651 @@ -5854,21 +4278,6 @@ def test_nunique_with_empty_series(self): expected = pd.Series(name='name', dtype='int64') tm.assert_series_equal(result, expected) - def test_transform_with_non_scalar_group(self): - # GH 10165 - cols = pd.MultiIndex.from_tuples([ - ('syn', 'A'), ('mis', 'A'), ('non', 'A'), - ('syn', 'C'), ('mis', 'C'), ('non', 'C'), - ('syn', 'T'), ('mis', 'T'), ('non', 'T'), - ('syn', 'G'), ('mis', 'G'), ('non', 'G')]) - df = pd.DataFrame(np.random.randint(1, 10, (4, 12)), - columns=cols, - index=['A', 'C', 'G', 'T']) - self.assertRaisesRegexp(ValueError, 'transform must return a scalar ' - 'value for each group.*', df.groupby - (axis=1, level=1).transform, - lambda z: z.div(z.sum(axis=1), axis=0)) - def test_numpy_compat(self): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) @@ -5927,23 +4336,6 @@ def test_pivot_table_values_key_error(self): df.reset_index().pivot_table(index='year', columns='month', values='badname', aggfunc='count') - def test_agg_over_numpy_arrays(self): - # GH 3788 - df = pd.DataFrame([[1, np.array([10, 20, 30])], - [1, np.array([40, 50, 60])], - [2, np.array([20, 30, 40])]], - columns=['category', 'arraydata']) - result = df.groupby('category').agg(sum) - - expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] - expected_index = pd.Index([1, 2], name='category') - expected_column = ['arraydata'] - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_column) - - assert_frame_equal(result, expected) - def test_cummin_cummax(self): # GH 15048 num_types = [np.int32, np.int64, np.float32, np.float64] @@ -6024,10 +4416,6 @@ def test_cummin_cummax(self): tm.assert_frame_equal(expected, result) -def assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() - - def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) tups = com._asarray_tuplesafe(tups) diff --git a/pandas/tests/groupby/test_misc.py b/pandas/tests/groupby/test_misc.py new file mode 100644 index 0000000000000..c9d8ad4231cfb --- /dev/null +++ b/pandas/tests/groupby/test_misc.py @@ -0,0 +1,101 @@ +""" misc non-groupby routines, as they are defined in core/groupby.py """ + +import nose +import numpy as np +from numpy import nan +from pandas.util import testing as tm +from pandas.core.groupby import _nargsort, _lexsort_indexer + + +class TestSorting(tm.TestCase): + + def test_lexsort_indexer(self): + keys = [[nan] * 5 + list(range(100)) + [nan] * 5] + # orders=True, na_position='last' + result = _lexsort_indexer(keys, orders=True, na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + # orders=True, na_position='first' + result = _lexsort_indexer(keys, orders=True, na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + # orders=False, na_position='last' + result = _lexsort_indexer(keys, orders=False, na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + # orders=False, na_position='first' + result = _lexsort_indexer(keys, orders=False, na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + def test_nargsort(self): + # np.argsort(items) places NaNs 
last + items = [nan] * 5 + list(range(100)) + [nan] * 5 + # np.argsort(items2) may not place NaNs first + items2 = np.array(items, dtype='O') + + try: + # GH 2785; due to a regression in NumPy1.6.2 + np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) + np.argsort(items2, kind='mergesort') + except TypeError: + raise nose.SkipTest('requested sort not available for type') + + # mergesort is the most difficult to get right because we want it to be + # stable. + + # According to numpy/core/tests/test_multiarray, """The number of + # sorted items must be greater than ~50 to check the actual algorithm + # because quick and merge sort fall over to insertion sort for small + # arrays.""" + + # mergesort, ascending=True, na_position='last' + result = _nargsort(items, kind='mergesort', ascending=True, + na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = _nargsort(items, kind='mergesort', ascending=True, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = _nargsort(items, kind='mergesort', ascending=False, + na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = _nargsort(items, kind='mergesort', ascending=False, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='last' + result = _nargsort(items2, kind='mergesort', ascending=True, + na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = _nargsort(items2, kind='mergesort', ascending=True, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = _nargsort(items2, kind='mergesort', ascending=False, + na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = _nargsort(items2, kind='mergesort', ascending=False, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py new file mode 100644 index 0000000000000..3142b74b56778 --- /dev/null +++ b/pandas/tests/groupby/test_timegrouper.py @@ -0,0 +1,609 @@ +""" test with the TimeGrouper / grouping with datetimes """ + +from datetime import datetime +import numpy as np +from numpy import nan + +import pandas as pd +from pandas import DataFrame, date_range, Index, Series, MultiIndex, Timestamp +from pandas.compat import StringIO +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal, 
assert_series_equal
+
+
+class TestGroupBy(tm.TestCase):
+
+    def test_groupby_with_timegrouper(self):
+        # GH 4161
+        # TimeGrouper requires a sorted index
+        # also verifies that the resultant index has the correct name
+        df_original = DataFrame({
+            'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
+            'Quantity': [18, 3, 5, 1, 9, 3],
+            'Date': [
+                datetime(2013, 9, 1, 13, 0),
+                datetime(2013, 9, 1, 13, 5),
+                datetime(2013, 10, 1, 20, 0),
+                datetime(2013, 10, 3, 10, 0),
+                datetime(2013, 12, 2, 12, 0),
+                datetime(2013, 9, 2, 14, 0),
+            ]
+        })
+
+        # GH 6908 change target column's order
+        df_reordered = df_original.sort_values(by='Quantity')
+
+        for df in [df_original, df_reordered]:
+            df = df.set_index(['Date'])
+
+            expected = DataFrame(
+                {'Quantity': np.nan},
+                index=date_range('20130901 13:00:00',
+                                 '20131205 13:00:00', freq='5D',
+                                 name='Date', closed='left'))
+            expected.iloc[[0, 6, 18], 0] = np.array(
+                [24., 6., 9.], dtype='float64')
+
+            result1 = df.resample('5D').sum()
+            assert_frame_equal(result1, expected)
+
+            df_sorted = df.sort_index()
+            result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
+            assert_frame_equal(result2, expected)
+
+            result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
+            assert_frame_equal(result3, expected)
+
+    def test_groupby_with_timegrouper_methods(self):
+        # GH 3881
+        # make sure API of timegrouper conforms
+
+        df_original = pd.DataFrame({
+            'Branch': 'A A A A A B'.split(),
+            'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
+            'Quantity': [1, 3, 5, 8, 9, 3],
+            'Date': [
+                datetime(2013, 1, 1, 13, 0),
+                datetime(2013, 1, 1, 13, 5),
+                datetime(2013, 10, 1, 20, 0),
+                datetime(2013, 10, 2, 10, 0),
+                datetime(2013, 12, 2, 12, 0),
+                datetime(2013, 12, 2, 14, 0),
+            ]
+        })
+
+        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
+
+        for df in [df_original, df_sorted]:
+            df = df.set_index('Date', drop=False)
+            g = df.groupby(pd.TimeGrouper('6M'))
+            self.assertTrue(g.group_keys)
+            self.assertTrue(isinstance(g.grouper, pd.core.groupby.BinGrouper))
+            groups = g.groups
+            self.assertTrue(isinstance(groups, dict))
+            self.assertTrue(len(groups) == 3)
+
+    def test_timegrouper_with_reg_groups(self):
+
+        # GH 3794
+        # allow combination of timegrouper/reg groups
+
+        df_original = DataFrame({
+            'Branch': 'A A A A A A A B'.split(),
+            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
+            'Date': [
+                datetime(2013, 1, 1, 13, 0),
+                datetime(2013, 1, 1, 13, 5),
+                datetime(2013, 10, 1, 20, 0),
+                datetime(2013, 10, 2, 10, 0),
+                datetime(2013, 10, 1, 20, 0),
+                datetime(2013, 10, 2, 10, 0),
+                datetime(2013, 12, 2, 12, 0),
+                datetime(2013, 12, 2, 14, 0),
+            ]
+        }).set_index('Date')
+
+        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
+
+        for df in [df_original, df_sorted]:
+            expected = DataFrame({
+                'Buyer': 'Carl Joe Mark'.split(),
+                'Quantity': [10, 18, 3],
+                'Date': [
+                    datetime(2013, 12, 31, 0, 0),
+                    datetime(2013, 12, 31, 0, 0),
+                    datetime(2013, 12, 31, 0, 0),
+                ]
+            }).set_index(['Date', 'Buyer'])
+
+            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
+            assert_frame_equal(result, expected)
+
+            expected = DataFrame({
+                'Buyer': 'Carl Mark Carl Joe'.split(),
+                'Quantity': [1, 3, 9, 18],
+                'Date': [
+                    datetime(2013, 1, 1, 0, 0),
+                    datetime(2013, 1, 1, 0, 0),
+                    datetime(2013, 7, 1, 0, 0),
+                    datetime(2013, 7, 1, 0, 0),
+                ]
+            }).set_index(['Date', 'Buyer'])
+            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
+            assert_frame_equal(result, expected)
+
+        df_original = DataFrame({
+            'Branch': 'A
A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], + 'Date': [ + datetime(2013, 10, 1, 13, 0), + datetime(2013, 10, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 10, 2, 14, 0), + ] + }).set_index('Date') + + df_sorted = df_original.sort_values(by='Quantity', ascending=False) + for df in [df_original, df_sorted]: + + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark Carl Joe'.split(), + 'Quantity': [6, 8, 3, 4, 10], + 'Date': [ + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 2, 0, 0), + datetime(2013, 10, 2, 0, 0), + ] + }).set_index(['Date', 'Buyer']) + + result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10, 18, 3], + 'Date': [ + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + ] + }).set_index(['Date', 'Buyer']) + assert_frame_equal(result, expected) + + # passing the name + df = df.reset_index() + result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' + ]).sum() + assert_frame_equal(result, expected) + + with self.assertRaises(KeyError): + df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum() + + # passing the level + df = df.set_index('Date') + result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer' + ]).sum() + assert_frame_equal(result, expected) + result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum( + ) + assert_frame_equal(result, expected) + + with self.assertRaises(ValueError): + df.groupby([pd.Grouper(freq='1M', level='foo'), + 'Buyer']).sum() + + # multi names + df = df.copy() + df['Date'] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' + ]).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10, 18, 3], + 'Date': [ + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + ] + }).set_index(['Date', 'Buyer']) + assert_frame_equal(result, expected) + + # error as we have both a level and a name! 
+ with self.assertRaises(ValueError): + df.groupby([pd.Grouper(freq='1M', key='Date', + level='Date'), 'Buyer']).sum() + + # single groupers + expected = DataFrame({'Quantity': [31], + 'Date': [datetime(2013, 10, 31, 0, 0) + ]}).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M')).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M')]).sum() + assert_frame_equal(result, expected) + + expected = DataFrame({'Quantity': [31], + 'Date': [datetime(2013, 11, 30, 0, 0) + ]}).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum() + assert_frame_equal(result, expected) + + # GH 6764 multiple grouping with/without sort + df = DataFrame({ + 'date': pd.to_datetime([ + '20121002', '20121007', '20130130', '20130202', '20130305', + '20121002', '20121207', '20130130', '20130202', '20130305', + '20130202', '20130305' + ]), + 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, + 359, 801], + 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] + }).set_index('date') + + for freq in ['D', 'M', 'A', 'Q-APR']: + expected = df.groupby('user_id')[ + 'whole_cost'].resample( + freq).sum().dropna().reorder_levels( + ['date', 'user_id']).sort_index().astype('int64') + expected.name = 'whole_cost' + + result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), + 'user_id'])['whole_cost'].sum() + assert_series_equal(result1, expected) + + result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])[ + 'whole_cost'].sum() + assert_series_equal(result2, expected) + + def test_timegrouper_get_group(self): + # GH 6914 + + df_original = DataFrame({ + 'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(), + 'Quantity': [18, 3, 5, 1, 9, 3], + 'Date': [datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), ] + }) + df_reordered = df_original.sort_values(by='Quantity') + + # single grouping + expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], + df_original.iloc[[4]]] + dt_list = ['2013-09-30', '2013-10-31', '2013-12-31'] + + for df in [df_original, df_reordered]: + grouped = df.groupby(pd.Grouper(freq='M', key='Date')) + for t, expected in zip(dt_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group(dt) + assert_frame_equal(result, expected) + + # multiple grouping + expected_list = [df_original.iloc[[1]], df_original.iloc[[3]], + df_original.iloc[[4]]] + g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'), + ('Joe', '2013-12-31')] + + for df in [df_original, df_reordered]: + grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')]) + for (b, t), expected in zip(g_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group((b, dt)) + assert_frame_equal(result, expected) + + # with index + df_original = df_original.set_index('Date') + df_reordered = df_original.sort_values(by='Quantity') + + expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], + df_original.iloc[[4]]] + + for df in [df_original, df_reordered]: + grouped = df.groupby(pd.Grouper(freq='M')) + for t, expected in zip(dt_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group(dt) + assert_frame_equal(result, expected) + + def test_timegrouper_apply_return_type_series(self): + # Using `apply` with the 
`TimeGrouper` should give the + # same return type as an `apply` with a `Grouper`. + # Issue #11742 + df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], + 'value': [10, 13]}) + df_dt = df.copy() + df_dt['date'] = pd.to_datetime(df_dt['date']) + + def sumfunc_series(x): + return pd.Series([x['value'].sum()], ('sum',)) + + expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series) + result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) + .apply(sumfunc_series)) + assert_frame_equal(result.reset_index(drop=True), + expected.reset_index(drop=True)) + + def test_timegrouper_apply_return_type_value(self): + # Using `apply` with the `TimeGrouper` should give the + # same return type as an `apply` with a `Grouper`. + # Issue #11742 + df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], + 'value': [10, 13]}) + df_dt = df.copy() + df_dt['date'] = pd.to_datetime(df_dt['date']) + + def sumfunc_value(x): + return x.value.sum() + + expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) + result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) + .apply(sumfunc_value)) + assert_series_equal(result.reset_index(drop=True), + expected.reset_index(drop=True)) + + def test_groupby_groups_datetimeindex(self): + # #1430 + from pandas.tseries.api import DatetimeIndex + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange(periods), + 'low': np.arange(periods)}, index=ind) + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + + # it works! + groups = grouped.groups + tm.assertIsInstance(list(groups.keys())[0], datetime) + + # GH 11442 + index = pd.date_range('2015/01/01', periods=5, name='date') + df = pd.DataFrame({'A': [5, 6, 7, 8, 9], + 'B': [1, 2, 3, 4, 5]}, index=index) + result = df.groupby(level='date').groups + dates = ['2015-01-05', '2015-01-04', '2015-01-03', + '2015-01-02', '2015-01-01'] + expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date') + for date in dates} + tm.assert_dict_equal(result, expected) + + grouped = df.groupby(level='date') + for date in dates: + result = grouped.get_group(date) + data = [[df.loc[date, 'A'], df.loc[date, 'B']]] + expected_index = pd.DatetimeIndex([date], name='date') + expected = pd.DataFrame(data, + columns=list('AB'), + index=expected_index) + tm.assert_frame_equal(result, expected) + + def test_groupby_groups_datetimeindex_tz(self): + # GH 3950 + dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 09:00:00'] + df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], + 'datetime': dates, + 'value1': np.arange(6, dtype='int64'), + 'value2': [1, 2] * 3}) + df['datetime'] = df['datetime'].apply( + lambda d: Timestamp(d, tz='US/Pacific')) + + exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', + '2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00', + '2011-07-19 09:00:00'], + tz='US/Pacific', name='datetime') + exp_idx2 = Index(['a', 'b'] * 3, name='label') + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], + 'value2': [1, 2, 2, 1, 1, 2]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(['datetime', 'label']).sum() + assert_frame_equal(result, expected) + + # by level + didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo') + df = DataFrame({'value1': np.arange(6, dtype='int64'), + 'value2': [1, 2, 3, 1, 2, 3]}, + index=didx) 
+ + exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], tz='Asia/Tokyo') + expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(level=0).sum() + assert_frame_equal(result, expected) + + def test_frame_datetime64_handling_groupby(self): + # it works! + df = DataFrame([(3, np.datetime64('2012-07-03')), + (3, np.datetime64('2012-07-04'))], + columns=['a', 'date']) + result = df.groupby('a').first() + self.assertEqual(result['date'][3], Timestamp('2012-07-03')) + + def test_groupby_multi_timezone(self): + + # combining multiple / different timezones yields UTC + + data = """0,2000-01-28 16:47:00,America/Chicago +1,2000-01-29 16:48:00,America/Chicago +2,2000-01-30 16:49:00,America/Los_Angeles +3,2000-01-31 16:50:00,America/Chicago +4,2000-01-01 16:50:00,America/New_York""" + + df = pd.read_csv(StringIO(data), header=None, + names=['value', 'date', 'tz']) + result = df.groupby('tz').date.apply( + lambda x: pd.to_datetime(x).dt.tz_localize(x.name)) + + expected = Series([Timestamp('2000-01-28 16:47:00-0600', + tz='America/Chicago'), + Timestamp('2000-01-29 16:48:00-0600', + tz='America/Chicago'), + Timestamp('2000-01-30 16:49:00-0800', + tz='America/Los_Angeles'), + Timestamp('2000-01-31 16:50:00-0600', + tz='America/Chicago'), + Timestamp('2000-01-01 16:50:00-0500', + tz='America/New_York')], + name='date', + dtype=object) + assert_series_equal(result, expected) + + tz = 'America/Chicago' + res_values = df.groupby('tz').date.get_group(tz) + result = pd.to_datetime(res_values).dt.tz_localize(tz) + exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00', + '2000-01-31 16:50:00'], + index=[0, 1, 3], name='date') + expected = pd.to_datetime(exp_values).dt.tz_localize(tz) + assert_series_equal(result, expected) + + def test_groupby_groups_periods(self): + dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 09:00:00'] + df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], + 'period': [pd.Period(d, freq='H') for d in dates], + 'value1': np.arange(6, dtype='int64'), + 'value2': [1, 2] * 3}) + + exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00', + '2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00', + '2011-07-19 09:00:00'], + freq='H', name='period') + exp_idx2 = Index(['a', 'b'] * 3, name='label') + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], + 'value2': [1, 2, 2, 1, 1, 2]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(['period', 'label']).sum() + assert_frame_equal(result, expected) + + # by level + didx = pd.PeriodIndex(dates, freq='H') + df = DataFrame({'value1': np.arange(6, dtype='int64'), + 'value2': [1, 2, 3, 1, 2, 3]}, + index=didx) + + exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], freq='H') + expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(level=0).sum() + assert_frame_equal(result, expected) + + def test_groupby_first_datetime64(self): + df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) + df[1] = df[1].view('M8[ns]') + + self.assertTrue(issubclass(df[1].dtype.type, np.datetime64)) + + result = df.groupby(level=0).first() + got_dt = result[1].dtype + 
self.assertTrue(issubclass(got_dt.type, np.datetime64)) + + result = df[1].groupby(level=0).first() + got_dt = result.dtype + self.assertTrue(issubclass(got_dt.type, np.datetime64)) + + def test_groupby_max_datetime64(self): + # GH 5869 + # datetimelike dtype conversion from int + df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) + expected = df.groupby('A')['A'].apply(lambda x: x.max()) + result = df.groupby('A')['A'].max() + assert_series_equal(result, expected) + + def test_groupby_datetime64_32_bit(self): + # GH 6410 / numpy 4328 + # 32-bit under 1.9-dev indexing issue + + df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2}) + result = df.groupby("A")["B"].transform(min) + expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') + assert_series_equal(result, expected) + + def test_groupby_with_timezone_selection(self): + # GH 11616 + # Test that column selection returns output in correct timezone. + np.random.seed(42) + df = pd.DataFrame({ + 'factor': np.random.randint(0, 3, size=60), + 'time': pd.date_range('01/01/2000 00:00', periods=60, + freq='s', tz='UTC') + }) + df1 = df.groupby('factor').max()['time'] + df2 = df.groupby('factor')['time'].max() + tm.assert_series_equal(df1, df2) + + def test_timezone_info(self): + # GH 11682 + # Timezone info lost when broadcasting scalar datetime to DataFrame + tm._skip_if_no_pytz() + import pytz + + df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]}) + self.assertEqual(df['b'][0].tzinfo, pytz.utc) + df = pd.DataFrame({'a': [1, 2, 3]}) + df['b'] = datetime.now(pytz.utc) + self.assertEqual(df['b'][0].tzinfo, pytz.utc) + + def test_datetime_count(self): + df = DataFrame({'a': [1, 2, 3] * 2, + 'dates': pd.date_range('now', periods=6, freq='T')}) + result = df.groupby('a').dates.count() + expected = Series([ + 2, 2, 2 + ], index=Index([1, 2, 3], name='a'), name='dates') + tm.assert_series_equal(result, expected) + + def test_first_last_max_min_on_time_data(self): + # GH 10295 + # Verify that NaT is not in the result of max, min, first and last on + # Dataframe with datetime or timedelta values. 
+ from datetime import timedelta as td + df_test = DataFrame( + {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11', + '2015-07-23 12:12', nan], + 'td': [nan, td(days=1), td(days=2), td(days=3), nan]}) + df_test.dt = pd.to_datetime(df_test.dt) + df_test['group'] = 'A' + df_ref = df_test[df_test.dt.notnull()] + + grouped_test = df_test.groupby('group') + grouped_ref = df_ref.groupby('group') + + assert_frame_equal(grouped_ref.max(), grouped_test.max()) + assert_frame_equal(grouped_ref.min(), grouped_test.min()) + assert_frame_equal(grouped_ref.first(), grouped_test.first()) + assert_frame_equal(grouped_ref.last(), grouped_test.last()) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py new file mode 100644 index 0000000000000..cf5e9eb26ff13 --- /dev/null +++ b/pandas/tests/groupby/test_transform.py @@ -0,0 +1,494 @@ +""" test with the .transform """ + +import numpy as np +import pandas as pd +from pandas.util import testing as tm +from pandas import Series, DataFrame, Timestamp, MultiIndex, concat +from pandas.types.common import _ensure_platform_int +from .common import MixIn, assert_fp_equal + +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.core.groupby import DataError +from pandas.core.config import option_context + + +class TestGroupBy(MixIn, tm.TestCase): + + def test_transform(self): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + transformed = grouped.transform(lambda x: x * x.sum()) + self.assertEqual(transformed[7], 12) + + # GH 8046 + # make sure that we preserve the input order + + df = DataFrame( + np.arange(6, dtype='int64').reshape( + 3, 2), columns=["a", "b"], index=[0, 2, 1]) + key = [0, 0, 1] + expected = df.sort_index().groupby(key).transform( + lambda x: x - x.mean()).groupby(key).mean() + result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( + key).mean() + assert_frame_equal(result, expected) + + def demean(arr): + return arr - arr.mean() + + people = DataFrame(np.random.randn(5, 5), + columns=['a', 'b', 'c', 'd', 'e'], + index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) + key = ['one', 'two', 'one', 'two', 'one'] + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = people.groupby(key).apply(demean).groupby(key).mean() + assert_frame_equal(result, expected) + + # GH 8430 + df = tm.makeTimeDataFrame() + g = df.groupby(pd.TimeGrouper('M')) + g.transform(lambda x: x - 1) + + # GH 9700 + df = DataFrame({'a': range(5, 10), 'b': range(5)}) + result = df.groupby('a').transform(max) + expected = DataFrame({'b': range(5)}) + tm.assert_frame_equal(result, expected) + + def test_transform_fast(self): + + df = DataFrame({'id': np.arange(100000) / 3, + 'val': np.random.randn(100000)}) + + grp = df.groupby('id')['val'] + + values = np.repeat(grp.mean().values, + _ensure_platform_int(grp.count().values)) + expected = pd.Series(values, index=df.index, name='val') + + result = grp.transform(np.mean) + assert_series_equal(result, expected) + + result = grp.transform('mean') + assert_series_equal(result, expected) + + # GH 12737 + df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], + 'd': pd.date_range('2014-1-1', '2014-1-4'), + 'i': [1, 2, 3, 4]}, + columns=['grouping', 'f', 'i', 'd']) + result = df.groupby('grouping').transform('first') + + dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), + 
pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] + expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], + 'd': dates, + 'i': [1, 2, 2, 4]}, + columns=['f', 'i', 'd']) + assert_frame_equal(result, expected) + + # selection + result = df.groupby('grouping')[['f', 'i']].transform('first') + expected = expected[['f', 'i']] + assert_frame_equal(result, expected) + + # dup columns + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) + result = df.groupby('g').transform('first') + expected = df.drop('g', axis=1) + assert_frame_equal(result, expected) + + def test_transform_broadcast(self): + grouped = self.ts.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + + self.assert_index_equal(result.index, self.ts.index) + for _, gp in grouped: + assert_fp_equal(result.reindex(gp.index), gp.mean()) + + grouped = self.tsframe.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + self.assert_index_equal(result.index, self.tsframe.index) + for _, gp in grouped: + agged = gp.mean() + res = result.reindex(gp.index) + for col in self.tsframe: + assert_fp_equal(res[col], agged[col]) + + # group columns + grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, + axis=1) + result = grouped.transform(np.mean) + self.assert_index_equal(result.index, self.tsframe.index) + self.assert_index_equal(result.columns, self.tsframe.columns) + for _, gp in grouped: + agged = gp.mean(1) + res = result.reindex(columns=gp.columns) + for idx in gp.index: + assert_fp_equal(res.xs(idx), agged[idx]) + + def test_transform_axis(self): + + # make sure that we are setting the axes + # correctly when on axis=0 or 1 + # in the presence of a non-monotonic indexer + # GH12713 + + base = self.tsframe.iloc[0:5] + r = len(base.index) + c = len(base.columns) + tso = DataFrame(np.random.randn(r, c), + index=base.index, + columns=base.columns, + dtype='float64') + # monotonic + ts = tso + grouped = ts.groupby(lambda x: x.weekday()) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: x - x.mean()) + assert_frame_equal(result, expected) + + ts = ts.T + grouped = ts.groupby(lambda x: x.weekday(), axis=1) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + assert_frame_equal(result, expected) + + # non-monotonic + ts = tso.iloc[[1, 0] + list(range(2, len(base)))] + grouped = ts.groupby(lambda x: x.weekday()) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: x - x.mean()) + assert_frame_equal(result, expected) + + ts = ts.T + grouped = ts.groupby(lambda x: x.weekday(), axis=1) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + assert_frame_equal(result, expected) + + def test_transform_dtype(self): + # GH 9807 + # Check transform dtype output is preserved + df = DataFrame([[1, 3], [2, 3]]) + result = df.groupby(1).transform('mean') + expected = DataFrame([[1.5], [1.5]]) + assert_frame_equal(result, expected) + + def test_transform_bug(self): + # GH 5712 + # transforming on a datetime column + df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) + result = df.groupby('A')['B'].transform( + lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5, 0, step=-1), name='B') + assert_series_equal(result, expected) + + def test_transform_multiple(self): + grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) + + grouped.transform(lambda x: x * 2) + grouped.transform(np.mean) + + def test_dispatch_transform(self): + df 
= self.tsframe[::5].reindex(self.tsframe.index) + + grouped = df.groupby(lambda x: x.month) + + filled = grouped.fillna(method='pad') + fillit = lambda x: x.fillna(method='pad') + expected = df.groupby(lambda x: x.month).transform(fillit) + assert_frame_equal(filled, expected) + + def test_transform_select_columns(self): + f = lambda x: x.mean() + result = self.df.groupby('A')['C', 'D'].transform(f) + + selection = self.df[['C', 'D']] + expected = selection.groupby(self.df['A']).transform(f) + + assert_frame_equal(result, expected) + + def test_transform_exclude_nuisance(self): + + # this also tests orderings in transform between + # series/frame to make sure it's consistent + expected = {} + grouped = self.df.groupby('A') + expected['C'] = grouped['C'].transform(np.mean) + expected['D'] = grouped['D'].transform(np.mean) + expected = DataFrame(expected) + result = self.df.groupby('A').transform(np.mean) + + assert_frame_equal(result, expected) + + def test_transform_function_aliases(self): + result = self.df.groupby('A').transform('mean') + expected = self.df.groupby('A').transform(np.mean) + assert_frame_equal(result, expected) + + result = self.df.groupby('A')['C'].transform('mean') + expected = self.df.groupby('A')['C'].transform(np.mean) + assert_series_equal(result, expected) + + def test_series_fast_transform_date(self): + # GH 13191 + df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], + 'd': pd.date_range('2014-1-1', '2014-1-4')}) + result = df.groupby('grouping')['d'].transform('first') + dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-4')] + expected = pd.Series(dates, name='d') + assert_series_equal(result, expected) + + def test_transform_length(self): + # GH 9697 + df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) + expected = pd.Series([3.0] * 4) + + def nsum(x): + return np.nansum(x) + + results = [df.groupby('col1').transform(sum)['col2'], + df.groupby('col1')['col2'].transform(sum), + df.groupby('col1').transform(nsum)['col2'], + df.groupby('col1')['col2'].transform(nsum)] + for result in results: + assert_series_equal(result, expected, check_names=False) + + def test_transform_coercion(self): + + # 14457 + # when we are transforming be sure to not coerce + # via assignment + df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) + g = df.groupby('A') + + expected = g.transform(np.mean) + result = g.transform(lambda x: np.mean(x)) + assert_frame_equal(result, expected) + + def test_groupby_transform_with_int(self): + + # GH 3740, make sure that we might upcast on item-by-item transform + + # floats + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'), + C=Series( + [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) + with np.errstate(all='ignore'): + result = df.groupby('A').transform( + lambda x: (x - x.mean()) / x.std()) + expected = DataFrame(dict(B=np.nan, C=Series( + [-1, 0, 1, -1, 0, 1], dtype='float64'))) + assert_frame_equal(result, expected) + + # int case + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, + C=[1, 2, 3, 1, 2, 3], D='foo')) + with np.errstate(all='ignore'): + result = df.groupby('A').transform( + lambda x: (x - x.mean()) / x.std()) + expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) + assert_frame_equal(result, expected) + + # int that needs float conversion + s = Series([2, 3, 4, 10, 5, -1]) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo')) + with np.errstate(all='ignore'): + result = df.groupby('A').transform( + lambda x: (x - x.mean()) / x.std()) + + s1 
= s.iloc[0:3] + s1 = (s1 - s1.mean()) / s1.std() + s2 = s.iloc[3:6] + s2 = (s2 - s2.mean()) / s2.std() + expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) + assert_frame_equal(result, expected) + + # int downcasting + result = df.groupby('A').transform(lambda x: x * 2 / 2) + expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) + assert_frame_equal(result, expected) + + def test_groupby_transform_with_nan_group(self): + # GH 9941 + df = pd.DataFrame({'a': range(10), + 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + result = df.groupby(df.b)['a'].transform(max) + expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], + name='a') + assert_series_equal(result, expected) + + def test_transform_mixed_type(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] + ]) + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}, index=index) + + def f(group): + group['g'] = group['d'] * 2 + return group[:1] + + grouped = df.groupby('c') + result = grouped.apply(f) + + self.assertEqual(result['d'].dtype, np.float64) + + # this is by definition a mutating operation! + with option_context('mode.chained_assignment', None): + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.loc[key]) + + def test_cython_group_transform_algos(self): + # GH 4095 + dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, + np.uint64, np.float32, np.float64] + + ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]), + (pd.algos.group_cumsum, np.cumsum, dtypes)] + + is_datetimelike = False + for pd_op, np_op, dtypes in ops: + for dtype in dtypes: + data = np.array([[1], [2], [3], [4]], dtype=dtype) + ans = np.zeros_like(data) + labels = np.array([0, 0, 0, 0], dtype=np.int64) + pd_op(ans, data, labels, is_datetimelike) + self.assert_numpy_array_equal(np_op(data), ans[:, 0], + check_dtype=False) + + # with nans + labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) + + data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') + actual = np.zeros_like(data) + actual.fill(np.nan) + pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') + self.assert_numpy_array_equal(actual[:, 0], expected) + + actual = np.zeros_like(data) + actual.fill(np.nan) + pd.algos.group_cumsum(actual, data, labels, is_datetimelike) + expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') + self.assert_numpy_array_equal(actual[:, 0], expected) + + # timedelta + is_datetimelike = True + data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] + actual = np.zeros_like(data, dtype='int64') + pd.algos.group_cumsum(actual, data.view('int64'), labels, + is_datetimelike) + expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( + 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), + np.timedelta64(5, 'ns')]) + self.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) + + def test_cython_transform(self): + # GH 4095 + ops = [(('cumprod', + ()), lambda x: x.cumprod()), (('cumsum', ()), + lambda x: x.cumsum()), + (('shift', (-1, )), + lambda x: x.shift(-1)), (('shift', + (1, )), lambda x: x.shift())] + + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.randint(0, 50, size=1000).astype(float) + + # series + for (op, args), targop in ops: + for data in [s, s_missing]: + # print(data.head()) + expected = 
data.groupby(labels).transform(targop)
+
+                tm.assert_series_equal(expected,
+                                       data.groupby(labels).transform(op,
+                                                                      *args))
+                tm.assert_series_equal(expected, getattr(
+                    data.groupby(labels), op)(*args))
+
+        strings = list('qwertyuiopasdfghjklz')
+        strings_missing = strings[:]
+        strings_missing[5] = np.nan
+        df = DataFrame({'float': s,
+                        'float_missing': s_missing,
+                        'int': [1, 1, 1, 1, 2] * 200,
+                        'datetime': pd.date_range('1990-1-1', periods=1000),
+                        'timedelta': pd.timedelta_range(1, freq='s',
+                                                        periods=1000),
+                        'string': strings * 50,
+                        'string_missing': strings_missing * 50})
+        df['cat'] = df['string'].astype('category')
+
+        df2 = df.copy()
+        df2.index = pd.MultiIndex.from_product([range(100), range(10)])
+
+        # DataFrame - Single and MultiIndex,
+        # group by values, index level, columns
+        for df in [df, df2]:
+            for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
+                              ]:  # dict(by='string_missing')]:
+                # dict(by=['int','string'])]:
+
+                gb = df.groupby(**gb_target)
+                # whitelisted methods set the selection before applying
+                # a bit of a hack to make sure the cythonized shift
+                # is equivalent to pre 0.17.1 behavior
+                if op == 'shift':
+                    gb._set_group_selection()
+
+                for (op, args), targop in ops:
+                    if op != 'shift' and 'int' not in gb_target:
+                        # numeric apply fastpath promotes dtype so have
+                        # to apply separately and concat
+                        i = gb[['int']].apply(targop)
+                        f = gb[['float', 'float_missing']].apply(targop)
+                        expected = pd.concat([f, i], axis=1)
+                    else:
+                        expected = gb.apply(targop)
+
+                    expected = expected.sort_index(axis=1)
+                    tm.assert_frame_equal(expected,
+                                          gb.transform(op, *args).sort_index(
+                                              axis=1))
+                    tm.assert_frame_equal(expected, getattr(gb, op)(*args))
+                    # individual columns
+                    for c in df:
+                        if c not in ['float', 'int', 'float_missing'
+                                     ] and op != 'shift':
+                            self.assertRaises(DataError, gb[c].transform, op)
+                            self.assertRaises(DataError, getattr(gb[c], op))
+                        else:
+                            expected = gb[c].apply(targop)
+                            expected.name = c
+                            tm.assert_series_equal(expected,
+                                                   gb[c].transform(op, *args))
+                            tm.assert_series_equal(expected,
+                                                   getattr(gb[c], op)(*args))
+
+    def test_transform_with_non_scalar_group(self):
+        # GH 10165
+        cols = pd.MultiIndex.from_tuples([
+            ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
+            ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
+            ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
+            ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
+        df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
+                          columns=cols,
+                          index=['A', 'C', 'G', 'T'])
+        self.assertRaisesRegexp(ValueError, 'transform must return a scalar '
+                                'value for each group.*', df.groupby
+                                (axis=1, level=1).transform,
+                                lambda z: z.div(z.sum(axis=1), axis=0))

From d20c45f67da97c1a7dbeb308478efd596cf94a68 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 8 Feb 2017 08:04:01 -0500
Subject: [PATCH 110/338] TST: mark gbq streaming insert tests as slow

---
 ci/requirements-3.4_SLOW.pip | 3 +++
 pandas/io/tests/test_gbq.py  | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 ci/requirements-3.4_SLOW.pip

diff --git a/ci/requirements-3.4_SLOW.pip b/ci/requirements-3.4_SLOW.pip
new file mode 100644
index 0000000000000..05c938abcbab6
--- /dev/null
+++ b/ci/requirements-3.4_SLOW.pip
@@ -0,0 +1,3 @@
+httplib2
+google-api-python-client
+oauth2client
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 0507f0d89661c..457e2d218cb33 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -938,6 +938,7 @@ def test_upload_data_if_table_exists_replace(self):
private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], 5) + @tm.slow def test_google_upload_errors_should_raise_exception(self): destination_table = DESTINATION_TABLE + "5" From 5d7a777a04bc3536e787c7a3118cad882bcce46d Mon Sep 17 00:00:00 2001 From: Kernc Date: Wed, 8 Feb 2017 09:30:47 -0500 Subject: [PATCH 111/338] ENH: .squeeze has gained the axis parameter closes #15339 Author: Kernc Closes #15335 from kernc/squeeze_axis_param and squashes the following commits: 44d3c54 [Kernc] fixup! ENH: .squeeze accepts axis parameter cc018c9 [Kernc] ENH: .squeeze accepts axis parameter --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/compat/numpy/function.py | 7 ------- pandas/core/generic.py | 24 +++++++++++++++++++----- pandas/tests/test_generic.py | 18 ++++++++++++++---- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3f6c06e20b546..9afcf85c929a7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -149,6 +149,7 @@ Other enhancements - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) +- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 72e89586d0280..eb9e9ecc359b2 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -214,13 +214,6 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method='kwargs') -# Currently, numpy (v1.11) has backwards compatibility checks -# in place so that this 'kwargs' parameter is technically -# unnecessary, but in the long-run, this will be needed. -SQUEEZE_DEFAULTS = dict(axis=None) -validate_squeeze = CompatValidator(SQUEEZE_DEFAULTS, fname='squeeze', - method='kwargs') - TAKE_DEFAULTS = OrderedDict() TAKE_DEFAULTS['out'] = None TAKE_DEFAULTS['mode'] = 'raise' diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb2664a5b8d28..228dd2acd2124 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -532,13 +532,27 @@ def pop(self, item): return result - def squeeze(self, **kwargs): - """Squeeze length 1 dimensions.""" - nv.validate_squeeze(tuple(), kwargs) + def squeeze(self, axis=None): + """ + Squeeze length 1 dimensions. + Parameters + ---------- + axis : None, integer or string axis name, optional + The axis to squeeze if 1-sized. + + .. 
versionadded:: 0.20.0 + + Returns + ------- + scalar if 1-sized, else original object + """ + axis = (self._AXIS_NAMES if axis is None else + (self._get_axis_number(axis),)) try: - return self.iloc[tuple([0 if len(a) == 1 else slice(None) - for a in self.axes])] + return self.iloc[ + tuple([0 if i in axis and len(a) == 1 else slice(None) + for i, a in enumerate(self.axes)])] except: return self diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 916d7ae0b0ec4..bb341c26d454e 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1770,6 +1770,20 @@ def test_squeeze(self): [tm.assert_series_equal(empty_series, higher_dim.squeeze()) for higher_dim in [empty_series, empty_frame, empty_panel]] + # axis argument + df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] + tm.assert_equal(df.shape, (1, 1)) + tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis='index'), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) + tm.assert_series_equal(df.squeeze(axis='columns'), df.iloc[:, 0]) + tm.assert_equal(df.squeeze(), df.iloc[0, 0]) + tm.assertRaises(ValueError, df.squeeze, axis=2) + tm.assertRaises(ValueError, df.squeeze, axis='x') + + df = tm.makeTimeDataFrame(3) + tm.assert_frame_equal(df.squeeze(axis=0), df) + def test_numpy_squeeze(self): s = tm.makeFloatSeries() tm.assert_series_equal(np.squeeze(s), s) @@ -1777,10 +1791,6 @@ def test_numpy_squeeze(self): df = tm.makeTimeDataFrame().reindex(columns=['A']) tm.assert_series_equal(np.squeeze(df), df['A']) - msg = "the 'axis' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, - np.squeeze, s, axis=0) - def test_transpose(self): msg = (r"transpose\(\) got multiple values for " r"keyword argument 'axes'") From 0e21767c5a5f8a94c1d08943ad8455573dec7d8b Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Wed, 8 Feb 2017 15:58:25 +0100 Subject: [PATCH 112/338] Small documentation fix for MultiIndex.sortlevel (#15345) * doc fix for return values of MultiIndex.sortlevel * MultiIndex.sortlevel docs improved after feedback --- pandas/indexes/multi.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d2469cf1a3eed..9ab07d87fd13b 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1399,7 +1399,11 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Returns ------- - sorted_index : MultiIndex + sorted_index : pd.MultiIndex + Resulting index + indexer : np.ndarray + Indices of output values in original index + """ from pandas.core.groupby import _indexer_from_factorized From 8ad25d4e03599058f5fe9f3e8838b371481b214f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 8 Feb 2017 12:42:21 -0500 Subject: [PATCH 113/338] CI: use pip install for statsmodels --- ci/requirements-2.7.pip | 1 + ci/requirements-2.7.run | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index d16b932c8be4f..d7266fe88fb32 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -1,3 +1,4 @@ +statsmodels blosc httplib2 google-api-python-client==1.2 diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index b5fc919297c76..62e31e4ae24e3 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -18,6 +18,5 @@ patsy pymysql=0.6.3 html5lib=1.0b2 beautiful-soup=4.2.1 -statsmodels jinja2=2.8 xarray=0.8.0 From 97a02021ed13f08b2f4d62b70696321771a4de32 Mon Sep 17 
00:00:00 2001 From: TrigonaMinima Date: Thu, 9 Feb 2017 03:35:43 +0530 Subject: [PATCH 114/338] TST: Period tests reorg xref #14854 --- pandas/tests/frame/test_period.py | 139 + pandas/tests/indexes/period/test_asfreq.py | 154 + .../tests/indexes/period/test_construction.py | 486 ++ .../indexes/period/test_ops.py} | 798 ++- .../indexes/period/test_partial_slicing.py | 139 + pandas/tests/indexes/period/test_period.py | 583 +- pandas/tests/indexes/period/test_setops.py | 157 + pandas/tests/indexes/period/test_tools.py | 449 ++ pandas/tests/scalar/test_period.py | 2074 +++++++ pandas/tests/series/test_period.py | 248 + pandas/tseries/tests/test_period.py | 5065 ----------------- 11 files changed, 5190 insertions(+), 5102 deletions(-) create mode 100644 pandas/tests/frame/test_period.py create mode 100644 pandas/tests/indexes/period/test_asfreq.py create mode 100644 pandas/tests/indexes/period/test_construction.py rename pandas/{tseries/tests/test_base.py => tests/indexes/period/test_ops.py} (58%) create mode 100644 pandas/tests/indexes/period/test_partial_slicing.py create mode 100644 pandas/tests/indexes/period/test_setops.py create mode 100644 pandas/tests/indexes/period/test_tools.py create mode 100644 pandas/tests/scalar/test_period.py create mode 100644 pandas/tests/series/test_period.py delete mode 100644 pandas/tseries/tests/test_period.py diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py new file mode 100644 index 0000000000000..84d10a2e78d28 --- /dev/null +++ b/pandas/tests/frame/test_period.py @@ -0,0 +1,139 @@ +import numpy as np +from numpy.random import randn +from datetime import timedelta + +import pandas as pd +import pandas.util.testing as tm +from pandas import (PeriodIndex, period_range, DataFrame, date_range, + Index, to_datetime, DatetimeIndex) + + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_as_frame_columns(self): + rng = period_range('1/1/2000', periods=5) + df = DataFrame(randn(10, 5), columns=rng) + + ts = df[rng[0]] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + # GH # 1211 + repr(df) + + ts = df['1/1/2000'] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + def test_frame_setitem(self): + rng = period_range('1/1/2000', periods=5, name='index') + df = DataFrame(randn(5, 3), index=rng) + + df['Index'] = rng + rs = Index(df['Index']) + tm.assert_index_equal(rs, rng, check_names=False) + self.assertEqual(rs.name, 'Index') + self.assertEqual(rng.name, 'index') + + rs = df.reset_index().set_index('index') + tm.assertIsInstance(rs.index, PeriodIndex) + tm.assert_index_equal(rs.index, rng) + + def test_frame_to_time_stamp(self): + K = 5 + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + df = DataFrame(randn(len(index), K), index=index) + df['mix'] = 'a' + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = df.to_timestamp('D', 'end') + tm.assert_index_equal(result.index, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') + result = df.to_timestamp('D', 'start') + tm.assert_index_equal(result.index, exp_index) + + def _get_with_delta(delta, freq='A-DEC'): + return date_range(to_datetime('1/1/2001') + delta, + to_datetime('12/31/2009') + delta, freq=freq) + + delta = timedelta(hours=23) + result = df.to_timestamp('H', 'end') + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.index, exp_index) 
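+        # (editor's note, not part of the original patch: with how='end',
+        #  finer frequencies land closer to the period boundary -- 'H'
+        #  gives 12/31 23:00, 'T' gives 23:59 and 'S' gives 23:59:59,
+        #  matching the timedeltas fed to _get_with_delta below)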
+ + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp('T', 'end') + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.index, exp_index) + + result = df.to_timestamp('S', 'end') + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.index, exp_index) + + # columns + df = df.T + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = df.to_timestamp('D', 'end', axis=1) + tm.assert_index_equal(result.columns, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') + result = df.to_timestamp('D', 'start', axis=1) + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23) + result = df.to_timestamp('H', 'end', axis=1) + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp('T', 'end', axis=1) + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.columns, exp_index) + + result = df.to_timestamp('S', 'end', axis=1) + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.columns, exp_index) + + # invalid axis + tm.assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) + + result1 = df.to_timestamp('5t', axis=1) + result2 = df.to_timestamp('t', axis=1) + expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS') + self.assertTrue(isinstance(result1.columns, DatetimeIndex)) + self.assertTrue(isinstance(result2.columns, DatetimeIndex)) + self.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) + self.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) + # PeriodIndex.to_timestamp always use 'infer' + self.assertEqual(result1.columns.freqstr, 'AS-JAN') + self.assertEqual(result2.columns.freqstr, 'AS-JAN') + + def test_frame_index_to_string(self): + index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + frame = DataFrame(np.random.randn(3, 4), index=index) + + # it works! 
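+        # (editor's note, not part of the original patch: this is a pure
+        #  smoke test -- rendering a PeriodIndex-indexed frame without
+        #  raising is the whole assertion)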
+ frame.to_string() + + def test_align_frame(self): + rng = period_range('1/1/2000', '1/1/2010', freq='A') + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected.values[1::2] = np.nan + tm.assert_frame_equal(result, expected) + + result = ts + _permute(ts[::2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py new file mode 100644 index 0000000000000..96e3d0bbd8abc --- /dev/null +++ b/pandas/tests/indexes/period/test_asfreq.py @@ -0,0 +1,154 @@ +import numpy as np + +import pandas as pd +from pandas.util import testing as tm +from pandas import PeriodIndex, Series, DataFrame + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_asfreq(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') + pi2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') + pi3 = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2001') + pi4 = PeriodIndex(freq='D', start='1/1/2001', end='1/1/2001') + pi5 = PeriodIndex(freq='H', start='1/1/2001', end='1/1/2001 00:00') + pi6 = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 00:00') + pi7 = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') + + self.assertEqual(pi1.asfreq('Q', 'S'), pi2) + self.assertEqual(pi1.asfreq('Q', 's'), pi2) + self.assertEqual(pi1.asfreq('M', 'start'), pi3) + self.assertEqual(pi1.asfreq('D', 'StarT'), pi4) + self.assertEqual(pi1.asfreq('H', 'beGIN'), pi5) + self.assertEqual(pi1.asfreq('Min', 'S'), pi6) + self.assertEqual(pi1.asfreq('S', 'S'), pi7) + + self.assertEqual(pi2.asfreq('A', 'S'), pi1) + self.assertEqual(pi2.asfreq('M', 'S'), pi3) + self.assertEqual(pi2.asfreq('D', 'S'), pi4) + self.assertEqual(pi2.asfreq('H', 'S'), pi5) + self.assertEqual(pi2.asfreq('Min', 'S'), pi6) + self.assertEqual(pi2.asfreq('S', 'S'), pi7) + + self.assertEqual(pi3.asfreq('A', 'S'), pi1) + self.assertEqual(pi3.asfreq('Q', 'S'), pi2) + self.assertEqual(pi3.asfreq('D', 'S'), pi4) + self.assertEqual(pi3.asfreq('H', 'S'), pi5) + self.assertEqual(pi3.asfreq('Min', 'S'), pi6) + self.assertEqual(pi3.asfreq('S', 'S'), pi7) + + self.assertEqual(pi4.asfreq('A', 'S'), pi1) + self.assertEqual(pi4.asfreq('Q', 'S'), pi2) + self.assertEqual(pi4.asfreq('M', 'S'), pi3) + self.assertEqual(pi4.asfreq('H', 'S'), pi5) + self.assertEqual(pi4.asfreq('Min', 'S'), pi6) + self.assertEqual(pi4.asfreq('S', 'S'), pi7) + + self.assertEqual(pi5.asfreq('A', 'S'), pi1) + self.assertEqual(pi5.asfreq('Q', 'S'), pi2) + self.assertEqual(pi5.asfreq('M', 'S'), pi3) + self.assertEqual(pi5.asfreq('D', 'S'), pi4) + self.assertEqual(pi5.asfreq('Min', 'S'), pi6) + self.assertEqual(pi5.asfreq('S', 'S'), pi7) + + self.assertEqual(pi6.asfreq('A', 'S'), pi1) + self.assertEqual(pi6.asfreq('Q', 'S'), pi2) + self.assertEqual(pi6.asfreq('M', 'S'), pi3) + self.assertEqual(pi6.asfreq('D', 'S'), pi4) + self.assertEqual(pi6.asfreq('H', 'S'), pi5) + self.assertEqual(pi6.asfreq('S', 'S'), pi7) + + self.assertEqual(pi7.asfreq('A', 'S'), pi1) + self.assertEqual(pi7.asfreq('Q', 'S'), pi2) + self.assertEqual(pi7.asfreq('M', 'S'), pi3) + self.assertEqual(pi7.asfreq('D', 'S'), pi4) + self.assertEqual(pi7.asfreq('H', 'S'), pi5) + self.assertEqual(pi7.asfreq('Min', 'S'), pi6) + + self.assertRaises(ValueError, pi7.asfreq, 'T', 'foo') + result1 = pi1.asfreq('3M') + result2 = pi1.asfreq('M') + expected = PeriodIndex(freq='M', start='2001-12', end='2001-12') + self.assert_numpy_array_equal(result1.asi8, 
expected.asi8) + self.assertEqual(result1.freqstr, '3M') + self.assert_numpy_array_equal(result2.asi8, expected.asi8) + self.assertEqual(result2.freqstr, 'M') + + def test_asfreq_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M') + result = idx.asfreq(freq='Q') + expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') + tm.assert_index_equal(result, expected) + + def test_asfreq_mult_pi(self): + pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M') + + for freq in ['D', '3D']: + result = pi.asfreq(freq) + exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT', + '2001-04-30'], freq=freq) + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + result = pi.asfreq(freq, how='S') + exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT', + '2001-03-01'], freq=freq) + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + def test_asfreq_combined_pi(self): + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='H') + exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']): + result = pi.asfreq(freq, how=how) + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + for freq in ['1D1H', '1H1D']: + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', + 'NaT'], freq=freq) + result = pi.asfreq('H') + exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'], + freq='H') + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', + 'NaT'], freq=freq) + result = pi.asfreq('H', how='S') + exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='H') + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + def test_asfreq_ts(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/31/2010') + ts = Series(np.random.randn(len(index)), index=index) + df = DataFrame(np.random.randn(len(index), 3), index=index) + + result = ts.asfreq('D', how='end') + df_result = df.asfreq('D', how='end') + exp_index = index.asfreq('D', how='end') + self.assertEqual(len(result), len(ts)) + tm.assert_index_equal(result.index, exp_index) + tm.assert_index_equal(df_result.index, exp_index) + + result = ts.asfreq('D', how='start') + self.assertEqual(len(result), len(ts)) + tm.assert_index_equal(result.index, index.asfreq('D', how='start')) + + def test_astype_asfreq(self): + pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D') + exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + tm.assert_index_equal(pi1.asfreq('M'), exp) + tm.assert_index_equal(pi1.astype('period[M]'), exp) + + exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M') + tm.assert_index_equal(pi1.asfreq('3M'), exp) + tm.assert_index_equal(pi1.astype('period[3M]'), exp) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py new file mode 100644 index 0000000000000..c1299c6abeda3 --- /dev/null +++ b/pandas/tests/indexes/period/test_construction.py @@ -0,0 +1,486 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +import pandas.tseries.period as period +from pandas.compat import lrange, PY3, text_type, lmap +from pandas import (Period, PeriodIndex, period_range, offsets, date_range, + Series, Index) + + +class TestPeriodIndex(tm.TestCase): + + def 
setUp(self): + pass + + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='D')] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.Index(np.array(arr), dtype=object)) + + def test_constructor_use_start_freq(self): + # GH #1118 + p = Period('4/2/2012', freq='B') + index = PeriodIndex(start=p, periods=10) + expected = PeriodIndex(start='4/2/2012', periods=10, freq='B') + tm.assert_index_equal(index, expected) + + def test_constructor_field_arrays(self): + # GH #1264 + + years = np.arange(1990, 2010).repeat(4)[2:-2] + quarters = np.tile(np.arange(1, 5), 20)[2:-2] + + index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') + expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') + tm.assert_index_equal(index, expected) + + index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC') + tm.assert_numpy_array_equal(index.asi8, index2.asi8) + + index = PeriodIndex(year=years, quarter=quarters) + tm.assert_index_equal(index, expected) + + years = [2007, 2007, 2007] + months = [1, 2] + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='M') + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='2M') + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='M', start=Period('2007-01', freq='M')) + + years = [2007, 2007, 2007] + months = [1, 2, 3] + idx = PeriodIndex(year=years, month=months, freq='M') + exp = period_range('2007-01', periods=3, freq='M') + tm.assert_index_equal(idx, exp) + + def test_constructor_U(self): + # U was used as undefined period + self.assertRaises(ValueError, period_range, '2007-1-1', periods=500, + freq='X') + + def test_constructor_nano(self): + idx = period_range(start=Period(ordinal=1, freq='N'), + end=Period(ordinal=4, freq='N'), freq='N') + exp = PeriodIndex([Period(ordinal=1, freq='N'), + Period(ordinal=2, freq='N'), + Period(ordinal=3, freq='N'), + Period(ordinal=4, freq='N')], freq='N') + tm.assert_index_equal(idx, exp) + + def test_constructor_arrays_negative_year(self): + years = np.arange(1960, 2000, dtype=np.int64).repeat(4) + quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + self.assert_numpy_array_equal(pindex.year, years) + self.assert_numpy_array_equal(pindex.quarter, quarters) + + def test_constructor_invalid_quarters(self): + self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), + quarter=lrange(4), freq='Q-DEC') + + def test_constructor_corner(self): + self.assertRaises(ValueError, PeriodIndex, periods=10, freq='A') + + start = Period('2007', freq='A-JUN') + end = Period('2010', freq='A-DEC') + self.assertRaises(ValueError, PeriodIndex, start=start, end=end) + self.assertRaises(ValueError, PeriodIndex, start=start) + self.assertRaises(ValueError, PeriodIndex, end=end) + + result = period_range('2007-01', periods=10.5, freq='M') + exp = period_range('2007-01', periods=10, freq='M') + 
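+        # (editor's note, not part of the original patch: a float
+        #  ``periods`` such as 10.5 is accepted and coerced, e.g.
+        #      period_range('2007-01', periods=10.5, freq='M')
+        #  yields the same 10-element index as periods=10, which the
+        #  assertion below verifies)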
tm.assert_index_equal(result, exp) + + def test_constructor_fromarraylike(self): + idx = period_range('2007-01', periods=20, freq='M') + + # values is an array of Period, thus can retrieve freq + tm.assert_index_equal(PeriodIndex(idx.values), idx) + tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) + + self.assertRaises(ValueError, PeriodIndex, idx._values) + self.assertRaises(ValueError, PeriodIndex, list(idx._values)) + self.assertRaises(ValueError, PeriodIndex, + data=Period('2007', freq='A')) + + result = PeriodIndex(iter(idx)) + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx) + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx, freq='M') + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx, freq=offsets.MonthEnd()) + tm.assert_index_equal(result, idx) + self.assertTrue(result.freq, 'M') + + result = PeriodIndex(idx, freq='2M') + tm.assert_index_equal(result, idx.asfreq('2M')) + self.assertTrue(result.freq, '2M') + + result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) + tm.assert_index_equal(result, idx.asfreq('2M')) + self.assertTrue(result.freq, '2M') + + result = PeriodIndex(idx, freq='D') + exp = idx.asfreq('D', 'e') + tm.assert_index_equal(result, exp) + + def test_constructor_datetime64arr(self): + vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) + vals = vals.view(np.dtype('M8[us]')) + + self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + + def test_constructor_dtype(self): + # passing a dtype with a tz should localize + idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]') + exp = PeriodIndex(['2013-01', '2013-03'], freq='M') + tm.assert_index_equal(idx, exp) + self.assertEqual(idx.dtype, 'period[M]') + + idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]') + exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D') + tm.assert_index_equal(idx, exp) + self.assertEqual(idx.dtype, 'period[3D]') + + # if we already have a freq and its not the same, then asfreq + # (not changed) + idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D') + + res = PeriodIndex(idx, dtype='period[M]') + exp = PeriodIndex(['2013-01', '2013-01'], freq='M') + tm.assert_index_equal(res, exp) + self.assertEqual(res.dtype, 'period[M]') + + res = PeriodIndex(idx, freq='M') + tm.assert_index_equal(res, exp) + self.assertEqual(res.dtype, 'period[M]') + + msg = 'specified freq and dtype are different' + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(['2011-01'], freq='M', dtype='period[D]') + + def test_constructor_empty(self): + idx = pd.PeriodIndex([], freq='M') + tm.assertIsInstance(idx, PeriodIndex) + self.assertEqual(len(idx), 0) + self.assertEqual(idx.freq, 'M') + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + pd.PeriodIndex([]) + + def test_constructor_pi_nat(self): + idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='M')]) + exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='M')])) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='M')]) + exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(np.array([pd.NaT, pd.NaT, + Period('2011-01', freq='M'), + Period('2011-01', freq='M')])) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, 
'2011-01', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex([pd.NaT, pd.NaT]) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(np.array([pd.NaT, pd.NaT])) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(['NaT', 'NaT']) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(np.array(['NaT', 'NaT'])) + + def test_constructor_incompat_freq(self): + msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='D')]) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='D')])) + + # first element is pd.NaT + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex([pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='D')]) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='D')])) + + def test_constructor_mixed(self): + idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')]) + exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')]) + exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT, + '2012-01-01']) + exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D') + tm.assert_index_equal(idx, exp) + + def test_constructor_simple_new(self): + idx = period_range('2007-01', name='p', periods=2, freq='M') + result = idx._simple_new(idx, 'p', freq=idx.freq) + tm.assert_index_equal(result, idx) + + result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) + tm.assert_index_equal(result, idx) + + result = idx._simple_new([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')], + 'p', freq=idx.freq) + self.assert_index_equal(result, idx) + + result = idx._simple_new(np.array([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')]), + 'p', freq=idx.freq) + self.assert_index_equal(result, idx) + + def test_constructor_simple_new_empty(self): + # GH13079 + idx = PeriodIndex([], freq='M', name='p') + result = idx._simple_new(idx, name='p', freq='M') + tm.assert_index_equal(result, idx) + + def test_constructor_simple_new_floats(self): + # GH13079 + for floats in [[1.1], np.array([1.1])]: + with self.assertRaises(TypeError): + pd.PeriodIndex._simple_new(floats, freq='M') + + def test_constructor_nat(self): + self.assertRaises(ValueError, period_range, start='NaT', + end='2011-01-01', freq='M') + self.assertRaises(ValueError, period_range, start='2011-01-01', + end='NaT', freq='M') + + def test_constructor_year_and_quarter(self): + year = pd.Series([2001, 2002, 2003]) + quarter = year - 2000 + idx = PeriodIndex(year=year, quarter=quarter) + strs = ['%dQ%d' % t for t in zip(quarter, year)] + lops = list(map(Period, strs)) + p = PeriodIndex(lops) + tm.assert_index_equal(p, idx) + + def test_constructor_freq_mult(self): + # GH #7811 + for func in [PeriodIndex, period_range]: + # must be the same, but for sure... 
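+            # (editor's note, not part of the original patch: a '2M' freq
+            #  steps two months at a time, hence 2014-01, 2014-03,
+            #  2014-05, 2014-07 in the expected index below)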
+ pidx = func(start='2014-01', freq='2M', periods=4) + expected = PeriodIndex(['2014-01', '2014-03', + '2014-05', '2014-07'], freq='2M') + tm.assert_index_equal(pidx, expected) + + pidx = func(start='2014-01-02', end='2014-01-15', freq='3D') + expected = PeriodIndex(['2014-01-02', '2014-01-05', + '2014-01-08', '2014-01-11', + '2014-01-14'], freq='3D') + tm.assert_index_equal(pidx, expected) + + pidx = func(end='2014-01-01 17:00', freq='4H', periods=3) + expected = PeriodIndex(['2014-01-01 09:00', '2014-01-01 13:00', + '2014-01-01 17:00'], freq='4H') + tm.assert_index_equal(pidx, expected) + + msg = ('Frequency must be positive, because it' + ' represents span: -1M') + with tm.assertRaisesRegexp(ValueError, msg): + PeriodIndex(['2011-01'], freq='-1M') + + msg = ('Frequency must be positive, because it' ' represents span: 0M') + with tm.assertRaisesRegexp(ValueError, msg): + PeriodIndex(['2011-01'], freq='0M') + + msg = ('Frequency must be positive, because it' ' represents span: 0M') + with tm.assertRaisesRegexp(ValueError, msg): + period_range('2011-01', periods=3, freq='0M') + + def test_constructor_freq_mult_dti_compat(self): + import itertools + mults = [1, 2, 3, 4, 5] + freqs = ['A', 'M', 'D', 'T', 'S'] + for mult, freq in itertools.product(mults, freqs): + freqstr = str(mult) + freq + pidx = PeriodIndex(start='2014-04-01', freq=freqstr, periods=10) + expected = date_range(start='2014-04-01', freq=freqstr, + periods=10).to_period(freqstr) + tm.assert_index_equal(pidx, expected) + + def test_constructor_freq_combined(self): + for freq in ['1D1H', '1H1D']: + pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq) + expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'], + freq='25H') + for freq, func in zip(['1D1H', '1H1D'], [PeriodIndex, period_range]): + pidx = func(start='2016-01-01', periods=2, freq=freq) + expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'], + freq='25H') + tm.assert_index_equal(pidx, expected) + + def test_constructor(self): + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + self.assertEqual(len(pi), 9) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') + self.assertEqual(len(pi), 4 * 9) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + self.assertEqual(len(pi), 12 * 9) + + pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') + self.assertEqual(len(pi), 365 * 9 + 2) + + pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') + self.assertEqual(len(pi), 261 * 9) + + pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') + self.assertEqual(len(pi), 365 * 24) + + pi = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') + self.assertEqual(len(pi), 24 * 60) + + pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') + self.assertEqual(len(pi), 24 * 60 * 60) + + start = Period('02-Apr-2005', 'B') + i1 = PeriodIndex(start=start, periods=20) + self.assertEqual(len(i1), 20) + self.assertEqual(i1.freq, start.freq) + self.assertEqual(i1[0], start) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + self.assertEqual(len(i1), 10) + self.assertEqual(i1.freq, end_intv.freq) + self.assertEqual(i1[-1], end_intv) + + end_intv = Period('2006-12-31', '1w') + i2 = PeriodIndex(end=end_intv, periods=10) + self.assertEqual(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + self.assertEqual(i1.freq, i2.freq) + + end_intv = Period('2006-12-31', ('w', 1)) + i2 = PeriodIndex(end=end_intv, periods=10) + 
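+        # (editor's note, not part of the original patch: the ('w', 1)
+        #  tuple is an alternate spelling of the '1w' freq used just
+        #  above, so i2 must again match i1 element for element)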
self.assertEqual(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + self.assertEqual(i1.freq, i2.freq) + + try: + PeriodIndex(start=start, end=end_intv) + raise AssertionError('Cannot allow mixed freq for start and end') + except ValueError: + pass + + end_intv = Period('2005-05-01', 'B') + i1 = PeriodIndex(start=start, end=end_intv) + + try: + PeriodIndex(start=start) + raise AssertionError( + 'Must specify periods if missing start or end') + except ValueError: + pass + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) + + i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) + + # Mixed freq should fail + vals = [end_intv, Period('2006-12-31', 'w')] + self.assertRaises(ValueError, PeriodIndex, vals) + vals = np.array(vals) + self.assertRaises(ValueError, PeriodIndex, vals) + + def test_recreate_from_data(self): + for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']: + org = PeriodIndex(start='2001/04/01', freq=o, periods=1) + idx = PeriodIndex(org.values, freq=o) + tm.assert_index_equal(idx, org) + + def test_map_with_string_constructor(self): + raw = [2005, 2007, 2009] + index = PeriodIndex(raw, freq='A') + types = str, + + if PY3: + # unicode + types += text_type, + + for t in types: + expected = Index(lmap(t, raw)) + res = index.map(t) + + # should return an Index + tm.assertIsInstance(res, Index) + + # preserve element types + self.assertTrue(all(isinstance(resi, t) for resi in res)) + + # lastly, values should compare equal + tm.assert_index_equal(res, expected) + + +class TestSeriesPeriod(tm.TestCase): + + def setUp(self): + self.series = Series(period_range('2000-01-01', periods=10, freq='D')) + + def test_constructor_cant_cast_period(self): + with tm.assertRaises(TypeError): + Series(period_range('2000-01-01', periods=10, freq='D'), + dtype=float) + + def test_constructor_cast_object(self): + s = Series(period_range('1/1/2000', periods=10), dtype=object) + exp = Series(period_range('1/1/2000', periods=10)) + tm.assert_series_equal(s, exp) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tests/indexes/period/test_ops.py similarity index 58% rename from pandas/tseries/tests/test_base.py rename to pandas/tests/indexes/period/test_ops.py index 114cb02205d4f..70759e8659c25 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,13 +1,14 @@ -from __future__ import print_function -from datetime import timedelta import numpy as np +from datetime import timedelta, datetime + import pandas as pd -from pandas import (Series, Index, Period, DatetimeIndex, PeriodIndex, - Timedelta, _np_version_under1p10) import pandas.tslib as tslib -import pandas.tseries.period as period - import pandas.util.testing as tm +import pandas.tseries.period as period +from pandas.compat import lrange +from pandas import (DatetimeIndex, PeriodIndex, period_range, Series, Period, + _np_version_under1p10, Index, Timedelta, offsets, + _np_version_under1p9) from pandas.tests.test_base import Ops @@ -473,6 +474,13 @@ def test_difference(self): result_union = rng.difference(other) tm.assert_index_equal(result_union, expected) + def test_sub(self): + rng = period_range('2007-01', periods=50) + + result = rng - 5 + exp = rng + (-5) + tm.assert_index_equal(result, exp) + def test_sub_isub(self): # previously performed setop, now raises TypeError (GH14164) @@ -1020,3 
+1028,781 @@ def test_equals(self): self.assertFalse(idx.asobject.equals(idx3)) self.assertFalse(idx.equals(list(idx3))) self.assertFalse(idx.equals(pd.Series(idx3))) + + +class TestPeriodIndexSeriesMethods(tm.TestCase): + """ Test PeriodIndex and Period Series Ops consistency """ + + def _check(self, values, func, expected): + idx = pd.PeriodIndex(values) + result = func(idx) + if isinstance(expected, pd.Index): + tm.assert_index_equal(result, expected) + else: + # comp op results in bool + tm.assert_numpy_array_equal(result, expected) + + s = pd.Series(values) + result = func(s) + + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) + + def test_pi_ops(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + expected = PeriodIndex(['2011-03', '2011-04', + '2011-05', '2011-06'], freq='M', name='idx') + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) + result = idx - Period('2011-01', freq='M') + exp = pd.Index([0, 1, 2, 3], name='idx') + tm.assert_index_equal(result, exp) + + result = Period('2011-01', freq='M') - idx + exp = pd.Index([0, -1, -2, -3], name='idx') + tm.assert_index_equal(result, exp) + + def test_pi_ops_errors(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + s = pd.Series(idx) + + msg = r"unsupported operand type\(s\)" + + for obj in [idx, s]: + for ng in ["str", 1.5]: + with tm.assertRaisesRegexp(TypeError, msg): + obj + ng + + with tm.assertRaises(TypeError): + # error message differs between PY2 and 3 + ng + obj + + with tm.assertRaisesRegexp(TypeError, msg): + obj - ng + + with tm.assertRaises(TypeError): + np.add(obj, ng) + + if _np_version_under1p10: + self.assertIs(np.add(ng, obj), NotImplemented) + else: + with tm.assertRaises(TypeError): + np.add(ng, obj) + + with tm.assertRaises(TypeError): + np.subtract(obj, ng) + + if _np_version_under1p10: + self.assertIs(np.subtract(ng, obj), NotImplemented) + else: + with tm.assertRaises(TypeError): + np.subtract(ng, obj) + + def test_pi_ops_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + expected = PeriodIndex(['2011-03', '2011-04', + 'NaT', '2011-06'], freq='M', name='idx') + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) + + self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + + # freq with mult + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='2M', name='idx') + expected = PeriodIndex(['2011-07', '2011-08', + 'NaT', '2011-10'], freq='2M', name='idx') + self._check(idx, lambda x: x + 3, expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) + + def test_pi_ops_array_int(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + f = lambda x: x + np.array([1, 2, 3, 4]) + exp = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: np.add(x, np.array([4, -1, 1, 2])) + exp = PeriodIndex(['2011-05', '2011-01', 'NaT', + '2011-06'], freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: x - np.array([1, 2, 3, 4]) + exp = 
PeriodIndex(['2010-12', '2010-12', 'NaT', + '2010-12'], freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) + exp = PeriodIndex(['2010-10', '2010-12', 'NaT', + '2011-06'], freq='M', name='idx') + self._check(idx, f, exp) + + def test_pi_ops_offset(self): + idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', + '2011-04-01'], freq='D', name='idx') + f = lambda x: x + offsets.Day() + exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02', + '2011-04-02'], freq='D', name='idx') + self._check(idx, f, exp) + + f = lambda x: x + offsets.Day(2) + exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03', + '2011-04-03'], freq='D', name='idx') + self._check(idx, f, exp) + + f = lambda x: x - offsets.Day(2) + exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27', + '2011-03-30'], freq='D', name='idx') + self._check(idx, f, exp) + + def test_pi_offset_errors(self): + idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', + '2011-04-01'], freq='D', name='idx') + s = pd.Series(idx) + + # Series op is applied per Period instance, thus error is raised + # from Period + msg_idx = r"Input has different freq from PeriodIndex\(freq=D\)" + msg_s = r"Input cannot be converted to Period\(freq=D\)" + for obj, msg in [(idx, msg_idx), (s, msg_s)]: + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + obj + offsets.Hour(2) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + offsets.Hour(2) + obj + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + obj - offsets.Hour(2) + + def test_pi_sub_period(self): + # GH 13071 + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + result = idx - pd.Period('2012-01', freq='M') + exp = pd.Index([-12, -11, -10, -9], name='idx') + tm.assert_index_equal(result, exp) + + result = np.subtract(idx, pd.Period('2012-01', freq='M')) + tm.assert_index_equal(result, exp) + + result = pd.Period('2012-01', freq='M') - idx + exp = pd.Index([12, 11, 10, 9], name='idx') + tm.assert_index_equal(result, exp) + + result = np.subtract(pd.Period('2012-01', freq='M'), idx) + if _np_version_under1p10: + self.assertIs(result, NotImplemented) + else: + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') + tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) + tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + + def test_pi_sub_pdnat(self): + # GH 13071 + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx') + tm.assert_index_equal(pd.NaT - idx, exp) + tm.assert_index_equal(idx - pd.NaT, exp) + + def test_pi_sub_period_nat(self): + # GH 13071 + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M', name='idx') + + result = idx - pd.Period('2012-01', freq='M') + exp = pd.Index([-12, np.nan, -10, -9], name='idx') + tm.assert_index_equal(result, exp) + + result = pd.Period('2012-01', freq='M') - idx + exp = pd.Index([12, np.nan, 10, 9], name='idx') + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') + tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) + tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + + def test_pi_comp_period(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = 
np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > pd.Period('2011-03', freq='M') + exp = np.array([False, False, False, True], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + def test_pi_comp_period_nat(self): + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x == tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: x != tslib.NaT + exp = np.array([True, True, True, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x < pd.Period('2011-03', freq='M') + exp = np.array([True, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: tslib.NaT >= x + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + +class TestSeriesPeriod(tm.TestCase): + + def setUp(self): + self.series = Series(period_range('2000-01-01', periods=10, freq='D')) + + def test_ops_series_timedelta(self): + # GH 13043 + s = pd.Series([pd.Period('2015-01-01', freq='D'), + pd.Period('2015-01-02', freq='D')], name='xxx') + self.assertEqual(s.dtype, object) + + exp = pd.Series([pd.Period('2015-01-02', freq='D'), + pd.Period('2015-01-03', freq='D')], name='xxx') + tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) + tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) + + tm.assert_series_equal(s + pd.tseries.offsets.Day(), exp) + tm.assert_series_equal(pd.tseries.offsets.Day() + s, exp) + + def test_ops_series_period(self): + # GH 13043 + s = pd.Series([pd.Period('2015-01-01', freq='D'), + pd.Period('2015-01-02', freq='D')], name='xxx') + self.assertEqual(s.dtype, object) + + p = pd.Period('2015-01-10', freq='D') + # dtype will be object because of original dtype + exp = pd.Series([9, 8], name='xxx', dtype=object) + tm.assert_series_equal(p - s, exp) + tm.assert_series_equal(s - p, -exp) + + s2 = pd.Series([pd.Period('2015-01-05', freq='D'), + pd.Period('2015-01-04', freq='D')], name='xxx') + self.assertEqual(s2.dtype, object) + + exp = pd.Series([4, 2], name='xxx', dtype=object) + tm.assert_series_equal(s2 - s, 
exp) + tm.assert_series_equal(s - s2, -exp) + + def test_ops_frame_period(self): + # GH 13043 + df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), + pd.Period('2015-02', freq='M')], + 'B': [pd.Period('2014-01', freq='M'), + pd.Period('2014-02', freq='M')]}) + self.assertEqual(df['A'].dtype, object) + self.assertEqual(df['B'].dtype, object) + + p = pd.Period('2015-03', freq='M') + # dtype will be object because of original dtype + exp = pd.DataFrame({'A': np.array([2, 1], dtype=object), + 'B': np.array([14, 13], dtype=object)}) + tm.assert_frame_equal(p - df, exp) + tm.assert_frame_equal(df - p, -exp) + + df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), + pd.Period('2015-06', freq='M')], + 'B': [pd.Period('2015-05', freq='M'), + pd.Period('2015-06', freq='M')]}) + self.assertEqual(df2['A'].dtype, object) + self.assertEqual(df2['B'].dtype, object) + + exp = pd.DataFrame({'A': np.array([4, 4], dtype=object), + 'B': np.array([16, 16], dtype=object)}) + tm.assert_frame_equal(df2 - df, exp) + tm.assert_frame_equal(df - df2, -exp) + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_getitem_index(self): + idx = period_range('2007-01', periods=10, freq='M', name='x') + + result = idx[[1, 3, 5]] + exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], + freq='M', name='x') + tm.assert_index_equal(result, exp) + + result = idx[[True, True, False, False, False, + True, True, False, False, False]] + exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], + freq='M', name='x') + tm.assert_index_equal(result, exp) + + def test_getitem_partial(self): + rng = period_range('2007-01', periods=50, freq='M') + ts = Series(np.random.randn(len(rng)), rng) + + self.assertRaises(KeyError, ts.__getitem__, '2006') + + result = ts['2008'] + self.assertTrue((result.index.year == 2008).all()) + + result = ts['2008':'2009'] + self.assertEqual(len(result), 24) + + result = ts['2008-1':'2009-12'] + self.assertEqual(len(result), 24) + + result = ts['2008Q1':'2009Q4'] + self.assertEqual(len(result), 24) + + result = ts[:'2009'] + self.assertEqual(len(result), 36) + + result = ts['2009':] + self.assertEqual(len(result), 50 - 24) + + exp = result + result = ts[24:] + tm.assert_series_equal(exp, result) + + ts = ts[10:].append(ts[10:]) + self.assertRaisesRegexp(KeyError, + "left slice bound for non-unique " + "label: '2008'", + ts.__getitem__, slice('2008', '2009')) + + def test_getitem_datetime(self): + rng = period_range(start='2012-01-01', periods=10, freq='W-MON') + ts = Series(lrange(len(rng)), index=rng) + + dt1 = datetime(2011, 10, 2) + dt4 = datetime(2012, 4, 20) + + rs = ts[dt1:dt4] + tm.assert_series_equal(rs, ts) + + def test_getitem_nat(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) + self.assertIs(idx[1], tslib.NaT) + + s = pd.Series([0, 1, 2], index=idx) + self.assertEqual(s[pd.NaT], 1) + + s = pd.Series(idx, index=idx) + self.assertEqual(s[pd.Period('2011-01', freq='M')], + pd.Period('2011-01', freq='M')) + self.assertIs(s[pd.NaT], tslib.NaT) + + def test_getitem_list_periods(self): + # GH 7710 + rng = period_range(start='2012-01-01', periods=10, freq='D') + ts = Series(lrange(len(rng)), index=rng) + exp = ts.iloc[[1]] + tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) + + def test_getitem_seconds(self): + # GH 6716 + didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S', + periods=4000) + pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', 
periods=4000) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', + '2013/02/01 09:00'] + for v in values: + if _np_version_under1p9: + with tm.assertRaises(ValueError): + idx[v] + else: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with tm.assertRaises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) + tm.assert_series_equal(s['2013/01/01 9H'], s[:3600]) + for d in ['2013/01/01', '2013/01', '2013']: + tm.assert_series_equal(s[d], s) + + def test_getitem_day(self): + # GH 6716 + # Confirm DatetimeIndex and PeriodIndex works identically + didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) + pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', + '2013/02/01 09:00'] + for v in values: + + if _np_version_under1p9: + with tm.assertRaises(ValueError): + idx[v] + else: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with tm.assertRaises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + tm.assert_series_equal(s['2013/01'], s[0:31]) + tm.assert_series_equal(s['2013/02'], s[31:59]) + tm.assert_series_equal(s['2014'], s[365:]) + + invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + for v in invalid: + with tm.assertRaises(KeyError): + s[v] + + def test_take(self): + index = PeriodIndex(start='1/1/10', end='12/31/12', freq='D', + name='idx') + expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7), + datetime(2010, 1, 9), datetime(2010, 1, 13)], + freq='D', name='idx') + + taken1 = index.take([5, 6, 8, 12]) + taken2 = index[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + tm.assertIsInstance(taken, PeriodIndex) + self.assertEqual(taken.freq, index.freq) + self.assertEqual(taken.name, expected.name) + + def test_take_fill_value(self): + # GH 12631 + idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + name='xxx', freq='D') + result = idx.take(np.array([1, 0, -1])) + expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', freq='D') + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'], + name='xxx', freq='D') + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', freq='D') + tm.assert_index_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + idx.take(np.array([1, -5])) + + def test_get_loc_msg(self): + idx = period_range('2000-1-1', freq='A', periods=10) + bad_period = Period('2012', 'A') + self.assertRaises(KeyError, idx.get_loc, bad_period) + + try: + idx.get_loc(bad_period) + except KeyError as inst: + 
self.assertEqual(inst.args[0], bad_period) + + def test_get_loc_nat(self): + didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) + pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + + # check DatetimeIndex compat + for idx in [didx, pidx]: + self.assertEqual(idx.get_loc(pd.NaT), 1) + self.assertEqual(idx.get_loc(None), 1) + self.assertEqual(idx.get_loc(float('nan')), 1) + self.assertEqual(idx.get_loc(np.nan), 1) + + +class TestComparisons(tm.TestCase): + + def setUp(self): + self.january1 = Period('2000-01', 'M') + self.january2 = Period('2000-01', 'M') + self.february = Period('2000-02', 'M') + self.march = Period('2000-03', 'M') + self.day = Period('2012-01-01', 'D') + + def test_equal(self): + self.assertEqual(self.january1, self.january2) + + def test_equal_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 == self.day + + def test_notEqual(self): + self.assertNotEqual(self.january1, 1) + self.assertNotEqual(self.january1, self.february) + + def test_greater(self): + self.assertTrue(self.february > self.january1) + + def test_greater_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 > self.day + + def test_greater_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 > 1 + + def test_greaterEqual(self): + self.assertTrue(self.january1 >= self.january2) + + def test_greaterEqual_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 >= self.day + + with tm.assertRaises(TypeError): + print(self.january1 >= 1) + + def test_smallerEqual(self): + self.assertTrue(self.january1 <= self.january2) + + def test_smallerEqual_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 <= self.day + + def test_smallerEqual_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 <= 1 + + def test_smaller(self): + self.assertTrue(self.january1 < self.february) + + def test_smaller_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 < self.day + + def test_smaller_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 < 1 + + def test_sort(self): + periods = [self.march, self.january1, self.february] + correctPeriods = [self.january1, self.february, self.march] + self.assertEqual(sorted(periods), correctPeriods) + + def test_period_nat_comp(self): + p_nat = Period('NaT', freq='D') + p = Period('2011-01-01', freq='D') + + nat = pd.Timestamp('NaT') + t = pd.Timestamp('2011-01-01') + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t), + (t, nat), (nat, nat)]: + self.assertEqual(left < right, False) + self.assertEqual(left > right, False) + self.assertEqual(left == right, False) + self.assertEqual(left != right, True) + self.assertEqual(left <= right, False) + self.assertEqual(left >= right, False) + + def test_pi_pi_comp(self): + + for freq in ['M', '2M', '3M']: + base = PeriodIndex(['2011-01', '2011-02', + '2011-03', '2011-04'], freq=freq) + p = Period('2011-02', freq=freq) + + exp = np.array([False, True, False, False]) + self.assert_numpy_array_equal(base == p, exp) + self.assert_numpy_array_equal(p == base, exp) + + exp = np.array([True, False, True, True]) + self.assert_numpy_array_equal(base != p, exp) + self.assert_numpy_array_equal(p != base, exp) + + exp = np.array([False, False, True, True]) + self.assert_numpy_array_equal(base > p, exp) + 
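+            # (editor's note, not part of the original patch: every
+            #  comparison is exercised in both orientations -- swapping
+            #  the operands mirrors the operator, so ``p < base`` must
+            #  equal ``base > p`` elementwise)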
self.assert_numpy_array_equal(p < base, exp) + + exp = np.array([True, False, False, False]) + self.assert_numpy_array_equal(base < p, exp) + self.assert_numpy_array_equal(p > base, exp) + + exp = np.array([False, True, True, True]) + self.assert_numpy_array_equal(base >= p, exp) + self.assert_numpy_array_equal(p <= base, exp) + + exp = np.array([True, True, False, False]) + self.assert_numpy_array_equal(base <= p, exp) + self.assert_numpy_array_equal(p >= base, exp) + + idx = PeriodIndex(['2011-02', '2011-01', '2011-03', + '2011-05'], freq=freq) + + exp = np.array([False, False, True, False]) + self.assert_numpy_array_equal(base == idx, exp) + + exp = np.array([True, True, False, True]) + self.assert_numpy_array_equal(base != idx, exp) + + exp = np.array([False, True, False, False]) + self.assert_numpy_array_equal(base > idx, exp) + + exp = np.array([True, False, False, True]) + self.assert_numpy_array_equal(base < idx, exp) + + exp = np.array([False, True, True, False]) + self.assert_numpy_array_equal(base >= idx, exp) + + exp = np.array([True, False, True, True]) + self.assert_numpy_array_equal(base <= idx, exp) + + # different base freq + msg = "Input has different freq=A-DEC from PeriodIndex" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='A') + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') + base <= idx + + # different mult + msg = "Input has different freq=4M from PeriodIndex" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='4M') + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + Period('2011', freq='4M') >= base + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') + base <= idx + + def test_pi_nat_comp(self): + for freq in ['M', '2M', '3M']: + idx1 = PeriodIndex( + ['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + + result = idx1 > Period('2011-02', freq=freq) + exp = np.array([False, False, False, True]) + self.assert_numpy_array_equal(result, exp) + result = Period('2011-02', freq=freq) < idx1 + self.assert_numpy_array_equal(result, exp) + + result = idx1 == Period('NaT', freq=freq) + exp = np.array([False, False, False, False]) + self.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) == idx1 + self.assert_numpy_array_equal(result, exp) + + result = idx1 != Period('NaT', freq=freq) + exp = np.array([True, True, True, True]) + self.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) != idx1 + self.assert_numpy_array_equal(result, exp) + + idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', + 'NaT'], freq=freq) + result = idx1 < idx2 + exp = np.array([True, False, False, False]) + self.assert_numpy_array_equal(result, exp) + + result = idx1 == idx2 + exp = np.array([False, False, False, False]) + self.assert_numpy_array_equal(result, exp) + + result = idx1 != idx2 + exp = np.array([True, True, True, True]) + self.assert_numpy_array_equal(result, exp) + + result = idx1 == idx1 + exp = np.array([True, True, False, True]) + self.assert_numpy_array_equal(result, exp) + + result = idx1 != idx1 + exp = np.array([False, False, True, False]) + self.assert_numpy_array_equal(result, exp) + + diff = PeriodIndex(['2011-02', '2011-01', '2011-04', + 'NaT'], freq='4M') + msg 
= "Input has different freq=4M from PeriodIndex" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + idx1 > diff + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + idx1 == diff diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py new file mode 100644 index 0000000000000..b051c4a0dcab1 --- /dev/null +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -0,0 +1,139 @@ +import numpy as np + +import pandas as pd +from pandas.util import testing as tm +from pandas import (Series, period_range, DatetimeIndex, PeriodIndex, + DataFrame, _np_version_under1p12, Period) + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), + period_range('2014-01', periods=20, freq='M')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) + assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[:Period('2014-10'):-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:'2014-10':-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC['2015-02':'2014-10':-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period('2015-02'):Period('2014-10'):-1], + SLC[13:8:-1]) + assert_slices_equivalent(SLC['2015-02':Period('2014-10'):-1], + SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period('2015-02'):'2014-10':-1], + SLC[13:8:-1]) + + assert_slices_equivalent(SLC['2014-10':'2015-02':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), + period_range('2014-01', periods=20, freq='M')) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + + def test_slice_keep_name(self): + idx = period_range('20010101', periods=10, freq='D', name='bob') + self.assertEqual(idx.name, idx[1:].name) + + def test_pindex_slice_index(self): + pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='M') + s = Series(np.random.rand(len(pi)), index=pi) + res = s['2010'] + exp = s[0:12] + tm.assert_series_equal(res, exp) + res = s['2011'] + exp = s[12:24] + tm.assert_series_equal(res, exp) + + def test_range_slice_day(self): + # GH 6716 + didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) + pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) + + # changed to TypeError in 1.12 + # https://github.com/numpy/numpy/pull/6271 + exc = IndexError if _np_version_under1p12 else TypeError + + for idx in [didx, pidx]: + # slices against index should raise IndexError + values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', + '2013/02/01 09:00'] + for v in values: + with tm.assertRaises(exc): + idx[v:] + + s = Series(np.random.rand(len(idx)), index=idx) + + tm.assert_series_equal(s['2013/01/02':], s[1:]) + tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5]) + tm.assert_series_equal(s['2013/02':], s[31:]) + tm.assert_series_equal(s['2014':], s[365:]) + + invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + for v in invalid: + with tm.assertRaises(exc): + idx[v:] + + def test_range_slice_seconds(self): + # GH 6716 + didx = 
DatetimeIndex(start='2013/01/01 09:00:00', freq='S', + periods=4000) + pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) + + # changed to TypeError in 1.12 + # https://github.com/numpy/numpy/pull/6271 + exc = IndexError if _np_version_under1p12 else TypeError + + for idx in [didx, pidx]: + # slices against index should raise IndexError + values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', + '2013/02/01 09:00'] + for v in values: + with tm.assertRaises(exc): + idx[v:] + + s = Series(np.random.rand(len(idx)), index=idx) + + tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'], + s[300:660]) + tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'], + s[3600:3960]) + tm.assert_series_equal(s['2013/01/01 10H':], s[3600:]) + tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860]) + for d in ['2013/01/01', '2013/01', '2013']: + tm.assert_series_equal(s[d:], s) + + def test_range_slice_outofbounds(self): + # GH 5407 + didx = DatetimeIndex(start='2013/10/01', freq='D', periods=10) + pidx = PeriodIndex(start='2013/10/01', freq='D', periods=10) + + for idx in [didx, pidx]: + df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) + empty = DataFrame(index=idx.__class__([], freq='D'), + columns=['units']) + empty['units'] = empty['units'].astype('int64') + + tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) + tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2]) + tm.assert_frame_equal(df['2013/10/01':'2013/10/02'], df.iloc[:2]) + tm.assert_frame_equal(df['2013/10/02':'2013/09/30'], empty) + tm.assert_frame_equal(df['2013/10/15':'2013/10/17'], empty) + tm.assert_frame_equal(df['2013-06':'2013-09'], empty) + tm.assert_frame_equal(df['2013-11':'2013-12'], empty) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 33653c92da719..6a8128bb8985f 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -1,10 +1,12 @@ import numpy as np +from numpy.random import randn from datetime import timedelta import pandas as pd from pandas.util import testing as tm from pandas import (PeriodIndex, period_range, notnull, DatetimeIndex, NaT, - Index, Period, Int64Index) + Index, Period, Int64Index, Series, DataFrame, date_range, + offsets) from ..datetimelike import DatetimeLike @@ -20,26 +22,6 @@ def setUp(self): def create_index(self): return period_range('20130101', periods=5, freq='D') - def test_construction_base_constructor(self): - # GH 13664 - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='M')] - tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) - - arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] - tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) - - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='D')] - tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) - - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.Index(np.array(arr), dtype=object)) - def test_astype(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -68,16 +50,6 @@ def test_astype_raises(self): self.assertRaises(ValueError, idx.astype, 'timedelta64') self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') - def test_shift(self): - - # test shift for 
PeriodIndex - # GH8083 - drange = self.create_index() - result = drange.shift(1) - expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', '2013-01-06'], freq='D') - self.assert_index_equal(result, expected) - def test_pickle_compat_construction(self): pass @@ -231,3 +203,552 @@ def test_difference_freq(self): expected = PeriodIndex(["20160920", "20160921"], freq='D') tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal('freq', idx_diff, expected) + + def test_hash_error(self): + index = period_range('20010101', periods=10) + with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % + type(index).__name__): + hash(index) + + def test_make_time_series(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + series = Series(1, index=index) + tm.assertIsInstance(series, Series) + + def test_shallow_copy_empty(self): + + # GH13067 + idx = PeriodIndex([], freq='M') + result = idx._shallow_copy() + expected = idx + + tm.assert_index_equal(result, expected) + + def test_dtype_str(self): + pi = pd.PeriodIndex([], freq='M') + self.assertEqual(pi.dtype_str, 'period[M]') + self.assertEqual(pi.dtype_str, str(pi.dtype)) + + pi = pd.PeriodIndex([], freq='3M') + self.assertEqual(pi.dtype_str, 'period[3M]') + self.assertEqual(pi.dtype_str, str(pi.dtype)) + + def test_view_asi8(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + def test_values(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx._values, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx._values, exp) + + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx._values, exp) + + def test_period_index_length(self): + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + self.assertEqual(len(pi), 9) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') + self.assertEqual(len(pi), 4 * 9) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + self.assertEqual(len(pi), 12 * 9) + + start = Period('02-Apr-2005', 'B') + i1 = PeriodIndex(start=start, periods=20) + self.assertEqual(len(i1), 20) + self.assertEqual(i1.freq, start.freq) + self.assertEqual(i1[0], start) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + 
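# built from end= and periods=, the index extends backwards, so its last element equals end_intv +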
self.assertEqual(len(i1), 10) + self.assertEqual(i1.freq, end_intv.freq) + self.assertEqual(i1[-1], end_intv) + + end_intv = Period('2006-12-31', '1w') + i2 = PeriodIndex(end=end_intv, periods=10) + self.assertEqual(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + self.assertEqual(i1.freq, i2.freq) + + end_intv = Period('2006-12-31', ('w', 1)) + i2 = PeriodIndex(end=end_intv, periods=10) + self.assertEqual(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + self.assertEqual(i1.freq, i2.freq) + + try: + PeriodIndex(start=start, end=end_intv) + raise AssertionError('Cannot allow mixed freq for start and end') + except ValueError: + pass + + end_intv = Period('2005-05-01', 'B') + i1 = PeriodIndex(start=start, end=end_intv) + + try: + PeriodIndex(start=start) + raise AssertionError( + 'Must specify periods if missing start or end') + except ValueError: + pass + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) + + i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) + + # Mixed freq should fail + vals = [end_intv, Period('2006-12-31', 'w')] + self.assertRaises(ValueError, PeriodIndex, vals) + vals = np.array(vals) + self.assertRaises(ValueError, PeriodIndex, vals) + + def test_fields(self): + # year, month, day, hour, minute + # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter + # qyear + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2005') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2002') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2002') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='D', start='12/1/2001', end='6/1/2001') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='B', start='12/1/2001', end='6/1/2001') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='H', start='12/31/2001', end='1/1/2002 23:00') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='Min', start='12/31/2001', end='1/1/2002 00:20') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='S', start='12/31/2001 00:00:00', + end='12/31/2001 00:05:00') + self._check_all_fields(pi) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + self._check_all_fields(i1) + + def _check_all_fields(self, periodindex): + fields = ['year', 'month', 'day', 'hour', 'minute', 'second', + 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', + 'quarter', 'qyear', 'days_in_month', 'is_leap_year'] + + periods = list(periodindex) + s = pd.Series(periodindex) + + for field in fields: + field_idx = getattr(periodindex, field) + self.assertEqual(len(periodindex), len(field_idx)) + for x, val in zip(periods, field_idx): + self.assertEqual(getattr(x, field), val) + + if len(s) == 0: + continue + + field_s = getattr(s.dt, field) + self.assertEqual(len(periodindex), len(field_s)) + for x, val in zip(periods, field_s): + self.assertEqual(getattr(x, field), val) + + def test_indexing(self): + + # GH 4390, iat incorrectly indexing + index = period_range('1/1/2001', periods=10) + s = Series(randn(10), index=index) + expected = s[index[0]] + result = s.iat[0] + self.assertEqual(expected, result) + + def test_period_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = period_range('2011/01/01', periods=6, freq='M') + idx2 = period_range('2013', periods=6, 
freq='A') + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.set_index(idx2) + tm.assert_index_equal(df.index, idx2) + + def test_factorize(self): + idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', + '2014-03', '2014-03'], freq='M') + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01'], freq='M') + + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) + arr, idx = idx2.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) + exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') + arr, idx = idx2.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + def test_asobject_like(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + def test_is_(self): + create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', + end='12/1/2009') + index = create_index() + self.assertEqual(index.is_(index), True) + self.assertEqual(index.is_(create_index()), False) + self.assertEqual(index.is_(index.view()), True) + self.assertEqual( + index.is_(index.view().view().view().view().view()), True) + self.assertEqual(index.view().is_(index), True) + ind2 = index.view() + index.name = "Apple" + self.assertEqual(ind2.is_(index), True) + self.assertEqual(index.is_(index[:]), False) + self.assertEqual(index.is_(index.asfreq('M')), False) + self.assertEqual(index.is_(index.asfreq('A')), False) + self.assertEqual(index.is_(index - 2), False) + self.assertEqual(index.is_(index - 0), False) + + def test_comp_period(self): + idx = period_range('2007-01', periods=20, freq='M') + + result = idx < idx[10] + exp = idx.values < idx.values[10] + self.assert_numpy_array_equal(result, exp) + + def test_contains(self): + rng = period_range('2007-01', freq='M', periods=10) + + self.assertTrue(Period('2007-01', freq='M') in rng) + self.assertFalse(Period('2007-01', freq='D') in rng) + self.assertFalse(Period('2007-01', freq='2M') in rng) + + def test_contains_nat(self): + # GH13582 + idx = period_range('2007-01', freq='M', periods=10) + self.assertFalse(pd.NaT in idx) + self.assertFalse(None in idx) + self.assertFalse(float('nan') in idx) + self.assertFalse(np.nan in idx) + + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertTrue(pd.NaT in idx) + self.assertTrue(None in idx) + self.assertTrue(float('nan') in idx) + self.assertTrue(np.nan in idx) + + def test_periods_number_check(self): + 
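# the third positional argument of period_range is periods, so passing the freq string 'B' there must raise +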
with tm.assertRaises(ValueError): + period_range('2011-1-1', '2012-1-1', 'B') + + def test_start_time(self): + index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') + expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + tm.assert_index_equal(index.start_time, expected_index) + + def test_end_time(self): + index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') + expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') + tm.assert_index_equal(index.end_time, expected_index) + + def test_index_duplicate_periods(self): + # monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts[2007] + expected = ts[1:3] + tm.assert_series_equal(result, expected) + result[:] = 1 + self.assertTrue((ts[1:3] == 1).all()) + + # not monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN') + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts[2007] + expected = ts[idx == 2007] + tm.assert_series_equal(result, expected) + + def test_index_unique(self): + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') + self.assert_index_equal(idx.unique(), expected) + self.assertEqual(idx.nunique(), 3) + + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', + tz='US/Eastern') + expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', + tz='US/Eastern') + self.assert_index_equal(idx.unique(), expected) + self.assertEqual(idx.nunique(), 3) + + def test_shift_gh8083(self): + + # test shift for PeriodIndex + # GH8083 + drange = self.create_index() + result = drange.shift(1) + expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', + '2013-01-05', '2013-01-06'], freq='D') + self.assert_index_equal(result, expected) + + def test_shift(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') + + tm.assert_index_equal(pi1.shift(0), pi1) + + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) + + def test_shift_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(1) + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', + '2011-05'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, 
expected.name) + + def test_shift_ndarray(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(np.array([1, 2, 3, 4])) + expected = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(np.array([1, -2, 3, -4])) + expected = PeriodIndex(['2011-02', '2010-12', 'NaT', + '2010-12'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + + def test_negative_ordinals(self): + Period(ordinal=-1000, freq='A') + Period(ordinal=0, freq='A') + + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A') + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') + tm.assert_index_equal(idx1, idx2) + + def test_pindex_fieldaccessor_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2012-03', '2012-04'], freq='D') + + exp = np.array([2011, 2011, -1, 2012, 2012], dtype=np.int64) + self.assert_numpy_array_equal(idx.year, exp) + exp = np.array([1, 2, -1, 3, 4], dtype=np.int64) + self.assert_numpy_array_equal(idx.month, exp) + + def test_pindex_qaccess(self): + pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') + s = Series(np.random.rand(len(pi)), index=pi).cumsum() + # Todo: fix these accessors! + self.assertEqual(s['05Q4'], s[2]) + + def test_numpy_repeat(self): + index = period_range('20010101', periods=2) + expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), + Period('2001-01-02'), Period('2001-01-02')]) + + tm.assert_index_equal(np.repeat(index, 2), expected) + + msg = "the 'axis' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.repeat, index, 2, axis=1) + + def test_pindex_multiples(self): + pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M') + expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07', + '2011-09', '2011-11'], freq='2M') + tm.assert_index_equal(pi, expected) + self.assertEqual(pi.freq, offsets.MonthEnd(2)) + self.assertEqual(pi.freqstr, '2M') + + pi = period_range(start='1/1/11', end='12/31/11', freq='2M') + tm.assert_index_equal(pi, expected) + self.assertEqual(pi.freq, offsets.MonthEnd(2)) + self.assertEqual(pi.freqstr, '2M') + + pi = period_range(start='1/1/11', periods=6, freq='2M') + tm.assert_index_equal(pi, expected) + self.assertEqual(pi.freq, offsets.MonthEnd(2)) + self.assertEqual(pi.freqstr, '2M') + + def test_iteration(self): + index = PeriodIndex(start='1/1/10', periods=4, freq='B') + + result = list(index) + tm.assertIsInstance(result[0], Period) + self.assertEqual(result[0].freq, index.freq) + + def test_is_full(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + self.assertFalse(index.is_full) + + index = PeriodIndex([2005, 2006, 2007], freq='A') + self.assertTrue(index.is_full) + + index = PeriodIndex([2005, 2005, 2007], freq='A') + self.assertFalse(index.is_full) + + index = PeriodIndex([2005, 2005, 2006], freq='A') + self.assertTrue(index.is_full) + + index = PeriodIndex([2006, 2005, 2005], freq='A') + self.assertRaises(ValueError, getattr, index, 'is_full') + + self.assertTrue(index[:0].is_full) + + def test_with_multi_index(self): + # #1705 + index = date_range('1/1/2012', periods=4, freq='12H') + index_as_arrays = [index.to_period(freq='D'), index.hour] + + s = Series([0, 1, 2, 3], index_as_arrays) + + tm.assertIsInstance(s.index.levels[0], PeriodIndex) + + tm.assertIsInstance(s.index.values[0][0], Period) + + def 
test_convert_array_of_periods(self): + rng = period_range('1/1/2000', periods=20, freq='D') + periods = list(rng) + + result = pd.Index(periods) + tm.assertIsInstance(result, PeriodIndex) + + def test_append_concat(self): + # #1815 + d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') + d2 = date_range('12/31/2000', '12/31/2009', freq='A-DEC') + + s1 = Series(np.random.randn(10), d1) + s2 = Series(np.random.randn(10), d2) + + s1 = s1.to_period() + s2 = s2.to_period() + + # drops index + result = pd.concat([s1, s2]) + tm.assertIsInstance(result.index, PeriodIndex) + self.assertEqual(result.index[0], s1.index[0]) + + def test_pickle_freq(self): + # GH2891 + prng = period_range('1/1/2011', '1/1/2012', freq='M') + new_prng = self.round_trip_pickle(prng) + self.assertEqual(new_prng.freq, offsets.MonthEnd()) + self.assertEqual(new_prng.freqstr, 'M') + + def test_map(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + result = index.map(lambda x: x + 1) + expected = index + 1 + tm.assert_index_equal(result, expected) + + result = index.map(lambda x: x.ordinal) + exp = Index([x.ordinal for x in index]) + tm.assert_index_equal(result, exp) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py new file mode 100644 index 0000000000000..06e15f9175ed8 --- /dev/null +++ b/pandas/tests/indexes/period/test_setops.py @@ -0,0 +1,157 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +import pandas.tseries.period as period +from pandas import period_range, PeriodIndex, Index, date_range + + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_joins(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + for kind in ['inner', 'outer', 'left', 'right']: + joined = index.join(index[:-5], how=kind) + + tm.assertIsInstance(joined, PeriodIndex) + self.assertEqual(joined.freq, index.freq) + + def test_join_self(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + for kind in ['inner', 'outer', 'left', 'right']: + res = index.join(index, how=kind) + self.assertIs(index, res) + + def test_join_does_not_recur(self): + df = tm.makeCustomDataframe( + 3, 2, data_gen_f=lambda *args: np.random.randint(2), + c_idx_type='p', r_idx_type='dt') + s = df.iloc[:2, 0] + + res = s.index.join(df.columns, how='outer') + expected = Index([s.index[0], s.index[1], + df.columns[0], df.columns[1]], object) + tm.assert_index_equal(res, expected) + + def test_union(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + result = index[:-5].union(index[10:]) + tm.assert_index_equal(result, index) + + # not in order + result = _permute(index[:-5]).union(_permute(index[10:])) + tm.assert_index_equal(result, index) + + # raise if different frequencies + index = period_range('1/1/2000', '1/20/2000', freq='D') + index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + with tm.assertRaises(period.IncompatibleFrequency): + index.union(index2) + + msg = 'can only call with other PeriodIndex-ed objects' + with tm.assertRaisesRegexp(ValueError, msg): + index.join(index.to_timestamp()) + + index3 = period_range('1/1/2000', '1/20/2000', freq='2D') + with tm.assertRaises(period.IncompatibleFrequency): + index.join(index3) + + def test_union_dataframe_index(self): + rng1 = pd.period_range('1/1/1999', '1/1/2012', freq='M') + s1 = pd.Series(np.random.randn(len(rng1)), rng1) + + rng2 = pd.period_range('1/1/1980', 
'12/1/2001', freq='M') + s2 = pd.Series(np.random.randn(len(rng2)), rng2) + df = pd.DataFrame({'s1': s1, 's2': s2}) + + exp = pd.period_range('1/1/1980', '1/1/2012', freq='M') + self.assert_index_equal(df.index, exp) + + def test_intersection(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + result = index[:-5].intersection(index[10:]) + tm.assert_index_equal(result, index[10:-5]) + + # not in order + left = _permute(index[:-5]) + right = _permute(index[10:]) + result = left.intersection(right).sort_values() + tm.assert_index_equal(result, index[10:-5]) + + # raise if different frequencies + index = period_range('1/1/2000', '1/20/2000', freq='D') + index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + with tm.assertRaises(period.IncompatibleFrequency): + index.intersection(index2) + + index3 = period_range('1/1/2000', '1/20/2000', freq='2D') + with tm.assertRaises(period.IncompatibleFrequency): + index.intersection(index3) + + def test_intersection_cases(self): + base = period_range('6/1/2000', '6/30/2000', freq='D', name='idx') + + # if target has the same name, it is preserved + rng2 = period_range('5/15/2000', '6/20/2000', freq='D', name='idx') + expected2 = period_range('6/1/2000', '6/20/2000', freq='D', + name='idx') + + # if target name is different, it will be reset + rng3 = period_range('5/15/2000', '6/20/2000', freq='D', name='other') + expected3 = period_range('6/1/2000', '6/20/2000', freq='D', + name=None) + + rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx') + expected4 = PeriodIndex([], name='idx', freq='D') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), + (rng4, expected4)]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + # non-monotonic + base = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-02', + '2011-01-03'], freq='D', name='idx') + + rng2 = PeriodIndex(['2011-01-04', '2011-01-02', + '2011-02-02', '2011-02-03'], + freq='D', name='idx') + expected2 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D', + name='idx') + + rng3 = PeriodIndex(['2011-01-04', '2011-01-02', '2011-02-02', + '2011-02-03'], + freq='D', name='other') + expected3 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D', + name=None) + + rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx') + expected4 = PeriodIndex([], freq='D', name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), + (rng4, expected4)]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, 'D') + + # empty same freq + rng = date_range('6/1/2000', '6/15/2000', freq='T') + result = rng[0:0].intersection(rng) + self.assertEqual(len(result), 0) + + result = rng.intersection(rng[0:0]) + self.assertEqual(len(result), 0) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py new file mode 100644 index 0000000000000..e09d405afd375 --- /dev/null +++ b/pandas/tests/indexes/period/test_tools.py @@ -0,0 +1,449 @@ +import numpy as np +from datetime import datetime, timedelta + +import pandas as pd +import pandas.util.testing as tm +import pandas.tseries.period as period +from pandas.compat import lrange +from pandas.tseries.frequencies import get_freq, MONTHS +from pandas._period import period_ordinal, period_asfreq +from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, 
Series, + date_range, to_datetime, period_range) + + +class TestPeriodRepresentation(tm.TestCase): + """ + Wish to match NumPy units + """ + + def _check_freq(self, freq, base_date): + rng = PeriodIndex(start=base_date, periods=10, freq=freq) + exp = np.arange(10, dtype=np.int64) + self.assert_numpy_array_equal(rng._values, exp) + self.assert_numpy_array_equal(rng.asi8, exp) + + def test_annual(self): + self._check_freq('A', 1970) + + def test_monthly(self): + self._check_freq('M', '1970-01') + + def test_weekly(self): + self._check_freq('W-THU', '1970-01-01') + + def test_daily(self): + self._check_freq('D', '1970-01-01') + + def test_business_daily(self): + self._check_freq('B', '1970-01-01') + + def test_hourly(self): + self._check_freq('H', '1970-01-01') + + def test_minutely(self): + self._check_freq('T', '1970-01-01') + + def test_secondly(self): + self._check_freq('S', '1970-01-01') + + def test_millisecondly(self): + self._check_freq('L', '1970-01-01') + + def test_microsecondly(self): + self._check_freq('U', '1970-01-01') + + def test_nanosecondly(self): + self._check_freq('N', '1970-01-01') + + def test_negone_ordinals(self): + freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] + + period = Period(ordinal=-1, freq='D') + for freq in freqs: + repr(period.asfreq(freq)) + + for freq in freqs: + period = Period(ordinal=-1, freq=freq) + repr(period) + self.assertEqual(period.year, 1969) + + period = Period(ordinal=-1, freq='B') + repr(period) + period = Period(ordinal=-1, freq='W') + repr(period) + + +class TestTslib(tm.TestCase): + def test_intraday_conversion_factors(self): + self.assertEqual(period_asfreq( + 1, get_freq('D'), get_freq('H'), False), 24) + self.assertEqual(period_asfreq( + 1, get_freq('D'), get_freq('T'), False), 1440) + self.assertEqual(period_asfreq( + 1, get_freq('D'), get_freq('S'), False), 86400) + self.assertEqual(period_asfreq(1, get_freq( + 'D'), get_freq('L'), False), 86400000) + self.assertEqual(period_asfreq(1, get_freq( + 'D'), get_freq('U'), False), 86400000000) + self.assertEqual(period_asfreq(1, get_freq( + 'D'), get_freq('N'), False), 86400000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('H'), get_freq('T'), False), 60) + self.assertEqual(period_asfreq( + 1, get_freq('H'), get_freq('S'), False), 3600) + self.assertEqual(period_asfreq(1, get_freq('H'), + get_freq('L'), False), 3600000) + self.assertEqual(period_asfreq(1, get_freq( + 'H'), get_freq('U'), False), 3600000000) + self.assertEqual(period_asfreq(1, get_freq( + 'H'), get_freq('N'), False), 3600000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('T'), get_freq('S'), False), 60) + self.assertEqual(period_asfreq( + 1, get_freq('T'), get_freq('L'), False), 60000) + self.assertEqual(period_asfreq(1, get_freq( + 'T'), get_freq('U'), False), 60000000) + self.assertEqual(period_asfreq(1, get_freq( + 'T'), get_freq('N'), False), 60000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('S'), get_freq('L'), False), 1000) + self.assertEqual(period_asfreq(1, get_freq('S'), + get_freq('U'), False), 1000000) + self.assertEqual(period_asfreq(1, get_freq( + 'S'), get_freq('N'), False), 1000000000) + + self.assertEqual(period_asfreq( + 1, get_freq('L'), get_freq('U'), False), 1000) + self.assertEqual(period_asfreq(1, get_freq('L'), + get_freq('N'), False), 1000000) + + self.assertEqual(period_asfreq( + 1, get_freq('U'), get_freq('N'), False), 1000) + + def test_period_ordinal_start_values(self): + # information for 1.1.1970 + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + 
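# the epoch 1970-01-01 maps to ordinal 0 at annual frequency +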
get_freq('A'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('M'))) + self.assertEqual(1, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('W'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('D'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, + get_freq('B'))) + + def test_period_ordinal_week(self): + self.assertEqual(1, period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, + get_freq('W'))) + self.assertEqual(2, period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, + get_freq('W'))) + + self.assertEqual(2284, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, + get_freq('W'))) + self.assertEqual(2285, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, + get_freq('W'))) + + def test_period_ordinal_business_day(self): + # Thursday + self.assertEqual(11415, period_ordinal(2013, 10, 3, 0, 0, 0, 0, 0, + get_freq('B'))) + # Friday + self.assertEqual(11416, period_ordinal(2013, 10, 4, 0, 0, 0, 0, 0, + get_freq('B'))) + # Saturday + self.assertEqual(11417, period_ordinal(2013, 10, 5, 0, 0, 0, 0, 0, + get_freq('B'))) + # Sunday + self.assertEqual(11417, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, + get_freq('B'))) + # Monday + self.assertEqual(11417, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, + get_freq('B'))) + # Tuesday + self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, + get_freq('B'))) + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_tolist(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + rs = index.tolist() + [tm.assertIsInstance(x, Period) for x in rs] + + recon = PeriodIndex(rs) + tm.assert_index_equal(index, recon) + + def test_to_timestamp(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + series = Series(1, index=index, name='foo') + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = series.to_timestamp(how='end') + tm.assert_index_equal(result.index, exp_index) + self.assertEqual(result.name, 'foo') + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') + result = series.to_timestamp(how='start') + tm.assert_index_equal(result.index, exp_index) + + def _get_with_delta(delta, freq='A-DEC'): + return date_range(to_datetime('1/1/2001') + delta, + to_datetime('12/31/2009') + delta, freq=freq) + + delta = timedelta(hours=23) + result = series.to_timestamp('H', 'end') + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.index, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = series.to_timestamp('T', 'end') + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.index, exp_index) + + result = series.to_timestamp('S', 'end') + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + tm.assert_index_equal(result.index, exp_index) + + index = PeriodIndex(freq='H', start='1/1/2001', end='1/2/2001') + series = Series(1, index=index, name='foo') + + exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', + freq='H') + result = series.to_timestamp(how='end') + tm.assert_index_equal(result.index, exp_index) + self.assertEqual(result.name, 'foo') + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(lrange(1, 5), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + stamps = pindex.to_timestamp('D', 'end') + expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) + tm.assert_index_equal(stamps, expected) + + def 
test_to_timestamp_preserve_name(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', + name='foo') + self.assertEqual(index.name, 'foo') + + conv = index.to_timestamp('D') + self.assertEqual(conv.name, 'foo') + + def test_to_timestamp_repr_is_code(self): + zs = [Timestamp('99-04-17 00:00:00', tz='UTC'), + Timestamp('2001-04-17 00:00:00', tz='UTC'), + Timestamp('2001-04-17 00:00:00', tz='America/Los_Angeles'), + Timestamp('2001-04-17 00:00:00', tz=None)] + for z in zs: + self.assertEqual(eval(repr(z)), z) + + def test_to_timestamp_pi_nat(self): + # GH 7228 + index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', + name='idx') + + result = index.to_timestamp('D') + expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1), + datetime(2011, 2, 1)], name='idx') + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, 'idx') + + result2 = result.to_period(freq='M') + tm.assert_index_equal(result2, index) + self.assertEqual(result2.name, 'idx') + + result3 = result.to_period(freq='3M') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') + self.assert_index_equal(result3, exp) + self.assertEqual(result3.freqstr, '3M') + + msg = ('Frequency must be positive, because it' + ' represents span: -2A') + with tm.assertRaisesRegexp(ValueError, msg): + result.to_period(freq='-2A') + + def test_to_timestamp_pi_mult(self): + idx = PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='2M', name='idx') + result = idx.to_timestamp() + expected = DatetimeIndex( + ['2011-01-01', 'NaT', '2011-02-01'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E') + expected = DatetimeIndex( + ['2011-02-28', 'NaT', '2011-03-31'], name='idx') + self.assert_index_equal(result, expected) + + def test_to_timestamp_pi_combined(self): + idx = PeriodIndex(start='2011', periods=2, freq='1D1H', name='idx') + result = idx.to_timestamp() + expected = DatetimeIndex( + ['2011-01-01 00:00', '2011-01-02 01:00'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E') + expected = DatetimeIndex( + ['2011-01-02 00:59:59', '2011-01-03 01:59:59'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E', freq='H') + expected = DatetimeIndex( + ['2011-01-02 00:00', '2011-01-03 01:00'], name='idx') + self.assert_index_equal(result, expected) + + def test_to_timestamp_to_period_astype(self): + idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') + + res = idx.astype('period[M]') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') + tm.assert_index_equal(res, exp) + + res = idx.astype('period[3M]') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') + self.assert_index_equal(res, exp) + + def test_dti_to_period(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + pi1 = dti.to_period() + pi2 = dti.to_period(freq='D') + pi3 = dti.to_period(freq='3D') + + self.assertEqual(pi1[0], Period('Jan 2005', freq='M')) + self.assertEqual(pi2[0], Period('1/31/2005', freq='D')) + self.assertEqual(pi3[0], Period('1/31/2005', freq='3D')) + + self.assertEqual(pi1[-1], Period('Nov 2005', freq='M')) + self.assertEqual(pi2[-1], Period('11/30/2005', freq='D')) + self.assertEqual(pi3[-1], Period('11/30/2005', freq='3D')) + + tm.assert_index_equal(pi1, period_range('1/1/2005', '11/1/2005', + freq='M')) + tm.assert_index_equal(pi2, period_range('1/1/2005', '11/1/2005', + freq='M').asfreq('D')) + tm.assert_index_equal(pi3, 
period_range('1/1/2005', '11/1/2005', + freq='M').asfreq('3D')) + + def test_period_astype_to_timestamp(self): + pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) + tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) + tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]') + tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]', how='end') + tm.assert_index_equal(res, exp) + + def test_to_period_quarterly(self): + # make sure we can make the round trip + for month in MONTHS: + freq = 'Q-%s' % month + rng = period_range('1989Q3', '1991Q3', freq=freq) + stamps = rng.to_timestamp() + result = stamps.to_period(freq) + tm.assert_index_equal(rng, result) + + def test_to_period_quarterlyish(self): + offsets = ['BQ', 'QS', 'BQS'] + for off in offsets: + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + self.assertEqual(prng.freq, 'Q-DEC') + + def test_to_period_annualish(self): + offsets = ['BA', 'AS', 'BAS'] + for off in offsets: + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + self.assertEqual(prng.freq, 'A-DEC') + + def test_to_period_monthish(self): + offsets = ['MS', 'BM'] + for off in offsets: + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + self.assertEqual(prng.freq, 'M') + + rng = date_range('01-Jan-2012', periods=8, freq='M') + prng = rng.to_period() + self.assertEqual(prng.freq, 'M') + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + date_range('01-Jan-2012', periods=8, freq='EOM') + + def test_period_dt64_round_trip(self): + dti = date_range('1/1/2000', '1/7/2002', freq='B') + pi = dti.to_period() + tm.assert_index_equal(pi.to_timestamp(), dti) + + dti = date_range('1/1/2000', '1/7/2002', freq='B') + pi = dti.to_period(freq='H') + tm.assert_index_equal(pi.to_timestamp(), dti) + + def test_to_timestamp_1703(self): + index = period_range('1/1/2012', periods=4, freq='D') + + result = index.to_timestamp() + self.assertEqual(result[0], Timestamp('1/1/2012')) + + def test_to_datetime_depr(self): + index = period_range('1/1/2012', periods=4, freq='D') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = index.to_datetime() + self.assertEqual(result[0], Timestamp('1/1/2012')) + + def test_combine_first(self): + # GH 3367 + didx = pd.DatetimeIndex(start='1950-01-31', end='1950-07-31', freq='M') + pidx = pd.PeriodIndex(start=pd.Period('1950-1'), + end=pd.Period('1950-7'), freq='M') + # check to be consistent with DatetimeIndex + for idx in [didx, pidx]: + a = pd.Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx) + b = pd.Series([9, 9, 9, 9, 9, 9, 9], index=idx) + + result = a.combine_first(b) + expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx, + dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_searchsorted(self): + for freq in ['D', '2D']: + pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03', + '2014-01-04', '2014-01-05'], freq=freq) + + p1 = pd.Period('2014-01-01', freq=freq) + 
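# searchsorted with a matching-freq Period returns the left insertion position +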
self.assertEqual(pidx.searchsorted(p1), 0) + + p2 = pd.Period('2014-01-04', freq=freq) + self.assertEqual(pidx.searchsorted(p2), 3) + + msg = "Input has different freq=H from PeriodIndex" + with self.assertRaisesRegexp(period.IncompatibleFrequency, msg): + pidx.searchsorted(pd.Period('2014-01-01', freq='H')) + + msg = "Input has different freq=5D from PeriodIndex" + with self.assertRaisesRegexp(period.IncompatibleFrequency, msg): + pidx.searchsorted(pd.Period('2014-01-01', freq='5D')) + + with tm.assert_produces_warning(FutureWarning): + pidx.searchsorted(key=p2) diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py new file mode 100644 index 0000000000000..c94a7c62a6dc9 --- /dev/null +++ b/pandas/tests/scalar/test_period.py @@ -0,0 +1,2074 @@ +import numpy as np +from datetime import datetime, date, timedelta + +import pandas as pd +import pandas.util.testing as tm +import pandas.tseries.period as period +from pandas.compat import text_type, iteritems +from pandas.compat.numpy import np_datetime64_compat +from pandas import Period, Timestamp, tslib, offsets, _period +from pandas.tseries.frequencies import DAYS, MONTHS, _period_code_map + + +class TestPeriodProperties(tm.TestCase): + "Test properties such as year, month, weekday, etc...." + + def test_quarterly_negative_ordinals(self): + p = Period(ordinal=-1, freq='Q-DEC') + self.assertEqual(p.year, 1969) + self.assertEqual(p.quarter, 4) + self.assertIsInstance(p, Period) + + p = Period(ordinal=-2, freq='Q-DEC') + self.assertEqual(p.year, 1969) + self.assertEqual(p.quarter, 3) + self.assertIsInstance(p, Period) + + p = Period(ordinal=-2, freq='M') + self.assertEqual(p.year, 1969) + self.assertEqual(p.month, 11) + self.assertIsInstance(p, Period) + + def test_period_cons_quarterly(self): + # bugs in scikits.timeseries + for month in MONTHS: + freq = 'Q-%s' % month + exp = Period('1989Q3', freq=freq) + self.assertIn('1989Q3', str(exp)) + stamp = exp.to_timestamp('D', how='end') + p = Period(stamp, freq=freq) + self.assertEqual(p, exp) + + stamp = exp.to_timestamp('3D', how='end') + p = Period(stamp, freq=freq) + self.assertEqual(p, exp) + + def test_period_cons_annual(self): + # bugs in scikits.timeseries + for month in MONTHS: + freq = 'A-%s' % month + exp = Period('1989', freq=freq) + stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) + p = Period(stamp, freq=freq) + self.assertEqual(p, exp + 1) + self.assertIsInstance(p, Period) + + def test_period_cons_weekly(self): + for num in range(10, 17): + daystr = '2011-02-%d' % num + for day in DAYS: + freq = 'W-%s' % day + + result = Period(daystr, freq=freq) + expected = Period(daystr, freq='D').asfreq(freq) + self.assertEqual(result, expected) + self.assertIsInstance(result, Period) + + def test_period_from_ordinal(self): + p = pd.Period('2011-01', freq='M') + res = pd.Period._from_ordinal(p.ordinal, freq='M') + self.assertEqual(p, res) + self.assertIsInstance(res, Period) + + def test_period_cons_nat(self): + p = Period('NaT', freq='M') + self.assertIs(p, pd.NaT) + + p = Period('nat', freq='W-SUN') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT, freq='D') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT, freq='3D') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT, freq='1D1H') + self.assertIs(p, pd.NaT) + + p = Period('NaT') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT) + self.assertIs(p, pd.NaT) + + def test_cons_null_like(self): + # check Timestamp compat + self.assertIs(Timestamp('NaT'), pd.NaT) + self.assertIs(Period('NaT'), 
pd.NaT) + + self.assertIs(Timestamp(None), pd.NaT) + self.assertIs(Period(None), pd.NaT) + + self.assertIs(Timestamp(float('nan')), pd.NaT) + self.assertIs(Period(float('nan')), pd.NaT) + + self.assertIs(Timestamp(np.nan), pd.NaT) + self.assertIs(Period(np.nan), pd.NaT) + + def test_period_cons_mult(self): + p1 = Period('2011-01', freq='3M') + p2 = Period('2011-01', freq='M') + self.assertEqual(p1.ordinal, p2.ordinal) + + self.assertEqual(p1.freq, offsets.MonthEnd(3)) + self.assertEqual(p1.freqstr, '3M') + + self.assertEqual(p2.freq, offsets.MonthEnd()) + self.assertEqual(p2.freqstr, 'M') + + result = p1 + 1 + self.assertEqual(result.ordinal, (p2 + 3).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '3M') + + result = p1 - 1 + self.assertEqual(result.ordinal, (p2 - 3).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '3M') + + msg = ('Frequency must be positive, because it' + ' represents span: -3M') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-3M') + + msg = ('Frequency must be positive, because it' ' represents span: 0M') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='0M') + + def test_period_cons_combined(self): + p = [(Period('2011-01', freq='1D1H'), + Period('2011-01', freq='1H1D'), + Period('2011-01', freq='H')), + (Period(ordinal=1, freq='1D1H'), + Period(ordinal=1, freq='1H1D'), + Period(ordinal=1, freq='H'))] + + for p1, p2, p3 in p: + self.assertEqual(p1.ordinal, p3.ordinal) + self.assertEqual(p2.ordinal, p3.ordinal) + + self.assertEqual(p1.freq, offsets.Hour(25)) + self.assertEqual(p1.freqstr, '25H') + + self.assertEqual(p2.freq, offsets.Hour(25)) + self.assertEqual(p2.freqstr, '25H') + + self.assertEqual(p3.freq, offsets.Hour()) + self.assertEqual(p3.freqstr, 'H') + + result = p1 + 1 + self.assertEqual(result.ordinal, (p3 + 25).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '25H') + + result = p2 + 1 + self.assertEqual(result.ordinal, (p3 + 25).ordinal) + self.assertEqual(result.freq, p2.freq) + self.assertEqual(result.freqstr, '25H') + + result = p1 - 1 + self.assertEqual(result.ordinal, (p3 - 25).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '25H') + + result = p2 - 1 + self.assertEqual(result.ordinal, (p3 - 25).ordinal) + self.assertEqual(result.freq, p2.freq) + self.assertEqual(result.freqstr, '25H') + + msg = ('Frequency must be positive, because it' + ' represents span: -25H') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-1D1H') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-1H1D') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='-1D1H') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='-1H1D') + + msg = ('Frequency must be positive, because it' + ' represents span: 0D') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='0D0H') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='0D0H') + + # You can only combine together day and intraday offsets + msg = ('Invalid frequency: 1W1D') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='1W1D') + msg = ('Invalid frequency: 1D1W') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='1D1W') + + def test_timestamp_tz_arg(self): + tm._skip_if_no_pytz() + import pytz + for case in ['Europe/Brussels', 'Asia/Tokyo', 'US/Pacific']: + p = 
Period('1/1/2005', freq='M').to_timestamp(tz=case) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) + exp_zone = pytz.timezone(case).normalize(p) + + self.assertEqual(p, exp) + self.assertEqual(p.tz, exp_zone.tzinfo) + self.assertEqual(p.tz, exp.tz) + + p = Period('1/1/2005', freq='3H').to_timestamp(tz=case) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) + exp_zone = pytz.timezone(case).normalize(p) + + self.assertEqual(p, exp) + self.assertEqual(p.tz, exp_zone.tzinfo) + self.assertEqual(p.tz, exp.tz) + + p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=case) + exp = Timestamp('31/12/2005', tz='UTC').tz_convert(case) + exp_zone = pytz.timezone(case).normalize(p) + + self.assertEqual(p, exp) + self.assertEqual(p.tz, exp_zone.tzinfo) + self.assertEqual(p.tz, exp.tz) + + p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=case) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) + exp_zone = pytz.timezone(case).normalize(p) + + self.assertEqual(p, exp) + self.assertEqual(p.tz, exp_zone.tzinfo) + self.assertEqual(p.tz, exp.tz) + + def test_timestamp_tz_arg_dateutil(self): + from pandas.tslib import _dateutil_gettz as gettz + from pandas.tslib import maybe_get_tz + for case in ['dateutil/Europe/Brussels', 'dateutil/Asia/Tokyo', + 'dateutil/US/Pacific']: + p = Period('1/1/2005', freq='M').to_timestamp( + tz=maybe_get_tz(case)) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) + self.assertEqual(p, exp) + self.assertEqual(p.tz, gettz(case.split('/', 1)[1])) + self.assertEqual(p.tz, exp.tz) + + p = Period('1/1/2005', + freq='M').to_timestamp(freq='3H', tz=maybe_get_tz(case)) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) + self.assertEqual(p, exp) + self.assertEqual(p.tz, gettz(case.split('/', 1)[1])) + self.assertEqual(p.tz, exp.tz) + + def test_timestamp_tz_arg_dateutil_from_string(self): + from pandas.tslib import _dateutil_gettz as gettz + p = Period('1/1/2005', + freq='M').to_timestamp(tz='dateutil/Europe/Brussels') + self.assertEqual(p.tz, gettz('Europe/Brussels')) + + def test_timestamp_mult(self): + p = pd.Period('2011-01', freq='M') + self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) + self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-01-31')) + + p = pd.Period('2011-01', freq='3M') + self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) + self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-03-31')) + + def test_period_constructor(self): + i1 = Period('1/1/2005', freq='M') + i2 = Period('Jan 2005') + + self.assertEqual(i1, i2) + + i1 = Period('2005', freq='A') + i2 = Period('2005') + i3 = Period('2005', freq='a') + + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + + i4 = Period('2005', freq='M') + i5 = Period('2005', freq='m') + + self.assertRaises(ValueError, i1.__ne__, i4) + self.assertEqual(i4, i5) + + i1 = Period.now('Q') + i2 = Period(datetime.now(), freq='Q') + i3 = Period.now('q') + + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + + # Biz day construction, roll forward if non-weekday + i1 = Period('3/10/12', freq='B') + i2 = Period('3/10/12', freq='D') + self.assertEqual(i1, i2.asfreq('B')) + i2 = Period('3/11/12', freq='D') + self.assertEqual(i1, i2.asfreq('B')) + i2 = Period('3/12/12', freq='D') + self.assertEqual(i1, i2.asfreq('B')) + + i3 = Period('3/10/12', freq='b') + self.assertEqual(i1, i3) + + i1 = Period(year=2005, quarter=1, freq='Q') + i2 = Period('1/1/2005', freq='Q') + self.assertEqual(i1, i2) + + i1 = Period(year=2005, quarter=3, freq='Q') + 
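# any date inside Q3 2005 (Jul-Sep) parses to the same quarterly period +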
i2 = Period('9/1/2005', freq='Q') + self.assertEqual(i1, i2) + + i1 = Period(year=2005, month=3, day=1, freq='D') + i2 = Period('3/1/2005', freq='D') + self.assertEqual(i1, i2) + + i3 = Period(year=2005, month=3, day=1, freq='d') + self.assertEqual(i1, i3) + + i1 = Period(year=2012, month=3, day=10, freq='B') + i2 = Period('3/12/12', freq='B') + self.assertEqual(i1, i2) + + i1 = Period('2005Q1') + i2 = Period(year=2005, quarter=1, freq='Q') + i3 = Period('2005q1') + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + + i1 = Period('05Q1') + self.assertEqual(i1, i2) + lower = Period('05q1') + self.assertEqual(i1, lower) + + i1 = Period('1Q2005') + self.assertEqual(i1, i2) + lower = Period('1q2005') + self.assertEqual(i1, lower) + + i1 = Period('1Q05') + self.assertEqual(i1, i2) + lower = Period('1q05') + self.assertEqual(i1, lower) + + i1 = Period('4Q1984') + self.assertEqual(i1.year, 1984) + lower = Period('4q1984') + self.assertEqual(i1, lower) + + i1 = Period('1982', freq='min') + i2 = Period('1982', freq='MIN') + self.assertEqual(i1, i2) + i2 = Period('1982', freq=('Min', 1)) + self.assertEqual(i1, i2) + + expected = Period('2007-01', freq='M') + i1 = Period('200701', freq='M') + self.assertEqual(i1, expected) + + i1 = Period('200701', freq='M') + self.assertEqual(i1, expected) + + i1 = Period(200701, freq='M') + self.assertEqual(i1, expected) + + i1 = Period(ordinal=200701, freq='M') + self.assertEqual(i1.year, 18695) + + i1 = Period(datetime(2007, 1, 1), freq='M') + i2 = Period('200701', freq='M') + self.assertEqual(i1, i2) + + i1 = Period(date(2007, 1, 1), freq='M') + i2 = Period(datetime(2007, 1, 1), freq='M') + i3 = Period(np.datetime64('2007-01-01'), freq='M') + i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') + i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + self.assertEqual(i1, i4) + self.assertEqual(i1, i5) + + i1 = Period('2007-01-01 09:00:00.001') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + self.assertEqual(i1, expected) + + expected = Period(np_datetime64_compat( + '2007-01-01 09:00:00.001Z'), freq='L') + self.assertEqual(i1, expected) + + i1 = Period('2007-01-01 09:00:00.00101') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + self.assertEqual(i1, expected) + + expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), + freq='U') + self.assertEqual(i1, expected) + + self.assertRaises(ValueError, Period, ordinal=200701) + + self.assertRaises(ValueError, Period, '2007-1-1', freq='X') + + def test_period_constructor_offsets(self): + self.assertEqual(Period('1/1/2005', freq=offsets.MonthEnd()), + Period('1/1/2005', freq='M')) + self.assertEqual(Period('2005', freq=offsets.YearEnd()), + Period('2005', freq='A')) + self.assertEqual(Period('2005', freq=offsets.MonthEnd()), + Period('2005', freq='M')) + self.assertEqual(Period('3/10/12', freq=offsets.BusinessDay()), + Period('3/10/12', freq='B')) + self.assertEqual(Period('3/10/12', freq=offsets.Day()), + Period('3/10/12', freq='D')) + + self.assertEqual(Period(year=2005, quarter=1, + freq=offsets.QuarterEnd(startingMonth=12)), + Period(year=2005, quarter=1, freq='Q')) + self.assertEqual(Period(year=2005, quarter=2, + freq=offsets.QuarterEnd(startingMonth=12)), + Period(year=2005, quarter=2, freq='Q')) + + self.assertEqual(Period(year=2005, month=3, day=1, freq=offsets.Day()), + Period(year=2005, month=3, day=1, freq='D')) + self.assertEqual(Period(year=2012, month=3, day=10, 
+ freq=offsets.BDay()), + Period(year=2012, month=3, day=10, freq='B')) + + expected = Period('2005-03-01', freq='3D') + self.assertEqual(Period(year=2005, month=3, day=1, + freq=offsets.Day(3)), expected) + self.assertEqual(Period(year=2005, month=3, day=1, freq='3D'), + expected) + + self.assertEqual(Period(year=2012, month=3, day=10, + freq=offsets.BDay(3)), + Period(year=2012, month=3, day=10, freq='3B')) + + self.assertEqual(Period(200701, freq=offsets.MonthEnd()), + Period(200701, freq='M')) + + i1 = Period(ordinal=200701, freq=offsets.MonthEnd()) + i2 = Period(ordinal=200701, freq='M') + self.assertEqual(i1, i2) + self.assertEqual(i1.year, 18695) + self.assertEqual(i2.year, 18695) + + i1 = Period(datetime(2007, 1, 1), freq='M') + i2 = Period('200701', freq='M') + self.assertEqual(i1, i2) + + i1 = Period(date(2007, 1, 1), freq='M') + i2 = Period(datetime(2007, 1, 1), freq='M') + i3 = Period(np.datetime64('2007-01-01'), freq='M') + i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') + i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + self.assertEqual(i1, i4) + self.assertEqual(i1, i5) + + i1 = Period('2007-01-01 09:00:00.001') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + self.assertEqual(i1, expected) + + expected = Period(np_datetime64_compat( + '2007-01-01 09:00:00.001Z'), freq='L') + self.assertEqual(i1, expected) + + i1 = Period('2007-01-01 09:00:00.00101') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + self.assertEqual(i1, expected) + + expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), + freq='U') + self.assertEqual(i1, expected) + + self.assertRaises(ValueError, Period, ordinal=200701) + + self.assertRaises(ValueError, Period, '2007-1-1', freq='X') + + def test_freq_str(self): + i1 = Period('1982', freq='Min') + self.assertEqual(i1.freq, offsets.Minute()) + self.assertEqual(i1.freqstr, 'T') + + def test_period_deprecated_freq(self): + cases = {"M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], + "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], + "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], + "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "T": ["minute", "MINUTE", "MINUTELY", "minutely"], + "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], + "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], + "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], + "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + for exp, freqs in iteritems(cases): + for freq in freqs: + with self.assertRaisesRegexp(ValueError, msg): + Period('2016-03-01 09:00', freq=freq) + with self.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq=freq) + + # check supported freq-aliases still works + p1 = Period('2016-03-01 09:00', freq=exp) + p2 = Period(ordinal=1, freq=exp) + tm.assertIsInstance(p1, Period) + tm.assertIsInstance(p2, Period) + + def test_hash(self): + self.assertEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01-01', freq='D')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='3M')), + hash(Period('2011-01', freq='2M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-02', freq='M'))) + + def test_repr(self): + p = Period('Jan-2000') + self.assertIn('2000-01', repr(p)) + + p = 
Period('2000-12-15')
+        self.assertIn('2000-12-15', repr(p))
+
+    def test_repr_nat(self):
+        p = Period('nat', freq='M')
+        self.assertIn(repr(tslib.NaT), repr(p))
+
+    def test_millisecond_repr(self):
+        p = Period('2000-01-01 12:15:02.123')
+
+        self.assertEqual("Period('2000-01-01 12:15:02.123', 'L')", repr(p))
+
+    def test_microsecond_repr(self):
+        p = Period('2000-01-01 12:15:02.123567')
+
+        self.assertEqual("Period('2000-01-01 12:15:02.123567', 'U')", repr(p))
+
+    def test_strftime(self):
+        p = Period('2000-1-1 12:34:12', freq='S')
+        res = p.strftime('%Y-%m-%d %H:%M:%S')
+        self.assertEqual(res, '2000-01-01 12:34:12')
+        tm.assertIsInstance(res, text_type)  # GH3363
+
+    def test_sub_delta(self):
+        left, right = Period('2011', freq='A'), Period('2007', freq='A')
+        result = left - right
+        self.assertEqual(result, 4)
+
+        with self.assertRaises(period.IncompatibleFrequency):
+            left - Period('2007-01', freq='M')
+
+    def test_to_timestamp(self):
+        p = Period('1982', freq='A')
+        start_ts = p.to_timestamp(how='S')
+        aliases = ['s', 'StarT', 'BEGIn']
+        for a in aliases:
+            self.assertEqual(start_ts, p.to_timestamp('D', how=a))
+            # freq with mult should not affect the result
+            self.assertEqual(start_ts, p.to_timestamp('3D', how=a))
+
+        end_ts = p.to_timestamp(how='E')
+        aliases = ['e', 'end', 'FINIsH']
+        for a in aliases:
+            self.assertEqual(end_ts, p.to_timestamp('D', how=a))
+            self.assertEqual(end_ts, p.to_timestamp('3D', how=a))
+
+        from_lst = ['A', 'Q', 'M', 'W', 'B', 'D', 'H', 'Min', 'S']
+
+        def _ex(p):
+            return Timestamp((p + 1).start_time.value - 1)
+
+        for i, fcode in enumerate(from_lst):
+            p = Period('1982', freq=fcode)
+            result = p.to_timestamp().to_period(fcode)
+            self.assertEqual(result, p)
+
+            self.assertEqual(p.start_time, p.to_timestamp(how='S'))
+
+            self.assertEqual(p.end_time, _ex(p))
+
+        # Frequency other than daily
+
+        p = Period('1985', freq='A')
+
+        result = p.to_timestamp('H', how='end')
+        expected = datetime(1985, 12, 31, 23)
+        self.assertEqual(result, expected)
+        result = p.to_timestamp('3H', how='end')
+        self.assertEqual(result, expected)
+
+        result = p.to_timestamp('T', how='end')
+        expected = datetime(1985, 12, 31, 23, 59)
+        self.assertEqual(result, expected)
+        result = p.to_timestamp('2T', how='end')
+        self.assertEqual(result, expected)
+
+        result = p.to_timestamp(how='end')
+        expected = datetime(1985, 12, 31)
+        self.assertEqual(result, expected)
+
+        expected = datetime(1985, 1, 1)
+        result = p.to_timestamp('H', how='start')
+        self.assertEqual(result, expected)
+        result = p.to_timestamp('T', how='start')
+        self.assertEqual(result, expected)
+        result = p.to_timestamp('S', how='start')
+        self.assertEqual(result, expected)
+        result = p.to_timestamp('3H', how='start')
+        self.assertEqual(result, expected)
+        result = p.to_timestamp('5S', how='start')
+        self.assertEqual(result, expected)
+
+    def test_start_time(self):
+        freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S']
+        xp = datetime(2012, 1, 1)
+        for f in freq_lst:
+            p = Period('2012', freq=f)
+            self.assertEqual(p.start_time, xp)
+        self.assertEqual(Period('2012', freq='B').start_time,
+                         datetime(2012, 1, 2))
+        self.assertEqual(Period('2012', freq='W').start_time,
+                         datetime(2011, 12, 26))
+
+    def test_end_time(self):
+        p = Period('2012', freq='A')
+
+        def _ex(*args):
+            return Timestamp(Timestamp(datetime(*args)).value - 1)
+
+        xp = _ex(2013, 1, 1)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='Q')
+        xp = _ex(2012, 4, 1)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='M')
+        xp = _ex(2012, 2, 1)
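Illustrative note (not part of the patch): the `_ex` helpers in these tests encode the invariant that a period's `end_time` is the last nanosecond inside the period, one nanosecond before the next period begins. A minimal sketch, assuming the frequency aliases of the pandas generation this series targets ('A' for annual):

    import pandas as pd

    p = pd.Period('2012', freq='A')
    # end_time sits one nanosecond before the next period's start_time
    assert p.end_time == (p + 1).start_time - pd.Timedelta(1, unit='ns')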
self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='D')
+        xp = _ex(2012, 1, 2)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='H')
+        xp = _ex(2012, 1, 1, 1)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='B')
+        xp = _ex(2012, 1, 3)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='W')
+        xp = _ex(2012, 1, 2)
+        self.assertEqual(xp, p.end_time)
+
+        # Test for GH 11738
+        p = Period('2012', freq='15D')
+        xp = _ex(2012, 1, 16)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='1D1H')
+        xp = _ex(2012, 1, 2, 1)
+        self.assertEqual(xp, p.end_time)
+
+        p = Period('2012', freq='1H1D')
+        xp = _ex(2012, 1, 2, 1)
+        self.assertEqual(xp, p.end_time)
+
+    def test_anchor_week_end_time(self):
+        def _ex(*args):
+            return Timestamp(Timestamp(datetime(*args)).value - 1)
+
+        p = Period('2013-1-1', 'W-SAT')
+        xp = _ex(2013, 1, 6)
+        self.assertEqual(p.end_time, xp)
+
+    def test_properties_annually(self):
+        # Test properties on Periods with annual frequency.
+        a_date = Period(freq='A', year=2007)
+        self.assertEqual(a_date.year, 2007)
+
+    def test_properties_quarterly(self):
+        # Test properties on Periods with quarterly frequency.
+        qedec_date = Period(freq="Q-DEC", year=2007, quarter=1)
+        qejan_date = Period(freq="Q-JAN", year=2007, quarter=1)
+        qejun_date = Period(freq="Q-JUN", year=2007, quarter=1)
+        #
+        for x in range(3):
+            for qd in (qedec_date, qejan_date, qejun_date):
+                self.assertEqual((qd + x).qyear, 2007)
+                self.assertEqual((qd + x).quarter, x + 1)
+
+    def test_properties_monthly(self):
+        # Test properties on Periods with monthly frequency.
+        m_date = Period(freq='M', year=2007, month=1)
+        for x in range(11):
+            m_ival_x = m_date + x
+            self.assertEqual(m_ival_x.year, 2007)
+            if 1 <= x + 1 <= 3:
+                self.assertEqual(m_ival_x.quarter, 1)
+            elif 4 <= x + 1 <= 6:
+                self.assertEqual(m_ival_x.quarter, 2)
+            elif 7 <= x + 1 <= 9:
+                self.assertEqual(m_ival_x.quarter, 3)
+            elif 10 <= x + 1 <= 12:
+                self.assertEqual(m_ival_x.quarter, 4)
+            self.assertEqual(m_ival_x.month, x + 1)
+
+    def test_properties_weekly(self):
+        # Test properties on Periods with weekly frequency.
+        w_date = Period(freq='W', year=2007, month=1, day=7)
+        #
+        self.assertEqual(w_date.year, 2007)
+        self.assertEqual(w_date.quarter, 1)
+        self.assertEqual(w_date.month, 1)
+        self.assertEqual(w_date.week, 1)
+        self.assertEqual((w_date - 1).week, 52)
+        self.assertEqual(w_date.days_in_month, 31)
+        self.assertEqual(Period(freq='W', year=2012,
+                                month=2, day=1).days_in_month, 29)
+
+    def test_properties_weekly_legacy(self):
+        # Test properties on Periods with weekly frequency.
+        w_date = Period(freq='W', year=2007, month=1, day=7)
+        self.assertEqual(w_date.year, 2007)
+        self.assertEqual(w_date.quarter, 1)
+        self.assertEqual(w_date.month, 1)
+        self.assertEqual(w_date.week, 1)
+        self.assertEqual((w_date - 1).week, 52)
+        self.assertEqual(w_date.days_in_month, 31)
+
+        exp = Period(freq='W', year=2012, month=2, day=1)
+        self.assertEqual(exp.days_in_month, 29)
+
+        msg = pd.tseries.frequencies._INVALID_FREQ_ERROR
+        with self.assertRaisesRegexp(ValueError, msg):
+            Period(freq='WK', year=2007, month=1, day=7)
+
+    def test_properties_daily(self):
+        # Test properties on Periods with daily frequency.
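Illustrative note (not part of the patch): the property tests above and below all check the same contract, namely that a Period exposes the calendar components of the instant it is anchored on. A minimal sketch under the same era assumptions:

    import pandas as pd

    d = pd.Period(year=2007, month=1, day=1, freq='D')
    # 2007-01-01 was a Monday: weekday 0, first day of the year
    print(d.year, d.quarter, d.month, d.day)        # 2007 1 1 1
    print(d.weekday, d.dayofyear, d.days_in_month)  # 0 1 31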
+        b_date = Period(freq='B', year=2007, month=1, day=1)
+        #
+        self.assertEqual(b_date.year, 2007)
+        self.assertEqual(b_date.quarter, 1)
+        self.assertEqual(b_date.month, 1)
+        self.assertEqual(b_date.day, 1)
+        self.assertEqual(b_date.weekday, 0)
+        self.assertEqual(b_date.dayofyear, 1)
+        self.assertEqual(b_date.days_in_month, 31)
+        self.assertEqual(Period(freq='B', year=2012,
+                                month=2, day=1).days_in_month, 29)
+        #
+        d_date = Period(freq='D', year=2007, month=1, day=1)
+        #
+        self.assertEqual(d_date.year, 2007)
+        self.assertEqual(d_date.quarter, 1)
+        self.assertEqual(d_date.month, 1)
+        self.assertEqual(d_date.day, 1)
+        self.assertEqual(d_date.weekday, 0)
+        self.assertEqual(d_date.dayofyear, 1)
+        self.assertEqual(d_date.days_in_month, 31)
+        self.assertEqual(Period(freq='D', year=2012, month=2,
+                                day=1).days_in_month, 29)
+
+    def test_properties_hourly(self):
+        # Test properties on Periods with hourly frequency.
+        h_date1 = Period(freq='H', year=2007, month=1, day=1, hour=0)
+        h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0)
+
+        for h_date in [h_date1, h_date2]:
+            self.assertEqual(h_date.year, 2007)
+            self.assertEqual(h_date.quarter, 1)
+            self.assertEqual(h_date.month, 1)
+            self.assertEqual(h_date.day, 1)
+            self.assertEqual(h_date.weekday, 0)
+            self.assertEqual(h_date.dayofyear, 1)
+            self.assertEqual(h_date.hour, 0)
+            self.assertEqual(h_date.days_in_month, 31)
+            self.assertEqual(Period(freq='H', year=2012, month=2, day=1,
+                                    hour=0).days_in_month, 29)
+
+    def test_properties_minutely(self):
+        # Test properties on Periods with minutely frequency.
+        t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0,
+                        minute=0)
+        #
+        self.assertEqual(t_date.quarter, 1)
+        self.assertEqual(t_date.month, 1)
+        self.assertEqual(t_date.day, 1)
+        self.assertEqual(t_date.weekday, 0)
+        self.assertEqual(t_date.dayofyear, 1)
+        self.assertEqual(t_date.hour, 0)
+        self.assertEqual(t_date.minute, 0)
+        self.assertEqual(t_date.days_in_month, 31)
+        self.assertEqual(Period(freq='Min', year=2012, month=2, day=1, hour=0,
+                                minute=0).days_in_month, 29)
+
+    def test_properties_secondly(self):
+        # Test properties on Periods with secondly frequency.
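Illustrative note (not part of the patch): sub-daily periods additionally expose `hour`, `minute` and `second`, and `days_in_month` accounts for leap years at any resolution. A sketch under the same assumptions ('S' meaning secondly):

    import pandas as pd

    s = pd.Period('2007-01-01 00:00:00', freq='S')
    print(s.hour, s.minute, s.second)  # 0 0 0
    # February 2012 has 29 days regardless of the period's resolution
    print(pd.Period('2012-02-01', freq='S').days_in_month)  # 29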
+        s_date = Period(freq='S', year=2007, month=1, day=1, hour=0,
+                        minute=0, second=0)
+        #
+        self.assertEqual(s_date.year, 2007)
+        self.assertEqual(s_date.quarter, 1)
+        self.assertEqual(s_date.month, 1)
+        self.assertEqual(s_date.day, 1)
+        self.assertEqual(s_date.weekday, 0)
+        self.assertEqual(s_date.dayofyear, 1)
+        self.assertEqual(s_date.hour, 0)
+        self.assertEqual(s_date.minute, 0)
+        self.assertEqual(s_date.second, 0)
+        self.assertEqual(s_date.days_in_month, 31)
+        self.assertEqual(Period(freq='S', year=2012, month=2, day=1, hour=0,
+                                minute=0, second=0).days_in_month, 29)
+
+    def test_properties_nat(self):
+        p_nat = Period('NaT', freq='M')
+        t_nat = pd.Timestamp('NaT')
+        self.assertIs(p_nat, t_nat)
+
+        # confirm Period('NaT') works identically to Timestamp('NaT')
+        for f in ['year', 'month', 'day', 'hour', 'minute', 'second', 'week',
+                  'dayofyear', 'quarter', 'days_in_month']:
+            self.assertTrue(np.isnan(getattr(p_nat, f)))
+            self.assertTrue(np.isnan(getattr(t_nat, f)))
+
+    def test_pnow(self):
+        dt = datetime.now()
+
+        val = period.pnow('D')
+        exp = Period(dt, freq='D')
+        self.assertEqual(val, exp)
+
+        val2 = period.pnow('2D')
+        exp2 = Period(dt, freq='2D')
+        self.assertEqual(val2, exp2)
+        self.assertEqual(val.ordinal, val2.ordinal)
+        self.assertEqual(val.ordinal, exp2.ordinal)
+
+    def test_constructor_corner(self):
+        expected = Period('2007-01', freq='2M')
+        self.assertEqual(Period(year=2007, month=1, freq='2M'), expected)
+
+        self.assertRaises(ValueError, Period, datetime.now())
+        self.assertRaises(ValueError, Period, datetime.now().date())
+        self.assertRaises(ValueError, Period, 1.6, freq='D')
+        self.assertRaises(ValueError, Period, ordinal=1.6, freq='D')
+        self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D')
+        self.assertIs(Period(None), pd.NaT)
+        self.assertRaises(ValueError, Period, month=1)
+
+        p = Period('2007-01-01', freq='D')
+
+        result = Period(p, freq='A')
+        exp = Period('2007', freq='A')
+        self.assertEqual(result, exp)
+
+    def test_constructor_infer_freq(self):
+        p = Period('2007-01-01')
+        self.assertEqual(p.freq, 'D')
+
+        p = Period('2007-01-01 07')
+        self.assertEqual(p.freq, 'H')
+
+        p = Period('2007-01-01 07:10')
+        self.assertEqual(p.freq, 'T')
+
+        p = Period('2007-01-01 07:10:15')
+        self.assertEqual(p.freq, 'S')
+
+        p = Period('2007-01-01 07:10:15.123')
+        self.assertEqual(p.freq, 'L')
+
+        p = Period('2007-01-01 07:10:15.123000')
+        self.assertEqual(p.freq, 'L')
+
+        p = Period('2007-01-01 07:10:15.123400')
+        self.assertEqual(p.freq, 'U')
+
+    def test_asfreq_MS(self):
+        initial = Period("2013")
+
+        self.assertEqual(initial.asfreq(freq="M", how="S"),
+                         Period('2013-01', 'M'))
+
+        msg = pd.tseries.frequencies._INVALID_FREQ_ERROR
+        with self.assertRaisesRegexp(ValueError, msg):
+            initial.asfreq(freq="MS", how="S")
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            pd.Period('2013-01', 'MS')
+
+        self.assertTrue(_period_code_map.get("MS") is None)
+
+    def test_badinput(self):
+        self.assertRaises(ValueError, Period, '-2000', 'A')
+        self.assertRaises(tslib.DateParseError, Period, '0', 'A')
+        self.assertRaises(tslib.DateParseError, Period, '1/1/-2000', 'A')
+
+    def test_multiples(self):
+        result1 = Period('1989', freq='2A')
+        result2 = Period('1989', freq='A')
+        self.assertEqual(result1.ordinal, result2.ordinal)
+        self.assertEqual(result1.freqstr, '2A-DEC')
+        self.assertEqual(result2.freqstr, 'A-DEC')
+        self.assertEqual(result1.freq, offsets.YearEnd(2))
+        self.assertEqual(result2.freq, offsets.YearEnd())
+
+        self.assertEqual((result1 + 1).ordinal, result1.ordinal +
2) + self.assertEqual((1 + result1).ordinal, result1.ordinal + 2) + self.assertEqual((result1 - 1).ordinal, result2.ordinal - 2) + self.assertEqual((-1 + result1).ordinal, result2.ordinal - 2) + + def test_round_trip(self): + + p = Period('2000Q1') + new_p = self.round_trip_pickle(p) + self.assertEqual(new_p, p) + + +class TestPeriodField(tm.TestCase): + + def test_get_period_field_raises_on_out_of_range(self): + self.assertRaises(ValueError, _period.get_period_field, -1, 0, 0) + + def test_get_period_field_array_raises_on_out_of_range(self): + self.assertRaises(ValueError, _period.get_period_field_arr, -1, + np.empty(1), 0) + + +class TestFreqConversion(tm.TestCase): + "Test frequency conversion of date objects" + + def test_asfreq_corner(self): + val = Period(freq='A', year=2007) + result1 = val.asfreq('5t') + result2 = val.asfreq('t') + expected = Period('2007-12-31 23:59', freq='t') + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freqstr, '5T') + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freqstr, 'T') + + def test_conv_annual(self): + # frequency conversion tests: from Annual Frequency + + ival_A = Period(freq='A', year=2007) + + ival_AJAN = Period(freq="A-JAN", year=2007) + ival_AJUN = Period(freq="A-JUN", year=2007) + ival_ANOV = Period(freq="A-NOV", year=2007) + + ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) + ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) + ival_A_to_M_start = Period(freq='M', year=2007, month=1) + ival_A_to_M_end = Period(freq='M', year=2007, month=12) + ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + + ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) + ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) + + self.assertEqual(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) + self.assertEqual(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) + self.assertEqual(ival_A.asfreq('M', 's'), ival_A_to_M_start) + self.assertEqual(ival_A.asfreq('M', 'E'), ival_A_to_M_end) + self.assertEqual(ival_A.asfreq('W', 'S'), ival_A_to_W_start) + self.assertEqual(ival_A.asfreq('W', 'E'), ival_A_to_W_end) + self.assertEqual(ival_A.asfreq('B', 'S'), ival_A_to_B_start) + self.assertEqual(ival_A.asfreq('B', 'E'), ival_A_to_B_end) + self.assertEqual(ival_A.asfreq('D', 'S'), ival_A_to_D_start) + self.assertEqual(ival_A.asfreq('D', 'E'), ival_A_to_D_end) 
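Illustrative note (not part of the patch): every conversion below follows the same pattern, `asfreq(target, how)`, where how='S' anchors the result at the first sub-period and how='E' (the default) at the last. A minimal sketch under the same era assumptions:

    import pandas as pd

    a = pd.Period('2007', freq='A')
    print(a.asfreq('M', how='S'))  # Period('2007-01', 'M'), first month
    print(a.asfreq('M', how='E'))  # Period('2007-12', 'M'), last month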
+ self.assertEqual(ival_A.asfreq('H', 'S'), ival_A_to_H_start) + self.assertEqual(ival_A.asfreq('H', 'E'), ival_A_to_H_end) + self.assertEqual(ival_A.asfreq('min', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('min', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('T', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('T', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('S', 'S'), ival_A_to_S_start) + self.assertEqual(ival_A.asfreq('S', 'E'), ival_A_to_S_end) + + self.assertEqual(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) + self.assertEqual(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) + + self.assertEqual(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) + self.assertEqual(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) + + self.assertEqual(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) + self.assertEqual(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) + + self.assertEqual(ival_A.asfreq('A'), ival_A) + + def test_conv_quarterly(self): + # frequency conversion tests: from Quarterly Frequency + + ival_Q = Period(freq='Q', year=2007, quarter=1) + ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) + + ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) + ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) + + ival_Q_to_A = Period(freq='A', year=2007) + ival_Q_to_M_start = Period(freq='M', year=2007, month=1) + ival_Q_to_M_end = Period(freq='M', year=2007, month=3) + ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1) + ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23) + ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23, + minute=59, second=59) + + ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) + + self.assertEqual(ival_Q.asfreq('A'), ival_Q_to_A) + self.assertEqual(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) + + self.assertEqual(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) + self.assertEqual(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) + self.assertEqual(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) + self.assertEqual(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) + self.assertEqual(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) + self.assertEqual(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) + self.assertEqual(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) + self.assertEqual(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) + self.assertEqual(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) + self.assertEqual(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) + self.assertEqual(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) + self.assertEqual(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) + self.assertEqual(ival_Q.asfreq('S', 'S'), 
ival_Q_to_S_start) + self.assertEqual(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) + + self.assertEqual(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) + self.assertEqual(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) + self.assertEqual(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) + self.assertEqual(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) + + self.assertEqual(ival_Q.asfreq('Q'), ival_Q) + + def test_conv_monthly(self): + # frequency conversion tests: from Monthly Frequency + + ival_M = Period(freq='M', year=2007, month=1) + ival_M_end_of_year = Period(freq='M', year=2007, month=12) + ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) + ival_M_to_A = Period(freq='A', year=2007) + ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23) + ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, + minute=59, second=59) + + self.assertEqual(ival_M.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M_end_of_year.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M.asfreq('Q'), ival_M_to_Q) + self.assertEqual(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) + + self.assertEqual(ival_M.asfreq('W', 'S'), ival_M_to_W_start) + self.assertEqual(ival_M.asfreq('W', 'E'), ival_M_to_W_end) + self.assertEqual(ival_M.asfreq('B', 'S'), ival_M_to_B_start) + self.assertEqual(ival_M.asfreq('B', 'E'), ival_M_to_B_end) + self.assertEqual(ival_M.asfreq('D', 'S'), ival_M_to_D_start) + self.assertEqual(ival_M.asfreq('D', 'E'), ival_M_to_D_end) + self.assertEqual(ival_M.asfreq('H', 'S'), ival_M_to_H_start) + self.assertEqual(ival_M.asfreq('H', 'E'), ival_M_to_H_end) + self.assertEqual(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) + self.assertEqual(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) + self.assertEqual(ival_M.asfreq('S', 'S'), ival_M_to_S_start) + self.assertEqual(ival_M.asfreq('S', 'E'), ival_M_to_S_end) + + self.assertEqual(ival_M.asfreq('M'), ival_M) + + def test_conv_weekly(self): + # frequency conversion tests: from Weekly Frequency + ival_W = Period(freq='W', year=2007, month=1, day=1) + + ival_WSUN = Period(freq='W', year=2007, month=1, day=7) + ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6) + ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5) + ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4) + ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3) + ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2) + ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq='D', year=2007, 
month=1, day=6) + ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) + ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31) + ival_W_to_A = Period(freq='A', year=2007) + ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_W_to_M = Period(freq='M', year=2007, month=1) + + if Period(freq='D', year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq='A', year=2007) + else: + ival_W_to_A_end_of_year = Period(freq='A', year=2008) + + if Period(freq='D', year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) + else: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) + + if Period(freq='D', year=2007, month=1, day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) + else: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) + + ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) + ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, + minute=59, second=59) + + self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) + self.assertEqual(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) + self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) + self.assertEqual(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + 
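Illustrative note (not part of the patch): the W-SAT, W-FRI, ... variants shift which weekday closes the week, which is why each anchored flavor above maps to a different daily start and end. A sketch under the same assumptions:

    import pandas as pd

    w = pd.Period('2007-01-01', freq='W-SAT')  # the Sun..Sat week containing Jan 1
    print(w.asfreq('D', 'S'))  # Period('2006-12-31', 'D')
    print(w.asfreq('D', 'E'))  # Period('2007-01-06', 'D')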
self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + + self.assertEqual(ival_W.asfreq('W'), ival_W) + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + ival_W.asfreq('WK') + + def test_conv_weekly_legacy(self): + # frequency conversion tests: from Weekly Frequency + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=1) + + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-SAT', year=2007, month=1, day=6) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-FRI', year=2007, month=1, day=5) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-THU', year=2007, month=1, day=4) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-WED', year=2007, month=1, day=3) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-TUE', year=2007, month=1, day=2) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-MON', year=2007, month=1, day=1) + + def test_conv_business(self): + # frequency conversion tests: from Business Frequency" + + ival_B = Period(freq='B', year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) + + ival_B_to_A = Period(freq='A', year=2007) + ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_B_to_M = Period(freq='M', year=2007, month=1) + ival_B_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) + ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, + minute=59, second=59) + + self.assertEqual(ival_B.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B_end_of_year.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B_end_of_month.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B.asfreq('W'), ival_B_to_W) 
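Illustrative note (not part of the patch): converting a business-day period to a coarser frequency picks the enclosing period, while converting to a finer one spans the hours of that single day. A sketch under the same assumptions:

    import pandas as pd

    b = pd.Period('2007-01-01', freq='B')          # a Monday
    print(b.asfreq('W'))                           # the enclosing W-SUN week
    print(b.asfreq('H', 'S'), b.asfreq('H', 'E'))  # 00:00 and 23:00 that day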
+ self.assertEqual(ival_B_end_of_week.asfreq('W'), ival_B_to_W) + + self.assertEqual(ival_B.asfreq('D'), ival_B_to_D) + + self.assertEqual(ival_B.asfreq('H', 'S'), ival_B_to_H_start) + self.assertEqual(ival_B.asfreq('H', 'E'), ival_B_to_H_end) + self.assertEqual(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) + self.assertEqual(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) + self.assertEqual(ival_B.asfreq('S', 'S'), ival_B_to_S_start) + self.assertEqual(ival_B.asfreq('S', 'E'), ival_B_to_S_end) + + self.assertEqual(ival_B.asfreq('B'), ival_B) + + def test_conv_daily(self): + # frequency conversion tests: from Business Frequency" + + ival_D = Period(freq='D', year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) + + ival_D_friday = Period(freq='D', year=2007, month=1, day=5) + ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) + ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) + + # TODO: unused? + # ival_D_monday = Period(freq='D', year=2007, month=1, day=8) + + ival_B_friday = Period(freq='B', year=2007, month=1, day=5) + ival_B_monday = Period(freq='B', year=2007, month=1, day=8) + + ival_D_to_A = Period(freq='A', year=2007) + + ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) + ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) + ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) + + ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) + ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) + ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) + + ival_D_to_M = Period(freq='M', year=2007, month=1) + ival_D_to_W = Period(freq='W', year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) + ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, + minute=59, second=59) + + self.assertEqual(ival_D.asfreq('A'), ival_D_to_A) + + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JAN'), + ival_Deoq_to_AJAN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JUN'), + ival_Deoq_to_AJUN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-DEC'), + ival_Deoq_to_ADEC) + + self.assertEqual(ival_D_end_of_year.asfreq('A'), ival_D_to_A) + self.assertEqual(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) + self.assertEqual(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) + self.assertEqual(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D_end_of_month.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D.asfreq('W'), ival_D_to_W) + self.assertEqual(ival_D_end_of_week.asfreq('W'), ival_D_to_W) + + self.assertEqual(ival_D_friday.asfreq('B'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) + self.assertEqual(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_sunday.asfreq('B', 
'E'), ival_B_monday) + + self.assertEqual(ival_D.asfreq('H', 'S'), ival_D_to_H_start) + self.assertEqual(ival_D.asfreq('H', 'E'), ival_D_to_H_end) + self.assertEqual(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) + self.assertEqual(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) + self.assertEqual(ival_D.asfreq('S', 'S'), ival_D_to_S_start) + self.assertEqual(ival_D.asfreq('S', 'E'), ival_D_to_S_end) + + self.assertEqual(ival_D.asfreq('D'), ival_D) + + def test_conv_hourly(self): + # frequency conversion tests: from Hourly Frequency" + + ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, + hour=23) + ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, + hour=23) + ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, + hour=23) + ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, + hour=23) + + ival_H_to_A = Period(freq='A', year=2007) + ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_H_to_M = Period(freq='M', year=2007, month=1) + ival_H_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) + + ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=59) + ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=59, second=59) + + self.assertEqual(ival_H.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H_end_of_year.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H_end_of_month.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H_end_of_week.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H_end_of_day.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H.asfreq('B'), ival_H_to_B) + self.assertEqual(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) + + self.assertEqual(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) + self.assertEqual(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) + self.assertEqual(ival_H.asfreq('S', 'S'), ival_H_to_S_start) + self.assertEqual(ival_H.asfreq('S', 'E'), ival_H_to_S_end) + + self.assertEqual(ival_H.asfreq('H'), ival_H) + + def test_conv_minutely(self): + # frequency conversion tests: from Minutely Frequency" + + ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=0) + ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_hour = Period(freq='Min', year=2007, 
month=1, day=1, + hour=0, minute=59) + + ival_T_to_A = Period(freq='A', year=2007) + ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_T_to_M = Period(freq='M', year=2007, month=1) + ival_T_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=59) + + self.assertEqual(ival_T.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T_end_of_year.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T_end_of_month.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T_end_of_week.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T_end_of_day.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T.asfreq('H'), ival_T_to_H) + self.assertEqual(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) + + self.assertEqual(ival_T.asfreq('S', 'S'), ival_T_to_S_start) + self.assertEqual(ival_T.asfreq('S', 'E'), ival_T_to_S_end) + + self.assertEqual(ival_T.asfreq('Min'), ival_T) + + def test_conv_secondly(self): + # frequency conversion tests: from Secondly Frequency" + + ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, + second=0) + ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, + hour=23, minute=59, second=59) + ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=59, second=59) + ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=59) + + ival_S_to_A = Period(freq='A', year=2007) + ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_S_to_M = Period(freq='M', year=2007, month=1) + ival_S_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=0) + + self.assertEqual(ival_S.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S_end_of_year.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S_end_of_month.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S_end_of_week.asfreq('W'), 
ival_S_to_W) + self.assertEqual(ival_S.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S_end_of_day.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S.asfreq('Min'), ival_S_to_T) + self.assertEqual(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) + + self.assertEqual(ival_S.asfreq('S'), ival_S) + + def test_asfreq_mult(self): + # normal freq to mult freq + p = Period(freq='A', year=2007) + # ordinal will not change + for freq in ['3A', offsets.YearEnd(3)]: + result = p.asfreq(freq) + expected = Period('2007', freq='3A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + # ordinal will not change + for freq in ['3A', offsets.YearEnd(3)]: + result = p.asfreq(freq, how='S') + expected = Period('2007', freq='3A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + # mult freq to normal freq + p = Period(freq='3A', year=2007) + # ordinal will change because how=E is the default + for freq in ['A', offsets.YearEnd()]: + result = p.asfreq(freq) + expected = Period('2009', freq='A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + # ordinal will not change + for freq in ['A', offsets.YearEnd()]: + result = p.asfreq(freq, how='S') + expected = Period('2007', freq='A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + p = Period(freq='A', year=2007) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq) + expected = Period('2007-12', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq, how='S') + expected = Period('2007-01', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + p = Period(freq='3A', year=2007) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq) + expected = Period('2009-12', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq, how='S') + expected = Period('2007-01', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + def test_asfreq_combined(self): + # normal freq to combined freq + p = Period('2007', freq='H') + + # ordinal will not change + expected = Period('2007', freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): + result = p.asfreq(freq, how=how) + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + # combined freq to normal freq + p1 = Period(freq='1D1H', year=2007) + p2 = Period(freq='1H1D', year=2007) + + # ordinal will change because how=E is the default + result1 = p1.asfreq('H') + result2 = p2.asfreq('H') + expected = 
Period('2007-01-02', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + # ordinal will not change + result1 = p1.asfreq('H', how='S') + result2 = p2.asfreq('H', how='S') + expected = Period('2007-01-01', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + def test_is_leap_year(self): + # GH 13727 + for freq in ['A', 'M', 'D', 'H']: + p = Period('2000-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + self.assertIsInstance(p.is_leap_year, bool) + + p = Period('1999-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + + p = Period('2004-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + + p = Period('2100-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + + +class TestMethods(tm.TestCase): + + def test_add(self): + dt1 = Period(freq='D', year=2008, month=1, day=1) + dt2 = Period(freq='D', year=2008, month=1, day=2) + self.assertEqual(dt1 + 1, dt2) + self.assertEqual(1 + dt1, dt2) + + def test_add_pdnat(self): + p = pd.Period('2011-01', freq='M') + self.assertIs(p + pd.NaT, pd.NaT) + self.assertIs(pd.NaT + p, pd.NaT) + + p = pd.Period('NaT', freq='M') + self.assertIs(p + pd.NaT, pd.NaT) + self.assertIs(pd.NaT + p, pd.NaT) + + def test_add_raises(self): + # GH 4731 + dt1 = Period(freq='D', year=2008, month=1, day=1) + dt2 = Period(freq='D', year=2008, month=1, day=2) + msg = r"unsupported operand type\(s\)" + with tm.assertRaisesRegexp(TypeError, msg): + dt1 + "str" + + msg = r"unsupported operand type\(s\)" + with tm.assertRaisesRegexp(TypeError, msg): + "str" + dt1 + + with tm.assertRaisesRegexp(TypeError, msg): + dt1 + dt2 + + def test_sub(self): + dt1 = Period('2011-01-01', freq='D') + dt2 = Period('2011-01-15', freq='D') + + self.assertEqual(dt1 - dt2, -14) + self.assertEqual(dt2 - dt1, 14) + + msg = r"Input has different freq=M from Period\(freq=D\)" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + dt1 - pd.Period('2011-02', freq='M') + + def test_add_offset(self): + # freq is DateOffset + for freq in ['A', '2A', '3A']: + p = Period('2011', freq=freq) + exp = Period('2013', freq=freq) + self.assertEqual(p + offsets.YearEnd(2), exp) + self.assertEqual(offsets.YearEnd(2) + p, exp) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + with tm.assertRaises(period.IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + with tm.assertRaises(period.IncompatibleFrequency): + o + p + + for freq in ['M', '2M', '3M']: + p = Period('2011-03', freq=freq) + exp = Period('2011-05', freq=freq) + self.assertEqual(p + offsets.MonthEnd(2), exp) + self.assertEqual(offsets.MonthEnd(2) + p, exp) + + exp = Period('2012-03', freq=freq) + self.assertEqual(p + offsets.MonthEnd(12), exp) + self.assertEqual(offsets.MonthEnd(12) + p, exp) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + with tm.assertRaises(period.IncompatibleFrequency): + p + o + + if 
isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + with tm.assertRaises(period.IncompatibleFrequency): + o + p + + # freq is Tick + for freq in ['D', '2D', '3D']: + p = Period('2011-04-01', freq=freq) + + exp = Period('2011-04-06', freq=freq) + self.assertEqual(p + offsets.Day(5), exp) + self.assertEqual(offsets.Day(5) + p, exp) + + exp = Period('2011-04-02', freq=freq) + self.assertEqual(p + offsets.Hour(24), exp) + self.assertEqual(offsets.Hour(24) + p, exp) + + exp = Period('2011-04-03', freq=freq) + self.assertEqual(p + np.timedelta64(2, 'D'), exp) + with tm.assertRaises(TypeError): + np.timedelta64(2, 'D') + p + + exp = Period('2011-04-02', freq=freq) + self.assertEqual(p + np.timedelta64(3600 * 24, 's'), exp) + with tm.assertRaises(TypeError): + np.timedelta64(3600 * 24, 's') + p + + exp = Period('2011-03-30', freq=freq) + self.assertEqual(p + timedelta(-2), exp) + self.assertEqual(timedelta(-2) + p, exp) + + exp = Period('2011-04-03', freq=freq) + self.assertEqual(p + timedelta(hours=48), exp) + self.assertEqual(timedelta(hours=48) + p, exp) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(4, 'h'), + timedelta(hours=23)]: + with tm.assertRaises(period.IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + with tm.assertRaises(period.IncompatibleFrequency): + o + p + + for freq in ['H', '2H', '3H']: + p = Period('2011-04-01 09:00', freq=freq) + + exp = Period('2011-04-03 09:00', freq=freq) + self.assertEqual(p + offsets.Day(2), exp) + self.assertEqual(offsets.Day(2) + p, exp) + + exp = Period('2011-04-01 12:00', freq=freq) + self.assertEqual(p + offsets.Hour(3), exp) + self.assertEqual(offsets.Hour(3) + p, exp) + + exp = Period('2011-04-01 12:00', freq=freq) + self.assertEqual(p + np.timedelta64(3, 'h'), exp) + with tm.assertRaises(TypeError): + np.timedelta64(3, 'h') + p + + exp = Period('2011-04-01 10:00', freq=freq) + self.assertEqual(p + np.timedelta64(3600, 's'), exp) + with tm.assertRaises(TypeError): + np.timedelta64(3600, 's') + p + + exp = Period('2011-04-01 11:00', freq=freq) + self.assertEqual(p + timedelta(minutes=120), exp) + self.assertEqual(timedelta(minutes=120) + p, exp) + + exp = Period('2011-04-05 12:00', freq=freq) + self.assertEqual(p + timedelta(days=4, minutes=180), exp) + self.assertEqual(timedelta(days=4, minutes=180) + p, exp) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(3200, 's'), + timedelta(hours=23, minutes=30)]: + with tm.assertRaises(period.IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + with tm.assertRaises(period.IncompatibleFrequency): + o + p + + def test_add_offset_nat(self): + # freq is DateOffset + for freq in ['A', '2A', '3A']: + p = Period('NaT', freq=freq) + for o in [offsets.YearEnd(2)]: + self.assertIs(p + o, tslib.NaT) + self.assertIs(o + p, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + self.assertIs(p + o, tslib.NaT) + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + self.assertIs(o + p, tslib.NaT) + + for freq in ['M', '2M', '3M']: + p = Period('NaT', freq=freq) + for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: + self.assertIs(p + o, tslib.NaT) + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + 
self.assertIs(o + p, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + self.assertIs(p + o, tslib.NaT) + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + self.assertIs(o + p, tslib.NaT) + + # freq is Tick + for freq in ['D', '2D', '3D']: + p = Period('NaT', freq=freq) + for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), + np.timedelta64(3600 * 24, 's'), timedelta(-2), + timedelta(hours=48)]: + self.assertIs(p + o, tslib.NaT) + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + self.assertIs(o + p, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(4, 'h'), + timedelta(hours=23)]: + self.assertIs(p + o, tslib.NaT) + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + self.assertIs(o + p, tslib.NaT) + + for freq in ['H', '2H', '3H']: + p = Period('NaT', freq=freq) + for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), + np.timedelta64(3600, 's'), timedelta(minutes=120), + timedelta(days=4, minutes=180)]: + self.assertIs(p + o, tslib.NaT) + + if not isinstance(o, np.timedelta64): + self.assertIs(o + p, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(3200, 's'), + timedelta(hours=23, minutes=30)]: + self.assertIs(p + o, tslib.NaT) + + if isinstance(o, np.timedelta64): + with tm.assertRaises(TypeError): + o + p + else: + self.assertIs(o + p, tslib.NaT) + + def test_sub_pdnat(self): + # GH 13071 + p = pd.Period('2011-01', freq='M') + self.assertIs(p - pd.NaT, pd.NaT) + self.assertIs(pd.NaT - p, pd.NaT) + + p = pd.Period('NaT', freq='M') + self.assertIs(p - pd.NaT, pd.NaT) + self.assertIs(pd.NaT - p, pd.NaT) + + def test_sub_offset(self): + # freq is DateOffset + for freq in ['A', '2A', '3A']: + p = Period('2011', freq=freq) + self.assertEqual(p - offsets.YearEnd(2), Period('2009', freq=freq)) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + with tm.assertRaises(period.IncompatibleFrequency): + p - o + + for freq in ['M', '2M', '3M']: + p = Period('2011-03', freq=freq) + self.assertEqual(p - offsets.MonthEnd(2), + Period('2011-01', freq=freq)) + self.assertEqual(p - offsets.MonthEnd(12), + Period('2010-03', freq=freq)) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + with tm.assertRaises(period.IncompatibleFrequency): + p - o + + # freq is Tick + for freq in ['D', '2D', '3D']: + p = Period('2011-04-01', freq=freq) + self.assertEqual(p - offsets.Day(5), + Period('2011-03-27', freq=freq)) + self.assertEqual(p - offsets.Hour(24), + Period('2011-03-31', freq=freq)) + self.assertEqual(p - np.timedelta64(2, 'D'), + Period('2011-03-30', freq=freq)) + self.assertEqual(p - np.timedelta64(3600 * 24, 's'), + Period('2011-03-31', freq=freq)) + self.assertEqual(p - timedelta(-2), + Period('2011-04-03', freq=freq)) + self.assertEqual(p - timedelta(hours=48), + Period('2011-03-30', freq=freq)) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(4, 'h'), + timedelta(hours=23)]: + with tm.assertRaises(period.IncompatibleFrequency): + p - o + + for freq in ['H', '2H', '3H']: + p = Period('2011-04-01 09:00', freq=freq) + self.assertEqual(p - offsets.Day(2), + Period('2011-03-30 09:00', 
freq=freq)) + self.assertEqual(p - offsets.Hour(3), + Period('2011-04-01 06:00', freq=freq)) + self.assertEqual(p - np.timedelta64(3, 'h'), + Period('2011-04-01 06:00', freq=freq)) + self.assertEqual(p - np.timedelta64(3600, 's'), + Period('2011-04-01 08:00', freq=freq)) + self.assertEqual(p - timedelta(minutes=120), + Period('2011-04-01 07:00', freq=freq)) + self.assertEqual(p - timedelta(days=4, minutes=180), + Period('2011-03-28 06:00', freq=freq)) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(3200, 's'), + timedelta(hours=23, minutes=30)]: + with tm.assertRaises(period.IncompatibleFrequency): + p - o + + def test_sub_offset_nat(self): + # freq is DateOffset + for freq in ['A', '2A', '3A']: + p = Period('NaT', freq=freq) + for o in [offsets.YearEnd(2)]: + self.assertIs(p - o, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + self.assertIs(p - o, tslib.NaT) + + for freq in ['M', '2M', '3M']: + p = Period('NaT', freq=freq) + for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: + self.assertIs(p - o, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + self.assertIs(p - o, tslib.NaT) + + # freq is Tick + for freq in ['D', '2D', '3D']: + p = Period('NaT', freq=freq) + for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), + np.timedelta64(3600 * 24, 's'), timedelta(-2), + timedelta(hours=48)]: + self.assertIs(p - o, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(4, 'h'), + timedelta(hours=23)]: + self.assertIs(p - o, tslib.NaT) + + for freq in ['H', '2H', '3H']: + p = Period('NaT', freq=freq) + for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), + np.timedelta64(3600, 's'), timedelta(minutes=120), + timedelta(days=4, minutes=180)]: + self.assertIs(p - o, tslib.NaT) + + for o in [offsets.YearBegin(2), offsets.MonthBegin(1), + offsets.Minute(), np.timedelta64(3200, 's'), + timedelta(hours=23, minutes=30)]: + self.assertIs(p - o, tslib.NaT) + + def test_nat_ops(self): + for freq in ['M', '2M', '3M']: + p = Period('NaT', freq=freq) + self.assertIs(p + 1, tslib.NaT) + self.assertIs(1 + p, tslib.NaT) + self.assertIs(p - 1, tslib.NaT) + self.assertIs(p - Period('2011-01', freq=freq), tslib.NaT) + self.assertIs(Period('2011-01', freq=freq) - p, tslib.NaT) + + def test_period_ops_offset(self): + p = Period('2011-04-01', freq='D') + result = p + offsets.Day() + exp = pd.Period('2011-04-02', freq='D') + self.assertEqual(result, exp) + + result = p - offsets.Day(2) + exp = pd.Period('2011-03-30', freq='D') + self.assertEqual(result, exp) + + msg = r"Input cannot be converted to Period\(freq=D\)" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + p + offsets.Hour(2) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + p - offsets.Hour(2) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py new file mode 100644 index 0000000000000..f1ae7765648ca --- /dev/null +++ b/pandas/tests/series/test_period.py @@ -0,0 +1,248 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +import pandas.tseries.period as period +from pandas import Series, period_range, DataFrame, Period + + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestSeriesPeriod(tm.TestCase): + + def setUp(self): + self.series = 
Series(period_range('2000-01-01', periods=10, freq='D')) + + def test_auto_conversion(self): + series = Series(list(period_range('2000-01-01', periods=10, freq='D'))) + self.assertEqual(series.dtype, 'object') + + series = pd.Series([pd.Period('2011-01-01', freq='D'), + pd.Period('2011-02-01', freq='D')]) + self.assertEqual(series.dtype, 'object') + + def test_getitem(self): + self.assertEqual(self.series[1], pd.Period('2000-01-02', freq='D')) + + result = self.series[[2, 4]] + exp = pd.Series([pd.Period('2000-01-03', freq='D'), + pd.Period('2000-01-05', freq='D')], + index=[2, 4]) + self.assert_series_equal(result, exp) + self.assertEqual(result.dtype, 'object') + + def test_isnull(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + tm.assert_series_equal(s.isnull(), Series([False, True])) + tm.assert_series_equal(s.notnull(), Series([True, False])) + + def test_fillna(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + + res = s.fillna(pd.Period('2012-01', freq='M')) + exp = Series([pd.Period('2011-01', freq='M'), + pd.Period('2012-01', freq='M')]) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'object') + + res = s.fillna('XXX') + exp = Series([pd.Period('2011-01', freq='M'), 'XXX']) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'object') + + def test_dropna(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + tm.assert_series_equal(s.dropna(), + Series([pd.Period('2011-01', freq='M')])) + + def test_series_comparison_scalars(self): + val = pd.Period('2000-01-04', freq='D') + result = self.series > val + expected = pd.Series([x > val for x in self.series]) + tm.assert_series_equal(result, expected) + + val = self.series[5] + result = self.series > val + expected = pd.Series([x > val for x in self.series]) + tm.assert_series_equal(result, expected) + + def test_between(self): + left, right = self.series[[2, 7]] + result = self.series.between(left, right) + expected = (self.series >= left) & (self.series <= right) + tm.assert_series_equal(result, expected) + + # --------------------------------------------------------------------- + # NaT support + + """ + # TODO: enable when period dtype is supported + def test_NaT_scalar(self): + series = Series([0, 1000, 2000, iNaT], dtype='period[D]') + + val = series[3] + self.assertTrue(isnull(val)) + + series[2] = val + self.assertTrue(isnull(series[2])) + + def test_NaT_cast(self): + result = Series([np.nan]).astype('period[D]') + expected = Series([NaT]) + tm.assert_series_equal(result, expected) + """ + + def test_set_none_nan(self): + # currently Period is stored as object dtype; missing values + # become None/np.nan, not NaT + self.series[3] = None + self.assertIs(self.series[3], None) + + self.series[3:5] = None + self.assertIs(self.series[4], None) + + self.series[5] = np.nan + self.assertTrue(np.isnan(self.series[5])) + + self.series[5:7] = np.nan + self.assertTrue(np.isnan(self.series[6])) + + def test_intercept_astype_object(self): + expected = self.series.astype('object') + + df = DataFrame({'a': self.series, + 'b': np.random.randn(len(self.series))}) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)}) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + def test_comp_series_period_scalar(self): + # GH 13200 + for freq in ['M', '2M', '3M']: + base = 
Series([Period(x, freq=freq) for x in + ['2011-01', '2011-02', '2011-03', '2011-04']]) + p = Period('2011-02', freq=freq) + + exp = pd.Series([False, True, False, False]) + tm.assert_series_equal(base == p, exp) + tm.assert_series_equal(p == base, exp) + + exp = pd.Series([True, False, True, True]) + tm.assert_series_equal(base != p, exp) + tm.assert_series_equal(p != base, exp) + + exp = pd.Series([False, False, True, True]) + tm.assert_series_equal(base > p, exp) + tm.assert_series_equal(p < base, exp) + + exp = pd.Series([True, False, False, False]) + tm.assert_series_equal(base < p, exp) + tm.assert_series_equal(p > base, exp) + + exp = pd.Series([False, True, True, True]) + tm.assert_series_equal(base >= p, exp) + tm.assert_series_equal(p <= base, exp) + + exp = pd.Series([True, True, False, False]) + tm.assert_series_equal(base <= p, exp) + tm.assert_series_equal(p >= base, exp) + + # different base freq + msg = "Input has different freq=A-DEC from Period" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='A') + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + + def test_comp_series_period_series(self): + # GH 13200 + for freq in ['M', '2M', '3M']: + base = Series([Period(x, freq=freq) for x in + ['2011-01', '2011-02', '2011-03', '2011-04']]) + + s = Series([Period(x, freq=freq) for x in + ['2011-02', '2011-01', '2011-03', '2011-05']]) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == s, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != s, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > s, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < s, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= s, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= s, exp) + + s2 = Series([Period(x, freq='A') for x in + ['2011', '2011', '2011', '2011']]) + + # different base freq + msg = "Input has different freq=A-DEC from Period" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + base <= s2 + + def test_comp_series_period_object(self): + # GH 13200 + base = Series([Period('2011', freq='A'), Period('2011-02', freq='M'), + Period('2013', freq='A'), Period('2011-04', freq='M')]) + + s = Series([Period('2012', freq='A'), Period('2011-01', freq='M'), + Period('2013', freq='A'), Period('2011-05', freq='M')]) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == s, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != s, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > s, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < s, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= s, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= s, exp) + + def test_align_series(self): + rng = period_range('1/1/2000', '1/1/2010', freq='A') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected[1::2] = np.nan + tm.assert_series_equal(result, expected) + + result = ts + _permute(ts[::2]) + tm.assert_series_equal(result, expected) + + # it works! 
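# An illustrative sketch, not part of the test suite: the alignment
# checks just below rely on Series arithmetic aligning on the
# PeriodIndex labels, with NaN filled in where a label is missing from
# one operand. Assumes this era's pandas API.
import numpy as np
import pandas as pd

rng = pd.period_range('2000', '2005', freq='A')
ts = pd.Series(np.arange(len(rng), dtype='float64'), index=rng)
half = ts[::2]                    # every other annual period
summed = ts + half                # aligns on the PeriodIndex
assert summed[rng[0]] == 0.0      # label present in both operands
assert np.isnan(summed[rng[1]])   # label missing from `half`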
+ for kind in ['inner', 'outer', 'left', 'right']: + ts.align(ts[::2], join=kind) + msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + ts + ts.asfreq('D', how="end") diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py deleted file mode 100644 index 3459da9d2b5c5..0000000000000 --- a/pandas/tseries/tests/test_period.py +++ /dev/null @@ -1,5065 +0,0 @@ -"""Tests suite for Period handling. - -Parts derived from scikits.timeseries code, original authors: -- Pierre Gerard-Marchant & Matt Knox -- pierregm_at_uga_dot_edu - mattknow_ca_at_hotmail_dot_com - -""" - -import numpy as np -from numpy.random import randn -from datetime import datetime, date, timedelta - -import pandas as pd -import pandas.util.testing as tm -import pandas.tseries.period as period -import pandas.tseries.offsets as offsets -from pandas.tseries.tools import to_datetime -from pandas.tseries.period import Period, PeriodIndex, period_range -from pandas.tseries.index import DatetimeIndex, date_range, Index -from pandas._period import period_ordinal, period_asfreq -from pandas.compat import range, lrange, lmap, zip, text_type, PY3, iteritems -from pandas.compat.numpy import np_datetime64_compat -from pandas.tseries.frequencies import (MONTHS, DAYS, _period_code_map, - get_freq) -from pandas import (Series, DataFrame, Timestamp, _period, tslib, - _np_version_under1p9, _np_version_under1p10, - _np_version_under1p12) - - -class TestPeriodProperties(tm.TestCase): - "Test properties such as year, month, weekday, etc...." - - def test_quarterly_negative_ordinals(self): - p = Period(ordinal=-1, freq='Q-DEC') - self.assertEqual(p.year, 1969) - self.assertEqual(p.quarter, 4) - self.assertIsInstance(p, Period) - - p = Period(ordinal=-2, freq='Q-DEC') - self.assertEqual(p.year, 1969) - self.assertEqual(p.quarter, 3) - self.assertIsInstance(p, Period) - - p = Period(ordinal=-2, freq='M') - self.assertEqual(p.year, 1969) - self.assertEqual(p.month, 11) - self.assertIsInstance(p, Period) - - def test_period_cons_quarterly(self): - # bugs in scikits.timeseries - for month in MONTHS: - freq = 'Q-%s' % month - exp = Period('1989Q3', freq=freq) - self.assertIn('1989Q3', str(exp)) - stamp = exp.to_timestamp('D', how='end') - p = Period(stamp, freq=freq) - self.assertEqual(p, exp) - - stamp = exp.to_timestamp('3D', how='end') - p = Period(stamp, freq=freq) - self.assertEqual(p, exp) - - def test_period_cons_annual(self): - # bugs in scikits.timeseries - for month in MONTHS: - freq = 'A-%s' % month - exp = Period('1989', freq=freq) - stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) - p = Period(stamp, freq=freq) - self.assertEqual(p, exp + 1) - self.assertIsInstance(p, Period) - - def test_period_cons_weekly(self): - for num in range(10, 17): - daystr = '2011-02-%d' % num - for day in DAYS: - freq = 'W-%s' % day - - result = Period(daystr, freq=freq) - expected = Period(daystr, freq='D').asfreq(freq) - self.assertEqual(result, expected) - self.assertIsInstance(result, Period) - - def test_period_from_ordinal(self): - p = pd.Period('2011-01', freq='M') - res = pd.Period._from_ordinal(p.ordinal, freq='M') - self.assertEqual(p, res) - self.assertIsInstance(res, Period) - - def test_period_cons_nat(self): - p = Period('NaT', freq='M') - self.assertIs(p, pd.NaT) - - p = Period('nat', freq='W-SUN') - self.assertIs(p, pd.NaT) - - p = Period(tslib.iNaT, freq='D') - self.assertIs(p, pd.NaT) - - p = Period(tslib.iNaT, freq='3D') 
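# An illustrative sketch, not part of the test suite: the construction
# behavior asserted in test_period_cons_nat above is that every
# NaT-like input (the strings 'NaT'/'nat', None, nan, or tslib.iNaT)
# collapses to the singleton pd.NaT, mirroring Timestamp. Assumes this
# era's pandas API.
import numpy as np
import pandas as pd

assert pd.Period('NaT', freq='M') is pd.NaT
assert pd.Period(None) is pd.NaT
assert pd.Period(np.nan) is pd.NaT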
- self.assertIs(p, pd.NaT) - - p = Period(tslib.iNaT, freq='1D1H') - self.assertIs(p, pd.NaT) - - p = Period('NaT') - self.assertIs(p, pd.NaT) - - p = Period(tslib.iNaT) - self.assertIs(p, pd.NaT) - - def test_cons_null_like(self): - # check Timestamp compat - self.assertIs(Timestamp('NaT'), pd.NaT) - self.assertIs(Period('NaT'), pd.NaT) - - self.assertIs(Timestamp(None), pd.NaT) - self.assertIs(Period(None), pd.NaT) - - self.assertIs(Timestamp(float('nan')), pd.NaT) - self.assertIs(Period(float('nan')), pd.NaT) - - self.assertIs(Timestamp(np.nan), pd.NaT) - self.assertIs(Period(np.nan), pd.NaT) - - def test_period_cons_mult(self): - p1 = Period('2011-01', freq='3M') - p2 = Period('2011-01', freq='M') - self.assertEqual(p1.ordinal, p2.ordinal) - - self.assertEqual(p1.freq, offsets.MonthEnd(3)) - self.assertEqual(p1.freqstr, '3M') - - self.assertEqual(p2.freq, offsets.MonthEnd()) - self.assertEqual(p2.freqstr, 'M') - - result = p1 + 1 - self.assertEqual(result.ordinal, (p2 + 3).ordinal) - self.assertEqual(result.freq, p1.freq) - self.assertEqual(result.freqstr, '3M') - - result = p1 - 1 - self.assertEqual(result.ordinal, (p2 - 3).ordinal) - self.assertEqual(result.freq, p1.freq) - self.assertEqual(result.freqstr, '3M') - - msg = ('Frequency must be positive, because it' - ' represents span: -3M') - with tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='-3M') - - msg = ('Frequency must be positive, because it' ' represents span: 0M') - with tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='0M') - - def test_period_cons_combined(self): - p = [(Period('2011-01', freq='1D1H'), - Period('2011-01', freq='1H1D'), - Period('2011-01', freq='H')), - (Period(ordinal=1, freq='1D1H'), - Period(ordinal=1, freq='1H1D'), - Period(ordinal=1, freq='H'))] - - for p1, p2, p3 in p: - self.assertEqual(p1.ordinal, p3.ordinal) - self.assertEqual(p2.ordinal, p3.ordinal) - - self.assertEqual(p1.freq, offsets.Hour(25)) - self.assertEqual(p1.freqstr, '25H') - - self.assertEqual(p2.freq, offsets.Hour(25)) - self.assertEqual(p2.freqstr, '25H') - - self.assertEqual(p3.freq, offsets.Hour()) - self.assertEqual(p3.freqstr, 'H') - - result = p1 + 1 - self.assertEqual(result.ordinal, (p3 + 25).ordinal) - self.assertEqual(result.freq, p1.freq) - self.assertEqual(result.freqstr, '25H') - - result = p2 + 1 - self.assertEqual(result.ordinal, (p3 + 25).ordinal) - self.assertEqual(result.freq, p2.freq) - self.assertEqual(result.freqstr, '25H') - - result = p1 - 1 - self.assertEqual(result.ordinal, (p3 - 25).ordinal) - self.assertEqual(result.freq, p1.freq) - self.assertEqual(result.freqstr, '25H') - - result = p2 - 1 - self.assertEqual(result.ordinal, (p3 - 25).ordinal) - self.assertEqual(result.freq, p2.freq) - self.assertEqual(result.freqstr, '25H') - - msg = ('Frequency must be positive, because it' - ' represents span: -25H') - with tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='-1D1H') - with tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='-1H1D') - with tm.assertRaisesRegexp(ValueError, msg): - Period(ordinal=1, freq='-1D1H') - with tm.assertRaisesRegexp(ValueError, msg): - Period(ordinal=1, freq='-1H1D') - - msg = ('Frequency must be positive, because it' - ' represents span: 0D') - with tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='0D0H') - with tm.assertRaisesRegexp(ValueError, msg): - Period(ordinal=1, freq='0D0H') - - # You can only combine together day and intraday offsets - msg = ('Invalid frequency: 1W1D') - with 
tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='1W1D') - msg = ('Invalid frequency: 1D1W') - with tm.assertRaisesRegexp(ValueError, msg): - Period('2011-01', freq='1D1W') - - def test_timestamp_tz_arg(self): - tm._skip_if_no_pytz() - import pytz - for case in ['Europe/Brussels', 'Asia/Tokyo', 'US/Pacific']: - p = Period('1/1/2005', freq='M').to_timestamp(tz=case) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - self.assertEqual(p, exp) - self.assertEqual(p.tz, exp_zone.tzinfo) - self.assertEqual(p.tz, exp.tz) - - p = Period('1/1/2005', freq='3H').to_timestamp(tz=case) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - self.assertEqual(p, exp) - self.assertEqual(p.tz, exp_zone.tzinfo) - self.assertEqual(p.tz, exp.tz) - - p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=case) - exp = Timestamp('31/12/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - self.assertEqual(p, exp) - self.assertEqual(p.tz, exp_zone.tzinfo) - self.assertEqual(p.tz, exp.tz) - - p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=case) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - self.assertEqual(p, exp) - self.assertEqual(p.tz, exp_zone.tzinfo) - self.assertEqual(p.tz, exp.tz) - - def test_timestamp_tz_arg_dateutil(self): - from pandas.tslib import _dateutil_gettz as gettz - from pandas.tslib import maybe_get_tz - for case in ['dateutil/Europe/Brussels', 'dateutil/Asia/Tokyo', - 'dateutil/US/Pacific']: - p = Period('1/1/2005', freq='M').to_timestamp( - tz=maybe_get_tz(case)) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - self.assertEqual(p, exp) - self.assertEqual(p.tz, gettz(case.split('/', 1)[1])) - self.assertEqual(p.tz, exp.tz) - - p = Period('1/1/2005', - freq='M').to_timestamp(freq='3H', tz=maybe_get_tz(case)) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - self.assertEqual(p, exp) - self.assertEqual(p.tz, gettz(case.split('/', 1)[1])) - self.assertEqual(p.tz, exp.tz) - - def test_timestamp_tz_arg_dateutil_from_string(self): - from pandas.tslib import _dateutil_gettz as gettz - p = Period('1/1/2005', - freq='M').to_timestamp(tz='dateutil/Europe/Brussels') - self.assertEqual(p.tz, gettz('Europe/Brussels')) - - def test_timestamp_mult(self): - p = pd.Period('2011-01', freq='M') - self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) - self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-01-31')) - - p = pd.Period('2011-01', freq='3M') - self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) - self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-03-31')) - - def test_period_constructor(self): - i1 = Period('1/1/2005', freq='M') - i2 = Period('Jan 2005') - - self.assertEqual(i1, i2) - - i1 = Period('2005', freq='A') - i2 = Period('2005') - i3 = Period('2005', freq='a') - - self.assertEqual(i1, i2) - self.assertEqual(i1, i3) - - i4 = Period('2005', freq='M') - i5 = Period('2005', freq='m') - - self.assertRaises(ValueError, i1.__ne__, i4) - self.assertEqual(i4, i5) - - i1 = Period.now('Q') - i2 = Period(datetime.now(), freq='Q') - i3 = Period.now('q') - - self.assertEqual(i1, i2) - self.assertEqual(i1, i3) - - # Biz day construction, roll forward if non-weekday - i1 = Period('3/10/12', freq='B') - i2 = Period('3/10/12', freq='D') - self.assertEqual(i1, i2.asfreq('B')) - i2 = Period('3/11/12', freq='D') - 
self.assertEqual(i1, i2.asfreq('B')) - i2 = Period('3/12/12', freq='D') - self.assertEqual(i1, i2.asfreq('B')) - - i3 = Period('3/10/12', freq='b') - self.assertEqual(i1, i3) - - i1 = Period(year=2005, quarter=1, freq='Q') - i2 = Period('1/1/2005', freq='Q') - self.assertEqual(i1, i2) - - i1 = Period(year=2005, quarter=3, freq='Q') - i2 = Period('9/1/2005', freq='Q') - self.assertEqual(i1, i2) - - i1 = Period(year=2005, month=3, day=1, freq='D') - i2 = Period('3/1/2005', freq='D') - self.assertEqual(i1, i2) - - i3 = Period(year=2005, month=3, day=1, freq='d') - self.assertEqual(i1, i3) - - i1 = Period(year=2012, month=3, day=10, freq='B') - i2 = Period('3/12/12', freq='B') - self.assertEqual(i1, i2) - - i1 = Period('2005Q1') - i2 = Period(year=2005, quarter=1, freq='Q') - i3 = Period('2005q1') - self.assertEqual(i1, i2) - self.assertEqual(i1, i3) - - i1 = Period('05Q1') - self.assertEqual(i1, i2) - lower = Period('05q1') - self.assertEqual(i1, lower) - - i1 = Period('1Q2005') - self.assertEqual(i1, i2) - lower = Period('1q2005') - self.assertEqual(i1, lower) - - i1 = Period('1Q05') - self.assertEqual(i1, i2) - lower = Period('1q05') - self.assertEqual(i1, lower) - - i1 = Period('4Q1984') - self.assertEqual(i1.year, 1984) - lower = Period('4q1984') - self.assertEqual(i1, lower) - - i1 = Period('1982', freq='min') - i2 = Period('1982', freq='MIN') - self.assertEqual(i1, i2) - i2 = Period('1982', freq=('Min', 1)) - self.assertEqual(i1, i2) - - expected = Period('2007-01', freq='M') - i1 = Period('200701', freq='M') - self.assertEqual(i1, expected) - - i1 = Period('200701', freq='M') - self.assertEqual(i1, expected) - - i1 = Period(200701, freq='M') - self.assertEqual(i1, expected) - - i1 = Period(ordinal=200701, freq='M') - self.assertEqual(i1.year, 18695) - - i1 = Period(datetime(2007, 1, 1), freq='M') - i2 = Period('200701', freq='M') - self.assertEqual(i1, i2) - - i1 = Period(date(2007, 1, 1), freq='M') - i2 = Period(datetime(2007, 1, 1), freq='M') - i3 = Period(np.datetime64('2007-01-01'), freq='M') - i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') - i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') - self.assertEqual(i1, i2) - self.assertEqual(i1, i3) - self.assertEqual(i1, i4) - self.assertEqual(i1, i5) - - i1 = Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') - self.assertEqual(i1, expected) - - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') - self.assertEqual(i1, expected) - - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') - self.assertEqual(i1, expected) - - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') - self.assertEqual(i1, expected) - - self.assertRaises(ValueError, Period, ordinal=200701) - - self.assertRaises(ValueError, Period, '2007-1-1', freq='X') - - def test_period_constructor_offsets(self): - self.assertEqual(Period('1/1/2005', freq=offsets.MonthEnd()), - Period('1/1/2005', freq='M')) - self.assertEqual(Period('2005', freq=offsets.YearEnd()), - Period('2005', freq='A')) - self.assertEqual(Period('2005', freq=offsets.MonthEnd()), - Period('2005', freq='M')) - self.assertEqual(Period('3/10/12', freq=offsets.BusinessDay()), - Period('3/10/12', freq='B')) - self.assertEqual(Period('3/10/12', freq=offsets.Day()), - Period('3/10/12', freq='D')) - - self.assertEqual(Period(year=2005, quarter=1, - freq=offsets.QuarterEnd(startingMonth=12)), - Period(year=2005, 
quarter=1, freq='Q')) - self.assertEqual(Period(year=2005, quarter=2, - freq=offsets.QuarterEnd(startingMonth=12)), - Period(year=2005, quarter=2, freq='Q')) - - self.assertEqual(Period(year=2005, month=3, day=1, freq=offsets.Day()), - Period(year=2005, month=3, day=1, freq='D')) - self.assertEqual(Period(year=2012, month=3, day=10, - freq=offsets.BDay()), - Period(year=2012, month=3, day=10, freq='B')) - - expected = Period('2005-03-01', freq='3D') - self.assertEqual(Period(year=2005, month=3, day=1, - freq=offsets.Day(3)), expected) - self.assertEqual(Period(year=2005, month=3, day=1, freq='3D'), - expected) - - self.assertEqual(Period(year=2012, month=3, day=10, - freq=offsets.BDay(3)), - Period(year=2012, month=3, day=10, freq='3B')) - - self.assertEqual(Period(200701, freq=offsets.MonthEnd()), - Period(200701, freq='M')) - - i1 = Period(ordinal=200701, freq=offsets.MonthEnd()) - i2 = Period(ordinal=200701, freq='M') - self.assertEqual(i1, i2) - self.assertEqual(i1.year, 18695) - self.assertEqual(i2.year, 18695) - - i1 = Period(datetime(2007, 1, 1), freq='M') - i2 = Period('200701', freq='M') - self.assertEqual(i1, i2) - - i1 = Period(date(2007, 1, 1), freq='M') - i2 = Period(datetime(2007, 1, 1), freq='M') - i3 = Period(np.datetime64('2007-01-01'), freq='M') - i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') - i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') - self.assertEqual(i1, i2) - self.assertEqual(i1, i3) - self.assertEqual(i1, i4) - self.assertEqual(i1, i5) - - i1 = Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') - self.assertEqual(i1, expected) - - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') - self.assertEqual(i1, expected) - - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') - self.assertEqual(i1, expected) - - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') - self.assertEqual(i1, expected) - - self.assertRaises(ValueError, Period, ordinal=200701) - - self.assertRaises(ValueError, Period, '2007-1-1', freq='X') - - def test_freq_str(self): - i1 = Period('1982', freq='Min') - self.assertEqual(i1.freq, offsets.Minute()) - self.assertEqual(i1.freqstr, 'T') - - def test_period_deprecated_freq(self): - cases = {"M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], - "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], - "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], - "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], - "T": ["minute", "MINUTE", "MINUTELY", "minutely"], - "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], - "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], - "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], - "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - for exp, freqs in iteritems(cases): - for freq in freqs: - with self.assertRaisesRegexp(ValueError, msg): - Period('2016-03-01 09:00', freq=freq) - with self.assertRaisesRegexp(ValueError, msg): - Period(ordinal=1, freq=freq) - - # check supported freq-aliases still works - p1 = Period('2016-03-01 09:00', freq=exp) - p2 = Period(ordinal=1, freq=exp) - tm.assertIsInstance(p1, Period) - tm.assertIsInstance(p2, Period) - - def test_hash(self): - self.assertEqual(hash(Period('2011-01', freq='M')), - hash(Period('2011-01', freq='M'))) - - self.assertNotEqual(hash(Period('2011-01-01', freq='D')), - 
hash(Period('2011-01', freq='M'))) - - self.assertNotEqual(hash(Period('2011-01', freq='3M')), - hash(Period('2011-01', freq='2M'))) - - self.assertNotEqual(hash(Period('2011-01', freq='M')), - hash(Period('2011-02', freq='M'))) - - def test_repr(self): - p = Period('Jan-2000') - self.assertIn('2000-01', repr(p)) - - p = Period('2000-12-15') - self.assertIn('2000-12-15', repr(p)) - - def test_repr_nat(self): - p = Period('nat', freq='M') - self.assertIn(repr(tslib.NaT), repr(p)) - - def test_millisecond_repr(self): - p = Period('2000-01-01 12:15:02.123') - - self.assertEqual("Period('2000-01-01 12:15:02.123', 'L')", repr(p)) - - def test_microsecond_repr(self): - p = Period('2000-01-01 12:15:02.123567') - - self.assertEqual("Period('2000-01-01 12:15:02.123567', 'U')", repr(p)) - - def test_strftime(self): - p = Period('2000-1-1 12:34:12', freq='S') - res = p.strftime('%Y-%m-%d %H:%M:%S') - self.assertEqual(res, '2000-01-01 12:34:12') - tm.assertIsInstance(res, text_type) # GH3363 - - def test_sub_delta(self): - left, right = Period('2011', freq='A'), Period('2007', freq='A') - result = left - right - self.assertEqual(result, 4) - - with self.assertRaises(period.IncompatibleFrequency): - left - Period('2007-01', freq='M') - - def test_to_timestamp(self): - p = Period('1982', freq='A') - start_ts = p.to_timestamp(how='S') - aliases = ['s', 'StarT', 'BEGIn'] - for a in aliases: - self.assertEqual(start_ts, p.to_timestamp('D', how=a)) - # freq with mult should not affect to the result - self.assertEqual(start_ts, p.to_timestamp('3D', how=a)) - - end_ts = p.to_timestamp(how='E') - aliases = ['e', 'end', 'FINIsH'] - for a in aliases: - self.assertEqual(end_ts, p.to_timestamp('D', how=a)) - self.assertEqual(end_ts, p.to_timestamp('3D', how=a)) - - from_lst = ['A', 'Q', 'M', 'W', 'B', 'D', 'H', 'Min', 'S'] - - def _ex(p): - return Timestamp((p + 1).start_time.value - 1) - - for i, fcode in enumerate(from_lst): - p = Period('1982', freq=fcode) - result = p.to_timestamp().to_period(fcode) - self.assertEqual(result, p) - - self.assertEqual(p.start_time, p.to_timestamp(how='S')) - - self.assertEqual(p.end_time, _ex(p)) - - # Frequency other than daily - - p = Period('1985', freq='A') - - result = p.to_timestamp('H', how='end') - expected = datetime(1985, 12, 31, 23) - self.assertEqual(result, expected) - result = p.to_timestamp('3H', how='end') - self.assertEqual(result, expected) - - result = p.to_timestamp('T', how='end') - expected = datetime(1985, 12, 31, 23, 59) - self.assertEqual(result, expected) - result = p.to_timestamp('2T', how='end') - self.assertEqual(result, expected) - - result = p.to_timestamp(how='end') - expected = datetime(1985, 12, 31) - self.assertEqual(result, expected) - - expected = datetime(1985, 1, 1) - result = p.to_timestamp('H', how='start') - self.assertEqual(result, expected) - result = p.to_timestamp('T', how='start') - self.assertEqual(result, expected) - result = p.to_timestamp('S', how='start') - self.assertEqual(result, expected) - result = p.to_timestamp('3H', how='start') - self.assertEqual(result, expected) - result = p.to_timestamp('5S', how='start') - self.assertEqual(result, expected) - - def test_start_time(self): - freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] - xp = datetime(2012, 1, 1) - for f in freq_lst: - p = Period('2012', freq=f) - self.assertEqual(p.start_time, xp) - self.assertEqual(Period('2012', freq='B').start_time, - datetime(2012, 1, 2)) - self.assertEqual(Period('2012', freq='W').start_time, - datetime(2011, 12, 26)) - - def 
test_end_time(self): - p = Period('2012', freq='A') - - def _ex(*args): - return Timestamp(Timestamp(datetime(*args)).value - 1) - - xp = _ex(2013, 1, 1) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='Q') - xp = _ex(2012, 4, 1) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='M') - xp = _ex(2012, 2, 1) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='D') - xp = _ex(2012, 1, 2) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='H') - xp = _ex(2012, 1, 1, 1) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='B') - xp = _ex(2012, 1, 3) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='W') - xp = _ex(2012, 1, 2) - self.assertEqual(xp, p.end_time) - - # Test for GH 11738 - p = Period('2012', freq='15D') - xp = _ex(2012, 1, 16) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='1D1H') - xp = _ex(2012, 1, 2, 1) - self.assertEqual(xp, p.end_time) - - p = Period('2012', freq='1H1D') - xp = _ex(2012, 1, 2, 1) - self.assertEqual(xp, p.end_time) - - def test_anchor_week_end_time(self): - def _ex(*args): - return Timestamp(Timestamp(datetime(*args)).value - 1) - - p = Period('2013-1-1', 'W-SAT') - xp = _ex(2013, 1, 6) - self.assertEqual(p.end_time, xp) - - def test_properties_annually(self): - # Test properties on Periods with annually frequency. - a_date = Period(freq='A', year=2007) - self.assertEqual(a_date.year, 2007) - - def test_properties_quarterly(self): - # Test properties on Periods with daily frequency. - qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) - qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) - qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) - # - for x in range(3): - for qd in (qedec_date, qejan_date, qejun_date): - self.assertEqual((qd + x).qyear, 2007) - self.assertEqual((qd + x).quarter, x + 1) - - def test_properties_monthly(self): - # Test properties on Periods with daily frequency. - m_date = Period(freq='M', year=2007, month=1) - for x in range(11): - m_ival_x = m_date + x - self.assertEqual(m_ival_x.year, 2007) - if 1 <= x + 1 <= 3: - self.assertEqual(m_ival_x.quarter, 1) - elif 4 <= x + 1 <= 6: - self.assertEqual(m_ival_x.quarter, 2) - elif 7 <= x + 1 <= 9: - self.assertEqual(m_ival_x.quarter, 3) - elif 10 <= x + 1 <= 12: - self.assertEqual(m_ival_x.quarter, 4) - self.assertEqual(m_ival_x.month, x + 1) - - def test_properties_weekly(self): - # Test properties on Periods with daily frequency. - w_date = Period(freq='W', year=2007, month=1, day=7) - # - self.assertEqual(w_date.year, 2007) - self.assertEqual(w_date.quarter, 1) - self.assertEqual(w_date.month, 1) - self.assertEqual(w_date.week, 1) - self.assertEqual((w_date - 1).week, 52) - self.assertEqual(w_date.days_in_month, 31) - self.assertEqual(Period(freq='W', year=2012, - month=2, day=1).days_in_month, 29) - - def test_properties_weekly_legacy(self): - # Test properties on Periods with daily frequency. 
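# An illustrative sketch, not part of the test suite: the _ex helper
# in test_end_time above encodes the convention that a period ends one
# nanosecond before the next period starts, i.e. end_time equals
# (p + 1).start_time minus 1ns. Assumes this era's pandas API.
import pandas as pd

p = pd.Period('2012', freq='M')
assert p.start_time == pd.Timestamp('2012-01-01')
assert p.end_time == (p + 1).start_time - pd.Timedelta(1, unit='ns')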
- w_date = Period(freq='W', year=2007, month=1, day=7) - self.assertEqual(w_date.year, 2007) - self.assertEqual(w_date.quarter, 1) - self.assertEqual(w_date.month, 1) - self.assertEqual(w_date.week, 1) - self.assertEqual((w_date - 1).week, 52) - self.assertEqual(w_date.days_in_month, 31) - - exp = Period(freq='W', year=2012, month=2, day=1) - self.assertEqual(exp.days_in_month, 29) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK', year=2007, month=1, day=7) - - def test_properties_daily(self): - # Test properties on Periods with daily frequency. - b_date = Period(freq='B', year=2007, month=1, day=1) - # - self.assertEqual(b_date.year, 2007) - self.assertEqual(b_date.quarter, 1) - self.assertEqual(b_date.month, 1) - self.assertEqual(b_date.day, 1) - self.assertEqual(b_date.weekday, 0) - self.assertEqual(b_date.dayofyear, 1) - self.assertEqual(b_date.days_in_month, 31) - self.assertEqual(Period(freq='B', year=2012, - month=2, day=1).days_in_month, 29) - # - d_date = Period(freq='D', year=2007, month=1, day=1) - # - self.assertEqual(d_date.year, 2007) - self.assertEqual(d_date.quarter, 1) - self.assertEqual(d_date.month, 1) - self.assertEqual(d_date.day, 1) - self.assertEqual(d_date.weekday, 0) - self.assertEqual(d_date.dayofyear, 1) - self.assertEqual(d_date.days_in_month, 31) - self.assertEqual(Period(freq='D', year=2012, month=2, - day=1).days_in_month, 29) - - def test_properties_hourly(self): - # Test properties on Periods with hourly frequency. - h_date1 = Period(freq='H', year=2007, month=1, day=1, hour=0) - h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0) - - for h_date in [h_date1, h_date2]: - self.assertEqual(h_date.year, 2007) - self.assertEqual(h_date.quarter, 1) - self.assertEqual(h_date.month, 1) - self.assertEqual(h_date.day, 1) - self.assertEqual(h_date.weekday, 0) - self.assertEqual(h_date.dayofyear, 1) - self.assertEqual(h_date.hour, 0) - self.assertEqual(h_date.days_in_month, 31) - self.assertEqual(Period(freq='H', year=2012, month=2, day=1, - hour=0).days_in_month, 29) - - def test_properties_minutely(self): - # Test properties on Periods with minutely frequency. - t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - # - self.assertEqual(t_date.quarter, 1) - self.assertEqual(t_date.month, 1) - self.assertEqual(t_date.day, 1) - self.assertEqual(t_date.weekday, 0) - self.assertEqual(t_date.dayofyear, 1) - self.assertEqual(t_date.hour, 0) - self.assertEqual(t_date.minute, 0) - self.assertEqual(t_date.days_in_month, 31) - self.assertEqual(Period(freq='D', year=2012, month=2, day=1, hour=0, - minute=0).days_in_month, 29) - - def test_properties_secondly(self): - # Test properties on Periods with secondly frequency. 
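# An illustrative sketch, not part of the test suite: the property
# tests above all read calendar fields straight off a Period, and
# days_in_month is leap-year aware. Assumes this era's pandas API.
import pandas as pd

p = pd.Period(freq='D', year=2012, month=2, day=1)
assert (p.year, p.quarter, p.month, p.day) == (2012, 1, 2, 1)
assert p.weekday == 2           # 2012-02-01 fell on a Wednesday
assert p.days_in_month == 29    # 2012 is a leap year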
- s_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - # - self.assertEqual(s_date.year, 2007) - self.assertEqual(s_date.quarter, 1) - self.assertEqual(s_date.month, 1) - self.assertEqual(s_date.day, 1) - self.assertEqual(s_date.weekday, 0) - self.assertEqual(s_date.dayofyear, 1) - self.assertEqual(s_date.hour, 0) - self.assertEqual(s_date.minute, 0) - self.assertEqual(s_date.second, 0) - self.assertEqual(s_date.days_in_month, 31) - self.assertEqual(Period(freq='Min', year=2012, month=2, day=1, hour=0, - minute=0, second=0).days_in_month, 29) - - def test_properties_nat(self): - p_nat = Period('NaT', freq='M') - t_nat = pd.Timestamp('NaT') - self.assertIs(p_nat, t_nat) - - # confirm Period('NaT') work identical with Timestamp('NaT') - for f in ['year', 'month', 'day', 'hour', 'minute', 'second', 'week', - 'dayofyear', 'quarter', 'days_in_month']: - self.assertTrue(np.isnan(getattr(p_nat, f))) - self.assertTrue(np.isnan(getattr(t_nat, f))) - - def test_pnow(self): - dt = datetime.now() - - val = period.pnow('D') - exp = Period(dt, freq='D') - self.assertEqual(val, exp) - - val2 = period.pnow('2D') - exp2 = Period(dt, freq='2D') - self.assertEqual(val2, exp2) - self.assertEqual(val.ordinal, val2.ordinal) - self.assertEqual(val.ordinal, exp2.ordinal) - - def test_constructor_corner(self): - expected = Period('2007-01', freq='2M') - self.assertEqual(Period(year=2007, month=1, freq='2M'), expected) - - self.assertRaises(ValueError, Period, datetime.now()) - self.assertRaises(ValueError, Period, datetime.now().date()) - self.assertRaises(ValueError, Period, 1.6, freq='D') - self.assertRaises(ValueError, Period, ordinal=1.6, freq='D') - self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D') - self.assertIs(Period(None), pd.NaT) - self.assertRaises(ValueError, Period, month=1) - - p = Period('2007-01-01', freq='D') - - result = Period(p, freq='A') - exp = Period('2007', freq='A') - self.assertEqual(result, exp) - - def test_constructor_infer_freq(self): - p = Period('2007-01-01') - self.assertEqual(p.freq, 'D') - - p = Period('2007-01-01 07') - self.assertEqual(p.freq, 'H') - - p = Period('2007-01-01 07:10') - self.assertEqual(p.freq, 'T') - - p = Period('2007-01-01 07:10:15') - self.assertEqual(p.freq, 'S') - - p = Period('2007-01-01 07:10:15.123') - self.assertEqual(p.freq, 'L') - - p = Period('2007-01-01 07:10:15.123000') - self.assertEqual(p.freq, 'L') - - p = Period('2007-01-01 07:10:15.123400') - self.assertEqual(p.freq, 'U') - - def test_asfreq_MS(self): - initial = Period("2013") - - self.assertEqual(initial.asfreq(freq="M", how="S"), - Period('2013-01', 'M')) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - initial.asfreq(freq="MS", how="S") - - with tm.assertRaisesRegexp(ValueError, msg): - pd.Period('2013-01', 'MS') - - self.assertTrue(_period_code_map.get("MS") is None) - - -def noWrap(item): - return item - - -class TestFreqConversion(tm.TestCase): - "Test frequency conversion of date objects" - - def test_asfreq_corner(self): - val = Period(freq='A', year=2007) - result1 = val.asfreq('5t') - result2 = val.asfreq('t') - expected = Period('2007-12-31 23:59', freq='t') - self.assertEqual(result1.ordinal, expected.ordinal) - self.assertEqual(result1.freqstr, '5T') - self.assertEqual(result2.ordinal, expected.ordinal) - self.assertEqual(result2.freqstr, 'T') - - def test_conv_annual(self): - # frequency conversion tests: from Annual Frequency - - ival_A = Period(freq='A', year=2007) - - 
ival_AJAN = Period(freq="A-JAN", year=2007) - ival_AJUN = Period(freq="A-JUN", year=2007) - ival_ANOV = Period(freq="A-NOV", year=2007) - - ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) - ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) - ival_A_to_M_start = Period(freq='M', year=2007, month=1) - ival_A_to_M_end = Period(freq='M', year=2007, month=12) - ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31) - ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) - ival_A_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - - ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) - ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) - ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) - - self.assertEqual(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) - self.assertEqual(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) - self.assertEqual(ival_A.asfreq('M', 's'), ival_A_to_M_start) - self.assertEqual(ival_A.asfreq('M', 'E'), ival_A_to_M_end) - self.assertEqual(ival_A.asfreq('W', 'S'), ival_A_to_W_start) - self.assertEqual(ival_A.asfreq('W', 'E'), ival_A_to_W_end) - self.assertEqual(ival_A.asfreq('B', 'S'), ival_A_to_B_start) - self.assertEqual(ival_A.asfreq('B', 'E'), ival_A_to_B_end) - self.assertEqual(ival_A.asfreq('D', 'S'), ival_A_to_D_start) - self.assertEqual(ival_A.asfreq('D', 'E'), ival_A_to_D_end) - self.assertEqual(ival_A.asfreq('H', 'S'), ival_A_to_H_start) - self.assertEqual(ival_A.asfreq('H', 'E'), ival_A_to_H_end) - self.assertEqual(ival_A.asfreq('min', 'S'), ival_A_to_T_start) - self.assertEqual(ival_A.asfreq('min', 'E'), ival_A_to_T_end) - self.assertEqual(ival_A.asfreq('T', 'S'), ival_A_to_T_start) - self.assertEqual(ival_A.asfreq('T', 'E'), ival_A_to_T_end) - self.assertEqual(ival_A.asfreq('S', 'S'), ival_A_to_S_start) - self.assertEqual(ival_A.asfreq('S', 'E'), ival_A_to_S_end) - - self.assertEqual(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) - self.assertEqual(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) - - self.assertEqual(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) - self.assertEqual(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) - - self.assertEqual(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) - self.assertEqual(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) - - self.assertEqual(ival_A.asfreq('A'), ival_A) - - def test_conv_quarterly(self): - # frequency conversion tests: from Quarterly Frequency - - ival_Q = Period(freq='Q', year=2007, quarter=1) - ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) - - ival_QEJAN = Period(freq="Q-JAN", year=2007, 
quarter=1) - ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - - ival_Q_to_A = Period(freq='A', year=2007) - ival_Q_to_M_start = Period(freq='M', year=2007, month=1) - ival_Q_to_M_end = Period(freq='M', year=2007, month=3) - ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31) - ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) - ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23) - ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23, - minute=59, second=59) - - ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) - - ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) - - self.assertEqual(ival_Q.asfreq('A'), ival_Q_to_A) - self.assertEqual(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) - - self.assertEqual(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) - self.assertEqual(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) - self.assertEqual(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) - self.assertEqual(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) - self.assertEqual(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) - self.assertEqual(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) - self.assertEqual(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) - self.assertEqual(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) - self.assertEqual(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) - self.assertEqual(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) - self.assertEqual(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) - self.assertEqual(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) - self.assertEqual(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) - self.assertEqual(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) - - self.assertEqual(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) - self.assertEqual(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) - self.assertEqual(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) - self.assertEqual(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) - - self.assertEqual(ival_Q.asfreq('Q'), ival_Q) - - def test_conv_monthly(self): - # frequency conversion tests: from Monthly Frequency - - ival_M = Period(freq='M', year=2007, month=1) - ival_M_end_of_year = Period(freq='M', year=2007, month=12) - ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) - ival_M_to_A = Period(freq='A', year=2007) - ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31) - ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) - ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq='H', year=2007, 
month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23) - ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, - minute=59, second=59) - - self.assertEqual(ival_M.asfreq('A'), ival_M_to_A) - self.assertEqual(ival_M_end_of_year.asfreq('A'), ival_M_to_A) - self.assertEqual(ival_M.asfreq('Q'), ival_M_to_Q) - self.assertEqual(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) - - self.assertEqual(ival_M.asfreq('W', 'S'), ival_M_to_W_start) - self.assertEqual(ival_M.asfreq('W', 'E'), ival_M_to_W_end) - self.assertEqual(ival_M.asfreq('B', 'S'), ival_M_to_B_start) - self.assertEqual(ival_M.asfreq('B', 'E'), ival_M_to_B_end) - self.assertEqual(ival_M.asfreq('D', 'S'), ival_M_to_D_start) - self.assertEqual(ival_M.asfreq('D', 'E'), ival_M_to_D_end) - self.assertEqual(ival_M.asfreq('H', 'S'), ival_M_to_H_start) - self.assertEqual(ival_M.asfreq('H', 'E'), ival_M_to_H_end) - self.assertEqual(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) - self.assertEqual(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) - self.assertEqual(ival_M.asfreq('S', 'S'), ival_M_to_S_start) - self.assertEqual(ival_M.asfreq('S', 'E'), ival_M_to_S_end) - - self.assertEqual(ival_M.asfreq('M'), ival_M) - - def test_conv_weekly(self): - # frequency conversion tests: from Weekly Frequency - ival_W = Period(freq='W', year=2007, month=1, day=1) - - ival_WSUN = Period(freq='W', year=2007, month=1, day=7) - ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6) - ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5) - ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4) - ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3) - ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2) - ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31) - ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31) - ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) - else: - 
ival_W_to_A_end_of_year = Period(freq='A', year=2008) - - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) - else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) - - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) - else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) - self.assertEqual(ival_W_end_of_year.asfreq('A'), - ival_W_to_A_end_of_year) - self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) - self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) - self.assertEqual(ival_W_end_of_month.asfreq('M'), - ival_W_to_M_end_of_month) - - self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - self.assertEqual(ival_W.asfreq('W'), ival_W) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - ival_W.asfreq('WK') - - def test_conv_weekly_legacy(self): - # frequency conversion tests: from Weekly Frequency - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - 
with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK', year=2007, month=1, day=1) - - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-SAT', year=2007, month=1, day=6) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-FRI', year=2007, month=1, day=5) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-THU', year=2007, month=1, day=4) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-WED', year=2007, month=1, day=3) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-TUE', year=2007, month=1, day=2) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-MON', year=2007, month=1, day=1) - - def test_conv_business(self): - # frequency conversion tests: from Business Frequency" - - ival_B = Period(freq='B', year=2007, month=1, day=1) - ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) - ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) - ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) - ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) - - ival_B_to_A = Period(freq='A', year=2007) - ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_B_to_M = Period(freq='M', year=2007, month=1) - ival_B_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - self.assertEqual(ival_B.asfreq('A'), ival_B_to_A) - self.assertEqual(ival_B_end_of_year.asfreq('A'), ival_B_to_A) - self.assertEqual(ival_B.asfreq('Q'), ival_B_to_Q) - self.assertEqual(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) - self.assertEqual(ival_B.asfreq('M'), ival_B_to_M) - self.assertEqual(ival_B_end_of_month.asfreq('M'), ival_B_to_M) - self.assertEqual(ival_B.asfreq('W'), ival_B_to_W) - self.assertEqual(ival_B_end_of_week.asfreq('W'), ival_B_to_W) - - self.assertEqual(ival_B.asfreq('D'), ival_B_to_D) - - self.assertEqual(ival_B.asfreq('H', 'S'), ival_B_to_H_start) - self.assertEqual(ival_B.asfreq('H', 'E'), ival_B_to_H_end) - self.assertEqual(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) - self.assertEqual(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) - self.assertEqual(ival_B.asfreq('S', 'S'), ival_B_to_S_start) - self.assertEqual(ival_B.asfreq('S', 'E'), ival_B_to_S_end) - - self.assertEqual(ival_B.asfreq('B'), ival_B) - - def test_conv_daily(self): - # frequency conversion tests: from Business Frequency" - - ival_D = Period(freq='D', year=2007, month=1, day=1) - ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) - ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) - ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) - ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) - - ival_D_friday = Period(freq='D', year=2007, month=1, day=5) - ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) - ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) - - # TODO: unused? 
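# An illustrative sketch, not part of the test suite: the conversion
# tests in this class all go through Period.asfreq, where how='S'
# anchors the result to the first sub-period and how='E' (the default)
# to the last. Assumes this era's pandas API.
import pandas as pd

p = pd.Period('2007', freq='A')
assert p.asfreq('M', how='S') == pd.Period('2007-01', freq='M')
assert p.asfreq('M', how='E') == pd.Period('2007-12', freq='M')
assert pd.Period('2007-06', freq='M').asfreq('A') == p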
- # ival_D_monday = Period(freq='D', year=2007, month=1, day=8) - - ival_B_friday = Period(freq='B', year=2007, month=1, day=5) - ival_B_monday = Period(freq='B', year=2007, month=1, day=8) - - ival_D_to_A = Period(freq='A', year=2007) - - ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) - ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) - ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) - - ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) - ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) - ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) - - ival_D_to_M = Period(freq='M', year=2007, month=1) - ival_D_to_W = Period(freq='W', year=2007, month=1, day=7) - - ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - self.assertEqual(ival_D.asfreq('A'), ival_D_to_A) - - self.assertEqual(ival_D_end_of_quarter.asfreq('A-JAN'), - ival_Deoq_to_AJAN) - self.assertEqual(ival_D_end_of_quarter.asfreq('A-JUN'), - ival_Deoq_to_AJUN) - self.assertEqual(ival_D_end_of_quarter.asfreq('A-DEC'), - ival_Deoq_to_ADEC) - - self.assertEqual(ival_D_end_of_year.asfreq('A'), ival_D_to_A) - self.assertEqual(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) - self.assertEqual(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) - self.assertEqual(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) - self.assertEqual(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) - self.assertEqual(ival_D.asfreq('M'), ival_D_to_M) - self.assertEqual(ival_D_end_of_month.asfreq('M'), ival_D_to_M) - self.assertEqual(ival_D.asfreq('W'), ival_D_to_W) - self.assertEqual(ival_D_end_of_week.asfreq('W'), ival_D_to_W) - - self.assertEqual(ival_D_friday.asfreq('B'), ival_B_friday) - self.assertEqual(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) - self.assertEqual(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) - self.assertEqual(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) - self.assertEqual(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) - - self.assertEqual(ival_D.asfreq('H', 'S'), ival_D_to_H_start) - self.assertEqual(ival_D.asfreq('H', 'E'), ival_D_to_H_end) - self.assertEqual(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) - self.assertEqual(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) - self.assertEqual(ival_D.asfreq('S', 'S'), ival_D_to_S_start) - self.assertEqual(ival_D.asfreq('S', 'E'), ival_D_to_S_end) - - self.assertEqual(ival_D.asfreq('D'), ival_D) - - def test_conv_hourly(self): - # frequency conversion tests: from Hourly Frequency - - ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, - hour=23) - ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, - hour=23) - ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, - hour=23) - ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, - hour=23) - ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, - hour=23) - - ival_H_to_A = Period(freq='A', year=2007) - ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) -
ival_H_to_M = Period(freq='M', year=2007, month=1) - ival_H_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) - - ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=59) - ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=59, second=59) - - self.assertEqual(ival_H.asfreq('A'), ival_H_to_A) - self.assertEqual(ival_H_end_of_year.asfreq('A'), ival_H_to_A) - self.assertEqual(ival_H.asfreq('Q'), ival_H_to_Q) - self.assertEqual(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) - self.assertEqual(ival_H.asfreq('M'), ival_H_to_M) - self.assertEqual(ival_H_end_of_month.asfreq('M'), ival_H_to_M) - self.assertEqual(ival_H.asfreq('W'), ival_H_to_W) - self.assertEqual(ival_H_end_of_week.asfreq('W'), ival_H_to_W) - self.assertEqual(ival_H.asfreq('D'), ival_H_to_D) - self.assertEqual(ival_H_end_of_day.asfreq('D'), ival_H_to_D) - self.assertEqual(ival_H.asfreq('B'), ival_H_to_B) - self.assertEqual(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) - - self.assertEqual(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) - self.assertEqual(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) - self.assertEqual(ival_H.asfreq('S', 'S'), ival_H_to_S_start) - self.assertEqual(ival_H.asfreq('S', 'E'), ival_H_to_S_end) - - self.assertEqual(ival_H.asfreq('H'), ival_H) - - def test_conv_minutely(self): - # frequency conversion tests: from Minutely Frequency - - ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=59) - - ival_T_to_A = Period(freq='A', year=2007) - ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_T_to_M = Period(freq='M', year=2007, month=1) - ival_T_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - - ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=59) - - self.assertEqual(ival_T.asfreq('A'), ival_T_to_A) - self.assertEqual(ival_T_end_of_year.asfreq('A'), ival_T_to_A) - self.assertEqual(ival_T.asfreq('Q'), ival_T_to_Q) - self.assertEqual(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) - self.assertEqual(ival_T.asfreq('M'), ival_T_to_M) - self.assertEqual(ival_T_end_of_month.asfreq('M'), ival_T_to_M) - self.assertEqual(ival_T.asfreq('W'), ival_T_to_W) - self.assertEqual(ival_T_end_of_week.asfreq('W'), ival_T_to_W) - self.assertEqual(ival_T.asfreq('D'), ival_T_to_D) -
self.assertEqual(ival_T_end_of_day.asfreq('D'), ival_T_to_D) - self.assertEqual(ival_T.asfreq('B'), ival_T_to_B) - self.assertEqual(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) - self.assertEqual(ival_T.asfreq('H'), ival_T_to_H) - self.assertEqual(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) - - self.assertEqual(ival_T.asfreq('S', 'S'), ival_T_to_S_start) - self.assertEqual(ival_T.asfreq('S', 'E'), ival_T_to_S_end) - - self.assertEqual(ival_T.asfreq('Min'), ival_T) - - def test_conv_secondly(self): - # frequency conversion tests: from Secondly Frequency - - ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, - second=0) - ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, - hour=23, minute=59, second=59) - ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, - hour=23, minute=59, second=59) - ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, - hour=23, minute=59, second=59) - ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=59, second=59) - ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=0, second=59) - - ival_S_to_A = Period(freq='A', year=2007) - ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_S_to_M = Period(freq='M', year=2007, month=1) - ival_S_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - - self.assertEqual(ival_S.asfreq('A'), ival_S_to_A) - self.assertEqual(ival_S_end_of_year.asfreq('A'), ival_S_to_A) - self.assertEqual(ival_S.asfreq('Q'), ival_S_to_Q) - self.assertEqual(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) - self.assertEqual(ival_S.asfreq('M'), ival_S_to_M) - self.assertEqual(ival_S_end_of_month.asfreq('M'), ival_S_to_M) - self.assertEqual(ival_S.asfreq('W'), ival_S_to_W) - self.assertEqual(ival_S_end_of_week.asfreq('W'), ival_S_to_W) - self.assertEqual(ival_S.asfreq('D'), ival_S_to_D) - self.assertEqual(ival_S_end_of_day.asfreq('D'), ival_S_to_D) - self.assertEqual(ival_S.asfreq('B'), ival_S_to_B) - self.assertEqual(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) - self.assertEqual(ival_S.asfreq('H'), ival_S_to_H) - self.assertEqual(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) - self.assertEqual(ival_S.asfreq('Min'), ival_S_to_T) - self.assertEqual(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) - - self.assertEqual(ival_S.asfreq('S'), ival_S) - - def test_asfreq_mult(self): - # normal freq to mult freq - p = Period(freq='A', year=2007) - # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq) - expected = Period('2007', freq='3A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='3A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) -
self.assertEqual(result.freq, expected.freq) - - # mult freq to normal freq - p = Period(freq='3A', year=2007) - # ordinal will change because how=E is the default - for freq in ['A', offsets.YearEnd()]: - result = p.asfreq(freq) - expected = Period('2009', freq='A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - # ordinal will not change - for freq in ['A', offsets.YearEnd()]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - p = Period(freq='A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq) - expected = Period('2007-12', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - p = Period(freq='3A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq) - expected = Period('2009-12', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - def test_asfreq_combined(self): - # normal freq to combined freq - p = Period('2007', freq='H') - - # ordinal will not change - expected = Period('2007', freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): - result = p.asfreq(freq, how=how) - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - # combined freq to normal freq - p1 = Period(freq='1D1H', year=2007) - p2 = Period(freq='1H1D', year=2007) - - # ordinal will change because how=E is the default - result1 = p1.asfreq('H') - result2 = p2.asfreq('H') - expected = Period('2007-01-02', freq='H') - self.assertEqual(result1, expected) - self.assertEqual(result1.ordinal, expected.ordinal) - self.assertEqual(result1.freq, expected.freq) - self.assertEqual(result2, expected) - self.assertEqual(result2.ordinal, expected.ordinal) - self.assertEqual(result2.freq, expected.freq) - - # ordinal will not change - result1 = p1.asfreq('H', how='S') - result2 = p2.asfreq('H', how='S') - expected = Period('2007-01-01', freq='H') - self.assertEqual(result1, expected) - self.assertEqual(result1.ordinal, expected.ordinal) - self.assertEqual(result1.freq, expected.freq) - self.assertEqual(result2, expected) - self.assertEqual(result2.ordinal, expected.ordinal) - self.assertEqual(result2.freq, expected.freq) - - def test_is_leap_year(self): - # GH 13727 - for freq in ['A', 'M', 'D', 'H']: - p = Period('2000-01-01 00:00:00', freq=freq) - self.assertTrue(p.is_leap_year) - self.assertIsInstance(p.is_leap_year, bool) - - p = Period('1999-01-01 00:00:00', freq=freq) - self.assertFalse(p.is_leap_year) - - p = Period('2004-01-01 00:00:00', freq=freq) - self.assertTrue(p.is_leap_year) - - p = Period('2100-01-01 
00:00:00', freq=freq) - self.assertFalse(p.is_leap_year) - - -class TestPeriodIndex(tm.TestCase): - - def setUp(self): - pass - - def test_hash_error(self): - index = period_range('20010101', periods=10) - with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % - type(index).__name__): - hash(index) - - def test_make_time_series(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - series = Series(1, index=index) - tm.assertIsInstance(series, Series) - - def test_constructor_use_start_freq(self): - # GH #1118 - p = Period('4/2/2012', freq='B') - index = PeriodIndex(start=p, periods=10) - expected = PeriodIndex(start='4/2/2012', periods=10, freq='B') - tm.assert_index_equal(index, expected) - - def test_constructor_field_arrays(self): - # GH #1264 - - years = np.arange(1990, 2010).repeat(4)[2:-2] - quarters = np.tile(np.arange(1, 5), 20)[2:-2] - - index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') - expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') - tm.assert_index_equal(index, expected) - - index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC') - tm.assert_numpy_array_equal(index.asi8, index2.asi8) - - index = PeriodIndex(year=years, quarter=quarters) - tm.assert_index_equal(index, expected) - - years = [2007, 2007, 2007] - months = [1, 2] - self.assertRaises(ValueError, PeriodIndex, year=years, month=months, - freq='M') - self.assertRaises(ValueError, PeriodIndex, year=years, month=months, - freq='2M') - self.assertRaises(ValueError, PeriodIndex, year=years, month=months, - freq='M', start=Period('2007-01', freq='M')) - - years = [2007, 2007, 2007] - months = [1, 2, 3] - idx = PeriodIndex(year=years, month=months, freq='M') - exp = period_range('2007-01', periods=3, freq='M') - tm.assert_index_equal(idx, exp) - - def test_constructor_U(self): - # U was used as undefined period - self.assertRaises(ValueError, period_range, '2007-1-1', periods=500, - freq='X') - - def test_constructor_nano(self): - idx = period_range(start=Period(ordinal=1, freq='N'), - end=Period(ordinal=4, freq='N'), freq='N') - exp = PeriodIndex([Period(ordinal=1, freq='N'), - Period(ordinal=2, freq='N'), - Period(ordinal=3, freq='N'), - Period(ordinal=4, freq='N')], freq='N') - tm.assert_index_equal(idx, exp) - - def test_constructor_arrays_negative_year(self): - years = np.arange(1960, 2000, dtype=np.int64).repeat(4) - quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) - - pindex = PeriodIndex(year=years, quarter=quarters) - - self.assert_numpy_array_equal(pindex.year, years) - self.assert_numpy_array_equal(pindex.quarter, quarters) - - def test_constructor_invalid_quarters(self): - self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), - quarter=lrange(4), freq='Q-DEC') - - def test_constructor_corner(self): - self.assertRaises(ValueError, PeriodIndex, periods=10, freq='A') - - start = Period('2007', freq='A-JUN') - end = Period('2010', freq='A-DEC') - self.assertRaises(ValueError, PeriodIndex, start=start, end=end) - self.assertRaises(ValueError, PeriodIndex, start=start) - self.assertRaises(ValueError, PeriodIndex, end=end) - - result = period_range('2007-01', periods=10.5, freq='M') - exp = period_range('2007-01', periods=10, freq='M') - tm.assert_index_equal(result, exp) - - def test_constructor_fromarraylike(self): - idx = period_range('2007-01', periods=20, freq='M') - - # values is an array of Period, thus can retrieve freq - tm.assert_index_equal(PeriodIndex(idx.values), idx) - 
tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) - - self.assertRaises(ValueError, PeriodIndex, idx._values) - self.assertRaises(ValueError, PeriodIndex, list(idx._values)) - self.assertRaises(ValueError, PeriodIndex, - data=Period('2007', freq='A')) - - result = PeriodIndex(iter(idx)) - tm.assert_index_equal(result, idx) - - result = PeriodIndex(idx) - tm.assert_index_equal(result, idx) - - result = PeriodIndex(idx, freq='M') - tm.assert_index_equal(result, idx) - - result = PeriodIndex(idx, freq=offsets.MonthEnd()) - tm.assert_index_equal(result, idx) - self.assertEqual(result.freq, 'M') - - result = PeriodIndex(idx, freq='2M') - tm.assert_index_equal(result, idx.asfreq('2M')) - self.assertEqual(result.freq, '2M') - - result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) - tm.assert_index_equal(result, idx.asfreq('2M')) - self.assertEqual(result.freq, '2M') - - result = PeriodIndex(idx, freq='D') - exp = idx.asfreq('D', 'e') - tm.assert_index_equal(result, exp) - - def test_constructor_datetime64arr(self): - vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) - vals = vals.view(np.dtype('M8[us]')) - - self.assertRaises(ValueError, PeriodIndex, vals, freq='D') - - def test_constructor_dtype(self): - # passing a period dtype should construct an index with that freq - idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]') - exp = PeriodIndex(['2013-01', '2013-03'], freq='M') - tm.assert_index_equal(idx, exp) - self.assertEqual(idx.dtype, 'period[M]') - - idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]') - exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D') - tm.assert_index_equal(idx, exp) - self.assertEqual(idx.dtype, 'period[3D]') - - # if we already have a freq and it's not the same, then asfreq - # (not changed) - idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D') - - res = PeriodIndex(idx, dtype='period[M]') - exp = PeriodIndex(['2013-01', '2013-01'], freq='M') - tm.assert_index_equal(res, exp) - self.assertEqual(res.dtype, 'period[M]') - - res = PeriodIndex(idx, freq='M') - tm.assert_index_equal(res, exp) - self.assertEqual(res.dtype, 'period[M]') - - msg = 'specified freq and dtype are different' - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - PeriodIndex(['2011-01'], freq='M', dtype='period[D]') - - def test_constructor_empty(self): - idx = pd.PeriodIndex([], freq='M') - tm.assertIsInstance(idx, PeriodIndex) - self.assertEqual(len(idx), 0) - self.assertEqual(idx.freq, 'M') - - with tm.assertRaisesRegexp(ValueError, 'freq not specified'): - pd.PeriodIndex([]) - - def test_constructor_pi_nat(self): - idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='M')]) - exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') - tm.assert_index_equal(idx, exp) - - idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='M')])) - tm.assert_index_equal(idx, exp) - - idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='M')]) - exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M') - tm.assert_index_equal(idx, exp) - - idx = PeriodIndex(np.array([pd.NaT, pd.NaT, - Period('2011-01', freq='M'), - Period('2011-01', freq='M')])) - tm.assert_index_equal(idx, exp) - - idx = PeriodIndex([pd.NaT, pd.NaT, '2011-01', '2011-01'], freq='M') - tm.assert_index_equal(idx, exp) - - with tm.assertRaisesRegexp(ValueError, 'freq not specified'): - PeriodIndex([pd.NaT, pd.NaT]) - - with tm.assertRaisesRegexp(ValueError, 'freq not specified'): -
PeriodIndex(np.array([pd.NaT, pd.NaT])) - - with tm.assertRaisesRegexp(ValueError, 'freq not specified'): - PeriodIndex(['NaT', 'NaT']) - - with tm.assertRaisesRegexp(ValueError, 'freq not specified'): - PeriodIndex(np.array(['NaT', 'NaT'])) - - def test_constructor_incompat_freq(self): - msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" - - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - PeriodIndex([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='D')]) - - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='D')])) - - # first element is pd.NaT - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - PeriodIndex([pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='D')]) - - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='D')])) - - def test_constructor_mixed(self): - idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')]) - exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') - tm.assert_index_equal(idx, exp) - - idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')]) - exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M') - tm.assert_index_equal(idx, exp) - - idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT, - '2012-01-01']) - exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D') - tm.assert_index_equal(idx, exp) - - def test_constructor_simple_new(self): - idx = period_range('2007-01', name='p', periods=2, freq='M') - result = idx._simple_new(idx, 'p', freq=idx.freq) - tm.assert_index_equal(result, idx) - - result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) - tm.assert_index_equal(result, idx) - - result = idx._simple_new([pd.Period('2007-01', freq='M'), - pd.Period('2007-02', freq='M')], - 'p', freq=idx.freq) - self.assert_index_equal(result, idx) - - result = idx._simple_new(np.array([pd.Period('2007-01', freq='M'), - pd.Period('2007-02', freq='M')]), - 'p', freq=idx.freq) - self.assert_index_equal(result, idx) - - def test_constructor_simple_new_empty(self): - # GH13079 - idx = PeriodIndex([], freq='M', name='p') - result = idx._simple_new(idx, name='p', freq='M') - tm.assert_index_equal(result, idx) - - def test_constructor_simple_new_floats(self): - # GH13079 - for floats in [[1.1], np.array([1.1])]: - with self.assertRaises(TypeError): - pd.PeriodIndex._simple_new(floats, freq='M') - - def test_shallow_copy_empty(self): - - # GH13067 - idx = PeriodIndex([], freq='M') - result = idx._shallow_copy() - expected = idx - - tm.assert_index_equal(result, expected) - - def test_constructor_nat(self): - self.assertRaises(ValueError, period_range, start='NaT', - end='2011-01-01', freq='M') - self.assertRaises(ValueError, period_range, start='2011-01-01', - end='NaT', freq='M') - - def test_constructor_year_and_quarter(self): - year = pd.Series([2001, 2002, 2003]) - quarter = year - 2000 - idx = PeriodIndex(year=year, quarter=quarter) - strs = ['%dQ%d' % t for t in zip(quarter, year)] - lops = list(map(Period, strs)) - p = PeriodIndex(lops) - tm.assert_index_equal(p, idx) - - def test_constructor_freq_mult(self): - # GH #7811 - for func in [PeriodIndex, period_range]: - # must be the same, but for sure... 
- pidx = func(start='2014-01', freq='2M', periods=4) - expected = PeriodIndex(['2014-01', '2014-03', - '2014-05', '2014-07'], freq='2M') - tm.assert_index_equal(pidx, expected) - - pidx = func(start='2014-01-02', end='2014-01-15', freq='3D') - expected = PeriodIndex(['2014-01-02', '2014-01-05', - '2014-01-08', '2014-01-11', - '2014-01-14'], freq='3D') - tm.assert_index_equal(pidx, expected) - - pidx = func(end='2014-01-01 17:00', freq='4H', periods=3) - expected = PeriodIndex(['2014-01-01 09:00', '2014-01-01 13:00', - '2014-01-01 17:00'], freq='4H') - tm.assert_index_equal(pidx, expected) - - msg = ('Frequency must be positive, because it' - ' represents span: -1M') - with tm.assertRaisesRegexp(ValueError, msg): - PeriodIndex(['2011-01'], freq='-1M') - - msg = ('Frequency must be positive, because it' ' represents span: 0M') - with tm.assertRaisesRegexp(ValueError, msg): - PeriodIndex(['2011-01'], freq='0M') - - msg = ('Frequency must be positive, because it' ' represents span: 0M') - with tm.assertRaisesRegexp(ValueError, msg): - period_range('2011-01', periods=3, freq='0M') - - def test_constructor_freq_mult_dti_compat(self): - import itertools - mults = [1, 2, 3, 4, 5] - freqs = ['A', 'M', 'D', 'T', 'S'] - for mult, freq in itertools.product(mults, freqs): - freqstr = str(mult) + freq - pidx = PeriodIndex(start='2014-04-01', freq=freqstr, periods=10) - expected = date_range(start='2014-04-01', freq=freqstr, - periods=10).to_period(freqstr) - tm.assert_index_equal(pidx, expected) - - def test_constructor_freq_combined(self): - for freq in ['1D1H', '1H1D']: - pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq) - expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'], - freq='25H') - for freq, func in zip(['1D1H', '1H1D'], [PeriodIndex, period_range]): - pidx = func(start='2016-01-01', periods=2, freq=freq) - expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'], - freq='25H') - tm.assert_index_equal(pidx, expected) - - def test_dtype_str(self): - pi = pd.PeriodIndex([], freq='M') - self.assertEqual(pi.dtype_str, 'period[M]') - self.assertEqual(pi.dtype_str, str(pi.dtype)) - - pi = pd.PeriodIndex([], freq='3M') - self.assertEqual(pi.dtype_str, 'period[3M]') - self.assertEqual(pi.dtype_str, str(pi.dtype)) - - def test_view_asi8(self): - idx = pd.PeriodIndex([], freq='M') - - exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) - tm.assert_numpy_array_equal(idx.asi8, exp) - - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') - - exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) - tm.assert_numpy_array_equal(idx.asi8, exp) - - exp = np.array([14975, -9223372036854775808], dtype=np.int64) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.view('i8'), exp) - tm.assert_numpy_array_equal(idx.asi8, exp) - - def test_values(self): - idx = pd.PeriodIndex([], freq='M') - - exp = np.array([], dtype=np.object) - tm.assert_numpy_array_equal(idx.values, exp) - tm.assert_numpy_array_equal(idx.get_values(), exp) - exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) - - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') - - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) - tm.assert_numpy_array_equal(idx.values, exp) - tm.assert_numpy_array_equal(idx.get_values(), exp) - exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) - - idx = 
pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) - tm.assert_numpy_array_equal(idx.values, exp) - tm.assert_numpy_array_equal(idx.get_values(), exp) - exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) - - def test_asobject_like(self): - idx = pd.PeriodIndex([], freq='M') - - exp = np.array([], dtype=object) - tm.assert_numpy_array_equal(idx.asobject.values, exp) - tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') - - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) - tm.assert_numpy_array_equal(idx.asobject.values, exp) - tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.asobject.values, exp) - tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - - def test_is_(self): - create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', - end='12/1/2009') - index = create_index() - self.assertEqual(index.is_(index), True) - self.assertEqual(index.is_(create_index()), False) - self.assertEqual(index.is_(index.view()), True) - self.assertEqual( - index.is_(index.view().view().view().view().view()), True) - self.assertEqual(index.view().is_(index), True) - ind2 = index.view() - index.name = "Apple" - self.assertEqual(ind2.is_(index), True) - self.assertEqual(index.is_(index[:]), False) - self.assertEqual(index.is_(index.asfreq('M')), False) - self.assertEqual(index.is_(index.asfreq('A')), False) - self.assertEqual(index.is_(index - 2), False) - self.assertEqual(index.is_(index - 0), False) - - def test_comp_period(self): - idx = period_range('2007-01', periods=20, freq='M') - - result = idx < idx[10] - exp = idx.values < idx.values[10] - self.assert_numpy_array_equal(result, exp) - - def test_getitem_index(self): - idx = period_range('2007-01', periods=10, freq='M', name='x') - - result = idx[[1, 3, 5]] - exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], - freq='M', name='x') - tm.assert_index_equal(result, exp) - - result = idx[[True, True, False, False, False, - True, True, False, False, False]] - exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], - freq='M', name='x') - tm.assert_index_equal(result, exp) - - def test_getitem_partial(self): - rng = period_range('2007-01', periods=50, freq='M') - ts = Series(np.random.randn(len(rng)), rng) - - self.assertRaises(KeyError, ts.__getitem__, '2006') - - result = ts['2008'] - self.assertTrue((result.index.year == 2008).all()) - - result = ts['2008':'2009'] - self.assertEqual(len(result), 24) - - result = ts['2008-1':'2009-12'] - self.assertEqual(len(result), 24) - - result = ts['2008Q1':'2009Q4'] - self.assertEqual(len(result), 24) - - result = ts[:'2009'] - self.assertEqual(len(result), 36) - - result = ts['2009':] - self.assertEqual(len(result), 50 - 24) - - exp = result - result = ts[24:] - tm.assert_series_equal(exp, result) - - ts = ts[10:].append(ts[10:]) - self.assertRaisesRegexp(KeyError, - "left slice bound for non-unique " - "label: '2008'", - ts.__getitem__, slice('2008', '2009')) - - def test_getitem_datetime(self): - rng = period_range(start='2012-01-01', periods=10, freq='W-MON') - ts = Series(lrange(len(rng)), index=rng) - - dt1 = datetime(2011, 10, 2) - dt4 = datetime(2012, 4, 20) - - rs = ts[dt1:dt4] - tm.assert_series_equal(rs, ts) 
- - def test_getitem_nat(self): - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') - self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) - self.assertIs(idx[1], tslib.NaT) - - s = pd.Series([0, 1, 2], index=idx) - self.assertEqual(s[pd.NaT], 1) - - s = pd.Series(idx, index=idx) - self.assertEqual(s[pd.Period('2011-01', freq='M')], - pd.Period('2011-01', freq='M')) - self.assertIs(s[pd.NaT], tslib.NaT) - - def test_getitem_list_periods(self): - # GH 7710 - rng = period_range(start='2012-01-01', periods=10, freq='D') - ts = Series(lrange(len(rng)), index=rng) - exp = ts.iloc[[1]] - tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) - - def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - period_range('2014-01', periods=20, freq='M')) - SLC = pd.IndexSlice - - def assert_slices_equivalent(l_slc, i_slc): - tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) - tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - - assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) - - assert_slices_equivalent(SLC[:Period('2014-10'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10':-1], SLC[:8:-1]) - - assert_slices_equivalent(SLC['2015-02':'2014-10':-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[Period('2015-02'):Period('2014-10'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02':Period('2014-10'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Period('2015-02'):'2014-10':-1], - SLC[13:8:-1]) - - assert_slices_equivalent(SLC['2014-10':'2015-02':-1], SLC[:0]) - - def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), - period_range('2014-01', periods=20, freq='M')) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts[::0]) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - - def test_contains(self): - rng = period_range('2007-01', freq='M', periods=10) - - self.assertTrue(Period('2007-01', freq='M') in rng) - self.assertFalse(Period('2007-01', freq='D') in rng) - self.assertFalse(Period('2007-01', freq='2M') in rng) - - def test_contains_nat(self): - # GH13582 - idx = period_range('2007-01', freq='M', periods=10) - self.assertFalse(pd.NaT in idx) - self.assertFalse(None in idx) - self.assertFalse(float('nan') in idx) - self.assertFalse(np.nan in idx) - - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') - self.assertTrue(pd.NaT in idx) - self.assertTrue(None in idx) - self.assertTrue(float('nan') in idx) - self.assertTrue(np.nan in idx) - - def test_sub(self): - rng = period_range('2007-01', periods=50) - - result = rng - 5 - exp = rng + (-5) - tm.assert_index_equal(result, exp) - - def test_periods_number_check(self): - with tm.assertRaises(ValueError): - period_range('2011-1-1', '2012-1-1', 'B') - - def test_tolist(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - rs = index.tolist() - [tm.assertIsInstance(x, Period) for x in rs] - - recon = PeriodIndex(rs) - tm.assert_index_equal(index, recon) - - def test_to_timestamp(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - series = Series(1, index=index, name='foo') - - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - result = series.to_timestamp(how='end') - tm.assert_index_equal(result.index, 
exp_index) - self.assertEqual(result.name, 'foo') - - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = series.to_timestamp(how='start') - tm.assert_index_equal(result.index, exp_index) - - def _get_with_delta(delta, freq='A-DEC'): - return date_range(to_datetime('1/1/2001') + delta, - to_datetime('12/31/2009') + delta, freq=freq) - - delta = timedelta(hours=23) - result = series.to_timestamp('H', 'end') - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.index, exp_index) - - delta = timedelta(hours=23, minutes=59) - result = series.to_timestamp('T', 'end') - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.index, exp_index) - - result = series.to_timestamp('S', 'end') - delta = timedelta(hours=23, minutes=59, seconds=59) - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.index, exp_index) - - index = PeriodIndex(freq='H', start='1/1/2001', end='1/2/2001') - series = Series(1, index=index, name='foo') - - exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', - freq='H') - result = series.to_timestamp(how='end') - tm.assert_index_equal(result.index, exp_index) - self.assertEqual(result.name, 'foo') - - def test_to_timestamp_quarterly_bug(self): - years = np.arange(1960, 2000).repeat(4) - quarters = np.tile(lrange(1, 5), 40) - - pindex = PeriodIndex(year=years, quarter=quarters) - - stamps = pindex.to_timestamp('D', 'end') - expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) - tm.assert_index_equal(stamps, expected) - - def test_to_timestamp_preserve_name(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', - name='foo') - self.assertEqual(index.name, 'foo') - - conv = index.to_timestamp('D') - self.assertEqual(conv.name, 'foo') - - def test_to_timestamp_repr_is_code(self): - zs = [Timestamp('99-04-17 00:00:00', tz='UTC'), - Timestamp('2001-04-17 00:00:00', tz='UTC'), - Timestamp('2001-04-17 00:00:00', tz='America/Los_Angeles'), - Timestamp('2001-04-17 00:00:00', tz=None)] - for z in zs: - self.assertEqual(eval(repr(z)), z) - - def test_to_timestamp_pi_nat(self): - # GH 7228 - index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', - name='idx') - - result = index.to_timestamp('D') - expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1), - datetime(2011, 2, 1)], name='idx') - tm.assert_index_equal(result, expected) - self.assertEqual(result.name, 'idx') - - result2 = result.to_period(freq='M') - tm.assert_index_equal(result2, index) - self.assertEqual(result2.name, 'idx') - - result3 = result.to_period(freq='3M') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') - self.assert_index_equal(result3, exp) - self.assertEqual(result3.freqstr, '3M') - - msg = ('Frequency must be positive, because it' - ' represents span: -2A') - with tm.assertRaisesRegexp(ValueError, msg): - result.to_period(freq='-2A') - - def test_to_timestamp_pi_mult(self): - idx = PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='2M', name='idx') - result = idx.to_timestamp() - expected = DatetimeIndex( - ['2011-01-01', 'NaT', '2011-02-01'], name='idx') - self.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex( - ['2011-02-28', 'NaT', '2011-03-31'], name='idx') - self.assert_index_equal(result, expected) - - def test_to_timestamp_pi_combined(self): - idx = PeriodIndex(start='2011', periods=2, freq='1D1H', name='idx') - result = idx.to_timestamp() - expected = DatetimeIndex( - ['2011-01-01 00:00', '2011-01-02 01:00'], 
name='idx') - self.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex( - ['2011-01-02 00:59:59', '2011-01-03 01:59:59'], name='idx') - self.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E', freq='H') - expected = DatetimeIndex( - ['2011-01-02 00:00', '2011-01-03 01:00'], name='idx') - self.assert_index_equal(result, expected) - - def test_to_timestamp_to_period_astype(self): - idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') - - res = idx.astype('period[M]') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') - tm.assert_index_equal(res, exp) - - res = idx.astype('period[3M]') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') - self.assert_index_equal(res, exp) - - def test_start_time(self): - index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') - tm.assert_index_equal(index.start_time, expected_index) - - def test_end_time(self): - index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - tm.assert_index_equal(index.end_time, expected_index) - - def test_as_frame_columns(self): - rng = period_range('1/1/2000', periods=5) - df = DataFrame(randn(10, 5), columns=rng) - - ts = df[rng[0]] - tm.assert_series_equal(ts, df.iloc[:, 0]) - - # GH # 1211 - repr(df) - - ts = df['1/1/2000'] - tm.assert_series_equal(ts, df.iloc[:, 0]) - - def test_indexing(self): - - # GH 4390, iat incorrectly indexing - index = period_range('1/1/2001', periods=10) - s = Series(randn(10), index=index) - expected = s[index[0]] - result = s.iat[0] - self.assertEqual(expected, result) - - def test_frame_setitem(self): - rng = period_range('1/1/2000', periods=5, name='index') - df = DataFrame(randn(5, 3), index=rng) - - df['Index'] = rng - rs = Index(df['Index']) - tm.assert_index_equal(rs, rng, check_names=False) - self.assertEqual(rs.name, 'Index') - self.assertEqual(rng.name, 'index') - - rs = df.reset_index().set_index('index') - tm.assertIsInstance(rs.index, PeriodIndex) - tm.assert_index_equal(rs.index, rng) - - def test_period_set_index_reindex(self): - # GH 6631 - df = DataFrame(np.random.random(6)) - idx1 = period_range('2011/01/01', periods=6, freq='M') - idx2 = period_range('2013', periods=6, freq='A') - - df = df.set_index(idx1) - tm.assert_index_equal(df.index, idx1) - df = df.set_index(idx2) - tm.assert_index_equal(df.index, idx2) - - def test_frame_to_time_stamp(self): - K = 5 - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - df = DataFrame(randn(len(index), K), index=index) - df['mix'] = 'a' - - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - result = df.to_timestamp('D', 'end') - tm.assert_index_equal(result.index, exp_index) - tm.assert_numpy_array_equal(result.values, df.values) - - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = df.to_timestamp('D', 'start') - tm.assert_index_equal(result.index, exp_index) - - def _get_with_delta(delta, freq='A-DEC'): - return date_range(to_datetime('1/1/2001') + delta, - to_datetime('12/31/2009') + delta, freq=freq) - - delta = timedelta(hours=23) - result = df.to_timestamp('H', 'end') - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.index, exp_index) - - delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp('T', 'end') - exp_index = _get_with_delta(delta) - 
tm.assert_index_equal(result.index, exp_index) - - result = df.to_timestamp('S', 'end') - delta = timedelta(hours=23, minutes=59, seconds=59) - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.index, exp_index) - - # columns - df = df.T - - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - result = df.to_timestamp('D', 'end', axis=1) - tm.assert_index_equal(result.columns, exp_index) - tm.assert_numpy_array_equal(result.values, df.values) - - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = df.to_timestamp('D', 'start', axis=1) - tm.assert_index_equal(result.columns, exp_index) - - delta = timedelta(hours=23) - result = df.to_timestamp('H', 'end', axis=1) - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.columns, exp_index) - - delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp('T', 'end', axis=1) - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.columns, exp_index) - - result = df.to_timestamp('S', 'end', axis=1) - delta = timedelta(hours=23, minutes=59, seconds=59) - exp_index = _get_with_delta(delta) - tm.assert_index_equal(result.columns, exp_index) - - # invalid axis - tm.assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) - - result1 = df.to_timestamp('5t', axis=1) - result2 = df.to_timestamp('t', axis=1) - expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS') - self.assertTrue(isinstance(result1.columns, DatetimeIndex)) - self.assertTrue(isinstance(result2.columns, DatetimeIndex)) - self.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) - self.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) - # PeriodIndex.to_timestamp always use 'infer' - self.assertEqual(result1.columns.freqstr, 'AS-JAN') - self.assertEqual(result2.columns.freqstr, 'AS-JAN') - - def test_index_duplicate_periods(self): - # monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts[2007] - expected = ts[1:3] - tm.assert_series_equal(result, expected) - result[:] = 1 - self.assertTrue((ts[1:3] == 1).all()) - - # not monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN') - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts[2007] - expected = ts[idx == 2007] - tm.assert_series_equal(result, expected) - - def test_index_unique(self): - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') - expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') - self.assert_index_equal(idx.unique(), expected) - self.assertEqual(idx.nunique(), 3) - - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', - tz='US/Eastern') - expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', - tz='US/Eastern') - self.assert_index_equal(idx.unique(), expected) - self.assertEqual(idx.nunique(), 3) - - def test_constructor(self): - pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - self.assertEqual(len(pi), 9) - - pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') - self.assertEqual(len(pi), 4 * 9) - - pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - self.assertEqual(len(pi), 12 * 9) - - pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') - self.assertEqual(len(pi), 365 * 9 + 2) - - pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') - self.assertEqual(len(pi), 261 * 9) - - pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') - self.assertEqual(len(pi), 365 * 24) - - pi = 
PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') - self.assertEqual(len(pi), 24 * 60) - - pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') - self.assertEqual(len(pi), 24 * 60 * 60) - - start = Period('02-Apr-2005', 'B') - i1 = PeriodIndex(start=start, periods=20) - self.assertEqual(len(i1), 20) - self.assertEqual(i1.freq, start.freq) - self.assertEqual(i1[0], start) - - end_intv = Period('2006-12-31', 'W') - i1 = PeriodIndex(end=end_intv, periods=10) - self.assertEqual(len(i1), 10) - self.assertEqual(i1.freq, end_intv.freq) - self.assertEqual(i1[-1], end_intv) - - end_intv = Period('2006-12-31', '1w') - i2 = PeriodIndex(end=end_intv, periods=10) - self.assertEqual(len(i1), len(i2)) - self.assertTrue((i1 == i2).all()) - self.assertEqual(i1.freq, i2.freq) - - end_intv = Period('2006-12-31', ('w', 1)) - i2 = PeriodIndex(end=end_intv, periods=10) - self.assertEqual(len(i1), len(i2)) - self.assertTrue((i1 == i2).all()) - self.assertEqual(i1.freq, i2.freq) - - try: - PeriodIndex(start=start, end=end_intv) - raise AssertionError('Cannot allow mixed freq for start and end') - except ValueError: - pass - - end_intv = Period('2005-05-01', 'B') - i1 = PeriodIndex(start=start, end=end_intv) - - try: - PeriodIndex(start=start) - raise AssertionError( - 'Must specify periods if missing start or end') - except ValueError: - pass - - # infer freq from first element - i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) - self.assertEqual(len(i2), 2) - self.assertEqual(i2[0], end_intv) - - i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) - self.assertEqual(len(i2), 2) - self.assertEqual(i2[0], end_intv) - - # Mixed freq should fail - vals = [end_intv, Period('2006-12-31', 'w')] - self.assertRaises(ValueError, PeriodIndex, vals) - vals = np.array(vals) - self.assertRaises(ValueError, PeriodIndex, vals) - - def test_numpy_repeat(self): - index = period_range('20010101', periods=2) - expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), - Period('2001-01-02'), Period('2001-01-02')]) - - tm.assert_index_equal(np.repeat(index, 2), expected) - - msg = "the 'axis' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.repeat, index, 2, axis=1) - - def test_shift(self): - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') - - tm.assert_index_equal(pi1.shift(0), pi1) - - self.assertEqual(len(pi1), len(pi2)) - self.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') - self.assertEqual(len(pi1), len(pi2)) - self.assert_index_equal(pi1.shift(-1), pi2) - - pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') - self.assertEqual(len(pi1), len(pi2)) - self.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') - self.assertEqual(len(pi1), len(pi2)) - self.assert_index_equal(pi1.shift(-1), pi2) - - pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') - self.assertEqual(len(pi1), len(pi2)) - self.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') - 
self.assertEqual(len(pi1), len(pi2)) - self.assert_index_equal(pi1.shift(-1), pi2) - - def test_shift_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(1) - expected = PeriodIndex(['2011-02', '2011-03', 'NaT', - '2011-05'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) - - def test_shift_ndarray(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex(['2011-02', '2011-04', 'NaT', - '2011-08'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex(['2011-02', '2010-12', 'NaT', - '2010-12'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - - def test_asfreq(self): - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') - pi2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') - pi3 = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2001') - pi4 = PeriodIndex(freq='D', start='1/1/2001', end='1/1/2001') - pi5 = PeriodIndex(freq='H', start='1/1/2001', end='1/1/2001 00:00') - pi6 = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 00:00') - pi7 = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') - - self.assertEqual(pi1.asfreq('Q', 'S'), pi2) - self.assertEqual(pi1.asfreq('Q', 's'), pi2) - self.assertEqual(pi1.asfreq('M', 'start'), pi3) - self.assertEqual(pi1.asfreq('D', 'StarT'), pi4) - self.assertEqual(pi1.asfreq('H', 'beGIN'), pi5) - self.assertEqual(pi1.asfreq('Min', 'S'), pi6) - self.assertEqual(pi1.asfreq('S', 'S'), pi7) - - self.assertEqual(pi2.asfreq('A', 'S'), pi1) - self.assertEqual(pi2.asfreq('M', 'S'), pi3) - self.assertEqual(pi2.asfreq('D', 'S'), pi4) - self.assertEqual(pi2.asfreq('H', 'S'), pi5) - self.assertEqual(pi2.asfreq('Min', 'S'), pi6) - self.assertEqual(pi2.asfreq('S', 'S'), pi7) - - self.assertEqual(pi3.asfreq('A', 'S'), pi1) - self.assertEqual(pi3.asfreq('Q', 'S'), pi2) - self.assertEqual(pi3.asfreq('D', 'S'), pi4) - self.assertEqual(pi3.asfreq('H', 'S'), pi5) - self.assertEqual(pi3.asfreq('Min', 'S'), pi6) - self.assertEqual(pi3.asfreq('S', 'S'), pi7) - - self.assertEqual(pi4.asfreq('A', 'S'), pi1) - self.assertEqual(pi4.asfreq('Q', 'S'), pi2) - self.assertEqual(pi4.asfreq('M', 'S'), pi3) - self.assertEqual(pi4.asfreq('H', 'S'), pi5) - self.assertEqual(pi4.asfreq('Min', 'S'), pi6) - self.assertEqual(pi4.asfreq('S', 'S'), pi7) - - self.assertEqual(pi5.asfreq('A', 'S'), pi1) - self.assertEqual(pi5.asfreq('Q', 'S'), pi2) - self.assertEqual(pi5.asfreq('M', 'S'), pi3) - self.assertEqual(pi5.asfreq('D', 'S'), pi4) - self.assertEqual(pi5.asfreq('Min', 'S'), pi6) - self.assertEqual(pi5.asfreq('S', 'S'), pi7) - - self.assertEqual(pi6.asfreq('A', 'S'), pi1) - self.assertEqual(pi6.asfreq('Q', 'S'), pi2) - self.assertEqual(pi6.asfreq('M', 'S'), pi3) - self.assertEqual(pi6.asfreq('D', 'S'), pi4) - self.assertEqual(pi6.asfreq('H', 'S'), pi5) - self.assertEqual(pi6.asfreq('S', 'S'), pi7) - - self.assertEqual(pi7.asfreq('A', 'S'), pi1) - self.assertEqual(pi7.asfreq('Q', 'S'), pi2) - self.assertEqual(pi7.asfreq('M', 'S'), pi3) - self.assertEqual(pi7.asfreq('D', 'S'), pi4) - self.assertEqual(pi7.asfreq('H', 'S'), pi5) - self.assertEqual(pi7.asfreq('Min', 'S'), pi6) - - self.assertRaises(ValueError, pi7.asfreq, 'T', 'foo') - result1 = 
pi1.asfreq('3M') - result2 = pi1.asfreq('M') - expected = PeriodIndex(freq='M', start='2001-12', end='2001-12') - self.assert_numpy_array_equal(result1.asi8, expected.asi8) - self.assertEqual(result1.freqstr, '3M') - self.assert_numpy_array_equal(result2.asi8, expected.asi8) - self.assertEqual(result2.freqstr, 'M') - - def test_asfreq_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M') - result = idx.asfreq(freq='Q') - expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') - tm.assert_index_equal(result, expected) - - def test_asfreq_mult_pi(self): - pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M') - - for freq in ['D', '3D']: - result = pi.asfreq(freq) - exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT', - '2001-04-30'], freq=freq) - self.assert_index_equal(result, exp) - self.assertEqual(result.freq, exp.freq) - - result = pi.asfreq(freq, how='S') - exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT', - '2001-03-01'], freq=freq) - self.assert_index_equal(result, exp) - self.assertEqual(result.freq, exp.freq) - - def test_asfreq_combined_pi(self): - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='H') - exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']): - result = pi.asfreq(freq, how=how) - self.assert_index_equal(result, exp) - self.assertEqual(result.freq, exp.freq) - - for freq in ['1D1H', '1H1D']: - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', - 'NaT'], freq=freq) - result = pi.asfreq('H') - exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'], - freq='H') - self.assert_index_equal(result, exp) - self.assertEqual(result.freq, exp.freq) - - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', - 'NaT'], freq=freq) - result = pi.asfreq('H', how='S') - exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='H') - self.assert_index_equal(result, exp) - self.assertEqual(result.freq, exp.freq) - - def test_period_index_length(self): - pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - self.assertEqual(len(pi), 9) - - pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') - self.assertEqual(len(pi), 4 * 9) - - pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - self.assertEqual(len(pi), 12 * 9) - - start = Period('02-Apr-2005', 'B') - i1 = PeriodIndex(start=start, periods=20) - self.assertEqual(len(i1), 20) - self.assertEqual(i1.freq, start.freq) - self.assertEqual(i1[0], start) - - end_intv = Period('2006-12-31', 'W') - i1 = PeriodIndex(end=end_intv, periods=10) - self.assertEqual(len(i1), 10) - self.assertEqual(i1.freq, end_intv.freq) - self.assertEqual(i1[-1], end_intv) - - end_intv = Period('2006-12-31', '1w') - i2 = PeriodIndex(end=end_intv, periods=10) - self.assertEqual(len(i1), len(i2)) - self.assertTrue((i1 == i2).all()) - self.assertEqual(i1.freq, i2.freq) - - end_intv = Period('2006-12-31', ('w', 1)) - i2 = PeriodIndex(end=end_intv, periods=10) - self.assertEqual(len(i1), len(i2)) - self.assertTrue((i1 == i2).all()) - self.assertEqual(i1.freq, i2.freq) - - try: - PeriodIndex(start=start, end=end_intv) - raise AssertionError('Cannot allow mixed freq for start and end') - except ValueError: - pass - - end_intv = Period('2005-05-01', 'B') - i1 = PeriodIndex(start=start, end=end_intv) - - try: - PeriodIndex(start=start) - raise AssertionError( - 'Must specify periods if missing start or end') - except ValueError: - 
pass - - # infer freq from first element - i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) - self.assertEqual(len(i2), 2) - self.assertEqual(i2[0], end_intv) - - i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) - self.assertEqual(len(i2), 2) - self.assertEqual(i2[0], end_intv) - - # Mixed freq should fail - vals = [end_intv, Period('2006-12-31', 'w')] - self.assertRaises(ValueError, PeriodIndex, vals) - vals = np.array(vals) - self.assertRaises(ValueError, PeriodIndex, vals) - - def test_frame_index_to_string(self): - index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') - frame = DataFrame(np.random.randn(3, 4), index=index) - - # it works! - frame.to_string() - - def test_asfreq_ts(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/31/2010') - ts = Series(np.random.randn(len(index)), index=index) - df = DataFrame(np.random.randn(len(index), 3), index=index) - - result = ts.asfreq('D', how='end') - df_result = df.asfreq('D', how='end') - exp_index = index.asfreq('D', how='end') - self.assertEqual(len(result), len(ts)) - tm.assert_index_equal(result.index, exp_index) - tm.assert_index_equal(df_result.index, exp_index) - - result = ts.asfreq('D', how='start') - self.assertEqual(len(result), len(ts)) - tm.assert_index_equal(result.index, index.asfreq('D', how='start')) - - def test_badinput(self): - self.assertRaises(ValueError, Period, '-2000', 'A') - self.assertRaises(tslib.DateParseError, Period, '0', 'A') - self.assertRaises(tslib.DateParseError, Period, '1/1/-2000', 'A') - - def test_negative_ordinals(self): - Period(ordinal=-1000, freq='A') - Period(ordinal=0, freq='A') - - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A') - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') - tm.assert_index_equal(idx1, idx2) - - def test_dti_to_period(self): - dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') - pi1 = dti.to_period() - pi2 = dti.to_period(freq='D') - pi3 = dti.to_period(freq='3D') - - self.assertEqual(pi1[0], Period('Jan 2005', freq='M')) - self.assertEqual(pi2[0], Period('1/31/2005', freq='D')) - self.assertEqual(pi3[0], Period('1/31/2005', freq='3D')) - - self.assertEqual(pi1[-1], Period('Nov 2005', freq='M')) - self.assertEqual(pi2[-1], Period('11/30/2005', freq='D')) - self.assertEqual(pi3[-1], Period('11/30/2005', freq='3D')) - - tm.assert_index_equal(pi1, period_range('1/1/2005', '11/1/2005', - freq='M')) - tm.assert_index_equal(pi2, period_range('1/1/2005', '11/1/2005', - freq='M').asfreq('D')) - tm.assert_index_equal(pi3, period_range('1/1/2005', '11/1/2005', - freq='M').asfreq('3D')) - - def test_pindex_slice_index(self): - pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='M') - s = Series(np.random.rand(len(pi)), index=pi) - res = s['2010'] - exp = s[0:12] - tm.assert_series_equal(res, exp) - res = s['2011'] - exp = s[12:24] - tm.assert_series_equal(res, exp) - - def test_getitem_day(self): - # GH 6716 - # Confirm DatetimeIndex and PeriodIndex works identically - didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) - pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) - - for idx in [didx, pidx]: - # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] - for v in values: - - if _np_version_under1p9: - with tm.assertRaises(ValueError): - idx[v] - else: - # GH7116 - # these show deprecations as we are trying - # to slice with non-integer indexers - # with tm.assertRaises(IndexError): - # idx[v] - continue - 
-            s = Series(np.random.rand(len(idx)), index=idx)
-            tm.assert_series_equal(s['2013/01'], s[0:31])
-            tm.assert_series_equal(s['2013/02'], s[31:59])
-            tm.assert_series_equal(s['2014'], s[365:])
-
-            invalid = ['2013/02/01 9H', '2013/02/01 09:00']
-            for v in invalid:
-                with tm.assertRaises(KeyError):
-                    s[v]
-
-    def test_range_slice_day(self):
-        # GH 6716
-        didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400)
-        pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400)
-
-        # changed to TypeError in 1.12
-        # https://github.com/numpy/numpy/pull/6271
-        exc = IndexError if _np_version_under1p12 else TypeError
-
-        for idx in [didx, pidx]:
-            # slices against index should raise IndexError
-            values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
-                      '2013/02/01 09:00']
-            for v in values:
-                with tm.assertRaises(exc):
-                    idx[v:]
-
-            s = Series(np.random.rand(len(idx)), index=idx)
-
-            tm.assert_series_equal(s['2013/01/02':], s[1:])
-            tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5])
-            tm.assert_series_equal(s['2013/02':], s[31:])
-            tm.assert_series_equal(s['2014':], s[365:])
-
-            invalid = ['2013/02/01 9H', '2013/02/01 09:00']
-            for v in invalid:
-                with tm.assertRaises(exc):
-                    idx[v:]
-
-    def test_getitem_seconds(self):
-        # GH 6716
-        didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S',
-                             periods=4000)
-        pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000)
-
-        for idx in [didx, pidx]:
-            # getitem against index should raise ValueError
-            values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
-                      '2013/02/01 09:00']
-            for v in values:
-                if _np_version_under1p9:
-                    with tm.assertRaises(ValueError):
-                        idx[v]
-                else:
-                    # GH7116
-                    # these show deprecations as we are trying
-                    # to slice with non-integer indexers
-                    # with tm.assertRaises(IndexError):
-                    #     idx[v]
-                    continue
-
-            s = Series(np.random.rand(len(idx)), index=idx)
-            tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660])
-            tm.assert_series_equal(s['2013/01/01 9H'], s[:3600])
-            for d in ['2013/01/01', '2013/01', '2013']:
-                tm.assert_series_equal(s[d], s)
-
-    def test_range_slice_seconds(self):
-        # GH 6716
-        didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S',
-                             periods=4000)
-        pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000)
-
-        # changed to TypeError in 1.12
-        # https://github.com/numpy/numpy/pull/6271
-        exc = IndexError if _np_version_under1p12 else TypeError
-
-        for idx in [didx, pidx]:
-            # slices against index should raise IndexError
-            values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
-                      '2013/02/01 09:00']
-            for v in values:
-                with tm.assertRaises(exc):
-                    idx[v:]
-
-            s = Series(np.random.rand(len(idx)), index=idx)
-
-            tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'],
-                                   s[300:660])
-            tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'],
-                                   s[3600:3960])
-            tm.assert_series_equal(s['2013/01/01 10H':], s[3600:])
-            tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860])
-            for d in ['2013/01/01', '2013/01', '2013']:
-                tm.assert_series_equal(s[d:], s)
-
-    def test_range_slice_outofbounds(self):
-        # GH 5407
-        didx = DatetimeIndex(start='2013/10/01', freq='D', periods=10)
-        pidx = PeriodIndex(start='2013/10/01', freq='D', periods=10)
-
-        for idx in [didx, pidx]:
-            df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx)
-            empty = DataFrame(index=idx.__class__([], freq='D'),
-                              columns=['units'])
-            empty['units'] = empty['units'].astype('int64')
-
-            tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty)
-            tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2])
-            tm.assert_frame_equal(df['2013/10/01':'2013/10/02'], df.iloc[:2])
-            tm.assert_frame_equal(df['2013/10/02':'2013/09/30'], empty)
-            tm.assert_frame_equal(df['2013/10/15':'2013/10/17'], empty)
-            tm.assert_frame_equal(df['2013-06':'2013-09'], empty)
-            tm.assert_frame_equal(df['2013-11':'2013-12'], empty)
-
-    def test_astype_asfreq(self):
-        pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D')
-        exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M')
-        tm.assert_index_equal(pi1.asfreq('M'), exp)
-        tm.assert_index_equal(pi1.astype('period[M]'), exp)
-
-        exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M')
-        tm.assert_index_equal(pi1.asfreq('3M'), exp)
-        tm.assert_index_equal(pi1.astype('period[3M]'), exp)
-
-    def test_pindex_fieldaccessor_nat(self):
-        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
-                           '2012-03', '2012-04'], freq='D')
-
-        exp = np.array([2011, 2011, -1, 2012, 2012], dtype=np.int64)
-        self.assert_numpy_array_equal(idx.year, exp)
-        exp = np.array([1, 2, -1, 3, 4], dtype=np.int64)
-        self.assert_numpy_array_equal(idx.month, exp)
-
-    def test_pindex_qaccess(self):
-        pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q')
-        s = Series(np.random.rand(len(pi)), index=pi).cumsum()
-        # Todo: fix these accessors!
-        self.assertEqual(s['05Q4'], s[2])
-
-    def test_period_dt64_round_trip(self):
-        dti = date_range('1/1/2000', '1/7/2002', freq='B')
-        pi = dti.to_period()
-        tm.assert_index_equal(pi.to_timestamp(), dti)
-
-        dti = date_range('1/1/2000', '1/7/2002', freq='B')
-        pi = dti.to_period(freq='H')
-        tm.assert_index_equal(pi.to_timestamp(), dti)
-
-    def test_period_astype_to_timestamp(self):
-        pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M')
-
-        exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'])
-        tm.assert_index_equal(pi.astype('datetime64[ns]'), exp)
-
-        exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'])
-        tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp)
-
-        exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
-                               tz='US/Eastern')
-        res = pi.astype('datetime64[ns, US/Eastern]')
-        tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp)
-
-        exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'],
-                               tz='US/Eastern')
-        res = pi.astype('datetime64[ns, US/Eastern]', how='end')
-        tm.assert_index_equal(res, exp)
-
-    def test_to_period_quarterly(self):
-        # make sure we can make the round trip
-        for month in MONTHS:
-            freq = 'Q-%s' % month
-            rng = period_range('1989Q3', '1991Q3', freq=freq)
-            stamps = rng.to_timestamp()
-            result = stamps.to_period(freq)
-            tm.assert_index_equal(rng, result)
-
-    def test_to_period_quarterlyish(self):
-        offsets = ['BQ', 'QS', 'BQS']
-        for off in offsets:
-            rng = date_range('01-Jan-2012', periods=8, freq=off)
-            prng = rng.to_period()
-            self.assertEqual(prng.freq, 'Q-DEC')
-
-    def test_to_period_annualish(self):
-        offsets = ['BA', 'AS', 'BAS']
-        for off in offsets:
-            rng = date_range('01-Jan-2012', periods=8, freq=off)
-            prng = rng.to_period()
-            self.assertEqual(prng.freq, 'A-DEC')
-
-    def test_to_period_monthish(self):
-        offsets = ['MS', 'BM']
-        for off in offsets:
-            rng = date_range('01-Jan-2012', periods=8, freq=off)
-            prng = rng.to_period()
-            self.assertEqual(prng.freq, 'M')
-
-        rng = date_range('01-Jan-2012', periods=8, freq='M')
-        prng = rng.to_period()
-        self.assertEqual(prng.freq, 'M')
-
-        msg = pd.tseries.frequencies._INVALID_FREQ_ERROR
-        with self.assertRaisesRegexp(ValueError, msg):
-            date_range('01-Jan-2012', periods=8, freq='EOM')
-
-    def test_multiples(self):
-        result1 = Period('1989', freq='2A')
-        result2 = Period('1989', freq='A')
-        self.assertEqual(result1.ordinal, result2.ordinal)
-        self.assertEqual(result1.freqstr, '2A-DEC')
-        self.assertEqual(result2.freqstr, 'A-DEC')
-        self.assertEqual(result1.freq, offsets.YearEnd(2))
-        self.assertEqual(result2.freq, offsets.YearEnd())
-
-        self.assertEqual((result1 + 1).ordinal, result1.ordinal + 2)
-        self.assertEqual((1 + result1).ordinal, result1.ordinal + 2)
-        self.assertEqual((result1 - 1).ordinal, result2.ordinal - 2)
-        self.assertEqual((-1 + result1).ordinal, result2.ordinal - 2)
-
-    def test_pindex_multiples(self):
-        pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M')
-        expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07',
-                                '2011-09', '2011-11'], freq='2M')
-        tm.assert_index_equal(pi, expected)
-        self.assertEqual(pi.freq, offsets.MonthEnd(2))
-        self.assertEqual(pi.freqstr, '2M')
-
-        pi = period_range(start='1/1/11', end='12/31/11', freq='2M')
-        tm.assert_index_equal(pi, expected)
-        self.assertEqual(pi.freq, offsets.MonthEnd(2))
-        self.assertEqual(pi.freqstr, '2M')
-
-        pi = period_range(start='1/1/11', periods=6, freq='2M')
-        tm.assert_index_equal(pi, expected)
-        self.assertEqual(pi.freq, offsets.MonthEnd(2))
-        self.assertEqual(pi.freqstr, '2M')
-
-    def test_iteration(self):
-        index = PeriodIndex(start='1/1/10', periods=4, freq='B')
-
-        result = list(index)
-        tm.assertIsInstance(result[0], Period)
-        self.assertEqual(result[0].freq, index.freq)
-
-    def test_take(self):
-        index = PeriodIndex(start='1/1/10', end='12/31/12', freq='D',
-                            name='idx')
-        expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7),
-                                datetime(2010, 1, 9), datetime(2010, 1, 13)],
-                               freq='D', name='idx')
-
-        taken1 = index.take([5, 6, 8, 12])
-        taken2 = index[[5, 6, 8, 12]]
-
-        for taken in [taken1, taken2]:
-            tm.assert_index_equal(taken, expected)
-            tm.assertIsInstance(taken, PeriodIndex)
-            self.assertEqual(taken.freq, index.freq)
-            self.assertEqual(taken.name, expected.name)
-
-    def test_take_fill_value(self):
-        # GH 12631
-        idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
-                             name='xxx', freq='D')
-        result = idx.take(np.array([1, 0, -1]))
-        expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
-                                  name='xxx', freq='D')
-        tm.assert_index_equal(result, expected)
-
-        # fill_value
-        result = idx.take(np.array([1, 0, -1]), fill_value=True)
-        expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'],
-                                  name='xxx', freq='D')
-        tm.assert_index_equal(result, expected)
-
-        # allow_fill=False
-        result = idx.take(np.array([1, 0, -1]), allow_fill=False,
-                          fill_value=True)
-        expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
-                                  name='xxx', freq='D')
-        tm.assert_index_equal(result, expected)
-
-        msg = ('When allow_fill=True and fill_value is not None, '
-               'all indices must be >= -1')
-        with tm.assertRaisesRegexp(ValueError, msg):
-            idx.take(np.array([1, 0, -2]), fill_value=True)
-        with tm.assertRaisesRegexp(ValueError, msg):
-            idx.take(np.array([1, 0, -5]), fill_value=True)
-
-        with tm.assertRaises(IndexError):
-            idx.take(np.array([1, -5]))
-
-    def test_joins(self):
-        index = period_range('1/1/2000', '1/20/2000', freq='D')
-
-        for kind in ['inner', 'outer', 'left', 'right']:
-            joined = index.join(index[:-5], how=kind)
-
-            tm.assertIsInstance(joined, PeriodIndex)
-            self.assertEqual(joined.freq, index.freq)
-
-    def test_join_self(self):
-        index = period_range('1/1/2000', '1/20/2000', freq='D')
-
-        for kind in ['inner', 'outer', 'left', 'right']:
-            res = index.join(index, how=kind)
-            self.assertIs(index, res)
-
-    def test_join_does_not_recur(self):
-        df = tm.makeCustomDataframe(
-            3, 2, data_gen_f=lambda *args: np.random.randint(2),
-            c_idx_type='p', r_idx_type='dt')
-        s = df.iloc[:2, 0]
-
-        res = s.index.join(df.columns, how='outer')
-        expected = Index([s.index[0], s.index[1],
-                          df.columns[0], df.columns[1]], object)
-        tm.assert_index_equal(res, expected)
-
-    def test_align_series(self):
-        rng = period_range('1/1/2000', '1/1/2010', freq='A')
-        ts = Series(np.random.randn(len(rng)), index=rng)
-
-        result = ts + ts[::2]
-        expected = ts + ts
-        expected[1::2] = np.nan
-        tm.assert_series_equal(result, expected)
-
-        result = ts + _permute(ts[::2])
-        tm.assert_series_equal(result, expected)
-
-        # it works!
-        for kind in ['inner', 'outer', 'left', 'right']:
-            ts.align(ts[::2], join=kind)
-        msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)"
-        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-            ts + ts.asfreq('D', how="end")
-
-    def test_align_frame(self):
-        rng = period_range('1/1/2000', '1/1/2010', freq='A')
-        ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
-
-        result = ts + ts[::2]
-        expected = ts + ts
-        expected.values[1::2] = np.nan
-        tm.assert_frame_equal(result, expected)
-
-        result = ts + _permute(ts[::2])
-        tm.assert_frame_equal(result, expected)
-
-    def test_union(self):
-        index = period_range('1/1/2000', '1/20/2000', freq='D')
-
-        result = index[:-5].union(index[10:])
-        tm.assert_index_equal(result, index)
-
-        # not in order
-        result = _permute(index[:-5]).union(_permute(index[10:]))
-        tm.assert_index_equal(result, index)
-
-        # raise if different frequencies
-        index = period_range('1/1/2000', '1/20/2000', freq='D')
-        index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED')
-        with tm.assertRaises(period.IncompatibleFrequency):
-            index.union(index2)
-
-        msg = 'can only call with other PeriodIndex-ed objects'
-        with tm.assertRaisesRegexp(ValueError, msg):
-            index.join(index.to_timestamp())
-
-        index3 = period_range('1/1/2000', '1/20/2000', freq='2D')
-        with tm.assertRaises(period.IncompatibleFrequency):
-            index.join(index3)
-
-    def test_union_dataframe_index(self):
-        rng1 = pd.period_range('1/1/1999', '1/1/2012', freq='M')
-        s1 = pd.Series(np.random.randn(len(rng1)), rng1)
-
-        rng2 = pd.period_range('1/1/1980', '12/1/2001', freq='M')
-        s2 = pd.Series(np.random.randn(len(rng2)), rng2)
-        df = pd.DataFrame({'s1': s1, 's2': s2})
-
-        exp = pd.period_range('1/1/1980', '1/1/2012', freq='M')
-        self.assert_index_equal(df.index, exp)
-
-    def test_intersection(self):
-        index = period_range('1/1/2000', '1/20/2000', freq='D')
-
-        result = index[:-5].intersection(index[10:])
-        tm.assert_index_equal(result, index[10:-5])
-
-        # not in order
-        left = _permute(index[:-5])
-        right = _permute(index[10:])
-        result = left.intersection(right).sort_values()
-        tm.assert_index_equal(result, index[10:-5])
-
-        # raise if different frequencies
-        index = period_range('1/1/2000', '1/20/2000', freq='D')
-        index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED')
-        with tm.assertRaises(period.IncompatibleFrequency):
-            index.intersection(index2)
-
-        index3 = period_range('1/1/2000', '1/20/2000', freq='2D')
-        with tm.assertRaises(period.IncompatibleFrequency):
-            index.intersection(index3)
-
-    def test_intersection_cases(self):
-        base = period_range('6/1/2000', '6/30/2000', freq='D', name='idx')
-
-        # if target has the same name, it is preserved
-        rng2 = period_range('5/15/2000', '6/20/2000', freq='D', name='idx')
-        expected2 = period_range('6/1/2000', '6/20/2000', freq='D',
-                                 name='idx')
-
-        # if target name is different, it will be reset
-        rng3 = period_range('5/15/2000', '6/20/2000', freq='D', name='other')
-        expected3 = period_range('6/1/2000', '6/20/2000', freq='D',
-                                 name=None)
-
-        rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx')
-        expected4 = PeriodIndex([], name='idx', freq='D')
-
-        for (rng, expected) in [(rng2, expected2), (rng3, expected3),
-                                (rng4, expected4)]:
-            result = base.intersection(rng)
-            tm.assert_index_equal(result, expected)
-            self.assertEqual(result.name, expected.name)
-            self.assertEqual(result.freq, expected.freq)
-
-        # non-monotonic
-        base = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-02',
-                            '2011-01-03'], freq='D', name='idx')
-
-        rng2 = PeriodIndex(['2011-01-04', '2011-01-02',
-                            '2011-02-02', '2011-02-03'],
-                           freq='D', name='idx')
-        expected2 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D',
-                                name='idx')
-
-        rng3 = PeriodIndex(['2011-01-04', '2011-01-02', '2011-02-02',
-                            '2011-02-03'],
-                           freq='D', name='other')
-        expected3 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D',
-                                name=None)
-
-        rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx')
-        expected4 = PeriodIndex([], freq='D', name='idx')
-
-        for (rng, expected) in [(rng2, expected2), (rng3, expected3),
-                                (rng4, expected4)]:
-            result = base.intersection(rng)
-            tm.assert_index_equal(result, expected)
-            self.assertEqual(result.name, expected.name)
-            self.assertEqual(result.freq, 'D')
-
-        # empty same freq
-        rng = date_range('6/1/2000', '6/15/2000', freq='T')
-        result = rng[0:0].intersection(rng)
-        self.assertEqual(len(result), 0)
-
-        result = rng.intersection(rng[0:0])
-        self.assertEqual(len(result), 0)
-
-    def test_fields(self):
-        # year, month, day, hour, minute
-        # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter
-        # qyear
-        pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2005')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2002')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2002')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='D', start='12/1/2001', end='6/1/2001')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='B', start='12/1/2001', end='6/1/2001')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='H', start='12/31/2001', end='1/1/2002 23:00')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='Min', start='12/31/2001', end='1/1/2002 00:20')
-        self._check_all_fields(pi)
-
-        pi = PeriodIndex(freq='S', start='12/31/2001 00:00:00',
-                         end='12/31/2001 00:05:00')
-        self._check_all_fields(pi)
-
-        end_intv = Period('2006-12-31', 'W')
-        i1 = PeriodIndex(end=end_intv, periods=10)
-        self._check_all_fields(i1)
-
-    def _check_all_fields(self, periodindex):
-        fields = ['year', 'month', 'day', 'hour', 'minute', 'second',
-                  'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear',
-                  'quarter', 'qyear', 'days_in_month', 'is_leap_year']
-
-        periods = list(periodindex)
-        s = pd.Series(periodindex)
-
-        for field in fields:
-            field_idx = getattr(periodindex, field)
-            self.assertEqual(len(periodindex), len(field_idx))
-            for x, val in zip(periods, field_idx):
-                self.assertEqual(getattr(x, field), val)
-
-            if len(s) == 0:
-                continue
-
-            field_s = getattr(s.dt, field)
-            self.assertEqual(len(periodindex), len(field_s))
-            for x, val in zip(periods, field_s):
-                self.assertEqual(getattr(x, field), val)
-
-    def test_is_full(self):
-        index = PeriodIndex([2005, 2007, 2009], freq='A')
-        self.assertFalse(index.is_full)
-
-        index = PeriodIndex([2005, 2006, 2007], freq='A')
-        self.assertTrue(index.is_full)
-
-        index = PeriodIndex([2005, 2005, 2007], freq='A')
-        self.assertFalse(index.is_full)
-
-        index = PeriodIndex([2005, 2005, 2006], freq='A')
-        self.assertTrue(index.is_full)
-
-        index = PeriodIndex([2006, 2005, 2005], freq='A')
-        self.assertRaises(ValueError, getattr, index, 'is_full')
-
-        self.assertTrue(index[:0].is_full)
-
-    def test_map(self):
-        index = PeriodIndex([2005, 2007, 2009], freq='A')
-        result = index.map(lambda x: x + 1)
-        expected = index + 1
-        tm.assert_index_equal(result, expected)
-
-        result = index.map(lambda x: x.ordinal)
-        exp = Index([x.ordinal for x in index])
-        tm.assert_index_equal(result, exp)
-
-    def test_map_with_string_constructor(self):
-        raw = [2005, 2007, 2009]
-        index = PeriodIndex(raw, freq='A')
-        types = str,
-
-        if PY3:
-            # unicode
-            types += text_type,
-
-        for t in types:
-            expected = Index(lmap(t, raw))
-            res = index.map(t)
-
-            # should return an Index
-            tm.assertIsInstance(res, Index)
-
-            # preserve element types
-            self.assertTrue(all(isinstance(resi, t) for resi in res))
-
-            # lastly, values should compare equal
-            tm.assert_index_equal(res, expected)
-
-    def test_convert_array_of_periods(self):
-        rng = period_range('1/1/2000', periods=20, freq='D')
-        periods = list(rng)
-
-        result = pd.Index(periods)
-        tm.assertIsInstance(result, PeriodIndex)
-
-    def test_with_multi_index(self):
-        # #1705
-        index = date_range('1/1/2012', periods=4, freq='12H')
-        index_as_arrays = [index.to_period(freq='D'), index.hour]
-
-        s = Series([0, 1, 2, 3], index_as_arrays)
-
-        tm.assertIsInstance(s.index.levels[0], PeriodIndex)
-
-        tm.assertIsInstance(s.index.values[0][0], Period)
-
-    def test_to_timestamp_1703(self):
-        index = period_range('1/1/2012', periods=4, freq='D')
-
-        result = index.to_timestamp()
-        self.assertEqual(result[0], Timestamp('1/1/2012'))
-
-    def test_to_datetime_depr(self):
-        index = period_range('1/1/2012', periods=4, freq='D')
-
-        with tm.assert_produces_warning(FutureWarning,
-                                        check_stacklevel=False):
-            result = index.to_datetime()
-        self.assertEqual(result[0], Timestamp('1/1/2012'))
-
-    def test_get_loc_msg(self):
-        idx = period_range('2000-1-1', freq='A', periods=10)
-        bad_period = Period('2012', 'A')
-        self.assertRaises(KeyError, idx.get_loc, bad_period)
-
-        try:
-            idx.get_loc(bad_period)
-        except KeyError as inst:
-            self.assertEqual(inst.args[0], bad_period)
-
-    def test_get_loc_nat(self):
-        didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'])
-        pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M')
-
-        # check DatetimeIndex compat
-        for idx in [didx, pidx]:
-            self.assertEqual(idx.get_loc(pd.NaT), 1)
-            self.assertEqual(idx.get_loc(None), 1)
-            self.assertEqual(idx.get_loc(float('nan')), 1)
-            self.assertEqual(idx.get_loc(np.nan), 1)
-
-    def test_append_concat(self):
-        # #1815
-        d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC')
-        d2 = date_range('12/31/2000', '12/31/2009', freq='A-DEC')
-
-        s1 = Series(np.random.randn(10), d1)
-        s2 = Series(np.random.randn(10), d2)
-
-        s1 = s1.to_period()
-        s2 = s2.to_period()
-
-        # drops index
-        result = pd.concat([s1, s2])
-        tm.assertIsInstance(result.index, PeriodIndex)
-        self.assertEqual(result.index[0], s1.index[0])
-
-    def test_pickle_freq(self):
-        # GH2891
-        prng = period_range('1/1/2011', '1/1/2012', freq='M')
-        new_prng = self.round_trip_pickle(prng)
-        self.assertEqual(new_prng.freq, offsets.MonthEnd())
-        self.assertEqual(new_prng.freqstr, 'M')
-
-    def test_slice_keep_name(self):
-        idx = period_range('20010101', periods=10, freq='D', name='bob')
-        self.assertEqual(idx.name, idx[1:].name)
-
-    def test_factorize(self):
-        idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
-                            '2014-03', '2014-03'], freq='M')
-
-        exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp)
-        exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
-
-        arr, idx = idx1.factorize()
-        self.assert_numpy_array_equal(arr, exp_arr)
-        tm.assert_index_equal(idx, exp_idx)
-
-        arr, idx = idx1.factorize(sort=True)
-        self.assert_numpy_array_equal(arr, exp_arr)
-        tm.assert_index_equal(idx, exp_idx)
-
-        idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
-                               '2014-03', '2014-01'], freq='M')
-
-        exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp)
-        arr, idx = idx2.factorize(sort=True)
-        self.assert_numpy_array_equal(arr, exp_arr)
-        tm.assert_index_equal(idx, exp_idx)
-
-        exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
-        exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M')
-        arr, idx = idx2.factorize()
-        self.assert_numpy_array_equal(arr, exp_arr)
-        tm.assert_index_equal(idx, exp_idx)
-
-    def test_recreate_from_data(self):
-        for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']:
-            org = PeriodIndex(start='2001/04/01', freq=o, periods=1)
-            idx = PeriodIndex(org.values, freq=o)
-            tm.assert_index_equal(idx, org)
-
-    def test_combine_first(self):
-        # GH 3367
-        didx = pd.DatetimeIndex(start='1950-01-31', end='1950-07-31', freq='M')
-        pidx = pd.PeriodIndex(start=pd.Period('1950-1'),
-                              end=pd.Period('1950-7'), freq='M')
-        # check to be consistent with DatetimeIndex
-        for idx in [didx, pidx]:
-            a = pd.Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx)
-            b = pd.Series([9, 9, 9, 9, 9, 9, 9], index=idx)
-
-            result = a.combine_first(b)
-            expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx,
-                                 dtype=np.float64)
-            tm.assert_series_equal(result, expected)
-
-    def test_searchsorted(self):
-        for freq in ['D', '2D']:
-            pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03',
-                                   '2014-01-04', '2014-01-05'], freq=freq)
-
-            p1 = pd.Period('2014-01-01', freq=freq)
-            self.assertEqual(pidx.searchsorted(p1), 0)
-
-            p2 = pd.Period('2014-01-04', freq=freq)
-            self.assertEqual(pidx.searchsorted(p2), 3)
-
-            msg = "Input has different freq=H from PeriodIndex"
-            with self.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                pidx.searchsorted(pd.Period('2014-01-01', freq='H'))
-
-            msg = "Input has different freq=5D from PeriodIndex"
-            with self.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                pidx.searchsorted(pd.Period('2014-01-01', freq='5D'))
-
-        with tm.assert_produces_warning(FutureWarning):
-            pidx.searchsorted(key=p2)
-
-    def test_round_trip(self):
-
-        p = Period('2000Q1')
-        new_p = self.round_trip_pickle(p)
-        self.assertEqual(new_p, p)
-
-
-def _permute(obj):
-    return obj.take(np.random.permutation(len(obj)))
-
-
-class TestMethods(tm.TestCase):
-
-    def test_add(self):
-        dt1 = Period(freq='D', year=2008, month=1, day=1)
-        dt2 = Period(freq='D', year=2008, month=1, day=2)
-        self.assertEqual(dt1 + 1, dt2)
-        self.assertEqual(1 + dt1, dt2)
-
-    def test_add_pdnat(self):
-        p = pd.Period('2011-01', freq='M')
-        self.assertIs(p + pd.NaT, pd.NaT)
-        self.assertIs(pd.NaT + p, pd.NaT)
-
-        p = pd.Period('NaT', freq='M')
-        self.assertIs(p + pd.NaT, pd.NaT)
-        self.assertIs(pd.NaT + p, pd.NaT)
-
-    def test_add_raises(self):
-        # GH 4731
-        dt1 = Period(freq='D', year=2008, month=1, day=1)
-        dt2 = Period(freq='D', year=2008, month=1, day=2)
-        msg = r"unsupported operand type\(s\)"
-        with tm.assertRaisesRegexp(TypeError, msg):
-            dt1 + "str"
-
-        msg = r"unsupported operand type\(s\)"
-        with tm.assertRaisesRegexp(TypeError, msg):
-            "str" + dt1
-
-        with tm.assertRaisesRegexp(TypeError, msg):
-            dt1 + dt2
-
-    def test_sub(self):
-        dt1 = Period('2011-01-01', freq='D')
-        dt2 = Period('2011-01-15', freq='D')
-
-        self.assertEqual(dt1 - dt2, -14)
-        self.assertEqual(dt2 - dt1, 14)
-
-        msg = r"Input has different freq=M from Period\(freq=D\)"
-        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-            dt1 - pd.Period('2011-02', freq='M')
-
-    def test_add_offset(self):
-        # freq is DateOffset
-        for freq in ['A', '2A', '3A']:
-            p = Period('2011', freq=freq)
-            exp = Period('2013', freq=freq)
-            self.assertEqual(p + offsets.YearEnd(2), exp)
-            self.assertEqual(offsets.YearEnd(2) + p, exp)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p + o
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    with tm.assertRaises(period.IncompatibleFrequency):
-                        o + p
-
-        for freq in ['M', '2M', '3M']:
-            p = Period('2011-03', freq=freq)
-            exp = Period('2011-05', freq=freq)
-            self.assertEqual(p + offsets.MonthEnd(2), exp)
-            self.assertEqual(offsets.MonthEnd(2) + p, exp)
-
-            exp = Period('2012-03', freq=freq)
-            self.assertEqual(p + offsets.MonthEnd(12), exp)
-            self.assertEqual(offsets.MonthEnd(12) + p, exp)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p + o
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    with tm.assertRaises(period.IncompatibleFrequency):
-                        o + p
-
-        # freq is Tick
-        for freq in ['D', '2D', '3D']:
-            p = Period('2011-04-01', freq=freq)
-
-            exp = Period('2011-04-06', freq=freq)
-            self.assertEqual(p + offsets.Day(5), exp)
-            self.assertEqual(offsets.Day(5) + p, exp)
-
-            exp = Period('2011-04-02', freq=freq)
-            self.assertEqual(p + offsets.Hour(24), exp)
-            self.assertEqual(offsets.Hour(24) + p, exp)
-
-            exp = Period('2011-04-03', freq=freq)
-            self.assertEqual(p + np.timedelta64(2, 'D'), exp)
-            with tm.assertRaises(TypeError):
-                np.timedelta64(2, 'D') + p
-
-            exp = Period('2011-04-02', freq=freq)
-            self.assertEqual(p + np.timedelta64(3600 * 24, 's'), exp)
-            with tm.assertRaises(TypeError):
-                np.timedelta64(3600 * 24, 's') + p
-
-            exp = Period('2011-03-30', freq=freq)
-            self.assertEqual(p + timedelta(-2), exp)
-            self.assertEqual(timedelta(-2) + p, exp)
-
-            exp = Period('2011-04-03', freq=freq)
-            self.assertEqual(p + timedelta(hours=48), exp)
-            self.assertEqual(timedelta(hours=48) + p, exp)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(4, 'h'),
-                      timedelta(hours=23)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p + o
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    with tm.assertRaises(period.IncompatibleFrequency):
-                        o + p
-
-        for freq in ['H', '2H', '3H']:
-            p = Period('2011-04-01 09:00', freq=freq)
-
-            exp = Period('2011-04-03 09:00', freq=freq)
-            self.assertEqual(p + offsets.Day(2), exp)
-            self.assertEqual(offsets.Day(2) + p, exp)
-
-            exp = Period('2011-04-01 12:00', freq=freq)
-            self.assertEqual(p + offsets.Hour(3), exp)
-            self.assertEqual(offsets.Hour(3) + p, exp)
-
-            exp = Period('2011-04-01 12:00', freq=freq)
-            self.assertEqual(p + np.timedelta64(3, 'h'), exp)
-            with tm.assertRaises(TypeError):
-                np.timedelta64(3, 'h') + p
-
-            exp = Period('2011-04-01 10:00', freq=freq)
-            self.assertEqual(p + np.timedelta64(3600, 's'), exp)
-            with tm.assertRaises(TypeError):
-                np.timedelta64(3600, 's') + p
-
-            exp = Period('2011-04-01 11:00', freq=freq)
-            self.assertEqual(p + timedelta(minutes=120), exp)
-            self.assertEqual(timedelta(minutes=120) + p, exp)
-
-            exp = Period('2011-04-05 12:00', freq=freq)
-            self.assertEqual(p + timedelta(days=4, minutes=180), exp)
-            self.assertEqual(timedelta(days=4, minutes=180) + p, exp)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(3200, 's'),
-                      timedelta(hours=23, minutes=30)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p + o
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    with tm.assertRaises(period.IncompatibleFrequency):
-                        o + p
-
-    def test_add_offset_nat(self):
-        # freq is DateOffset
-        for freq in ['A', '2A', '3A']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.YearEnd(2)]:
-                self.assertIs(p + o, tslib.NaT)
-                self.assertIs(o + p, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    self.assertIs(o + p, tslib.NaT)
-
-        for freq in ['M', '2M', '3M']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    self.assertIs(o + p, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    self.assertIs(o + p, tslib.NaT)
-
-        # freq is Tick
-        for freq in ['D', '2D', '3D']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'),
-                      np.timedelta64(3600 * 24, 's'), timedelta(-2),
-                      timedelta(hours=48)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    self.assertIs(o + p, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(4, 'h'),
-                      timedelta(hours=23)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    self.assertIs(o + p, tslib.NaT)
-
-        for freq in ['H', '2H', '3H']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'),
-                      np.timedelta64(3600, 's'), timedelta(minutes=120),
-                      timedelta(days=4, minutes=180)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if not isinstance(o, np.timedelta64):
-                    self.assertIs(o + p, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(3200, 's'),
-                      timedelta(hours=23, minutes=30)]:
-                self.assertIs(p + o, tslib.NaT)
-
-                if isinstance(o, np.timedelta64):
-                    with tm.assertRaises(TypeError):
-                        o + p
-                else:
-                    self.assertIs(o + p, tslib.NaT)
-
-    def test_sub_pdnat(self):
-        # GH 13071
-        p = pd.Period('2011-01', freq='M')
-        self.assertIs(p - pd.NaT, pd.NaT)
-        self.assertIs(pd.NaT - p, pd.NaT)
-
-        p = pd.Period('NaT', freq='M')
-        self.assertIs(p - pd.NaT, pd.NaT)
-        self.assertIs(pd.NaT - p, pd.NaT)
-
-    def test_sub_offset(self):
-        # freq is DateOffset
-        for freq in ['A', '2A', '3A']:
-            p = Period('2011', freq=freq)
-            self.assertEqual(p - offsets.YearEnd(2), Period('2009', freq=freq))
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p - o
-
-        for freq in ['M', '2M', '3M']:
-            p = Period('2011-03', freq=freq)
-            self.assertEqual(p - offsets.MonthEnd(2),
-                             Period('2011-01', freq=freq))
-            self.assertEqual(p - offsets.MonthEnd(12),
-                             Period('2010-03', freq=freq))
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p - o
-
-        # freq is Tick
-        for freq in ['D', '2D', '3D']:
-            p = Period('2011-04-01', freq=freq)
-            self.assertEqual(p - offsets.Day(5),
-                             Period('2011-03-27', freq=freq))
-            self.assertEqual(p - offsets.Hour(24),
-                             Period('2011-03-31', freq=freq))
-            self.assertEqual(p - np.timedelta64(2, 'D'),
-                             Period('2011-03-30', freq=freq))
-            self.assertEqual(p - np.timedelta64(3600 * 24, 's'),
-                             Period('2011-03-31', freq=freq))
-            self.assertEqual(p - timedelta(-2),
-                             Period('2011-04-03', freq=freq))
-            self.assertEqual(p - timedelta(hours=48),
-                             Period('2011-03-30', freq=freq))
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(4, 'h'),
-                      timedelta(hours=23)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p - o
-
-        for freq in ['H', '2H', '3H']:
-            p = Period('2011-04-01 09:00', freq=freq)
-            self.assertEqual(p - offsets.Day(2),
-                             Period('2011-03-30 09:00', freq=freq))
-            self.assertEqual(p - offsets.Hour(3),
-                             Period('2011-04-01 06:00', freq=freq))
-            self.assertEqual(p - np.timedelta64(3, 'h'),
-                             Period('2011-04-01 06:00', freq=freq))
-            self.assertEqual(p - np.timedelta64(3600, 's'),
-                             Period('2011-04-01 08:00', freq=freq))
-            self.assertEqual(p - timedelta(minutes=120),
-                             Period('2011-04-01 07:00', freq=freq))
-            self.assertEqual(p - timedelta(days=4, minutes=180),
-                             Period('2011-03-28 06:00', freq=freq))
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(3200, 's'),
-                      timedelta(hours=23, minutes=30)]:
-                with tm.assertRaises(period.IncompatibleFrequency):
-                    p - o
-
-    def test_sub_offset_nat(self):
-        # freq is DateOffset
-        for freq in ['A', '2A', '3A']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.YearEnd(2)]:
-                self.assertIs(p - o, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                self.assertIs(p - o, tslib.NaT)
-
-        for freq in ['M', '2M', '3M']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]:
-                self.assertIs(p - o, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(365, 'D'),
-                      timedelta(365)]:
-                self.assertIs(p - o, tslib.NaT)
-
-        # freq is Tick
-        for freq in ['D', '2D', '3D']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'),
-                      np.timedelta64(3600 * 24, 's'), timedelta(-2),
-                      timedelta(hours=48)]:
-                self.assertIs(p - o, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(4, 'h'),
-                      timedelta(hours=23)]:
-                self.assertIs(p - o, tslib.NaT)
-
-        for freq in ['H', '2H', '3H']:
-            p = Period('NaT', freq=freq)
-            for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'),
-                      np.timedelta64(3600, 's'), timedelta(minutes=120),
-                      timedelta(days=4, minutes=180)]:
-                self.assertIs(p - o, tslib.NaT)
-
-            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
-                      offsets.Minute(), np.timedelta64(3200, 's'),
-                      timedelta(hours=23, minutes=30)]:
-                self.assertIs(p - o, tslib.NaT)
-
-    def test_nat_ops(self):
-        for freq in ['M', '2M', '3M']:
-            p = Period('NaT', freq=freq)
-            self.assertIs(p + 1, tslib.NaT)
-            self.assertIs(1 + p, tslib.NaT)
-            self.assertIs(p - 1, tslib.NaT)
-            self.assertIs(p - Period('2011-01', freq=freq), tslib.NaT)
-            self.assertIs(Period('2011-01', freq=freq) - p, tslib.NaT)
-
-    def test_period_ops_offset(self):
-        p = Period('2011-04-01', freq='D')
-        result = p + offsets.Day()
-        exp = pd.Period('2011-04-02', freq='D')
-        self.assertEqual(result, exp)
-
-        result = p - offsets.Day(2)
-        exp = pd.Period('2011-03-30', freq='D')
-        self.assertEqual(result, exp)
-
-        msg = r"Input cannot be converted to Period\(freq=D\)"
-        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-            p + offsets.Hour(2)
-
-        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-            p - offsets.Hour(2)
-
-
-class TestPeriodIndexSeriesMethods(tm.TestCase):
-    """ Test PeriodIndex and Period Series Ops consistency """
-
-    def _check(self, values, func, expected):
-        idx = pd.PeriodIndex(values)
-        result = func(idx)
-        if isinstance(expected, pd.Index):
-            tm.assert_index_equal(result, expected)
-        else:
-            # comp op results in bool
-            tm.assert_numpy_array_equal(result, expected)
-
-        s = pd.Series(values)
-        result = func(s)
-
-        exp = pd.Series(expected, name=values.name)
-        tm.assert_series_equal(result, exp)
-
-    def test_pi_ops(self):
-        idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
-                           '2011-04'], freq='M', name='idx')
-
-        expected = PeriodIndex(['2011-03', '2011-04',
-                                '2011-05', '2011-06'], freq='M', name='idx')
-        self._check(idx, lambda x: x + 2, expected)
-        self._check(idx, lambda x: 2 + x, expected)
-
-        self._check(idx + 2, lambda x: x - 2, idx)
-        result = idx - Period('2011-01', freq='M')
-        exp = pd.Index([0, 1, 2, 3], name='idx')
-        tm.assert_index_equal(result, exp)
-
-        result = Period('2011-01', freq='M') - idx
-        exp = pd.Index([0, -1, -2, -3], name='idx')
-        tm.assert_index_equal(result, exp)
-
-    def test_pi_ops_errors(self):
-        idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
-                           '2011-04'], freq='M', name='idx')
-        s = pd.Series(idx)
-
-        msg = r"unsupported operand type\(s\)"
-
-        for obj in [idx, s]:
-            for ng in ["str", 1.5]:
-                with tm.assertRaisesRegexp(TypeError, msg):
-                    obj + ng
-
-                with tm.assertRaises(TypeError):
-                    # error message differs between PY2 and 3
-                    ng + obj
-
-                with tm.assertRaisesRegexp(TypeError, msg):
-                    obj - ng
-
-                with tm.assertRaises(TypeError):
-                    np.add(obj, ng)
-
-                if _np_version_under1p10:
-                    self.assertIs(np.add(ng, obj), NotImplemented)
-                else:
-                    with tm.assertRaises(TypeError):
-                        np.add(ng, obj)
-
-                with tm.assertRaises(TypeError):
-                    np.subtract(obj, ng)
-
-                if _np_version_under1p10:
-                    self.assertIs(np.subtract(ng, obj), NotImplemented)
-                else:
-                    with tm.assertRaises(TypeError):
-                        np.subtract(ng, obj)
-
-    def test_pi_ops_nat(self):
-        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
-                           '2011-04'], freq='M', name='idx')
-        expected = PeriodIndex(['2011-03', '2011-04',
-                                'NaT', '2011-06'], freq='M', name='idx')
-        self._check(idx, lambda x: x + 2, expected)
-        self._check(idx, lambda x: 2 + x, expected)
-        self._check(idx, lambda x: np.add(x, 2), expected)
-
-        self._check(idx + 2, lambda x: x - 2, idx)
-        self._check(idx + 2, lambda x: np.subtract(x, 2), idx)
-
-        # freq with mult
-        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
-                           '2011-04'], freq='2M', name='idx')
-        expected = PeriodIndex(['2011-07', '2011-08',
-                                'NaT', '2011-10'], freq='2M', name='idx')
-        self._check(idx, lambda x: x + 3, expected)
-        self._check(idx, lambda x: 3 + x, expected)
-        self._check(idx, lambda x: np.add(x, 3), expected)
-
-        self._check(idx + 3, lambda x: x - 3, idx)
-        self._check(idx + 3, lambda x: np.subtract(x, 3), idx)
-
-    def test_pi_ops_array_int(self):
-        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
-                           '2011-04'], freq='M', name='idx')
-        f = lambda x: x + np.array([1, 2, 3, 4])
-        exp = PeriodIndex(['2011-02', '2011-04', 'NaT',
-                           '2011-08'], freq='M', name='idx')
-        self._check(idx, f, exp)
-
-        f = lambda x: np.add(x, np.array([4, -1, 1, 2]))
-        exp = PeriodIndex(['2011-05', '2011-01', 'NaT',
-                           '2011-06'], freq='M', name='idx')
-        self._check(idx, f, exp)
-
-        f = lambda x: x - np.array([1, 2, 3, 4])
-        exp = PeriodIndex(['2010-12', '2010-12', 'NaT',
-                           '2010-12'], freq='M', name='idx')
-        self._check(idx, f, exp)
-
-        f = lambda x: np.subtract(x, np.array([3, 2, 3, -2]))
-        exp = PeriodIndex(['2010-10', '2010-12', 'NaT',
-                           '2011-06'], freq='M', name='idx')
-        self._check(idx, f, exp)
-
-    def test_pi_ops_offset(self):
-        idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01',
-                           '2011-04-01'], freq='D', name='idx')
-        f = lambda x: x + offsets.Day()
-        exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02',
-                           '2011-04-02'], freq='D', name='idx')
-        self._check(idx, f, exp)
-
-        f = lambda x: x + offsets.Day(2)
-        exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03',
-                           '2011-04-03'], freq='D', name='idx')
-        self._check(idx, f, exp)
-
-        f = lambda x: x - offsets.Day(2)
-        exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27',
-                           '2011-03-30'], freq='D', name='idx')
-        self._check(idx, f, exp)
-
-    def test_pi_offset_errors(self):
-        idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01',
-                           '2011-04-01'], freq='D', name='idx')
-        s = pd.Series(idx)
-
-        # Series op is applied per Period instance, thus error is raised
-        # from Period
-        msg_idx = r"Input has different freq from PeriodIndex\(freq=D\)"
-        msg_s = r"Input cannot be converted to Period\(freq=D\)"
-        for obj, msg in [(idx, msg_idx), (s, msg_s)]:
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                obj + offsets.Hour(2)
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                offsets.Hour(2) + obj
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                obj - offsets.Hour(2)
-
-    def test_pi_sub_period(self):
-        # GH 13071
-        idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
-                           '2011-04'], freq='M', name='idx')
-
-        result = idx - pd.Period('2012-01', freq='M')
-        exp = pd.Index([-12, -11, -10, -9], name='idx')
-        tm.assert_index_equal(result, exp)
-
-        result = np.subtract(idx, pd.Period('2012-01', freq='M'))
-        tm.assert_index_equal(result, exp)
-
-        result = pd.Period('2012-01', freq='M') - idx
-        exp = pd.Index([12, 11, 10, 9], name='idx')
-        tm.assert_index_equal(result, exp)
-
-        result = np.subtract(pd.Period('2012-01', freq='M'), idx)
-        if _np_version_under1p10:
-            self.assertIs(result, NotImplemented)
-        else:
-            tm.assert_index_equal(result, exp)
-
-        exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx')
-        tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp)
-        tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp)
-
-    def test_pi_sub_pdnat(self):
-        # GH 13071
-        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
-                           '2011-04'], freq='M', name='idx')
-        exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx')
-        tm.assert_index_equal(pd.NaT - idx, exp)
-        tm.assert_index_equal(idx - pd.NaT, exp)
-
-    def test_pi_sub_period_nat(self):
-        # GH 13071
-        idx = PeriodIndex(['2011-01', 'NaT', '2011-03',
-                           '2011-04'], freq='M', name='idx')
-
-        result = idx - pd.Period('2012-01', freq='M')
-        exp = pd.Index([-12, np.nan, -10, -9], name='idx')
-        tm.assert_index_equal(result, exp)
-
-        result = pd.Period('2012-01', freq='M') - idx
-        exp = pd.Index([12, np.nan, 10, 9], name='idx')
-        tm.assert_index_equal(result, exp)
-
-        exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx')
-        tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp)
-        tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp)
-
-    def test_pi_comp_period(self):
-        idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
-                           '2011-04'], freq='M', name='idx')
-
-        f = lambda x: x == pd.Period('2011-03', freq='M')
-        exp = np.array([False, False, True, False], dtype=np.bool)
-        self._check(idx, f, exp)
-        f = lambda x: pd.Period('2011-03', freq='M') == x
-        self._check(idx, f, exp)
-
-        f = lambda x: x != pd.Period('2011-03', freq='M')
-        exp = np.array([True, True, False, True], dtype=np.bool)
-        self._check(idx, f, exp)
-        f = lambda x: pd.Period('2011-03', freq='M') != x
-        self._check(idx, f, exp)
-
-        f = lambda x: pd.Period('2011-03', freq='M') >= x
-        exp = np.array([True, True, True, False], dtype=np.bool)
-        self._check(idx, f, exp)
-
-        f = lambda x: x > pd.Period('2011-03', freq='M')
-        exp = np.array([False, False, False, True], dtype=np.bool)
-        self._check(idx, f, exp)
-
-        f = lambda x: pd.Period('2011-03', freq='M') >= x
-        exp = np.array([True, True, True, False], dtype=np.bool)
-        self._check(idx, f, exp)
-
-    def test_pi_comp_period_nat(self):
-        idx = PeriodIndex(['2011-01', 'NaT', '2011-03',
-                           '2011-04'], freq='M', name='idx')
-
-        f = lambda x: x == pd.Period('2011-03', freq='M')
-        exp = np.array([False, False, True, False], dtype=np.bool)
-        self._check(idx, f, exp)
-        f = lambda x: pd.Period('2011-03', freq='M') == x
-        self._check(idx, f, exp)
-
-        f = lambda x: x == tslib.NaT
-        exp = np.array([False, False, False, False], dtype=np.bool)
-        self._check(idx, f, exp)
-        f = lambda x: tslib.NaT == x
-        self._check(idx, f, exp)
-
-        f = lambda x: x != pd.Period('2011-03', freq='M')
-        exp = np.array([True, True, False, True], dtype=np.bool)
-        self._check(idx, f, exp)
-        f = lambda x: pd.Period('2011-03', freq='M') != x
-        self._check(idx, f, exp)
-
-        f = lambda x: x != tslib.NaT
-        exp = np.array([True, True, True, True], dtype=np.bool)
-        self._check(idx, f, exp)
-        f = lambda x: tslib.NaT != x
-        self._check(idx, f, exp)
-
-        f = lambda x: pd.Period('2011-03', freq='M') >= x
-        exp = np.array([True, False, True, False], dtype=np.bool)
-        self._check(idx, f, exp)
-
-        f = lambda x: x < pd.Period('2011-03', freq='M')
-        exp = np.array([True, False, False, False], dtype=np.bool)
-        self._check(idx, f, exp)
-
-        f = lambda x: x > tslib.NaT
-        exp = np.array([False, False, False, False], dtype=np.bool)
-        self._check(idx, f, exp)
-
-        f = lambda x: tslib.NaT >= x
-        exp = np.array([False, False, False, False], dtype=np.bool)
-        self._check(idx, f, exp)
-
-
-class TestPeriodRepresentation(tm.TestCase):
-    """
-    Wish to match NumPy units
-    """
-
-    def test_annual(self):
-        self._check_freq('A', 1970)
-
-    def test_monthly(self):
-        self._check_freq('M', '1970-01')
-
-    def test_weekly(self):
-        self._check_freq('W-THU', '1970-01-01')
-
-    def test_daily(self):
-        self._check_freq('D', '1970-01-01')
-
-    def test_business_daily(self):
-        self._check_freq('B', '1970-01-01')
-
-    def test_hourly(self):
-        self._check_freq('H', '1970-01-01')
-
-    def test_minutely(self):
-        self._check_freq('T', '1970-01-01')
-
-    def test_secondly(self):
-        self._check_freq('S', '1970-01-01')
-
-    def test_millisecondly(self):
-        self._check_freq('L', '1970-01-01')
-
-    def test_microsecondly(self):
-        self._check_freq('U', '1970-01-01')
-
-    def test_nanosecondly(self):
-        self._check_freq('N', '1970-01-01')
-
-    def _check_freq(self, freq, base_date):
-        rng = PeriodIndex(start=base_date, periods=10, freq=freq)
-        exp = np.arange(10, dtype=np.int64)
-        self.assert_numpy_array_equal(rng._values, exp)
-        self.assert_numpy_array_equal(rng.asi8, exp)
-
-    def test_negone_ordinals(self):
-        freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S']
-
-        period = Period(ordinal=-1, freq='D')
-        for freq in freqs:
-            repr(period.asfreq(freq))
-
-        for freq in freqs:
-            period = Period(ordinal=-1, freq=freq)
-            repr(period)
-            self.assertEqual(period.year, 1969)
-
-        period = Period(ordinal=-1, freq='B')
-        repr(period)
-        period = Period(ordinal=-1, freq='W')
-        repr(period)
-
-
-class TestComparisons(tm.TestCase):
-
-    def setUp(self):
-        self.january1 = Period('2000-01', 'M')
-        self.january2 = Period('2000-01', 'M')
-        self.february = Period('2000-02', 'M')
-        self.march = Period('2000-03', 'M')
-        self.day = Period('2012-01-01', 'D')
-
-    def test_equal(self):
-        self.assertEqual(self.january1, self.january2)
-
-    def test_equal_Raises_Value(self):
-        with tm.assertRaises(period.IncompatibleFrequency):
-            self.january1 == self.day
-
-    def test_notEqual(self):
-        self.assertNotEqual(self.january1, 1)
-        self.assertNotEqual(self.january1, self.february)
-
-    def test_greater(self):
-        self.assertTrue(self.february > self.january1)
-
-    def test_greater_Raises_Value(self):
-        with tm.assertRaises(period.IncompatibleFrequency):
-            self.january1 > self.day
-
-    def test_greater_Raises_Type(self):
-        with tm.assertRaises(TypeError):
-            self.january1 > 1
-
-    def test_greaterEqual(self):
-        self.assertTrue(self.january1 >= self.january2)
-
-    def test_greaterEqual_Raises_Value(self):
-        with tm.assertRaises(period.IncompatibleFrequency):
-            self.january1 >= self.day
-
-        with tm.assertRaises(TypeError):
-            print(self.january1 >= 1)
-
-    def test_smallerEqual(self):
-        self.assertTrue(self.january1 <= self.january2)
-
-    def test_smallerEqual_Raises_Value(self):
-        with tm.assertRaises(period.IncompatibleFrequency):
-            self.january1 <= self.day
-
-    def test_smallerEqual_Raises_Type(self):
-        with tm.assertRaises(TypeError):
-            self.january1 <= 1
-
-    def test_smaller(self):
-        self.assertTrue(self.january1 < self.february)
-
-    def test_smaller_Raises_Value(self):
-        with tm.assertRaises(period.IncompatibleFrequency):
-            self.january1 < self.day
-
-    def test_smaller_Raises_Type(self):
-        with tm.assertRaises(TypeError):
-            self.january1 < 1
-
-    def test_sort(self):
-        periods = [self.march, self.january1, self.february]
-        correctPeriods = [self.january1, self.february, self.march]
-        self.assertEqual(sorted(periods), correctPeriods)
-
-    def test_period_nat_comp(self):
-        p_nat = Period('NaT', freq='D')
-        p = Period('2011-01-01', freq='D')
-
-        nat = pd.Timestamp('NaT')
-        t = pd.Timestamp('2011-01-01')
-        # confirm Period('NaT') work identical with Timestamp('NaT')
-        for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t),
-                            (t, nat), (nat, nat)]:
-            self.assertEqual(left < right, False)
-            self.assertEqual(left > right, False)
-            self.assertEqual(left == right, False)
-            self.assertEqual(left != right, True)
-            self.assertEqual(left <= right, False)
-            self.assertEqual(left >= right, False)
-
-    def test_pi_pi_comp(self):
-
-        for freq in ['M', '2M', '3M']:
-            base = PeriodIndex(['2011-01', '2011-02',
-                                '2011-03', '2011-04'], freq=freq)
-            p = Period('2011-02', freq=freq)
-
-            exp = np.array([False, True, False, False])
-            self.assert_numpy_array_equal(base == p, exp)
-            self.assert_numpy_array_equal(p == base, exp)
-
-            exp = np.array([True, False, True, True])
-            self.assert_numpy_array_equal(base != p, exp)
-            self.assert_numpy_array_equal(p != base, exp)
-
-            exp = np.array([False, False, True, True])
-            self.assert_numpy_array_equal(base > p, exp)
-            self.assert_numpy_array_equal(p < base, exp)
-
-            exp = np.array([True, False, False, False])
-            self.assert_numpy_array_equal(base < p, exp)
-            self.assert_numpy_array_equal(p > base, exp)
-
-            exp = np.array([False, True, True, True])
-            self.assert_numpy_array_equal(base >= p, exp)
-            self.assert_numpy_array_equal(p <= base, exp)
-
-            exp = np.array([True, True, False, False])
-            self.assert_numpy_array_equal(base <= p, exp)
-            self.assert_numpy_array_equal(p >= base, exp)
-
-            idx = PeriodIndex(['2011-02', '2011-01', '2011-03',
-                               '2011-05'], freq=freq)
-
-            exp = np.array([False, False, True, False])
-            self.assert_numpy_array_equal(base == idx, exp)
-
-            exp = np.array([True, True, False, True])
-            self.assert_numpy_array_equal(base != idx, exp)
-
-            exp = np.array([False, True, False, False])
-            self.assert_numpy_array_equal(base > idx, exp)
-
-            exp = np.array([True, False, False, True])
-            self.assert_numpy_array_equal(base < idx, exp)
-
-            exp = np.array([False, True, True, False])
-            self.assert_numpy_array_equal(base >= idx, exp)
-
-            exp = np.array([True, False, True, True])
-            self.assert_numpy_array_equal(base <= idx, exp)
-
-            # different base freq
-            msg = "Input has different freq=A-DEC from PeriodIndex"
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                base <= Period('2011', freq='A')
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                Period('2011', freq='A') >= base
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A')
-                base <= idx
-
-            # different mult
-            msg = "Input has different freq=4M from PeriodIndex"
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                base <= Period('2011', freq='4M')
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                Period('2011', freq='4M') >= base
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M')
-                base <= idx
-
-    def test_pi_nat_comp(self):
-        for freq in ['M', '2M', '3M']:
-            idx1 = PeriodIndex(
-                ['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq)
-
-            result = idx1 > Period('2011-02', freq=freq)
-            exp = np.array([False, False, False, True])
-            self.assert_numpy_array_equal(result, exp)
-            result = Period('2011-02', freq=freq) < idx1
-            self.assert_numpy_array_equal(result, exp)
-
-            result = idx1 == Period('NaT', freq=freq)
-            exp = np.array([False, False, False, False])
-            self.assert_numpy_array_equal(result, exp)
-            result = Period('NaT', freq=freq) == idx1
-            self.assert_numpy_array_equal(result, exp)
-
-            result = idx1 != Period('NaT', freq=freq)
-            exp = np.array([True, True, True, True])
-            self.assert_numpy_array_equal(result, exp)
-            result = Period('NaT', freq=freq) != idx1
-            self.assert_numpy_array_equal(result, exp)
-
-            idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04',
-                                'NaT'], freq=freq)
-            result = idx1 < idx2
-            exp = np.array([True, False, False, False])
-            self.assert_numpy_array_equal(result, exp)
-
-            result = idx1 == idx2
-            exp = np.array([False, False, False, False])
-            self.assert_numpy_array_equal(result, exp)
-
-            result = idx1 != idx2
-            exp = np.array([True, True, True, True])
-            self.assert_numpy_array_equal(result, exp)
-
-            result = idx1 == idx1
-            exp = np.array([True, True, False, True])
-            self.assert_numpy_array_equal(result, exp)
-
-            result = idx1 != idx1
-            exp = np.array([False, False, True, False])
-            self.assert_numpy_array_equal(result, exp)
-
-            diff = PeriodIndex(['2011-02', '2011-01', '2011-04',
-                                'NaT'], freq='4M')
-            msg = "Input has different freq=4M from PeriodIndex"
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                idx1 > diff
-
-            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-                idx1 == diff
-
-
-class TestSeriesPeriod(tm.TestCase):
-
-    def setUp(self):
-        self.series = Series(period_range('2000-01-01', periods=10, freq='D'))
-
-    def test_auto_conversion(self):
-        series = Series(list(period_range('2000-01-01', periods=10, freq='D')))
-        self.assertEqual(series.dtype, 'object')
-
-        series = pd.Series([pd.Period('2011-01-01', freq='D'),
-                            pd.Period('2011-02-01', freq='D')])
-        self.assertEqual(series.dtype, 'object')
-
-    def test_getitem(self):
-        self.assertEqual(self.series[1], pd.Period('2000-01-02', freq='D'))
-
-        result = self.series[[2, 4]]
-        exp = pd.Series([pd.Period('2000-01-03', freq='D'),
-                         pd.Period('2000-01-05', freq='D')],
-                        index=[2, 4])
-        self.assert_series_equal(result, exp)
-        self.assertEqual(result.dtype, 'object')
-
-    def test_constructor_cant_cast_period(self):
-        with tm.assertRaises(TypeError):
-            Series(period_range('2000-01-01', periods=10, freq='D'),
-                   dtype=float)
-
-    def test_constructor_cast_object(self):
-        s = Series(period_range('1/1/2000', periods=10), dtype=object)
-        exp = Series(period_range('1/1/2000', periods=10))
-        tm.assert_series_equal(s, exp)
-
-    def test_isnull(self):
-        # GH 13737
-        s = Series([pd.Period('2011-01', freq='M'),
-                    pd.Period('NaT', freq='M')])
-        tm.assert_series_equal(s.isnull(), Series([False, True]))
-        tm.assert_series_equal(s.notnull(), Series([True, False]))
-
-    def test_fillna(self):
-        # GH 13737
-        s = Series([pd.Period('2011-01', freq='M'),
-                    pd.Period('NaT', freq='M')])
-
-        res = s.fillna(pd.Period('2012-01', freq='M'))
-        exp = Series([pd.Period('2011-01', freq='M'),
-                      pd.Period('2012-01', freq='M')])
-        tm.assert_series_equal(res, exp)
-        self.assertEqual(res.dtype, 'object')
-
-        res = s.fillna('XXX')
-        exp = Series([pd.Period('2011-01', freq='M'), 'XXX'])
-        tm.assert_series_equal(res, exp)
-        self.assertEqual(res.dtype, 'object')
-
-    def test_dropna(self):
-        # GH 13737
-        s = Series([pd.Period('2011-01', freq='M'),
-                    pd.Period('NaT', freq='M')])
-        tm.assert_series_equal(s.dropna(),
-                               Series([pd.Period('2011-01', freq='M')]))
-
-    def test_series_comparison_scalars(self):
-        val = pd.Period('2000-01-04', freq='D')
-        result = self.series > val
-        expected = pd.Series([x > val for x in self.series])
-        tm.assert_series_equal(result, expected)
-
-        val = self.series[5]
-        result = self.series > val
-        expected = pd.Series([x > val for x in self.series])
-        tm.assert_series_equal(result, expected)
-
-    def test_between(self):
-        left, right = self.series[[2, 7]]
-        result = self.series.between(left, right)
-        expected = (self.series >= left) & (self.series <= right)
-        tm.assert_series_equal(result, expected)
-
-    # ---------------------------------------------------------------------
-    # NaT support
-
-    """
-    # ToDo: Enable when support period dtype
-    def test_NaT_scalar(self):
-        series = Series([0, 1000, 2000, iNaT], dtype='period[D]')
-
-        val = series[3]
-        self.assertTrue(isnull(val))
-
-        series[2] = val
-        self.assertTrue(isnull(series[2]))
-
-    def test_NaT_cast(self):
-        result = Series([np.nan]).astype('period[D]')
-        expected = Series([NaT])
-        tm.assert_series_equal(result, expected)
-    """
-
-    def test_set_none_nan(self):
-        # currently Period is stored as object dtype, not as NaT
-        self.series[3] = None
-        self.assertIs(self.series[3], None)
-
-        self.series[3:5] = None
-        self.assertIs(self.series[4], None)
-
-        self.series[5] = np.nan
-        self.assertTrue(np.isnan(self.series[5]))
-
-        self.series[5:7] = np.nan
-        self.assertTrue(np.isnan(self.series[6]))
-
-    def test_intercept_astype_object(self):
-        expected = self.series.astype('object')
-
-        df = DataFrame({'a': self.series,
-                        'b': np.random.randn(len(self.series))})
-
-        result = df.values.squeeze()
-        self.assertTrue((result[:, 0] == expected.values).all())
-
-        df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)})
-
-        result = df.values.squeeze()
-        self.assertTrue((result[:, 0] == expected.values).all())
-
-    def test_ops_series_timedelta(self):
-        # GH 13043
-        s = pd.Series([pd.Period('2015-01-01', freq='D'),
-                       pd.Period('2015-01-02', freq='D')], name='xxx')
-        self.assertEqual(s.dtype, object)
-
-        exp = pd.Series([pd.Period('2015-01-02', freq='D'),
-                         pd.Period('2015-01-03', freq='D')], name='xxx')
-        tm.assert_series_equal(s + pd.Timedelta('1 days'), exp)
-        tm.assert_series_equal(pd.Timedelta('1 days') + s, exp)
-
-        tm.assert_series_equal(s + pd.tseries.offsets.Day(), exp)
-        tm.assert_series_equal(pd.tseries.offsets.Day() + s, exp)
-
-    def test_ops_series_period(self):
-        # GH 13043
-        s = pd.Series([pd.Period('2015-01-01', freq='D'),
-                       pd.Period('2015-01-02', freq='D')], name='xxx')
-        self.assertEqual(s.dtype, object)
-
-        p = pd.Period('2015-01-10', freq='D')
-        # dtype will be object because of original dtype
-        exp = pd.Series([9, 8], name='xxx', dtype=object)
-        tm.assert_series_equal(p - s, exp)
-        tm.assert_series_equal(s - p, -exp)
-
-        s2 = pd.Series([pd.Period('2015-01-05', freq='D'),
-                        pd.Period('2015-01-04', freq='D')], name='xxx')
-        self.assertEqual(s2.dtype, object)
-
-        exp = pd.Series([4, 2], name='xxx', dtype=object)
-        tm.assert_series_equal(s2 - s, exp)
-        tm.assert_series_equal(s - s2, -exp)
-
-    def test_comp_series_period_scalar(self):
-        # GH 13200
-        for freq in ['M', '2M', '3M']:
-            base = Series([Period(x, freq=freq) for x in
-                           ['2011-01', '2011-02', '2011-03', '2011-04']])
-            p = Period('2011-02', freq=freq)
-
-            exp = pd.Series([False, True, False, False])
-            tm.assert_series_equal(base == p, exp)
-            tm.assert_series_equal(p == base, exp)
-
-            exp = pd.Series([True, False, True, True])
-            tm.assert_series_equal(base != p, exp)
-            tm.assert_series_equal(p != base, exp)
-
-            exp = pd.Series([False, False, True, True])
-            tm.assert_series_equal(base > p, exp)
-            tm.assert_series_equal(p < base, exp)
-
-            exp = pd.Series([True, False, False, False])
-            tm.assert_series_equal(base < p, exp)
-            tm.assert_series_equal(p > base, exp)
-
-            exp = pd.Series([False, True, True, True])
-            tm.assert_series_equal(base >= p, exp)
-            tm.assert_series_equal(p <= base, exp)
-
-            exp = pd.Series([True, True, False, False])
tm.assert_series_equal(base <= p, exp) - tm.assert_series_equal(p >= base, exp) - - # different base freq - msg = "Input has different freq=A-DEC from Period" - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - base <= Period('2011', freq='A') - - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - Period('2011', freq='A') >= base - - def test_comp_series_period_series(self): - # GH 13200 - for freq in ['M', '2M', '3M']: - base = Series([Period(x, freq=freq) for x in - ['2011-01', '2011-02', '2011-03', '2011-04']]) - - s = Series([Period(x, freq=freq) for x in - ['2011-02', '2011-01', '2011-03', '2011-05']]) - - exp = Series([False, False, True, False]) - tm.assert_series_equal(base == s, exp) - - exp = Series([True, True, False, True]) - tm.assert_series_equal(base != s, exp) - - exp = Series([False, True, False, False]) - tm.assert_series_equal(base > s, exp) - - exp = Series([True, False, False, True]) - tm.assert_series_equal(base < s, exp) - - exp = Series([False, True, True, False]) - tm.assert_series_equal(base >= s, exp) - - exp = Series([True, False, True, True]) - tm.assert_series_equal(base <= s, exp) - - s2 = Series([Period(x, freq='A') for x in - ['2011', '2011', '2011', '2011']]) - - # different base freq - msg = "Input has different freq=A-DEC from Period" - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - base <= s2 - - def test_comp_series_period_object(self): - # GH 13200 - base = Series([Period('2011', freq='A'), Period('2011-02', freq='M'), - Period('2013', freq='A'), Period('2011-04', freq='M')]) - - s = Series([Period('2012', freq='A'), Period('2011-01', freq='M'), - Period('2013', freq='A'), Period('2011-05', freq='M')]) - - exp = Series([False, False, True, False]) - tm.assert_series_equal(base == s, exp) - - exp = Series([True, True, False, True]) - tm.assert_series_equal(base != s, exp) - - exp = Series([False, True, False, False]) - tm.assert_series_equal(base > s, exp) - - exp = Series([True, False, False, True]) - tm.assert_series_equal(base < s, exp) - - exp = Series([False, True, True, False]) - tm.assert_series_equal(base >= s, exp) - - exp = Series([True, False, True, True]) - tm.assert_series_equal(base <= s, exp) - - def test_ops_frame_period(self): - # GH 13043 - df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), - pd.Period('2015-02', freq='M')], - 'B': [pd.Period('2014-01', freq='M'), - pd.Period('2014-02', freq='M')]}) - self.assertEqual(df['A'].dtype, object) - self.assertEqual(df['B'].dtype, object) - - p = pd.Period('2015-03', freq='M') - # dtype will be object because of original dtype - exp = pd.DataFrame({'A': np.array([2, 1], dtype=object), - 'B': np.array([14, 13], dtype=object)}) - tm.assert_frame_equal(p - df, exp) - tm.assert_frame_equal(df - p, -exp) - - df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), - pd.Period('2015-06', freq='M')], - 'B': [pd.Period('2015-05', freq='M'), - pd.Period('2015-06', freq='M')]}) - self.assertEqual(df2['A'].dtype, object) - self.assertEqual(df2['B'].dtype, object) - - exp = pd.DataFrame({'A': np.array([4, 4], dtype=object), - 'B': np.array([16, 16], dtype=object)}) - tm.assert_frame_equal(df2 - df, exp) - tm.assert_frame_equal(df - df2, -exp) - - -class TestPeriodField(tm.TestCase): - - def test_get_period_field_raises_on_out_of_range(self): - self.assertRaises(ValueError, _period.get_period_field, -1, 0, 0) - - def test_get_period_field_array_raises_on_out_of_range(self): - self.assertRaises(ValueError, _period.get_period_field_arr, -1, - 
np.empty(1), 0)
-
-
-class TestTslib(tm.TestCase):
-    def test_intraday_conversion_factors(self):
-        self.assertEqual(period_asfreq(
-            1, get_freq('D'), get_freq('H'), False), 24)
-        self.assertEqual(period_asfreq(
-            1, get_freq('D'), get_freq('T'), False), 1440)
-        self.assertEqual(period_asfreq(
-            1, get_freq('D'), get_freq('S'), False), 86400)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'D'), get_freq('L'), False), 86400000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'D'), get_freq('U'), False), 86400000000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'D'), get_freq('N'), False), 86400000000000)
-
-        self.assertEqual(period_asfreq(
-            1, get_freq('H'), get_freq('T'), False), 60)
-        self.assertEqual(period_asfreq(
-            1, get_freq('H'), get_freq('S'), False), 3600)
-        self.assertEqual(period_asfreq(1, get_freq('H'),
-                                       get_freq('L'), False), 3600000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'H'), get_freq('U'), False), 3600000000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'H'), get_freq('N'), False), 3600000000000)
-
-        self.assertEqual(period_asfreq(
-            1, get_freq('T'), get_freq('S'), False), 60)
-        self.assertEqual(period_asfreq(
-            1, get_freq('T'), get_freq('L'), False), 60000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'T'), get_freq('U'), False), 60000000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'T'), get_freq('N'), False), 60000000000)
-
-        self.assertEqual(period_asfreq(
-            1, get_freq('S'), get_freq('L'), False), 1000)
-        self.assertEqual(period_asfreq(1, get_freq('S'),
-                                       get_freq('U'), False), 1000000)
-        self.assertEqual(period_asfreq(1, get_freq(
-            'S'), get_freq('N'), False), 1000000000)
-
-        self.assertEqual(period_asfreq(
-            1, get_freq('L'), get_freq('U'), False), 1000)
-        self.assertEqual(period_asfreq(1, get_freq('L'),
-                                       get_freq('N'), False), 1000000)
-
-        self.assertEqual(period_asfreq(
-            1, get_freq('U'), get_freq('N'), False), 1000)
-
-    def test_period_ordinal_start_values(self):
-        # information for 1.1.1970
-        self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0,
-                                           get_freq('A')))
-        self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0,
-                                           get_freq('M')))
-        self.assertEqual(1, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0,
-                                           get_freq('W')))
-        self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0,
-                                           get_freq('D')))
-        self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0,
-                                           get_freq('B')))
-
-    def test_period_ordinal_week(self):
-        self.assertEqual(1, period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0,
-                                           get_freq('W')))
-        self.assertEqual(2, period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0,
-                                           get_freq('W')))
-
-        self.assertEqual(2284, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0,
-                                              get_freq('W')))
-        self.assertEqual(2285, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0,
-                                              get_freq('W')))
-
-    def test_period_ordinal_business_day(self):
-        # Thursday
-        self.assertEqual(11415, period_ordinal(2013, 10, 3, 0, 0, 0, 0, 0,
-                                               get_freq('B')))
-        # Friday
-        self.assertEqual(11416, period_ordinal(2013, 10, 4, 0, 0, 0, 0, 0,
-                                               get_freq('B')))
-        # Saturday
-        self.assertEqual(11417, period_ordinal(2013, 10, 5, 0, 0, 0, 0, 0,
-                                               get_freq('B')))
-        # Sunday
-        self.assertEqual(11417, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0,
-                                               get_freq('B')))
-        # Monday
-        self.assertEqual(11417, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0,
-                                               get_freq('B')))
-        # Tuesday
-        self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0,
-                                               get_freq('B')))

From 843602480e681769b22d90950daa97f20fa9b9fd Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 8 Feb 2017 18:45:04 -0500
Subject: [PATCH 115/338] TST: more test_period reorg

---
 .../tests/indexes/period/test_construction.py |  25 +-
 pandas/tests/indexes/period/test_indexing.py  | 317 +++++++
 pandas/tests/indexes/period/test_ops.py       | 492 +---------
 pandas/tests/indexes/period/test_setops.py    |  93 ++
 pandas/tests/scalar/test_period.py            | 898 +++---------------
 pandas/tests/scalar/test_period_asfreq.py     | 721 ++++++++++++++
 6 files changed, 1285 insertions(+), 1261 deletions(-)
 create mode 100644 pandas/tests/indexes/period/test_indexing.py
 create mode 100644 pandas/tests/scalar/test_period_asfreq.py

diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py
index c1299c6abeda3..228615829b5b8 100644
--- a/pandas/tests/indexes/period/test_construction.py
+++ b/pandas/tests/indexes/period/test_construction.py
@@ -410,22 +410,9 @@ def test_constructor(self):
         self.assertTrue((i1 == i2).all())
         self.assertEqual(i1.freq, i2.freq)
 
-        try:
-            PeriodIndex(start=start, end=end_intv)
-            raise AssertionError('Cannot allow mixed freq for start and end')
-        except ValueError:
-            pass
-
         end_intv = Period('2005-05-01', 'B')
         i1 = PeriodIndex(start=start, end=end_intv)
 
-        try:
-            PeriodIndex(start=start)
-            raise AssertionError(
-                'Must specify periods if missing start or end')
-        except ValueError:
-            pass
-
         # infer freq from first element
         i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')])
         self.assertEqual(len(i2), 2)
@@ -441,6 +428,18 @@ def test_constructor(self):
         vals = np.array(vals)
         self.assertRaises(ValueError, PeriodIndex, vals)
 
+    def test_constructor_error(self):
+        start = Period('02-Apr-2005', 'B')
+        end_intv = Period('2006-12-31', ('w', 1))
+
+        msg = 'Start and end must have same freq'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            PeriodIndex(start=start, end=end_intv)
+
+        msg = 'Must specify 2 of start, end, periods'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            PeriodIndex(start=start)
+
     def test_recreate_from_data(self):
         for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']:
             org = PeriodIndex(start='2001/04/01', freq=o, periods=1)
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
new file mode 100644
index 0000000000000..8d9e26406defc
--- /dev/null
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -0,0 +1,317 @@
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+from pandas.util import testing as tm
+from pandas.compat import lrange
+from pandas import (PeriodIndex, Series, DatetimeIndex,
+                    period_range, Period, tslib, _np_version_under1p9)
+
+
+class TestGetItem(tm.TestCase):
+
+    def setUp(self):
+        pass
+
+    def test_getitem(self):
+        idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D',
+                               name='idx')
+
+        for idx in [idx1]:
+            result = idx[0]
+            self.assertEqual(result, pd.Period('2011-01-01', freq='D'))
+
+            result = idx[-1]
+            self.assertEqual(result, pd.Period('2011-01-31', freq='D'))
+
+            result = idx[0:5]
+            expected = pd.period_range('2011-01-01', '2011-01-05', freq='D',
+                                       name='idx')
+            self.assert_index_equal(result, expected)
+            self.assertEqual(result.freq, expected.freq)
+            self.assertEqual(result.freq, 'D')
+
+            result = idx[0:10:2]
+            expected = pd.PeriodIndex(['2011-01-01', '2011-01-03',
+                                       '2011-01-05',
+                                       '2011-01-07', '2011-01-09'],
+                                      freq='D', name='idx')
+            self.assert_index_equal(result, expected)
+            self.assertEqual(result.freq, expected.freq)
+            self.assertEqual(result.freq, 'D')
+
+            result = idx[-20:-5:3]
+            expected = pd.PeriodIndex(['2011-01-12', '2011-01-15',
+                                       '2011-01-18',
+ '2011-01-21', '2011-01-24'], + freq='D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.freq, 'D') + + result = idx[4::-1] + expected = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-03', + '2011-01-02', '2011-01-01'], + freq='D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.freq, 'D') + + def test_getitem_index(self): + idx = period_range('2007-01', periods=10, freq='M', name='x') + + result = idx[[1, 3, 5]] + exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], + freq='M', name='x') + tm.assert_index_equal(result, exp) + + result = idx[[True, True, False, False, False, + True, True, False, False, False]] + exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], + freq='M', name='x') + tm.assert_index_equal(result, exp) + + def test_getitem_partial(self): + rng = period_range('2007-01', periods=50, freq='M') + ts = Series(np.random.randn(len(rng)), rng) + + self.assertRaises(KeyError, ts.__getitem__, '2006') + + result = ts['2008'] + self.assertTrue((result.index.year == 2008).all()) + + result = ts['2008':'2009'] + self.assertEqual(len(result), 24) + + result = ts['2008-1':'2009-12'] + self.assertEqual(len(result), 24) + + result = ts['2008Q1':'2009Q4'] + self.assertEqual(len(result), 24) + + result = ts[:'2009'] + self.assertEqual(len(result), 36) + + result = ts['2009':] + self.assertEqual(len(result), 50 - 24) + + exp = result + result = ts[24:] + tm.assert_series_equal(exp, result) + + ts = ts[10:].append(ts[10:]) + self.assertRaisesRegexp(KeyError, + "left slice bound for non-unique " + "label: '2008'", + ts.__getitem__, slice('2008', '2009')) + + def test_getitem_datetime(self): + rng = period_range(start='2012-01-01', periods=10, freq='W-MON') + ts = Series(lrange(len(rng)), index=rng) + + dt1 = datetime(2011, 10, 2) + dt4 = datetime(2012, 4, 20) + + rs = ts[dt1:dt4] + tm.assert_series_equal(rs, ts) + + def test_getitem_nat(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) + self.assertIs(idx[1], tslib.NaT) + + s = pd.Series([0, 1, 2], index=idx) + self.assertEqual(s[pd.NaT], 1) + + s = pd.Series(idx, index=idx) + self.assertEqual(s[pd.Period('2011-01', freq='M')], + pd.Period('2011-01', freq='M')) + self.assertIs(s[pd.NaT], tslib.NaT) + + def test_getitem_list_periods(self): + # GH 7710 + rng = period_range(start='2012-01-01', periods=10, freq='D') + ts = Series(lrange(len(rng)), index=rng) + exp = ts.iloc[[1]] + tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) + + def test_getitem_seconds(self): + # GH 6716 + didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S', + periods=4000) + pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', + '2013/02/01 09:00'] + for v in values: + if _np_version_under1p9: + with tm.assertRaises(ValueError): + idx[v] + else: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with tm.assertRaises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) + tm.assert_series_equal(s['2013/01/01 9H'], s[:3600]) + for d in ['2013/01/01', '2013/01', '2013']: + 
tm.assert_series_equal(s[d], s) + + def test_getitem_day(self): + # GH 6716 + # Confirm DatetimeIndex and PeriodIndex works identically + didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) + pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', + '2013/02/01 09:00'] + for v in values: + + if _np_version_under1p9: + with tm.assertRaises(ValueError): + idx[v] + else: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with tm.assertRaises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + tm.assert_series_equal(s['2013/01'], s[0:31]) + tm.assert_series_equal(s['2013/02'], s[31:59]) + tm.assert_series_equal(s['2014'], s[365:]) + + invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + for v in invalid: + with tm.assertRaises(KeyError): + s[v] + + +class TestIndexing(tm.TestCase): + + def test_get_loc_msg(self): + idx = period_range('2000-1-1', freq='A', periods=10) + bad_period = Period('2012', 'A') + self.assertRaises(KeyError, idx.get_loc, bad_period) + + try: + idx.get_loc(bad_period) + except KeyError as inst: + self.assertEqual(inst.args[0], bad_period) + + def test_get_loc_nat(self): + didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) + pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + + # check DatetimeIndex compat + for idx in [didx, pidx]: + self.assertEqual(idx.get_loc(pd.NaT), 1) + self.assertEqual(idx.get_loc(None), 1) + self.assertEqual(idx.get_loc(float('nan')), 1) + self.assertEqual(idx.get_loc(np.nan), 1) + + def test_take(self): + # GH 10295 + idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', + name='idx') + + for idx in [idx1]: + result = idx.take([0]) + self.assertEqual(result, pd.Period('2011-01-01', freq='D')) + + result = idx.take([5]) + self.assertEqual(result, pd.Period('2011-01-06', freq='D')) + + result = idx.take([0, 1, 2]) + expected = pd.period_range('2011-01-01', '2011-01-03', freq='D', + name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, 'D') + self.assertEqual(result.freq, expected.freq) + + result = idx.take([0, 2, 4]) + expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', + '2011-01-05'], freq='D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.freq, 'D') + + result = idx.take([7, 4, 1]) + expected = pd.PeriodIndex(['2011-01-08', '2011-01-05', + '2011-01-02'], + freq='D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.freq, 'D') + + result = idx.take([3, 2, 5]) + expected = PeriodIndex(['2011-01-04', '2011-01-03', '2011-01-06'], + freq='D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.freq, 'D') + + result = idx.take([-3, 2, 5]) + expected = PeriodIndex(['2011-01-29', '2011-01-03', '2011-01-06'], + freq='D', name='idx') + self.assert_index_equal(result, expected) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.freq, 'D') + + def test_take_misc(self): + index = PeriodIndex(start='1/1/10', end='12/31/12', freq='D', + name='idx') + expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7), + datetime(2010, 1, 9), datetime(2010, 1, 13)], + freq='D', name='idx') + + taken1 = 
index.take([5, 6, 8, 12]) + taken2 = index[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + tm.assertIsInstance(taken, PeriodIndex) + self.assertEqual(taken.freq, index.freq) + self.assertEqual(taken.name, expected.name) + + def test_take_fill_value(self): + # GH 12631 + idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + name='xxx', freq='D') + result = idx.take(np.array([1, 0, -1])) + expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', freq='D') + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'], + name='xxx', freq='D') + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', freq='D') + tm.assert_index_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + idx.take(np.array([1, -5])) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 70759e8659c25..82a881d7c65bc 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,14 +1,12 @@ import numpy as np -from datetime import timedelta, datetime +from datetime import timedelta import pandas as pd import pandas.tslib as tslib import pandas.util.testing as tm import pandas.tseries.period as period -from pandas.compat import lrange from pandas import (DatetimeIndex, PeriodIndex, period_range, Series, Period, - _np_version_under1p10, Index, Timedelta, offsets, - _np_version_under1p9) + _np_version_under1p10, Index, Timedelta, offsets) from pandas.tests.test_base import Ops @@ -285,57 +283,6 @@ def test_resolution(self): idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) self.assertEqual(idx.resolution, expected) - def test_union(self): - # union - rng1 = pd.period_range('1/1/2000', freq='D', periods=5) - other1 = pd.period_range('1/6/2000', freq='D', periods=5) - expected1 = pd.period_range('1/1/2000', freq='D', periods=10) - - rng2 = pd.period_range('1/1/2000', freq='D', periods=5) - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.period_range('1/1/2000', freq='D', periods=8) - - rng3 = pd.period_range('1/1/2000', freq='D', periods=5) - other3 = pd.PeriodIndex([], freq='D') - expected3 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) - expected4 = pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00', - '2000-01-01 11:00', '2000-01-01 12:00', - '2000-01-01 13:00', '2000-01-02 09:00', - '2000-01-02 10:00', '2000-01-02 11:00', - '2000-01-02 12:00', '2000-01-02 13:00'], - freq='H') - - rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05' - '2000-01-01 09:08'], - freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05', '2000-01-01 09:08'], - 
freq='T') - - rng6 = pd.period_range('2000-01-01', freq='M', periods=7) - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.period_range('2000-01-01', freq='M', periods=10) - - rng7 = pd.period_range('2003-01-01', freq='A', periods=5) - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.period_range('1998-01-01', freq='A', periods=10) - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3), (rng4, other4, - expected4), - (rng5, other5, expected5), (rng6, other6, - expected6), - (rng7, other7, expected7)]: - - result_union = rng.union(other) - tm.assert_index_equal(result_union, expected) - def test_add_iadd(self): rng = pd.period_range('1/1/2000', freq='D', periods=5) other = pd.period_range('1/6/2000', freq='D', periods=5) @@ -432,48 +379,6 @@ def test_add_iadd(self): rng += 1 tm.assert_index_equal(rng, expected) - def test_difference(self): - # diff - rng1 = pd.period_range('1/1/2000', freq='D', periods=5) - other1 = pd.period_range('1/6/2000', freq='D', periods=5) - expected1 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng2 = pd.period_range('1/1/2000', freq='D', periods=5) - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.period_range('1/1/2000', freq='D', periods=3) - - rng3 = pd.period_range('1/1/2000', freq='D', periods=5) - other3 = pd.PeriodIndex([], freq='D') - expected3 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) - expected4 = rng4 - - rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex( - ['2000-01-01 09:01', '2000-01-01 09:05'], freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T') - - rng6 = pd.period_range('2000-01-01', freq='M', periods=7) - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.period_range('2000-01-01', freq='M', periods=3) - - rng7 = pd.period_range('2003-01-01', freq='A', periods=5) - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.period_range('2006-01-01', freq='A', periods=2) - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3), - (rng4, other4, expected4), - (rng5, other5, expected5), - (rng6, other6, expected6), - (rng7, other7, expected7), ]: - result_union = rng.difference(other) - tm.assert_index_equal(result_union, expected) - def test_sub(self): rng = period_range('2007-01', periods=50) @@ -833,98 +738,6 @@ def test_order(self): self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertEqual(ordered.freq, 'D') - def test_getitem(self): - idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') - - for idx in [idx1]: - result = idx[0] - self.assertEqual(result, pd.Period('2011-01-01', freq='D')) - - result = idx[-1] - self.assertEqual(result, pd.Period('2011-01-31', freq='D')) - - result = idx[0:5] - expected = pd.period_range('2011-01-01', '2011-01-05', freq='D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - result = idx[0:10:2] - expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', - '2011-01-05', - '2011-01-07', '2011-01-09'], - freq='D', name='idx') - self.assert_index_equal(result, expected) - 
self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - result = idx[-20:-5:3] - expected = pd.PeriodIndex(['2011-01-12', '2011-01-15', - '2011-01-18', - '2011-01-21', '2011-01-24'], - freq='D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - result = idx[4::-1] - expected = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-03', - '2011-01-02', '2011-01-01'], - freq='D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - def test_take(self): - # GH 10295 - idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') - - for idx in [idx1]: - result = idx.take([0]) - self.assertEqual(result, pd.Period('2011-01-01', freq='D')) - - result = idx.take([5]) - self.assertEqual(result, pd.Period('2011-01-06', freq='D')) - - result = idx.take([0, 1, 2]) - expected = pd.period_range('2011-01-01', '2011-01-03', freq='D', - name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, 'D') - self.assertEqual(result.freq, expected.freq) - - result = idx.take([0, 2, 4]) - expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', - '2011-01-05'], freq='D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - result = idx.take([7, 4, 1]) - expected = pd.PeriodIndex(['2011-01-08', '2011-01-05', - '2011-01-02'], - freq='D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - result = idx.take([3, 2, 5]) - expected = PeriodIndex(['2011-01-04', '2011-01-03', '2011-01-06'], - freq='D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - - result = idx.take([-3, 2, 5]) - expected = PeriodIndex(['2011-01-29', '2011-01-03', '2011-01-06'], - freq='D', name='idx') - self.assert_index_equal(result, expected) - self.assertEqual(result.freq, expected.freq) - self.assertEqual(result.freq, 'D') - def test_nat_new(self): idx = pd.period_range('2011-01', freq='M', periods=5, name='x') @@ -1350,6 +1163,9 @@ def test_ops_series_period(self): tm.assert_series_equal(s2 - s, exp) tm.assert_series_equal(s - s2, -exp) + +class TestFramePeriod(tm.TestCase): + def test_ops_frame_period(self): # GH 13043 df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), @@ -1379,303 +1195,7 @@ def test_ops_frame_period(self): tm.assert_frame_equal(df - df2, -exp) -class TestPeriodIndex(tm.TestCase): - - def setUp(self): - pass - - def test_getitem_index(self): - idx = period_range('2007-01', periods=10, freq='M', name='x') - - result = idx[[1, 3, 5]] - exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], - freq='M', name='x') - tm.assert_index_equal(result, exp) - - result = idx[[True, True, False, False, False, - True, True, False, False, False]] - exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], - freq='M', name='x') - tm.assert_index_equal(result, exp) - - def test_getitem_partial(self): - rng = period_range('2007-01', periods=50, freq='M') - ts = Series(np.random.randn(len(rng)), rng) - - self.assertRaises(KeyError, ts.__getitem__, '2006') - - result = ts['2008'] - self.assertTrue((result.index.year == 2008).all()) - - result = ts['2008':'2009'] - self.assertEqual(len(result), 24) - - result = 
ts['2008-1':'2009-12'] - self.assertEqual(len(result), 24) - - result = ts['2008Q1':'2009Q4'] - self.assertEqual(len(result), 24) - - result = ts[:'2009'] - self.assertEqual(len(result), 36) - - result = ts['2009':] - self.assertEqual(len(result), 50 - 24) - - exp = result - result = ts[24:] - tm.assert_series_equal(exp, result) - - ts = ts[10:].append(ts[10:]) - self.assertRaisesRegexp(KeyError, - "left slice bound for non-unique " - "label: '2008'", - ts.__getitem__, slice('2008', '2009')) - - def test_getitem_datetime(self): - rng = period_range(start='2012-01-01', periods=10, freq='W-MON') - ts = Series(lrange(len(rng)), index=rng) - - dt1 = datetime(2011, 10, 2) - dt4 = datetime(2012, 4, 20) - - rs = ts[dt1:dt4] - tm.assert_series_equal(rs, ts) - - def test_getitem_nat(self): - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') - self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) - self.assertIs(idx[1], tslib.NaT) - - s = pd.Series([0, 1, 2], index=idx) - self.assertEqual(s[pd.NaT], 1) - - s = pd.Series(idx, index=idx) - self.assertEqual(s[pd.Period('2011-01', freq='M')], - pd.Period('2011-01', freq='M')) - self.assertIs(s[pd.NaT], tslib.NaT) - - def test_getitem_list_periods(self): - # GH 7710 - rng = period_range(start='2012-01-01', periods=10, freq='D') - ts = Series(lrange(len(rng)), index=rng) - exp = ts.iloc[[1]] - tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) - - def test_getitem_seconds(self): - # GH 6716 - didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S', - periods=4000) - pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) - - for idx in [didx, pidx]: - # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] - for v in values: - if _np_version_under1p9: - with tm.assertRaises(ValueError): - idx[v] - else: - # GH7116 - # these show deprecations as we are trying - # to slice with non-integer indexers - # with tm.assertRaises(IndexError): - # idx[v] - continue - - s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) - tm.assert_series_equal(s['2013/01/01 9H'], s[:3600]) - for d in ['2013/01/01', '2013/01', '2013']: - tm.assert_series_equal(s[d], s) - - def test_getitem_day(self): - # GH 6716 - # Confirm DatetimeIndex and PeriodIndex works identically - didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) - pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) - - for idx in [didx, pidx]: - # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] - for v in values: - - if _np_version_under1p9: - with tm.assertRaises(ValueError): - idx[v] - else: - # GH7116 - # these show deprecations as we are trying - # to slice with non-integer indexers - # with tm.assertRaises(IndexError): - # idx[v] - continue - - s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01'], s[0:31]) - tm.assert_series_equal(s['2013/02'], s[31:59]) - tm.assert_series_equal(s['2014'], s[365:]) - - invalid = ['2013/02/01 9H', '2013/02/01 09:00'] - for v in invalid: - with tm.assertRaises(KeyError): - s[v] - - def test_take(self): - index = PeriodIndex(start='1/1/10', end='12/31/12', freq='D', - name='idx') - expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7), - datetime(2010, 1, 9), datetime(2010, 1, 13)], - freq='D', name='idx') - - taken1 = index.take([5, 6, 8, 12]) - taken2 = 
index[[5, 6, 8, 12]] - - for taken in [taken1, taken2]: - tm.assert_index_equal(taken, expected) - tm.assertIsInstance(taken, PeriodIndex) - self.assertEqual(taken.freq, index.freq) - self.assertEqual(taken.name, expected.name) - - def test_take_fill_value(self): - # GH 12631 - idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx', freq='D') - result = idx.take(np.array([1, 0, -1])) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', freq='D') - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', freq='D') - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', freq='D') - tm.assert_index_equal(result, expected) - - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with tm.assertRaisesRegexp(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with tm.assertRaises(IndexError): - idx.take(np.array([1, -5])) - - def test_get_loc_msg(self): - idx = period_range('2000-1-1', freq='A', periods=10) - bad_period = Period('2012', 'A') - self.assertRaises(KeyError, idx.get_loc, bad_period) - - try: - idx.get_loc(bad_period) - except KeyError as inst: - self.assertEqual(inst.args[0], bad_period) - - def test_get_loc_nat(self): - didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) - pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') - - # check DatetimeIndex compat - for idx in [didx, pidx]: - self.assertEqual(idx.get_loc(pd.NaT), 1) - self.assertEqual(idx.get_loc(None), 1) - self.assertEqual(idx.get_loc(float('nan')), 1) - self.assertEqual(idx.get_loc(np.nan), 1) - - -class TestComparisons(tm.TestCase): - - def setUp(self): - self.january1 = Period('2000-01', 'M') - self.january2 = Period('2000-01', 'M') - self.february = Period('2000-02', 'M') - self.march = Period('2000-03', 'M') - self.day = Period('2012-01-01', 'D') - - def test_equal(self): - self.assertEqual(self.january1, self.january2) - - def test_equal_Raises_Value(self): - with tm.assertRaises(period.IncompatibleFrequency): - self.january1 == self.day - - def test_notEqual(self): - self.assertNotEqual(self.january1, 1) - self.assertNotEqual(self.january1, self.february) - - def test_greater(self): - self.assertTrue(self.february > self.january1) - - def test_greater_Raises_Value(self): - with tm.assertRaises(period.IncompatibleFrequency): - self.january1 > self.day - - def test_greater_Raises_Type(self): - with tm.assertRaises(TypeError): - self.january1 > 1 - - def test_greaterEqual(self): - self.assertTrue(self.january1 >= self.january2) - - def test_greaterEqual_Raises_Value(self): - with tm.assertRaises(period.IncompatibleFrequency): - self.january1 >= self.day - - with tm.assertRaises(TypeError): - print(self.january1 >= 1) - - def test_smallerEqual(self): - self.assertTrue(self.january1 <= self.january2) - - def test_smallerEqual_Raises_Value(self): - with tm.assertRaises(period.IncompatibleFrequency): - self.january1 <= self.day - - def test_smallerEqual_Raises_Type(self): - with tm.assertRaises(TypeError): - self.january1 <= 1 - - def test_smaller(self): - self.assertTrue(self.january1 < 
self.february)
-
-    def test_smaller_Raises_Value(self):
-        with tm.assertRaises(period.IncompatibleFrequency):
-            self.january1 < self.day
-
-    def test_smaller_Raises_Type(self):
-        with tm.assertRaises(TypeError):
-            self.january1 < 1
-
-    def test_sort(self):
-        periods = [self.march, self.january1, self.february]
-        correctPeriods = [self.january1, self.february, self.march]
-        self.assertEqual(sorted(periods), correctPeriods)
-
-    def test_period_nat_comp(self):
-        p_nat = Period('NaT', freq='D')
-        p = Period('2011-01-01', freq='D')
-
-        nat = pd.Timestamp('NaT')
-        t = pd.Timestamp('2011-01-01')
-        # confirm Period('NaT') work identical with Timestamp('NaT')
-        for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t),
-                            (t, nat), (nat, nat)]:
-            self.assertEqual(left < right, False)
-            self.assertEqual(left > right, False)
-            self.assertEqual(left == right, False)
-            self.assertEqual(left != right, True)
-            self.assertEqual(left <= right, False)
-            self.assertEqual(left >= right, False)
+class TestPeriodIndexComparisons(tm.TestCase):
 
     def test_pi_pi_comp(self):
diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py
index 06e15f9175ed8..d4f06bae8bc32 100644
--- a/pandas/tests/indexes/period/test_setops.py
+++ b/pandas/tests/indexes/period/test_setops.py
@@ -43,6 +43,57 @@ def test_join_does_not_recur(self):
         tm.assert_index_equal(res, expected)
 
     def test_union(self):
+        # union
+        rng1 = pd.period_range('1/1/2000', freq='D', periods=5)
+        other1 = pd.period_range('1/6/2000', freq='D', periods=5)
+        expected1 = pd.period_range('1/1/2000', freq='D', periods=10)
+
+        rng2 = pd.period_range('1/1/2000', freq='D', periods=5)
+        other2 = pd.period_range('1/4/2000', freq='D', periods=5)
+        expected2 = pd.period_range('1/1/2000', freq='D', periods=8)
+
+        rng3 = pd.period_range('1/1/2000', freq='D', periods=5)
+        other3 = pd.PeriodIndex([], freq='D')
+        expected3 = pd.period_range('1/1/2000', freq='D', periods=5)
+
+        rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5)
+        other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5)
+        expected4 = pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00',
+                                    '2000-01-01 11:00', '2000-01-01 12:00',
+                                    '2000-01-01 13:00', '2000-01-02 09:00',
+                                    '2000-01-02 10:00', '2000-01-02 11:00',
+                                    '2000-01-02 12:00', '2000-01-02 13:00'],
+                                   freq='H')
+
+        rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03',
+                               '2000-01-01 09:05'], freq='T')
+        other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05',
+                                 '2000-01-01 09:08'],
+                                freq='T')
+        expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03',
+                                    '2000-01-01 09:05', '2000-01-01 09:08'],
+                                   freq='T')
+
+        rng6 = pd.period_range('2000-01-01', freq='M', periods=7)
+        other6 = pd.period_range('2000-04-01', freq='M', periods=7)
+        expected6 = pd.period_range('2000-01-01', freq='M', periods=10)
+
+        rng7 = pd.period_range('2003-01-01', freq='A', periods=5)
+        other7 = pd.period_range('1998-01-01', freq='A', periods=8)
+        expected7 = pd.period_range('1998-01-01', freq='A', periods=10)
+
+        for rng, other, expected in [(rng1, other1, expected1),
+                                     (rng2, other2, expected2),
+                                     (rng3, other3, expected3), (rng4, other4,
+                                                                 expected4),
+                                     (rng5, other5, expected5), (rng6, other6,
+                                                                 expected6),
+                                     (rng7, other7, expected7)]:
+
+            result_union = rng.union(other)
+            tm.assert_index_equal(result_union, expected)
+
+    def test_union_misc(self):
         index = period_range('1/1/2000', '1/20/2000', freq='D')
 
         result = index[:-5].union(index[10:])
@@ -155,3 +206,45 @@ def test_intersection_cases(self):
         result = rng.intersection(rng[0:0])
 
         self.assertEqual(len(result), 0)
+
+    def test_difference(self):
+        # diff
+        rng1 = pd.period_range('1/1/2000', freq='D', periods=5)
+        other1 = pd.period_range('1/6/2000', freq='D', periods=5)
+        expected1 = pd.period_range('1/1/2000', freq='D', periods=5)
+
+        rng2 = pd.period_range('1/1/2000', freq='D', periods=5)
+        other2 = pd.period_range('1/4/2000', freq='D', periods=5)
+        expected2 = pd.period_range('1/1/2000', freq='D', periods=3)
+
+        rng3 = pd.period_range('1/1/2000', freq='D', periods=5)
+        other3 = pd.PeriodIndex([], freq='D')
+        expected3 = pd.period_range('1/1/2000', freq='D', periods=5)
+
+        rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5)
+        other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5)
+        expected4 = rng4
+
+        rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03',
+                               '2000-01-01 09:05'], freq='T')
+        other5 = pd.PeriodIndex(
+            ['2000-01-01 09:01', '2000-01-01 09:05'], freq='T')
+        expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T')
+
+        rng6 = pd.period_range('2000-01-01', freq='M', periods=7)
+        other6 = pd.period_range('2000-04-01', freq='M', periods=7)
+        expected6 = pd.period_range('2000-01-01', freq='M', periods=3)
+
+        rng7 = pd.period_range('2003-01-01', freq='A', periods=5)
+        other7 = pd.period_range('1998-01-01', freq='A', periods=8)
+        expected7 = pd.period_range('2006-01-01', freq='A', periods=2)
+
+        for rng, other, expected in [(rng1, other1, expected1),
+                                     (rng2, other2, expected2),
+                                     (rng3, other3, expected3),
+                                     (rng4, other4, expected4),
+                                     (rng5, other5, expected5),
+                                     (rng6, other6, expected6),
+                                     (rng7, other7, expected7), ]:
+            result_difference = rng.difference(other)
+            tm.assert_index_equal(result_difference, expected)
diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py
index c94a7c62a6dc9..ffe00a4a62a0a 100644
--- a/pandas/tests/scalar/test_period.py
+++ b/pandas/tests/scalar/test_period.py
@@ -7,12 +7,28 @@
 from pandas.compat import text_type, iteritems
 from pandas.compat.numpy import np_datetime64_compat
 from pandas import Period, Timestamp, tslib, offsets, _period
-from pandas.tseries.frequencies import DAYS, MONTHS, _period_code_map
+from pandas.tseries.frequencies import DAYS, MONTHS
 
 
 class TestPeriodProperties(tm.TestCase):
     "Test properties such as year, month, weekday, etc...."
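+
+    # The is_leap_year cases below follow the Gregorian rule: a year
+    # divisible by 4 is a leap year, except century years, which are
+    # leap years only when divisible by 400. Hence 2000 and 2004 are
+    # leap years, while 1999 and 2100 are not, whatever the Period's
+    # frequency.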
+ def test_is_leap_year(self): + # GH 13727 + for freq in ['A', 'M', 'D', 'H']: + p = Period('2000-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + self.assertIsInstance(p.is_leap_year, bool) + + p = Period('1999-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + + p = Period('2004-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + + p = Period('2100-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + def test_quarterly_negative_ordinals(self): p = Period(ordinal=-1, freq='Q-DEC') self.assertEqual(p.year, 1969) @@ -273,7 +289,7 @@ def test_timestamp_mult(self): self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-03-31')) - def test_period_constructor(self): + def test_construction(self): i1 = Period('1/1/2005', freq='M') i2 = Period('Jan 2005') @@ -299,6 +315,41 @@ def test_period_constructor(self): self.assertEqual(i1, i2) self.assertEqual(i1, i3) + i1 = Period('1982', freq='min') + i2 = Period('1982', freq='MIN') + self.assertEqual(i1, i2) + i2 = Period('1982', freq=('Min', 1)) + self.assertEqual(i1, i2) + + i1 = Period(year=2005, month=3, day=1, freq='D') + i2 = Period('3/1/2005', freq='D') + self.assertEqual(i1, i2) + + i3 = Period(year=2005, month=3, day=1, freq='d') + self.assertEqual(i1, i3) + + i1 = Period('2007-01-01 09:00:00.001') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + self.assertEqual(i1, expected) + + expected = Period(np_datetime64_compat( + '2007-01-01 09:00:00.001Z'), freq='L') + self.assertEqual(i1, expected) + + i1 = Period('2007-01-01 09:00:00.00101') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + self.assertEqual(i1, expected) + + expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), + freq='U') + self.assertEqual(i1, expected) + + self.assertRaises(ValueError, Period, ordinal=200701) + + self.assertRaises(ValueError, Period, '2007-1-1', freq='X') + + def test_construction_bday(self): + # Biz day construction, roll forward if non-weekday i1 = Period('3/10/12', freq='B') i2 = Period('3/10/12', freq='D') @@ -311,6 +362,12 @@ def test_period_constructor(self): i3 = Period('3/10/12', freq='b') self.assertEqual(i1, i3) + i1 = Period(year=2012, month=3, day=10, freq='B') + i2 = Period('3/12/12', freq='B') + self.assertEqual(i1, i2) + + def test_construction_quarter(self): + i1 = Period(year=2005, quarter=1, freq='Q') i2 = Period('1/1/2005', freq='Q') self.assertEqual(i1, i2) @@ -319,17 +376,6 @@ def test_period_constructor(self): i2 = Period('9/1/2005', freq='Q') self.assertEqual(i1, i2) - i1 = Period(year=2005, month=3, day=1, freq='D') - i2 = Period('3/1/2005', freq='D') - self.assertEqual(i1, i2) - - i3 = Period(year=2005, month=3, day=1, freq='d') - self.assertEqual(i1, i3) - - i1 = Period(year=2012, month=3, day=10, freq='B') - i2 = Period('3/12/12', freq='B') - self.assertEqual(i1, i2) - i1 = Period('2005Q1') i2 = Period(year=2005, quarter=1, freq='Q') i3 = Period('2005q1') @@ -356,11 +402,7 @@ def test_period_constructor(self): lower = Period('4q1984') self.assertEqual(i1, lower) - i1 = Period('1982', freq='min') - i2 = Period('1982', freq='MIN') - self.assertEqual(i1, i2) - i2 = Period('1982', freq=('Min', 1)) - self.assertEqual(i1, i2) + def test_construction_month(self): expected = Period('2007-01', freq='M') i1 = Period('200701', freq='M') @@ -389,26 +431,6 @@ def test_period_constructor(self): self.assertEqual(i1, i4) self.assertEqual(i1, i5) - i1 = 
Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') - self.assertEqual(i1, expected) - - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') - self.assertEqual(i1, expected) - - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') - self.assertEqual(i1, expected) - - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') - self.assertEqual(i1, expected) - - self.assertRaises(ValueError, Period, ordinal=200701) - - self.assertRaises(ValueError, Period, '2007-1-1', freq='X') - def test_period_constructor_offsets(self): self.assertEqual(Period('1/1/2005', freq=offsets.MonthEnd()), Period('1/1/2005', freq='M')) @@ -894,21 +916,6 @@ def test_constructor_infer_freq(self): p = Period('2007-01-01 07:10:15.123400') self.assertEqual(p.freq, 'U') - def test_asfreq_MS(self): - initial = Period("2013") - - self.assertEqual(initial.asfreq(freq="M", how="S"), - Period('2013-01', 'M')) - - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - initial.asfreq(freq="MS", how="S") - - with tm.assertRaisesRegexp(ValueError, msg): - pd.Period('2013-01', 'MS') - - self.assertTrue(_period_code_map.get("MS") is None) - def test_badinput(self): self.assertRaises(ValueError, Period, '-2000', 'A') self.assertRaises(tslib.DateParseError, Period, '0', 'A') @@ -945,722 +952,89 @@ def test_get_period_field_array_raises_on_out_of_range(self): np.empty(1), 0) -class TestFreqConversion(tm.TestCase): - "Test frequency conversion of date objects" - - def test_asfreq_corner(self): - val = Period(freq='A', year=2007) - result1 = val.asfreq('5t') - result2 = val.asfreq('t') - expected = Period('2007-12-31 23:59', freq='t') - self.assertEqual(result1.ordinal, expected.ordinal) - self.assertEqual(result1.freqstr, '5T') - self.assertEqual(result2.ordinal, expected.ordinal) - self.assertEqual(result2.freqstr, 'T') - - def test_conv_annual(self): - # frequency conversion tests: from Annual Frequency - - ival_A = Period(freq='A', year=2007) - - ival_AJAN = Period(freq="A-JAN", year=2007) - ival_AJUN = Period(freq="A-JUN", year=2007) - ival_ANOV = Period(freq="A-NOV", year=2007) - - ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) - ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) - ival_A_to_M_start = Period(freq='M', year=2007, month=1) - ival_A_to_M_end = Period(freq='M', year=2007, month=12) - ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31) - ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) - ival_A_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - - ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_AJAN_to_D_start = Period(freq='D', 
year=2006, month=2, day=1) - ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) - ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) - ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) - - self.assertEqual(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) - self.assertEqual(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) - self.assertEqual(ival_A.asfreq('M', 's'), ival_A_to_M_start) - self.assertEqual(ival_A.asfreq('M', 'E'), ival_A_to_M_end) - self.assertEqual(ival_A.asfreq('W', 'S'), ival_A_to_W_start) - self.assertEqual(ival_A.asfreq('W', 'E'), ival_A_to_W_end) - self.assertEqual(ival_A.asfreq('B', 'S'), ival_A_to_B_start) - self.assertEqual(ival_A.asfreq('B', 'E'), ival_A_to_B_end) - self.assertEqual(ival_A.asfreq('D', 'S'), ival_A_to_D_start) - self.assertEqual(ival_A.asfreq('D', 'E'), ival_A_to_D_end) - self.assertEqual(ival_A.asfreq('H', 'S'), ival_A_to_H_start) - self.assertEqual(ival_A.asfreq('H', 'E'), ival_A_to_H_end) - self.assertEqual(ival_A.asfreq('min', 'S'), ival_A_to_T_start) - self.assertEqual(ival_A.asfreq('min', 'E'), ival_A_to_T_end) - self.assertEqual(ival_A.asfreq('T', 'S'), ival_A_to_T_start) - self.assertEqual(ival_A.asfreq('T', 'E'), ival_A_to_T_end) - self.assertEqual(ival_A.asfreq('S', 'S'), ival_A_to_S_start) - self.assertEqual(ival_A.asfreq('S', 'E'), ival_A_to_S_end) - - self.assertEqual(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) - self.assertEqual(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) - - self.assertEqual(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) - self.assertEqual(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) - - self.assertEqual(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) - self.assertEqual(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) - - self.assertEqual(ival_A.asfreq('A'), ival_A) - - def test_conv_quarterly(self): - # frequency conversion tests: from Quarterly Frequency - - ival_Q = Period(freq='Q', year=2007, quarter=1) - ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) - - ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) - ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - - ival_Q_to_A = Period(freq='A', year=2007) - ival_Q_to_M_start = Period(freq='M', year=2007, month=1) - ival_Q_to_M_end = Period(freq='M', year=2007, month=3) - ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31) - ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) - ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23) - ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23, - minute=59, second=59) - - ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) - - ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_QEJUN_to_D_end = Period(freq='D', year=2006, 
month=9, day=30) - - self.assertEqual(ival_Q.asfreq('A'), ival_Q_to_A) - self.assertEqual(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) - - self.assertEqual(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) - self.assertEqual(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) - self.assertEqual(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) - self.assertEqual(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) - self.assertEqual(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) - self.assertEqual(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) - self.assertEqual(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) - self.assertEqual(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) - self.assertEqual(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) - self.assertEqual(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) - self.assertEqual(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) - self.assertEqual(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) - self.assertEqual(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) - self.assertEqual(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) - - self.assertEqual(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) - self.assertEqual(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) - self.assertEqual(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) - self.assertEqual(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) - - self.assertEqual(ival_Q.asfreq('Q'), ival_Q) - - def test_conv_monthly(self): - # frequency conversion tests: from Monthly Frequency - - ival_M = Period(freq='M', year=2007, month=1) - ival_M_end_of_year = Period(freq='M', year=2007, month=12) - ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) - ival_M_to_A = Period(freq='A', year=2007) - ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31) - ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) - ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23) - ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, - minute=59, second=59) - - self.assertEqual(ival_M.asfreq('A'), ival_M_to_A) - self.assertEqual(ival_M_end_of_year.asfreq('A'), ival_M_to_A) - self.assertEqual(ival_M.asfreq('Q'), ival_M_to_Q) - self.assertEqual(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) - - self.assertEqual(ival_M.asfreq('W', 'S'), ival_M_to_W_start) - self.assertEqual(ival_M.asfreq('W', 'E'), ival_M_to_W_end) - self.assertEqual(ival_M.asfreq('B', 'S'), ival_M_to_B_start) - self.assertEqual(ival_M.asfreq('B', 'E'), ival_M_to_B_end) - self.assertEqual(ival_M.asfreq('D', 'S'), ival_M_to_D_start) - self.assertEqual(ival_M.asfreq('D', 'E'), ival_M_to_D_end) - self.assertEqual(ival_M.asfreq('H', 'S'), ival_M_to_H_start) - self.assertEqual(ival_M.asfreq('H', 'E'), ival_M_to_H_end) - self.assertEqual(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) - self.assertEqual(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) - self.assertEqual(ival_M.asfreq('S', 'S'), ival_M_to_S_start) - 
self.assertEqual(ival_M.asfreq('S', 'E'), ival_M_to_S_end) - - self.assertEqual(ival_M.asfreq('M'), ival_M) - - def test_conv_weekly(self): - # frequency conversion tests: from Weekly Frequency - ival_W = Period(freq='W', year=2007, month=1, day=1) - - ival_WSUN = Period(freq='W', year=2007, month=1, day=7) - ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6) - ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5) - ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4) - ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3) - ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2) - ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31) - ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31) - ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) - else: - ival_W_to_A_end_of_year = Period(freq='A', year=2008) - - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) - else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) - - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) - else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) - self.assertEqual(ival_W_end_of_year.asfreq('A'), - ival_W_to_A_end_of_year) - 
self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) - self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) - self.assertEqual(ival_W_end_of_month.asfreq('M'), - ival_W_to_M_end_of_month) - - self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - self.assertEqual(ival_W.asfreq('W'), ival_W) +class TestComparisons(tm.TestCase): - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - ival_W.asfreq('WK') + def setUp(self): + self.january1 = Period('2000-01', 'M') + self.january2 = Period('2000-01', 'M') + self.february = Period('2000-02', 'M') + self.march = Period('2000-03', 'M') + self.day = Period('2012-01-01', 'D') - def test_conv_weekly_legacy(self): - # frequency conversion tests: from Weekly Frequency - msg = pd.tseries.frequencies._INVALID_FREQ_ERROR - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK', year=2007, month=1, day=1) + def test_equal(self): + self.assertEqual(self.january1, self.january2) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-SAT', year=2007, month=1, day=6) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-FRI', year=2007, month=1, day=5) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-THU', year=2007, month=1, day=4) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-WED', year=2007, month=1, day=3) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-TUE', year=2007, month=1, day=2) - with self.assertRaisesRegexp(ValueError, msg): - Period(freq='WK-MON', year=2007, month=1, day=1) - - def test_conv_business(self): - # frequency conversion tests: from Business Frequency" - - ival_B = Period(freq='B', year=2007, month=1, day=1) - ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) - ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) - ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) - 
ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) - - ival_B_to_A = Period(freq='A', year=2007) - ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_B_to_M = Period(freq='M', year=2007, month=1) - ival_B_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - self.assertEqual(ival_B.asfreq('A'), ival_B_to_A) - self.assertEqual(ival_B_end_of_year.asfreq('A'), ival_B_to_A) - self.assertEqual(ival_B.asfreq('Q'), ival_B_to_Q) - self.assertEqual(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) - self.assertEqual(ival_B.asfreq('M'), ival_B_to_M) - self.assertEqual(ival_B_end_of_month.asfreq('M'), ival_B_to_M) - self.assertEqual(ival_B.asfreq('W'), ival_B_to_W) - self.assertEqual(ival_B_end_of_week.asfreq('W'), ival_B_to_W) - - self.assertEqual(ival_B.asfreq('D'), ival_B_to_D) - - self.assertEqual(ival_B.asfreq('H', 'S'), ival_B_to_H_start) - self.assertEqual(ival_B.asfreq('H', 'E'), ival_B_to_H_end) - self.assertEqual(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) - self.assertEqual(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) - self.assertEqual(ival_B.asfreq('S', 'S'), ival_B_to_S_start) - self.assertEqual(ival_B.asfreq('S', 'E'), ival_B_to_S_end) - - self.assertEqual(ival_B.asfreq('B'), ival_B) - - def test_conv_daily(self): - # frequency conversion tests: from Business Frequency" - - ival_D = Period(freq='D', year=2007, month=1, day=1) - ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) - ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) - ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) - ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) - - ival_D_friday = Period(freq='D', year=2007, month=1, day=5) - ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) - ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) - - # TODO: unused? 
- # ival_D_monday = Period(freq='D', year=2007, month=1, day=8) - - ival_B_friday = Period(freq='B', year=2007, month=1, day=5) - ival_B_monday = Period(freq='B', year=2007, month=1, day=8) - - ival_D_to_A = Period(freq='A', year=2007) - - ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) - ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) - ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) - - ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) - ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) - ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) - - ival_D_to_M = Period(freq='M', year=2007, month=1) - ival_D_to_W = Period(freq='W', year=2007, month=1, day=7) - - ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - self.assertEqual(ival_D.asfreq('A'), ival_D_to_A) - - self.assertEqual(ival_D_end_of_quarter.asfreq('A-JAN'), - ival_Deoq_to_AJAN) - self.assertEqual(ival_D_end_of_quarter.asfreq('A-JUN'), - ival_Deoq_to_AJUN) - self.assertEqual(ival_D_end_of_quarter.asfreq('A-DEC'), - ival_Deoq_to_ADEC) - - self.assertEqual(ival_D_end_of_year.asfreq('A'), ival_D_to_A) - self.assertEqual(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) - self.assertEqual(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) - self.assertEqual(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) - self.assertEqual(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) - self.assertEqual(ival_D.asfreq('M'), ival_D_to_M) - self.assertEqual(ival_D_end_of_month.asfreq('M'), ival_D_to_M) - self.assertEqual(ival_D.asfreq('W'), ival_D_to_W) - self.assertEqual(ival_D_end_of_week.asfreq('W'), ival_D_to_W) - - self.assertEqual(ival_D_friday.asfreq('B'), ival_B_friday) - self.assertEqual(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) - self.assertEqual(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) - self.assertEqual(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) - self.assertEqual(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) - - self.assertEqual(ival_D.asfreq('H', 'S'), ival_D_to_H_start) - self.assertEqual(ival_D.asfreq('H', 'E'), ival_D_to_H_end) - self.assertEqual(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) - self.assertEqual(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) - self.assertEqual(ival_D.asfreq('S', 'S'), ival_D_to_S_start) - self.assertEqual(ival_D.asfreq('S', 'E'), ival_D_to_S_end) - - self.assertEqual(ival_D.asfreq('D'), ival_D) - - def test_conv_hourly(self): - # frequency conversion tests: from Hourly Frequency" - - ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, - hour=23) - ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, - hour=23) - ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, - hour=23) - ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, - hour=23) - ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, - hour=23) - - ival_H_to_A = Period(freq='A', year=2007) - ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) - 
ival_H_to_M = Period(freq='M', year=2007, month=1) - ival_H_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) - - ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=59) - ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=59, second=59) - - self.assertEqual(ival_H.asfreq('A'), ival_H_to_A) - self.assertEqual(ival_H_end_of_year.asfreq('A'), ival_H_to_A) - self.assertEqual(ival_H.asfreq('Q'), ival_H_to_Q) - self.assertEqual(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) - self.assertEqual(ival_H.asfreq('M'), ival_H_to_M) - self.assertEqual(ival_H_end_of_month.asfreq('M'), ival_H_to_M) - self.assertEqual(ival_H.asfreq('W'), ival_H_to_W) - self.assertEqual(ival_H_end_of_week.asfreq('W'), ival_H_to_W) - self.assertEqual(ival_H.asfreq('D'), ival_H_to_D) - self.assertEqual(ival_H_end_of_day.asfreq('D'), ival_H_to_D) - self.assertEqual(ival_H.asfreq('B'), ival_H_to_B) - self.assertEqual(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) - - self.assertEqual(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) - self.assertEqual(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) - self.assertEqual(ival_H.asfreq('S', 'S'), ival_H_to_S_start) - self.assertEqual(ival_H.asfreq('S', 'E'), ival_H_to_S_end) - - self.assertEqual(ival_H.asfreq('H'), ival_H) - - def test_conv_minutely(self): - # frequency conversion tests: from Minutely Frequency" - - ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=59) - - ival_T_to_A = Period(freq='A', year=2007) - ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_T_to_M = Period(freq='M', year=2007, month=1) - ival_T_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - - ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=59) - - self.assertEqual(ival_T.asfreq('A'), ival_T_to_A) - self.assertEqual(ival_T_end_of_year.asfreq('A'), ival_T_to_A) - self.assertEqual(ival_T.asfreq('Q'), ival_T_to_Q) - self.assertEqual(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) - self.assertEqual(ival_T.asfreq('M'), ival_T_to_M) - self.assertEqual(ival_T_end_of_month.asfreq('M'), ival_T_to_M) - self.assertEqual(ival_T.asfreq('W'), ival_T_to_W) - self.assertEqual(ival_T_end_of_week.asfreq('W'), ival_T_to_W) - self.assertEqual(ival_T.asfreq('D'), ival_T_to_D) - 
self.assertEqual(ival_T_end_of_day.asfreq('D'), ival_T_to_D) - self.assertEqual(ival_T.asfreq('B'), ival_T_to_B) - self.assertEqual(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) - self.assertEqual(ival_T.asfreq('H'), ival_T_to_H) - self.assertEqual(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) - - self.assertEqual(ival_T.asfreq('S', 'S'), ival_T_to_S_start) - self.assertEqual(ival_T.asfreq('S', 'E'), ival_T_to_S_end) - - self.assertEqual(ival_T.asfreq('Min'), ival_T) - - def test_conv_secondly(self): - # frequency conversion tests: from Secondly Frequency" - - ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, - second=0) - ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, - hour=23, minute=59, second=59) - ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, - hour=23, minute=59, second=59) - ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, - hour=23, minute=59, second=59) - ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=59, second=59) - ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=0, second=59) - - ival_S_to_A = Period(freq='A', year=2007) - ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_S_to_M = Period(freq='M', year=2007, month=1) - ival_S_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - - self.assertEqual(ival_S.asfreq('A'), ival_S_to_A) - self.assertEqual(ival_S_end_of_year.asfreq('A'), ival_S_to_A) - self.assertEqual(ival_S.asfreq('Q'), ival_S_to_Q) - self.assertEqual(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) - self.assertEqual(ival_S.asfreq('M'), ival_S_to_M) - self.assertEqual(ival_S_end_of_month.asfreq('M'), ival_S_to_M) - self.assertEqual(ival_S.asfreq('W'), ival_S_to_W) - self.assertEqual(ival_S_end_of_week.asfreq('W'), ival_S_to_W) - self.assertEqual(ival_S.asfreq('D'), ival_S_to_D) - self.assertEqual(ival_S_end_of_day.asfreq('D'), ival_S_to_D) - self.assertEqual(ival_S.asfreq('B'), ival_S_to_B) - self.assertEqual(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) - self.assertEqual(ival_S.asfreq('H'), ival_S_to_H) - self.assertEqual(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) - self.assertEqual(ival_S.asfreq('Min'), ival_S_to_T) - self.assertEqual(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) - - self.assertEqual(ival_S.asfreq('S'), ival_S) - - def test_asfreq_mult(self): - # normal freq to mult freq - p = Period(freq='A', year=2007) - # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq) - expected = Period('2007', freq='3A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='3A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - 
self.assertEqual(result.freq, expected.freq) - - # mult freq to normal freq - p = Period(freq='3A', year=2007) - # ordinal will change because how=E is the default - for freq in ['A', offsets.YearEnd()]: - result = p.asfreq(freq) - expected = Period('2009', freq='A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - # ordinal will not change - for freq in ['A', offsets.YearEnd()]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='A') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - p = Period(freq='A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq) - expected = Period('2007-12', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - p = Period(freq='3A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq) - expected = Period('2009-12', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') - - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - def test_asfreq_combined(self): - # normal freq to combined freq - p = Period('2007', freq='H') - - # ordinal will not change - expected = Period('2007', freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): - result = p.asfreq(freq, how=how) - self.assertEqual(result, expected) - self.assertEqual(result.ordinal, expected.ordinal) - self.assertEqual(result.freq, expected.freq) - - # combined freq to normal freq - p1 = Period(freq='1D1H', year=2007) - p2 = Period(freq='1H1D', year=2007) - - # ordinal will change because how=E is the default - result1 = p1.asfreq('H') - result2 = p2.asfreq('H') - expected = Period('2007-01-02', freq='H') - self.assertEqual(result1, expected) - self.assertEqual(result1.ordinal, expected.ordinal) - self.assertEqual(result1.freq, expected.freq) - self.assertEqual(result2, expected) - self.assertEqual(result2.ordinal, expected.ordinal) - self.assertEqual(result2.freq, expected.freq) - - # ordinal will not change - result1 = p1.asfreq('H', how='S') - result2 = p2.asfreq('H', how='S') - expected = Period('2007-01-01', freq='H') - self.assertEqual(result1, expected) - self.assertEqual(result1.ordinal, expected.ordinal) - self.assertEqual(result1.freq, expected.freq) - self.assertEqual(result2, expected) - self.assertEqual(result2.ordinal, expected.ordinal) - self.assertEqual(result2.freq, expected.freq) + def test_equal_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 == self.day - def test_is_leap_year(self): - # GH 13727 - for freq in ['A', 'M', 'D', 'H']: - p = Period('2000-01-01 00:00:00', freq=freq) - self.assertTrue(p.is_leap_year) - self.assertIsInstance(p.is_leap_year, bool) + def test_notEqual(self): + self.assertNotEqual(self.january1, 1) + 
self.assertNotEqual(self.january1, self.february) - p = Period('1999-01-01 00:00:00', freq=freq) - self.assertFalse(p.is_leap_year) + def test_greater(self): + self.assertTrue(self.february > self.january1) - p = Period('2004-01-01 00:00:00', freq=freq) - self.assertTrue(p.is_leap_year) + def test_greater_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 > self.day - p = Period('2100-01-01 00:00:00', freq=freq) - self.assertFalse(p.is_leap_year) + def test_greater_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 > 1 + + def test_greaterEqual(self): + self.assertTrue(self.january1 >= self.january2) + + def test_greaterEqual_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 >= self.day + + with tm.assertRaises(TypeError): + print(self.january1 >= 1) + + def test_smallerEqual(self): + self.assertTrue(self.january1 <= self.january2) + + def test_smallerEqual_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 <= self.day + + def test_smallerEqual_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 <= 1 + + def test_smaller(self): + self.assertTrue(self.january1 < self.february) + + def test_smaller_Raises_Value(self): + with tm.assertRaises(period.IncompatibleFrequency): + self.january1 < self.day + + def test_smaller_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 < 1 + + def test_sort(self): + periods = [self.march, self.january1, self.february] + correctPeriods = [self.january1, self.february, self.march] + self.assertEqual(sorted(periods), correctPeriods) + + def test_period_nat_comp(self): + p_nat = Period('NaT', freq='D') + p = Period('2011-01-01', freq='D') + + nat = pd.Timestamp('NaT') + t = pd.Timestamp('2011-01-01') + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t), + (t, nat), (nat, nat)]: + self.assertEqual(left < right, False) + self.assertEqual(left > right, False) + self.assertEqual(left == right, False) + self.assertEqual(left != right, True) + self.assertEqual(left <= right, False) + self.assertEqual(left >= right, False) class TestMethods(tm.TestCase): diff --git a/pandas/tests/scalar/test_period_asfreq.py b/pandas/tests/scalar/test_period_asfreq.py new file mode 100644 index 0000000000000..d311fef8a826d --- /dev/null +++ b/pandas/tests/scalar/test_period_asfreq.py @@ -0,0 +1,721 @@ +import pandas as pd +from pandas import Period, offsets +from pandas.util import testing as tm +from pandas.tseries.frequencies import _period_code_map + + +class TestFreqConversion(tm.TestCase): + "Test frequency conversion of date objects" + + def test_asfreq_corner(self): + val = Period(freq='A', year=2007) + result1 = val.asfreq('5t') + result2 = val.asfreq('t') + expected = Period('2007-12-31 23:59', freq='t') + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freqstr, '5T') + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freqstr, 'T') + + def test_conv_annual(self): + # frequency conversion tests: from Annual Frequency + + ival_A = Period(freq='A', year=2007) + + ival_AJAN = Period(freq="A-JAN", year=2007) + ival_AJUN = Period(freq="A-JUN", year=2007) + ival_ANOV = Period(freq="A-NOV", year=2007) + + ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) + ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) + ival_A_to_M_start = Period(freq='M', year=2007, 
month=1) + ival_A_to_M_end = Period(freq='M', year=2007, month=12) + ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + + ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) + ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) + + self.assertEqual(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) + self.assertEqual(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) + self.assertEqual(ival_A.asfreq('M', 's'), ival_A_to_M_start) + self.assertEqual(ival_A.asfreq('M', 'E'), ival_A_to_M_end) + self.assertEqual(ival_A.asfreq('W', 'S'), ival_A_to_W_start) + self.assertEqual(ival_A.asfreq('W', 'E'), ival_A_to_W_end) + self.assertEqual(ival_A.asfreq('B', 'S'), ival_A_to_B_start) + self.assertEqual(ival_A.asfreq('B', 'E'), ival_A_to_B_end) + self.assertEqual(ival_A.asfreq('D', 'S'), ival_A_to_D_start) + self.assertEqual(ival_A.asfreq('D', 'E'), ival_A_to_D_end) + self.assertEqual(ival_A.asfreq('H', 'S'), ival_A_to_H_start) + self.assertEqual(ival_A.asfreq('H', 'E'), ival_A_to_H_end) + self.assertEqual(ival_A.asfreq('min', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('min', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('T', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('T', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('S', 'S'), ival_A_to_S_start) + self.assertEqual(ival_A.asfreq('S', 'E'), ival_A_to_S_end) + + self.assertEqual(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) + self.assertEqual(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) + + self.assertEqual(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) + self.assertEqual(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) + + self.assertEqual(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) + self.assertEqual(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) + + self.assertEqual(ival_A.asfreq('A'), ival_A) + + def test_conv_quarterly(self): + # frequency conversion tests: from Quarterly Frequency + + ival_Q = Period(freq='Q', year=2007, quarter=1) + ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) + + ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) + ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) + + ival_Q_to_A = Period(freq='A', year=2007) + ival_Q_to_M_start = Period(freq='M', year=2007, month=1) + ival_Q_to_M_end = Period(freq='M', year=2007, month=3) + ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1) + 
ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23) + ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23, + minute=59, second=59) + + ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) + + self.assertEqual(ival_Q.asfreq('A'), ival_Q_to_A) + self.assertEqual(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) + + self.assertEqual(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) + self.assertEqual(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) + self.assertEqual(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) + self.assertEqual(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) + self.assertEqual(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) + self.assertEqual(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) + self.assertEqual(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) + self.assertEqual(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) + self.assertEqual(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) + self.assertEqual(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) + self.assertEqual(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) + self.assertEqual(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) + self.assertEqual(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) + self.assertEqual(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) + + self.assertEqual(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) + self.assertEqual(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) + self.assertEqual(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) + self.assertEqual(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) + + self.assertEqual(ival_Q.asfreq('Q'), ival_Q) + + def test_conv_monthly(self): + # frequency conversion tests: from Monthly Frequency + + ival_M = Period(freq='M', year=2007, month=1) + ival_M_end_of_year = Period(freq='M', year=2007, month=12) + ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) + ival_M_to_A = Period(freq='A', year=2007) + ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23) + ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_M_to_S_start = 
Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, + minute=59, second=59) + + self.assertEqual(ival_M.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M_end_of_year.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M.asfreq('Q'), ival_M_to_Q) + self.assertEqual(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) + + self.assertEqual(ival_M.asfreq('W', 'S'), ival_M_to_W_start) + self.assertEqual(ival_M.asfreq('W', 'E'), ival_M_to_W_end) + self.assertEqual(ival_M.asfreq('B', 'S'), ival_M_to_B_start) + self.assertEqual(ival_M.asfreq('B', 'E'), ival_M_to_B_end) + self.assertEqual(ival_M.asfreq('D', 'S'), ival_M_to_D_start) + self.assertEqual(ival_M.asfreq('D', 'E'), ival_M_to_D_end) + self.assertEqual(ival_M.asfreq('H', 'S'), ival_M_to_H_start) + self.assertEqual(ival_M.asfreq('H', 'E'), ival_M_to_H_end) + self.assertEqual(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) + self.assertEqual(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) + self.assertEqual(ival_M.asfreq('S', 'S'), ival_M_to_S_start) + self.assertEqual(ival_M.asfreq('S', 'E'), ival_M_to_S_end) + + self.assertEqual(ival_M.asfreq('M'), ival_M) + + def test_conv_weekly(self): + # frequency conversion tests: from Weekly Frequency + ival_W = Period(freq='W', year=2007, month=1, day=1) + + ival_WSUN = Period(freq='W', year=2007, month=1, day=7) + ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6) + ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5) + ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4) + ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3) + ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2) + ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) + ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) + ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31) + ival_W_to_A = Period(freq='A', year=2007) + ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_W_to_M = Period(freq='M', year=2007, month=1) + + if Period(freq='D', year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq='A', year=2007) + else: + ival_W_to_A_end_of_year = Period(freq='A', year=2008) + + if Period(freq='D', year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) + else: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) + + if Period(freq='D', year=2007, month=1, 
day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) + else: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) + + ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) + ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, + minute=59, second=59) + + self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) + self.assertEqual(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) + self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) + self.assertEqual(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + + self.assertEqual(ival_W.asfreq('W'), ival_W) + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + ival_W.asfreq('WK') + + def test_conv_weekly_legacy(self): + # frequency conversion tests: from Weekly Frequency + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=1) + + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-SAT', year=2007, month=1, day=6) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-FRI', year=2007, month=1, day=5) + with 
self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-THU', year=2007, month=1, day=4) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-WED', year=2007, month=1, day=3) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-TUE', year=2007, month=1, day=2) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-MON', year=2007, month=1, day=1) + + def test_conv_business(self): + # frequency conversion tests: from Business Frequency" + + ival_B = Period(freq='B', year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) + + ival_B_to_A = Period(freq='A', year=2007) + ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_B_to_M = Period(freq='M', year=2007, month=1) + ival_B_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) + ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, + minute=59, second=59) + + self.assertEqual(ival_B.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B_end_of_year.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B_end_of_month.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B.asfreq('W'), ival_B_to_W) + self.assertEqual(ival_B_end_of_week.asfreq('W'), ival_B_to_W) + + self.assertEqual(ival_B.asfreq('D'), ival_B_to_D) + + self.assertEqual(ival_B.asfreq('H', 'S'), ival_B_to_H_start) + self.assertEqual(ival_B.asfreq('H', 'E'), ival_B_to_H_end) + self.assertEqual(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) + self.assertEqual(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) + self.assertEqual(ival_B.asfreq('S', 'S'), ival_B_to_S_start) + self.assertEqual(ival_B.asfreq('S', 'E'), ival_B_to_S_end) + + self.assertEqual(ival_B.asfreq('B'), ival_B) + + def test_conv_daily(self): + # frequency conversion tests: from Business Frequency" + + ival_D = Period(freq='D', year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) + + ival_D_friday = Period(freq='D', year=2007, month=1, day=5) + ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) + ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) + + # TODO: unused? 
+ # ival_D_monday = Period(freq='D', year=2007, month=1, day=8) + + ival_B_friday = Period(freq='B', year=2007, month=1, day=5) + ival_B_monday = Period(freq='B', year=2007, month=1, day=8) + + ival_D_to_A = Period(freq='A', year=2007) + + ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) + ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) + ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) + + ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) + ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) + ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) + + ival_D_to_M = Period(freq='M', year=2007, month=1) + ival_D_to_W = Period(freq='W', year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) + ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, + minute=59, second=59) + + self.assertEqual(ival_D.asfreq('A'), ival_D_to_A) + + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JAN'), + ival_Deoq_to_AJAN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JUN'), + ival_Deoq_to_AJUN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-DEC'), + ival_Deoq_to_ADEC) + + self.assertEqual(ival_D_end_of_year.asfreq('A'), ival_D_to_A) + self.assertEqual(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) + self.assertEqual(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) + self.assertEqual(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D_end_of_month.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D.asfreq('W'), ival_D_to_W) + self.assertEqual(ival_D_end_of_week.asfreq('W'), ival_D_to_W) + + self.assertEqual(ival_D_friday.asfreq('B'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) + self.assertEqual(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) + + self.assertEqual(ival_D.asfreq('H', 'S'), ival_D_to_H_start) + self.assertEqual(ival_D.asfreq('H', 'E'), ival_D_to_H_end) + self.assertEqual(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) + self.assertEqual(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) + self.assertEqual(ival_D.asfreq('S', 'S'), ival_D_to_S_start) + self.assertEqual(ival_D.asfreq('S', 'E'), ival_D_to_S_end) + + self.assertEqual(ival_D.asfreq('D'), ival_D) + + def test_conv_hourly(self): + # frequency conversion tests: from Hourly Frequency" + + ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, + hour=23) + ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, + hour=23) + ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, + hour=23) + ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, + hour=23) + + ival_H_to_A = Period(freq='A', year=2007) + ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) + 
ival_H_to_M = Period(freq='M', year=2007, month=1) + ival_H_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) + + ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=59) + ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=59, second=59) + + self.assertEqual(ival_H.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H_end_of_year.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H_end_of_month.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H_end_of_week.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H_end_of_day.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H.asfreq('B'), ival_H_to_B) + self.assertEqual(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) + + self.assertEqual(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) + self.assertEqual(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) + self.assertEqual(ival_H.asfreq('S', 'S'), ival_H_to_S_start) + self.assertEqual(ival_H.asfreq('S', 'E'), ival_H_to_S_end) + + self.assertEqual(ival_H.asfreq('H'), ival_H) + + def test_conv_minutely(self): + # frequency conversion tests: from Minutely Frequency" + + ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=0) + ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=59) + + ival_T_to_A = Period(freq='A', year=2007) + ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_T_to_M = Period(freq='M', year=2007, month=1) + ival_T_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=0) + ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, + minute=0, second=59) + + self.assertEqual(ival_T.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T_end_of_year.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T_end_of_month.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T_end_of_week.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T.asfreq('D'), ival_T_to_D) + 
self.assertEqual(ival_T_end_of_day.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T.asfreq('H'), ival_T_to_H) + self.assertEqual(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) + + self.assertEqual(ival_T.asfreq('S', 'S'), ival_T_to_S_start) + self.assertEqual(ival_T.asfreq('S', 'E'), ival_T_to_S_end) + + self.assertEqual(ival_T.asfreq('Min'), ival_T) + + def test_conv_secondly(self): + # frequency conversion tests: from Secondly Frequency" + + ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, + second=0) + ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, + hour=23, minute=59, second=59) + ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=59, second=59) + ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=59) + + ival_S_to_A = Period(freq='A', year=2007) + ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_S_to_M = Period(freq='M', year=2007, month=1) + ival_S_to_W = Period(freq='W', year=2007, month=1, day=7) + ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=0) + + self.assertEqual(ival_S.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S_end_of_year.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S_end_of_month.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S_end_of_week.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S_end_of_day.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S.asfreq('Min'), ival_S_to_T) + self.assertEqual(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) + + self.assertEqual(ival_S.asfreq('S'), ival_S) + + def test_asfreq_mult(self): + # normal freq to mult freq + p = Period(freq='A', year=2007) + # ordinal will not change + for freq in ['3A', offsets.YearEnd(3)]: + result = p.asfreq(freq) + expected = Period('2007', freq='3A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + # ordinal will not change + for freq in ['3A', offsets.YearEnd(3)]: + result = p.asfreq(freq, how='S') + expected = Period('2007', freq='3A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + 
self.assertEqual(result.freq, expected.freq) + + # mult freq to normal freq + p = Period(freq='3A', year=2007) + # ordinal will change because how=E is the default + for freq in ['A', offsets.YearEnd()]: + result = p.asfreq(freq) + expected = Period('2009', freq='A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + # ordinal will not change + for freq in ['A', offsets.YearEnd()]: + result = p.asfreq(freq, how='S') + expected = Period('2007', freq='A') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + p = Period(freq='A', year=2007) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq) + expected = Period('2007-12', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq, how='S') + expected = Period('2007-01', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + p = Period(freq='3A', year=2007) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq) + expected = Period('2009-12', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + for freq in ['2M', offsets.MonthEnd(2)]: + result = p.asfreq(freq, how='S') + expected = Period('2007-01', freq='2M') + + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + def test_asfreq_combined(self): + # normal freq to combined freq + p = Period('2007', freq='H') + + # ordinal will not change + expected = Period('2007', freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): + result = p.asfreq(freq, how=how) + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + # combined freq to normal freq + p1 = Period(freq='1D1H', year=2007) + p2 = Period(freq='1H1D', year=2007) + + # ordinal will change because how=E is the default + result1 = p1.asfreq('H') + result2 = p2.asfreq('H') + expected = Period('2007-01-02', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + # ordinal will not change + result1 = p1.asfreq('H', how='S') + result2 = p2.asfreq('H', how='S') + expected = Period('2007-01-01', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + def test_asfreq_MS(self): + initial = Period("2013") + + self.assertEqual(initial.asfreq(freq="M", how="S"), + Period('2013-01', 'M')) + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + initial.asfreq(freq="MS", how="S") + + with tm.assertRaisesRegexp(ValueError, msg): + pd.Period('2013-01', 'MS') + + self.assertTrue(_period_code_map.get("MS") is 
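None)

The conversions exercised above all follow one rule: asfreq(freq, how='S') maps a Period to the sub-period at its start, while how='E' (the default) maps it to the sub-period at its end. A quick illustration, with the expected values (taken from the tests above) as comments:

    import pandas as pd

    p = pd.Period('2007', freq='A')
    p.asfreq('M', how='S')  # Period('2007-01', 'M')
    p.asfreq('M', how='E')  # Period('2007-12', 'M')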
From d4da6264dc91f2f3f2e0c7ae605fe4d77710e1dc Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 9 Feb 2017 11:42:15 -0500
Subject: [PATCH 116/338] DEPR: remove statsmodels as a dependency

remove pd.ols, pd.fama_macbeth from top-level namespace

xref #11898
closes https://github.com/pandas-dev/pandas2/issues/26

previously deprecated in 0.18.0

Author: Jeff Reback

Closes #15353 from jreback/stats and squashes the following commits:

9563740 [Jeff Reback] DEPR: remove statsmodels as a dependency

---
 ci/requirements-2.7.pip | 1 -
 ci/requirements-2.7_COMPAT.run | 1 -
 ci/requirements-2.7_LOCALE.run | 1 -
 ci/requirements-2.7_SLOW.run | 1 -
 ci/requirements-3.4_SLOW.run | 1 -
 doc/source/whatsnew/v0.20.0.txt | 2 +-
 pandas/api/tests/test_api.py | 4 +-
 pandas/stats/api.py | 4 -
 pandas/stats/common.py | 45 -
 pandas/stats/fama_macbeth.py | 241 ----
 pandas/stats/interface.py | 143 ---
 pandas/stats/math.py | 130 ---
 pandas/stats/misc.py | 389 -------
 pandas/stats/ols.py | 1377 -----------------------
 pandas/stats/plm.py | 863 --------------
 pandas/stats/tests/__init__.py | 0
 pandas/stats/tests/common.py | 162 ---
 pandas/stats/tests/test_fama_macbeth.py | 68 --
 pandas/stats/tests/test_math.py | 59 -
 pandas/stats/tests/test_ols.py | 968 ----------------
 pandas/stats/tests/test_var.py | 94 --
 pandas/stats/var.py | 605 ----------
 pandas/util/print_versions.py | 1 -
 setup.py | 1 -
 24 files changed, 3 insertions(+), 5158 deletions(-)
 delete mode 100644 pandas/stats/common.py
 delete mode 100644 pandas/stats/fama_macbeth.py
 delete mode 100644 pandas/stats/interface.py
 delete mode 100644 pandas/stats/math.py
 delete mode 100644 pandas/stats/misc.py
 delete mode 100644 pandas/stats/ols.py
 delete mode 100644 pandas/stats/plm.py
 delete mode 100644 pandas/stats/tests/__init__.py
 delete mode 100644 pandas/stats/tests/common.py
 delete mode 100644 pandas/stats/tests/test_fama_macbeth.py
 delete mode 100644 pandas/stats/tests/test_math.py
 delete mode 100644 pandas/stats/tests/test_ols.py
 delete mode 100644 pandas/stats/tests/test_var.py
 delete mode 100644 pandas/stats/var.py

diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip
index d7266fe88fb32..d16b932c8be4f 100644
--- a/ci/requirements-2.7.pip
+++ b/ci/requirements-2.7.pip
@@ -1,4 +1,3 @@
-statsmodels
 blosc
 httplib2
 google-api-python-client==1.2
diff --git a/ci/requirements-2.7_COMPAT.run b/ci/requirements-2.7_COMPAT.run
index 32d71beb24388..d27b6a72c2d15 100644
--- a/ci/requirements-2.7_COMPAT.run
+++ b/ci/requirements-2.7_COMPAT.run
@@ -4,7 +4,6 @@ pytz=2013b
 scipy=0.11.0
 xlwt=0.7.5
 xlrd=0.9.2
-statsmodels=0.4.3
 bottleneck=0.8.0
 numexpr=2.2.2
 pytables=3.0.0
diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run
index 9bb37ee10f8db..1a9b42d832b0b 100644
--- a/ci/requirements-2.7_LOCALE.run
+++ b/ci/requirements-2.7_LOCALE.run
@@ -13,5 +13,4 @@ html5lib=1.0b2
 lxml=3.2.1
 scipy=0.11.0
 beautiful-soup=4.2.1
-statsmodels=0.4.3
 bigquery=2.0.17
diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run
index 630d22636f284..c2d2a14285ad6 100644
--- a/ci/requirements-2.7_SLOW.run
+++ b/ci/requirements-2.7_SLOW.run
@@ -4,7 +4,6 @@ numpy=1.8.2
 matplotlib=1.3.1
 scipy
 patsy
-statsmodels
 xlwt
 openpyxl
 xlsxwriter
diff --git a/ci/requirements-3.4_SLOW.run b/ci/requirements-3.4_SLOW.run
index 215f840381ada..39018439a1223 100644
--- a/ci/requirements-3.4_SLOW.run
+++ b/ci/requirements-3.4_SLOW.run
@@ -17,5 +17,4 @@ sqlalchemy
 bottleneck
 pymysql
 psycopg2
-statsmodels
 jinja2=2.8
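For code that used the entry points removed by this patch, statsmodels covers plain OLS directly, and the Fama-MacBeth estimator can be re-implemented as the classic two-pass procedure. A minimal sketch, assuming statsmodels is installed; the data and column names ('ret', 'beta') are illustrative, and the Newey-West lag options of the removed module are omitted:

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    # Replacement for the removed pd.ols(y=..., x=...): statsmodels OLS.
    df = pd.DataFrame({'x': np.random.randn(100)})
    df['y'] = 2.0 * df['x'] + np.random.randn(100)
    X = sm.add_constant(df[['x']])  # pd.ols fitted an intercept by default
    result = sm.OLS(df['y'], X).fit()
    print(result.params, result.rsquared)

    # Sketch of the removed fama_macbeth(): one cross-sectional OLS per
    # date, then t-stats on the time series of estimated coefficients,
    # mirroring the mean_beta / std_beta / t_stat of the deleted module.
    dates = pd.date_range('2000-01-31', periods=60, freq='M')
    panel = pd.DataFrame({'date': dates.repeat(50),
                          'ret': np.random.randn(3000),
                          'beta': np.random.randn(3000)})
    coefs = panel.groupby('date').apply(
        lambda g: sm.OLS(g['ret'], sm.add_constant(g[['beta']])).fit().params)
    mean_beta = coefs.mean()
    t_stat = mean_beta / (coefs.std() / np.sqrt(len(coefs)))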
b/doc/source/whatsnew/v0.20.0.txt index 9afcf85c929a7..3fb6f7b0b9a91 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -396,7 +396,7 @@ Removal of prior version deprecations/changes - The ``pandas.io.ga`` module with a ``google-analytics`` interface is removed (:issue:`11308`). Similar functionality can be found in the `Google2Pandas `__ package. - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`) - +- ``pandas.stats.fama_macbeth``, ``pandas.stats.ols``, ``pandas.stats.plm`` and ``pandas.stats.var``, as well as the top-level ``pandas.fama_macbeth`` and ``pandas.ols`` routines are removed. Similar functionality can be found in the `statsmodels `__ package. (:issue:`11898`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 02165d82d4232..a53f6103b408b 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -42,7 +42,7 @@ class TestPDApi(Base, tm.TestCase): 'json', 'lib', 'index', 'parser'] # these are already deprecated; awaiting removal - deprecated_modules = ['ols', 'stats', 'datetools'] + deprecated_modules = ['stats', 'datetools'] # misc misc = ['IndexSlice', 'NaT'] @@ -109,7 +109,7 @@ class TestPDApi(Base, tm.TestCase): 'expanding_max', 'expanding_mean', 'expanding_median', 'expanding_min', 'expanding_quantile', 'expanding_skew', 'expanding_std', 'expanding_sum', - 'expanding_var', 'fama_macbeth', 'rolling_apply', + 'expanding_var', 'rolling_apply', 'rolling_corr', 'rolling_count', 'rolling_cov', 'rolling_kurt', 'rolling_max', 'rolling_mean', 'rolling_median', 'rolling_min', 'rolling_quantile', diff --git a/pandas/stats/api.py b/pandas/stats/api.py index fd81b875faa91..2a11456d4f9e5 100644 --- a/pandas/stats/api.py +++ b/pandas/stats/api.py @@ -2,10 +2,6 @@ Common namespace of statistical functions """ -# pylint: disable-msg=W0611,W0614,W0401 - # flake8: noqa from pandas.stats.moments import * -from pandas.stats.interface import ols -from pandas.stats.fama_macbeth import fama_macbeth diff --git a/pandas/stats/common.py b/pandas/stats/common.py deleted file mode 100644 index be3b842e93cc8..0000000000000 --- a/pandas/stats/common.py +++ /dev/null @@ -1,45 +0,0 @@ - -_WINDOW_TYPES = { - 0: 'full_sample', - 1: 'rolling', - 2: 'expanding' -} -# also allow 'rolling' as key -_WINDOW_TYPES.update((v, v) for k, v in list(_WINDOW_TYPES.items())) -_ADDITIONAL_CLUSTER_TYPES = set(("entity", "time")) - - -def _get_cluster_type(cluster_type): - # this was previous behavior - if cluster_type is None: - return cluster_type - try: - return _get_window_type(cluster_type) - except ValueError: - final_type = str(cluster_type).lower().replace("_", " ") - if final_type in _ADDITIONAL_CLUSTER_TYPES: - return final_type - raise ValueError('Unrecognized cluster type: %s' % cluster_type) - - -def _get_window_type(window_type): - # e.g., 0, 1, 2 - final_type = _WINDOW_TYPES.get(window_type) - # e.g., 'full_sample' - final_type = final_type or _WINDOW_TYPES.get( - str(window_type).lower().replace(" ", "_")) - if final_type is None: - raise ValueError('Unrecognized window type: %s' % window_type) - return final_type - - -def banner(text, width=80): - """ - - """ - toFill = width - len(text) - - left = toFill // 2 - right = toFill - left - - return '%s%s%s' % ('-' * left, text, '-' * right) diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py deleted file mode 100644 index d564f9cb6c425..0000000000000 --- a/pandas/stats/fama_macbeth.py
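The fama_macbeth module deleted just below implemented the classic two-pass estimator: one cross-sectional regression per period, then t-statistics on the time series of coefficients. For readers migrating, here is a minimal sketch of the same procedure on top of statsmodels; the DataFrame layout and the names panel, y_col, x_cols are hypothetical illustrations, not part of this patch.

# Sketch: Fama-MacBeth two-pass estimate with statsmodels instead of
# the removed pandas.stats.fama_macbeth. Assumes `panel` is a
# long-format DataFrame indexed by (date, asset), with a dependent
# column y_col and regressor columns x_cols (hypothetical names).
import numpy as np
import pandas as pd
import statsmodels.api as sm

def fama_macbeth_sm(panel, y_col, x_cols):
    # First pass: one cross-sectional OLS per date.
    betas = []
    for _, cross in panel.groupby(level=0):
        X = sm.add_constant(cross[list(x_cols)])
        betas.append(sm.OLS(cross[y_col], X).fit().params)
    betas = pd.DataFrame(betas)
    # Second pass: t-stats on the time series of estimated betas.
    mean_beta = betas.mean()
    std_err = betas.std(ddof=1) / np.sqrt(len(betas))
    return mean_beta, mean_beta / std_err

Note the deleted code below computes the standard error with a slightly different normalization (and optional Newey-West lags); the sketch keeps only the plain case.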
+++ /dev/null @@ -1,241 +0,0 @@ -from pandas.core.base import StringMixin -from pandas.compat import StringIO, range - -import numpy as np - -from pandas.core.api import Series, DataFrame -import pandas.stats.common as common -from pandas.util.decorators import cache_readonly - -# flake8: noqa - - -def fama_macbeth(**kwargs): - """Runs Fama-MacBeth regression. - - Parameters - ---------- - Takes the same arguments as a panel OLS, in addition to: - - nw_lags_beta: int - Newey-West adjusts the betas by the given lags - """ - window_type = kwargs.get('window_type') - if window_type is None: - klass = FamaMacBeth - else: - klass = MovingFamaMacBeth - - return klass(**kwargs) - - -class FamaMacBeth(StringMixin): - - def __init__(self, y, x, intercept=True, nw_lags=None, - nw_lags_beta=None, - entity_effects=False, time_effects=False, x_effects=None, - cluster=None, dropped_dummies=None, verbose=False): - import warnings - warnings.warn("The pandas.stats.fama_macbeth module is deprecated and will be " - "removed in a future version. We refer to external packages " - "like statsmodels, see here: " - "http://www.statsmodels.org/stable/index.html", - FutureWarning, stacklevel=4) - - if dropped_dummies is None: - dropped_dummies = {} - self._nw_lags_beta = nw_lags_beta - - from pandas.stats.plm import MovingPanelOLS - self._ols_result = MovingPanelOLS( - y=y, x=x, window_type='rolling', window=1, - intercept=intercept, - nw_lags=nw_lags, entity_effects=entity_effects, - time_effects=time_effects, x_effects=x_effects, cluster=cluster, - dropped_dummies=dropped_dummies, verbose=verbose) - - self._cols = self._ols_result._x.columns - - @cache_readonly - def _beta_raw(self): - return self._ols_result._beta_raw - - @cache_readonly - def _stats(self): - return _calc_t_stat(self._beta_raw, self._nw_lags_beta) - - @cache_readonly - def _mean_beta_raw(self): - return self._stats[0] - - @cache_readonly - def _std_beta_raw(self): - return self._stats[1] - - @cache_readonly - def _t_stat_raw(self): - return self._stats[2] - - def _make_result(self, result): - return Series(result, index=self._cols) - - @cache_readonly - def mean_beta(self): - return self._make_result(self._mean_beta_raw) - - @cache_readonly - def std_beta(self): - return self._make_result(self._std_beta_raw) - - @cache_readonly - def t_stat(self): - return self._make_result(self._t_stat_raw) - - @cache_readonly - def _results(self): - return { - 'mean_beta': self._mean_beta_raw, - 'std_beta': self._std_beta_raw, - 't_stat': self._t_stat_raw, - } - - @cache_readonly - def _coef_table(self): - buffer = StringIO() - buffer.write('%13s %13s %13s %13s %13s %13s\n' % - ('Variable', 'Beta', 'Std Err', 't-stat', 'CI 2.5%', 'CI 97.5%')) - template = '%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n' - - for i, name in enumerate(self._cols): - if i and not (i % 5): - buffer.write('\n' + common.banner('')) - - mean_beta = self._results['mean_beta'][i] - std_beta = self._results['std_beta'][i] - t_stat = self._results['t_stat'][i] - ci1 = mean_beta - 1.96 * std_beta - ci2 = mean_beta + 1.96 * std_beta - - values = '(%s)' % name, mean_beta, std_beta, t_stat, ci1, ci2 - - buffer.write(template % values) - - if self._nw_lags_beta is not None: - buffer.write('\n') - buffer.write('*** The Std Err, t-stat are Newey-West ' - 'adjusted with Lags %5d\n' % self._nw_lags_beta) - - return buffer.getvalue() - - def __unicode__(self): - return self.summary - - @cache_readonly - def summary(self): - template = """ -----------------------Summary of Fama-MacBeth 
Analysis------------------------- - -Formula: Y ~ %(formulaRHS)s -# betas : %(nu)3d - -----------------------Summary of Estimated Coefficients------------------------ -%(coefTable)s ---------------------------------End of Summary--------------------------------- -""" - params = { - 'formulaRHS': ' + '.join(self._cols), - 'nu': len(self._beta_raw), - 'coefTable': self._coef_table, - } - - return template % params - - -class MovingFamaMacBeth(FamaMacBeth): - - def __init__(self, y, x, window_type='rolling', window=10, - intercept=True, nw_lags=None, nw_lags_beta=None, - entity_effects=False, time_effects=False, x_effects=None, - cluster=None, dropped_dummies=None, verbose=False): - if dropped_dummies is None: - dropped_dummies = {} - self._window_type = common._get_window_type(window_type) - self._window = window - - FamaMacBeth.__init__( - self, y=y, x=x, intercept=intercept, - nw_lags=nw_lags, nw_lags_beta=nw_lags_beta, - entity_effects=entity_effects, time_effects=time_effects, - x_effects=x_effects, cluster=cluster, - dropped_dummies=dropped_dummies, verbose=verbose) - - self._index = self._ols_result._index - self._T = len(self._index) - - @property - def _is_rolling(self): - return self._window_type == 'rolling' - - def _calc_stats(self): - mean_betas = [] - std_betas = [] - t_stats = [] - - # XXX - - mask = self._ols_result._rolling_ols_call[2] - obs_total = mask.astype(int).cumsum() - - start = self._window - 1 - betas = self._beta_raw - for i in range(start, self._T): - if self._is_rolling: - begin = i - start - else: - begin = 0 - - B = betas[max(obs_total[begin] - 1, 0): obs_total[i]] - mean_beta, std_beta, t_stat = _calc_t_stat(B, self._nw_lags_beta) - mean_betas.append(mean_beta) - std_betas.append(std_beta) - t_stats.append(t_stat) - - return np.array([mean_betas, std_betas, t_stats]) - - _stats = cache_readonly(_calc_stats) - - def _make_result(self, result): - return DataFrame(result, index=self._result_index, columns=self._cols) - - @cache_readonly - def _result_index(self): - mask = self._ols_result._rolling_ols_call[2] - # HACK XXX - return self._index[mask.cumsum() >= self._window] - - @cache_readonly - def _results(self): - return { - 'mean_beta': self._mean_beta_raw[-1], - 'std_beta': self._std_beta_raw[-1], - 't_stat': self._t_stat_raw[-1], - } - - -def _calc_t_stat(beta, nw_lags_beta): - N = len(beta) - B = beta - beta.mean(0) - C = np.dot(B.T, B) / N - - if nw_lags_beta is not None: - for i in range(nw_lags_beta + 1): - - cov = np.dot(B[i:].T, B[:(N - i)]) / N - weight = i / (nw_lags_beta + 1) - C += 2 * (1 - weight) * cov - - mean_beta = beta.mean(0) - std_beta = np.sqrt(np.diag(C)) / np.sqrt(N) - t_stat = mean_beta / std_beta - - return mean_beta, std_beta, t_stat diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py deleted file mode 100644 index caf468b4f85fe..0000000000000 --- a/pandas/stats/interface.py +++ /dev/null @@ -1,143 +0,0 @@ -from pandas.core.api import Series, DataFrame, Panel, MultiIndex -from pandas.stats.ols import OLS, MovingOLS -from pandas.stats.plm import PanelOLS, MovingPanelOLS, NonPooledPanelOLS -import pandas.stats.common as common - - -def ols(**kwargs): - """Returns the appropriate OLS object depending on whether you need - simple or panel OLS, and a full-sample or rolling/expanding OLS. 
- - Will be a normal linear regression or a (pooled) panel regression depending - on the type of the inputs: - - y : Series, x : DataFrame -> OLS - y : Series, x : dict of DataFrame -> OLS - y : DataFrame, x : DataFrame -> PanelOLS - y : DataFrame, x : dict of DataFrame/Panel -> PanelOLS - y : Series with MultiIndex, x : Panel/DataFrame + MultiIndex -> PanelOLS - - Parameters - ---------- - y: Series or DataFrame - See above for types - x: Series, DataFrame, dict of Series, dict of DataFrame, Panel - weights : Series or ndarray - The weights are presumed to be (proportional to) the inverse of the - variance of the observations. That is, if the variables are to be - transformed by 1/sqrt(W) you must supply weights = 1/W - intercept: bool - True if you want an intercept. Defaults to True. - nw_lags: None or int - Number of Newey-West lags. Defaults to None. - nw_overlap: bool - Whether there are overlaps in the NW lags. Defaults to False. - window_type: {'full sample', 'rolling', 'expanding'} - 'full sample' by default - window: int - size of window (for rolling/expanding OLS). If window passed and no - explicit window_type, 'rolling" will be used as the window_type - - Panel OLS options: - pool: bool - Whether to run pooled panel regression. Defaults to true. - entity_effects: bool - Whether to account for entity fixed effects. Defaults to false. - time_effects: bool - Whether to account for time fixed effects. Defaults to false. - x_effects: list - List of x's to account for fixed effects. Defaults to none. - dropped_dummies: dict - Key is the name of the variable for the fixed effect. - Value is the value of that variable for which we drop the dummy. - - For entity fixed effects, key equals 'entity'. - - By default, the first dummy is dropped if no dummy is specified. - cluster: {'time', 'entity'} - cluster variances - - Examples - -------- - # Run simple OLS. - result = ols(y=y, x=x) - - # Run rolling simple OLS with window of size 10. - result = ols(y=y, x=x, window_type='rolling', window=10) - print(result.beta) - - result = ols(y=y, x=x, nw_lags=1) - - # Set up LHS and RHS for data across all items - y = A - x = {'B' : B, 'C' : C} - - # Run panel OLS. - result = ols(y=y, x=x) - - # Run expanding panel OLS with window 10 and entity clustering. - result = ols(y=y, x=x, cluster='entity', window_type='expanding', - window=10) - - Returns - ------- - The appropriate OLS object, which allows you to obtain betas and various - statistics, such as std err, t-stat, etc. 
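Since this interface is going away, the simple full-sample case documented above maps onto statsmodels roughly as follows. This is a sketch only: it assumes y is a Series and x a DataFrame sharing an index, and remember that statsmodels does not add the intercept for you.

# Sketch: statsmodels replacement for the removed full-sample call
# result = ols(y=y, x=x). Rows with missing values are dropped to
# mimic the old input filtering; '__y__' is just a scratch name.
import statsmodels.api as sm

data = x.join(y.rename('__y__'), how='inner').dropna()
result = sm.OLS(data['__y__'], sm.add_constant(data[x.columns])).fit()
result.params     # betas, cf. the old result.beta
result.bse        # standard errors
result.tvalues    # t-stats
result.rsquared   # r-squared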
- """ - - if (kwargs.get('cluster') is not None and - kwargs.get('nw_lags') is not None): - raise ValueError( - 'Pandas OLS does not work with Newey-West correction ' - 'and clustering.') - - pool = kwargs.get('pool') - if 'pool' in kwargs: - del kwargs['pool'] - - window_type = kwargs.get('window_type') - window = kwargs.get('window') - - if window_type is None: - if window is None: - window_type = 'full_sample' - else: - window_type = 'rolling' - else: - window_type = common._get_window_type(window_type) - - if window_type != 'full_sample': - kwargs['window_type'] = common._get_window_type(window_type) - - y = kwargs.get('y') - x = kwargs.get('x') - - panel = False - if isinstance(y, DataFrame) or (isinstance(y, Series) and - isinstance(y.index, MultiIndex)): - panel = True - if isinstance(x, Panel): - panel = True - - if window_type == 'full_sample': - for rolling_field in ('window_type', 'window', 'min_periods'): - if rolling_field in kwargs: - del kwargs[rolling_field] - - if panel: - if pool is False: - klass = NonPooledPanelOLS - else: - klass = PanelOLS - else: - klass = OLS - else: - if panel: - if pool is False: - klass = NonPooledPanelOLS - else: - klass = MovingPanelOLS - else: - klass = MovingOLS - - return klass(**kwargs) diff --git a/pandas/stats/math.py b/pandas/stats/math.py deleted file mode 100644 index 505415bebf89e..0000000000000 --- a/pandas/stats/math.py +++ /dev/null @@ -1,130 +0,0 @@ -# pylint: disable-msg=E1103 -# pylint: disable-msg=W0212 - -from __future__ import division - -from pandas.compat import range -import numpy as np -import numpy.linalg as linalg - - -def rank(X, cond=1.0e-12): - """ - Return the rank of a matrix X based on its generalized inverse, - not the SVD. - """ - X = np.asarray(X) - if len(X.shape) == 2: - import scipy.linalg as SL - D = SL.svdvals(X) - result = np.add.reduce(np.greater(D / D.max(), cond)) - return int(result.astype(np.int32)) - else: - return int(not np.alltrue(np.equal(X, 0.))) - - -def solve(a, b): - """Returns the solution of A X = B.""" - try: - return linalg.solve(a, b) - except linalg.LinAlgError: - return np.dot(linalg.pinv(a), b) - - -def inv(a): - """Returns the inverse of A.""" - try: - return np.linalg.inv(a) - except linalg.LinAlgError: - return np.linalg.pinv(a) - - -def is_psd(m): - eigvals = linalg.eigvals(m) - return np.isreal(eigvals).all() and (eigvals >= 0).all() - - -def newey_west(m, max_lags, nobs, df, nw_overlap=False): - """ - Compute Newey-West adjusted covariance matrix, taking into account - specified number of leads / lags - - Parameters - ---------- - m : (N x K) - max_lags : int - nobs : int - Number of observations in model - df : int - Degrees of freedom in explanatory variables - nw_overlap : boolean, default False - Assume data is overlapping - - Returns - ------- - ndarray (K x K) - - Reference - --------- - Newey, W. K. & West, K. D. (1987) A Simple, Positive - Semi-definite, Heteroskedasticity and Autocorrelation Consistent - Covariance Matrix, Econometrica, vol. 55(3), 703-708 - """ - Xeps = np.dot(m.T, m) - for lag in range(1, max_lags + 1): - auto_cov = np.dot(m[:-lag].T, m[lag:]) - weight = lag / (max_lags + 1) - if nw_overlap: - weight = 0 - bb = auto_cov + auto_cov.T - dd = (1 - weight) * bb - Xeps += dd - - Xeps *= nobs / (nobs - df) - - if nw_overlap and not is_psd(Xeps): - new_max_lags = int(np.ceil(max_lags * 1.5)) -# print('nw_overlap is True and newey_west generated a non positive ' -# 'semidefinite matrix, so using newey_west with max_lags of %d.' 
-# % new_max_lags) - return newey_west(m, new_max_lags, nobs, df) - - return Xeps - - -def calc_F(R, r, beta, var_beta, nobs, df): - """ - Computes the standard F-test statistic for linear restriction - hypothesis testing - - Parameters - ---------- - R: ndarray (N x N) - Restriction matrix - r: ndarray (N x 1) - Restriction vector - beta: ndarray (N x 1) - Estimated model coefficients - var_beta: ndarray (N x N) - Variance covariance matrix of regressors - nobs: int - Number of observations in model - df: int - Model degrees of freedom - - Returns - ------- - F value, (q, df_resid), p value - """ - from scipy.stats import f - - hyp = np.dot(R, beta.reshape(len(beta), 1)) - r - RSR = np.dot(R, np.dot(var_beta, R.T)) - - q = len(r) - - F = np.dot(hyp.T, np.dot(inv(RSR), hyp)).squeeze() / q - - p_value = 1 - f.cdf(F, q, nobs - df) - - return F, (q, nobs - df), p_value diff --git a/pandas/stats/misc.py b/pandas/stats/misc.py deleted file mode 100644 index 1a077dcb6f9a1..0000000000000 --- a/pandas/stats/misc.py +++ /dev/null @@ -1,389 +0,0 @@ -from numpy import NaN -from pandas import compat -import numpy as np - -from pandas.core.api import Series, DataFrame -from pandas.core.series import remove_na -from pandas.compat import zip, lrange -import pandas.core.common as com - - -def zscore(series): - return (series - series.mean()) / np.std(series, ddof=0) - - -def correl_ts(frame1, frame2): - """ - Pairwise correlation of columns of two DataFrame objects - - Parameters - ---------- - - Returns - ------- - y : Series - """ - results = {} - for col, series in compat.iteritems(frame1): - if col in frame2: - other = frame2[col] - - idx1 = series.valid().index - idx2 = other.valid().index - - common_index = idx1.intersection(idx2) - - seriesStand = zscore(series.reindex(common_index)) - otherStand = zscore(other.reindex(common_index)) - results[col] = (seriesStand * otherStand).mean() - - return Series(results) - - -def correl_xs(frame1, frame2): - return correl_ts(frame1.T, frame2.T) - - -def percentileofscore(a, score, kind='rank'): - """The percentile rank of a score relative to a list of scores. - - A `percentileofscore` of, for example, 80% means that 80% of the - scores in `a` are below the given score. In the case of gaps or - ties, the exact definition depends on the optional keyword, `kind`. - - Parameters - ---------- - a: array like - Array of scores to which `score` is compared. - score: int or float - Score that is compared to the elements in `a`. - kind: {'rank', 'weak', 'strict', 'mean'}, optional - This optional parameter specifies the interpretation of the - resulting score: - - - "rank": Average percentage ranking of score. In case of - multiple matches, average the percentage rankings of - all matching scores. - - "weak": This kind corresponds to the definition of a cumulative - distribution function. A percentileofscore of 80% - means that 80% of values are less than or equal - to the provided score. - - "strict": Similar to "weak", except that only values that are - strictly less than the given score are counted. - - "mean": The average of the "weak" and "strict" scores, often used in - testing. See - - http://en.wikipedia.org/wiki/Percentile_rank - - Returns - ------- - pcos : float - Percentile-position of score (0-100) relative to `a`. 
- - Examples - -------- - Three-quarters of the given values lie below a given score: - - >>> percentileofscore([1, 2, 3, 4], 3) - 75.0 - - With multiple matches, note how the scores of the two matches, 0.6 - and 0.8 respectively, are averaged: - - >>> percentileofscore([1, 2, 3, 3, 4], 3) - 70.0 - - Only 2/5 values are strictly less than 3: - - >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') - 40.0 - - But 4/5 values are less than or equal to 3: - - >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') - 80.0 - - The average between the weak and the strict scores is - - >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') - 60.0 - - """ - a = np.array(a) - n = len(a) - - if kind == 'rank': - if not(np.any(a == score)): - a = np.append(a, score) - a_len = np.array(lrange(len(a))) - else: - a_len = np.array(lrange(len(a))) + 1.0 - - a = np.sort(a) - idx = [a == score] - pct = (np.mean(a_len[idx]) / n) * 100.0 - return pct - - elif kind == 'strict': - return sum(a < score) / float(n) * 100 - elif kind == 'weak': - return sum(a <= score) / float(n) * 100 - elif kind == 'mean': - return (sum(a < score) + sum(a <= score)) * 50 / float(n) - else: - raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") - - -def percentileRank(frame, column=None, kind='mean'): - """ - Return score at percentile for each point in time (cross-section) - - Parameters - ---------- - frame: DataFrame - column: string or Series, optional - Column name or specific Series to compute percentiles for. - If not provided, percentiles are computed for all values at each - point in time. Note that this can take a LONG time. - kind: {'rank', 'weak', 'strict', 'mean'}, optional - This optional parameter specifies the interpretation of the - resulting score: - - - "rank": Average percentage ranking of score. In case of - multiple matches, average the percentage rankings of - all matching scores. - - "weak": This kind corresponds to the definition of a cumulative - distribution function. A percentileofscore of 80% - means that 80% of values are less than or equal - to the provided score. - - "strict": Similar to "weak", except that only values that are - strictly less than the given score are counted. - - "mean": The average of the "weak" and "strict" scores, often used in - testing. 
See - - http://en.wikipedia.org/wiki/Percentile_rank - - Returns - ------- - TimeSeries or DataFrame, depending on input - """ - fun = lambda xs, score: percentileofscore(remove_na(xs), - score, kind=kind) - - results = {} - framet = frame.T - if column is not None: - if isinstance(column, Series): - for date, xs in compat.iteritems(frame.T): - results[date] = fun(xs, column.get(date, NaN)) - else: - for date, xs in compat.iteritems(frame.T): - results[date] = fun(xs, xs[column]) - results = Series(results) - else: - for column in frame.columns: - for date, xs in compat.iteritems(framet): - results.setdefault(date, {})[column] = fun(xs, xs[column]) - results = DataFrame(results).T - return results - - -def bucket(series, k, by=None): - """ - Produce DataFrame representing quantiles of a Series - - Parameters - ---------- - series : Series - k : int - number of quantiles - by : Series or same-length array - bucket by value - - Returns - ------- - DataFrame - """ - if by is None: - by = series - else: - by = by.reindex(series.index) - - split = _split_quantile(by, k) - mat = np.empty((len(series), k), dtype=float) * np.NaN - - for i, v in enumerate(split): - mat[:, i][v] = series.take(v) - - return DataFrame(mat, index=series.index, columns=np.arange(k) + 1) - - -def _split_quantile(arr, k): - arr = np.asarray(arr) - mask = np.isfinite(arr) - order = arr[mask].argsort() - n = len(arr) - - return np.array_split(np.arange(n)[mask].take(order), k) - - -def bucketcat(series, cats): - """ - Produce DataFrame representing quantiles of a Series - - Parameters - ---------- - series : Series - cat : Series or same-length array - bucket by category; mutually exclusive with 'by' - - Returns - ------- - DataFrame - """ - if not isinstance(series, Series): - series = Series(series, index=np.arange(len(series))) - - cats = np.asarray(cats) - - unique_labels = np.unique(cats) - unique_labels = unique_labels[com.notnull(unique_labels)] - - # group by - data = {} - - for label in unique_labels: - data[label] = series[cats == label] - - return DataFrame(data, columns=unique_labels) - - -def bucketpanel(series, bins=None, by=None, cat=None): - """ - Bucket data by two Series to create summary panel - - Parameters - ---------- - series : Series - bins : tuple (length-2) - e.g. 
(2, 2) - by : tuple of Series - bucket by value - cat : tuple of Series - bucket by category; mutually exclusive with 'by' - - Returns - ------- - DataFrame - """ - use_by = by is not None - use_cat = cat is not None - - if use_by and use_cat: - raise Exception('must specify by or cat, but not both') - elif use_by: - if len(by) != 2: - raise Exception('must provide two bucketing series') - - xby, yby = by - xbins, ybins = bins - - return _bucketpanel_by(series, xby, yby, xbins, ybins) - - elif use_cat: - xcat, ycat = cat - return _bucketpanel_cat(series, xcat, ycat) - else: - raise Exception('must specify either values or categories ' - 'to bucket by') - - -def _bucketpanel_by(series, xby, yby, xbins, ybins): - xby = xby.reindex(series.index) - yby = yby.reindex(series.index) - - xlabels = _bucket_labels(xby.reindex(series.index), xbins) - ylabels = _bucket_labels(yby.reindex(series.index), ybins) - - labels = _uniquify(xlabels, ylabels, xbins, ybins) - - mask = com.isnull(labels) - labels[mask] = -1 - - unique_labels = np.unique(labels) - bucketed = bucketcat(series, labels) - - _ulist = list(labels) - index_map = dict((x, _ulist.index(x)) for x in unique_labels) - - def relabel(key): - pos = index_map[key] - - xlab = xlabels[pos] - ylab = ylabels[pos] - - return '%sx%s' % (int(xlab) if com.notnull(xlab) else 'NULL', - int(ylab) if com.notnull(ylab) else 'NULL') - - return bucketed.rename(columns=relabel) - - -def _bucketpanel_cat(series, xcat, ycat): - xlabels, xmapping = _intern(xcat) - ylabels, ymapping = _intern(ycat) - - shift = 10 ** (np.ceil(np.log10(ylabels.max()))) - labels = xlabels * shift + ylabels - - sorter = labels.argsort() - sorted_labels = labels.take(sorter) - sorted_xlabels = xlabels.take(sorter) - sorted_ylabels = ylabels.take(sorter) - - unique_labels = np.unique(labels) - unique_labels = unique_labels[com.notnull(unique_labels)] - - locs = sorted_labels.searchsorted(unique_labels) - xkeys = sorted_xlabels.take(locs) - ykeys = sorted_ylabels.take(locs) - - stringified = ['(%s, %s)' % arg - for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))] - - result = bucketcat(series, labels) - result.columns = stringified - - return result - - -def _intern(values): - # assumed no NaN values - values = np.asarray(values) - - uniqued = np.unique(values) - labels = uniqued.searchsorted(values) - return labels, uniqued - - -def _uniquify(xlabels, ylabels, xbins, ybins): - # encode the stuff, create unique label - shifter = 10 ** max(xbins, ybins) - _xpiece = xlabels * shifter - _ypiece = ylabels - - return _xpiece + _ypiece - - -def _bucket_labels(series, k): - arr = np.asarray(series) - mask = np.isfinite(arr) - order = arr[mask].argsort() - n = len(series) - - split = np.array_split(np.arange(n)[mask].take(order), k) - - mat = np.empty(n, dtype=float) * np.NaN - for i, v in enumerate(split): - mat[v] = i - - return mat + 1 diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py deleted file mode 100644 index 96ec70d59488a..0000000000000 --- a/pandas/stats/ols.py +++ /dev/null @@ -1,1377 +0,0 @@ -""" -Ordinary least squares regression -""" - -# pylint: disable-msg=W0201 - -# flake8: noqa - -from pandas.compat import zip, range, StringIO -from itertools import starmap -from pandas import compat -import numpy as np - -from pandas.core.api import DataFrame, Series, isnull -from pandas.core.base import StringMixin -from pandas.types.common import _ensure_float64 -from pandas.core.index import MultiIndex -from pandas.core.panel import Panel -from pandas.util.decorators import 
cache_readonly - -import pandas.stats.common as scom -import pandas.stats.math as math -import pandas.stats.moments as moments - -_FP_ERR = 1e-8 - - -class OLS(StringMixin): - """ - Runs a full sample ordinary least squares regression. - - Parameters - ---------- - y : Series - x : Series, DataFrame, dict of Series - intercept : bool - True if you want an intercept. - weights : array-like, optional - 1d array of weights. If you supply 1/W then the variables are pre- - multiplied by 1/sqrt(W). If no weights are supplied the default value - is 1 and WLS reults are the same as OLS. - nw_lags : None or int - Number of Newey-West lags. - nw_overlap : boolean, default False - Assume data is overlapping when computing Newey-West estimator - - """ - _panel_model = False - - def __init__(self, y, x, intercept=True, weights=None, nw_lags=None, - nw_overlap=False): - import warnings - warnings.warn("The pandas.stats.ols module is deprecated and will be " - "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: " - "http://www.statsmodels.org/stable/regression.html", - FutureWarning, stacklevel=4) - - try: - import statsmodels.api as sm - except ImportError: - import scikits.statsmodels.api as sm - - self._x_orig = x - self._y_orig = y - self._weights_orig = weights - self._intercept = intercept - self._nw_lags = nw_lags - self._nw_overlap = nw_overlap - - (self._y, self._x, self._weights, self._x_filtered, - self._index, self._time_has_obs) = self._prepare_data() - - if self._weights is not None: - self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0) - self._y_trans = self._y * np.sqrt(self._weights) - self.sm_ols = sm.WLS(self._y.get_values(), - self._x.get_values(), - weights=self._weights.values).fit() - else: - self._x_trans = self._x - self._y_trans = self._y - self.sm_ols = sm.OLS(self._y.get_values(), - self._x.get_values()).fit() - - def _prepare_data(self): - """ - Cleans the input for single OLS. - - Parameters - ---------- - lhs: Series - Dependent variable in the regression. - rhs: dict, whose values are Series, DataFrame, or dict - Explanatory variables of the regression. - - Returns - ------- - Series, DataFrame - Cleaned lhs and rhs - """ - (filt_lhs, filt_rhs, filt_weights, - pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig, - self._weights_orig) - if self._intercept: - filt_rhs['intercept'] = 1. - pre_filt_rhs['intercept'] = 1. - - if hasattr(filt_weights, 'to_dense'): - filt_weights = filt_weights.to_dense() - - return (filt_lhs, filt_rhs, filt_weights, - pre_filt_rhs, index, valid) - - @property - def nobs(self): - return self._nobs - - @property - def _nobs(self): - return len(self._y) - - @property - def nw_lags(self): - return self._nw_lags - - @property - def x(self): - """Returns the filtered x used in the regression.""" - return self._x - - @property - def y(self): - """Returns the filtered y used in the regression.""" - return self._y - - @cache_readonly - def _beta_raw(self): - """Runs the regression and returns the beta.""" - return self.sm_ols.params - - @cache_readonly - def beta(self): - """Returns the betas in Series form.""" - return Series(self._beta_raw, index=self._x.columns) - - @cache_readonly - def _df_raw(self): - """Returns the degrees of freedom.""" - return math.rank(self._x.values) - - @cache_readonly - def df(self): - """Returns the degrees of freedom. - - This equals the rank of the X matrix. 
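The deleted math.rank helper above derives this rank from the SVD singular values; plain numpy offers essentially the same computation. A one-line sketch, with X standing in for the regressor matrix:

# Sketch: numpy equivalent of the deleted pandas.stats.math.rank.
# Both count singular values above a cutoff, though the default
# tolerances differ slightly.
import numpy as np

df = np.linalg.matrix_rank(X)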
- """ - return self._df_raw - - @cache_readonly - def _df_model_raw(self): - """Returns the raw model degrees of freedom.""" - return self.sm_ols.df_model - - @cache_readonly - def df_model(self): - """Returns the degrees of freedom of the model.""" - return self._df_model_raw - - @cache_readonly - def _df_resid_raw(self): - """Returns the raw residual degrees of freedom.""" - return self.sm_ols.df_resid - - @cache_readonly - def df_resid(self): - """Returns the degrees of freedom of the residuals.""" - return self._df_resid_raw - - @cache_readonly - def _f_stat_raw(self): - """Returns the raw f-stat value.""" - from scipy.stats import f - - cols = self._x.columns - - if self._nw_lags is None: - F = self._r2_raw / (self._r2_raw - self._r2_adj_raw) - - q = len(cols) - if 'intercept' in cols: - q -= 1 - - shape = q, self.df_resid - p_value = 1 - f.cdf(F, shape[0], shape[1]) - return F, shape, p_value - - k = len(cols) - R = np.eye(k) - r = np.zeros((k, 1)) - - try: - intercept = cols.get_loc('intercept') - R = np.concatenate((R[0: intercept], R[intercept + 1:])) - r = np.concatenate((r[0: intercept], r[intercept + 1:])) - except KeyError: - # no intercept - pass - - return math.calc_F(R, r, self._beta_raw, self._var_beta_raw, - self._nobs, self.df) - - @cache_readonly - def f_stat(self): - """Returns the f-stat value.""" - return f_stat_to_dict(self._f_stat_raw) - - def f_test(self, hypothesis): - """Runs the F test, given a joint hypothesis. The hypothesis is - represented by a collection of equations, in the form - - A*x_1+B*x_2=C - - You must provide the coefficients even if they're 1. No spaces. - - The equations can be passed as either a single string or a - list of strings. - - Examples - -------- - o = ols(...) - o.f_test('1*x1+2*x2=0,1*x3=0') - o.f_test(['1*x1+2*x2=0','1*x3=0']) - """ - - x_names = self._x.columns - - R = [] - r = [] - - if isinstance(hypothesis, str): - eqs = hypothesis.split(',') - elif isinstance(hypothesis, list): - eqs = hypothesis - else: # pragma: no cover - raise Exception('hypothesis must be either string or list') - for equation in eqs: - row = np.zeros(len(x_names)) - lhs, rhs = equation.split('=') - for s in lhs.split('+'): - ss = s.split('*') - coeff = float(ss[0]) - x_name = ss[1] - - if x_name not in x_names: - raise Exception('no coefficient named %s' % x_name) - idx = x_names.get_loc(x_name) - row[idx] = coeff - rhs = float(rhs) - - R.append(row) - r.append(rhs) - - R = np.array(R) - q = len(r) - r = np.array(r).reshape(q, 1) - - result = math.calc_F(R, r, self._beta_raw, self._var_beta_raw, - self._nobs, self.df) - - return f_stat_to_dict(result) - - @cache_readonly - def _p_value_raw(self): - """Returns the raw p values.""" - from scipy.stats import t - - return 2 * t.sf(np.fabs(self._t_stat_raw), - self._df_resid_raw) - - @cache_readonly - def p_value(self): - """Returns the p values.""" - return Series(self._p_value_raw, index=self.beta.index) - - @cache_readonly - def _r2_raw(self): - """Returns the raw r-squared values.""" - if self._use_centered_tss: - return 1 - self.sm_ols.ssr / self.sm_ols.centered_tss - else: - return 1 - self.sm_ols.ssr / self.sm_ols.uncentered_tss - - @property - def _use_centered_tss(self): - # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR - return self._intercept - - @cache_readonly - def r2(self): - """Returns the r-squared values.""" - return self._r2_raw - - @cache_readonly - def _r2_adj_raw(self): - """Returns the raw r-squared adjusted values.""" - return self.sm_ols.rsquared_adj - - @cache_readonly - 
def r2_adj(self): - """Returns the r-squared adjusted values.""" - return self._r2_adj_raw - - @cache_readonly - def _resid_raw(self): - """Returns the raw residuals.""" - return self.sm_ols.resid - - @cache_readonly - def resid(self): - """Returns the residuals.""" - return Series(self._resid_raw, index=self._x.index) - - @cache_readonly - def _rmse_raw(self): - """Returns the raw rmse values.""" - return np.sqrt(self.sm_ols.mse_resid) - - @cache_readonly - def rmse(self): - """Returns the rmse value.""" - return self._rmse_raw - - @cache_readonly - def _std_err_raw(self): - """Returns the raw standard err values.""" - return np.sqrt(np.diag(self._var_beta_raw)) - - @cache_readonly - def std_err(self): - """Returns the standard err values of the betas.""" - return Series(self._std_err_raw, index=self.beta.index) - - @cache_readonly - def _t_stat_raw(self): - """Returns the raw t-stat value.""" - return self._beta_raw / self._std_err_raw - - @cache_readonly - def t_stat(self): - """Returns the t-stat values of the betas.""" - return Series(self._t_stat_raw, index=self.beta.index) - - @cache_readonly - def _var_beta_raw(self): - """ - Returns the raw covariance of beta. - """ - x = self._x.values - y = self._y.values - - xx = np.dot(x.T, x) - - if self._nw_lags is None: - return math.inv(xx) * (self._rmse_raw ** 2) - else: - resid = y - np.dot(x, self._beta_raw) - m = (x.T * resid).T - - xeps = math.newey_west(m, self._nw_lags, self._nobs, self._df_raw, - self._nw_overlap) - - xx_inv = math.inv(xx) - return np.dot(xx_inv, np.dot(xeps, xx_inv)) - - @cache_readonly - def var_beta(self): - """Returns the variance-covariance matrix of beta.""" - return DataFrame(self._var_beta_raw, index=self.beta.index, - columns=self.beta.index) - - @cache_readonly - def _y_fitted_raw(self): - """Returns the raw fitted y values.""" - if self._weights is None: - X = self._x_filtered.values - else: - # XXX - return self.sm_ols.fittedvalues - - b = self._beta_raw - return np.dot(X, b) - - @cache_readonly - def y_fitted(self): - """Returns the fitted y values. This equals BX.""" - if self._weights is None: - index = self._x_filtered.index - orig_index = index - else: - index = self._y.index - orig_index = self._y_orig.index - - result = Series(self._y_fitted_raw, index=index) - return result.reindex(orig_index) - - @cache_readonly - def _y_predict_raw(self): - """Returns the raw predicted y values.""" - return self._y_fitted_raw - - @cache_readonly - def y_predict(self): - """Returns the predicted y values. - - For in-sample, this is same as y_fitted.""" - return self.y_fitted - - def predict(self, beta=None, x=None, fill_value=None, - fill_method=None, axis=0): - """ - Parameters - ---------- - beta : Series - x : Series or DataFrame - fill_value : scalar or dict, default None - fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - axis : {0, 1}, default 0 - See DataFrame.fillna for more details - - Notes - ----- - 1. If both fill_value and fill_method are None then NaNs are dropped - (this is the default behavior) - 2. 
An intercept will be automatically added to the new_y_values if - the model was fitted using an intercept - - Returns - ------- - Series of predicted values - """ - if beta is None and x is None: - return self.y_predict - - if beta is None: - beta = self.beta - else: - beta = beta.reindex(self.beta.index) - if isnull(beta).any(): - raise ValueError('Must supply betas for same variables') - - if x is None: - x = self._x - orig_x = x - else: - orig_x = x - if fill_value is None and fill_method is None: - x = x.dropna(how='any') - else: - x = x.fillna(value=fill_value, method=fill_method, axis=axis) - if isinstance(x, Series): - x = DataFrame({'x': x}) - if self._intercept: - x['intercept'] = 1. - - x = x.reindex(columns=self._x.columns) - - rs = np.dot(x.values, beta.values) - return Series(rs, x.index).reindex(orig_x.index) - - RESULT_FIELDS = ['r2', 'r2_adj', 'df', 'df_model', 'df_resid', 'rmse', - 'f_stat', 'beta', 'std_err', 't_stat', 'p_value', 'nobs'] - - @cache_readonly - def _results(self): - results = {} - for result in self.RESULT_FIELDS: - results[result] = getattr(self, result) - - return results - - @cache_readonly - def _coef_table(self): - buf = StringIO() - - buf.write('%14s %10s %10s %10s %10s %10s %10s\n' % - ('Variable', 'Coef', 'Std Err', 't-stat', - 'p-value', 'CI 2.5%', 'CI 97.5%')) - buf.write(scom.banner('')) - coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f' - - results = self._results - - beta = results['beta'] - - for i, name in enumerate(beta.index): - if i and not (i % 5): - buf.write('\n' + scom.banner('')) - - std_err = results['std_err'][name] - CI1 = beta[name] - 1.96 * std_err - CI2 = beta[name] + 1.96 * std_err - - t_stat = results['t_stat'][name] - p_value = results['p_value'][name] - - line = coef_template % (name, - beta[name], std_err, t_stat, p_value, CI1, CI2) - - buf.write(line) - - if self.nw_lags is not None: - buf.write('\n') - buf.write('*** The calculations are Newey-West ' - 'adjusted with lags %5d\n' % self.nw_lags) - - return buf.getvalue() - - @cache_readonly - def summary_as_matrix(self): - """Returns the formatted results of the OLS as a DataFrame.""" - results = self._results - beta = results['beta'] - data = {'beta': results['beta'], - 't-stat': results['t_stat'], - 'p-value': results['p_value'], - 'std err': results['std_err']} - return DataFrame(data, beta.index).T - - @cache_readonly - def summary(self): - """ - This returns the formatted result of the OLS computation - """ - template = """ -%(bannerTop)s - -Formula: Y ~ %(formula)s - -Number of Observations: %(nobs)d -Number of Degrees of Freedom: %(df)d - -R-squared: %(r2)10.4f -Adj R-squared: %(r2_adj)10.4f - -Rmse: %(rmse)10.4f - -F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f - -Degrees of Freedom: model %(df_model)d, resid %(df_resid)d - -%(bannerCoef)s -%(coef_table)s -%(bannerEnd)s -""" - coef_table = self._coef_table - - results = self._results - - f_stat = results['f_stat'] - - bracketed = ['<%s>' % str(c) for c in results['beta'].index] - - formula = StringIO() - formula.write(bracketed[0]) - tot = len(bracketed[0]) - line = 1 - for coef in bracketed[1:]: - tot = tot + len(coef) + 3 - - if tot // (68 * line): - formula.write('\n' + ' ' * 12) - line += 1 - - formula.write(' + ' + coef) - - params = { - 'bannerTop': scom.banner('Summary of Regression Analysis'), - 'bannerCoef': scom.banner('Summary of Estimated Coefficients'), - 'bannerEnd': scom.banner('End of Summary'), - 'formula': formula.getvalue(), - 'r2': results['r2'], - 
'r2_adj': results['r2_adj'], - 'nobs': results['nobs'], - 'df': results['df'], - 'df_model': results['df_model'], - 'df_resid': results['df_resid'], - 'coef_table': coef_table, - 'rmse': results['rmse'], - 'f_stat': f_stat['f-stat'], - 'f_stat_shape': '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']), - 'f_stat_p_value': f_stat['p-value'], - } - - return template % params - - def __unicode__(self): - return self.summary - - @cache_readonly - def _time_obs_count(self): - # XXX - return self._time_has_obs.astype(int) - - @property - def _total_times(self): - return self._time_has_obs.sum() - - -class MovingOLS(OLS): - """ - Runs a rolling/expanding simple OLS. - - Parameters - ---------- - y : Series - x : Series, DataFrame, or dict of Series - weights : array-like, optional - 1d array of weights. If None, equivalent to an unweighted OLS. - window_type : {'full sample', 'rolling', 'expanding'} - Default expanding - window : int - size of window (for rolling/expanding OLS) - min_periods : int - Threshold of non-null data points to require. - If None, defaults to size of window for window_type='rolling' and 1 - otherwise - intercept : bool - True if you want an intercept. - nw_lags : None or int - Number of Newey-West lags. - nw_overlap : boolean, default False - Assume data is overlapping when computing Newey-West estimator - - """ - - def __init__(self, y, x, weights=None, window_type='expanding', - window=None, min_periods=None, intercept=True, - nw_lags=None, nw_overlap=False): - - self._args = dict(intercept=intercept, nw_lags=nw_lags, - nw_overlap=nw_overlap) - - OLS.__init__(self, y=y, x=x, weights=weights, **self._args) - - self._set_window(window_type, window, min_periods) - - def _set_window(self, window_type, window, min_periods): - self._window_type = scom._get_window_type(window_type) - - if self._is_rolling: - if window is None: - raise AssertionError("Must specify window.") - if min_periods is None: - min_periods = window - else: - window = len(self._x) - if min_periods is None: - min_periods = 1 - - self._window = int(window) - self._min_periods = min_periods - -#------------------------------------------------------------------------------ -# "Public" results - - @cache_readonly - def beta(self): - """Returns the betas in Series/DataFrame form.""" - return DataFrame(self._beta_raw, - index=self._result_index, - columns=self._x.columns) - - @cache_readonly - def rank(self): - return Series(self._rank_raw, index=self._result_index) - - @cache_readonly - def df(self): - """Returns the degrees of freedom.""" - return Series(self._df_raw, index=self._result_index) - - @cache_readonly - def df_model(self): - """Returns the model degrees of freedom.""" - return Series(self._df_model_raw, index=self._result_index) - - @cache_readonly - def df_resid(self): - """Returns the residual degrees of freedom.""" - return Series(self._df_resid_raw, index=self._result_index) - - @cache_readonly - def f_stat(self): - """Returns the f-stat value.""" - f_stat_dicts = dict((date, f_stat_to_dict(f_stat)) - for date, f_stat in zip(self.beta.index, - self._f_stat_raw)) - - return DataFrame(f_stat_dicts).T - - def f_test(self, hypothesis): - raise NotImplementedError('must use full sample') - - @cache_readonly - def forecast_mean(self): - return Series(self._forecast_mean_raw, index=self._result_index) - - @cache_readonly - def forecast_vol(self): - return Series(self._forecast_vol_raw, index=self._result_index) - - @cache_readonly - def p_value(self): - """Returns the p values.""" - cols = 
self.beta.columns - return DataFrame(self._p_value_raw, columns=cols, - index=self._result_index) - - @cache_readonly - def r2(self): - """Returns the r-squared values.""" - return Series(self._r2_raw, index=self._result_index) - - @cache_readonly - def resid(self): - """Returns the residuals.""" - return Series(self._resid_raw[self._valid_obs_labels], - index=self._result_index) - - @cache_readonly - def r2_adj(self): - """Returns the r-squared adjusted values.""" - index = self.r2.index - - return Series(self._r2_adj_raw, index=index) - - @cache_readonly - def rmse(self): - """Returns the rmse values.""" - return Series(self._rmse_raw, index=self._result_index) - - @cache_readonly - def std_err(self): - """Returns the standard err values.""" - return DataFrame(self._std_err_raw, columns=self.beta.columns, - index=self._result_index) - - @cache_readonly - def t_stat(self): - """Returns the t-stat value.""" - return DataFrame(self._t_stat_raw, columns=self.beta.columns, - index=self._result_index) - - @cache_readonly - def var_beta(self): - """Returns the covariance of beta.""" - result = {} - result_index = self._result_index - for i in range(len(self._var_beta_raw)): - dm = DataFrame(self._var_beta_raw[i], columns=self.beta.columns, - index=self.beta.columns) - result[result_index[i]] = dm - - return Panel.from_dict(result, intersect=False) - - @cache_readonly - def y_fitted(self): - """Returns the fitted y values.""" - return Series(self._y_fitted_raw[self._valid_obs_labels], - index=self._result_index) - - @cache_readonly - def y_predict(self): - """Returns the predicted y values.""" - return Series(self._y_predict_raw[self._valid_obs_labels], - index=self._result_index) - -#------------------------------------------------------------------------------ -# "raw" attributes, calculations - - @property - def _is_rolling(self): - return self._window_type == 'rolling' - - @cache_readonly - def _beta_raw(self): - """Runs the regression and returns the beta.""" - beta, indices, mask = self._rolling_ols_call - - return beta[indices] - - @cache_readonly - def _result_index(self): - return self._index[self._valid_indices] - - @property - def _valid_indices(self): - return self._rolling_ols_call[1] - - @cache_readonly - def _rolling_ols_call(self): - return self._calc_betas(self._x_trans, self._y_trans) - - def _calc_betas(self, x, y): - N = len(self._index) - K = len(self._x.columns) - - betas = np.empty((N, K), dtype=float) - betas[:] = np.NaN - - valid = self._time_has_obs - enough = self._enough_obs - window = self._window - - # Use transformed (demeaned) Y, X variables - cum_xx = self._cum_xx(x) - cum_xy = self._cum_xy(x, y) - - for i in range(N): - if not valid[i] or not enough[i]: - continue - - xx = cum_xx[i] - xy = cum_xy[i] - if self._is_rolling and i >= window: - xx = xx - cum_xx[i - window] - xy = xy - cum_xy[i - window] - - betas[i] = math.solve(xx, xy) - - mask = ~np.isnan(betas).any(axis=1) - have_betas = np.arange(N)[mask] - - return betas, have_betas, mask - - def _rolling_rank(self): - dates = self._index - window = self._window - - ranks = np.empty(len(dates), dtype=float) - ranks[:] = np.NaN - for i, date in enumerate(dates): - if self._is_rolling and i >= window: - prior_date = dates[i - window + 1] - else: - prior_date = dates[0] - - x_slice = self._x.truncate(before=prior_date, after=date).values - - if len(x_slice) == 0: - continue - - ranks[i] = math.rank(x_slice) - - return ranks - - def _cum_xx(self, x): - dates = self._index - K = len(x.columns) - valid = 
self._time_has_obs - cum_xx = [] - - slicer = lambda df, dt: df.truncate(dt, dt).values - if not self._panel_model: - _get_index = x.index.get_loc - - def slicer(df, dt): - i = _get_index(dt) - return df.values[i:i + 1, :] - - last = np.zeros((K, K)) - - for i, date in enumerate(dates): - if not valid[i]: - cum_xx.append(last) - continue - - x_slice = slicer(x, date) - xx = last = last + np.dot(x_slice.T, x_slice) - cum_xx.append(xx) - - return cum_xx - - def _cum_xy(self, x, y): - dates = self._index - valid = self._time_has_obs - cum_xy = [] - - x_slicer = lambda df, dt: df.truncate(dt, dt).values - if not self._panel_model: - _get_index = x.index.get_loc - - def x_slicer(df, dt): - i = _get_index(dt) - return df.values[i:i + 1] - - _y_get_index = y.index.get_loc - _values = y.values - if isinstance(y.index, MultiIndex): - def y_slicer(df, dt): - loc = _y_get_index(dt) - return _values[loc] - else: - def y_slicer(df, dt): - i = _y_get_index(dt) - return _values[i:i + 1] - - last = np.zeros(len(x.columns)) - for i, date in enumerate(dates): - if not valid[i]: - cum_xy.append(last) - continue - - x_slice = x_slicer(x, date) - y_slice = y_slicer(y, date) - - xy = last = last + np.dot(x_slice.T, y_slice) - cum_xy.append(xy) - - return cum_xy - - @cache_readonly - def _rank_raw(self): - rank = self._rolling_rank() - return rank[self._valid_indices] - - @cache_readonly - def _df_raw(self): - """Returns the degrees of freedom.""" - return self._rank_raw - - @cache_readonly - def _df_model_raw(self): - """Returns the raw model degrees of freedom.""" - return self._df_raw - 1 - - @cache_readonly - def _df_resid_raw(self): - """Returns the raw residual degrees of freedom.""" - return self._nobs - self._df_raw - - @cache_readonly - def _f_stat_raw(self): - """Returns the raw f-stat value.""" - from scipy.stats import f - - items = self.beta.columns - nobs = self._nobs - df = self._df_raw - df_resid = nobs - df - - # var_beta has not been newey-west adjusted - if self._nw_lags is None: - F = self._r2_raw / (self._r2_raw - self._r2_adj_raw) - - q = len(items) - if 'intercept' in items: - q -= 1 - - def get_result_simple(Fst, d): - return Fst, (q, d), 1 - f.cdf(Fst, q, d) - - # Compute the P-value for each pair - result = starmap(get_result_simple, zip(F, df_resid)) - - return list(result) - - K = len(items) - R = np.eye(K) - r = np.zeros((K, 1)) - - try: - intercept = items.get_loc('intercept') - R = np.concatenate((R[0: intercept], R[intercept + 1:])) - r = np.concatenate((r[0: intercept], r[intercept + 1:])) - except KeyError: - # no intercept - pass - - def get_result(beta, vcov, n, d): - return math.calc_F(R, r, beta, vcov, n, d) - - results = starmap(get_result, - zip(self._beta_raw, self._var_beta_raw, nobs, df)) - - return list(results) - - @cache_readonly - def _p_value_raw(self): - """Returns the raw p values.""" - from scipy.stats import t - - result = [2 * t.sf(a, b) - for a, b in zip(np.fabs(self._t_stat_raw), - self._df_resid_raw)] - - return np.array(result) - - @cache_readonly - def _resid_stats(self): - uncentered_sst = [] - sst = [] - sse = [] - - Yreg = self._y - Y = self._y_trans - X = self._x_trans - weights = self._weights - - dates = self._index - window = self._window - for n, index in enumerate(self._valid_indices): - if self._is_rolling and index >= window: - prior_date = dates[index - window + 1] - else: - prior_date = dates[0] - - date = dates[index] - beta = self._beta_raw[n] - - X_slice = X.truncate(before=prior_date, after=date).values - Y_slice = 
_y_converter(Y.truncate(before=prior_date, after=date)) - - resid = Y_slice - np.dot(X_slice, beta) - - if weights is not None: - Y_slice = _y_converter(Yreg.truncate(before=prior_date, - after=date)) - weights_slice = weights.truncate(prior_date, date) - demeaned = Y_slice - np.average(Y_slice, weights=weights_slice) - SS_total = (weights_slice * demeaned ** 2).sum() - else: - SS_total = ((Y_slice - Y_slice.mean()) ** 2).sum() - - SS_err = (resid ** 2).sum() - SST_uncentered = (Y_slice ** 2).sum() - - sse.append(SS_err) - sst.append(SS_total) - uncentered_sst.append(SST_uncentered) - - return { - 'sse': np.array(sse), - 'centered_tss': np.array(sst), - 'uncentered_tss': np.array(uncentered_sst), - } - - @cache_readonly - def _rmse_raw(self): - """Returns the raw rmse values.""" - return np.sqrt(self._resid_stats['sse'] / self._df_resid_raw) - - @cache_readonly - def _r2_raw(self): - rs = self._resid_stats - - if self._use_centered_tss: - return 1 - rs['sse'] / rs['centered_tss'] - else: - return 1 - rs['sse'] / rs['uncentered_tss'] - - @cache_readonly - def _r2_adj_raw(self): - """Returns the raw r-squared adjusted values.""" - nobs = self._nobs - factors = (nobs - 1) / (nobs - self._df_raw) - return 1 - (1 - self._r2_raw) * factors - - @cache_readonly - def _resid_raw(self): - """Returns the raw residuals.""" - return (self._y.values - self._y_fitted_raw) - - @cache_readonly - def _std_err_raw(self): - """Returns the raw standard err values.""" - results = [] - for i in range(len(self._var_beta_raw)): - results.append(np.sqrt(np.diag(self._var_beta_raw[i]))) - - return np.array(results) - - @cache_readonly - def _t_stat_raw(self): - """Returns the raw t-stat value.""" - return self._beta_raw / self._std_err_raw - - @cache_readonly - def _var_beta_raw(self): - """Returns the raw covariance of beta.""" - x = self._x_trans - y = self._y_trans - dates = self._index - nobs = self._nobs - rmse = self._rmse_raw - beta = self._beta_raw - df = self._df_raw - window = self._window - cum_xx = self._cum_xx(self._x) - - results = [] - for n, i in enumerate(self._valid_indices): - xx = cum_xx[i] - date = dates[i] - - if self._is_rolling and i >= window: - xx = xx - cum_xx[i - window] - prior_date = dates[i - window + 1] - else: - prior_date = dates[0] - - x_slice = x.truncate(before=prior_date, after=date) - y_slice = y.truncate(before=prior_date, after=date) - xv = x_slice.values - yv = np.asarray(y_slice) - - if self._nw_lags is None: - result = math.inv(xx) * (rmse[n] ** 2) - else: - resid = yv - np.dot(xv, beta[n]) - m = (xv.T * resid).T - - xeps = math.newey_west(m, self._nw_lags, nobs[n], df[n], - self._nw_overlap) - - xx_inv = math.inv(xx) - result = np.dot(xx_inv, np.dot(xeps, xx_inv)) - - results.append(result) - - return np.array(results) - - @cache_readonly - def _forecast_mean_raw(self): - """Returns the raw covariance of beta.""" - nobs = self._nobs - window = self._window - - # x should be ones - dummy = DataFrame(index=self._y.index) - dummy['y'] = 1 - - cum_xy = self._cum_xy(dummy, self._y) - - results = [] - for n, i in enumerate(self._valid_indices): - sumy = cum_xy[i] - - if self._is_rolling and i >= window: - sumy = sumy - cum_xy[i - window] - - results.append(sumy[0] / nobs[n]) - - return np.array(results) - - @cache_readonly - def _forecast_vol_raw(self): - """Returns the raw covariance of beta.""" - beta = self._beta_raw - window = self._window - dates = self._index - x = self._x - - results = [] - for n, i in enumerate(self._valid_indices): - date = dates[i] - if 
self._is_rolling and i >= window: - prior_date = dates[i - window + 1] - else: - prior_date = dates[0] - - x_slice = x.truncate(prior_date, date).values - x_demeaned = x_slice - x_slice.mean(0) - x_cov = np.dot(x_demeaned.T, x_demeaned) / (len(x_slice) - 1) - - B = beta[n] - result = np.dot(B, np.dot(x_cov, B)) - results.append(np.sqrt(result)) - - return np.array(results) - - @cache_readonly - def _y_fitted_raw(self): - """Returns the raw fitted y values.""" - return (self._x.values * self._beta_matrix(lag=0)).sum(1) - - @cache_readonly - def _y_predict_raw(self): - """Returns the raw predicted y values.""" - return (self._x.values * self._beta_matrix(lag=1)).sum(1) - - @cache_readonly - def _results(self): - results = {} - for result in self.RESULT_FIELDS: - value = getattr(self, result) - if isinstance(value, Series): - value = value[self.beta.index[-1]] - elif isinstance(value, DataFrame): - value = value.xs(self.beta.index[-1]) - else: # pragma: no cover - raise Exception('Problem retrieving %s' % result) - results[result] = value - - return results - - @cache_readonly - def _window_time_obs(self): - window_obs = (Series(self._time_obs_count > 0) - .rolling(self._window, min_periods=1) - .sum() - .values - ) - - window_obs[np.isnan(window_obs)] = 0 - return window_obs.astype(int) - - @cache_readonly - def _nobs_raw(self): - if self._is_rolling: - window = self._window - else: - # expanding case - window = len(self._index) - - result = Series(self._time_obs_count).rolling( - window, min_periods=1).sum().values - - return result.astype(int) - - def _beta_matrix(self, lag=0): - if lag < 0: - raise AssertionError("'lag' must be greater than or equal to 0, " - "input was {0}".format(lag)) - - betas = self._beta_raw - - labels = np.arange(len(self._y)) - lag - indexer = self._valid_obs_labels.searchsorted(labels, side='left') - indexer[indexer == len(betas)] = len(betas) - 1 - - beta_matrix = betas[indexer] - beta_matrix[labels < self._valid_obs_labels[0]] = np.NaN - - return beta_matrix - - @cache_readonly - def _valid_obs_labels(self): - dates = self._index[self._valid_indices] - return self._y.index.searchsorted(dates) - - @cache_readonly - def _nobs(self): - return self._nobs_raw[self._valid_indices] - - @property - def nobs(self): - return Series(self._nobs, index=self._result_index) - - @cache_readonly - def _enough_obs(self): - # XXX: what's the best way to determine where to start? - return self._nobs_raw >= max(self._min_periods, - len(self._x.columns) + 1) - - -def _safe_update(d, other): - """ - Combine dictionaries with non-overlapping keys - """ - for k, v in compat.iteritems(other): - if k in d: - raise Exception('Duplicate regressor: %s' % k) - - d[k] = v - - -def _filter_data(lhs, rhs, weights=None): - """ - Cleans the input for single OLS. - - Parameters - ---------- - lhs : Series - Dependent variable in the regression. - rhs : dict, whose values are Series, DataFrame, or dict - Explanatory variables of the regression. - weights : array-like, optional - 1d array of weights. If None, equivalent to an unweighted OLS. 
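The cleaning this helper performs amounts to aligning lhs, rhs and the optional weights on a shared index and keeping only fully observed rows. In current pandas idiom that is roughly the following sketch (it reuses the surrounding variable names and assumes rhs has already been combined into a single DataFrame, as _combine_rhs does below):

# Sketch of the alignment done by _filter_data: outer-join the
# pieces, then keep only rows where every column is observed.
combined = rhs.join(lhs.rename('__y__'), how='outer')
valid = combined.notnull().all(axis=1)
filt = combined[valid]
filt_lhs = filt.pop('__y__')
filt_rhs = filt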
- - Returns - ------- - Series, DataFrame - Cleaned lhs and rhs - """ - if not isinstance(lhs, Series): - if len(lhs) != len(rhs): - raise AssertionError("length of lhs must equal length of rhs") - lhs = Series(lhs, index=rhs.index) - - rhs = _combine_rhs(rhs) - lhs = DataFrame({'__y__': lhs}, dtype=float) - pre_filt_rhs = rhs.dropna(how='any') - - combined = rhs.join(lhs, how='outer') - if weights is not None: - combined['__weights__'] = weights - - valid = (combined.count(1) == len(combined.columns)).values - index = combined.index - combined = combined[valid] - - if weights is not None: - filt_weights = combined.pop('__weights__') - else: - filt_weights = None - - filt_lhs = combined.pop('__y__') - filt_rhs = combined - - if hasattr(filt_weights, 'to_dense'): - filt_weights = filt_weights.to_dense() - - return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights, - pre_filt_rhs.to_dense(), index, valid) - - -def _combine_rhs(rhs): - """ - Glue input X variables together while checking for potential - duplicates - """ - series = {} - - if isinstance(rhs, Series): - series['x'] = rhs - elif isinstance(rhs, DataFrame): - series = rhs.copy() - elif isinstance(rhs, dict): - for name, value in compat.iteritems(rhs): - if isinstance(value, Series): - _safe_update(series, {name: value}) - elif isinstance(value, (dict, DataFrame)): - _safe_update(series, value) - else: # pragma: no cover - raise Exception('Invalid RHS data type: %s' % type(value)) - else: # pragma: no cover - raise Exception('Invalid RHS type: %s' % type(rhs)) - - if not isinstance(series, DataFrame): - series = DataFrame(series, dtype=float) - - return series - -# A little kludge so we can use this method for both -# MovingOLS and MovingPanelOLS - - -def _y_converter(y): - y = y.values.squeeze() - if y.ndim == 0: # pragma: no cover - return np.array([y]) - else: - return y - - -def f_stat_to_dict(result): - f_stat, shape, p_value = result - - result = {} - result['f-stat'] = f_stat - result['DF X'] = shape[0] - result['DF Resid'] = shape[1] - result['p-value'] = p_value - - return result diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py deleted file mode 100644 index 806dc289f843a..0000000000000 --- a/pandas/stats/plm.py +++ /dev/null @@ -1,863 +0,0 @@ -""" -Linear regression objects for panel data -""" - -# pylint: disable-msg=W0231 -# pylint: disable-msg=E1101,E1103 - -# flake8: noqa - -from __future__ import division -from pandas.compat import range -from pandas import compat -import warnings - -import numpy as np - -from pandas.core.panel import Panel -from pandas.core.frame import DataFrame -from pandas.core.reshape import get_dummies -from pandas.core.series import Series -from pandas.stats.ols import OLS, MovingOLS -import pandas.stats.common as com -import pandas.stats.math as math -from pandas.util.decorators import cache_readonly - - -class PanelOLS(OLS): - """Implements panel OLS. - - See ols function docs - """ - _panel_model = True - - def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, - entity_effects=False, time_effects=False, x_effects=None, - cluster=None, dropped_dummies=None, verbose=False, - nw_overlap=False): - import warnings - warnings.warn("The pandas.stats.plm module is deprecated and will be " - "removed in a future version. 
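# _filter_data above joins y, X and optional weights into one frame and
# keeps only the fully observed rows before estimating. The same alignment
# step in plain pandas (column names are illustrative):
import numpy as np
import pandas as pd

X = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0],
                  'b': [1.0, np.nan, 3.0, 4.0]})
y = pd.Series([1.0, 2.0, 3.0, np.nan], name='__y__')
combined = X.join(y, how='outer').dropna(how='any')
filt_y, filt_X = combined.pop('__y__'), combined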
We refer to external packages " - "like statsmodels, see some examples here: " - "http://www.statsmodels.org/stable/mixed_linear.html", - FutureWarning, stacklevel=4) - self._x_orig = x - self._y_orig = y - self._weights = weights - - self._intercept = intercept - self._nw_lags = nw_lags - self._nw_overlap = nw_overlap - self._entity_effects = entity_effects - self._time_effects = time_effects - self._x_effects = x_effects - self._dropped_dummies = dropped_dummies or {} - self._cluster = com._get_cluster_type(cluster) - self._verbose = verbose - - (self._x, self._x_trans, - self._x_filtered, self._y, - self._y_trans) = self._prepare_data() - - self._index = self._x.index.levels[0] - - self._T = len(self._index) - - def log(self, msg): - if self._verbose: # pragma: no cover - print(msg) - - def _prepare_data(self): - """Cleans and stacks input data into DataFrame objects - - If time effects is True, then we turn off intercepts and omit an item - from every (entity and x) fixed effect. - - Otherwise: - - If we have an intercept, we omit an item from every fixed effect. - - Else, we omit an item from every fixed effect except one of them. - - The categorical variables will get dropped from x. - """ - (x, x_filtered, y, weights, cat_mapping) = self._filter_data() - - self.log('Adding dummies to X variables') - x = self._add_dummies(x, cat_mapping) - - self.log('Adding dummies to filtered X variables') - x_filtered = self._add_dummies(x_filtered, cat_mapping) - - if self._x_effects: - x = x.drop(self._x_effects, axis=1) - x_filtered = x_filtered.drop(self._x_effects, axis=1) - - if self._time_effects: - x_regressor = x.sub(x.mean(level=0), level=0) - - unstacked_y = y.unstack() - y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack() - y_regressor.index = y.index - - elif self._intercept: - # only add intercept when no time effects - self.log('Adding intercept') - x = x_regressor = add_intercept(x) - x_filtered = add_intercept(x_filtered) - y_regressor = y - else: - self.log('No intercept added') - x_regressor = x - y_regressor = y - - if weights is not None: - if not y_regressor.index.equals(weights.index): - raise AssertionError("y_regressor and weights must have the " - "same index") - if not x_regressor.index.equals(weights.index): - raise AssertionError("x_regressor and weights must have the " - "same index") - - rt_weights = np.sqrt(weights) - y_regressor = y_regressor * rt_weights - x_regressor = x_regressor.mul(rt_weights, axis=0) - - return x, x_regressor, x_filtered, y, y_regressor - - def _filter_data(self): - """ - - """ - data = self._x_orig - cat_mapping = {} - - if isinstance(data, DataFrame): - data = data.to_panel() - else: - if isinstance(data, Panel): - data = data.copy() - - data, cat_mapping = self._convert_x(data) - - if not isinstance(data, Panel): - data = Panel.from_dict(data, intersect=True) - - x_names = data.items - - if self._weights is not None: - data['__weights__'] = self._weights - - # Filter x's without y (so we can make a prediction) - filtered = data.to_frame() - - # Filter all data together using to_frame - - # convert to DataFrame - y = self._y_orig - if isinstance(y, Series): - y = y.unstack() - - data['__y__'] = y - data_long = data.to_frame() - - x_filt = filtered.filter(x_names) - x = data_long.filter(x_names) - y = data_long['__y__'] - - if self._weights is not None and not self._weights.empty: - weights = data_long['__weights__'] - else: - weights = None - - return x, x_filt, y, weights, cat_mapping - - def _convert_x(self, x): - # 
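# _prepare_data above implements weighted least squares by scaling both y
# and X by sqrt(weights); statsmodels exposes that estimator directly, and
# the two routes agree. A sketch with made-up data (sm.add_constant plays
# the role of the add_intercept helper defined later in this module):
import numpy as np
import statsmodels.api as sm

y = np.random.randn(50)
X = sm.add_constant(np.random.randn(50, 2))
w = np.random.uniform(0.5, 2.0, size=50)
direct = sm.WLS(y, X, weights=w).fit().params
scaled = sm.OLS(y * np.sqrt(w), X * np.sqrt(w)[:, None]).fit().params
assert np.allclose(direct, scaled)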
Converts non-numeric data in x to floats. x_converted is the - # DataFrame with converted values, and x_conversion is a dict that - # provides the reverse mapping. For example, if 'A' was converted to 0 - # for x named 'variety', then x_conversion['variety'][0] is 'A'. - x_converted = {} - cat_mapping = {} - # x can be either a dict or a Panel, but in Python 3, dicts don't have - # .iteritems - iteritems = getattr(x, 'iteritems', x.items) - for key, df in iteritems(): - if not isinstance(df, DataFrame): - raise AssertionError("all input items must be DataFrames, " - "at least one is of " - "type {0}".format(type(df))) - - if _is_numeric(df): - x_converted[key] = df - else: - try: - df = df.astype(float) - except (TypeError, ValueError): - values = df.values - distinct_values = sorted(set(values.flat)) - cat_mapping[key] = dict(enumerate(distinct_values)) - new_values = np.searchsorted(distinct_values, values) - x_converted[key] = DataFrame(new_values, index=df.index, - columns=df.columns) - - if len(cat_mapping) == 0: - x_converted = x - - return x_converted, cat_mapping - - def _add_dummies(self, panel, mapping): - """ - Add entity and / or categorical dummies to input X DataFrame - - Returns - ------- - DataFrame - """ - panel = self._add_entity_effects(panel) - panel = self._add_categorical_dummies(panel, mapping) - - return panel - - def _add_entity_effects(self, panel): - """ - Add entity dummies to panel - - Returns - ------- - DataFrame - """ - from pandas.core.reshape import make_axis_dummies - - if not self._entity_effects: - return panel - - self.log('-- Adding entity fixed effect dummies') - - dummies = make_axis_dummies(panel, 'minor') - - if not self._use_all_dummies: - if 'entity' in self._dropped_dummies: - to_exclude = str(self._dropped_dummies.get('entity')) - else: - to_exclude = dummies.columns[0] - - if to_exclude not in dummies.columns: - raise Exception('%s not in %s' % (to_exclude, - dummies.columns)) - - self.log('-- Excluding dummy for entity: %s' % to_exclude) - - dummies = dummies.filter(dummies.columns.difference([to_exclude])) - - dummies = dummies.add_prefix('FE_') - panel = panel.join(dummies) - - return panel - - def _add_categorical_dummies(self, panel, cat_mappings): - """ - Add categorical dummies to panel - - Returns - ------- - DataFrame - """ - if not self._x_effects: - return panel - - dropped_dummy = (self._entity_effects and not self._use_all_dummies) - - for effect in self._x_effects: - self.log('-- Adding fixed effect dummies for %s' % effect) - - dummies = get_dummies(panel[effect]) - - val_map = cat_mappings.get(effect) - if val_map: - val_map = dict((v, k) for k, v in compat.iteritems(val_map)) - - if dropped_dummy or not self._use_all_dummies: - if effect in self._dropped_dummies: - to_exclude = mapped_name = self._dropped_dummies.get( - effect) - - if val_map: - mapped_name = val_map[to_exclude] - else: - to_exclude = mapped_name = dummies.columns[0] - - if mapped_name not in dummies.columns: # pragma: no cover - raise Exception('%s not in %s' % (to_exclude, - dummies.columns)) - - self.log( - '-- Excluding dummy for %s: %s' % (effect, to_exclude)) - - dummies = dummies.filter( - dummies.columns.difference([mapped_name])) - dropped_dummy = True - - dummies = _convertDummies(dummies, cat_mappings.get(effect)) - dummies = dummies.add_prefix('%s_' % effect) - panel = panel.join(dummies) - - return panel - - @property - def _use_all_dummies(self): - """ - In the case of using an intercept or including time fixed - effects, completely partitioning 
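# The dummy machinery above (entity and categorical fixed effects, with one
# level dropped so X stays full rank) maps onto today's
# pd.get_dummies(..., drop_first=True). A sketch:
import pandas as pd

df = pd.DataFrame({'entity': ['A', 'B', 'C', 'A'],
                   'x1': [1.0, 2.0, 3.0, 4.0]})
X = pd.get_dummies(df, columns=['entity'], prefix='FE', drop_first=True)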
the sample would make the X - not full rank. - """ - return (not self._intercept and not self._time_effects) - - @cache_readonly - def _beta_raw(self): - """Runs the regression and returns the beta.""" - X = self._x_trans.values - Y = self._y_trans.values.squeeze() - - beta, _, _, _ = np.linalg.lstsq(X, Y) - - return beta - - @cache_readonly - def beta(self): - return Series(self._beta_raw, index=self._x.columns) - - @cache_readonly - def _df_model_raw(self): - """Returns the raw model degrees of freedom.""" - return self._df_raw - 1 - - @cache_readonly - def _df_resid_raw(self): - """Returns the raw residual degrees of freedom.""" - return self._nobs - self._df_raw - - @cache_readonly - def _df_raw(self): - """Returns the degrees of freedom.""" - df = math.rank(self._x_trans.values) - if self._time_effects: - df += self._total_times - - return df - - @cache_readonly - def _r2_raw(self): - Y = self._y_trans.values.squeeze() - X = self._x_trans.values - - resid = Y - np.dot(X, self._beta_raw) - - SSE = (resid ** 2).sum() - - if self._use_centered_tss: - SST = ((Y - np.mean(Y)) ** 2).sum() - else: - SST = (Y ** 2).sum() - - return 1 - SSE / SST - - @property - def _use_centered_tss(self): - # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR - return self._intercept or self._entity_effects or self._time_effects - - @cache_readonly - def _r2_adj_raw(self): - """Returns the raw r-squared adjusted values.""" - nobs = self._nobs - factors = (nobs - 1) / (nobs - self._df_raw) - return 1 - (1 - self._r2_raw) * factors - - @cache_readonly - def _resid_raw(self): - Y = self._y.values.squeeze() - X = self._x.values - return Y - np.dot(X, self._beta_raw) - - @cache_readonly - def resid(self): - return self._unstack_vector(self._resid_raw) - - @cache_readonly - def _rmse_raw(self): - """Returns the raw rmse values.""" - # X = self._x.values - # Y = self._y.values.squeeze() - - X = self._x_trans.values - Y = self._y_trans.values.squeeze() - - resid = Y - np.dot(X, self._beta_raw) - ss = (resid ** 2).sum() - return np.sqrt(ss / (self._nobs - self._df_raw)) - - @cache_readonly - def _var_beta_raw(self): - cluster_axis = None - if self._cluster == 'time': - cluster_axis = 0 - elif self._cluster == 'entity': - cluster_axis = 1 - - x = self._x - y = self._y - - if self._time_effects: - xx = _xx_time_effects(x, y) - else: - xx = np.dot(x.values.T, x.values) - - return _var_beta_panel(y, x, self._beta_raw, xx, - self._rmse_raw, cluster_axis, self._nw_lags, - self._nobs, self._df_raw, self._nw_overlap) - - @cache_readonly - def _y_fitted_raw(self): - """Returns the raw fitted y values.""" - return np.dot(self._x.values, self._beta_raw) - - @cache_readonly - def y_fitted(self): - return self._unstack_vector(self._y_fitted_raw, index=self._x.index) - - def _unstack_vector(self, vec, index=None): - if index is None: - index = self._y_trans.index - panel = DataFrame(vec, index=index, columns=['dummy']) - return panel.to_panel()['dummy'] - - def _unstack_y(self, vec): - unstacked = self._unstack_vector(vec) - return unstacked.reindex(self.beta.index) - - @cache_readonly - def _time_obs_count(self): - return self._y_trans.count(level=0).values - - @cache_readonly - def _time_has_obs(self): - return self._time_obs_count > 0 - - @property - def _nobs(self): - return len(self._y) - - -def _convertDummies(dummies, mapping): - # cleans up the names of the generated dummies - new_items = [] - for item in dummies.columns: - if not mapping: - var = str(item) - if isinstance(item, float): - var = '%g' % item - - 
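# The pooled estimator above is a single least-squares solve on data stacked
# by (date, entity). Its commonly cited successor is the third-party
# `linearmodels` package, which is not referenced in this patch; the sketch
# below is an assumption about that API, using its (entity, time) index
# convention (the reverse of the major/minor order used here):
import numpy as np
import pandas as pd
from linearmodels.panel import PanelOLS

idx = pd.MultiIndex.from_product(
    [['a', 'b', 'c'], pd.date_range('2000-01-01', periods=10)],
    names=['entity', 'time'])
y = pd.Series(np.random.randn(30), index=idx)
X = pd.DataFrame({'x1': np.random.randn(30)}, index=idx)
res = PanelOLS(y, X, entity_effects=True).fit()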
new_items.append(var) - else: - # renames the dummies if a conversion dict is provided - new_items.append(mapping[int(item)]) - - dummies = DataFrame(dummies.values, index=dummies.index, - columns=new_items) - - return dummies - - -def _is_numeric(df): - for col in df: - if df[col].dtype.name == 'object': - return False - - return True - - -def add_intercept(panel, name='intercept'): - """ - Add column of ones to input panel - - Parameters - ---------- - panel: Panel / DataFrame - name: string, default 'intercept'] - - Returns - ------- - New object (same type as input) - """ - panel = panel.copy() - panel[name] = 1. - - return panel.consolidate() - - -class MovingPanelOLS(MovingOLS, PanelOLS): - """Implements rolling/expanding panel OLS. - - See ols function docs - """ - _panel_model = True - - def __init__(self, y, x, weights=None, - window_type='expanding', window=None, - min_periods=None, - min_obs=None, - intercept=True, - nw_lags=None, nw_overlap=False, - entity_effects=False, - time_effects=False, - x_effects=None, - cluster=None, - dropped_dummies=None, - verbose=False): - - self._args = dict(intercept=intercept, - nw_lags=nw_lags, - nw_overlap=nw_overlap, - entity_effects=entity_effects, - time_effects=time_effects, - x_effects=x_effects, - cluster=cluster, - dropped_dummies=dropped_dummies, - verbose=verbose) - - PanelOLS.__init__(self, y=y, x=x, weights=weights, - **self._args) - - self._set_window(window_type, window, min_periods) - - if min_obs is None: - min_obs = len(self._x.columns) + 1 - - self._min_obs = min_obs - - @cache_readonly - def resid(self): - return self._unstack_y(self._resid_raw) - - @cache_readonly - def y_fitted(self): - return self._unstack_y(self._y_fitted_raw) - - @cache_readonly - def y_predict(self): - """Returns the predicted y values.""" - return self._unstack_y(self._y_predict_raw) - - def lagged_y_predict(self, lag=1): - """ - Compute forecast Y value lagging coefficient by input number - of time periods - - Parameters - ---------- - lag : int - - Returns - ------- - DataFrame - """ - x = self._x.values - betas = self._beta_matrix(lag=lag) - return self._unstack_y((betas * x).sum(1)) - - @cache_readonly - def _rolling_ols_call(self): - return self._calc_betas(self._x_trans, self._y_trans) - - @cache_readonly - def _df_raw(self): - """Returns the degrees of freedom.""" - df = self._rolling_rank() - - if self._time_effects: - df += self._window_time_obs - - return df[self._valid_indices] - - @cache_readonly - def _var_beta_raw(self): - """Returns the raw covariance of beta.""" - x = self._x - y = self._y - - dates = x.index.levels[0] - - cluster_axis = None - if self._cluster == 'time': - cluster_axis = 0 - elif self._cluster == 'entity': - cluster_axis = 1 - - nobs = self._nobs - rmse = self._rmse_raw - beta = self._beta_raw - df = self._df_raw - window = self._window - - if not self._time_effects: - # Non-transformed X - cum_xx = self._cum_xx(x) - - results = [] - for n, i in enumerate(self._valid_indices): - if self._is_rolling and i >= window: - prior_date = dates[i - window + 1] - else: - prior_date = dates[0] - - date = dates[i] - - x_slice = x.truncate(prior_date, date) - y_slice = y.truncate(prior_date, date) - - if self._time_effects: - xx = _xx_time_effects(x_slice, y_slice) - else: - xx = cum_xx[i] - if self._is_rolling and i >= window: - xx = xx - cum_xx[i - window] - - result = _var_beta_panel(y_slice, x_slice, beta[n], xx, rmse[n], - cluster_axis, self._nw_lags, - nobs[n], df[n], self._nw_overlap) - - results.append(result) - - return 
np.array(results) - - @cache_readonly - def _resid_raw(self): - beta_matrix = self._beta_matrix(lag=0) - - Y = self._y.values.squeeze() - X = self._x.values - resid = Y - (X * beta_matrix).sum(1) - - return resid - - @cache_readonly - def _y_fitted_raw(self): - x = self._x.values - betas = self._beta_matrix(lag=0) - return (betas * x).sum(1) - - @cache_readonly - def _y_predict_raw(self): - """Returns the raw predicted y values.""" - x = self._x.values - betas = self._beta_matrix(lag=1) - return (betas * x).sum(1) - - def _beta_matrix(self, lag=0): - if lag < 0: - raise AssertionError("'lag' must be greater than or equal to 0, " - "input was {0}".format(lag)) - - index = self._y_trans.index - major_labels = index.labels[0] - labels = major_labels - lag - indexer = self._valid_indices.searchsorted(labels, side='left') - - beta_matrix = self._beta_raw[indexer] - beta_matrix[labels < self._valid_indices[0]] = np.NaN - - return beta_matrix - - @cache_readonly - def _enough_obs(self): - # XXX: what's the best way to determine where to start? - # TODO: write unit tests for this - - rank_threshold = len(self._x.columns) + 1 - if self._min_obs < rank_threshold: # pragma: no cover - warnings.warn('min_obs is smaller than rank of X matrix') - - enough_observations = self._nobs_raw >= self._min_obs - enough_time_periods = self._window_time_obs >= self._min_periods - return enough_time_periods & enough_observations - - -def create_ols_dict(attr): - def attr_getter(self): - d = {} - for k, v in compat.iteritems(self.results): - result = getattr(v, attr) - d[k] = result - - return d - - return attr_getter - - -def create_ols_attr(attr): - return property(create_ols_dict(attr)) - - -class NonPooledPanelOLS(object): - """Implements non-pooled panel OLS. - - Parameters - ---------- - y : DataFrame - x : Series, DataFrame, or dict of Series - intercept : bool - True if you want an intercept. - nw_lags : None or int - Number of Newey-West lags. - window_type : {'full_sample', 'rolling', 'expanding'} - 'full_sample' by default - window : int - size of window (for rolling/expanding OLS) - """ - - ATTRIBUTES = [ - 'beta', - 'df', - 'df_model', - 'df_resid', - 'f_stat', - 'p_value', - 'r2', - 'r2_adj', - 'resid', - 'rmse', - 'std_err', - 'summary_as_matrix', - 't_stat', - 'var_beta', - 'x', - 'y', - 'y_fitted', - 'y_predict' - ] - - def __init__(self, y, x, window_type='full_sample', window=None, - min_periods=None, intercept=True, nw_lags=None, - nw_overlap=False): - - import warnings - warnings.warn("The pandas.stats.plm module is deprecated and will be " - "removed in a future version. 
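# MovingPanelOLS above re-estimates betas over rolling/expanding windows by
# hand. statsmodels later grew a direct replacement for the plain
# time-series case; a minimal sketch (RollingOLS postdates this patch, so
# treat the import as an assumption about the installed version):
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

y = np.random.randn(100)
X = sm.add_constant(np.random.randn(100, 2))
res = RollingOLS(y, X, window=25).fit()
res.params  # one set of coefficients per window endpoint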
We refer to external packages " - "like statsmodels, see some examples here: " - "http://www.statsmodels.org/stable/mixed_linear.html", - FutureWarning, stacklevel=4) - - for attr in self.ATTRIBUTES: - setattr(self.__class__, attr, create_ols_attr(attr)) - - results = {} - - for entity in y: - entity_y = y[entity] - - entity_x = {} - for x_var in x: - entity_x[x_var] = x[x_var][entity] - - from pandas.stats.interface import ols - results[entity] = ols(y=entity_y, - x=entity_x, - window_type=window_type, - window=window, - min_periods=min_periods, - intercept=intercept, - nw_lags=nw_lags, - nw_overlap=nw_overlap) - - self.results = results - - -def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, - nw_lags, nobs, df, nw_overlap): - xx_inv = math.inv(xx) - - yv = y.values - - if cluster_axis is None: - if nw_lags is None: - return xx_inv * (rmse ** 2) - else: - resid = yv - np.dot(x.values, beta) - m = (x.values.T * resid).T - - xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap) - - return np.dot(xx_inv, np.dot(xeps, xx_inv)) - else: - Xb = np.dot(x.values, beta).reshape((len(x.values), 1)) - resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid']) - - if cluster_axis == 1: - x = x.swaplevel(0, 1).sort_index(level=0) - resid = resid.swaplevel(0, 1).sort_index(level=0) - - m = _group_agg(x.values * resid.values, x.index._bounds, - lambda x: np.sum(x, axis=0)) - - if nw_lags is None: - nw_lags = 0 - - xox = 0 - for i in range(len(x.index.levels[0])): - xox += math.newey_west(m[i: i + 1], nw_lags, - nobs, df, nw_overlap) - - return np.dot(xx_inv, np.dot(xox, xx_inv)) - - -def _group_agg(values, bounds, f): - """ - R-style aggregator - - Parameters - ---------- - values : N-length or N x K ndarray - bounds : B-length ndarray - f : ndarray aggregation function - - Returns - ------- - ndarray with same length as bounds array - """ - if values.ndim == 1: - N = len(values) - result = np.empty(len(bounds), dtype=float) - elif values.ndim == 2: - N, K = values.shape - result = np.empty((len(bounds), K), dtype=float) - - testagg = f(values[:min(1, len(values))]) - if isinstance(testagg, np.ndarray) and testagg.ndim == 2: - raise AssertionError('Function must reduce') - - for i, left_bound in enumerate(bounds): - if i == len(bounds) - 1: - right_bound = N - else: - right_bound = bounds[i + 1] - - result[i] = f(values[left_bound:right_bound]) - - return result - - -def _xx_time_effects(x, y): - """ - Returns X'X - (X'T) (T'T)^-1 (T'X) - """ - # X'X - xx = np.dot(x.values.T, x.values) - xt = x.sum(level=0).values - - count = y.unstack().count(1).values - selector = count > 0 - - # X'X - (T'T)^-1 (T'X) - xt = xt[selector] - count = count[selector] - - return xx - np.dot(xt.T / count, xt) diff --git a/pandas/stats/tests/__init__.py b/pandas/stats/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/stats/tests/common.py b/pandas/stats/tests/common.py deleted file mode 100644 index 0ce4b20a4b719..0000000000000 --- a/pandas/stats/tests/common.py +++ /dev/null @@ -1,162 +0,0 @@ -# pylint: disable-msg=W0611,W0402 -# flake8: noqa - -from datetime import datetime -import string -import nose - -import numpy as np - -from pandas import DataFrame, bdate_range -from pandas.util.testing import assert_almost_equal # imported in other tests -import pandas.util.testing as tm - -N = 100 -K = 4 - -start = datetime(2007, 1, 1) -DATE_RANGE = bdate_range(start, periods=N) - -COLS = ['Col' + c for c in string.ascii_uppercase[:K]] - - -def makeDataFrame(): - 
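# _group_agg above aggregates contiguous row blocks given their left bounds,
# and _xx_time_effects computes the partialled cross-product
# X'X - (X'T)(T'T)^-1(T'X). The block aggregation has a direct numpy
# spelling; a sketch that reproduces the deleted test's expectation
# (block means of 2.5 and 4.5):
import numpy as np

values = np.ones((10, 2)) * np.arange(10).reshape((10, 1))
bounds = np.arange(5) * 2                      # blocks [0,2), [2,4), ...
sums = np.add.reduceat(values, bounds, axis=0)
lengths = np.diff(np.append(bounds, len(values)))
means = sums / lengths[:, None]                # means[1][0] == 2.5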
data = DataFrame(np.random.randn(N, K), - columns=COLS, - index=DATE_RANGE) - - return data - - -def getBasicDatasets(): - A = makeDataFrame() - B = makeDataFrame() - C = makeDataFrame() - - return A, B, C - - -def check_for_scipy(): - try: - import scipy - except ImportError: - raise nose.SkipTest('no scipy') - - -def check_for_statsmodels(): - _have_statsmodels = True - try: - import statsmodels.api as sm - except ImportError: - try: - import scikits.statsmodels.api as sm - except ImportError: - raise nose.SkipTest('no statsmodels') - - -class BaseTest(tm.TestCase): - - def setUp(self): - check_for_scipy() - check_for_statsmodels() - - self.A, self.B, self.C = getBasicDatasets() - - self.createData1() - self.createData2() - self.createData3() - - def createData1(self): - date = datetime(2007, 1, 1) - date2 = datetime(2007, 1, 15) - date3 = datetime(2007, 1, 22) - - A = self.A.copy() - B = self.B.copy() - C = self.C.copy() - - A['ColA'][date] = np.NaN - B['ColA'][date] = np.NaN - C['ColA'][date] = np.NaN - C['ColA'][date2] = np.NaN - - # truncate data to save time - A = A[:30] - B = B[:30] - C = C[:30] - - self.panel_y = A - self.panel_x = {'B': B, 'C': C} - - self.series_panel_y = A.filter(['ColA']) - self.series_panel_x = {'B': B.filter(['ColA']), - 'C': C.filter(['ColA'])} - self.series_y = A['ColA'] - self.series_x = {'B': B['ColA'], - 'C': C['ColA']} - - def createData2(self): - y_data = [[1, np.NaN], - [2, 3], - [4, 5]] - y_index = [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)] - y_cols = ['A', 'B'] - self.panel_y2 = DataFrame(np.array(y_data), index=y_index, - columns=y_cols) - - x1_data = [[6, np.NaN], - [7, 8], - [9, 30], - [11, 12]] - x1_index = [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3), - datetime(2000, 1, 4)] - x1_cols = ['A', 'B'] - x1 = DataFrame(np.array(x1_data), index=x1_index, - columns=x1_cols) - - x2_data = [[13, 14, np.NaN], - [15, np.NaN, np.NaN], - [16, 17, 48], - [19, 20, 21], - [22, 23, 24]] - x2_index = [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5)] - x2_cols = ['C', 'A', 'B'] - x2 = DataFrame(np.array(x2_data), index=x2_index, - columns=x2_cols) - - self.panel_x2 = {'x1': x1, 'x2': x2} - - def createData3(self): - y_data = [[1, 2], - [3, 4]] - y_index = [datetime(2000, 1, 1), - datetime(2000, 1, 2)] - y_cols = ['A', 'B'] - self.panel_y3 = DataFrame(np.array(y_data), index=y_index, - columns=y_cols) - - x1_data = [['A', 'B'], - ['C', 'A']] - x1_index = [datetime(2000, 1, 1), - datetime(2000, 1, 2)] - x1_cols = ['A', 'B'] - x1 = DataFrame(np.array(x1_data), index=x1_index, - columns=x1_cols) - - x2_data = [['foo', 'bar'], - ['baz', 'foo']] - x2_index = [datetime(2000, 1, 1), - datetime(2000, 1, 2)] - x2_cols = ['A', 'B'] - x2 = DataFrame(np.array(x2_data), index=x2_index, - columns=x2_cols) - - self.panel_x3 = {'x1': x1, 'x2': x2} diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py deleted file mode 100644 index 0c9fcf775ad2d..0000000000000 --- a/pandas/stats/tests/test_fama_macbeth.py +++ /dev/null @@ -1,68 +0,0 @@ -# flake8: noqa - -from pandas import DataFrame, Panel -from pandas.stats.api import fama_macbeth -from .common import assert_almost_equal, BaseTest - -from pandas.compat import range -from pandas import compat -import pandas.util.testing as tm -import numpy as np - - -class TestFamaMacBeth(BaseTest): - - def testFamaMacBethRolling(self): - # self.checkFamaMacBethExtended('rolling', 
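# fama_macbeth, under test in the file that follows, runs one
# cross-sectional regression per date and then averages the betas. A
# bare-bones sketch of that procedure which ignores missing data, assuming
# x maps name -> (date x entity) DataFrame and y is a (date x entity)
# DataFrame (names are illustrative):
import numpy as np

def fama_macbeth_betas(y, x):
    betas = []
    for date in y.index:
        exog = np.column_stack([v.loc[date].values for v in x.values()])
        exog = np.column_stack([exog, np.ones(len(exog))])  # intercept
        beta, *_ = np.linalg.lstsq(exog, y.loc[date].values, rcond=None)
        betas.append(beta)
    betas = np.array(betas)
    return betas.mean(axis=0), betas.std(axis=0)  # mean_beta, std_beta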
self.panel_x, self.panel_y, - # nw_lags_beta=2) - - # df = DataFrame(np.random.randn(50, 10)) - x = dict((k, DataFrame(np.random.randn(50, 10))) for k in 'abcdefg') - x = Panel.from_dict(x) - y = (DataFrame(np.random.randn(50, 10)) + - DataFrame(0.01 * np.random.randn(50, 10))) - self.checkFamaMacBethExtended('rolling', x, y, nw_lags_beta=2) - self.checkFamaMacBethExtended('expanding', x, y, nw_lags_beta=2) - - def checkFamaMacBethExtended(self, window_type, x, y, **kwds): - window = 25 - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = fama_macbeth(y=y, x=x, window_type=window_type, window=window, - **kwds) - self._check_stuff_works(result) - - index = result._index - time = len(index) - - for i in range(time - window + 1): - if window_type == 'rolling': - start = index[i] - else: - start = index[0] - - end = index[i + window - 1] - - x2 = {} - for k, v in x.iteritems(): - x2[k] = v.truncate(start, end) - y2 = y.truncate(start, end) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - reference = fama_macbeth(y=y2, x=x2, **kwds) - # reference._stats is tuple - assert_almost_equal(reference._stats, result._stats[:, i], - check_dtype=False) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - static = fama_macbeth(y=y2, x=x2, **kwds) - self._check_stuff_works(static) - - def _check_stuff_works(self, result): - # does it work? - attrs = ['mean_beta', 'std_beta', 't_stat'] - for attr in attrs: - getattr(result, attr) - - # does it work? - result.summary diff --git a/pandas/stats/tests/test_math.py b/pandas/stats/tests/test_math.py deleted file mode 100644 index 3f89dbcd20065..0000000000000 --- a/pandas/stats/tests/test_math.py +++ /dev/null @@ -1,59 +0,0 @@ -import nose - -from datetime import datetime -from numpy.random import randn -import numpy as np - -from pandas.core.api import Series, DataFrame, date_range -import pandas.util.testing as tm -import pandas.stats.math as pmath -from pandas import ols - -N, K = 100, 10 - -_have_statsmodels = True -try: - import statsmodels.api as sm -except ImportError: - try: - import scikits.statsmodels.api as sm # noqa - except ImportError: - _have_statsmodels = False - - -class TestMath(tm.TestCase): - - _nan_locs = np.arange(20, 40) - _inf_locs = np.array([]) - - def setUp(self): - arr = randn(N) - arr[self._nan_locs] = np.NaN - - self.arr = arr - self.rng = date_range(datetime(2009, 1, 1), periods=N) - - self.series = Series(arr.copy(), index=self.rng) - - self.frame = DataFrame(randn(N, K), index=self.rng, - columns=np.arange(K)) - - def test_rank_1d(self): - self.assertEqual(1, pmath.rank(self.series)) - self.assertEqual(0, pmath.rank(Series(0, self.series.index))) - - def test_solve_rect(self): - if not _have_statsmodels: - raise nose.SkipTest("no statsmodels") - - b = Series(np.random.randn(N), self.frame.index) - result = pmath.solve(self.frame, b) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = ols(y=b, x=self.frame, intercept=False).beta - self.assertTrue(np.allclose(result, expected)) - - def test_inv_illformed(self): - singular = DataFrame(np.array([[1, 1], [2, 2]])) - rs = pmath.inv(singular) - expected = np.array([[0.1, 0.2], [0.1, 0.2]]) - self.assertTrue(np.allclose(rs, expected)) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py deleted file mode 100644 index b90c51366c86f..0000000000000 --- a/pandas/stats/tests/test_ols.py +++ /dev/null @@ -1,968 +0,0 @@ -""" -Unit test suite for OLS 
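# test_inv_illformed above documents that pandas.stats.math.inv acted as a
# pseudo-inverse on singular input. numpy reproduces the expected value:
import numpy as np

singular = np.array([[1.0, 1.0], [2.0, 2.0]])
np.linalg.pinv(singular)  # [[0.1, 0.2], [0.1, 0.2]], as the test asserts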
and PanelOLS classes -""" - -# pylint: disable-msg=W0212 - -# flake8: noqa - -from __future__ import division - -from datetime import datetime -from pandas import compat -from distutils.version import LooseVersion -import nose -import numpy as np - -from pandas import date_range, bdate_range -from pandas.core.panel import Panel -from pandas import DataFrame, Index, Series, notnull, offsets -from pandas.stats.api import ols -from pandas.stats.ols import _filter_data -from pandas.stats.plm import NonPooledPanelOLS, PanelOLS -from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assertRaisesRegexp, slow) -import pandas.util.testing as tm -import pandas.compat as compat -from pandas.stats.tests.common import BaseTest - -_have_statsmodels = True -try: - import statsmodels.api as sm -except ImportError: - try: - import scikits.statsmodels.api as sm - except ImportError: - _have_statsmodels = False - - -def _check_repr(obj): - repr(obj) - str(obj) - - -def _compare_ols_results(model1, model2): - tm.assertIsInstance(model1, type(model2)) - - if hasattr(model1, '_window_type'): - _compare_moving_ols(model1, model2) - else: - _compare_fullsample_ols(model1, model2) - - -def _compare_fullsample_ols(model1, model2): - assert_series_equal(model1.beta, model2.beta) - - -def _compare_moving_ols(model1, model2): - assert_frame_equal(model1.beta, model2.beta) - - -class TestOLS(BaseTest): - - # TODO: Add tests for OLS y predict - # TODO: Right now we just check for consistency between full-sample and - # rolling/expanding results of the panel OLS. We should also cross-check - # with trusted implementations of panel OLS (e.g. R). - # TODO: Add tests for non pooled OLS. - - @classmethod - def setUpClass(cls): - super(TestOLS, cls).setUpClass() - try: - import matplotlib as mpl - mpl.use('Agg', warn=False) - except ImportError: - pass - - if not _have_statsmodels: - raise nose.SkipTest("no statsmodels") - - def testOLSWithDatasets_ccard(self): - self.checkDataSet(sm.datasets.ccard.load(), skip_moving=True) - self.checkDataSet(sm.datasets.cpunish.load(), skip_moving=True) - self.checkDataSet(sm.datasets.longley.load(), skip_moving=True) - self.checkDataSet(sm.datasets.stackloss.load(), skip_moving=True) - - @slow - def testOLSWithDatasets_copper(self): - self.checkDataSet(sm.datasets.copper.load()) - - @slow - def testOLSWithDatasets_scotland(self): - self.checkDataSet(sm.datasets.scotland.load()) - - # degenerate case fails on some platforms - # self.checkDataSet(datasets.ccard.load(), 39, 49) # one col in X all - # 0s - - def testWLS(self): - # WLS centered SS changed (fixed) in 0.5.0 - sm_version = sm.version.version - if sm_version < LooseVersion('0.5.0'): - raise nose.SkipTest("WLS centered SS not fixed in statsmodels" - " version {0}".format(sm_version)) - - X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) - Y = Series(np.random.randn(30)) - weights = X.std(1) - - self._check_wls(X, Y, weights) - - weights.loc[[5, 15]] = np.nan - Y[[2, 21]] = np.nan - self._check_wls(X, Y, weights) - - def _check_wls(self, x, y, weights): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=y, x=x, weights=1 / weights) - - combined = x.copy() - combined['__y__'] = y - combined['__weights__'] = weights - combined = combined.dropna() - - endog = combined.pop('__y__').values - aweights = combined.pop('__weights__').values - exog = sm.add_constant(combined.values, prepend=False) - - sm_result = sm.WLS(endog, exog, weights=1 / 
aweights).fit() - - assert_almost_equal(sm_result.params, result._beta_raw) - assert_almost_equal(sm_result.resid, result._resid_raw) - - self.checkMovingOLS('rolling', x, y, weights=weights) - self.checkMovingOLS('expanding', x, y, weights=weights) - - def checkDataSet(self, dataset, start=None, end=None, skip_moving=False): - exog = dataset.exog[start: end] - endog = dataset.endog[start: end] - x = DataFrame(exog, index=np.arange(exog.shape[0]), - columns=np.arange(exog.shape[1])) - y = Series(endog, index=np.arange(len(endog))) - - self.checkOLS(exog, endog, x, y) - - if not skip_moving: - self.checkMovingOLS('rolling', x, y) - self.checkMovingOLS('rolling', x, y, nw_lags=0) - self.checkMovingOLS('expanding', x, y, nw_lags=0) - self.checkMovingOLS('rolling', x, y, nw_lags=1) - self.checkMovingOLS('expanding', x, y, nw_lags=1) - self.checkMovingOLS('expanding', x, y, nw_lags=1, nw_overlap=True) - - def checkOLS(self, exog, endog, x, y): - reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=y, x=x) - - # check that sparse version is the same - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - sparse_result = ols(y=y.to_sparse(), x=x.to_sparse()) - _compare_ols_results(result, sparse_result) - - assert_almost_equal(reference.params, result._beta_raw) - assert_almost_equal(reference.df_model, result._df_model_raw) - assert_almost_equal(reference.df_resid, result._df_resid_raw) - assert_almost_equal(reference.fvalue, result._f_stat_raw[0]) - assert_almost_equal(reference.pvalues, result._p_value_raw) - assert_almost_equal(reference.rsquared, result._r2_raw) - assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw) - assert_almost_equal(reference.resid, result._resid_raw) - assert_almost_equal(reference.bse, result._std_err_raw) - assert_almost_equal(reference.tvalues, result._t_stat_raw) - assert_almost_equal(reference.cov_params(), result._var_beta_raw) - assert_almost_equal(reference.fittedvalues, result._y_fitted_raw) - - _check_non_raw_results(result) - - def checkMovingOLS(self, window_type, x, y, weights=None, **kwds): - window = np.linalg.matrix_rank(x.values) * 2 - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - moving = ols(y=y, x=x, weights=weights, window_type=window_type, - window=window, **kwds) - - # check that sparse version is the same - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - sparse_moving = ols(y=y.to_sparse(), x=x.to_sparse(), - weights=weights, - window_type=window_type, - window=window, **kwds) - _compare_ols_results(moving, sparse_moving) - - index = moving._index - - for n, i in enumerate(moving._valid_indices): - if window_type == 'rolling' and i >= window: - prior_date = index[i - window + 1] - else: - prior_date = index[0] - - date = index[i] - - x_iter = {} - for k, v in compat.iteritems(x): - x_iter[k] = v.truncate(before=prior_date, after=date) - y_iter = y.truncate(before=prior_date, after=date) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - static = ols(y=y_iter, x=x_iter, weights=weights, **kwds) - - self.compare(static, moving, event_index=i, - result_index=n) - - _check_non_raw_results(moving) - - FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat', 'p_value', - 'r2', 'r2_adj', 'rmse', 'std_err', 't_stat', - 'var_beta'] - - def compare(self, static, moving, event_index=None, - result_index=None): - - index = moving._index - 
- # Check resid if we have a time index specified - if event_index is not None: - ref = static._resid_raw[-1] - - label = index[event_index] - - res = moving.resid[label] - - assert_almost_equal(ref, res) - - ref = static._y_fitted_raw[-1] - res = moving.y_fitted[label] - - assert_almost_equal(ref, res) - - # Check y_fitted - - for field in self.FIELDS: - attr = '_%s_raw' % field - - ref = getattr(static, attr) - res = getattr(moving, attr) - - if result_index is not None: - res = res[result_index] - - assert_almost_equal(ref, res) - - def test_ols_object_dtype(self): - df = DataFrame(np.random.randn(20, 2), dtype=object) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=df[0], x=df[1]) - summary = repr(model) - - -class TestOLSMisc(tm.TestCase): - - """ - For test coverage with faux data - """ - @classmethod - def setUpClass(cls): - super(TestOLSMisc, cls).setUpClass() - if not _have_statsmodels: - raise nose.SkipTest("no statsmodels") - - def test_f_test(self): - x = tm.makeTimeDataFrame() - y = x.pop('A') - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x) - - hyp = '1*B+1*C+1*D=0' - result = model.f_test(hyp) - - hyp = ['1*B=0', - '1*C=0', - '1*D=0'] - result = model.f_test(hyp) - assert_almost_equal(result['f-stat'], model.f_stat['f-stat']) - - self.assertRaises(Exception, model.f_test, '1*A=0') - - def test_r2_no_intercept(self): - y = tm.makeTimeSeries() - x = tm.makeTimeDataFrame() - - x_with = x.copy() - x_with['intercept'] = 1. - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model1 = ols(y=y, x=x) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model2 = ols(y=y, x=x_with, intercept=False) - assert_series_equal(model1.beta, model2.beta) - - # TODO: can we infer whether the intercept is there... - self.assertNotEqual(model1.r2, model2.r2) - - # rolling - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model1 = ols(y=y, x=x, window=20) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model2 = ols(y=y, x=x_with, window=20, intercept=False) - assert_frame_equal(model1.beta, model2.beta) - self.assertTrue((model1.r2 != model2.r2).all()) - - def test_summary_many_terms(self): - x = DataFrame(np.random.randn(100, 20)) - y = np.random.randn(100) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x) - model.summary - - def test_y_predict(self): - y = tm.makeTimeSeries() - x = tm.makeTimeDataFrame() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model1 = ols(y=y, x=x) - assert_series_equal(model1.y_predict, model1.y_fitted) - assert_almost_equal(model1._y_predict_raw, model1._y_fitted_raw) - - def test_predict(self): - y = tm.makeTimeSeries() - x = tm.makeTimeDataFrame() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model1 = ols(y=y, x=x) - assert_series_equal(model1.predict(), model1.y_predict) - assert_series_equal(model1.predict(x=x), model1.y_predict) - assert_series_equal(model1.predict(beta=model1.beta), model1.y_predict) - - exog = x.copy() - exog['intercept'] = 1. - rs = Series(np.dot(exog.values, model1.beta.values), x.index) - assert_series_equal(model1.y_predict, rs) - - x2 = x.reindex(columns=x.columns[::-1]) - assert_series_equal(model1.predict(x=x2), model1.y_predict) - - x3 = x2 + 10 - pred3 = model1.predict(x=x3) - x3['intercept'] = 1. 
- x3 = x3.reindex(columns=model1.beta.index) - expected = Series(np.dot(x3.values, model1.beta.values), x3.index) - assert_series_equal(expected, pred3) - - beta = Series(0., model1.beta.index) - pred4 = model1.predict(beta=beta) - assert_series_equal(Series(0., pred4.index), pred4) - - def test_predict_longer_exog(self): - exogenous = {"1998": "4760", "1999": "5904", "2000": "4504", - "2001": "9808", "2002": "4241", "2003": "4086", - "2004": "4687", "2005": "7686", "2006": "3740", - "2007": "3075", "2008": "3753", "2009": "4679", - "2010": "5468", "2011": "7154", "2012": "4292", - "2013": "4283", "2014": "4595", "2015": "9194", - "2016": "4221", "2017": "4520"} - endogenous = {"1998": "691", "1999": "1580", "2000": "80", - "2001": "1450", "2002": "555", "2003": "956", - "2004": "877", "2005": "614", "2006": "468", - "2007": "191"} - - endog = Series(endogenous) - exog = Series(exogenous) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=endog, x=exog) - - pred = model.y_predict - self.assert_index_equal(pred.index, exog.index) - - def test_longpanel_series_combo(self): - wp = tm.makePanel() - lp = wp.to_frame() - - y = lp.pop('ItemA') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=lp, entity_effects=True, window=20) - self.assertTrue(notnull(model.beta.values).all()) - tm.assertIsInstance(model, PanelOLS) - model.summary - - def test_series_rhs(self): - y = tm.makeTimeSeries() - x = tm.makeTimeSeries() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = ols(y=y, x={'x': x}) - assert_series_equal(model.beta, expected.beta) - - # GH 5233/5250 - assert_series_equal(model.y_predict, model.predict(x=x)) - - def test_various_attributes(self): - # just make sure everything "works". 
test correctness elsewhere - - x = DataFrame(np.random.randn(100, 5)) - y = np.random.randn(100) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x, window=20) - - series_attrs = ['rank', 'df', 'forecast_mean', 'forecast_vol'] - - for attr in series_attrs: - value = getattr(model, attr) - tm.assertIsInstance(value, Series) - - # works - model._results - - def test_catch_regressor_overlap(self): - df1 = tm.makeTimeDataFrame().loc[:, ['A', 'B']] - df2 = tm.makeTimeDataFrame().loc[:, ['B', 'C', 'D']] - y = tm.makeTimeSeries() - - data = {'foo': df1, 'bar': df2} - - def f(): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - ols(y=y, x=data) - self.assertRaises(Exception, f) - - def test_plm_ctor(self): - y = tm.makeTimeDataFrame() - x = {'a': tm.makeTimeDataFrame(), - 'b': tm.makeTimeDataFrame()} - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x, intercept=False) - model.summary - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=Panel(x)) - model.summary - - def test_plm_attrs(self): - y = tm.makeTimeDataFrame() - x = {'a': tm.makeTimeDataFrame(), - 'b': tm.makeTimeDataFrame()} - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rmodel = ols(y=y, x=x, window=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x) - model.resid - rmodel.resid - - def test_plm_lagged_y_predict(self): - y = tm.makeTimeDataFrame() - x = {'a': tm.makeTimeDataFrame(), - 'b': tm.makeTimeDataFrame()} - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x, window=10) - result = model.lagged_y_predict(2) - - def test_plm_f_test(self): - y = tm.makeTimeDataFrame() - x = {'a': tm.makeTimeDataFrame(), - 'b': tm.makeTimeDataFrame()} - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=y, x=x) - - hyp = '1*a+1*b=0' - result = model.f_test(hyp) - - hyp = ['1*a=0', - '1*b=0'] - result = model.f_test(hyp) - assert_almost_equal(result['f-stat'], model.f_stat['f-stat']) - - def test_plm_exclude_dummy_corner(self): - y = tm.makeTimeDataFrame() - x = {'a': tm.makeTimeDataFrame(), - 'b': tm.makeTimeDataFrame()} - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols( - y=y, x=x, entity_effects=True, dropped_dummies={'entity': 'D'}) - model.summary - - def f(): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - ols(y=y, x=x, entity_effects=True, - dropped_dummies={'entity': 'E'}) - self.assertRaises(Exception, f) - - def test_columns_tuples_summary(self): - # #1837 - X = DataFrame(np.random.randn(10, 2), columns=[('a', 'b'), ('c', 'd')]) - Y = Series(np.random.randn(10)) - - # it works! 
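# The f_test hypotheses used above (strings like '1*a+1*b=0', or one string
# per restriction) translate directly to statsmodels' linear-constraint
# syntax. A sketch on a formula model with illustrative column names:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD'))
res = smf.ols('A ~ B + C + D', data=df).fit()
res.f_test('B + C + D = 0')        # one joint restriction
res.f_test('B = 0, C = 0, D = 0')  # three restrictions at once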
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - model = ols(y=Y, x=X) - model.summary - - -class TestPanelOLS(BaseTest): - - FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat', - 'p_value', 'r2', 'r2_adj', 'rmse', 'std_err', - 't_stat', 'var_beta'] - - _other_fields = ['resid', 'y_fitted'] - - def testFiltering(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y2, x=self.panel_x2) - - x = result._x - index = x.index.get_level_values(0) - index = Index(sorted(set(index))) - exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3)]) - self.assert_index_equal(exp_index, index) - - index = x.index.get_level_values(1) - index = Index(sorted(set(index))) - exp_index = Index(['A', 'B']) - self.assert_index_equal(exp_index, index) - - x = result._x_filtered - index = x.index.get_level_values(0) - index = Index(sorted(set(index))) - exp_index = Index([datetime(2000, 1, 1), - datetime(2000, 1, 3), - datetime(2000, 1, 4)]) - self.assert_index_equal(exp_index, index) - - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 4, 5], - check_dtype=False) - - exp_x = np.array([[6, 14, 1], [9, 17, 1], - [30, 48, 1]], dtype=np.float64) - assert_almost_equal(exp_x, result._x.values) - - exp_x_filtered = np.array([[6, 14, 1], [9, 17, 1], [30, 48, 1], - [11, 20, 1], [12, 21, 1]], dtype=np.float64) - assert_almost_equal(exp_x_filtered, result._x_filtered.values) - - self.assert_index_equal(result._x_filtered.index.levels[0], - result.y_fitted.index) - - def test_wls_panel(self): - y = tm.makeTimeDataFrame() - x = Panel({'x1': tm.makeTimeDataFrame(), - 'x2': tm.makeTimeDataFrame()}) - - y.iloc[[1, 7], y.columns.get_loc('A')] = np.nan - y.iloc[[6, 15], y.columns.get_loc('B')] = np.nan - y.iloc[[3, 20], y.columns.get_loc('C')] = np.nan - y.iloc[[5, 11], y.columns.get_loc('D')] = np.nan - - stack_y = y.stack() - stack_x = DataFrame(dict((k, v.stack()) - for k, v in x.iteritems())) - - weights = x.std('items') - stack_weights = weights.stack() - - stack_y.index = stack_y.index._tuple_index - stack_x.index = stack_x.index._tuple_index - stack_weights.index = stack_weights.index._tuple_index - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=y, x=x, weights=1 / weights) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights) - - assert_almost_equal(result.beta, expected.beta) - - for attr in ['resid', 'y_fitted']: - rvals = getattr(result, attr).stack().values - evals = getattr(expected, attr).values - assert_almost_equal(rvals, evals) - - def testWithTimeEffects(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True) - - # .flat is flatiter instance - assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5], - check_dtype=False) - - exp_x = np.array([[0, 0], [-10.5, -15.5], [10.5, 15.5]]) - assert_almost_equal(result._x_trans.values, exp_x) - - # _check_non_raw_results(result) - - def testWithEntityEffects(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True) - - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 4, 5], - check_dtype=False) - - exp_x = DataFrame([[0., 6., 14., 1.], [0, 9, 17, 1], [1, 30, 48, 1]], - index=result._x.index, columns=['FE_B', 
'x1', 'x2', - 'intercept'], - dtype=float) - tm.assert_frame_equal(result._x, exp_x.loc[:, result._x.columns]) - # _check_non_raw_results(result) - - def testWithEntityEffectsAndDroppedDummies(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True, - dropped_dummies={'entity': 'B'}) - - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 4, 5], - check_dtype=False) - exp_x = DataFrame([[1., 6., 14., 1.], [1, 9, 17, 1], [0, 30, 48, 1]], - index=result._x.index, columns=['FE_A', 'x1', 'x2', - 'intercept'], - dtype=float) - tm.assert_frame_equal(result._x, exp_x.loc[:, result._x.columns]) - # _check_non_raw_results(result) - - def testWithXEffects(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1']) - - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 4, 5], - check_dtype=False) - - res = result._x - exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]], - columns=['x1_30', 'x1_9', 'x2', 'intercept'], - index=res.index, dtype=float) - exp_x[['x1_30', 'x1_9']] = exp_x[['x1_30', 'x1_9']].astype(np.uint8) - assert_frame_equal(res, exp_x.reindex(columns=res.columns)) - - def testWithXEffectsAndDroppedDummies(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'], - dropped_dummies={'x1': 30}) - - res = result._x - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 4, 5], - check_dtype=False) - exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]], - columns=['x1_6', 'x1_9', 'x2', 'intercept'], - index=res.index, dtype=float) - exp_x[['x1_6', 'x1_9']] = exp_x[['x1_6', 'x1_9']].astype(np.uint8) - - assert_frame_equal(res, exp_x.reindex(columns=res.columns)) - - def testWithXEffectsAndConversion(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y3, x=self.panel_x3, - x_effects=['x1', 'x2']) - - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 2, 3, 4], - check_dtype=False) - exp_x = np.array([[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1], - [0, 0, 0, 1, 1]], dtype=np.float64) - assert_almost_equal(result._x.values, exp_x) - - exp_index = Index(['x1_B', 'x1_C', 'x2_baz', 'x2_foo', 'intercept']) - self.assert_index_equal(exp_index, result._x.columns) - - # _check_non_raw_results(result) - - def testWithXEffectsAndConversionAndDroppedDummies(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2'], - dropped_dummies={'x2': 'foo'}) - # .flat is flatiter instance - assert_almost_equal(result._y.values.flat, [1, 2, 3, 4], - check_dtype=False) - exp_x = np.array([[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1], - [0, 0, 0, 0, 1]], dtype=np.float64) - assert_almost_equal(result._x.values, exp_x) - - exp_index = Index(['x1_B', 'x1_C', 'x2_bar', 'x2_baz', 'intercept']) - self.assert_index_equal(exp_index, result._x.columns) - - # _check_non_raw_results(result) - - def testForSeries(self): - self.checkForSeries(self.series_panel_x, self.series_panel_y, - self.series_x, self.series_y) - - self.checkForSeries(self.series_panel_x, self.series_panel_y, - self.series_x, self.series_y, nw_lags=0) - - self.checkForSeries(self.series_panel_x, 
self.series_panel_y, - self.series_x, self.series_y, nw_lags=1, - nw_overlap=True) - - def testRolling(self): - self.checkMovingOLS(self.panel_x, self.panel_y) - - def testRollingWithFixedEffects(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - entity_effects=True) - self.checkMovingOLS(self.panel_x, self.panel_y, intercept=False, - entity_effects=True) - - def testRollingWithTimeEffects(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - time_effects=True) - - def testRollingWithNeweyWest(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - nw_lags=1) - - def testRollingWithEntityCluster(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - cluster='entity') - - def testUnknownClusterRaisesValueError(self): - assertRaisesRegexp(ValueError, "Unrecognized cluster.*ridiculous", - self.checkMovingOLS, self.panel_x, self.panel_y, - cluster='ridiculous') - - def testRollingWithTimeEffectsAndEntityCluster(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - time_effects=True, cluster='entity') - - def testRollingWithTimeCluster(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - cluster='time') - - def testRollingWithNeweyWestAndEntityCluster(self): - self.assertRaises(ValueError, self.checkMovingOLS, - self.panel_x, self.panel_y, - nw_lags=1, cluster='entity') - - def testRollingWithNeweyWestAndTimeEffectsAndEntityCluster(self): - self.assertRaises(ValueError, - self.checkMovingOLS, self.panel_x, self.panel_y, - nw_lags=1, cluster='entity', - time_effects=True) - - def testExpanding(self): - self.checkMovingOLS( - self.panel_x, self.panel_y, window_type='expanding') - - def testNonPooled(self): - self.checkNonPooled(y=self.panel_y, x=self.panel_x) - self.checkNonPooled(y=self.panel_y, x=self.panel_x, - window_type='rolling', window=25, min_periods=10) - - def testUnknownWindowType(self): - assertRaisesRegexp(ValueError, "window.*ridiculous", - self.checkNonPooled, y=self.panel_y, x=self.panel_x, - window_type='ridiculous', window=25, min_periods=10) - - def checkNonPooled(self, x, y, **kwds): - # For now, just check that it doesn't crash - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=y, x=x, pool=False, **kwds) - - _check_repr(result) - for attr in NonPooledPanelOLS.ATTRIBUTES: - _check_repr(getattr(result, attr)) - - def checkMovingOLS(self, x, y, window_type='rolling', **kwds): - window = 25 # must be larger than rank of x - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - moving = ols(y=y, x=x, window_type=window_type, - window=window, **kwds) - - index = moving._index - - for n, i in enumerate(moving._valid_indices): - if window_type == 'rolling' and i >= window: - prior_date = index[i - window + 1] - else: - prior_date = index[0] - - date = index[i] - - x_iter = {} - for k, v in compat.iteritems(x): - x_iter[k] = v.truncate(before=prior_date, after=date) - y_iter = y.truncate(before=prior_date, after=date) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - static = ols(y=y_iter, x=x_iter, **kwds) - - self.compare(static, moving, event_index=i, - result_index=n) - - _check_non_raw_results(moving) - - def checkForSeries(self, x, y, series_x, series_y, **kwds): - # Consistency check with simple OLS. 
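# The cluster='entity' / cluster='time' cases exercised above are one-way
# clustered covariances; statsmodels exposes them through
# cov_type='cluster'. A sketch with hypothetical group labels:
import numpy as np
import statsmodels.api as sm

y = np.random.randn(120)
X = sm.add_constant(np.random.randn(120, 2))
groups = np.repeat(np.arange(12), 10)  # e.g. 12 entities, 10 dates each
res = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': groups})
res.bse  # entity-clustered standard errors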
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ols(y=y, x=x, **kwds) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - reference = ols(y=series_y, x=series_x, **kwds) - - self.compare(reference, result) - - def compare(self, static, moving, event_index=None, - result_index=None): - - # Check resid if we have a time index specified - if event_index is not None: - staticSlice = _period_slice(static, -1) - movingSlice = _period_slice(moving, event_index) - - ref = static._resid_raw[staticSlice] - res = moving._resid_raw[movingSlice] - - assert_almost_equal(ref, res) - - ref = static._y_fitted_raw[staticSlice] - res = moving._y_fitted_raw[movingSlice] - - assert_almost_equal(ref, res) - - # Check y_fitted - - for field in self.FIELDS: - attr = '_%s_raw' % field - - ref = getattr(static, attr) - res = getattr(moving, attr) - - if result_index is not None: - res = res[result_index] - - assert_almost_equal(ref, res) - - def test_auto_rolling_window_type(self): - data = tm.makeTimeDataFrame() - y = data.pop('A') - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - window_model = ols(y=y, x=data, window=20, min_periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rolling_model = ols(y=y, x=data, window=20, min_periods=10, - window_type='rolling') - - assert_frame_equal(window_model.beta, rolling_model.beta) - - def test_group_agg(self): - from pandas.stats.plm import _group_agg - - values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) - bounds = np.arange(5) * 2 - f = lambda x: x.mean(axis=0) - - agged = _group_agg(values, bounds, f) - - assert(agged[1][0] == 2.5) - assert(agged[2][0] == 4.5) - - # test a function that doesn't aggregate - f2 = lambda x: np.zeros((2, 2)) - self.assertRaises(Exception, _group_agg, values, bounds, f2) - - -def _check_non_raw_results(model): - _check_repr(model) - _check_repr(model.resid) - _check_repr(model.summary_as_matrix) - _check_repr(model.y_fitted) - _check_repr(model.y_predict) - - -def _period_slice(panelModel, i): - index = panelModel._x_trans.index - period = index.levels[0][i] - - L, R = index.get_major_bounds(period, period) - - return slice(L, R) - - -class TestOLSFilter(tm.TestCase): - - def setUp(self): - date_index = date_range(datetime(2009, 12, 11), periods=3, - freq=offsets.BDay()) - ts = Series([3, 1, 4], index=date_index) - self.TS1 = ts - - date_index = date_range(datetime(2009, 12, 11), periods=5, - freq=offsets.BDay()) - ts = Series([1, 5, 9, 2, 6], index=date_index) - self.TS2 = ts - - date_index = date_range(datetime(2009, 12, 11), periods=3, - freq=offsets.BDay()) - ts = Series([5, np.nan, 3], index=date_index) - self.TS3 = ts - - date_index = date_range(datetime(2009, 12, 11), periods=5, - freq=offsets.BDay()) - ts = Series([np.nan, 5, 8, 9, 7], index=date_index) - self.TS4 = ts - - data = {'x1': self.TS2, 'x2': self.TS4} - self.DF1 = DataFrame(data=data) - - data = {'x1': self.TS2, 'x2': self.TS4} - self.DICT1 = data - - def testFilterWithSeriesRHS(self): - (lhs, rhs, weights, rhs_pre, - index, valid) = _filter_data(self.TS1, {'x1': self.TS2}, None) - self.tsAssertEqual(self.TS1.astype(np.float64), lhs, check_names=False) - self.tsAssertEqual(self.TS2[:3].astype(np.float64), rhs['x1'], - check_names=False) - self.tsAssertEqual(self.TS2.astype(np.float64), rhs_pre['x1'], - check_names=False) - - def testFilterWithSeriesRHS2(self): - (lhs, rhs, weights, rhs_pre, - index, valid) = _filter_data(self.TS2, {'x1': 
self.TS1}, None) - self.tsAssertEqual(self.TS2[:3].astype(np.float64), lhs, - check_names=False) - self.tsAssertEqual(self.TS1.astype(np.float64), rhs['x1'], - check_names=False) - self.tsAssertEqual(self.TS1.astype(np.float64), rhs_pre['x1'], - check_names=False) - - def testFilterWithSeriesRHS3(self): - (lhs, rhs, weights, rhs_pre, - index, valid) = _filter_data(self.TS3, {'x1': self.TS4}, None) - exp_lhs = self.TS3[2:3] - exp_rhs = self.TS4[2:3] - exp_rhs_pre = self.TS4[1:] - self.tsAssertEqual(exp_lhs, lhs, check_names=False) - self.tsAssertEqual(exp_rhs, rhs['x1'], check_names=False) - self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1'], check_names=False) - - def testFilterWithDataFrameRHS(self): - (lhs, rhs, weights, rhs_pre, - index, valid) = _filter_data(self.TS1, self.DF1, None) - exp_lhs = self.TS1[1:].astype(np.float64) - exp_rhs1 = self.TS2[1:3] - exp_rhs2 = self.TS4[1:3].astype(np.float64) - self.tsAssertEqual(exp_lhs, lhs, check_names=False) - self.tsAssertEqual(exp_rhs1, rhs['x1'], check_names=False) - self.tsAssertEqual(exp_rhs2, rhs['x2'], check_names=False) - - def testFilterWithDictRHS(self): - (lhs, rhs, weights, rhs_pre, - index, valid) = _filter_data(self.TS1, self.DICT1, None) - exp_lhs = self.TS1[1:].astype(np.float64) - exp_rhs1 = self.TS2[1:3].astype(np.float64) - exp_rhs2 = self.TS4[1:3].astype(np.float64) - self.tsAssertEqual(exp_lhs, lhs, check_names=False) - self.tsAssertEqual(exp_rhs1, rhs['x1'], check_names=False) - self.tsAssertEqual(exp_rhs2, rhs['x2'], check_names=False) - - def tsAssertEqual(self, ts1, ts2, **kwargs): - self.assert_series_equal(ts1, ts2, **kwargs) diff --git a/pandas/stats/tests/test_var.py b/pandas/stats/tests/test_var.py deleted file mode 100644 index 04e2019f00a82..0000000000000 --- a/pandas/stats/tests/test_var.py +++ /dev/null @@ -1,94 +0,0 @@ -# flake8: noqa - -from __future__ import print_function - -import pandas.util.testing as tm - -from pandas.compat import range -import nose - -raise nose.SkipTest('skipping this for now') - -try: - import statsmodels.tsa.var as sm_var - import statsmodels as sm -except ImportError: - import scikits.statsmodels.tsa.var as sm_var - import scikits.statsmodels as sm - - -import pandas.stats.var as _pvar -reload(_pvar) -from pandas.stats.var import VAR - -DECIMAL_6 = 6 -DECIMAL_5 = 5 -DECIMAL_4 = 4 -DECIMAL_3 = 3 -DECIMAL_2 = 2 - - -class CheckVAR(object): - - def test_params(self): - tm.assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3) - - def test_neqs(self): - tm.assert_numpy_array_equal(self.res1.neqs, self.res2.neqs) - - def test_nobs(self): - tm.assert_numpy_array_equal(self.res1.avobs, self.res2.nobs) - - def test_df_eq(self): - tm.assert_numpy_array_equal(self.res1.df_eq, self.res2.df_eq) - - def test_rmse(self): - results = self.res1.results - for i in range(len(results)): - tm.assert_almost_equal(results[i].mse_resid ** .5, - eval('self.res2.rmse_' + str(i + 1)), - DECIMAL_6) - - def test_rsquared(self): - results = self.res1.results - for i in range(len(results)): - tm.assert_almost_equal(results[i].rsquared, - eval('self.res2.rsquared_' + str(i + 1)), - DECIMAL_3) - - def test_llf(self): - results = self.res1.results - tm.assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2) - for i in range(len(results)): - tm.assert_almost_equal(results[i].llf, - eval('self.res2.llf_' + str(i + 1)), - DECIMAL_2) - - def test_aic(self): - tm.assert_almost_equal(self.res1.aic, self.res2.aic) - - def test_bic(self): - tm.assert_almost_equal(self.res1.bic, self.res2.bic) - - def 
test_hqic(self): - tm.assert_almost_equal(self.res1.hqic, self.res2.hqic) - - def test_fpe(self): - tm.assert_almost_equal(self.res1.fpe, self.res2.fpe) - - def test_detsig(self): - tm.assert_almost_equal(self.res1.detomega, self.res2.detsig) - - def test_bse(self): - tm.assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) - - -class Foo(object): - - def __init__(self): - data = sm.datasets.macrodata.load() - data = data.data[['realinv', 'realgdp', 'realcons']].view((float, 3)) - data = diff(log(data), axis=0) - self.res1 = VAR2(endog=data).fit(maxlag=2) - from results import results_var - self.res2 = results_var.MacrodataResults() diff --git a/pandas/stats/var.py b/pandas/stats/var.py deleted file mode 100644 index db4028d60f5c8..0000000000000 --- a/pandas/stats/var.py +++ /dev/null @@ -1,605 +0,0 @@ -# flake8: noqa - -from __future__ import division - -from pandas.compat import range, lrange, zip, reduce -from pandas import compat -import numpy as np -from pandas.core.base import StringMixin -from pandas.util.decorators import cache_readonly -from pandas.core.frame import DataFrame -from pandas.core.panel import Panel -from pandas.core.series import Series -import pandas.stats.common as common -from pandas.stats.math import inv -from pandas.stats.ols import _combine_rhs - - -class VAR(StringMixin): - """ - Estimates VAR(p) regression on multivariate time series data - presented in pandas data structures. - - Parameters - ---------- - data : DataFrame or dict of Series - p : lags to include - - """ - - def __init__(self, data, p=1, intercept=True): - import warnings - warnings.warn("The pandas.stats.var module is deprecated and will be " - "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: " - "http://www.statsmodels.org/stable/vector_ar.html#var", - FutureWarning, stacklevel=4) - - try: - import statsmodels.tsa.vector_ar.api as sm_var - except ImportError: - import scikits.statsmodels.tsa.var as sm_var - - self._data = DataFrame(_combine_rhs(data)) - self._p = p - - self._columns = self._data.columns - self._index = self._data.index - - self._intercept = intercept - - @cache_readonly - def aic(self): - """Returns the Akaike information criterion.""" - return self._ic['aic'] - - @cache_readonly - def bic(self): - """Returns the Bayesian information criterion.""" - return self._ic['bic'] - - @cache_readonly - def beta(self): - """ - Returns a DataFrame, where each column x1 contains the betas - calculated by regressing the x1 column of the VAR input with - the lagged input. - - Returns - ------- - DataFrame - """ - d = dict([(key, value.beta) - for (key, value) in compat.iteritems(self.ols_results)]) - return DataFrame(d) - - def forecast(self, h): - """ - Returns a DataFrame containing the forecasts for 1, 2, ..., n time - steps. Each column x1 contains the forecasts of the x1 column. - - Parameters - ---------- - n: int - Number of time steps ahead to forecast. - - Returns - ------- - DataFrame - """ - forecast = self._forecast_raw(h)[:, 0, :] - return DataFrame(forecast, index=lrange(1, 1 + h), - columns=self._columns) - - def forecast_cov(self, h): - """ - Returns the covariance of the forecast residuals. - - Returns - ------- - DataFrame - """ - return [DataFrame(value, index=self._columns, columns=self._columns) - for value in self._forecast_cov_raw(h)] - - def forecast_std_err(self, h): - """ - Returns the standard errors of the forecast residuals. 
- - Returns - ------- - DataFrame - """ - return DataFrame(self._forecast_std_err_raw(h), - index=lrange(1, 1 + h), columns=self._columns) - - @cache_readonly - def granger_causality(self): - """Returns the f-stats and p-values from the Granger Causality Test. - - If the data consists of columns x1, x2, x3, then we perform the - following regressions: - - x1 ~ L(x2, x3) - x1 ~ L(x1, x3) - x1 ~ L(x1, x2) - - The f-stats of these results are placed in the 'x1' column of the - returned DataFrame. We then repeat for x2, x3. - - Returns - ------- - Dict, where 'f-stat' returns the DataFrame containing the f-stats, - and 'p-value' returns the DataFrame containing the corresponding - p-values of the f-stats. - """ - from pandas.stats.api import ols - from scipy.stats import f - - d = {} - for col in self._columns: - d[col] = {} - for i in range(1, 1 + self._p): - lagged_data = self._lagged_data[i].filter( - self._columns - [col]) - - for key, value in compat.iteritems(lagged_data): - d[col][_make_param_name(i, key)] = value - - f_stat_dict = {} - p_value_dict = {} - - for col, y in compat.iteritems(self._data): - ssr_full = (self.resid[col] ** 2).sum() - - f_stats = [] - p_values = [] - - for col2 in self._columns: - result = ols(y=y, x=d[col2]) - - resid = result.resid - ssr_reduced = (resid ** 2).sum() - - M = self._p - N = self._nobs - K = self._k * self._p + 1 - f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K)) - f_stats.append(f_stat) - - p_value = f.sf(f_stat, M, N - K) - p_values.append(p_value) - - f_stat_dict[col] = Series(f_stats, self._columns) - p_value_dict[col] = Series(p_values, self._columns) - - f_stat_mat = DataFrame(f_stat_dict) - p_value_mat = DataFrame(p_value_dict) - - return { - 'f-stat': f_stat_mat, - 'p-value': p_value_mat, - } - - @cache_readonly - def ols_results(self): - """ - Returns the results of the regressions: - x_1 ~ L(X) - x_2 ~ L(X) - ... - x_k ~ L(X) - - where X = [x_1, x_2, ..., x_k] - and L(X) represents the columns of X lagged 1, 2, ..., n lags - (n is the user-provided number of lags). - - Returns - ------- - dict - """ - from pandas.stats.api import ols - - d = {} - for i in range(1, 1 + self._p): - for col, series in compat.iteritems(self._lagged_data[i]): - d[_make_param_name(i, col)] = series - - result = dict([(col, ols(y=y, x=d, intercept=self._intercept)) - for col, y in compat.iteritems(self._data)]) - - return result - - @cache_readonly - def resid(self): - """ - Returns the DataFrame containing the residuals of the VAR regressions. - Each column x1 contains the residuals generated by regressing the x1 - column of the input against the lagged input. - - Returns - ------- - DataFrame - """ - d = dict([(col, series.resid) - for (col, series) in compat.iteritems(self.ols_results)]) - return DataFrame(d, index=self._index) - - @cache_readonly - def summary(self): - template = """ -%(banner_top)s - -Number of Observations: %(nobs)d -AIC: %(aic).3f -BIC: %(bic).3f - -%(banner_coef)s -%(coef_table)s -%(banner_end)s -""" - params = { - 'banner_top': common.banner('Summary of VAR'), - 'banner_coef': common.banner('Summary of Estimated Coefficients'), - 'banner_end': common.banner('End of Summary'), - 'coef_table': self.beta, - 'aic': self.aic, - 'bic': self.bic, - 'nobs': self._nobs, - } - - return template % params - - @cache_readonly - def _alpha(self): - """ - Returns array where the i-th element contains the intercept - when regressing the i-th column of self._data with the lagged data. 
- """ - if self._intercept: - return self._beta_raw[-1] - else: - return np.zeros(self._k) - - @cache_readonly - def _beta_raw(self): - return np.array([list(self.beta[col].values()) for col in self._columns]).T - - def _trans_B(self, h): - """ - Returns 0, 1, ..., (h-1)-th power of transpose of B as defined in - equation (4) on p. 142 of the Stata 11 Time Series reference book. - """ - result = [np.eye(1 + self._k * self._p)] - - row1 = np.zeros((1, 1 + self._k * self._p)) - row1[0, 0] = 1 - - v = self._alpha.reshape((self._k, 1)) - row2 = np.hstack(tuple([v] + self._lag_betas)) - - m = self._k * (self._p - 1) - row3 = np.hstack(( - np.zeros((m, 1)), - np.eye(m), - np.zeros((m, self._k)) - )) - - trans_B = np.vstack((row1, row2, row3)).T - - result.append(trans_B) - - for i in range(2, h): - result.append(np.dot(trans_B, result[i - 1])) - - return result - - @cache_readonly - def _x(self): - values = np.array([ - list(self._lagged_data[i][col].values()) - for i in range(1, 1 + self._p) - for col in self._columns - ]).T - - x = np.hstack((np.ones((len(values), 1)), values))[self._p:] - - return x - - @cache_readonly - def _cov_beta(self): - cov_resid = self._sigma - - x = self._x - - inv_cov_x = inv(np.dot(x.T, x)) - - return np.kron(inv_cov_x, cov_resid) - - def _data_xs(self, i): - """ - Returns the cross-section of the data at the given timestep. - """ - return self._data.values[i] - - def _forecast_cov_raw(self, n): - resid = self._forecast_cov_resid_raw(n) - # beta = self._forecast_cov_beta_raw(n) - - # return [a + b for a, b in zip(resid, beta)] - # TODO: ignore the beta forecast std err until it's verified - - return resid - - def _forecast_cov_beta_raw(self, n): - """ - Returns the covariance of the beta errors for the forecast at - 1, 2, ..., n timesteps. - """ - p = self._p - - values = self._data.values - T = len(values) - self._p - 1 - - results = [] - - for h in range(1, n + 1): - psi = self._psi(h) - trans_B = self._trans_B(h) - - sum = 0 - - cov_beta = self._cov_beta - - for t in range(T + 1): - index = t + p - y = values.take(lrange(index, index - p, -1), axis=0).ravel() - trans_Z = np.hstack(([1], y)) - trans_Z = trans_Z.reshape(1, len(trans_Z)) - - sum2 = 0 - for i in range(h): - ZB = np.dot(trans_Z, trans_B[h - 1 - i]) - - prod = np.kron(ZB, psi[i]) - sum2 = sum2 + prod - - sum = sum + chain_dot(sum2, cov_beta, sum2.T) - - results.append(sum / (T + 1)) - - return results - - def _forecast_cov_resid_raw(self, h): - """ - Returns the covariance of the residual errors for the forecast at - 1, 2, ..., h timesteps. - """ - psi_values = self._psi(h) - sum = 0 - result = [] - for i in range(h): - psi = psi_values[i] - sum = sum + chain_dot(psi, self._sigma, psi.T) - result.append(sum) - - return result - - def _forecast_raw(self, h): - """ - Returns the forecast at 1, 2, ..., h timesteps in the future. - """ - k = self._k - result = [] - for i in range(h): - sum = self._alpha.reshape(1, k) - for j in range(self._p): - beta = self._lag_betas[j] - idx = i - j - if idx > 0: - y = result[idx - 1] - else: - y = self._data_xs(idx - 1) - - sum = sum + np.dot(beta, y.T).T - result.append(sum) - - return np.array(result) - - def _forecast_std_err_raw(self, h): - """ - Returns the standard error of the forecasts - at 1, 2, ..., n timesteps. - """ - return np.array([np.sqrt(np.diag(value)) - for value in self._forecast_cov_raw(h)]) - - @cache_readonly - def _ic(self): - """ - Returns the Akaike/Bayesian information criteria. 
- """ - RSS = self._rss - k = self._p * (self._k * self._p + 1) - n = self._nobs * self._k - - return {'aic': 2 * k + n * np.log(RSS / n), - 'bic': n * np.log(RSS / n) + k * np.log(n)} - - @cache_readonly - def _k(self): - return len(self._columns) - - @cache_readonly - def _lag_betas(self): - """ - Returns list of B_i, where B_i represents the (k, k) matrix - with the j-th row containing the betas of regressing the j-th - column of self._data with self._data lagged i time steps. - First element is B_1, second element is B_2, etc. - """ - k = self._k - b = self._beta_raw - return [b[k * i: k * (i + 1)].T for i in range(self._p)] - - @cache_readonly - def _lagged_data(self): - return dict([(i, self._data.shift(i)) - for i in range(1, 1 + self._p)]) - - @cache_readonly - def _nobs(self): - return len(self._data) - self._p - - def _psi(self, h): - """ - psi value used for calculating standard error. - - Returns [psi_0, psi_1, ..., psi_(h - 1)] - """ - k = self._k - result = [np.eye(k)] - for i in range(1, h): - result.append(sum( - [np.dot(result[i - j], self._lag_betas[j - 1]) - for j in range(1, 1 + i) - if j <= self._p])) - - return result - - @cache_readonly - def _resid_raw(self): - resid = np.array([self.ols_results[col]._resid_raw - for col in self._columns]) - return resid - - @cache_readonly - def _rss(self): - """Returns the sum of the squares of the residuals.""" - return (self._resid_raw ** 2).sum() - - @cache_readonly - def _sigma(self): - """Returns covariance of resids.""" - k = self._k - n = self._nobs - - resid = self._resid_raw - - return np.dot(resid, resid.T) / (n - k) - - def __unicode__(self): - return self.summary - - -def lag_select(data, max_lags=5, ic=None): - """ - Select number of lags based on a variety of information criteria - - Parameters - ---------- - data : DataFrame-like - max_lags : int - Maximum number of lags to evaluate - ic : {None, 'aic', 'bic', ...} - Choosing None will just display the results - - Returns - ------- - None - """ - pass - - -class PanelVAR(VAR): - """ - Performs Vector Autoregression on panel data. - - Parameters - ---------- - data: Panel or dict of DataFrame - lags: int - """ - - def __init__(self, data, lags, intercept=True): - self._data = _prep_panel_data(data) - self._p = lags - self._intercept = intercept - - self._columns = self._data.items - - @cache_readonly - def _nobs(self): - """Returns the number of observations.""" - _, timesteps, entities = self._data.values.shape - return (timesteps - self._p) * entities - - @cache_readonly - def _rss(self): - """Returns the sum of the squares of the residuals.""" - return (self.resid.values ** 2).sum() - - def forecast(self, h): - """ - Returns the forecasts at 1, 2, ..., n timesteps in the future. - """ - forecast = self._forecast_raw(h).T.swapaxes(1, 2) - index = lrange(1, 1 + h) - w = Panel(forecast, items=self._data.items, major_axis=index, - minor_axis=self._data.minor_axis) - return w - - @cache_readonly - def resid(self): - """ - Returns the DataFrame containing the residuals of the VAR regressions. - Each column x1 contains the residuals generated by regressing the x1 - column of the input against the lagged input. 
- - Returns - ------- - DataFrame - """ - d = dict([(key, value.resid) - for (key, value) in compat.iteritems(self.ols_results)]) - return Panel.fromDict(d) - - def _data_xs(self, i): - return self._data.values[:, i, :].T - - @cache_readonly - def _sigma(self): - """Returns covariance of resids.""" - k = self._k - resid = _drop_incomplete_rows(self.resid.toLong().values) - n = len(resid) - return np.dot(resid.T, resid) / (n - k) - - -def _prep_panel_data(data): - """Converts the given data into a Panel.""" - if isinstance(data, Panel): - return data - - return Panel.fromDict(data) - - -def _drop_incomplete_rows(array): - mask = np.isfinite(array).all(1) - indices = np.arange(len(array))[mask] - return array.take(indices, 0) - - -def _make_param_name(lag, name): - return 'L%d.%s' % (lag, name) - - -def chain_dot(*matrices): - """ - Returns the dot product of the given matrices. - - Parameters - ---------- - matrices: argument list of ndarray - """ - return reduce(lambda x, y: np.dot(y, x), matrices[::-1]) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index c7b9f4bdea6b2..c3962ad9c823c 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -69,7 +69,6 @@ def show_versions(as_json=False): ("Cython", lambda mod: mod.__version__), ("numpy", lambda mod: mod.version.version), ("scipy", lambda mod: mod.version.version), - ("statsmodels", lambda mod: mod.__version__), ("xarray", lambda mod: mod.__version__), ("IPython", lambda mod: mod.__version__), ("sphinx", lambda mod: mod.__version__), diff --git a/setup.py b/setup.py index 3c2617da18eae..c3cb56f2d6d1b 100755 --- a/setup.py +++ b/setup.py @@ -660,7 +660,6 @@ def pxd(name): 'pandas.io.tests.json', 'pandas.io.tests.parser', 'pandas.io.tests.sas', - 'pandas.stats.tests', 'pandas.msgpack', 'pandas.util.clipboard' ], From e5a58f0ae9a06456ea004fefa318e06bc514bca5 Mon Sep 17 00:00:00 2001 From: Stephen Rauch Date: Thu, 9 Feb 2017 12:04:19 -0500 Subject: [PATCH 117/338] BUG: Multiline Eval broken for local variables after first line Also fixes the code which attempted to ignore any blank lines in the multiline expression. 
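A minimal reproduction, mirroring the regression test added in this patch (an illustrative sketch; the exact error raised before the fix is not recorded here):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    local_var = 7

    # before this fix only the first line of a multi-line expression
    # could resolve the @local_var reference; now both assignments work
    df.eval("""
    c = a * @local_var
    d = c + @local_var
    """, inplace=True)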
closes #15342 Author: Stephen Rauch Closes #15343 from stephenrauch/multi-line-eval-with-local and squashes the following commits: fe67ede [Stephen Rauch] BUG: GH15342 - Multiline Eval broken for local variables after first line --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/computation/eval.py | 5 ++--- pandas/computation/tests/test_eval.py | 19 +++++++++++++++---- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3fb6f7b0b9a91..e765cdef4d219 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -517,3 +517,4 @@ Bug Fixes - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) +- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index a0a08e4a968cc..5b21c753a71da 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -236,7 +236,7 @@ def eval(expr, parser='pandas', engine=None, truediv=True, first_expr = True if isinstance(expr, string_types): _check_expression(expr) - exprs = [e for e in expr.splitlines() if e != ''] + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ''] else: exprs = [expr] multi_line = len(exprs) > 1 @@ -254,8 +254,7 @@ def eval(expr, parser='pandas', engine=None, truediv=True, _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - level += 1 - env = _ensure_scope(level, global_dict=global_dict, + env = _ensure_scope(level + 1, global_dict=global_dict, local_dict=local_dict, resolvers=resolvers, target=target) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index aa05626af9175..a4bb81ce7263c 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1274,7 +1274,6 @@ def test_assignment_fails(self): local_dict={'df': df, 'df2': df2}) def test_assignment_column(self): - tm.skip_if_no_ne('numexpr') df = DataFrame(np.random.randn(5, 2), columns=list('ab')) orig_df = df.copy() @@ -1346,7 +1345,6 @@ def test_column_in(self): def assignment_not_inplace(self): # GH 9297 - tm.skip_if_no_ne('numexpr') df = DataFrame(np.random.randn(5, 2), columns=list('ab')) actual = df.eval('c = a + b', inplace=False) @@ -1365,7 +1363,6 @@ def assignment_not_inplace(self): def test_multi_line_expression(self): # GH 11149 - tm.skip_if_no_ne('numexpr') df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) expected = df.copy() @@ -1393,7 +1390,6 @@ def test_multi_line_expression(self): def test_multi_line_expression_not_inplace(self): # GH 11149 - tm.skip_if_no_ne('numexpr') df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) expected = df.copy() @@ -1411,6 +1407,21 @@ def test_multi_line_expression_not_inplace(self): e = a + 2""", inplace=False) assert_frame_equal(expected, df) + def test_multi_line_expression_local_variable(self): + # GH 15342 + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + expected = df.copy() + + local_var = 7 + expected['c'] = expected['a'] * local_var + expected['d'] = expected['c'] + local_var + ans = df.eval(""" + c = a * @local_var + d = c + @local_var + """, inplace=True) + assert_frame_equal(expected, df) + self.assertIsNone(ans) + def test_assignment_in_query(self): # GH 8664 df = pd.DataFrame({'a': [1, 2, 
3], 'b': [4, 5, 6]}) From 64384e90b57d35e2bcb49e247a1ceb6e075b41bd Mon Sep 17 00:00:00 2001 From: Piotr Chromiec Date: Thu, 9 Feb 2017 12:08:02 -0500 Subject: [PATCH 118/338] BUG: fix read_gbq lost precision for longs above 2^53 and floats above 10k closes #14020 closes #14305 Author: Piotr Chromiec Closes #14064 from tworec/read_gbq_full_long_support and squashes the following commits: 788ccee [Piotr Chromiec] BUG: fix read_gbq lost numeric precision --- doc/source/install.rst | 13 +- doc/source/io.rst | 61 +++++-- doc/source/whatsnew/v0.20.0.txt | 5 +- pandas/io/gbq.py | 24 +-- pandas/io/tests/test_gbq.py | 288 +++++++++++++++++++++----------- 5 files changed, 263 insertions(+), 128 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 158a6e5562b7a..4b3ea19624a0e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -250,9 +250,9 @@ Optional Dependencies * `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are: - - `psycopg2 `__: for PostgreSQL - - `pymysql `__: for MySQL. - - `SQLite `__: for SQLite, this is included in Python's standard library by default. + * `psycopg2 `__: for PostgreSQL + * `pymysql `__: for MySQL. + * `SQLite `__: for SQLite, this is included in Python's standard library by default. * `matplotlib `__: for plotting * For Excel I/O: @@ -272,11 +272,8 @@ Optional Dependencies `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* Google's `python-gflags <`__ , - `oauth2client `__ , - `httplib2 `__ - and `google-api-python-client `__ - : Needed for :mod:`~pandas.io.gbq` +* For Google BigQuery I/O - see :ref:`here `. + * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: diff --git a/doc/source/io.rst b/doc/source/io.rst index 4c78758a0e2d2..22eac33a715ba 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -39,7 +39,7 @@ object. * :ref:`read_json` * :ref:`read_msgpack` * :ref:`read_html` - * :ref:`read_gbq` + * :ref:`read_gbq` * :ref:`read_stata` * :ref:`read_sas` * :ref:`read_clipboard` @@ -55,7 +55,7 @@ The corresponding ``writer`` functions are object methods that are accessed like * :ref:`to_json` * :ref:`to_msgpack` * :ref:`to_html` - * :ref:`to_gbq` + * :ref:`to_gbq` * :ref:`to_stata` * :ref:`to_clipboard` * :ref:`to_pickle` @@ -4648,16 +4648,11 @@ DataFrame with a shape and data types derived from the source table. Additionally, DataFrames can be inserted into new BigQuery tables or appended to existing tables. -You will need to install some additional dependencies: - -- Google's `python-gflags `__ -- `httplib2 `__ -- `google-api-python-client `__ - .. warning:: To use this module, you will need a valid BigQuery account. Refer to the - `BigQuery Documentation `__ for details on the service itself. + `BigQuery Documentation `__ + for details on the service itself. The key functions are: @@ -4671,7 +4666,44 @@ The key functions are: .. currentmodule:: pandas -.. 
_io.bigquery_reader: + +Supported Data Types +++++++++++++++++++++ + +pandas supports the following `BigQuery data types `__: +``STRING``, ``INTEGER`` (64-bit), ``FLOAT`` (64-bit), ``BOOLEAN`` and +``TIMESTAMP`` (microsecond precision). Data types ``BYTES`` and ``RECORD`` +are not supported. + +Integer and boolean ``NA`` handling ++++++++++++++++++++++++++++++++++++ + +.. versionadded:: 0.20 + +Since all columns in BigQuery queries are nullable, and NumPy lacks ``NA`` +support for integer and boolean types, this module will store ``INTEGER`` or +``BOOLEAN`` columns with at least one ``NULL`` value as ``dtype=object``. +Otherwise those columns will be stored as ``dtype=int64`` or ``dtype=bool``, +respectively. + +This is the opposite of the default pandas behaviour, which promotes integer +types to float in order to store NAs. See the :ref:`gotchas` +for a detailed explanation. + +While this trade-off works well for most cases, it breaks down for storing +values greater than 2**53. Such values in BigQuery can represent identifiers, +and an unnoticed loss of precision for identifiers is exactly what we want to +avoid; a short example appears at the end of this BigQuery section. + +.. _io.bigquery_deps: + +Dependencies +++++++++++++ + +This module requires the following additional dependencies: + +- `httplib2 `__: HTTP client +- `google-api-python-client `__: Google's API client +- `oauth2client `__: authentication and authorization for Google's API
.. _io.bigquery_authentication: Authentication '''''''''''''' Is possible to authenticate with either user account credentials or service acco Authenticating with user account credentials is as simple as following the prompts in a browser window which will be automatically opened for you. You will be authenticated to the specified ``BigQuery`` account using the product name ``pandas GBQ``. It is only possible on local host. -The remote authentication using user account credentials is not currently supported in Pandas. +The remote authentication using user account credentials is not currently supported in pandas. Additional information on the authentication mechanism can be found `here `__.
@@ -4695,8 +4727,6 @@ is particularly useful when working on remote servers (eg. jupyter iPython noteb Additional information on service accounts can be found `here `__. -You will need to install an additional dependency: `oauth2client `__. - Authentication via ``application default credentials`` is also possible. This is only valid if the parameter ``private_key`` is not provided. This method also requires that the credentials can be fetched from the environment the code is running in.
@@ -4716,6 +4746,7 @@ Additional information on A private key can be obtained from the Google developers console by clicking `here `__. Use JSON key type. +.. _io.bigquery_reader: Querying ''''''''
@@ -4775,7 +4806,6 @@ For more information about query configuration parameters see .. _io.bigquery_writer: - Writing DataFrames ''''''''''''''''''
@@ -4865,6 +4895,8 @@ For example: often as the service seems to be changing and evolving. BiqQuery is best for analyzing large sets of data quickly, but it is not a direct replacement for a transactional database. +.. _io.bigquery_create_tables: + Creating BigQuery Tables ''''''''''''''''''''''''
@@ -4894,6 +4926,7 @@ produce the dictionary representation schema of the specified pandas DataFrame. the new table with a different name. Refer to `Google BigQuery issue 191 `__. +
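+For example (an illustrative sketch adapted from the new tests; ``project_id``
+is a placeholder for a valid project id), an ``INTEGER`` column that contains
+a ``NULL`` comes back as ``dtype=object``, as described under
+*Integer and boolean NA handling* above:
+
+.. code-block:: python
+
+   df = pd.read_gbq("""SELECT * FROM
+                       (SELECT 1 AS nullable_integer),
+                       (SELECT NULL AS nullable_integer)""",
+                    project_id=project_id)
+
+   df['nullable_integer'].dtype  # object, because of the NULL value
+
.. 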
_io.stata: Stata Format
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e765cdef4d219..9eae2b7a33923 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt
@@ -369,7 +369,9 @@ Other API Changes - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) +- ``DataFrame.asof()`` will return a null-filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`) +- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore, ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64``, which also caused precision loss (:issue:`14064`, :issue:`14305`). + .. _whatsnew_0200.deprecations: Deprecations
@@ -439,6 +441,7 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) + - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 966f53e9d75ef..76c228418a616 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py
@@ -603,18 +603,14 @@ def _parse_data(schema, rows): # see: # http://pandas.pydata.org/pandas-docs/dev/missing_data.html # #missing-data-casting-rules-and-indexing - dtype_map = {'INTEGER': np.dtype(float), - 'FLOAT': np.dtype(float), - # This seems to be buggy without nanosecond indicator + dtype_map = {'FLOAT': np.dtype(float), 'TIMESTAMP': 'M8[ns]'} fields = schema['fields'] col_types = [field['type'] for field in fields] col_names = [str(field['name']) for field in fields] col_dtypes = [dtype_map.get(field['type'], object) for field in fields] - page_array = np.zeros((len(rows),), - dtype=lzip(col_names, col_dtypes)) - + page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes)) for row_num, raw_row in enumerate(rows): entries = raw_row.get('f', []) for col_num, field_type in enumerate(col_types):
@@ -628,7 +624,9 @@ def _parse_entry(field_value, field_type): if field_value is None or field_value == 'null': return None - if field_type == 'INTEGER' or field_type == 'FLOAT': + if field_type == 'INTEGER': + return int(field_value) + elif field_type == 'FLOAT': return float(field_value) elif field_type == 'TIMESTAMP': timestamp = datetime.utcfromtimestamp(float(field_value))
@@ -757,10 +755,14 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, 'Column order does not match this DataFrame.' ) - # Downcast floats to integers and objects to booleans - # if there are no NaN's. This is presently due to a - # limitation of numpy in handling missing data. 
- final_df._data = final_df._data.downcast(dtypes='infer') + # cast BOOLEAN and INTEGER columns from object to bool/int + # if they don't have any nulls + type_map = {'BOOLEAN': bool, 'INTEGER': int} + for field in schema['fields']: + if field['type'] in type_map and \ + final_df[field['name']].notnull().all(): + final_df[field['name']] = \ + final_df[field['name']].astype(type_map[field['type']]) connector.print_elapsed_seconds( 'Total time taken',
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 457e2d218cb33..1157482d7ae67 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py
@@ -46,6 +46,11 @@ def _skip_if_no_project_id(): "Cannot run integration tests without a project id") +def _skip_local_auth_if_in_travis_env(): + if _in_travis_environment(): + raise nose.SkipTest("Cannot run local auth in travis environment") + + def _skip_if_no_private_key_path(): if not _get_private_key_path(): raise nose.SkipTest("Cannot run integration tests without a "
@@ -248,14 +253,14 @@ def test_generate_bq_schema_deprecated(): gbq.generate_bq_schema(df) -class TestGBQConnectorIntegration(tm.TestCase): +class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): _setup_common() _skip_if_no_project_id() + _skip_local_auth_if_in_travis_env() - self.sut = gbq.GbqConnector(_get_project_id(), - private_key=_get_private_key_path()) + self.sut = gbq.GbqConnector(_get_project_id()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None,
@@ -293,8 +298,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) -class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase): - +class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common()
@@ -325,16 +329,15 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) -class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase): - +class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() _skip_if_no_project_id() - _skip_if_no_private_key_path() + _skip_if_no_private_key_contents() self.sut = gbq.GbqConnector(_get_project_id(), - private_key=_get_private_key_path()) + private_key=_get_private_key_contents()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None,
@@ -373,9 +376,9 @@ def test_import_google_api_python_client(self): from googleapiclient.discovery import build # noqa from googleapiclient.errors import HttpError # noqa - def test_should_return_bigquery_integers_as_python_floats(self): + def test_should_return_bigquery_integers_as_python_ints(self): result = gbq._parse_entry(1, 'INTEGER') - tm.assert_equal(result, float(1)) + tm.assert_equal(result, int(1)) def test_should_return_bigquery_floats_as_python_floats(self): result = gbq._parse_entry(1, 'FLOAT')
@@ -403,15 +406,15 @@ def test_to_gbq_with_no_project_id_given_should_fail(self): def test_read_gbq_with_no_project_id_given_should_fail(self): with tm.assertRaises(TypeError): - gbq.read_gbq('SELECT "1" as NUMBER_1') + gbq.read_gbq('SELECT 1') def test_that_parse_data_works_properly(self): test_schema = {'fields': [ - {'mode': 'NULLABLE', 'name': 'VALID_STRING', 'type': 'STRING'}]} + {'mode': 'NULLABLE', 'name': 'valid_string', 'type': 'STRING'}]} test_page = [{'f': [{'v': 'PI'}]}] test_output = gbq._parse_data(test_schema, test_page) - 
correct_output = DataFrame({'VALID_STRING': ['PI']}) + correct_output = DataFrame({'valid_string': ['PI']}) tm.assert_frame_equal(test_output, correct_output) def test_read_gbq_with_invalid_private_key_json_should_fail(self): @@ -435,12 +438,12 @@ def test_read_gbq_with_empty_private_key_file_should_fail(self): private_key=empty_file_path) def test_read_gbq_with_corrupted_private_key_json_should_fail(self): - _skip_if_no_private_key_path() + _skip_if_no_private_key_contents() with tm.assertRaises(gbq.InvalidPrivateKeyFormat): gbq.read_gbq( 'SELECT 1', project_id='x', - private_key=re.sub('[a-z]', '9', _get_private_key_path())) + private_key=re.sub('[a-z]', '9', _get_private_key_contents())) class TestReadGBQIntegration(tm.TestCase): @@ -475,112 +478,207 @@ def tearDown(self): pass def test_should_read_as_user_account(self): - if _in_travis_environment(): - raise nose.SkipTest("Cannot run local auth in travis environment") + _skip_local_auth_if_in_travis_env() - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_should_read_as_service_account_with_key_path(self): _skip_if_no_private_key_path() - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_should_read_as_service_account_with_key_contents(self): _skip_if_no_private_key_contents() - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_contents()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) + + +class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): + + @classmethod + def setUpClass(cls): + # - GLOBAL CLASS FIXTURES - + # put here any instruction you want to execute only *ONCE* *BEFORE* + # executing *ALL* tests described below. + + _skip_if_no_project_id() + _skip_if_no_private_key_path() + + _setup_common() + + def setUp(self): + # - PER-TEST FIXTURES - + # put here any instruction you want to be run *BEFORE* *EVERY* test is + # executed. + pass + + @classmethod + def tearDownClass(cls): + # - GLOBAL CLASS FIXTURES - + # put here any instruction you want to execute only *ONCE* *AFTER* + # executing all tests. + pass + + def tearDown(self): + # - PER-TEST FIXTURES - + # put here any instructions you want to be run *AFTER* *EVERY* test is + # executed. 
+ pass def test_should_properly_handle_valid_strings(self): - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_should_properly_handle_empty_strings(self): - query = 'SELECT "" as EMPTY_STRING' + query = 'SELECT "" AS empty_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING': [""]})) + tm.assert_frame_equal(df, DataFrame({'empty_string': [""]})) def test_should_properly_handle_null_strings(self): - query = 'SELECT STRING(NULL) as NULL_STRING' + query = 'SELECT STRING(NULL) AS null_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_STRING': [None]})) + tm.assert_frame_equal(df, DataFrame({'null_string': [None]})) def test_should_properly_handle_valid_integers(self): - query = 'SELECT INTEGER(3) as VALID_INTEGER' + query = 'SELECT INTEGER(3) AS valid_integer' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal(df, DataFrame({'valid_integer': [3]})) + + def test_should_properly_handle_nullable_integers(self): + query = '''SELECT * FROM + (SELECT 1 AS nullable_integer), + (SELECT NULL AS nullable_integer)''' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER': [3]})) + tm.assert_frame_equal( + df, DataFrame({'nullable_integer': [1, None]}).astype(object)) + + def test_should_properly_handle_valid_longs(self): + query = 'SELECT 1 << 62 AS valid_long' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'valid_long': [1 << 62]})) + + def test_should_properly_handle_nullable_longs(self): + query = '''SELECT * FROM + (SELECT 1 << 62 AS nullable_long), + (SELECT NULL AS nullable_long)''' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'nullable_long': [1 << 62, None]}).astype(object)) def test_should_properly_handle_null_integers(self): - query = 'SELECT INTEGER(NULL) as NULL_INTEGER' + query = 'SELECT INTEGER(NULL) AS null_integer' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER': [np.nan]})) + tm.assert_frame_equal(df, DataFrame({'null_integer': [None]})) def test_should_properly_handle_valid_floats(self): - query = 'SELECT PI() as VALID_FLOAT' + from math import pi + query = 'SELECT PI() AS valid_float' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal(df, DataFrame( + {'valid_float': [pi]})) + + def test_should_properly_handle_nullable_floats(self): + from math import pi + query = '''SELECT * FROM + (SELECT PI() AS nullable_float), + (SELECT NULL AS nullable_float)''' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'nullable_float': [pi, None]})) + + def test_should_properly_handle_valid_doubles(self): + from math import pi + query = 'SELECT PI() * POW(10, 307) AS 
valid_double' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( - {'VALID_FLOAT': [3.141592653589793]})) + {'valid_double': [pi * 10 ** 307]})) + + def test_should_properly_handle_nullable_doubles(self): + from math import pi + query = '''SELECT * FROM + (SELECT PI() * POW(10, 307) AS nullable_double), + (SELECT NULL AS nullable_double)''' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'nullable_double': [pi * 10 ** 307, None]})) def test_should_properly_handle_null_floats(self): - query = 'SELECT FLOAT(NULL) as NULL_FLOAT' + query = 'SELECT FLOAT(NULL) AS null_float' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT': [np.nan]})) + tm.assert_frame_equal(df, DataFrame({'null_float': [np.nan]})) def test_should_properly_handle_timestamp_unix_epoch(self): - query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH' + query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( - {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) + {'unix_epoch': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) def test_should_properly_handle_arbitrary_timestamp(self): - query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP' + query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({ - 'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')] + 'valid_timestamp': [np.datetime64('2004-09-15T05:00:00.000000Z')] })) def test_should_properly_handle_null_timestamp(self): - query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP' + query = 'SELECT TIMESTAMP(NULL) AS null_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP': [NaT]})) + tm.assert_frame_equal(df, DataFrame({'null_timestamp': [NaT]})) def test_should_properly_handle_true_boolean(self): - query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN' + query = 'SELECT BOOLEAN(TRUE) AS true_boolean' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN': [True]})) + tm.assert_frame_equal(df, DataFrame({'true_boolean': [True]})) def test_should_properly_handle_false_boolean(self): - query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN' + query = 'SELECT BOOLEAN(FALSE) AS false_boolean' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN': [False]})) + tm.assert_frame_equal(df, DataFrame({'false_boolean': [False]})) def test_should_properly_handle_null_boolean(self): - query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN' + query = 'SELECT BOOLEAN(NULL) AS null_boolean' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal(df, DataFrame({'null_boolean': [None]})) + + def test_should_properly_handle_nullable_booleans(self): + query = '''SELECT * FROM + (SELECT BOOLEAN(TRUE) AS nullable_boolean), + (SELECT NULL AS nullable_boolean)''' df = gbq.read_gbq(query, 
project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN': [None]})) + tm.assert_frame_equal( + df, DataFrame({'nullable_boolean': [True, None]}).astype(object)) def test_unicode_string_conversion_and_normalization(self): correct_test_datatype = DataFrame( - {'UNICODE_STRING': [u("\xe9\xfc")]} + {'unicode_string': [u("\xe9\xfc")]} ) unicode_string = "\xc3\xa9\xc3\xbc" @@ -588,40 +686,40 @@ def test_unicode_string_conversion_and_normalization(self): if compat.PY3: unicode_string = unicode_string.encode('latin-1').decode('utf8') - query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string) + query = 'SELECT "{0}" AS unicode_string'.format(unicode_string) df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, correct_test_datatype) def test_index_column(self): - query = "SELECT 'a' as STRING_1, 'b' as STRING_2" + query = "SELECT 'a' AS string_1, 'b' AS string_2" result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - index_col="STRING_1", + index_col="string_1", private_key=_get_private_key_path()) correct_frame = DataFrame( - {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1") + {'string_1': ['a'], 'string_2': ['b']}).set_index("string_1") tm.assert_equal(result_frame.index.name, correct_frame.index.name) def test_column_order(self): - query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" - col_order = ['STRING_3', 'STRING_1', 'STRING_2'] + query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" + col_order = ['string_3', 'string_1', 'string_2'] result_frame = gbq.read_gbq(query, project_id=_get_project_id(), col_order=col_order, private_key=_get_private_key_path()) - correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [ - 'b'], 'STRING_3': ['c']})[col_order] + correct_frame = DataFrame({'string_1': ['a'], 'string_2': [ + 'b'], 'string_3': ['c']})[col_order] tm.assert_frame_equal(result_frame, correct_frame) def test_column_order_plus_index(self): - query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" - col_order = ['STRING_3', 'STRING_2'] + query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" + col_order = ['string_3', 'string_2'] result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - index_col='STRING_1', col_order=col_order, + index_col='string_1', col_order=col_order, private_key=_get_private_key_path()) correct_frame = DataFrame( - {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']}) - correct_frame.set_index('STRING_1', inplace=True) + {'string_1': ['a'], 'string_2': ['b'], 'string_3': ['c']}) + correct_frame.set_index('string_1', inplace=True) correct_frame = correct_frame[col_order] tm.assert_frame_equal(result_frame, correct_frame) @@ -655,14 +753,17 @@ def test_download_dataset_larger_than_200k_rows(self): def test_zero_rows(self): # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 - df = gbq.read_gbq("SELECT title, id " + df = gbq.read_gbq("SELECT title, id, is_bot, " + "SEC_TO_TIMESTAMP(timestamp) ts " "FROM [publicdata:samples.wikipedia] " "WHERE timestamp=-9999999", project_id=_get_project_id(), private_key=_get_private_key_path()) page_array = np.zeros( - (0,), dtype=[('title', object), ('id', np.dtype(float))]) - expected_result = DataFrame(page_array, columns=['title', 'id']) + (0,), dtype=[('title', object), ('id', np.dtype(int)), + ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')]) + expected_result = DataFrame( + page_array, 
columns=['title', 'id', 'is_bot', 'ts']) self.assert_frame_equal(df, expected_result) def test_legacy_sql(self): @@ -715,7 +816,7 @@ def test_invalid_option_for_sql_dialect(self): dialect='standard', private_key=_get_private_key_path()) def test_query_with_parameters(self): - sql_statement = "SELECT @param1 + @param2 as VALID_RESULT" + sql_statement = "SELECT @param1 + @param2 AS valid_result" config = { 'query': { "useLegacySql": False, @@ -753,11 +854,11 @@ def test_query_with_parameters(self): df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), private_key=_get_private_key_path(), configuration=config) - tm.assert_frame_equal(df, DataFrame({'VALID_RESULT': [3]})) + tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) def test_query_inside_configuration(self): - query_no_use = 'SELECT "PI_WRONG" as VALID_STRING' - query = 'SELECT "PI" as VALID_STRING' + query_no_use = 'SELECT "PI_WRONG" AS valid_string' + query = 'SELECT "PI" AS valid_string' config = { 'query': { "query": query, @@ -774,7 +875,7 @@ def test_query_inside_configuration(self): df = gbq.read_gbq(None, project_id=_get_project_id(), private_key=_get_private_key_path(), configuration=config) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_configuration_without_query(self): sql_statement = 'SELECT 1' @@ -800,7 +901,7 @@ def test_configuration_without_query(self): configuration=config) -class TestToGBQIntegration(tm.TestCase): +class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. # Make sure to modify the for loop range in the tearDownClass when a new @@ -814,6 +915,7 @@ def setUpClass(cls): # executing *ALL* tests described below. _skip_if_no_project_id() + _skip_if_no_private_key_path() _setup_common() clean_gbq_environment(_get_private_key_path()) @@ -859,11 +961,11 @@ def test_upload_data(self): sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_path()) - self.assertEqual(result['NUM_ROWS'][0], test_size) + self.assertEqual(result['num_rows'][0], test_size) def test_upload_data_if_table_exists_fail(self): destination_table = DESTINATION_TABLE + "2" @@ -899,11 +1001,11 @@ def test_upload_data_if_table_exists_append(self): sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_path()) - self.assertEqual(result['NUM_ROWS'][0], test_size * 2) + self.assertEqual(result['num_rows'][0], test_size * 2) # Try inserting with a different schema, confirm failure with tm.assertRaises(gbq.InvalidSchema): @@ -932,11 +1034,11 @@ def test_upload_data_if_table_exists_replace(self): sleep(30) # <- Curses Google!!! 
- result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_path()) - self.assertEqual(result['NUM_ROWS'][0], 5) + self.assertEqual(result['num_rows'][0], 5) @tm.slow def test_google_upload_errors_should_raise_exception(self): @@ -1113,7 +1215,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') -class TestToGBQIntegrationServiceAccountKeyPath(tm.TestCase): +class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. # Make sure to modify the for loop range in the tearDownClass when a new @@ -1128,10 +1230,10 @@ def setUpClass(cls): # executing *ALL* tests described below. _skip_if_no_project_id() - _skip_if_no_private_key_path() + _skip_local_auth_if_in_travis_env() _setup_common() - clean_gbq_environment(_get_private_key_path()) + clean_gbq_environment() def setUp(self): # - PER-TEST FIXTURES - @@ -1145,7 +1247,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. - clean_gbq_environment(_get_private_key_path()) + clean_gbq_environment() def tearDown(self): # - PER-TEST FIXTURES - @@ -1153,26 +1255,24 @@ def tearDown(self): # is executed. pass - def test_upload_data_as_service_account_with_key_path(self): + def test_upload_data(self): destination_table = "{0}.{1}".format(DATASET_ID + "2", TABLE_ID + "1") test_size = 10 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_path()) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000) sleep(30) # <- Curses Google!!! result = gbq.read_gbq( - "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), - project_id=_get_project_id(), - private_key=_get_private_key_path()) + "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), + project_id=_get_project_id()) - self.assertEqual(result['NUM_ROWS'][0], test_size) + self.assertEqual(result['num_rows'][0], test_size) -class TestToGBQIntegrationServiceAccountKeyContents(tm.TestCase): +class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. # Make sure to modify the for loop range in the tearDownClass when a new @@ -1212,7 +1312,7 @@ def tearDown(self): # is executed. pass - def test_upload_data_as_service_account_with_key_contents(self): + def test_upload_data(self): destination_table = "{0}.{1}".format(DATASET_ID + "3", TABLE_ID + "1") test_size = 10 @@ -1224,7 +1324,7 @@ def test_upload_data_as_service_account_with_key_contents(self): sleep(30) # <- Curses Google!!! 
result = gbq.read_gbq( - "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), + "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_contents()) - self.assertEqual(result['NUM_ROWS'][0], test_size) + self.assertEqual(result['num_rows'][0], test_size)
From a22a2ae2913d6980025f6100b6f521517803022e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 9 Feb 2017 17:36:19 -0500 Subject: [PATCH 119/338] CLN: strip out and form tools/concat.py from tools/merge.py
This will facilitate some changes in ``tools/merge`` w.r.t. #15321; besides, these pieces are independent anyhow.
Author: Jeff Reback
Closes #15358 from jreback/concat and squashes the following commits:
ba34c51 [Jeff Reback] CLN: strip out and form tools/concat.py from tools/merge.py
--- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/__init__.py | 6 +- pandas/core/base.py | 4 +- pandas/core/categorical.py | 2 +- pandas/core/frame.py | 9 +- pandas/core/groupby.py | 12 +- pandas/core/panel.py | 2 +- pandas/core/reshape.py | 2 +- pandas/core/series.py | 2 +- pandas/formats/format.py | 4 +- pandas/io/gbq.py | 4 +- pandas/io/pytables.py | 8 +- pandas/tests/groupby/test_groupby.py | 5 +- pandas/tools/concat.py | 615 ++++++++++++++++++++++ pandas/tools/merge.py | 634 +---------------------- pandas/tools/pivot.py | 4 +- pandas/tools/plotting.py | 2 +- pandas/tools/tests/test_join.py | 3 +- pandas/tools/tests/test_merge.py | 3 +- pandas/tools/tests/test_merge_ordered.py | 2 - pandas/tools/tests/test_pivot.py | 4 +- 21 files changed, 672 insertions(+), 657 deletions(-) create mode 100644 pandas/tools/concat.py
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9eae2b7a33923..2279d0464a5c7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt
@@ -385,7 +385,7 @@ Deprecations - ``TimedeltaIndex.searchsorted()``, ``DatetimeIndex.searchsorted()``, and ``PeriodIndex.searchsorted()`` have deprecated the ``key`` parameter in favor of ``value`` (:issue:`12662`) - ``DataFrame.astype()`` has deprecated the ``raise_on_error`` parameter in favor of ``errors`` (:issue:`14878`) - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`) - +- importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explicit imports (:issue:`15358`)
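+
+  A sketch of the two import styles (illustrative only):
+
+  .. code-block:: python
+
+     # deprecated
+     from pandas.tools.merge import concat
+
+     # preferred
+     from pandas import concat
+
.. 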
_whatsnew_0200.prior_deprecations: diff --git a/pandas/__init__.py b/pandas/__init__.py index 9133e11beaa2b..76542db22a757 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -42,10 +42,10 @@ from pandas.sparse.api import * from pandas.stats.api import * from pandas.tseries.api import * -from pandas.io.api import * from pandas.computation.api import * -from pandas.tools.merge import (merge, concat, ordered_merge, +from pandas.tools.concat import concat +from pandas.tools.merge import (merge, ordered_merge, merge_ordered, merge_asof) from pandas.tools.pivot import pivot_table, crosstab from pandas.tools.plotting import scatter_matrix, plot_params @@ -54,6 +54,8 @@ from pandas.core.reshape import melt from pandas.util.print_versions import show_versions +from pandas.io.api import * + # define the testing framework import pandas.util.testing from pandas.util.nosetester import NoseTester diff --git a/pandas/core/base.py b/pandas/core/base.py index 657da859ddde2..92ec6bb3d73e6 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -472,7 +472,7 @@ def _aggregate(self, arg, *args, **kwargs): arg = new_arg - from pandas.tools.merge import concat + from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): """ @@ -579,7 +579,7 @@ def _agg(arg, func): return result, True def _aggregate_multiple_funcs(self, arg, _level): - from pandas.tools.merge import concat + from pandas.tools.concat import concat if self.axis != 0: raise NotImplementedError("axis other than 0 is not supported") diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 5980f872f951f..491db2e080953 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1907,7 +1907,7 @@ def describe(self): counts = self.value_counts(dropna=False) freqs = counts / float(counts.sum()) - from pandas.tools.merge import concat + from pandas.tools.concat import concat result = concat([counts, freqs], axis=1) result.columns = ['counts', 'freqs'] result.index.name = 'categories' diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79bdad82af5a3..aa03bfb9a54b9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4402,7 +4402,7 @@ def append(self, other, ignore_index=False, verify_integrity=False): if (self.columns.get_indexer(other.columns) >= 0).all(): other = other.loc[:, self.columns] - from pandas.tools.merge import concat + from pandas.tools.concat import concat if isinstance(other, (list, tuple)): to_concat = [self] + other else: @@ -4532,7 +4532,8 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): - from pandas.tools.merge import merge, concat + from pandas.tools.merge import merge + from pandas.tools.concat import concat if isinstance(other, Series): if other.name is None: @@ -4636,7 +4637,7 @@ def round(self, decimals=0, *args, **kwargs): Series.round """ - from pandas.tools.merge import concat + from pandas.tools.concat import concat def _dict_round(df, decimals): for col, vals in df.iteritems(): @@ -5306,7 +5307,7 @@ def isin(self, values): """ if isinstance(values, dict): from collections import defaultdict - from pandas.tools.merge import concat + from pandas.tools.concat import concat values = defaultdict(list, values) return concat((self.iloc[:, [i]].isin(values[col]) for i, col in enumerate(self.columns)), axis=1) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 99220232114ce..53b6dbe6075cf 100644 --- 
a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -854,7 +854,7 @@ def _wrap_applied_output(self, *args, **kwargs): raise AbstractMethodError(self) def _concat_objects(self, keys, values, not_indexed_same=False): - from pandas.tools.merge import concat + from pandas.tools.concat import concat def reset_identity(values): # reset the identities of the components @@ -3507,7 +3507,7 @@ def first_non_None_value(values): # still a series # path added as of GH 5545 elif all_indexed_same: - from pandas.tools.merge import concat + from pandas.tools.concat import concat return concat(values) if not all_indexed_same: @@ -3540,7 +3540,7 @@ def first_non_None_value(values): else: # GH5788 instead of stacking; concat gets the # dtypes correct - from pandas.tools.merge import concat + from pandas.tools.concat import concat result = concat(values, keys=key_index, names=key_index.names, axis=self.axis).unstack() @@ -3588,7 +3588,7 @@ def first_non_None_value(values): not_indexed_same=not_indexed_same) def _transform_general(self, func, *args, **kwargs): - from pandas.tools.merge import concat + from pandas.tools.concat import concat applied = [] obj = self._obj_with_exclusions @@ -3980,7 +3980,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions) def _apply_to_column_groupbys(self, func): - from pandas.tools.merge import concat + from pandas.tools.concat import concat return concat( (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), @@ -4061,7 +4061,7 @@ def groupby_series(obj, col=None): if isinstance(obj, Series): results = groupby_series(obj) else: - from pandas.tools.merge import concat + from pandas.tools.concat import concat results = [groupby_series(obj[col], col) for col in obj.columns] results = concat(results, axis=1) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 6da10305eb4fc..4a6c6cf291316 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1282,7 +1282,7 @@ def join(self, other, how='left', lsuffix='', rsuffix=''): ------- joined : Panel """ - from pandas.tools.merge import concat + from pandas.tools.concat import concat if isinstance(other, Panel): join_major, join_minor = self._get_join_index(other, how) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index d6287f17c8387..bd0358abf67d5 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1194,7 +1194,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, -------- Series.str.get_dummies """ - from pandas.tools.merge import concat + from pandas.tools.concat import concat from itertools import cycle if isinstance(data, DataFrame): diff --git a/pandas/core/series.py b/pandas/core/series.py index 43f16f690692a..e1eac8f66017e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1588,7 +1588,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): """ - from pandas.tools.merge import concat + from pandas.tools.concat import concat if isinstance(to_append, (list, tuple)): to_concat = [self] + to_append diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 439b96d650204..1a7a06199ad8a 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -165,7 +165,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True, self._chk_truncate() def _chk_truncate(self): - from pandas.tools.merge import concat + from pandas.tools.concat import concat max_rows = self.max_rows truncate_v = max_rows and (len(self.series) > max_rows) series = self.series @@ -406,7 
+406,7 @@ def _chk_truncate(self): Checks whether the frame should be truncated. If so, slices the frame up. """ - from pandas.tools.merge import concat + from pandas.tools.concat import concat # Column of which first element is used to determine width of a dot col self.tr_size_col = -1 diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 76c228418a616..169a2b1df9b4c 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -10,9 +10,7 @@ import numpy as np from distutils.version import StrictVersion -from pandas import compat -from pandas.core.api import DataFrame -from pandas.tools.merge import concat +from pandas import compat, DataFrame, concat from pandas.core.common import PandasError from pandas.compat import lzip, bytes_to_str diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7b4800115b23b..7661cd19e4d14 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -26,13 +26,12 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, - MultiIndex, Int64Index, isnull) + MultiIndex, Int64Index, isnull, concat, + SparseSeries, SparseDataFrame, PeriodIndex, + DatetimeIndex, TimedeltaIndex) from pandas.core import config from pandas.io.common import _stringify_path -from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex -from pandas.tseries.api import PeriodIndex, DatetimeIndex -from pandas.tseries.tdi import TimedeltaIndex from pandas.core.base import StringMixin from pandas.formats.printing import adjoin, pprint_thing from pandas.core.common import _asarray_tuplesafe, PerformanceWarning @@ -42,7 +41,6 @@ _block2d_to_blocknd, _factor_indexer, _block_shape) from pandas.core.index import _ensure_index -from pandas.tools.merge import concat from pandas import compat from pandas.compat import u_safe as u, PY3, range, lrange, string_types, text_type, filter from pandas.core.config import get_option diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 458e869130190..53f85349834ac 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,7 +6,8 @@ from numpy import nan from pandas import (date_range, bdate_range, Timestamp, - isnull, Index, MultiIndex, DataFrame, Series) + isnull, Index, MultiIndex, DataFrame, Series, + concat, Panel) from pandas.core.common import UnsupportedFunctionCall from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -14,8 +15,6 @@ from pandas.compat import (range, long, lrange, StringIO, lmap, lzip, map, zip, builtins, OrderedDict, product as cart_product) from pandas import compat -from pandas.core.panel import Panel -from pandas.tools.merge import concat from collections import defaultdict import pandas.core.common as com import numpy as np diff --git a/pandas/tools/concat.py b/pandas/tools/concat.py new file mode 100644 index 0000000000000..dbbc831b19d1d --- /dev/null +++ b/pandas/tools/concat.py @@ -0,0 +1,615 @@ +""" +concat routines +""" + +import numpy as np +from pandas import compat, DataFrame, Series, Index, MultiIndex +from pandas.core.index import (_get_combined_index, + _ensure_index, _get_consensus_names, + _all_indexes_same) +from pandas.core.categorical import (_factorize_from_iterable, + _factorize_from_iterables) +from pandas.core.internals import concatenate_block_managers +from pandas.core import common as com +from pandas.core.generic import NDFrame +import pandas.types.concat as _concat + +# 
--------------------------------------------------------------------- +# Concatenate DataFrame objects + + +def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, + keys=None, levels=None, names=None, verify_integrity=False, + copy=True): + """ + Concatenate pandas objects along a particular axis with optional set logic + along the other axes. + + Can also add a layer of hierarchical indexing on the concatenation axis, + which may be useful if the labels are the same (or overlapping) on + the passed axis number. + + Parameters + ---------- + objs : a sequence or mapping of Series, DataFrame, or Panel objects + If a dict is passed, the sorted keys will be used as the `keys` + argument, unless it is passed, in which case the values will be + selected (see below). Any None objects will be dropped silently unless + they are all None in which case a ValueError will be raised + axis : {0/'index', 1/'columns'}, default 0 + The axis to concatenate along + join : {'inner', 'outer'}, default 'outer' + How to handle indexes on other axis(es) + join_axes : list of Index objects + Specific indexes to use for the other n - 1 axes instead of performing + inner/outer set logic + ignore_index : boolean, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys + names : list, default None + Names for the levels in the resulting hierarchical index + verify_integrity : boolean, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation + copy : boolean, default True + If False, do not copy data unnecessarily + + Returns + ------- + concatenated : type of objects + + Notes + ----- + The keys, levels, and names arguments are all optional. + + A walkthrough of how this method fits in with other tools for combining + panda objects can be found `here + `__. + + See Also + -------- + Series.append + DataFrame.append + DataFrame.join + DataFrame.merge + + Examples + -------- + Combine two ``Series``. + + >>> s1 = pd.Series(['a', 'b']) + >>> s2 = pd.Series(['c', 'd']) + >>> pd.concat([s1, s2]) + 0 a + 1 b + 0 c + 1 d + dtype: object + + Clear the existing index and reset it in the result + by setting the ``ignore_index`` option to ``True``. + + >>> pd.concat([s1, s2], ignore_index=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + + Add a hierarchical index at the outermost level of + the data with the ``keys`` option. + + >>> pd.concat([s1, s2], keys=['s1', 's2',]) + s1 0 a + 1 b + s2 0 c + 1 d + dtype: object + + Label the index keys you create with the ``names`` option. + + >>> pd.concat([s1, s2], keys=['s1', 's2'], + ... names=['Series name', 'Row ID']) + Series name Row ID + s1 0 a + 1 b + s2 0 c + 1 d + dtype: object + + Combine two ``DataFrame`` objects with identical columns. + + >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], + ... 
columns=['letter', 'number']) + >>> df1 + letter number + 0 a 1 + 1 b 2 + >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], + ... columns=['letter', 'number']) + >>> df2 + letter number + 0 c 3 + 1 d 4 + >>> pd.concat([df1, df2]) + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + Combine ``DataFrame`` objects with overlapping columns + and return everything. Columns outside the intersection will + be filled with ``NaN`` values. + + >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], + ... columns=['letter', 'number', 'animal']) + >>> df3 + letter number animal + 0 c 3 cat + 1 d 4 dog + >>> pd.concat([df1, df3]) + animal letter number + 0 NaN a 1 + 1 NaN b 2 + 0 cat c 3 + 1 dog d 4 + + Combine ``DataFrame`` objects with overlapping columns + and return only those that are shared by passing ``inner`` to + the ``join`` keyword argument. + + >>> pd.concat([df1, df3], join="inner") + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + Combine ``DataFrame`` objects horizontally along the x axis by + passing in ``axis=1``. + + >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], + ... columns=['animal', 'name']) + >>> pd.concat([df1, df4], axis=1) + letter number animal name + 0 a 1 bird polly + 1 b 2 monkey george + + Prevent the result from including duplicate index values with the + ``verify_integrity`` option. + + >>> df5 = pd.DataFrame([1], index=['a']) + >>> df5 + 0 + a 1 + >>> df6 = pd.DataFrame([2], index=['a']) + >>> df6 + 0 + a 2 + >>> pd.concat([df5, df6], verify_integrity=True) + ValueError: Indexes have overlapping values: ['a'] + """ + op = _Concatenator(objs, axis=axis, join_axes=join_axes, + ignore_index=ignore_index, join=join, + keys=keys, levels=levels, names=names, + verify_integrity=verify_integrity, + copy=copy) + return op.get_result() + + +class _Concatenator(object): + """ + Orchestrates a concatenation operation for BlockManagers + """ + + def __init__(self, objs, axis=0, join='outer', join_axes=None, + keys=None, levels=None, names=None, + ignore_index=False, verify_integrity=False, copy=True): + if isinstance(objs, (NDFrame, compat.string_types)): + raise TypeError('first argument must be an iterable of pandas ' + 'objects, you passed an object of type ' + '"{0}"'.format(type(objs).__name__)) + + if join == 'outer': + self.intersect = False + elif join == 'inner': + self.intersect = True + else: # pragma: no cover + raise ValueError('Only can inner (intersect) or outer (union) ' + 'join the other axis') + + if isinstance(objs, dict): + if keys is None: + keys = sorted(objs) + objs = [objs[k] for k in keys] + else: + objs = list(objs) + + if len(objs) == 0: + raise ValueError('No objects to concatenate') + + if keys is None: + objs = [obj for obj in objs if obj is not None] + else: + # #1649 + clean_keys = [] + clean_objs = [] + for k, v in zip(keys, objs): + if v is None: + continue + clean_keys.append(k) + clean_objs.append(v) + objs = clean_objs + name = getattr(keys, 'name', None) + keys = Index(clean_keys, name=name) + + if len(objs) == 0: + raise ValueError('All objects passed were None') + + # consolidate data & figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, NDFrame): + raise TypeError("cannot concatenate a non-NDFrame object") + + # consolidate + obj.consolidate(inplace=True) + ndims.add(obj.ndim) + + # get the sample + # want the higest ndim that we have, and must be non-empty + # unless all objs are empty + sample = None + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + 
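+                # take the first object that has the maximal ndim and a
+                # non-trivial shape; if nothing qualifies, ``sample`` stays
+                # None and falls back to objs[0] just below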
+                if obj.ndim == max_ndim and np.sum(obj.shape):
+                    sample = obj
+                    break
+
+        else:
+            # filter out the empties if we have no multi-index possibilities
+            # note: keep empty Series, as they affect the result
+            # columns / name
+            non_empties = [obj for obj in objs
+                           if sum(obj.shape) > 0 or isinstance(obj, Series)]
+
+            if (len(non_empties) and (keys is None and names is None and
+                                      levels is None and join_axes is None)):
+                objs = non_empties
+                sample = objs[0]
+
+        if sample is None:
+            sample = objs[0]
+        self.objs = objs
+
+        # Standardize axis parameter to int
+        if isinstance(sample, Series):
+            axis = DataFrame()._get_axis_number(axis)
+        else:
+            axis = sample._get_axis_number(axis)
+
+        # Need to flip BlockManager axis in the DataFrame special case
+        self._is_frame = isinstance(sample, DataFrame)
+        if self._is_frame:
+            axis = 1 if axis == 0 else 0
+
+        self._is_series = isinstance(sample, Series)
+        if not 0 <= axis <= sample.ndim:
+            raise AssertionError("axis must be between 0 and {0}, "
+                                 "input was {1}".format(sample.ndim, axis))
+
+        # if we have mixed ndims, then convert to highest ndim
+        # creating column numbers as needed
+        if len(ndims) > 1:
+            current_column = 0
+            max_ndim = sample.ndim
+            self.objs, objs = [], self.objs
+            for obj in objs:
+
+                ndim = obj.ndim
+                if ndim == max_ndim:
+                    pass
+
+                elif ndim != max_ndim - 1:
+                    raise ValueError("cannot concatenate unaligned mixed "
+                                     "dimensional NDFrame objects")
+
+                else:
+                    name = getattr(obj, 'name', None)
+                    if ignore_index or name is None:
+                        name = current_column
+                        current_column += 1
+
+                    # doing a row-wise concatenation so need everything
+                    # to line up
+                    if self._is_frame and axis == 1:
+                        name = 0
+                    obj = sample._constructor({name: obj})
+
+                self.objs.append(obj)
+
+        # note: this is the BlockManager axis (since DataFrame is transposed)
+        self.axis = axis
+        self.join_axes = join_axes
+        self.keys = keys
+        self.names = names or getattr(keys, 'names', None)
+        self.levels = levels
+
+        self.ignore_index = ignore_index
+        self.verify_integrity = verify_integrity
+        self.copy = copy
+
+        self.new_axes = self._get_new_axes()
+
+    def get_result(self):
+
+        # series only
+        if self._is_series:
+
+            # stack blocks
+            if self.axis == 0:
+                # concat only the Series that have length, to preserve
+                # dtype as much as possible
+                non_empties = [x for x in self.objs if len(x) > 0]
+                if len(non_empties) > 0:
+                    values = [x._values for x in non_empties]
+                else:
+                    values = [x._values for x in self.objs]
+                new_data = _concat._concat_compat(values)
+
+                name = com._consensus_name_attr(self.objs)
+                cons = _concat._get_series_result_type(new_data)
+
+                return (cons(new_data, index=self.new_axes[0],
+                             name=name, dtype=new_data.dtype)
+                        .__finalize__(self, method='concat'))
+
+            # combine as columns in a frame
+            else:
+                data = dict(zip(range(len(self.objs)), self.objs))
+                cons = _concat._get_series_result_type(data)
+
+                index, columns = self.new_axes
+                df = cons(data, index=index)
+                df.columns = columns
+                return df.__finalize__(self, method='concat')
+
+        # combine block managers
+        else:
+            mgrs_indexers = []
+            for obj in self.objs:
+                mgr = obj._data
+                indexers = {}
+                for ax, new_labels in enumerate(self.new_axes):
+                    if ax == self.axis:
+                        # Suppress reindexing on concat axis
+                        continue
+
+                    obj_labels = mgr.axes[ax]
+                    if not new_labels.equals(obj_labels):
+                        indexers[ax] = obj_labels.reindex(new_labels)[1]
+
+                mgrs_indexers.append((obj._data, indexers))
+
+            new_data = concatenate_block_managers(
+                mgrs_indexers, self.new_axes, concat_axis=self.axis,
+                copy=self.copy)
+            if not self.copy:
+                new_data._consolidate_inplace()
+
+            cons =
_concat._get_frame_result_type(new_data, self.objs) + return (cons._from_axes(new_data, self.new_axes) + .__finalize__(self, method='concat')) + + def _get_result_dim(self): + if self._is_series and self.axis == 1: + return 2 + else: + return self.objs[0].ndim + + def _get_new_axes(self): + ndim = self._get_result_dim() + new_axes = [None] * ndim + + if self.join_axes is None: + for i in range(ndim): + if i == self.axis: + continue + new_axes[i] = self._get_comb_axis(i) + else: + if len(self.join_axes) != ndim - 1: + raise AssertionError("length of join_axes must not be " + "equal to {0}".format(ndim - 1)) + + # ufff... + indices = compat.lrange(ndim) + indices.remove(self.axis) + + for i, ax in zip(indices, self.join_axes): + new_axes[i] = ax + + new_axes[self.axis] = self._get_concat_axis() + return new_axes + + def _get_comb_axis(self, i): + if self._is_series: + all_indexes = [x.index for x in self.objs] + else: + try: + all_indexes = [x._data.axes[i] for x in self.objs] + except IndexError: + types = [type(x).__name__ for x in self.objs] + raise TypeError("Cannot concatenate list of %s" % types) + + return _get_combined_index(all_indexes, intersect=self.intersect) + + def _get_concat_axis(self): + """ + Return index to be used along concatenation axis. + """ + if self._is_series: + if self.axis == 0: + indexes = [x.index for x in self.objs] + elif self.ignore_index: + idx = com._default_index(len(self.objs)) + return idx + elif self.keys is None: + names = [None] * len(self.objs) + num = 0 + has_names = False + for i, x in enumerate(self.objs): + if not isinstance(x, Series): + raise TypeError("Cannot concatenate type 'Series' " + "with object of type " + "%r" % type(x).__name__) + if x.name is not None: + names[i] = x.name + has_names = True + else: + names[i] = num + num += 1 + if has_names: + return Index(names) + else: + return com._default_index(len(self.objs)) + else: + return _ensure_index(self.keys) + else: + indexes = [x._data.axes[self.axis] for x in self.objs] + + if self.ignore_index: + idx = com._default_index(sum(len(i) for i in indexes)) + return idx + + if self.keys is None: + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, self.keys, + self.levels, self.names) + + self._maybe_check_integrity(concat_axis) + + return concat_axis + + def _maybe_check_integrity(self, concat_index): + if self.verify_integrity: + if not concat_index.is_unique: + overlap = concat_index.get_duplicates() + raise ValueError('Indexes have overlapping values: %s' + % str(overlap)) + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +def _make_concat_multiindex(indexes, keys, levels=None, names=None): + + if ((levels is None and isinstance(keys[0], tuple)) or + (levels is not None and len(levels) > 1)): + zipped = compat.lzip(*keys) + if names is None: + names = [None] * len(zipped) + + if levels is None: + _, levels = _factorize_from_iterables(zipped) + else: + levels = [_ensure_index(x) for x in levels] + else: + zipped = [keys] + if names is None: + names = [None] + + if levels is None: + levels = [_ensure_index(keys)] + else: + levels = [_ensure_index(x) for x in levels] + + if not _all_indexes_same(indexes): + label_list = [] + + # things are potentially different sizes, so compute the exact labels + # for each level and pass those to MultiIndex.from_arrays + + for hlevel, level in zip(zipped, levels): + to_concat = [] + for key, index in zip(hlevel, indexes): + try: + i = level.get_loc(key) + except KeyError: + raise 
ValueError('Key %s not in level %s' + % (str(key), str(level))) + + to_concat.append(np.repeat(i, len(index))) + label_list.append(np.concatenate(to_concat)) + + concat_index = _concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + levels.extend(concat_index.levels) + label_list.extend(concat_index.labels) + else: + codes, categories = _factorize_from_iterable(concat_index) + levels.append(categories) + label_list.append(codes) + + if len(names) == len(levels): + names = list(names) + else: + # make sure that all of the passed indices have the same nlevels + if not len(set([idx.nlevels for idx in indexes])) == 1: + raise AssertionError("Cannot concat indices that do" + " not have the same number of levels") + + # also copies + names = names + _get_consensus_names(indexes) + + return MultiIndex(levels=levels, labels=label_list, names=names, + verify_integrity=False) + + new_index = indexes[0] + n = len(new_index) + kpieces = len(indexes) + + # also copies + new_names = list(names) + new_levels = list(levels) + + # construct labels + new_labels = [] + + # do something a bit more speedy + + for hlevel, level in zip(zipped, levels): + hlevel = _ensure_index(hlevel) + mapped = level.get_indexer(hlevel) + + mask = mapped == -1 + if mask.any(): + raise ValueError('Values not found in passed level: %s' + % str(hlevel[mask])) + + new_labels.append(np.repeat(mapped, n)) + + if isinstance(new_index, MultiIndex): + new_levels.extend(new_index.levels) + new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) + else: + new_levels.append(new_index) + new_labels.append(np.tile(np.arange(n), kpieces)) + + if len(new_names) < len(new_levels): + new_names.extend(new_index.names) + + return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, + verify_integrity=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3fbd83a6f3245..d938c2eeacbef 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -4,19 +4,16 @@ import copy import warnings - import string import numpy as np -from pandas.compat import range, lrange, lzip, zip, map, filter +from pandas.compat import range, lzip, zip, map, filter import pandas.compat as compat -from pandas import (Categorical, DataFrame, Series, +import pandas as pd +from pandas import (Categorical, Series, DataFrame, Index, MultiIndex, Timedelta) -from pandas.core.categorical import (_factorize_from_iterable, - _factorize_from_iterables) from pandas.core.frame import _merge_doc -from pandas.types.generic import ABCSeries from pandas.types.common import (is_datetime64tz_dtype, is_datetime64_dtype, needs_i8_conversion, @@ -33,23 +30,31 @@ _ensure_object, _get_dtype) from pandas.types.missing import na_value_for_dtype - -from pandas.core.generic import NDFrame -from pandas.core.index import (_get_combined_index, - _ensure_index, _get_consensus_names, - _all_indexes_same) from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util.decorators import Appender, Substitution import pandas.core.algorithms as algos import pandas.core.common as com -import pandas.types.concat as _concat import pandas._join as _join import pandas.hashtable as _hash +# back-compat of pseudo-public API +def concat_wrap(): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.tools.merge.concat is deprecated. 
" + "import from the public API: " + "pandas.concat instead", + FutureWarning, stacklevel=3) + return pd.concat(*args, **kwargs) + return wrapper + +concat = concat_wrap() + + @Substitution('\nleft : DataFrame') @Appender(_merge_doc, indents=0) def merge(left, right, how='inner', on=None, left_on=None, right_on=None, @@ -139,6 +144,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, # preserve the original order # if we have a missing piece this can be reset + from pandas.tools.concat import concat result = concat(pieces, ignore_index=True) result = result.reindex(columns=pieces[0].columns, copy=False) return result, lby @@ -793,9 +799,9 @@ def _get_merge_keys(self): left, right = self.left, self.right is_lkey = lambda x: isinstance( - x, (np.ndarray, ABCSeries)) and len(x) == len(left) + x, (np.ndarray, Series)) and len(x) == len(left) is_rkey = lambda x: isinstance( - x, (np.ndarray, ABCSeries)) and len(x) == len(right) + x, (np.ndarray, Series)) and len(x) == len(right) # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A # user could, for example, request 'left_index' and 'left_by'. In a @@ -1419,606 +1425,6 @@ def _get_join_keys(llab, rlab, shape, sort): return _get_join_keys(llab, rlab, shape, sort) -# --------------------------------------------------------------------- -# Concatenate DataFrame objects - - -def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, - copy=True): - """ - Concatenate pandas objects along a particular axis with optional set logic - along the other axes. - - Can also add a layer of hierarchical indexing on the concatenation axis, - which may be useful if the labels are the same (or overlapping) on - the passed axis number. - - Parameters - ---------- - objs : a sequence or mapping of Series, DataFrame, or Panel objects - If a dict is passed, the sorted keys will be used as the `keys` - argument, unless it is passed, in which case the values will be - selected (see below). Any None objects will be dropped silently unless - they are all None in which case a ValueError will be raised - axis : {0/'index', 1/'columns'}, default 0 - The axis to concatenate along - join : {'inner', 'outer'}, default 'outer' - How to handle indexes on other axis(es) - join_axes : list of Index objects - Specific indexes to use for the other n - 1 axes instead of performing - inner/outer set logic - ignore_index : boolean, default False - If True, do not use the index values along the concatenation axis. The - resulting axis will be labeled 0, ..., n - 1. This is useful if you are - concatenating objects where the concatenation axis does not have - meaningful indexing information. Note the index values on the other - axes are still respected in the join. - keys : sequence, default None - If multiple levels passed, should contain tuples. Construct - hierarchical index using the passed keys as the outermost level - levels : list of sequences, default None - Specific levels (unique values) to use for constructing a - MultiIndex. Otherwise they will be inferred from the keys - names : list, default None - Names for the levels in the resulting hierarchical index - verify_integrity : boolean, default False - Check whether the new concatenated axis contains duplicates. 
This can - be very expensive relative to the actual data concatenation - copy : boolean, default True - If False, do not copy data unnecessarily - - Returns - ------- - concatenated : type of objects - - Notes - ----- - The keys, levels, and names arguments are all optional. - - A walkthrough of how this method fits in with other tools for combining - panda objects can be found `here - `__. - - See Also - -------- - Series.append - DataFrame.append - DataFrame.join - DataFrame.merge - - Examples - -------- - Combine two ``Series``. - - >>> s1 = pd.Series(['a', 'b']) - >>> s2 = pd.Series(['c', 'd']) - >>> pd.concat([s1, s2]) - 0 a - 1 b - 0 c - 1 d - dtype: object - - Clear the existing index and reset it in the result - by setting the ``ignore_index`` option to ``True``. - - >>> pd.concat([s1, s2], ignore_index=True) - 0 a - 1 b - 2 c - 3 d - dtype: object - - Add a hierarchical index at the outermost level of - the data with the ``keys`` option. - - >>> pd.concat([s1, s2], keys=['s1', 's2',]) - s1 0 a - 1 b - s2 0 c - 1 d - dtype: object - - Label the index keys you create with the ``names`` option. - - >>> pd.concat([s1, s2], keys=['s1', 's2'], - ... names=['Series name', 'Row ID']) - Series name Row ID - s1 0 a - 1 b - s2 0 c - 1 d - dtype: object - - Combine two ``DataFrame`` objects with identical columns. - - >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], - ... columns=['letter', 'number']) - >>> df1 - letter number - 0 a 1 - 1 b 2 - >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], - ... columns=['letter', 'number']) - >>> df2 - letter number - 0 c 3 - 1 d 4 - >>> pd.concat([df1, df2]) - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - Combine ``DataFrame`` objects with overlapping columns - and return everything. Columns outside the intersection will - be filled with ``NaN`` values. - - >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], - ... columns=['letter', 'number', 'animal']) - >>> df3 - letter number animal - 0 c 3 cat - 1 d 4 dog - >>> pd.concat([df1, df3]) - animal letter number - 0 NaN a 1 - 1 NaN b 2 - 0 cat c 3 - 1 dog d 4 - - Combine ``DataFrame`` objects with overlapping columns - and return only those that are shared by passing ``inner`` to - the ``join`` keyword argument. - - >>> pd.concat([df1, df3], join="inner") - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - Combine ``DataFrame`` objects horizontally along the x axis by - passing in ``axis=1``. - - >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], - ... columns=['animal', 'name']) - >>> pd.concat([df1, df4], axis=1) - letter number animal name - 0 a 1 bird polly - 1 b 2 monkey george - - Prevent the result from including duplicate index values with the - ``verify_integrity`` option. 
- - >>> df5 = pd.DataFrame([1], index=['a']) - >>> df5 - 0 - a 1 - >>> df6 = pd.DataFrame([2], index=['a']) - >>> df6 - 0 - a 2 - >>> pd.concat([df5, df6], verify_integrity=True) - ValueError: Indexes have overlapping values: ['a'] - """ - op = _Concatenator(objs, axis=axis, join_axes=join_axes, - ignore_index=ignore_index, join=join, - keys=keys, levels=levels, names=names, - verify_integrity=verify_integrity, - copy=copy) - return op.get_result() - - -class _Concatenator(object): - """ - Orchestrates a concatenation operation for BlockManagers - """ - - def __init__(self, objs, axis=0, join='outer', join_axes=None, - keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): - if isinstance(objs, (NDFrame, compat.string_types)): - raise TypeError('first argument must be an iterable of pandas ' - 'objects, you passed an object of type ' - '"{0}"'.format(type(objs).__name__)) - - if join == 'outer': - self.intersect = False - elif join == 'inner': - self.intersect = True - else: # pragma: no cover - raise ValueError('Only can inner (intersect) or outer (union) ' - 'join the other axis') - - if isinstance(objs, dict): - if keys is None: - keys = sorted(objs) - objs = [objs[k] for k in keys] - else: - objs = list(objs) - - if len(objs) == 0: - raise ValueError('No objects to concatenate') - - if keys is None: - objs = [obj for obj in objs if obj is not None] - else: - # #1649 - clean_keys = [] - clean_objs = [] - for k, v in zip(keys, objs): - if v is None: - continue - clean_keys.append(k) - clean_objs.append(v) - objs = clean_objs - name = getattr(keys, 'name', None) - keys = Index(clean_keys, name=name) - - if len(objs) == 0: - raise ValueError('All objects passed were None') - - # consolidate data & figure out what our result ndim is going to be - ndims = set() - for obj in objs: - if not isinstance(obj, NDFrame): - raise TypeError("cannot concatenate a non-NDFrame object") - - # consolidate - obj.consolidate(inplace=True) - ndims.add(obj.ndim) - - # get the sample - # want the higest ndim that we have, and must be non-empty - # unless all objs are empty - sample = None - if len(ndims) > 1: - max_ndim = max(ndims) - for obj in objs: - if obj.ndim == max_ndim and np.sum(obj.shape): - sample = obj - break - - else: - # filter out the empties if we have not multi-index possibiltes - # note to keep empty Series as it affect to result columns / name - non_empties = [obj for obj in objs - if sum(obj.shape) > 0 or isinstance(obj, Series)] - - if (len(non_empties) and (keys is None and names is None and - levels is None and join_axes is None)): - objs = non_empties - sample = objs[0] - - if sample is None: - sample = objs[0] - self.objs = objs - - # Standardize axis parameter to int - if isinstance(sample, Series): - axis = DataFrame()._get_axis_number(axis) - else: - axis = sample._get_axis_number(axis) - - # Need to flip BlockManager axis in the DataFrame special case - self._is_frame = isinstance(sample, DataFrame) - if self._is_frame: - axis = 1 if axis == 0 else 0 - - self._is_series = isinstance(sample, ABCSeries) - if not 0 <= axis <= sample.ndim: - raise AssertionError("axis must be between 0 and {0}, " - "input was {1}".format(sample.ndim, axis)) - - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - if len(ndims) > 1: - current_column = 0 - max_ndim = sample.ndim - self.objs, objs = [], self.objs - for obj in objs: - - ndim = obj.ndim - if ndim == max_ndim: - pass - - elif ndim != max_ndim - 1: - raise 
ValueError("cannot concatenate unaligned mixed " - "dimensional NDFrame objects") - - else: - name = getattr(obj, 'name', None) - if ignore_index or name is None: - name = current_column - current_column += 1 - - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 - obj = sample._constructor({name: obj}) - - self.objs.append(obj) - - # note: this is the BlockManager axis (since DataFrame is transposed) - self.axis = axis - self.join_axes = join_axes - self.keys = keys - self.names = names or getattr(keys, 'names', None) - self.levels = levels - - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity - self.copy = copy - - self.new_axes = self._get_new_axes() - - def get_result(self): - - # series only - if self._is_series: - - # stack blocks - if self.axis == 0: - # concat Series with length to keep dtype as much - non_empties = [x for x in self.objs if len(x) > 0] - if len(non_empties) > 0: - values = [x._values for x in non_empties] - else: - values = [x._values for x in self.objs] - new_data = _concat._concat_compat(values) - - name = com._consensus_name_attr(self.objs) - cons = _concat._get_series_result_type(new_data) - - return (cons(new_data, index=self.new_axes[0], - name=name, dtype=new_data.dtype) - .__finalize__(self, method='concat')) - - # combine as columns in a frame - else: - data = dict(zip(range(len(self.objs)), self.objs)) - cons = _concat._get_series_result_type(data) - - index, columns = self.new_axes - df = cons(data, index=index) - df.columns = columns - return df.__finalize__(self, method='concat') - - # combine block managers - else: - mgrs_indexers = [] - for obj in self.objs: - mgr = obj._data - indexers = {} - for ax, new_labels in enumerate(self.new_axes): - if ax == self.axis: - # Suppress reindexing on concat axis - continue - - obj_labels = mgr.axes[ax] - if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.reindex(new_labels)[1] - - mgrs_indexers.append((obj._data, indexers)) - - new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, - copy=self.copy) - if not self.copy: - new_data._consolidate_inplace() - - cons = _concat._get_frame_result_type(new_data, self.objs) - return (cons._from_axes(new_data, self.new_axes) - .__finalize__(self, method='concat')) - - def _get_result_dim(self): - if self._is_series and self.axis == 1: - return 2 - else: - return self.objs[0].ndim - - def _get_new_axes(self): - ndim = self._get_result_dim() - new_axes = [None] * ndim - - if self.join_axes is None: - for i in range(ndim): - if i == self.axis: - continue - new_axes[i] = self._get_comb_axis(i) - else: - if len(self.join_axes) != ndim - 1: - raise AssertionError("length of join_axes must not be " - "equal to {0}".format(ndim - 1)) - - # ufff... - indices = lrange(ndim) - indices.remove(self.axis) - - for i, ax in zip(indices, self.join_axes): - new_axes[i] = ax - - new_axes[self.axis] = self._get_concat_axis() - return new_axes - - def _get_comb_axis(self, i): - if self._is_series: - all_indexes = [x.index for x in self.objs] - else: - try: - all_indexes = [x._data.axes[i] for x in self.objs] - except IndexError: - types = [type(x).__name__ for x in self.objs] - raise TypeError("Cannot concatenate list of %s" % types) - - return _get_combined_index(all_indexes, intersect=self.intersect) - - def _get_concat_axis(self): - """ - Return index to be used along concatenation axis. 
- """ - if self._is_series: - if self.axis == 0: - indexes = [x.index for x in self.objs] - elif self.ignore_index: - idx = com._default_index(len(self.objs)) - return idx - elif self.keys is None: - names = [None] * len(self.objs) - num = 0 - has_names = False - for i, x in enumerate(self.objs): - if not isinstance(x, Series): - raise TypeError("Cannot concatenate type 'Series' " - "with object of type " - "%r" % type(x).__name__) - if x.name is not None: - names[i] = x.name - has_names = True - else: - names[i] = num - num += 1 - if has_names: - return Index(names) - else: - return com._default_index(len(self.objs)) - else: - return _ensure_index(self.keys) - else: - indexes = [x._data.axes[self.axis] for x in self.objs] - - if self.ignore_index: - idx = com._default_index(sum(len(i) for i in indexes)) - return idx - - if self.keys is None: - concat_axis = _concat_indexes(indexes) - else: - concat_axis = _make_concat_multiindex(indexes, self.keys, - self.levels, self.names) - - self._maybe_check_integrity(concat_axis) - - return concat_axis - - def _maybe_check_integrity(self, concat_index): - if self.verify_integrity: - if not concat_index.is_unique: - overlap = concat_index.get_duplicates() - raise ValueError('Indexes have overlapping values: %s' - % str(overlap)) - - -def _concat_indexes(indexes): - return indexes[0].append(indexes[1:]) - - -def _make_concat_multiindex(indexes, keys, levels=None, names=None): - - if ((levels is None and isinstance(keys[0], tuple)) or - (levels is not None and len(levels) > 1)): - zipped = lzip(*keys) - if names is None: - names = [None] * len(zipped) - - if levels is None: - _, levels = _factorize_from_iterables(zipped) - else: - levels = [_ensure_index(x) for x in levels] - else: - zipped = [keys] - if names is None: - names = [None] - - if levels is None: - levels = [_ensure_index(keys)] - else: - levels = [_ensure_index(x) for x in levels] - - if not _all_indexes_same(indexes): - label_list = [] - - # things are potentially different sizes, so compute the exact labels - # for each level and pass those to MultiIndex.from_arrays - - for hlevel, level in zip(zipped, levels): - to_concat = [] - for key, index in zip(hlevel, indexes): - try: - i = level.get_loc(key) - except KeyError: - raise ValueError('Key %s not in level %s' - % (str(key), str(level))) - - to_concat.append(np.repeat(i, len(index))) - label_list.append(np.concatenate(to_concat)) - - concat_index = _concat_indexes(indexes) - - # these go at the end - if isinstance(concat_index, MultiIndex): - levels.extend(concat_index.levels) - label_list.extend(concat_index.labels) - else: - codes, categories = _factorize_from_iterable(concat_index) - levels.append(categories) - label_list.append(codes) - - if len(names) == len(levels): - names = list(names) - else: - # make sure that all of the passed indices have the same nlevels - if not len(set([idx.nlevels for idx in indexes])) == 1: - raise AssertionError("Cannot concat indices that do" - " not have the same number of levels") - - # also copies - names = names + _get_consensus_names(indexes) - - return MultiIndex(levels=levels, labels=label_list, names=names, - verify_integrity=False) - - new_index = indexes[0] - n = len(new_index) - kpieces = len(indexes) - - # also copies - new_names = list(names) - new_levels = list(levels) - - # construct labels - new_labels = [] - - # do something a bit more speedy - - for hlevel, level in zip(zipped, levels): - hlevel = _ensure_index(hlevel) - mapped = level.get_indexer(hlevel) - - mask = mapped == -1 - 
if mask.any(): - raise ValueError('Values not found in passed level: %s' - % str(hlevel[mask])) - - new_labels.append(np.repeat(mapped, n)) - - if isinstance(new_index, MultiIndex): - new_levels.extend(new_index.levels) - new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) - else: - new_levels.append(new_index) - new_labels.append(np.tile(np.arange(n), kpieces)) - - if len(new_names) < len(new_levels): - new_names.extend(new_index.names) - - return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, - verify_integrity=False) - def _should_fill(lname, rname): if (not isinstance(lname, compat.string_types) or diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 01eefe5f07173..41fc705691a96 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -2,10 +2,8 @@ from pandas.types.common import is_list_like, is_scalar -from pandas import Series, DataFrame -from pandas.core.index import MultiIndex, Index +from pandas import Series, DataFrame, MultiIndex, Index, concat from pandas.core.groupby import Grouper -from pandas.tools.merge import concat from pandas.tools.util import cartesian_product from pandas.compat import range, lrange, zip from pandas import compat diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index ee70515850b25..0b1ced97d2b81 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -3135,7 +3135,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: - from pandas.tools.merge import concat + from pandas.tools.concat import concat keys, frames = zip(*grouped) if grouped.axis == 0: df = concat(frames, keys=keys, axis=1) diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index ff0a494bd7d02..fe5821a637205 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -6,9 +6,8 @@ import pandas as pd from pandas.compat import lrange import pandas.compat as compat -from pandas.tools.merge import merge, concat from pandas.util.testing import assert_frame_equal -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, MultiIndex, Series, merge, concat import pandas._join as _join import pandas.util.testing as tm diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index a348a901442c9..d66cd793ec0be 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -8,7 +8,8 @@ import pandas as pd from pandas.compat import lrange, lzip -from pandas.tools.merge import merge, concat, MergeError +from pandas.tools.concat import concat +from pandas.tools.merge import merge, MergeError from pandas.util.testing import (assert_frame_equal, assert_series_equal, slow) diff --git a/pandas/tools/tests/test_merge_ordered.py b/pandas/tools/tests/test_merge_ordered.py index e08cc98e50794..e4a41ea9a28eb 100644 --- a/pandas/tools/tests/test_merge_ordered.py +++ b/pandas/tools/tests/test_merge_ordered.py @@ -40,10 +40,8 @@ def test_ffill(self): def test_multigroup(self): left = pd.concat([self.left, self.left], ignore_index=True) - # right = concat([self.right, self.right], ignore_index=True) left['group'] = ['a'] * 3 + ['b'] * 3 - # right['group'] = ['a'] * 4 + ['b'] * 4 result = merge_ordered(left, self.right, on='key', left_by='group', fill_method='ffill') diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 7f2bb7e724362..f5d91d0088306 100644 --- 
a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd -from pandas import DataFrame, Series, Index, MultiIndex, Grouper, date_range -from pandas.tools.merge import concat +from pandas import (DataFrame, Series, Index, MultiIndex, + Grouper, date_range, concat) from pandas.tools.pivot import pivot_table, crosstab from pandas.compat import range, product import pandas.util.testing as tm From 89a4e925b78fb51c81f67acd9cd14cf4bd671125 Mon Sep 17 00:00:00 2001 From: TrigonaMinima Date: Fri, 10 Feb 2017 00:33:21 +0530 Subject: [PATCH 120/338] TST: Remaining tseries tests reorg closes #14854 closes #15359 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/indexes/datetimes/test_ops.py | 14 +- pandas/tests/indexes/datetimes/test_setops.py | 10 +- .../tests => tests/tseries}/__init__.py | 0 .../tseries}/data/cday-0.14.1.pickle | Bin .../tseries}/data/dateoffset_0_15_2.pickle | 0 .../tseries}/test_bin_groupby.py | 0 .../tests => tests/tseries}/test_converter.py | 0 .../tseries}/test_frequencies.py | 0 .../tests => tests/tseries}/test_holiday.py | 0 .../tests => tests/tseries}/test_offsets.py | 0 .../tests => tests/tseries}/test_resample.py | 0 .../tests => tests/tseries}/test_timezones.py | 0 .../tseries/tests/data/daterange_073.pickle | Bin 650 -> 0 bytes pandas/tseries/tests/data/frame.pickle | Bin 1182 -> 0 bytes pandas/tseries/tests/data/series.pickle | Bin 646 -> 0 bytes .../tests/data/series_daterange0.pickle | Bin 357 -> 0 bytes .../tseries/tests/test_timeseries_legacy.py | 219 ------------------ setup.py | 5 +- 19 files changed, 25 insertions(+), 224 deletions(-) rename pandas/{tseries/tests => tests/tseries}/__init__.py (100%) rename pandas/{tseries/tests => tests/tseries}/data/cday-0.14.1.pickle (100%) rename pandas/{tseries/tests => tests/tseries}/data/dateoffset_0_15_2.pickle (100%) rename pandas/{tseries/tests => tests/tseries}/test_bin_groupby.py (100%) rename pandas/{tseries/tests => tests/tseries}/test_converter.py (100%) rename pandas/{tseries/tests => tests/tseries}/test_frequencies.py (100%) rename pandas/{tseries/tests => tests/tseries}/test_holiday.py (100%) rename pandas/{tseries/tests => tests/tseries}/test_offsets.py (100%) rename pandas/{tseries/tests => tests/tseries}/test_resample.py (100%) rename pandas/{tseries/tests => tests/tseries}/test_timezones.py (100%) delete mode 100644 pandas/tseries/tests/data/daterange_073.pickle delete mode 100644 pandas/tseries/tests/data/frame.pickle delete mode 100644 pandas/tseries/tests/data/series.pickle delete mode 100644 pandas/tseries/tests/data/series_daterange0.pickle delete mode 100644 pandas/tseries/tests/test_timeseries_legacy.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2279d0464a5c7..17ce4517035a7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -371,6 +371,7 @@ Other API Changes - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. 
Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`). +- Reorganization of timeseries development tests (:issue:`14854`) .. _whatsnew_0200.deprecations: diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 63bf07ec041d3..7a5ce3a44681b 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -7,10 +7,12 @@ import pandas.util.testing as tm from pandas.core.common import PerformanceWarning from pandas.tseries.index import cdate_range +from pandas.tseries.frequencies import get_offset, to_offset from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, date_range, TimedeltaIndex, _np_version_under1p10, Index, datetime, Float64Index, offsets, bdate_range) -from pandas.tseries.offsets import BMonthEnd, CDay, BDay +from pandas.tseries.offsets import (BMonthEnd, CDay, BDay, Milli, MonthBegin, + Micro) from pandas.tests.test_base import Ops @@ -911,6 +913,16 @@ def test_equals(self): self.assertFalse(idx.equals(list(idx3))) self.assertFalse(idx.equals(pd.Series(idx3))) + def test_ms_vs_MS(self): + left = get_offset('ms') + right = get_offset('MS') + self.assertEqual(left, Milli()) + self.assertEqual(right, MonthBegin()) + + def test_rule_aliases(self): + rule = to_offset('10us') + self.assertEqual(rule, Micro(10)) + class TestDateTimeIndexToJulianDate(tm.TestCase): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 7da660a956e23..8d05a4016ba45 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -6,7 +6,7 @@ import pandas.util.testing as tm from pandas.tseries.index import cdate_range from pandas import (DatetimeIndex, date_range, Series, bdate_range, DataFrame, - Int64Index, Index) + Int64Index, Index, to_datetime) from pandas.tseries.offsets import Minute, BMonthEnd, MonthEnd START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) @@ -190,6 +190,14 @@ def test_datetimeindex_union_join_empty(self): result = dti.join(empty) tm.assertIsInstance(result, DatetimeIndex) + def test_join_nonunique(self): + idx1 = to_datetime(['2012-11-06 16:00:11.477563', + '2012-11-06 16:00:11.477563']) + idx2 = to_datetime(['2012-11-06 15:11:09.006507', + '2012-11-06 15:11:09.006507']) + rs = idx1.join(idx2, how='outer') + self.assertTrue(rs.is_monotonic) + class TestBusinessDatetimeIndex(tm.TestCase): diff --git a/pandas/tseries/tests/__init__.py b/pandas/tests/tseries/__init__.py similarity index 100% rename from pandas/tseries/tests/__init__.py rename to pandas/tests/tseries/__init__.py diff --git a/pandas/tseries/tests/data/cday-0.14.1.pickle b/pandas/tests/tseries/data/cday-0.14.1.pickle similarity index 100% rename from pandas/tseries/tests/data/cday-0.14.1.pickle rename to pandas/tests/tseries/data/cday-0.14.1.pickle diff --git a/pandas/tseries/tests/data/dateoffset_0_15_2.pickle b/pandas/tests/tseries/data/dateoffset_0_15_2.pickle similarity index 100% rename from pandas/tseries/tests/data/dateoffset_0_15_2.pickle rename to pandas/tests/tseries/data/dateoffset_0_15_2.pickle diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tests/tseries/test_bin_groupby.py similarity index 100% rename from pandas/tseries/tests/test_bin_groupby.py rename to pandas/tests/tseries/test_bin_groupby.py diff --git a/pandas/tseries/tests/test_converter.py 
b/pandas/tests/tseries/test_converter.py similarity index 100% rename from pandas/tseries/tests/test_converter.py rename to pandas/tests/tseries/test_converter.py diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py similarity index 100% rename from pandas/tseries/tests/test_frequencies.py rename to pandas/tests/tseries/test_frequencies.py diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tests/tseries/test_holiday.py similarity index 100% rename from pandas/tseries/tests/test_holiday.py rename to pandas/tests/tseries/test_holiday.py diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tests/tseries/test_offsets.py similarity index 100% rename from pandas/tseries/tests/test_offsets.py rename to pandas/tests/tseries/test_offsets.py diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tests/tseries/test_resample.py similarity index 100% rename from pandas/tseries/tests/test_resample.py rename to pandas/tests/tseries/test_resample.py diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tests/tseries/test_timezones.py similarity index 100% rename from pandas/tseries/tests/test_timezones.py rename to pandas/tests/tseries/test_timezones.py diff --git a/pandas/tseries/tests/data/daterange_073.pickle b/pandas/tseries/tests/data/daterange_073.pickle deleted file mode 100644 index 0214a023e6338dce54e6daf8b3d94a7275baca66..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 650 zcmZY7OH0E*5C`z4PrH3+U)q<}*CAed_9jSAE<`BoQDl>BZ848-vOy{q^blLWo!>~) zb{G%N!ZQ3A=JKESwB<$ad@;2AKn&f;Q8OL{d_f)qVfkLDg2+-tYSx^4HV=1WHdi9x z-jg7sq#JKLnWm|jY36DyGdk61Gu|yGwpz>uky)0$zosdwB?CE~W|;P77{=XCQrnN- zDD&$<=5=ecUCmrUu#p8u3g22LwXJw8_oh3^q7*@LCj=6X`nPgnkX%h7Rn(=8|4V3gVF}+qI5udC|!^~N>3;w{`{A; z@_i>Hx1;1JWdG_z9xvsI&WfHNxZIh&3OQJ_yg!+QLdny=^fnRN!cm;avn2QACCQ(& Y?DLBq%8RAEoDS9@(>$t0rm-@Izk(m6nE(I) diff --git a/pandas/tseries/tests/data/frame.pickle b/pandas/tseries/tests/data/frame.pickle deleted file mode 100644 index b3b100fb43022faf7bd0b949238988afc7de53bf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1182 zcmZo*N-jvuOGzx&OU^G!)k`Z%%uVHTNi0cp1G5SlH5&_2m1O3Xq!#5R<`i=|<>V)4 z`zGckrl%Hh6*2==vcwj$Y9!~C<`z^!%*!p!DalMMDoU*6iZ4n{&d)0@DJo4a;VNW9 zu{JX=CAEUfGq1$V#1qUcWcOxh4P{Jf4=Uu)@MiR8ZH1W1l~Ph!kjhoa8OoGt;mzR9 z2voqO;msV%XyfPS=k*^5z=StLNm6I11_Kl@LTM%_%?zbkpmd2}YgY7m%J$PITE56D?utru>CGx=D$H7fJyWf^==6j7BJDUVc$-VoqjNYN2dL zC|iD7T5)Pgp&TM4K*5ocnp2XJQig0taVTS+H)Cm% zUwcw&Y@sqRmcZ$Y3z%rZ>8el#9w(~cq~guh28xw5SgfewNFN;`6M*TW;fH?PbI(qD zpGB$Jd180$pY_~~+OhDdefBm(`J$sw?Iq%0oobwM+g|&L=xlec2ljnYDGD!n-q>g8 zUwoA0f6#t{RL7q$(nsuHeSXyJ?tR(*#=mDLA1WNNXXv=rfBD92`>Bn3vYT!_x3`GD za8a>$i~TLpIhKblp4tZ;l`Kf#cV$0_Toy_unf!A`-q_VZL0AG-eV zsr|Adk2*u+Yxb8^`t(}$F4?E}MdinQzi+=`r_oc77x(PTws(Kna^b%HBhDjt=Q_T$ z&pcMAp>pVf{gs02oSzq6vX}X#bi8QMeftj!9HuX9TW+tmz5In8^E>;!XC?%#<#=n~ z!1O`X*y@NqU;2%8|888h*FUt3f9K|R_OmDVNaSvOY(Fu@LfxqNfxVtm!`}<-H-Wh& PF}6@WgCns$DM=3iVMVf9 diff --git a/pandas/tseries/tests/data/series.pickle b/pandas/tseries/tests/data/series.pickle deleted file mode 100644 index 307a4ac26517384a8267b23a8891e4cf919a20d8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 646 zcmZo*O3o|IEvVE>&M!*U%Pq|*$xJLNO049HFG@|$&nqq|DorloDr8J9NX$z~EQTm6 zPA$qzE#?Zz%uNl3FbkQy8CpXbliGs{nKir_y}4Q;#&V^UR2HOi6|#gfrCE40cryYO zuxfZShcepu`T2SM2LdqR%}|om85;0gre(H&?L+&pe^2_|L|)soNw4(YaOa)<>%zvK zYfcaC6He?G)>?kezSjQ7#_DSi>^Cm|Q~hw#b9(`k2lLpcEV8#d5t>_a^o4!SJ<&Jf 
z{KxD|GEg0!l30>jl$e*E%H;xN1%X+GY;dQuL!6!gbge(kwH#pA)}Xr99_ZTGLQaij zkbxz@VBmr?3b{hL*sn4&Gk`&BP$72)M1%z{!UGjyg^Tb)McCjXd{7Z~xClQ~gbOYr z02SeeiwHtRc;F&JP!V3Zh%i)y4=y5-TH@E*h7!YI@8sv_6mvPb024!@sAglKSZ$%W zMkr@qeo<~>PG(hVp+rY0TYg$vacW7SBqAh0!I6@hQ!OOZ59U9nA?q0TAECTVJ&E1=s@p_O5Y99O<1%Q zB;BptG-d=J1D0mc)|$GRnB*5MVgwiVVm1}zl5>9Zji(6111*Nz)~! zRiE7aziNM|S|k1TkABe8-^TPSq(1=Pb|Z~nI#^obZBbmI= 3") - - pth, _ = os.path.split(os.path.abspath(__file__)) - filepath = os.path.join(pth, 'data', 'frame.pickle') - - with open(filepath, 'rb') as f: - cls.frame = pickle.load(f) - - filepath = os.path.join(pth, 'data', 'series.pickle') - with open(filepath, 'rb') as f: - cls.series = pickle.load(f) - - def test_pass_offset_warn(self): - buf = StringIO() - - sys.stderr = buf - DatetimeIndex(start='1/1/2000', periods=10, offset='H') - sys.stderr = sys.__stderr__ - - def test_unpickle_legacy_frame(self): - dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', - freq=BDay(1)) - - unpickled = self.frame - - self.assertEqual(type(unpickled.index), DatetimeIndex) - self.assertEqual(len(unpickled), 10) - self.assertTrue((unpickled.columns == Int64Index(np.arange(5))).all()) - self.assertTrue((unpickled.index == dtindex).all()) - self.assertEqual(unpickled.index.offset, BDay(1, normalize=True)) - - def test_unpickle_legacy_series(self): - unpickled = self.series - - dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', - freq=BDay(1)) - - self.assertEqual(type(unpickled.index), DatetimeIndex) - self.assertEqual(len(unpickled), 10) - self.assertTrue((unpickled.index == dtindex).all()) - self.assertEqual(unpickled.index.offset, BDay(1, normalize=True)) - - def test_unpickle_legacy_len0_daterange(self): - pth, _ = os.path.split(os.path.abspath(__file__)) - filepath = os.path.join(pth, 'data', 'series_daterange0.pickle') - - result = pd.read_pickle(filepath) - - ex_index = DatetimeIndex([], freq='B') - - self.assert_index_equal(result.index, ex_index) - tm.assertIsInstance(result.index.freq, BDay) - self.assertEqual(len(result), 0) - - def test_arithmetic_interaction(self): - index = self.frame.index - obj_index = index.asobject - - dseries = Series(rand(len(index)), index=index) - oseries = Series(dseries.values, index=obj_index) - - result = dseries + oseries - expected = dseries * 2 - tm.assertIsInstance(result.index, DatetimeIndex) - assert_series_equal(result, expected) - - result = dseries + oseries[:5] - expected = dseries + dseries[:5] - tm.assertIsInstance(result.index, DatetimeIndex) - assert_series_equal(result, expected) - - def test_join_interaction(self): - index = self.frame.index - obj_index = index.asobject - - def _check_join(left, right, how='inner'): - ra, rb, rc = left.join(right, how=how, return_indexers=True) - ea, eb, ec = left.join(DatetimeIndex(right), how=how, - return_indexers=True) - - tm.assertIsInstance(ra, DatetimeIndex) - self.assert_index_equal(ra, ea) - - assert_almost_equal(rb, eb) - assert_almost_equal(rc, ec) - - _check_join(index[:15], obj_index[5:], how='inner') - _check_join(index[:15], obj_index[5:], how='outer') - _check_join(index[:15], obj_index[5:], how='right') - _check_join(index[:15], obj_index[5:], how='left') - - def test_join_nonunique(self): - idx1 = to_datetime(['2012-11-06 16:00:11.477563', - '2012-11-06 16:00:11.477563']) - idx2 = to_datetime(['2012-11-06 15:11:09.006507', - '2012-11-06 15:11:09.006507']) - rs = idx1.join(idx2, how='outer') - self.assertTrue(rs.is_monotonic) - - def test_unpickle_daterange(self): - pth, _ = 
os.path.split(os.path.abspath(__file__)) - filepath = os.path.join(pth, 'data', 'daterange_073.pickle') - - rng = read_pickle(filepath) - tm.assertIsInstance(rng[0], datetime) - tm.assertIsInstance(rng.offset, BDay) - self.assertEqual(rng.values.dtype, object) - - def test_setops(self): - index = self.frame.index - obj_index = index.asobject - - result = index[:5].union(obj_index[5:]) - expected = index - tm.assertIsInstance(result, DatetimeIndex) - self.assert_index_equal(result, expected) - - result = index[:10].intersection(obj_index[5:]) - expected = index[5:10] - tm.assertIsInstance(result, DatetimeIndex) - self.assert_index_equal(result, expected) - - result = index[:10] - obj_index[5:] - expected = index[:5] - tm.assertIsInstance(result, DatetimeIndex) - self.assert_index_equal(result, expected) - - def test_index_conversion(self): - index = self.frame.index - obj_index = index.asobject - - conv = DatetimeIndex(obj_index) - self.assert_index_equal(conv, index) - - self.assertRaises(ValueError, DatetimeIndex, ['a', 'b', 'c', 'd']) - - def test_tolist(self): - rng = date_range('1/1/2000', periods=10) - - result = rng.tolist() - tm.assertIsInstance(result[0], Timestamp) - - def test_object_convert_fail(self): - idx = DatetimeIndex([np.NaT]) - self.assertRaises(ValueError, idx.astype, 'O') - - def test_setops_conversion_fail(self): - index = self.frame.index - - right = Index(['a', 'b', 'c', 'd']) - - result = index.union(right) - expected = Index(np.concatenate([index.asobject, right])) - self.assert_index_equal(result, expected) - - result = index.intersection(right) - expected = Index([]) - self.assert_index_equal(result, expected) - - def test_legacy_time_rules(self): - rules = [('WEEKDAY', 'B'), ('EOM', 'BM'), ('W@MON', 'W-MON'), - ('W@TUE', 'W-TUE'), ('W@WED', 'W-WED'), ('W@THU', 'W-THU'), - ('W@FRI', 'W-FRI'), ('Q@JAN', 'BQ-JAN'), ('Q@FEB', 'BQ-FEB'), - ('Q@MAR', 'BQ-MAR'), ('A@JAN', 'BA-JAN'), ('A@FEB', 'BA-FEB'), - ('A@MAR', 'BA-MAR'), ('A@APR', 'BA-APR'), ('A@MAY', 'BA-MAY'), - ('A@JUN', 'BA-JUN'), ('A@JUL', 'BA-JUL'), ('A@AUG', 'BA-AUG'), - ('A@SEP', 'BA-SEP'), ('A@OCT', 'BA-OCT'), ('A@NOV', 'BA-NOV'), - ('A@DEC', 'BA-DEC'), ('WOM@1FRI', 'WOM-1FRI'), - ('WOM@2FRI', 'WOM-2FRI'), ('WOM@3FRI', 'WOM-3FRI'), - ('WOM@4FRI', 'WOM-4FRI')] - - start, end = '1/1/2000', '1/1/2010' - - for old_freq, new_freq in rules: - old_rng = date_range(start, end, freq=old_freq) - new_rng = date_range(start, end, freq=new_freq) - self.assert_index_equal(old_rng, new_rng) - - def test_ms_vs_MS(self): - left = get_offset('ms') - right = get_offset('MS') - self.assertEqual(left, Milli()) - self.assertEqual(right, MonthBegin()) - - def test_rule_aliases(self): - rule = to_offset('10us') - self.assertEqual(rule, Micro(10)) diff --git a/setup.py b/setup.py index c3cb56f2d6d1b..edec53e9cefb0 100755 --- a/setup.py +++ b/setup.py @@ -648,13 +648,13 @@ def pxd(name): 'pandas.tests.series', 'pandas.tests.formats', 'pandas.tests.scalar', + 'pandas.tests.tseries', 'pandas.tests.types', 'pandas.tests.test_msgpack', 'pandas.tests.plotting', 'pandas.tools', 'pandas.tools.tests', 'pandas.tseries', - 'pandas.tseries.tests', 'pandas.types', 'pandas.io.tests', 'pandas.io.tests.json', @@ -688,8 +688,7 @@ def pxd(name): 'pandas.tests': ['data/*.csv'], 'pandas.tests.formats': ['data/*.csv'], 'pandas.tests.indexes': ['data/*.pickle'], - 'pandas.tseries.tests': ['data/*.pickle', - 'data/*.csv'] + 'pandas.tests.tseries': ['data/*.pickle'] }, ext_modules=extensions, maintainer_email=EMAIL, From 
08108896f01d335cf66662d990de2480d6077793 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 9 Feb 2017 18:34:18 -0500 Subject: [PATCH 121/338] TST: more tseries reorg --- .../tests/{tseries => groupby}/test_bin_groupby.py | 0 pandas/tests/indexes/datetimes/test_ops.py | 14 +------------- pandas/tests/tseries/test_frequencies.py | 12 ++++++++++++ 3 files changed, 13 insertions(+), 13 deletions(-) rename pandas/tests/{tseries => groupby}/test_bin_groupby.py (100%) diff --git a/pandas/tests/tseries/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py similarity index 100% rename from pandas/tests/tseries/test_bin_groupby.py rename to pandas/tests/groupby/test_bin_groupby.py diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 7a5ce3a44681b..63bf07ec041d3 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -7,12 +7,10 @@ import pandas.util.testing as tm from pandas.core.common import PerformanceWarning from pandas.tseries.index import cdate_range -from pandas.tseries.frequencies import get_offset, to_offset from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, date_range, TimedeltaIndex, _np_version_under1p10, Index, datetime, Float64Index, offsets, bdate_range) -from pandas.tseries.offsets import (BMonthEnd, CDay, BDay, Milli, MonthBegin, - Micro) +from pandas.tseries.offsets import BMonthEnd, CDay, BDay from pandas.tests.test_base import Ops @@ -913,16 +911,6 @@ def test_equals(self): self.assertFalse(idx.equals(list(idx3))) self.assertFalse(idx.equals(pd.Series(idx3))) - def test_ms_vs_MS(self): - left = get_offset('ms') - right = get_offset('MS') - self.assertEqual(left, Milli()) - self.assertEqual(right, MonthBegin()) - - def test_rule_aliases(self): - rule = to_offset('10us') - self.assertEqual(rule, Micro(10)) - class TestDateTimeIndexToJulianDate(tm.TestCase): diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 9983bf5270b29..5fbef465ca8fc 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -247,6 +247,18 @@ def test_anchored_shortcuts(self): frequencies.to_offset(invalid_anchor) +def test_ms_vs_MS(): + left = frequencies.get_offset('ms') + right = frequencies.get_offset('MS') + assert left == offsets.Milli() + assert right == offsets.MonthBegin() + + +def test_rule_aliases(): + rule = frequencies.to_offset('10us') + assert rule == offsets.Micro(10) + + def test_get_rule_month(): result = frequencies._get_rule_month('W') assert (result == 'DEC') From 18bc171b0333d7d1bc2c459946d6b99193add8df Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 9 Feb 2017 19:16:38 -0500 Subject: [PATCH 122/338] API: Reformat output of groupby.describe (#4792) closes #4792 Author: Matt Roeschke Author: Matthew Roeschke Closes #15260 from mroeschke/fix_4792 and squashes the following commits: 618bc46 [Matthew Roeschke] Merge branch 'master' into fix_4792 184378d [Matt Roeschke] TST: groupby.describe levels don't appear as column (#4792) --- doc/source/whatsnew/v0.20.0.txt | 53 +++++++++++++++++ pandas/core/groupby.py | 19 +++++- pandas/tests/formats/test_format.py | 33 +++-------- pandas/tests/groupby/test_categorical.py | 22 ++++--- pandas/tests/groupby/test_groupby.py | 74 +++++++++++++++++------- pandas/tests/test_generic.py | 8 +-- 6 files changed, 150 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt 
b/doc/source/whatsnew/v0.20.0.txt index 17ce4517035a7..6fe066b08e255 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -356,6 +356,59 @@ New Behavior: In [11]: index.memory_usage(deep=True) Out[11]: 260 +.. _whatsnew_0200.api_breaking.groupby_describe: + +Groupby Describe Formatting +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index. +This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`) + +Previous Behavior: + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}) + + In [2]: df.groupby('A').describe() + Out[2]: + B + A + 1 count 2.000000 + mean 1.500000 + std 0.707107 + min 1.000000 + 25% 1.250000 + 50% 1.500000 + 75% 1.750000 + max 2.000000 + 2 count 2.000000 + mean 3.500000 + std 0.707107 + min 3.000000 + 25% 3.250000 + 50% 3.500000 + 75% 3.750000 + max 4.000000 + + In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max]) + Out[3]: + B + mean std amin amax + A + 1 1.5 0.707107 1 2 + 2 3.5 0.707107 3 4 + +New Behavior: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}) + + df.groupby('A').describe() + + df.groupby('A').agg([np.mean, np.std, np.min, np.max]) + .. _whatsnew_0200.api: Other API Changes diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 53b6dbe6075cf..a228861270aea 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -80,7 +80,6 @@ 'mean', 'sum', 'min', 'max', 'cumcount', 'resample', - 'describe', 'rank', 'quantile', 'fillna', 'mad', @@ -1138,6 +1137,16 @@ def ohlc(self): return self._apply_to_column_groupbys( lambda x: x._cython_agg_general('ohlc')) + @Appender(DataFrame.describe.__doc__) + @Substitution(name='groupby') + @Appender(_doc_template) + def describe(self, **kwargs): + self._set_group_selection() + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() + @Substitution(name='groupby') @Appender(_doc_template) def resample(self, rule, *args, **kwargs): @@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'): def nsmallest(self, n=5, keep='first'): return self.apply(lambda x: x.nsmallest(n=n, keep=keep)) + @Appender(Series.describe.__doc__) + def describe(self, **kwargs): + self._set_group_selection() + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() + def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index a9553d9ea10cb..99cc70ae36f6b 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3545,30 +3545,15 @@ def test_to_latex_multiindex(self): self.assertEqual(result, expected) result = df.groupby('a').describe().to_latex() - expected = r"""\begin{tabular}{llr} -\toprule - & & c \\ -a & {} & \\ -\midrule -0 & count & 2.000000 \\ - & mean & 1.500000 \\ - & std & 0.707107 \\ - & min & 1.000000 \\ - & 25\% & 1.250000 \\ - & 50\% & 1.500000 \\ - & 75\% & 1.750000 \\ - & max & 2.000000 \\ -1 & count & 2.000000 \\ - & mean & 3.500000 \\ - & std & 0.707107 \\ - & min & 3.000000 \\ - & 25\% & 3.250000 \\ - & 50\% & 3.500000 \\ - & 75\% & 3.750000 \\ - & max & 4.000000 \\ -\bottomrule -\end{tabular} -""" + expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & ' + ' & 
& & & & & ' + '\\\\\n{} & count & mean & std & min & 25\\% & ' + '50\\% & 75\\% & max \\\\\na & & & ' + ' & & & & & \\\\\n\\midrule\n0 ' + '& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 ' + '& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 ' + '& 3.5 & 3.75 & 4.0 ' + '\\\\\n\\bottomrule\n\\end{tabular}\n') self.assertEqual(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8952b520f4f78..eebd0e0f490c1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -107,17 +107,20 @@ def test_groupby_categorical(self): exp_cats = Categorical(ord_labels, ordered=True, categories=['foo', 'bar', 'baz', 'qux']) expected = ord_data.groupby(exp_cats, sort=False).describe() - expected.index.names = [None, None] assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(1)), exp) def test_groupby_datetime_categorical(self): # GH9049: ensure backward compatibility @@ -144,7 +147,6 @@ def test_groupby_datetime_categorical(self): ord_labels = cats.take_nd(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels).describe() - expected.index.names = [None, None] assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal( @@ -155,10 +157,14 @@ def test_groupby_datetime_categorical(self): expc = Categorical.from_codes( np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(1)), exp) def test_groupby_categorical_index(self): @@ -195,8 +201,8 @@ def test_groupby_describe_categorical_columns(self): df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() - tm.assert_index_equal(result.columns, cats) - tm.assert_categorical_equal(result.columns.values, cats.values) + tm.assert_index_equal(result.stack().columns, cats) + tm.assert_categorical_equal(result.stack().columns.values, cats.values) def test_groupby_unstack_categorical(self): # GH11558 (example is taken from the original issue) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 53f85349834ac..d625fa07d932c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1085,7 +1085,7 @@ def test_attr_wrapper(self): for name, gp in grouped: expected[name] = gp.describe() expected = DataFrame(expected).T - assert_frame_equal(result.unstack(), expected) + assert_frame_equal(result, expected) # get attribute result = grouped.dtype @@ -1097,7 +1097,7 @@ def test_attr_wrapper(self): def test_series_describe_multikey(self): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, 
lambda x: x.month]) - result = grouped.describe().unstack() + result = grouped.describe() assert_series_equal(result['mean'], grouped.mean(), check_names=False) assert_series_equal(result['std'], grouped.std(), check_names=False) assert_series_equal(result['min'], grouped.min(), check_names=False) @@ -1106,7 +1106,7 @@ def test_series_describe_single(self): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe() + expected = grouped.describe().stack() assert_series_equal(result, expected) def test_series_index_name(self): @@ -1117,17 +1117,27 @@ def test_series_index_name(self): def test_frame_describe_multikey(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() - + desc_groups = [] for col in self.tsframe: - expected = grouped[col].describe() - assert_series_equal(result[col], expected, check_names=False) + group = grouped[col].describe() + group_col = pd.MultiIndex([[col] * len(group.columns), + group.columns], + [[0] * len(group.columns), + range(len(group.columns))]) + group = pd.DataFrame(group.values, + columns=group_col, + index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) groupedT = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis=1) result = groupedT.describe() - - for name, group in groupedT: - assert_frame_equal(result[name], group.describe()) + expected = self.tsframe.describe().T + expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], + [range(4), range(len(expected.index))]) + tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(self): @@ -1137,10 +1147,27 @@ def test_frame_describe_tupleindex(self): 'z': [100, 200, 300, 400, 500] * 3}) df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 df2 = df1.rename(columns={'k': 'key'}) - result = df1.groupby('k').describe() - expected = df2.groupby('key').describe() - expected.index.set_names(result.index.names, inplace=True) - assert_frame_equal(result, expected) + tm.assertRaises(ValueError, lambda: df1.groupby('k').describe()) + tm.assertRaises(ValueError, lambda: df2.groupby('key').describe()) + + def test_frame_describe_unstacked_format(self): + # GH 4792 + prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} + volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} + df = pd.DataFrame({'PRICE': prices, + 'VOLUME': volumes}) + result = df.groupby('PRICE').VOLUME.describe() + data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist()] + expected = pd.DataFrame(data, + index=pd.Index([24990, 25499], name='PRICE'), + columns=['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) def test_frame_groupby(self): grouped = self.tsframe.groupby(lambda x: x.weekday()) @@ -2545,16 +2572,21 @@ def test_non_cython_api(self): assert_frame_equal(result, expected) # describe - expected = DataFrame(dict(B=concat( - [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()], - keys=[1, 3]))) - expected.index.names = ['A', None] + expected_index = pd.Index([1, 3], name='A') + expected_col = pd.MultiIndex(levels=[['B'], + ['count', 
'mean', 'std', 'min', + '25%', '50%', '75%', 'max']], + labels=[[0] * 8, list(range(8))]) + expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, nan, nan, nan, nan, nan, nan, nan]], + index=expected_index, + columns=expected_col) result = g.describe() assert_frame_equal(result, expected) - expected = concat( - [df.loc[[0, 1], ['A', 'B']].describe(), - df.loc[[2], ['A', 'B']].describe()], keys=[0, 1]) + expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T]) + expected.index = pd.Index([0, 1]) result = gni.describe() assert_frame_equal(result, expected) @@ -3872,7 +3904,6 @@ def test_groupby_whitelist(self): 'tail', 'cumcount', 'resample', - 'describe', 'rank', 'quantile', 'fillna', @@ -3909,7 +3940,6 @@ def test_groupby_whitelist(self): 'tail', 'cumcount', 'resample', - 'describe', 'rank', 'quantile', 'fillna', diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index bb341c26d454e..e84e2d6809e7b 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1267,10 +1267,10 @@ def test_describe_typefiltering_groupby(self): 'numD': np.arange(24.) + .5, 'ts': tm.makeTimeSeries()[:24].index}) G = df.groupby('catA') - self.assertTrue(G.describe(include=['number']).shape == (16, 2)) - self.assertTrue(G.describe(include=['number', 'object']).shape == (22, - 3)) - self.assertTrue(G.describe(include='all').shape == (26, 4)) + self.assertTrue(G.describe(include=['number']).shape == (2, 16)) + self.assertTrue(G.describe(include=['number', 'object']).shape == (2, + 33)) + self.assertTrue(G.describe(include='all').shape == (2, 52)) def test_describe_multi_index_df_column_names(self): """ Test that column names persist after the describe operation.""" From 24e98dff31ed1e706a23abe5538586b43294653c Mon Sep 17 00:00:00 2001 From: Tobias Gustafsson Date: Fri, 10 Feb 2017 09:09:31 -0500 Subject: [PATCH 123/338] BUG: Fix #15344 by backporting ujson usage of PEP 393 API Make use of the PEP 393 API to avoid expanding single byte ascii characters into four byte unicode characters when encoding objects to json. 
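[Editor's note: the effect described above is observable from pure Python, because the old code path eagerly attached a wide (up to four bytes per character) representation to every compact ASCII string it encoded. A minimal sketch of the check — it mirrors the regression test this patch adds in pandas/io/tests/json/test_pandas.py and uses only public pandas API:

.. code-block:: python

   import pandas as pd

   # A frame holding a short ascii string. Before this fix, to_json()
   # forced each string into its wide representation as a side effect,
   # so the deep memory usage of the frame grew after encoding.
   df = pd.DataFrame({'a': [str(1)]})

   size_before = df.memory_usage(index=True, deep=True).sum()
   df.to_json()
   size_after = df.memory_usage(index=True, deep=True).sum()

   # With the PEP 393 fast path the size is unchanged.
   assert size_before == size_after

]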
closes #15344 Author: Tobias Gustafsson Closes #15360 from tobgu/backport-ujson-compact-ascii-encoding and squashes the following commits: 44de133 [Tobias Gustafsson] Fix C-code formatting to pass linting of GH15344 b7e404f [Tobias Gustafsson] Merge branch 'master' into backport-ujson-compact-ascii-encoding 4e8e2ff [Tobias Gustafsson] BUG: Fix #15344 by backporting ujson usage of PEP 393 APIs for compact ascii --- doc/source/whatsnew/v0.20.0.txt | 5 ++++- pandas/io/tests/json/test_pandas.py | 10 ++++++++++ pandas/src/ujson/python/objToJSON.c | 10 ++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6fe066b08e255..5fbce3d2594a9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -538,6 +538,8 @@ Bug Fixes - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) +- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`) +- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) - Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`) - Bug in ``DataFrame.resample().median()`` if duplicate column names are present (:issue:`14233`) @@ -561,7 +563,6 @@ Bug Fixes - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) -- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) @@ -574,4 +575,6 @@ Bug Fixes - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) + + - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index ee5039c38b182..440f5c13d5121 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1044,3 +1044,13 @@ def roundtrip(s, encoding='latin-1'): for s in examples: roundtrip(s) + + def test_data_frame_size_after_to_json(self): + # GH15344 + df = DataFrame({'a': [str(1)]}) + + size_before = df.memory_usage(index=True, deep=True).sum() + df.to_json() + size_after = df.memory_usage(index=True, deep=True).sum() + + self.assertEqual(size_before, size_after) diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 42c0b62a57511..e3c75d3b6e081 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -402,6 +402,16 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { PyObject *obj = (PyObject *)_obj; + +#if (PY_VERSION_HEX >= 0x03030000) + if (PyUnicode_IS_COMPACT_ASCII(obj)) { + Py_ssize_t len; + char *data = PyUnicode_AsUTF8AndSize(obj, &len); + *_outLen = len; + 
return data; + } +#endif + PyObject *newObj = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), NULL); From eaf2216a0339428761da6cf9e446edb79cfa8aa9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 10 Feb 2017 10:33:36 -0500 Subject: [PATCH 124/338] TST: Use pytest closes https://github.com/pydata/pandas/issues/13097 Author: Tom Augspurger Closes #13856 from TomAugspurger/pytest and squashes the following commits: 59e2be9 [Tom Augspurger] NOSE_ARGS -> TEST_ARGS 03695aa [Tom Augspurger] TST: Remove disabled marks 42790ae [Tom Augspurger] TST: Remove test_multi.sh 40d7336 [Tom Augspurger] PKG: redo pd.test import 9ba1f12 [Tom Augspurger] TST: Skip if getlocale is None 14c447c [Tom Augspurger] TST: pd.test uses pytest c4f6008 [Tom Augspurger] TST/CI: Use pytest b268d89 [Tom Augspurger] TST: Change method to make reporting more standard a638390 [Tom Augspurger] TST: Test consistency change c8dc927 [Tom Augspurger] TST: Refactor to use setup_class 9b5f2b2 [Tom Augspurger] TST: Refactor sql test inheritance --- .gitignore | 2 + .travis.yml | 38 ++-- appveyor.yml | 8 +- ci/install_test.sh | 3 +- ci/install_travis.sh | 11 +- ci/requirements_all.txt | 2 + ci/requirements_dev.txt | 2 + ci/script.sh | 8 +- doc/source/contributing.rst | 24 ++- doc/source/install.rst | 4 +- doc/source/whatsnew/v0.20.0.txt | 3 + pandas/__init__.py | 6 +- pandas/api/tests/test_api.py | 2 +- pandas/conftest.py | 21 ++ pandas/io/tests/parser/test_network.py | 2 +- pandas/io/tests/test_packers.py | 21 +- pandas/io/tests/test_pickle.py | 7 +- pandas/io/tests/test_sql.py | 27 +-- pandas/tests/formats/test_format.py | 22 +-- pandas/tests/frame/common.py | 4 +- pandas/tools/tests/test_util.py | 5 + pandas/util/_tester.py | 23 +++ pandas/util/nosetester.py | 261 ------------------------- pandas/util/testing.py | 7 +- setup.cfg | 6 + test.bat | 3 +- test.sh | 9 +- test_fast.sh | 3 +- test_multi.sh | 1 - test_rebuild.sh | 8 +- tox.ini | 9 +- 31 files changed, 179 insertions(+), 373 deletions(-) create mode 100644 pandas/conftest.py create mode 100644 pandas/util/_tester.py delete mode 100644 pandas/util/nosetester.py delete mode 100755 test_multi.sh diff --git a/.gitignore b/.gitignore index a77e780f3332d..808d9fb73a631 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,8 @@ dist **/wheelhouse/* # coverage .coverage +coverage.xml +coverage_html_report # OS generated files # ###################### diff --git a/.travis.yml b/.travis.yml index be2058950d8ec..b38c99e3a5be9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,7 +32,7 @@ matrix: env: - PYTHON_VERSION=3.5 - JOB_NAME: "35_osx" - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - BUILD_TYPE=conda - JOB_TAG=_OSX - TRAVIS_PYTHON_VERSION=3.5 @@ -42,7 +42,7 @@ matrix: env: - PYTHON_VERSION=2.7 - JOB_NAME: "27_slow_nnet_LOCALE" - - NOSE_ARGS="slow and not network and not disabled" + - TEST_ARGS="--only-slow --skip-network" - LOCALE_OVERRIDE="zh_CN.UTF-8" - FULL_DEPS=true - JOB_TAG=_LOCALE @@ -56,7 +56,7 @@ matrix: env: - PYTHON_VERSION=2.7 - JOB_NAME: "27_nslow" - - NOSE_ARGS="not slow and not disabled" + - TEST_ARGS="--skip-slow" - FULL_DEPS=true - CLIPBOARD_GUI=gtk2 - LINT=true @@ -70,7 +70,7 @@ matrix: env: - PYTHON_VERSION=3.5 - JOB_NAME: "35_nslow" - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - FULL_DEPS=true - CLIPBOARD=xsel - COVERAGE=true @@ -84,7 +84,7 @@ matrix: env: - PYTHON_VERSION=3.6 - JOB_NAME: "36" - - NOSE_ARGS="not 
slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - PANDAS_TESTING_MODE="deprecate" addons: apt: @@ -96,7 +96,7 @@ matrix: env: - PYTHON_VERSION=2.7 - JOB_NAME: "27_nslow_nnet_COMPAT" - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - LOCALE_OVERRIDE="it_IT.UTF-8" - INSTALL_TEST=true - JOB_TAG=_COMPAT @@ -112,7 +112,7 @@ matrix: - PYTHON_VERSION=2.7 - JOB_NAME: "27_slow" - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" + - TEST_ARGS="--only-slow --skip-network" - FULL_DEPS=true - CACHE_NAME="27_slow" - USE_CACHE=true @@ -122,7 +122,7 @@ matrix: - PYTHON_VERSION=2.7 - JOB_NAME: "27_build_test_conda" - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" + - TEST_ARGS="--skip-slow" - FULL_DEPS=true - BUILD_TEST=true - CACHE_NAME="27_build_test_conda" @@ -133,7 +133,7 @@ matrix: - PYTHON_VERSION=3.4 - JOB_NAME: "34_nslow" - LOCALE_OVERRIDE="zh_CN.UTF-8" - - NOSE_ARGS="not slow and not disabled" + - TEST_ARGS="--skip-slow" - FULL_DEPS=true - CLIPBOARD=xsel - CACHE_NAME="34_nslow" @@ -149,7 +149,7 @@ matrix: - PYTHON_VERSION=3.4 - JOB_NAME: "34_slow" - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" + - TEST_ARGS="--only-slow --skip-network" - FULL_DEPS=true - CLIPBOARD=xsel - CACHE_NAME="34_slow" @@ -164,7 +164,7 @@ matrix: - PYTHON_VERSION=3.5 - JOB_NAME: "35_numpy_dev" - JOB_TAG=_NUMPY_DEV - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - PANDAS_TESTING_MODE="deprecate" - CACHE_NAME="35_numpy_dev" - USE_CACHE=true @@ -179,7 +179,7 @@ matrix: - PYTHON_VERSION=3.5 - JOB_NAME: "35_ascii" - JOB_TAG=_ASCII - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - LOCALE_OVERRIDE="C" - CACHE_NAME="35_ascii" - USE_CACHE=true @@ -199,7 +199,7 @@ matrix: - PYTHON_VERSION=2.7 - JOB_NAME: "27_slow" - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" + - TEST_ARGS="--only-slow --skip-network" - FULL_DEPS=true - CACHE_NAME="27_slow" - USE_CACHE=true @@ -208,7 +208,7 @@ matrix: - PYTHON_VERSION=3.4 - JOB_NAME: "34_slow" - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" + - TEST_ARGS="--only-slow --skip-network" - FULL_DEPS=true - CLIPBOARD=xsel - CACHE_NAME="34_slow" @@ -222,7 +222,7 @@ matrix: - PYTHON_VERSION=2.7 - JOB_NAME: "27_build_test_conda" - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" + - TEST_ARGS="--skip-slow" - FULL_DEPS=true - BUILD_TEST=true - CACHE_NAME="27_build_test_conda" @@ -232,7 +232,7 @@ matrix: - PYTHON_VERSION=3.4 - JOB_NAME: "34_nslow" - LOCALE_OVERRIDE="zh_CN.UTF-8" - - NOSE_ARGS="not slow and not disabled" + - TEST_ARGS="--skip-slow" - FULL_DEPS=true - CLIPBOARD=xsel - CACHE_NAME="34_nslow" @@ -247,7 +247,7 @@ matrix: - PYTHON_VERSION=3.5 - JOB_NAME: "35_numpy_dev" - JOB_TAG=_NUMPY_DEV - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - PANDAS_TESTING_MODE="deprecate" - CACHE_NAME="35_numpy_dev" - USE_CACHE=true @@ -260,7 +260,7 @@ matrix: env: - PYTHON_VERSION=2.7 - JOB_NAME: "27_nslow_nnet_COMPAT" - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow --skip-network" - LOCALE_OVERRIDE="it_IT.UTF-8" - INSTALL_TEST=true - JOB_TAG=_COMPAT @@ -275,7 +275,7 @@ matrix: - PYTHON_VERSION=3.5 - JOB_NAME: "35_ascii" - JOB_TAG=_ASCII - - NOSE_ARGS="not slow and not network and not disabled" + - TEST_ARGS="--skip-slow 
--skip-network" - LOCALE_OVERRIDE="C" - CACHE_NAME="35_ascii" - USE_CACHE=true diff --git a/appveyor.yml b/appveyor.yml index 2499e7069843d..42c3be13af809 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -14,6 +14,7 @@ environment: # /E:ON and /V:ON options are not enabled in the batch script intepreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd" + clone_folder: C:\projects\pandas matrix: @@ -82,7 +83,7 @@ install: - cmd: '%CMD_IN_ENV% conda build ci\appveyor.recipe -q' # create our env - - cmd: conda create -q -n pandas python=%PYTHON_VERSION% nose + - cmd: conda create -q -n pandas python=%PYTHON_VERSION% nose pytest - cmd: activate pandas - SET REQ=ci\requirements-%PYTHON_VERSION%-%PYTHON_ARCH%.run - cmd: echo "installing requirements from %REQ%" @@ -93,7 +94,8 @@ install: test_script: # tests - - cd \ - cmd: activate pandas - cmd: conda list - - cmd: nosetests --exe -A "not slow and not network and not disabled" pandas + - cmd: cd \ + - cmd: python -c "import pandas; pandas.test(['--skip-slow', '--skip-network'])" + diff --git a/ci/install_test.sh b/ci/install_test.sh index e01ad7b94a349..cbb84d8fa4b65 100755 --- a/ci/install_test.sh +++ b/ci/install_test.sh @@ -8,7 +8,8 @@ if [ "$INSTALL_TEST" ]; then conda uninstall cython || exit 1 python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1 pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1 - nosetests --exe -A "$NOSE_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml + # nosetests --exe -A "$TEST_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml + pytest pandas/tests/test_series.py --junitxml=/tmp/pytest_install.xml else echo "Skipping installation test." fi diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 52b52d787aade..f65176fb1147c 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -83,6 +83,7 @@ else # Useful for debugging any issues with conda conda info -a || exit 1 + fi # may have installation instructions for this build @@ -90,13 +91,9 @@ INSTALL="ci/install-${PYTHON_VERSION}${JOB_TAG}.sh" if [ -e ${INSTALL} ]; then time bash $INSTALL || exit 1 else - # create new env - time conda create -n pandas python=$PYTHON_VERSION nose || exit 1 + time conda create -n pandas python=$PYTHON_VERSION nose pytest || exit 1 - if [ "$COVERAGE" ]; then - pip install coverage - fi if [ "$LINT" ]; then conda install flake8 pip install cpplint @@ -119,6 +116,10 @@ fi source activate pandas +if [ "$COVERAGE" ]; then + pip install coverage pytest-cov +fi + if [ "$BUILD_TEST" ]; then # build testing diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt index bc97957bff2b7..b64143fcd4ecd 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements_all.txt @@ -1,4 +1,6 @@ nose +pytest +pytest-cov flake8 sphinx ipython diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 7396fba6548d9..b8af9d035de98 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -3,4 +3,6 @@ pytz numpy cython nose +pytest +pytest-cov flake8 diff --git a/ci/script.sh b/ci/script.sh index e2ba883b81883..3eac3002d6805 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running nosetests as this is simply a build test." 
elif [ "$COVERAGE" ]; then - echo nosetests --exe -A "$NOSE_ARGS" pandas --with-coverage --with-xunit --xunit-file=/tmp/nosetests.xml - nosetests --exe -A "$NOSE_ARGS" pandas --with-coverage --cover-package=pandas --cover-tests --with-xunit --xunit-file=/tmp/nosetests.xml + echo pytest -s --cov=pandas --cov-report xml:/tmp/nosetests.xml $TEST_ARGS pandas + pytest -s --cov=pandas --cov-report xml:/tmp/nosetests.xml $TEST_ARGS pandas else - echo nosetests --exe -A "$NOSE_ARGS" pandas --doctest-tests --with-xunit --xunit-file=/tmp/nosetests.xml - nosetests --exe -A "$NOSE_ARGS" pandas --doctest-tests --with-xunit --xunit-file=/tmp/nosetests.xml + echo pytest $TEST_ARGS pandas + pytest $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index ecc2a5e723c45..dbe329b589c75 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -552,8 +552,8 @@ use cases and writing corresponding tests. Adding tests is one of the most common requests after code is pushed to *pandas*. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. -Like many packages, *pandas* uses the `Nose testing system -`_ and the convenient +Like many packages, *pandas* uses `pytest +`_ and the convenient extensions in `numpy.testing `_. @@ -595,17 +595,25 @@ Running the test suite The tests can then be run directly inside your Git clone (without having to install *pandas*) by typing:: - nosetests pandas + pytest pandas The tests suite is exhaustive and takes around 20 minutes to run. Often it is worth running only a subset of tests first around your changes before running the -entire suite. This is done using one of the following constructs:: +entire suite. - nosetests pandas/tests/[test-module].py - nosetests pandas/tests/[test-module].py:[TestClass] - nosetests pandas/tests/[test-module].py:[TestClass].[test_method] +The easiest way to do this is with:: - .. versionadded:: 0.18.0 + pytest pandas/path/to/test.py -k regex_matching_test_name + +Or with one of the following constructs:: + + pytest pandas/tests/[test-module].py + pytest pandas/tests/[test-module].py::[TestClass] + pytest pandas/tests/[test-module].py::[TestClass]::[test_method] + +For more, see the `pytest`_ documentation. + + .. versionadded:: 0.18.0 Furthermore one can run diff --git a/doc/source/install.rst b/doc/source/install.rst index 4b3ea19624a0e..1c7cbc9326614 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -188,8 +188,8 @@ Running the test suite pandas is equipped with an exhaustive set of unit tests covering about 97% of the codebase as of this writing. To run it on your machine to verify that everything is working (and you have all of the dependencies, soft and hard, -installed), make sure you have `nose -`__ and run: +installed), make sure you have `pytest +`__ and run: :: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5fbce3d2594a9..d0ffa786aaa8e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -11,6 +11,9 @@ Highlights include: - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) - The ``.ix`` indexer has been deprecated, see :ref:`here ` +- Switched the test framework to `pytest`_ (:issue:`13097`) + +.. _pytest: http://doc.pytest.org/en/latest/ Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. 
diff --git a/pandas/__init__.py b/pandas/__init__.py index 76542db22a757..70c547010f623 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -56,11 +56,7 @@ from pandas.io.api import * -# define the testing framework -import pandas.util.testing -from pandas.util.nosetester import NoseTester -test = NoseTester().test -del NoseTester +from pandas.util._tester import test # use the closest tagged version if possible from ._version import get_versions diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index a53f6103b408b..05cf5dc4b7e7b 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -28,7 +28,7 @@ class TestPDApi(Base, tm.TestCase): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'locale'] + ignored = ['tests', 'locale', 'conftest'] # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', diff --git a/pandas/conftest.py b/pandas/conftest.py new file mode 100644 index 0000000000000..b3683de3a173b --- /dev/null +++ b/pandas/conftest.py @@ -0,0 +1,21 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption("--skip-slow", action="store_true", + help="skip slow tests") + parser.addoption("--skip-network", action="store_true", + help="skip network tests") + parser.addoption("--only-slow", action="store_true", + help="run only slow tests") + + +def pytest_runtest_setup(item): + if 'slow' in item.keywords and item.config.getoption("--skip-slow"): + pytest.skip("skipping due to --skip-slow") + + if 'slow' not in item.keywords and item.config.getoption("--only-slow"): + pytest.skip("skipping due to --only-slow") + + if 'network' in item.keywords and item.config.getoption("--skip-network"): + pytest.skip("skipping due to --skip-network") diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index e06f94c780c8b..533b7733bde28 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -24,7 +24,7 @@ class TestCompressedUrl(object): 'xz': '.xz', } - def __init__(self): + def setup(self): path = os.path.join(tm.get_data_path(), 'salaries.csv') self.local_table = read_table(path) self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/' diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 8a0cfb92bd3c0..2ee36d85f674c 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -793,18 +793,19 @@ class TestMsgpack(): http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class """ - def setUp(self): + @classmethod + def setup_class(cls): from pandas.io.tests.generate_legacy_storage_files import ( create_msgpack_data, create_data) - self.data = create_msgpack_data() - self.all_data = create_data() - self.path = u('__%s__.msgpack' % tm.rands(10)) - self.minimum_structure = {'series': ['float', 'int', 'mixed', - 'ts', 'mi', 'dup'], - 'frame': ['float', 'int', 'mixed', 'mi'], - 'panel': ['float'], - 'index': ['int', 'date', 'period'], - 'mi': ['reg2']} + cls.data = create_msgpack_data() + cls.all_data = create_data() + cls.path = u('__%s__.msgpack' % tm.rands(10)) + cls.minimum_structure = {'series': ['float', 'int', 'mixed', + 'ts', 'mi', 'dup'], + 'frame': ['float', 'int', 'mixed', 'mi'], + 'panel': ['float'], + 'index': ['int', 'date', 'period'], + 'mi': ['reg2']} def check_min_structure(self, data): for typ, v in self.minimum_structure.items(): diff --git a/pandas/io/tests/test_pickle.py
b/pandas/io/tests/test_pickle.py index 73a9173e85906..89827817a85fb 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -31,11 +31,12 @@ class TestPickle(): nose-test-generators-inside-class """ - def setUp(self): + @classmethod + def setup_class(cls): from pandas.io.tests.generate_legacy_storage_files import ( create_pickle_data) - self.data = create_pickle_data() - self.path = u('__%s__.pickle' % tm.rands(10)) + cls.data = create_pickle_data() + cls.path = u('__%s__.pickle' % tm.rands(10)) def compare_element(self, result, expected, typ, version=None): if isinstance(expected, Index): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 4bcde764001c1..ddda65c5bafc8 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -236,7 +236,7 @@ def _close_conn(self): pass -class PandasSQLTest(unittest.TestCase): +class PandasSQLTest(object): """ Base class with common private methods for SQLAlchemy and fallback cases. @@ -839,7 +839,7 @@ def test_unicode_column_name(self): df.to_sql('test_unicode', self.conn, index=False) -class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): +class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi, unittest.TestCase): """ Test the public API as it would be used directly @@ -1024,11 +1024,11 @@ def tearDown(self): super(_EngineToConnMixin, self).tearDown() -class TestSQLApiConn(_EngineToConnMixin, TestSQLApi): +class TestSQLApiConn(_EngineToConnMixin, TestSQLApi, unittest.TestCase): pass -class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): +class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi, unittest.TestCase): """ Test the public sqlite connection fallback API @@ -1875,34 +1875,39 @@ def test_schema_support(self): tm.assert_frame_equal(res1, res2) -class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy): +class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy, unittest.TestCase): pass -class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn): +class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn, + unittest.TestCase): pass -class TestPostgreSQLAlchemy(_TestPostgreSQLAlchemy, _TestSQLAlchemy): +class TestPostgreSQLAlchemy(_TestPostgreSQLAlchemy, _TestSQLAlchemy, + unittest.TestCase): pass -class TestPostgreSQLAlchemyConn(_TestPostgreSQLAlchemy, _TestSQLAlchemyConn): +class TestPostgreSQLAlchemyConn(_TestPostgreSQLAlchemy, _TestSQLAlchemyConn, + unittest.TestCase): pass -class TestSQLiteAlchemy(_TestSQLiteAlchemy, _TestSQLAlchemy): +class TestSQLiteAlchemy(_TestSQLiteAlchemy, _TestSQLAlchemy, + unittest.TestCase): pass -class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn): +class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn, + unittest.TestCase): pass # ----------------------------------------------------------------------------- # -- Test Sqlite / MySQL fallback -class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): +class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest, unittest.TestCase): """ Test the fallback mode against an in-memory sqlite database. 
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 99cc70ae36f6b..9a24ae332f7c5 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3923,6 +3923,15 @@ def test_period(self): self.assertEqual(str(df), exp) +def gen_series_formatting(): + s1 = pd.Series(['a'] * 100) + s2 = pd.Series(['ab'] * 100) + s3 = pd.Series(['a', 'ab', 'abc', 'abcd', 'abcde', 'abcdef']) + s4 = s3[::-1] + test_sers = {'onel': s1, 'twol': s2, 'asc': s3, 'desc': s4} + return test_sers + + class TestSeriesFormatting(tm.TestCase): def setUp(self): @@ -4320,15 +4329,6 @@ def test_consistent_format(self): '1.0000\n129 1.0000\ndtype: float64') self.assertEqual(res, exp) - @staticmethod - def gen_test_series(): - s1 = pd.Series(['a'] * 100) - s2 = pd.Series(['ab'] * 100) - s3 = pd.Series(['a', 'ab', 'abc', 'abcd', 'abcde', 'abcdef']) - s4 = s3[::-1] - test_sers = {'onel': s1, 'twol': s2, 'asc': s3, 'desc': s4} - return test_sers - def chck_ncols(self, s): with option_context("display.max_rows", 10): res = repr(s) @@ -4339,7 +4339,7 @@ def chck_ncols(self, s): self.assertEqual(ncolsizes, 1) def test_format_explicit(self): - test_sers = self.gen_test_series() + test_sers = gen_series_formatting() with option_context("display.max_rows", 4): res = repr(test_sers['onel']) exp = '0 a\n1 a\n ..\n98 a\n99 a\ndtype: object' @@ -4358,7 +4358,7 @@ def test_format_explicit(self): self.assertEqual(exp, res) def test_ncols(self): - test_sers = self.gen_test_series() + test_sers = gen_series_formatting() for s in test_sers.values(): self.chck_ncols(s) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 37f67712e1b58..b9cd764c8704c 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -89,11 +89,11 @@ def empty(self): @cache_readonly def ts1(self): - return tm.makeTimeSeries() + return tm.makeTimeSeries(nper=30) @cache_readonly def ts2(self): - return tm.makeTimeSeries()[5:] + return tm.makeTimeSeries(nper=30)[5:] @cache_readonly def simple(self): diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index e1d057eb3c3c0..0716a13fac3fe 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -93,6 +93,11 @@ def test_set_locale(self): raise nose.SkipTest("Only a single locale found, no point in " "trying to test setting another locale") + if all(x is None for x in CURRENT_LOCALE): + # Not sure why, but on some travis runs with pytest, + # getlocale() returned (None, None). + raise nose.SkipTest("CURRENT_LOCALE is not set.") + if LOCALE_OVERRIDE is None: lang, enc = 'it_CH', 'UTF-8' elif LOCALE_OVERRIDE == 'C': diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py new file mode 100644 index 0000000000000..b0e402939caae --- /dev/null +++ b/pandas/util/_tester.py @@ -0,0 +1,23 @@ +""" +Entrypoint for testing from the top-level namespace +""" +import os + +PKG = os.path.dirname(os.path.dirname(__file__)) + + +try: + import pytest +except ImportError: + def test(): + raise ImportError("Need pytest>=3.0 to run tests") +else: + def test(extra_args=None): + if extra_args: + cmd = ['-q'] + extra_args + [PKG] + else: + cmd = ['-q', PKG] + pytest.main(cmd) + + +__all__ = ['test'] diff --git a/pandas/util/nosetester.py b/pandas/util/nosetester.py deleted file mode 100644 index 1bdaaff99fd50..0000000000000 --- a/pandas/util/nosetester.py +++ /dev/null @@ -1,261 +0,0 @@ -""" -Nose test running. - -This module implements ``test()`` function for pandas modules. 
- -""" -from __future__ import division, absolute_import, print_function - -import os -import sys -import warnings -from pandas.compat import string_types -from numpy.testing import nosetester - - -def get_package_name(filepath): - """ - Given a path where a package is installed, determine its name. - - Parameters - ---------- - filepath : str - Path to a file. If the determination fails, "pandas" is returned. - - Examples - -------- - >>> pandas.util.nosetester.get_package_name('nonsense') - 'pandas' - - """ - - pkg_name = [] - while 'site-packages' in filepath or 'dist-packages' in filepath: - filepath, p2 = os.path.split(filepath) - if p2 in ('site-packages', 'dist-packages'): - break - pkg_name.append(p2) - - # if package name determination failed, just default to pandas - if not pkg_name: - return "pandas" - - # otherwise, reverse to get correct order and return - pkg_name.reverse() - - # don't include the outer egg directory - if pkg_name[0].endswith('.egg'): - pkg_name.pop(0) - - return '.'.join(pkg_name) - -import_nose = nosetester.import_nose -run_module_suite = nosetester.run_module_suite - - -class NoseTester(nosetester.NoseTester): - """ - Nose test runner. - - This class is made available as pandas.util.nosetester.NoseTester, and - a test function is typically added to a package's __init__.py like so:: - - from numpy.testing import Tester - test = Tester().test - - Calling this test function finds and runs all tests associated with the - package and all its sub-packages. - - Attributes - ---------- - package_path : str - Full path to the package to test. - package_name : str - Name of the package to test. - - Parameters - ---------- - package : module, str or None, optional - The package to test. If a string, this should be the full path to - the package. If None (default), `package` is set to the module from - which `NoseTester` is initialized. - raise_warnings : None, str or sequence of warnings, optional - This specifies which warnings to configure as 'raise' instead - of 'warn' during the test execution. Valid strings are: - - - "develop" : equals ``(DeprecationWarning, RuntimeWarning)`` - - "release" : equals ``()``, don't raise on any warnings. - - See Notes for more details. - - Notes - ----- - The default for `raise_warnings` is - ``(DeprecationWarning, RuntimeWarning)`` for development versions of - pandas, and ``()`` for released versions. The purpose of this switching - behavior is to catch as many warnings as possible during development, but - not give problems for packaging of released versions. - - """ - excludes = [] - - def _show_system_info(self): - nose = import_nose() - - import pandas - print("pandas version %s" % pandas.__version__) - import numpy - print("numpy version %s" % numpy.__version__) - pddir = os.path.dirname(pandas.__file__) - print("pandas is installed in %s" % pddir) - - pyversion = sys.version.replace('\n', '') - print("Python version %s" % pyversion) - print("nose version %d.%d.%d" % nose.__versioninfo__) - - def _get_custom_doctester(self): - """ Return instantiated plugin for doctests - - Allows subclassing of this class to override doctester - - A return value of None means use the nose builtin doctest plugin - """ - return None - - def _test_argv(self, label, verbose, extra_argv): - """ - Generate argv for nosetest command - - Parameters - ---------- - label : {'fast', 'full', '', attribute identifier}, optional - see ``test`` docstring - verbose : int, optional - Verbosity value for test outputs, in the range 1-10. Default is 1. 
- extra_argv : list, optional - List with any extra arguments to pass to nosetests. - - Returns - ------- - argv : list - command line arguments that will be passed to nose - """ - - argv = [__file__, self.package_path] - if label and label != 'full': - if not isinstance(label, string_types): - raise TypeError('Selection label should be a string') - if label == 'fast': - label = 'not slow and not network and not disabled' - argv += ['-A', label] - argv += ['--verbosity', str(verbose)] - - # When installing with setuptools, and also in some other cases, the - # test_*.py files end up marked +x executable. Nose, by default, does - # not run files marked with +x as they might be scripts. However, in - # our case nose only looks for test_*.py files under the package - # directory, which should be safe. - argv += ['--exe'] - - if extra_argv: - argv += extra_argv - return argv - - def test(self, label='fast', verbose=1, extra_argv=None, - doctests=False, coverage=False, raise_warnings=None): - """ - Run tests for module using nose. - - Parameters - ---------- - label : {'fast', 'full', '', attribute identifier}, optional - Identifies the tests to run. This can be a string to pass to - the nosetests executable with the '-A' option, or one of several - special values. Special values are: - - * 'fast' - the default - which corresponds to the ``nosetests -A`` - option of 'not slow'. - * 'full' - fast (as above) and slow tests as in the - 'no -A' option to nosetests - this is the same as ''. - * None or '' - run all tests. - * attribute_identifier - string passed directly to nosetests - as '-A'. - - verbose : int, optional - Verbosity value for test outputs, in the range 1-10. Default is 1. - extra_argv : list, optional - List with any extra arguments to pass to nosetests. - doctests : bool, optional - If True, run doctests in module. Default is False. - coverage : bool, optional - If True, report coverage of NumPy code. Default is False. - (This requires the `coverage module - `_). - raise_warnings : str or sequence of warnings, optional - This specifies which warnings to configure as 'raise' instead - of 'warn' during the test execution. Valid strings are: - - - 'develop' : equals ``(DeprecationWarning, RuntimeWarning)`` - - 'release' : equals ``()``, don't raise on any warnings. - - Returns - ------- - result : object - Returns the result of running the tests as a - ``nose.result.TextTestResult`` object. - - """ - - # cap verbosity at 3 because nose becomes *very* verbose beyond that - verbose = min(verbose, 3) - - if doctests: - print("Running unit tests and doctests for %s" % self.package_name) - else: - print("Running unit tests for %s" % self.package_name) - - self._show_system_info() - - # reset doctest state on every run - import doctest - doctest.master = None - - if raise_warnings is None: - - # default based on if we are released - from pandas import __version__ - from distutils.version import StrictVersion - try: - StrictVersion(__version__) - raise_warnings = 'release' - except ValueError: - raise_warnings = 'develop' - - _warn_opts = dict(develop=(DeprecationWarning, RuntimeWarning), - release=()) - if isinstance(raise_warnings, string_types): - raise_warnings = _warn_opts[raise_warnings] - - with warnings.catch_warnings(): - - if len(raise_warnings): - - # Reset the warning filters to the default state, - # so that running the tests is more repeatable. 
-                warnings.resetwarnings()
-                # Set all warnings to 'warn', this is because the default
-                # 'once' has the bad property of possibly shadowing later
-                # warnings.
-                warnings.filterwarnings('always')
-                # Force the requested warnings to raise
-                for warningtype in raise_warnings:
-                    warnings.filterwarnings('error', category=warningtype)
-            # Filter out annoying import messages.
-            warnings.filterwarnings("ignore", category=FutureWarning)
-
-            from numpy.testing.noseclasses import NumpyTestProgram
-            argv, plugins = self.prepare_test_args(
-                label, verbose, extra_argv, doctests, coverage)
-            t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins)
-
-            return t.result
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 6b2e920a24063..336a766fd5830 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -19,7 +19,8 @@
 from distutils.version import LooseVersion
 
 from numpy.random import randn, rand
-from numpy.testing.decorators import slow  # noqa
+# from numpy.testing.decorators import slow  # noqa
+import pytest
 import numpy as np
 
 import pandas as pd
@@ -2549,9 +2550,7 @@ def assert_produces_warning(expected_warning=Warning, filter_level="always",
                              % extra_warnings)
 
 
-def disabled(t):
-    t.disabled = True
-    return t
+slow = pytest.mark.slow
 
 
 class RNGContext(object):
diff --git a/setup.cfg b/setup.cfg
index f69e256b80869..143470f7ee350 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,3 +19,9 @@ based_on_style = pep8
 split_before_named_assigns = false
 split_penalty_after_opening_bracket = 1000000
 split_penalty_logical_operator = 30
+
+[tool:pytest]
+# TODO: Change all yield-based (nose-style) fixtures to pytest fixtures
+# Silencing the warning until then
+addopts = --disable-pytest-warnings
+testpaths = pandas
diff --git a/test.bat b/test.bat
index 16aa6c9105ec3..7f9244abb2bc8 100644
--- a/test.bat
+++ b/test.bat
@@ -1,3 +1,4 @@
 :: test on windows
 
-nosetests --exe -A "not slow and not network and not disabled" pandas %*
+:: nosetests --exe -A "not slow and not network and not disabled" pandas %*
+pytest pandas
diff --git a/test.sh b/test.sh
index 4a9ffd7be98b1..23c7ff52d2ce9 100755
--- a/test.sh
+++ b/test.sh
@@ -1,11 +1,4 @@
 #!/bin/sh
 command -v coverage >/dev/null && coverage erase
 command -v python-coverage >/dev/null && python-coverage erase
-# nosetests pandas/tests/test_index.py --with-coverage --cover-package=pandas.core --pdb-failure --pdb
-#nosetests -w pandas --with-coverage --cover-package=pandas --pdb-failure --pdb #--cover-inclusive
-#nosetests -A "not slow" -w pandas/tseries --with-coverage --cover-package=pandas.tseries $* #--cover-inclusive
-nosetests -w pandas --with-coverage --cover-package=pandas $*
-# nosetests -w pandas/io --with-coverage --cover-package=pandas.io --pdb-failure --pdb
-# nosetests -w pandas/core --with-coverage --cover-package=pandas.core --pdb-failure --pdb
-# nosetests -w pandas/stats --with-coverage --cover-package=pandas.stats
-# coverage run runtests.py
+pytest pandas --cov=pandas
diff --git a/test_fast.sh b/test_fast.sh
index b390705f901ad..0b394cffa3d74 100755
--- a/test_fast.sh
+++ b/test_fast.sh
@@ -1 +1,2 @@
-nosetests -A "not slow and not network" pandas --with-id $*
+# nosetests -A "not slow and not network" pandas --with-id $*
+pytest pandas --skip-slow
diff --git a/test_multi.sh b/test_multi.sh
deleted file mode 100755
index 5d77945c66a26..0000000000000
--- a/test_multi.sh
+++ /dev/null
@@ -1 +0,0 @@
-nosetests -A "not slow and not network" pandas --processes=4 $*
diff --git a/test_rebuild.sh b/test_rebuild.sh
index
d3710c5ff67d3..65aa1098811a1 100755 --- a/test_rebuild.sh +++ b/test_rebuild.sh @@ -3,10 +3,4 @@ python setup.py clean python setup.py build_ext --inplace coverage erase -# nosetests pandas/tests/test_index.py --with-coverage --cover-package=pandas.core --pdb-failure --pdb -#nosetests -w pandas --with-coverage --cover-package=pandas --pdb-failure --pdb #--cover-inclusive -nosetests -w pandas --with-coverage --cover-package=pandas $* #--cover-inclusive -# nosetests -w pandas/io --with-coverage --cover-package=pandas.io --pdb-failure --pdb -# nosetests -w pandas/core --with-coverage --cover-package=pandas.core --pdb-failure --pdb -# nosetests -w pandas/stats --with-coverage --cover-package=pandas.stats -# coverage run runtests.py +pytest pandas --cov=pandas diff --git a/tox.ini b/tox.ini index 5d6c8975307b6..85c5d90fde7fb 100644 --- a/tox.ini +++ b/tox.ini @@ -10,6 +10,7 @@ envlist = py27, py34, py35 deps = cython nose + pytest pytz>=2011k python-dateutil beautifulsoup4 @@ -26,7 +27,7 @@ changedir = {envdir} commands = # TODO: --exe because of GH #761 - {envbindir}/nosetests --exe pandas {posargs:-A "not network and not disabled"} + {envbindir}/pytest pandas {posargs:-A "not network and not disabled"} # cleanup the temp. build dir created by the tox build # /bin/rm -rf {toxinidir}/build @@ -63,18 +64,18 @@ usedevelop = True deps = {[testenv]deps} openpyxl<2.0.0 -commands = {envbindir}/nosetests {toxinidir}/pandas/io/tests/test_excel.py +commands = {envbindir}/pytest {toxinidir}/pandas/io/tests/test_excel.py [testenv:openpyxl20] usedevelop = True deps = {[testenv]deps} openpyxl<2.2.0 -commands = {envbindir}/nosetests {posargs} {toxinidir}/pandas/io/tests/test_excel.py +commands = {envbindir}/pytest {posargs} {toxinidir}/pandas/io/tests/test_excel.py [testenv:openpyxl22] usedevelop = True deps = {[testenv]deps} openpyxl>=2.2.0 -commands = {envbindir}/nosetests {posargs} {toxinidir}/pandas/io/tests/test_excel.py +commands = {envbindir}/pytest {posargs} {toxinidir}/pandas/io/tests/test_excel.py From ed27d400329f91400b5aeae81dbff10d54b497f7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 10 Feb 2017 10:42:18 -0500 Subject: [PATCH 125/338] TST: small adjustments for pytest --- doc/source/contributing.rst | 2 +- pandas/util/_tester.py | 9 ++++++--- pandas/util/testing.py | 6 ++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index dbe329b589c75..3ef9ed8962a23 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -613,7 +613,7 @@ Or with one of the following constructs:: For more, see the `pytest`_ documentation. - .. versionadded:: 0.18.0 + .. 
versionadded:: 0.20.0 Furthermore one can run diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index b0e402939caae..8d9701e0b4672 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -13,10 +13,13 @@ def test(): raise ImportError("Need pytest>=3.0 to run tests") else: def test(extra_args=None): + cmd = ['--skip-slow', '--skip-network'] if extra_args: - cmd = ['-q'] + extra_args + [PKG] - else: - cmd = ['-q', PKG] + if not isinstance(extra_args, list): + extra_args = [extra_args] + cmd = extra_args + cmd += [PKG] + print("running: pytest {}".format(' '.join(cmd))) pytest.main(cmd) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 336a766fd5830..c3633c945f60a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -19,7 +19,6 @@ from distutils.version import LooseVersion from numpy.random import randn, rand -# from numpy.testing.decorators import slow # noqa import pytest import numpy as np @@ -50,6 +49,8 @@ from pandas.util.decorators import deprecate from pandas import _testing from pandas.io.common import urlopen +slow = pytest.mark.slow + N = 30 K = 4 @@ -2550,9 +2551,6 @@ def assert_produces_warning(expected_warning=Warning, filter_level="always", % extra_warnings) -slow = pytest.mark.slow - - class RNGContext(object): """ Context manager to set the numpy random number generator speed. Returns From 3e600fb08466959b1e75a1459157c9b39be4d44b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 10 Feb 2017 13:19:22 -0500 Subject: [PATCH 126/338] COMPAT: skip tests for numpy >= 1.12 with pow and integer inputs closes #15363 CI: fix 3.5 build to numpy 1.11.3 --- .gitignore | 1 + ci/requirements-3.5.build | 2 +- ci/requirements-3.5.run | 2 +- pandas/tests/test_expressions.py | 17 ++++++++++++++++- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 808d9fb73a631..a509fcf736ea8 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ .noseids .ipynb_checkpoints .tags +.cache/ # Compiled source # ################### diff --git a/ci/requirements-3.5.build b/ci/requirements-3.5.build index 9558cf00ddf5c..2fc2053e64fe9 100644 --- a/ci/requirements-3.5.build +++ b/ci/requirements-3.5.build @@ -1,4 +1,4 @@ python-dateutil pytz -numpy +numpy=1.11.3 cython diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index ef354195c8f23..b07ce611c79a2 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -1,6 +1,6 @@ python-dateutil pytz -numpy +numpy=1.11.3 openpyxl xlsxwriter xlrd diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index eca4a8f3c9e66..136786ecff0a0 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -12,7 +12,7 @@ from pandas.core.api import DataFrame, Panel from pandas.computation import expressions as expr -from pandas import compat +from pandas import compat, _np_version_under1p12 from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_panel4d_equal, slow) @@ -78,6 +78,13 @@ def run_arithmetic_test(self, df, other, assert_func, check_dtype=False, if not compat.PY3: operations.append('div') for arith in operations: + + # numpy >= 1.12 doesn't handle integers + # raised to integer powers + # https://github.com/pandas-dev/pandas/issues/15363 + if arith == 'pow' and not _np_version_under1p12: + continue + operator_name = arith if arith == 'div': operator_name = 'truediv' @@ -90,6 +97,7 @@ def run_arithmetic_test(self, df, other, 
assert_func, check_dtype=False, expr.set_use_numexpr(False) expected = op(df, other) expr.set_use_numexpr(True) + result = op(df, other) try: if check_dtype: @@ -273,6 +281,13 @@ def testit(): for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), ('div', '/'), ('pow', '**')]: + + # numpy >= 1.12 doesn't handle integers + # raised to integer powers + # https://github.com/pandas-dev/pandas/issues/15363 + if op == 'pow' and not _np_version_under1p12: + continue + if op == 'div': op = getattr(operator, 'truediv', None) else: From 021f59454a76636fb25ba1a70dc34470042a62f7 Mon Sep 17 00:00:00 2001 From: Joshua Bradt Date: Fri, 10 Feb 2017 14:48:11 -0500 Subject: [PATCH 127/338] BUG: Fixed handling of non-list value_vars in melt The value_vars argument of melt is now cast to list like the id_vars argument. closes #15348 Author: Joshua Bradt Author: Joshua Bradt Closes #15351 from jbradt/fix-melt and squashes the following commits: a2f2510 [Joshua Bradt] Changed to tm.assertRaisesRegexp for Python 2 compat. 3038f64 [Joshua Bradt] Merge remote-tracking branch 'upstream/master' into fix-melt e907135 [Joshua Bradt] Split test into two parts 20159c1 [Joshua Bradt] Changed exception classes to ValueError. 129d531 [Joshua Bradt] Moved binary operators to satisfy flake8 70d7256 [Joshua Bradt] Merge branch 'master' into fix-melt 455a310 [Joshua Bradt] Tested types when using MultiIndex to ensure they are lists. 7406222 [Joshua Bradt] Fixed formatting. Added comment with issue number to test. d4c5da3 [Joshua Bradt] Improved type checking and tests. Added whatsnew note. 33728de [Joshua Bradt] BUG: Fixed handling of non-list value_vars in melt --- doc/source/whatsnew/v0.20.0.txt | 3 +-- pandas/core/reshape.py | 14 ++++++++++-- pandas/tests/test_reshape.py | 39 +++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d0ffa786aaa8e..9f86c777c665d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -578,6 +578,5 @@ Bug Fixes - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - - +- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index bd0358abf67d5..cebaf4e3fd89b 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -761,16 +761,26 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, """ # TODO: what about the existing index? 
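+    # Note on the normalization below: a scalar is wrapped in a list, but a
+    # tuple is special-cased when the columns are a MultiIndex, since there
+    # it could name a single column; that case raises ValueError rather
+    # than being silently coerced to a list of labels (GH 15348).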
if id_vars is not None: - if not isinstance(id_vars, (tuple, list, np.ndarray)): + if not is_list_like(id_vars): id_vars = [id_vars] + elif (isinstance(frame.columns, MultiIndex) and + not isinstance(id_vars, list)): + raise ValueError('id_vars must be a list of tuples when columns' + ' are a MultiIndex') else: id_vars = list(id_vars) else: id_vars = [] if value_vars is not None: - if not isinstance(value_vars, (tuple, list, np.ndarray)): + if not is_list_like(value_vars): value_vars = [value_vars] + elif (isinstance(frame.columns, MultiIndex) and + not isinstance(value_vars, list)): + raise ValueError('value_vars must be a list of tuples when' + ' columns are a MultiIndex') + else: + value_vars = list(value_vars) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index ed5ec970ba33c..d587e4ea6a1fa 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -56,6 +56,45 @@ def test_value_vars(self): columns=['id1', 'id2', 'variable', 'value']) tm.assert_frame_equal(result4, expected4) + def test_value_vars_types(self): + # GH 15348 + expected = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', 'value']) + + for type_ in (tuple, list, np.array): + result = melt(self.df, id_vars=['id1', 'id2'], + value_vars=type_(('A', 'B'))) + tm.assert_frame_equal(result, expected) + + def test_vars_work_with_multiindex(self): + expected = DataFrame({ + ('A', 'a'): self.df1[('A', 'a')], + 'CAP': ['B'] * len(self.df1), + 'low': ['b'] * len(self.df1), + 'value': self.df1[('B', 'b')], + }, columns=[('A', 'a'), 'CAP', 'low', 'value']) + + result = melt(self.df1, id_vars=[('A', 'a')], value_vars=[('B', 'b')]) + tm.assert_frame_equal(result, expected) + + def test_tuple_vars_fail_with_multiindex(self): + # melt should fail with an informative error message if + # the columns have a MultiIndex and a tuple is passed + # for id_vars or value_vars. 
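+        # e.g. id_vars=('A', 'a') is ambiguous here: it could name the
+        # single column ('A', 'a') or be meant as a sequence of two
+        # labels, so melt refuses to guess and raises ValueError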
+ tuple_a = ('A', 'a') + list_a = [tuple_a] + tuple_b = ('B', 'b') + list_b = [tuple_b] + + for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), + (tuple_a, tuple_b)): + with tm.assertRaisesRegexp(ValueError, r'MultiIndex'): + melt(self.df1, id_vars=id_vars, value_vars=value_vars) + def test_custom_var_name(self): result5 = melt(self.df, var_name=self.var_name) self.assertEqual(result5.columns.tolist(), ['var', 'value']) From 636df8d4b2565d41ba0bb55630be381083ca9fe7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 11 Feb 2017 16:05:54 -0500 Subject: [PATCH 128/338] TST: split up tests/indexing/test_indexing a bit Author: Jeff Reback Closes #15367 from jreback/indexing and squashes the following commits: 15e6010 [Jeff Reback] pep 3a12fdd [Jeff Reback] add panel 5605b2b [Jeff Reback] add chaining and caching 05f6f40 [Jeff Reback] split out datetime d6be34f [Jeff Reback] TST: split up tests/indexing/test_indexing a bit --- pandas/tests/indexing/common.py | 5 + .../indexing/test_chaining_and_caching.py | 358 +++ pandas/tests/indexing/test_datetime.py | 192 ++ pandas/tests/indexing/test_floats.py | 157 ++ pandas/tests/indexing/test_indexing.py | 2242 +---------------- pandas/tests/indexing/test_multiindex.py | 1206 +++++++++ pandas/tests/indexing/test_panel.py | 209 ++ pandas/tests/indexing/test_timedelta.py | 21 + 8 files changed, 2215 insertions(+), 2175 deletions(-) create mode 100644 pandas/tests/indexing/common.py create mode 100644 pandas/tests/indexing/test_chaining_and_caching.py create mode 100644 pandas/tests/indexing/test_datetime.py create mode 100644 pandas/tests/indexing/test_multiindex.py create mode 100644 pandas/tests/indexing/test_panel.py create mode 100644 pandas/tests/indexing/test_timedelta.py diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py new file mode 100644 index 0000000000000..73167393cf35d --- /dev/null +++ b/pandas/tests/indexing/common.py @@ -0,0 +1,5 @@ +""" common utilities """ + + +def _mklbl(prefix, n): + return ["%s%s" % (prefix, i) for i in range(n)] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py new file mode 100644 index 0000000000000..0e921aaf826f9 --- /dev/null +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -0,0 +1,358 @@ +import numpy as np +import pandas as pd +from pandas.core import common as com +from pandas import (compat, DataFrame, option_context, + Series, MultiIndex, date_range, Timestamp) +from pandas.util import testing as tm + + +class TestCaching(tm.TestCase): + + def test_slice_consolidate_invalidate_item_cache(self): + + # this is chained assignment, but will 'work' + with option_context('chained_assignment', None): + + # #3970 + df = DataFrame({"aa": compat.lrange(5), "bb": [2.2] * 5}) + + # Creates a second float block + df["cc"] = 0.0 + + # caches a reference to the 'bb' series + df["bb"] + + # repr machinery triggers consolidation + repr(df) + + # Assignment to wrong series + df['bb'].iloc[0] = 0.17 + df._clear_item_cache() + self.assertAlmostEqual(df['bb'][0], 0.17) + + def test_setitem_cache_updating(self): + # GH 5424 + cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven'] + + for do_ref in [False, False]: + df = DataFrame({'a': cont, + "b": cont[3:] + cont[:3], + 'c': np.arange(7)}) + + # ref the cache + if do_ref: + df.ix[0, "c"] + + # set it + df.ix[7, 'c'] = 1 + + self.assertEqual(df.ix[0, 'c'], 0.0) + self.assertEqual(df.ix[7, 'c'], 1.0) + + # GH 7084 + # not updating cache on series 
setting with slices + expected = DataFrame({'A': [600, 600, 600]}, + index=date_range('5/7/2014', '5/9/2014')) + out = DataFrame({'A': [0, 0, 0]}, + index=date_range('5/7/2014', '5/9/2014')) + df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]}) + + # loop through df to update out + six = Timestamp('5/7/2014') + eix = Timestamp('5/9/2014') + for ix, row in df.iterrows(): + out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D'] + + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) + + # try via a chain indexing + # this actually works + out = DataFrame({'A': [0, 0, 0]}, + index=date_range('5/7/2014', '5/9/2014')) + for ix, row in df.iterrows(): + v = out[row['C']][six:eix] + row['D'] + out[row['C']][six:eix] = v + + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) + + out = DataFrame({'A': [0, 0, 0]}, + index=date_range('5/7/2014', '5/9/2014')) + for ix, row in df.iterrows(): + out.loc[six:eix, row['C']] += row['D'] + + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) + + +class TestChaining(tm.TestCase): + + def test_setitem_chained_setfault(self): + + # GH6026 + # setfaults under numpy 1.7.1 (ok on 1.8) + data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout'] + mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none'] + + df = DataFrame({'response': np.array(data)}) + mask = df.response == 'timeout' + df.response[mask] = 'none' + tm.assert_frame_equal(df, DataFrame({'response': mdata})) + + recarray = np.rec.fromarrays([data], names=['response']) + df = DataFrame(recarray) + mask = df.response == 'timeout' + df.response[mask] = 'none' + tm.assert_frame_equal(df, DataFrame({'response': mdata})) + + df = DataFrame({'response': data, 'response1': data}) + mask = df.response == 'timeout' + df.response[mask] = 'none' + tm.assert_frame_equal(df, DataFrame({'response': mdata, + 'response1': data})) + + # GH 6056 + expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar'])) + df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) + df['A'].iloc[0] = np.nan + result = df.head() + tm.assert_frame_equal(result, expected) + + df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) + df.A.iloc[0] = np.nan + result = df.head() + tm.assert_frame_equal(result, expected) + + def test_detect_chained_assignment(self): + + pd.set_option('chained_assignment', 'raise') + + # work with the chain + expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB')) + df = DataFrame(np.arange(4).reshape(2, 2), + columns=list('AB'), dtype='int64') + self.assertIsNone(df.is_copy) + df['A'][0] = -5 + df['A'][1] = -6 + tm.assert_frame_equal(df, expected) + + # test with the chaining + df = DataFrame({'A': Series(range(2), dtype='int64'), + 'B': np.array(np.arange(2, 4), dtype=np.float64)}) + self.assertIsNone(df.is_copy) + + def f(): + df['A'][0] = -5 + + self.assertRaises(com.SettingWithCopyError, f) + + def f(): + df['A'][1] = np.nan + + self.assertRaises(com.SettingWithCopyError, f) + self.assertIsNone(df['A'].is_copy) + + # using a copy (the chain), fails + df = DataFrame({'A': Series(range(2), dtype='int64'), + 'B': np.array(np.arange(2, 4), dtype=np.float64)}) + + def f(): + df.loc[0]['A'] = -5 + + self.assertRaises(com.SettingWithCopyError, f) + + # doc example + df = DataFrame({'a': ['one', 'one', 'two', 'three', + 'two', 'one', 'six'], + 'c': Series(range(7), dtype='int64')}) + self.assertIsNone(df.is_copy) + expected = 
DataFrame({'a': ['one', 'one', 'two', 'three',
+                                    'two', 'one', 'six'],
+                              'c': [42, 42, 2, 3, 4, 42, 6]})
+
+        def f():
+            indexer = df.a.str.startswith('o')
+            df[indexer]['c'] = 42
+
+        self.assertRaises(com.SettingWithCopyError, f)
+
+        expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]})
+        df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
+
+        def f():
+            df['A'][0] = 111
+
+        self.assertRaises(com.SettingWithCopyError, f)
+
+        def f():
+            df.loc[0]['A'] = 111
+
+        self.assertRaises(com.SettingWithCopyError, f)
+
+        df.loc[0, 'A'] = 111
+        tm.assert_frame_equal(df, expected)
+
+        # make sure that is_copy is picked up on reconstruction
+        # GH5475
+        df = DataFrame({"A": [1, 2]})
+        self.assertIsNone(df.is_copy)
+        with tm.ensure_clean('__tmp__pickle') as path:
+            df.to_pickle(path)
+            df2 = pd.read_pickle(path)
+            df2["B"] = df2["A"]
+            df2["B"] = df2["A"]
+
+        # a spurious raise as we are setting the entire column here
+        # GH5597
+        from string import ascii_letters as letters
+
+        def random_text(nobs=100):
+            df = []
+            for i in range(nobs):
+                idx = np.random.randint(len(letters), size=2)
+                idx.sort()
+                df.append([letters[idx[0]:idx[1]]])
+
+            return DataFrame(df, columns=['letters'])
+
+        df = random_text(100000)
+
+        # always a copy
+        x = df.iloc[[0, 1, 2]]
+        self.assertIsNotNone(x.is_copy)
+        x = df.iloc[[0, 1, 2, 4]]
+        self.assertIsNotNone(x.is_copy)
+
+        # explicitly copy
+        indexer = df.letters.apply(lambda x: len(x) > 10)
+        df = df.ix[indexer].copy()
+        self.assertIsNone(df.is_copy)
+        df['letters'] = df['letters'].apply(str.lower)
+
+        # implicitly take
+        df = random_text(100000)
+        indexer = df.letters.apply(lambda x: len(x) > 10)
+        df = df.ix[indexer]
+        self.assertIsNotNone(df.is_copy)
+        df['letters'] = df['letters'].apply(str.lower)
+
+        # implicitly take 2
+        df = random_text(100000)
+        indexer = df.letters.apply(lambda x: len(x) > 10)
+        df = df.ix[indexer]
+        self.assertIsNotNone(df.is_copy)
+        df.loc[:, 'letters'] = df['letters'].apply(str.lower)
+
+        # should be ok even though it's a copy!
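+        # (the .loc assignment above cleared df.is_copy, as asserted
+        # next, so the plain column assignments that follow no longer
+        # trip the SettingWithCopy machinery)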
+ self.assertIsNone(df.is_copy) + df['letters'] = df['letters'].apply(str.lower) + self.assertIsNone(df.is_copy) + + df = random_text(100000) + indexer = df.letters.apply(lambda x: len(x) > 10) + df.ix[indexer, 'letters'] = df.ix[indexer, 'letters'].apply(str.lower) + + # an identical take, so no copy + df = DataFrame({'a': [1]}).dropna() + self.assertIsNone(df.is_copy) + df['a'] += 1 + + # inplace ops + # original from: + # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + a = [12, 23] + b = [123, None] + c = [1234, 2345] + d = [12345, 23456] + tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'), + ('ears', 'right')] + events = {('eyes', 'left'): a, + ('eyes', 'right'): b, + ('ears', 'left'): c, + ('ears', 'right'): d} + multiind = MultiIndex.from_tuples(tuples, names=['part', 'side']) + zed = DataFrame(events, index=['a', 'b'], columns=multiind) + + def f(): + zed['eyes']['right'].fillna(value=555, inplace=True) + + self.assertRaises(com.SettingWithCopyError, f) + + df = DataFrame(np.random.randn(10, 4)) + s = df.iloc[:, 0].sort_values() + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) + tm.assert_series_equal(s, df[0].sort_values()) + + # false positives GH6025 + df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]}) + str(df) + df['column1'] = df['column1'] + 'b' + str(df) + df = df[df['column2'] != 8] + str(df) + df['column1'] = df['column1'] + 'c' + str(df) + + # from SO: + # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc + df = DataFrame(np.arange(0, 9), columns=['count']) + df['group'] = 'b' + + def f(): + df.iloc[0:5]['group'] = 'a' + + self.assertRaises(com.SettingWithCopyError, f) + + # mixed type setting + # same dtype & changing dtype + df = DataFrame(dict(A=date_range('20130101', periods=5), + B=np.random.randn(5), + C=np.arange(5, dtype='int64'), + D=list('abcde'))) + + def f(): + df.ix[2]['D'] = 'foo' + + self.assertRaises(com.SettingWithCopyError, f) + + def f(): + df.ix[2]['C'] = 'foo' + + self.assertRaises(com.SettingWithCopyError, f) + + def f(): + df['C'][2] = 'foo' + + self.assertRaises(com.SettingWithCopyError, f) + + def test_setting_with_copy_bug(self): + + # operating on a copy + df = pd.DataFrame({'a': list(range(4)), + 'b': list('ab..'), + 'c': ['a', 'b', np.nan, 'd']}) + mask = pd.isnull(df.c) + + def f(): + df[['c']][mask] = df[['b']][mask] + + self.assertRaises(com.SettingWithCopyError, f) + + # invalid warning as we are returning a new object + # GH 8730 + df1 = DataFrame({'x': Series(['a', 'b', 'c']), + 'y': Series(['d', 'e', 'f'])}) + df2 = df1[['x']] + + # this should not raise + df2['y'] = ['g', 'h', 'i'] + + def test_detect_chained_assignment_warnings(self): + + # warnings + with option_context('chained_assignment', 'warn'): + df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]}) + with tm.assert_produces_warning( + expected_warning=com.SettingWithCopyWarning): + df.loc[0]['A'] = 111 diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py new file mode 100644 index 0000000000000..1c4e5772d316f --- /dev/null +++ b/pandas/tests/indexing/test_datetime.py @@ -0,0 +1,192 @@ +import numpy as np +import pandas as pd +from pandas import date_range, Index, DataFrame, Series, Timestamp +from pandas.util import testing as tm + + +class TestDatetimeIndex(tm.TestCase): + + def test_indexing_with_datetime_tz(self): + + # 8260 + # support datetime64 with tz + + idx = 
Index(date_range('20130101', periods=3, tz='US/Eastern'), + name='foo') + dr = date_range('20130110', periods=3) + df = DataFrame({'A': idx, 'B': dr}) + df['C'] = idx + df.iloc[1, 1] = pd.NaT + df.iloc[1, 2] = pd.NaT + + # indexing + result = df.iloc[1] + expected = Series([Timestamp('2013-01-02 00:00:00-0500', + tz='US/Eastern'), np.nan, np.nan], + index=list('ABC'), dtype='object', name=1) + tm.assert_series_equal(result, expected) + result = df.loc[1] + expected = Series([Timestamp('2013-01-02 00:00:00-0500', + tz='US/Eastern'), np.nan, np.nan], + index=list('ABC'), dtype='object', name=1) + tm.assert_series_equal(result, expected) + + # indexing - fast_xs + df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) + result = df.iloc[5] + expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D') + self.assertEqual(result, expected) + + result = df.loc[5] + self.assertEqual(result, expected) + + # indexing - boolean + result = df[df.a > df.a[3]] + expected = df.iloc[4:] + tm.assert_frame_equal(result, expected) + + # indexing - setting an element + df = DataFrame(data=pd.to_datetime( + ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time']) + df['new_col'] = ['new', 'old'] + df.time = df.set_index('time').index.tz_localize('UTC') + v = df[df.new_col == 'new'].set_index('time').index.tz_convert( + 'US/Pacific') + + # trying to set a single element on a part of a different timezone + def f(): + df.loc[df.new_col == 'new', 'time'] = v + + self.assertRaises(ValueError, f) + + v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') + df.loc[df.new_col == 'new', 'time'] = v + tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) + + def test_indexing_with_datetimeindex_tz(self): + + # GH 12050 + # indexing on a series with a datetimeindex with tz + index = pd.date_range('2015-01-01', periods=2, tz='utc') + + ser = pd.Series(range(2), index=index, + dtype='int64') + + # list-like indexing + + for sel in (index, list(index)): + # getitem + tm.assert_series_equal(ser[sel], ser) + + # setitem + result = ser.copy() + result[sel] = 1 + expected = pd.Series(1, index=index) + tm.assert_series_equal(result, expected) + + # .loc getitem + tm.assert_series_equal(ser.loc[sel], ser) + + # .loc setitem + result = ser.copy() + result.loc[sel] = 1 + expected = pd.Series(1, index=index) + tm.assert_series_equal(result, expected) + + # single element indexing + + # getitem + self.assertEqual(ser[index[1]], 1) + + # setitem + result = ser.copy() + result[index[1]] = 5 + expected = pd.Series([0, 5], index=index) + tm.assert_series_equal(result, expected) + + # .loc getitem + self.assertEqual(ser.loc[index[1]], 1) + + # .loc setitem + result = ser.copy() + result.loc[index[1]] = 5 + expected = pd.Series([0, 5], index=index) + tm.assert_series_equal(result, expected) + + def test_partial_setting_with_datetimelike_dtype(self): + + # GH9478 + # a datetimeindex alignment issue with partial setting + df = pd.DataFrame(np.arange(6.).reshape(3, 2), columns=list('AB'), + index=pd.date_range('1/1/2000', periods=3, + freq='1H')) + expected = df.copy() + expected['C'] = [expected.index[0]] + [pd.NaT, pd.NaT] + + mask = df.A < 1 + df.loc[mask, 'C'] = df.loc[mask].index + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_datetime(self): + + # GH 9516 + dt1 = Timestamp('20130101 09:00:00') + dt2 = Timestamp('20130101 10:00:00') + + for conv in [lambda x: x, lambda x: x.to_datetime64(), + lambda x: x.to_pydatetime(), lambda x: np.datetime64(x)]: + + df = pd.DataFrame() + 
df.loc[conv(dt1), 'one'] = 100 + df.loc[conv(dt2), 'one'] = 200 + + expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2]) + tm.assert_frame_equal(df, expected) + + def test_series_partial_set_datetime(self): + # GH 11497 + + idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx') + ser = Series([0.1, 0.2], index=idx, name='s') + + result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] + exp = Series([0.1, 0.2], index=idx, name='s') + tm.assert_series_equal(result, exp, check_index_type=True) + + keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), + Timestamp('2011-01-01')] + exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), + name='s') + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), + Timestamp('2011-01-03')] + exp = Series([np.nan, 0.2, np.nan], + index=pd.DatetimeIndex(keys, name='idx'), name='s') + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + def test_series_partial_set_period(self): + # GH 11497 + + idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') + ser = Series([0.1, 0.2], index=idx, name='s') + + result = ser.loc[[pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-02', freq='D')]] + exp = Series([0.1, 0.2], index=idx, name='s') + tm.assert_series_equal(result, exp, check_index_type=True) + + keys = [pd.Period('2011-01-02', freq='D'), + pd.Period('2011-01-02', freq='D'), + pd.Period('2011-01-01', freq='D')] + exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), + name='s') + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + keys = [pd.Period('2011-01-03', freq='D'), + pd.Period('2011-01-02', freq='D'), + pd.Period('2011-01-03', freq='D')] + exp = Series([np.nan, 0.2, np.nan], + index=pd.PeriodIndex(keys, name='idx'), name='s') + result = ser.loc[keys] + tm.assert_series_equal(result, exp) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 8f0fa2d56113b..99e7460b2a3de 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -709,3 +709,160 @@ def test_floating_tuples(self): result = s[0.0] expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name='foo') assert_series_equal(result, expected) + + def test_float64index_slicing_bug(self): + # GH 5557, related to slicing a float index + ser = {256: 2321.0, + 1: 78.0, + 2: 2716.0, + 3: 0.0, + 4: 369.0, + 5: 0.0, + 6: 269.0, + 7: 0.0, + 8: 0.0, + 9: 0.0, + 10: 3536.0, + 11: 0.0, + 12: 24.0, + 13: 0.0, + 14: 931.0, + 15: 0.0, + 16: 101.0, + 17: 78.0, + 18: 9643.0, + 19: 0.0, + 20: 0.0, + 21: 0.0, + 22: 63761.0, + 23: 0.0, + 24: 446.0, + 25: 0.0, + 26: 34773.0, + 27: 0.0, + 28: 729.0, + 29: 78.0, + 30: 0.0, + 31: 0.0, + 32: 3374.0, + 33: 0.0, + 34: 1391.0, + 35: 0.0, + 36: 361.0, + 37: 0.0, + 38: 61808.0, + 39: 0.0, + 40: 0.0, + 41: 0.0, + 42: 6677.0, + 43: 0.0, + 44: 802.0, + 45: 0.0, + 46: 2691.0, + 47: 0.0, + 48: 3582.0, + 49: 0.0, + 50: 734.0, + 51: 0.0, + 52: 627.0, + 53: 70.0, + 54: 2584.0, + 55: 0.0, + 56: 324.0, + 57: 0.0, + 58: 605.0, + 59: 0.0, + 60: 0.0, + 61: 0.0, + 62: 3989.0, + 63: 10.0, + 64: 42.0, + 65: 0.0, + 66: 904.0, + 67: 0.0, + 68: 88.0, + 69: 70.0, + 70: 8172.0, + 71: 0.0, + 72: 0.0, + 73: 0.0, + 74: 64902.0, + 75: 0.0, + 76: 347.0, + 77: 0.0, + 78: 36605.0, + 79: 0.0, + 80: 379.0, + 81: 70.0, + 82: 0.0, + 83: 0.0, + 84: 3001.0, + 85: 0.0, + 86: 1630.0, + 87: 7.0, + 88: 364.0, + 89: 0.0, + 90: 67404.0, + 91: 
9.0, + 92: 0.0, + 93: 0.0, + 94: 7685.0, + 95: 0.0, + 96: 1017.0, + 97: 0.0, + 98: 2831.0, + 99: 0.0, + 100: 2963.0, + 101: 0.0, + 102: 854.0, + 103: 0.0, + 104: 0.0, + 105: 0.0, + 106: 0.0, + 107: 0.0, + 108: 0.0, + 109: 0.0, + 110: 0.0, + 111: 0.0, + 112: 0.0, + 113: 0.0, + 114: 0.0, + 115: 0.0, + 116: 0.0, + 117: 0.0, + 118: 0.0, + 119: 0.0, + 120: 0.0, + 121: 0.0, + 122: 0.0, + 123: 0.0, + 124: 0.0, + 125: 0.0, + 126: 67744.0, + 127: 22.0, + 128: 264.0, + 129: 0.0, + 260: 197.0, + 268: 0.0, + 265: 0.0, + 269: 0.0, + 261: 0.0, + 266: 1198.0, + 267: 0.0, + 262: 2629.0, + 258: 775.0, + 257: 0.0, + 263: 0.0, + 259: 0.0, + 264: 163.0, + 250: 10326.0, + 251: 0.0, + 252: 1228.0, + 253: 0.0, + 254: 2769.0, + 255: 0.0} + + # smoke test for the repr + s = Series(ser) + result = s.value_counts() + str(result) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b06b1067b7c6b..f7a4af711bbb8 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=W0612,E1101 -import sys -import nose import itertools import warnings from warnings import catch_warnings @@ -10,22 +8,21 @@ from pandas.types.common import (is_integer_dtype, is_float_dtype, is_scalar) -from pandas.compat import range, lrange, lzip, StringIO, lmap, map +from pandas.compat import range, lrange, lzip, StringIO, lmap from pandas.tslib import NaT from numpy import nan from numpy.random import randn import numpy as np import pandas as pd -import pandas.core.common as com from pandas import option_context from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, MultiIndex, Timestamp, Timedelta, UInt64Index) from pandas.formats.printing import pprint_thing from pandas import concat -from pandas.core.common import PerformanceWarning, UnsortedIndexError - +from pandas.core.common import PerformanceWarning +from pandas.tests.indexing.common import _mklbl import pandas.util.testing as tm from pandas import date_range @@ -93,10 +90,6 @@ def _axify(obj, key, axis): return tuple(axes) -def _mklbl(prefix, n): - return ["%s%s" % (prefix, i) for i in range(n)] - - class TestIndexing(tm.TestCase): _objs = set(['series', 'frame', 'panel']) @@ -665,40 +658,6 @@ def test_iloc_getitem_slice_dups(self): tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) - def test_iloc_getitem_multiindex2(self): - # TODO(wesm): fix this - raise nose.SkipTest('this test was being suppressed, ' - 'needs to be fixed') - - arr = np.random.randn(3, 3) - df = DataFrame(arr, columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) - - rs = df.iloc[2] - xp = Series(arr[2], index=df.columns) - tm.assert_series_equal(rs, xp) - - rs = df.iloc[:, 2] - xp = Series(arr[:, 2], index=df.index) - tm.assert_series_equal(rs, xp) - - rs = df.iloc[2, 2] - xp = df.values[2, 2] - self.assertEqual(rs, xp) - - # for multiple items - # GH 5528 - rs = df.iloc[[0, 1]] - xp = df.xs(4, drop_level=False) - tm.assert_frame_equal(rs, xp) - - tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) - index = MultiIndex.from_tuples(tup) - df = DataFrame(np.random.randn(4, 4), index=index) - rs = df.iloc[[2, 3]] - xp = df.xs('b', drop_level=False) - tm.assert_frame_equal(rs, xp) - def test_iloc_setitem(self): df = self.frame_ints @@ -872,210 +831,6 @@ def compare(result, expected): result2 = s.loc[0:3] 
tm.assert_series_equal(result1, result2) - def test_setitem_multiindex(self): - for index_fn in ('ix', 'loc'): - - def check(target, indexers, value, compare_fn, expected=None): - fn = getattr(target, index_fn) - fn.__setitem__(indexers, value) - result = fn.__getitem__(indexers) - if expected is None: - expected = value - compare_fn(result, expected) - # GH7190 - index = pd.MultiIndex.from_product([np.arange(0, 100), - np.arange(0, 80)], - names=['time', 'firm']) - t, n = 0, 2 - df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=0, - compare_fn=self.assertEqual) - - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=1, - compare_fn=self.assertEqual) - - df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=2, - compare_fn=self.assertEqual) - - # GH 7218, assinging with 0-dim arrays - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, - indexers=((t, n), 'X'), - value=np.array(3), - compare_fn=self.assertEqual, - expected=3, ) - - # GH5206 - df = pd.DataFrame(np.arange(25).reshape(5, 5), - columns='A,B,C,D,E'.split(','), dtype=float) - df['F'] = 99 - row_selection = df['A'] % 2 == 0 - col_selection = ['B', 'C'] - with catch_warnings(record=True): - df.ix[row_selection, col_selection] = df['F'] - output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) - with catch_warnings(record=True): - tm.assert_frame_equal(df.ix[row_selection, col_selection], - output) - check(target=df, - indexers=(row_selection, col_selection), - value=df['F'], - compare_fn=tm.assert_frame_equal, - expected=output, ) - - # GH11372 - idx = pd.MultiIndex.from_product([ - ['A', 'B', 'C'], - pd.date_range('2015-01-01', '2015-04-01', freq='MS')]) - cols = pd.MultiIndex.from_product([ - ['foo', 'bar'], - pd.date_range('2016-01-01', '2016-02-01', freq='MS')]) - - df = pd.DataFrame(np.random.random((12, 4)), - index=idx, columns=cols) - - subidx = pd.MultiIndex.from_tuples( - [('A', pd.Timestamp('2015-01-01')), - ('A', pd.Timestamp('2015-02-01'))]) - subcols = pd.MultiIndex.from_tuples( - [('foo', pd.Timestamp('2016-01-01')), - ('foo', pd.Timestamp('2016-02-01'))]) - - vals = pd.DataFrame(np.random.random((2, 2)), - index=subidx, columns=subcols) - check(target=df, - indexers=(subidx, subcols), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # set all columns - vals = pd.DataFrame( - np.random.random((2, 4)), index=subidx, columns=cols) - check(target=df, - indexers=(subidx, slice(None, None, None)), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # identity - copy = df.copy() - check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=tm.assert_frame_equal, expected=copy) - - def test_indexing_with_datetime_tz(self): - - # 8260 - # support datetime64 with tz - - idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') - dr = date_range('20130110', periods=3) - df = DataFrame({'A': idx, 'B': dr}) - df['C'] = idx - df.iloc[1, 1] = pd.NaT - df.iloc[1, 2] = pd.NaT - - # indexing - result = df.iloc[1] - expected = Series([Timestamp('2013-01-02 00:00:00-0500', - tz='US/Eastern'), np.nan, np.nan], - index=list('ABC'), dtype='object', name=1) - tm.assert_series_equal(result, expected) - result = df.loc[1] - expected = Series([Timestamp('2013-01-02 
00:00:00-0500', - tz='US/Eastern'), np.nan, np.nan], - index=list('ABC'), dtype='object', name=1) - tm.assert_series_equal(result, expected) - - # indexing - fast_xs - df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) - result = df.iloc[5] - expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D') - self.assertEqual(result, expected) - - result = df.loc[5] - self.assertEqual(result, expected) - - # indexing - boolean - result = df[df.a > df.a[3]] - expected = df.iloc[4:] - tm.assert_frame_equal(result, expected) - - # indexing - setting an element - df = DataFrame(data=pd.to_datetime( - ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time']) - df['new_col'] = ['new', 'old'] - df.time = df.set_index('time').index.tz_localize('UTC') - v = df[df.new_col == 'new'].set_index('time').index.tz_convert( - 'US/Pacific') - - # trying to set a single element on a part of a different timezone - def f(): - df.loc[df.new_col == 'new', 'time'] = v - - self.assertRaises(ValueError, f) - - v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') - df.loc[df.new_col == 'new', 'time'] = v - tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) - - def test_indexing_with_datetimeindex_tz(self): - - # GH 12050 - # indexing on a series with a datetimeindex with tz - index = pd.date_range('2015-01-01', periods=2, tz='utc') - - ser = pd.Series(range(2), index=index, - dtype='int64') - - # list-like indexing - - for sel in (index, list(index)): - # getitem - tm.assert_series_equal(ser[sel], ser) - - # setitem - result = ser.copy() - result[sel] = 1 - expected = pd.Series(1, index=index) - tm.assert_series_equal(result, expected) - - # .loc getitem - tm.assert_series_equal(ser.loc[sel], ser) - - # .loc setitem - result = ser.copy() - result.loc[sel] = 1 - expected = pd.Series(1, index=index) - tm.assert_series_equal(result, expected) - - # single element indexing - - # getitem - self.assertEqual(ser[index[1]], 1) - - # setitem - result = ser.copy() - result[index[1]] = 5 - expected = pd.Series([0, 5], index=index) - tm.assert_series_equal(result, expected) - - # .loc getitem - self.assertEqual(ser.loc[index[1]], 1) - - # .loc setitem - result = ser.copy() - result.loc[index[1]] = 5 - expected = pd.Series([0, 5], index=index) - tm.assert_series_equal(result, expected) - def test_loc_setitem_dups(self): # GH 6541 @@ -1241,28 +996,6 @@ def test_loc_getitem_label_array_like(self): self.check_result('array like', 'loc', Series(index=[4, 8, 12]).index, 'ix', [4, 8, 12], typs=['ints', 'uints'], axes=2) - def test_loc_getitem_series(self): - # GH14730 - # passing a series as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) - x = Series(index=index, data=range(9), dtype=np.float64) - y = Series([1, 3]) - expected = Series( - data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) - result = x.loc[y] - tm.assert_series_equal(result, expected) - - result = x.loc[[1, 3]] - tm.assert_series_equal(result, expected) - - empty = Series(data=[], dtype=np.float64) - expected = Series([], index=MultiIndex( - levels=index.levels, labels=[[], []], dtype=np.float64)) - result = x.loc[empty] - tm.assert_series_equal(result, expected) - def test_loc_getitem_bool(self): # boolean indexers b = [True, False, True, False] @@ -1700,136 +1433,6 @@ def test_iloc_getitem_labelled_frame(self): # trying to use a label self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j', 'D'])) - def 
test_iloc_getitem_panel(self): - - # GH 7189 - p = Panel(np.arange(4 * 3 * 2).reshape(4, 3, 2), - items=['A', 'B', 'C', 'D'], - major_axis=['a', 'b', 'c'], - minor_axis=['one', 'two']) - - result = p.iloc[1] - expected = p.loc['B'] - tm.assert_frame_equal(result, expected) - - result = p.iloc[1, 1] - expected = p.loc['B', 'b'] - tm.assert_series_equal(result, expected) - - result = p.iloc[1, 1, 1] - expected = p.loc['B', 'b', 'two'] - self.assertEqual(result, expected) - - # slice - result = p.iloc[1:3] - expected = p.loc[['B', 'C']] - tm.assert_panel_equal(result, expected) - - result = p.iloc[:, 0:2] - expected = p.loc[:, ['a', 'b']] - tm.assert_panel_equal(result, expected) - - # list of integers - result = p.iloc[[0, 2]] - expected = p.loc[['A', 'C']] - tm.assert_panel_equal(result, expected) - - # neg indicies - result = p.iloc[[-1, 1], [-1, 1]] - expected = p.loc[['D', 'B'], ['c', 'b']] - tm.assert_panel_equal(result, expected) - - # dups indicies - result = p.iloc[[-1, -1, 1], [-1, 1]] - expected = p.loc[['D', 'D', 'B'], ['c', 'b']] - tm.assert_panel_equal(result, expected) - - # combined - result = p.iloc[0, [True, True], [0, 1]] - expected = p.loc['A', ['a', 'b'], ['one', 'two']] - tm.assert_frame_equal(result, expected) - - # out-of-bounds exception - self.assertRaises(IndexError, p.iloc.__getitem__, tuple([10, 5])) - - def f(): - p.iloc[0, [True, True], [0, 1, 2]] - - self.assertRaises(IndexError, f) - - # trying to use a label - self.assertRaises(ValueError, p.iloc.__getitem__, tuple(['j', 'D'])) - - # GH - p = Panel( - np.random.rand(4, 3, 2), items=['A', 'B', 'C', 'D'], - major_axis=['U', 'V', 'W'], minor_axis=['X', 'Y']) - expected = p['A'] - - result = p.iloc[0, :, :] - tm.assert_frame_equal(result, expected) - - result = p.iloc[0, [True, True, True], :] - tm.assert_frame_equal(result, expected) - - result = p.iloc[0, [True, True, True], [0, 1]] - tm.assert_frame_equal(result, expected) - - def f(): - p.iloc[0, [True, True, True], [0, 1, 2]] - - self.assertRaises(IndexError, f) - - def f(): - p.iloc[0, [True, True, True], [2]] - - self.assertRaises(IndexError, f) - - def test_iloc_getitem_panel_multiindex(self): - # GH 7199 - # Panel with multi-index - multi_index = pd.MultiIndex.from_tuples([('ONE', 'one'), - ('TWO', 'two'), - ('THREE', 'three')], - names=['UPPER', 'lower']) - - simple_index = [x[0] for x in multi_index] - wd1 = Panel(items=['First', 'Second'], major_axis=['a', 'b', 'c', 'd'], - minor_axis=multi_index) - - wd2 = Panel(items=['First', 'Second'], major_axis=['a', 'b', 'c', 'd'], - minor_axis=simple_index) - - expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] - result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG - tm.assert_frame_equal(result1, expected1) - - expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] - result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] - tm.assert_frame_equal(result2, expected2) - - expected1 = DataFrame(index=['a'], columns=multi_index, - dtype='float64') - result1 = wd1.iloc[0, [0], [0, 1, 2]] - tm.assert_frame_equal(result1, expected1) - - expected2 = DataFrame(index=['a'], columns=simple_index, - dtype='float64') - result2 = wd2.iloc[0, [0], [0, 1, 2]] - tm.assert_frame_equal(result2, expected2) - - # GH 7516 - mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) - p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), - items=['a', 'b', 'c'], major_axis=mi, - minor_axis=['u', 'v', 'w']) - result = p.iloc[:, 1, 0] - expected = Series([3, 12, 21], index=['a', 'b', 
'c'], name='u') - tm.assert_series_equal(result, expected) - - result = p.loc[:, (1, 'y'), 'u'] - tm.assert_series_equal(result, expected) - def test_iloc_getitem_doc_issue(self): # multi axis slicing issue with single block @@ -1956,929 +1559,90 @@ def test_iloc_setitem_list_of_lists(self): expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) tm.assert_frame_equal(df, expected) - df = DataFrame( - dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) - df.iloc[2:4] = [['x', 11], ['y', 13]] - expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], - B=[5, 6, 11, 13, 9])) - tm.assert_frame_equal(df, expected) - - def test_iloc_getitem_multiindex(self): - mi_labels = DataFrame(np.random.randn(4, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j', 'k'], - ['X', 'X', 'Y', 'Y']]) - - mi_int = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) - - # the first row - rs = mi_int.iloc[0] - with catch_warnings(record=True): - xp = mi_int.ix[4].ix[8] - tm.assert_series_equal(rs, xp, check_names=False) - self.assertEqual(rs.name, (4, 8)) - self.assertEqual(xp.name, 8) - - # 2nd (last) columns - rs = mi_int.iloc[:, 2] - with catch_warnings(record=True): - xp = mi_int.ix[:, 2] - tm.assert_series_equal(rs, xp) - - # corner column - rs = mi_int.iloc[2, 2] - with catch_warnings(record=True): - xp = mi_int.ix[:, 2].ix[2] - self.assertEqual(rs, xp) - - # this is basically regular indexing - rs = mi_labels.iloc[2, 2] - with catch_warnings(record=True): - xp = mi_labels.ix['j'].ix[:, 'j'].ix[0, 0] - self.assertEqual(rs, xp) - - def test_loc_multiindex(self): - - mi_labels = DataFrame(np.random.randn(3, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) - - mi_int = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) - - # the first row - rs = mi_labels.loc['i'] - with catch_warnings(record=True): - xp = mi_labels.ix['i'] - tm.assert_frame_equal(rs, xp) - - # 2nd (last) columns - rs = mi_labels.loc[:, 'j'] - with catch_warnings(record=True): - xp = mi_labels.ix[:, 'j'] - tm.assert_frame_equal(rs, xp) - - # corner column - rs = mi_labels.loc['j'].loc[:, 'j'] - with catch_warnings(record=True): - xp = mi_labels.ix['j'].ix[:, 'j'] - tm.assert_frame_equal(rs, xp) - - # with a tuple - rs = mi_labels.loc[('i', 'X')] - with catch_warnings(record=True): - xp = mi_labels.ix[('i', 'X')] - tm.assert_frame_equal(rs, xp) - - rs = mi_int.loc[4] - with catch_warnings(record=True): - xp = mi_int.ix[4] - tm.assert_frame_equal(rs, xp) - - def test_loc_multiindex_indexer_none(self): - - # GH6788 - # multi-index indexer is None (meaning take all) - attributes = ['Attribute' + str(i) for i in range(1)] - attribute_values = ['Value' + str(i) for i in range(5)] - - index = MultiIndex.from_product([attributes, attribute_values]) - df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 - df = DataFrame(df, columns=index) - result = df[attributes] - tm.assert_frame_equal(result, df) - - # GH 7349 - # loc with a multi-index seems to be doing fallback - df = DataFrame(np.arange(12).reshape(-1, 1), - index=pd.MultiIndex.from_product([[1, 2, 3, 4], - [1, 2, 3]])) - - expected = df.loc[([1, 2], ), :] - result = df.loc[[1, 2]] - tm.assert_frame_equal(result, expected) - - def test_loc_multiindex_incomplete(self): - - # GH 7399 - # incomplete indexers - s = pd.Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) - expected = 
s.loc[:, 'a':'c'] - - result = s.loc[0:4, 'a':'c'] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected) - - result = s.loc[:4, 'a':'c'] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected) - - result = s.loc[0:, 'a':'c'] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected) - - # GH 7400 - # multiindexer gettitem with list of indexers skips wrong element - s = pd.Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) - expected = s.iloc[[6, 7, 8, 12, 13, 14]] - result = s.loc[2:4:2, 'a':'c'] - tm.assert_series_equal(result, expected) - - def test_multiindex_perf_warn(self): - - if sys.version_info < (2, 7): - raise nose.SkipTest('python version < 2.7') - - df = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}).set_index(['jim', 'joe']) - - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.index]): - df.loc[(1, 'z')] - - df = df.iloc[[2, 1, 3, 0]] - with tm.assert_produces_warning(PerformanceWarning): - df.loc[(0, )] - - def test_series_getitem_multiindex(self): - - # GH 6018 - # series regression getitem with a multi-index - - s = Series([1, 2, 3]) - s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)]) - - result = s[:, 0] - expected = Series([1], index=[0]) - tm.assert_series_equal(result, expected) - - result = s.loc[:, 1] - expected = Series([2, 3], index=[1, 2]) - tm.assert_series_equal(result, expected) - - # xs - result = s.xs(0, level=0) - expected = Series([1], index=[0]) - tm.assert_series_equal(result, expected) - - result = s.xs(1, level=1) - expected = Series([2, 3], index=[1, 2]) - tm.assert_series_equal(result, expected) - - # GH6258 - dt = list(date_range('20130903', periods=3)) - idx = MultiIndex.from_product([list('AB'), dt]) - s = Series([1, 3, 4, 1, 3, 4], index=idx) - - result = s.xs('20130903', level=1) - expected = Series([1, 1], index=list('AB')) - tm.assert_series_equal(result, expected) - - # GH5684 - idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')]) - s = Series([1, 2, 3, 4], index=idx) - s.index.set_names(['L1', 'L2'], inplace=True) - result = s.xs('one', level='L2') - expected = Series([1, 3], index=['a', 'b']) - expected.index.set_names(['L1'], inplace=True) - tm.assert_series_equal(result, expected) - - def test_ix_general(self): - - # ix general issues - - # GH 2817 - data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, - 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, - 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} - df = DataFrame(data).set_index(keys=['col', 'year']) - key = 4.0, 2012 - - # emits a PerformanceWarning, ok - with self.assert_produces_warning(PerformanceWarning): - tm.assert_frame_equal(df.loc[key], df.iloc[2:]) - - # this is ok - df.sort_index(inplace=True) - res = df.loc[key] - - # col has float dtype, result should be Float64Index - index = MultiIndex.from_arrays([[4.] 
* 3, [2012] * 3], - names=['col', 'year']) - expected = DataFrame({'amount': [222, 333, 444]}, index=index) - tm.assert_frame_equal(res, expected) - - def test_ix_weird_slicing(self): - # http://stackoverflow.com/q/17056560/1240268 - df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], - 'two': [1, 2, 3, 4, 5]}) - df.loc[df['one'] > 1, 'two'] = -df['two'] - - expected = DataFrame({'one': {0: 1.0, - 1: 2.0, - 2: 3.0, - 3: nan, - 4: nan}, - 'two': {0: 1, - 1: -2, - 2: -3, - 3: 4, - 4: 5}}) - tm.assert_frame_equal(df, expected) - - def test_xs_multiindex(self): - - # GH2903 - columns = MultiIndex.from_tuples( - [('a', 'foo'), ('a', 'bar'), ('b', 'hello'), - ('b', 'world')], names=['lvl0', 'lvl1']) - df = DataFrame(np.random.randn(4, 4), columns=columns) - df.sort_index(axis=1, inplace=True) - result = df.xs('a', level='lvl0', axis=1) - expected = df.iloc[:, 0:2].loc[:, 'a'] - tm.assert_frame_equal(result, expected) - - result = df.xs('foo', level='lvl1', axis=1) - expected = df.iloc[:, 1:2].copy() - expected.columns = expected.columns.droplevel('lvl1') - tm.assert_frame_equal(result, expected) - - def test_per_axis_per_level_getitem(self): - - # GH6134 - # example test case - ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( - 'C', 4), _mklbl('D', 2)]) - df = DataFrame(np.arange(len(ix.get_values())), index=ix) - - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C2' or c == 'C3')]] - result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] - tm.assert_frame_equal(result, expected) - - # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - - df = DataFrame( - np.arange(16, dtype='int64').reshape( - 4, 4), index=index, columns=columns) - df = df.sort_index(axis=0).sort_index(axis=1) - - # identity - result = df.loc[(slice(None), slice(None)), :] - tm.assert_frame_equal(result, df) - result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] - tm.assert_frame_equal(result, df) - result = df.loc[:, (slice(None), slice(None))] - tm.assert_frame_equal(result, df) - - # index - result = df.loc[(slice(None), [1]), :] - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), 1), :] - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - # columns - result = df.loc[:, (slice(None), ['foo'])] - expected = df.iloc[:, [1, 3]] - tm.assert_frame_equal(result, expected) - - # both - result = df.loc[(slice(None), 1), (slice(None), ['foo'])] - expected = df.iloc[[0, 3], [1, 3]] - tm.assert_frame_equal(result, expected) - - result = df.loc['A', 'a'] - expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), - index=Index([1, 2, 3], name='two'), - columns=Index(['bar', 'foo'], name='lvl1')) - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), [1, 2]), :] - expected = df.iloc[[0, 1, 3]] - tm.assert_frame_equal(result, expected) - - # multi-level series - s = 
Series(np.arange(len(ix.get_values())), index=ix) - result = s.loc['A1':'A3', :, ['C1', 'C3']] - expected = s.loc[[tuple([a, b, c, d]) - for a, b, c, d in s.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_series_equal(result, expected) - - # boolean indexers - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] - expected = df.iloc[[2, 3]] - tm.assert_frame_equal(result, expected) - - def f(): - df.loc[(slice(None), np.array([True, False])), :] - - self.assertRaises(ValueError, f) - - # ambiguous cases - # these can be multiply interpreted (e.g. in this case - # as df.loc[slice(None),[1]] as well - self.assertRaises(KeyError, lambda: df.loc[slice(None), [1]]) - - result = df.loc[(slice(None), [1]), :] - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - # not lexsorted - self.assertEqual(df.index.lexsort_depth, 2) - df = df.sort_index(level=1, axis=0) - self.assertEqual(df.index.lexsort_depth, 0) - with tm.assertRaisesRegexp( - UnsortedIndexError, - 'MultiIndex Slicing requires the index to be fully ' - r'lexsorted tuple len \(2\), lexsort depth \(0\)'): - df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] - - def test_multiindex_slicers_non_unique(self): - - # GH 7106 - # non-unique mi index support - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 3], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) - self.assertFalse(df.index.is_unique) - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) - result = df.loc[(slice(None), slice(None), 1), :] - tm.assert_frame_equal(result, expected) - - # this is equivalent of an xs expression - result = df.xs(1, level=2, drop_level=False) - tm.assert_frame_equal(result, expected) - - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 2], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) - self.assertFalse(df.index.is_unique) - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) - result = df.loc[(slice(None), slice(None), 1), :] - self.assertFalse(result.index.is_unique) - tm.assert_frame_equal(result, expected) - - # GH12896 - # numpy-implementation dependent bug - ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, - 17, 18, 19, 200000, 200000] - n = len(ints) - idx = MultiIndex.from_arrays([['a'] * n, ints]) - result = Series([1] * n, index=idx) - result = result.sort_index() - result = result.loc[(slice(None), slice(100000))] - expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() - tm.assert_series_equal(result, expected) - - def test_multiindex_slicers_datetimelike(self): - - # GH 7429 - # buggy/inconsistent behavior when slicing with datetime-like - import datetime - dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) + - datetime.timedelta(days=i) for i in range(6)] - freq = [1, 2] - index = MultiIndex.from_product( - [dates, freq], names=['date', 'frequency']) - - df = DataFrame( - np.arange(6 * 2 * 4, dtype='int64').reshape( - -1, 4), index=index, columns=list('ABCD')) - - # multi-axis slicing - idx = pd.IndexSlice - expected = df.iloc[[0, 2, 4], [0, 1]] - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), - slice(1, 1)), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(idx[Timestamp('2012-01-01 
12:12:12'):Timestamp( - '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), 1), - slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - # with strings - result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), - slice(1, 1)), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), - idx['A', 'B']] - tm.assert_frame_equal(result, expected) - - def test_multiindex_slicers_edges(self): - # GH 8132 - # various edge cases - df = DataFrame( - {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5, - 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3, - 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", - "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", - "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", - "2013-07-09", "2013-08-06", "2013-09-03"], - 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]}) - - df['DATE'] = pd.to_datetime(df['DATE']) - df1 = df.set_index(['A', 'B', 'DATE']) - df1 = df1.sort_index() - - # A1 - Get all values under "A0" and "A1" - result = df1.loc[(slice('A1')), :] - expected = df1.iloc[0:10] - tm.assert_frame_equal(result, expected) - - # A2 - Get all values from the start to "A2" - result = df1.loc[(slice('A2')), :] - expected = df1 - tm.assert_frame_equal(result, expected) - - # A3 - Get all values under "B1" or "B2" - result = df1.loc[(slice(None), slice('B1', 'B2')), :] - expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] - tm.assert_frame_equal(result, expected) - - # A4 - Get all values between 2013-07-02 and 2013-07-09 - result = df1.loc[(slice(None), slice(None), - slice('20130702', '20130709')), :] - expected = df1.iloc[[1, 2, 6, 7, 12]] - tm.assert_frame_equal(result, expected) - - # B1 - Get all values in B0 that are also under A0, A1 and A2 - result = df1.loc[(slice('A2'), slice('B0')), :] - expected = df1.iloc[[0, 1, 5, 6, 10, 11]] - tm.assert_frame_equal(result, expected) - - # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for - # the As) - result = df1.loc[(slice(None), slice('B2')), :] - expected = df1 - tm.assert_frame_equal(result, expected) - - # B3 - Get all values from B1 to B2 and up to 2013-08-06 - result = df1.loc[(slice(None), slice('B1', 'B2'), - slice('2013-08-06')), :] - expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] - tm.assert_frame_equal(result, expected) - - # B4 - Same as A4 but the start of the date slice is not a key. 
- # shows indexing on a partial selection slice - result = df1.loc[(slice(None), slice(None), - slice('20130701', '20130709')), :] - expected = df1.iloc[[1, 2, 6, 7, 12]] - tm.assert_frame_equal(result, expected) - - def test_per_axis_per_level_doc_examples(self): - - # test index maker - idx = pd.IndexSlice - - # from indexing.rst / advanced - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - index=index, columns=columns) - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx[:, :, ['C1', 'C3']], :] - tm.assert_frame_equal(result, expected) - - # not sorted - def f(): - df.loc['A1', (slice(None), 'foo')] - - self.assertRaises(UnsortedIndexError, f) - df = df.sort_index(axis=1) - - # slicing - df.loc['A1', (slice(None), 'foo')] - df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')] - - # setitem - df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 - - def test_loc_axis_arguments(self): - - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - index=index, - columns=columns).sort_index().sort_index(axis=1) - - # axis 0 - result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - result = df.loc(axis='index')[:, :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - # axis 1 - result = df.loc(axis=1)[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] - tm.assert_frame_equal(result, expected) - - result = df.loc(axis='columns')[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] - tm.assert_frame_equal(result, expected) - - # invalid axis - def f(): - df.loc(axis=-1)[:, :, ['C1', 'C3']] - - self.assertRaises(ValueError, f) - - def f(): - df.loc(axis=2)[:, :, ['C1', 'C3']] - - self.assertRaises(ValueError, f) - - def f(): - df.loc(axis='foo')[:, :, ['C1', 'C3']] - - self.assertRaises(ValueError, f) - - def test_loc_coerceion(self): - - # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) - expected = df.dtypes - - result = df.iloc[[0]] - tm.assert_series_equal(result.dtypes, expected) - - result = df.iloc[[1]] - tm.assert_series_equal(result.dtypes, expected) - - # 12045 - import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - 
datetime.datetime(1012, 1, 2)]}) - expected = df.dtypes - - result = df.iloc[[0]] - tm.assert_series_equal(result.dtypes, expected) - - result = df.iloc[[1]] - tm.assert_series_equal(result.dtypes, expected) - - # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) - expected = df.dtypes - - result = df.iloc[0:2] - tm.assert_series_equal(result.dtypes, expected) - - result = df.iloc[3:] - tm.assert_series_equal(result.dtypes, expected) - - def test_per_axis_per_level_setitem(self): - - # test index maker - idx = pd.IndexSlice - - # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - - df_orig = DataFrame( - np.arange(16, dtype='int64').reshape( - 4, 4), index=index, columns=columns) - df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) - - # identity - df = df_orig.copy() - df.loc[(slice(None), slice(None)), :] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc(axis=0)[:, :] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[:, (slice(None), slice(None))] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - # index - df = df_orig.copy() - df.loc[(slice(None), [1]), :] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[(slice(None), 1), :] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc(axis=0)[:, 1] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3]] = 100 - tm.assert_frame_equal(df, expected) - - # columns - df = df_orig.copy() - df.loc[:, (slice(None), ['foo'])] = 100 - expected = df_orig.copy() - expected.iloc[:, [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - # both - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[idx[:, 1], idx[:, ['foo']]] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc['A', 'a'] = 100 - expected = df_orig.copy() - expected.iloc[0:3, 0:2] = 100 - tm.assert_frame_equal(df, expected) - - # setting with a list-like - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100, 100], [100, 100]], dtype='int64') - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - # not enough values - df = df_orig.copy() - - def f(): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100], [100, 100]], dtype='int64') - - self.assertRaises(ValueError, f) - - def f(): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [100, 100, 100, 100], dtype='int64') - - self.assertRaises(ValueError, f) - - # with an alignable rhs - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = 
df.loc[(slice( - None), 1), (slice(None), ['foo'])] * 5 - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( - None), 1), (slice(None), ['foo'])] - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - tm.assert_frame_equal(df, expected) - - rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() - rhs.loc[:, ('c', 'bah')] = 10 - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + df = DataFrame( + dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) + df.iloc[2:4] = [['x', 11], ['y', 13]] + expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], + B=[5, 6, 11, 13, 9])) tm.assert_frame_equal(df, expected) - def test_multiindex_setitem(self): - - # GH 3738 - # setting with a multi-index right hand side - arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), - np.array(['one', 'two', 'one', 'one', 'two', 'one']), - np.arange(0, 6, 1)] - - df_orig = pd.DataFrame(np.random.randn(6, 3), - index=arrays, - columns=['A', 'B', 'C']).sort_index() + def test_ix_general(self): - expected = df_orig.loc[['bar']] * 2 - df = df_orig.copy() - df.loc[['bar']] *= 2 - tm.assert_frame_equal(df.loc[['bar']], expected) + # ix general issues - # raise because these have differing levels - def f(): - df.loc['bar'] *= 2 + # GH 2817 + data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} + df = DataFrame(data).set_index(keys=['col', 'year']) + key = 4.0, 2012 - self.assertRaises(TypeError, f) + # emits a PerformanceWarning, ok + with self.assert_produces_warning(PerformanceWarning): + tm.assert_frame_equal(df.loc[key], df.iloc[2:]) - # from SO - # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation - df_orig = DataFrame.from_dict({'price': { - ('DE', 'Coal', 'Stock'): 2, - ('DE', 'Gas', 'Stock'): 4, - ('DE', 'Elec', 'Demand'): 1, - ('FR', 'Gas', 'Stock'): 5, - ('FR', 'Solar', 'SupIm'): 0, - ('FR', 'Wind', 'SupIm'): 0 - }}) - df_orig.index = MultiIndex.from_tuples(df_orig.index, - names=['Sit', 'Com', 'Type']) + # this is ok + df.sort_index(inplace=True) + res = df.loc[key] - expected = df_orig.copy() - expected.iloc[[0, 2, 3]] *= 2 + # col has float dtype, result should be Float64Index + index = MultiIndex.from_arrays([[4.] * 3, [2012] * 3], + names=['col', 'year']) + expected = DataFrame({'amount': [222, 333, 444]}, index=index) + tm.assert_frame_equal(res, expected) - idx = pd.IndexSlice - df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], :] *= 2 - tm.assert_frame_equal(df, expected) + def test_ix_weird_slicing(self): + # http://stackoverflow.com/q/17056560/1240268 + df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], + 'two': [1, 2, 3, 4, 5]}) + df.loc[df['one'] > 1, 'two'] = -df['two'] - df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], 'price'] *= 2 + expected = DataFrame({'one': {0: 1.0, + 1: 2.0, + 2: 3.0, + 3: nan, + 4: nan}, + 'two': {0: 1, + 1: -2, + 2: -3, + 3: 4, + 4: 5}}) tm.assert_frame_equal(df, expected) - def test_getitem_multiindex(self): - # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise - # the appropriate error, only in PY3 of course! 
- index = MultiIndex(levels=[['D', 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) - arr = np.random.randn(len(index), 1) - df = DataFrame(arr, index=index, columns=['val']) - result = df.val['D'] - expected = Series(arr.ravel()[0:3], name='val', index=Index( - [26, 37, 57], name='day')) - tm.assert_series_equal(result, expected) - - def f(): - df.val['A'] - - self.assertRaises(KeyError, f) - - def f(): - df.val['X'] - - self.assertRaises(KeyError, f) - - # A is treated as a special Timestamp - index = MultiIndex(levels=[['A', 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) - df = DataFrame(arr, index=index, columns=['val']) - result = df.val['A'] - expected = Series(arr.ravel()[0:3], name='val', index=Index( - [26, 37, 57], name='day')) - tm.assert_series_equal(result, expected) - - def f(): - df.val['X'] + def test_loc_coerceion(self): - self.assertRaises(KeyError, f) + # 12411 + df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) + expected = df.dtypes - # GH 7866 - # multi-index slicing with missing indexers - idx = pd.MultiIndex.from_product([['A', 'B', 'C'], - ['foo', 'bar', 'baz']], - names=['one', 'two']) - s = pd.Series(np.arange(9, dtype='int64'), index=idx).sort_index() + result = df.iloc[[0]] + tm.assert_series_equal(result.dtypes, expected) - exp_idx = pd.MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], - names=['one', 'two']) - expected = pd.Series(np.arange(3, dtype='int64'), - index=exp_idx).sort_index() + result = df.iloc[[1]] + tm.assert_series_equal(result.dtypes, expected) - result = s.loc[['A']] - tm.assert_series_equal(result, expected) - result = s.loc[['A', 'D']] - tm.assert_series_equal(result, expected) + # 12045 + import datetime + df = DataFrame({'date': [datetime.datetime(2012, 1, 1), + datetime.datetime(1012, 1, 2)]}) + expected = df.dtypes - # not any values found - self.assertRaises(KeyError, lambda: s.loc[['D']]) + result = df.iloc[[0]] + tm.assert_series_equal(result.dtypes, expected) - # empty ok - result = s.loc[[]] - expected = s.iloc[[]] - tm.assert_series_equal(result, expected) + result = df.iloc[[1]] + tm.assert_series_equal(result.dtypes, expected) - idx = pd.IndexSlice - expected = pd.Series([0, 3, 6], index=pd.MultiIndex.from_product( - [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() + # 11594 + df = DataFrame({'text': ['some words'] + [None] * 9}) + expected = df.dtypes - result = s.loc[idx[:, ['foo']]] - tm.assert_series_equal(result, expected) - result = s.loc[idx[:, ['foo', 'bah']]] - tm.assert_series_equal(result, expected) + result = df.iloc[0:2] + tm.assert_series_equal(result.dtypes, expected) - # GH 8737 - # empty indexer - multi_index = pd.MultiIndex.from_product((['foo', 'bar', 'baz'], - ['alpha', 'beta'])) - df = DataFrame( - np.random.randn(5, 6), index=range(5), columns=multi_index) - df = df.sort_index(level=0, axis=1) - - expected = DataFrame(index=range(5), - columns=multi_index.reindex([])[0]) - result1 = df.loc[:, ([], slice(None))] - result2 = df.loc[:, (['foo'], [])] - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # regression from < 0.14.0 - # GH 7914 - df = DataFrame([[np.mean, np.median], ['mean', 'median']], - columns=MultiIndex.from_tuples([('functs', 'mean'), - ('functs', 'median')]), - index=['function', 'name']) - result = 
df.loc['function', ('functs', 'mean')] - self.assertEqual(result, np.mean) + result = df.iloc[3:] + tm.assert_series_equal(result.dtypes, expected) def test_setitem_dtype_upcast(self): @@ -3154,233 +1918,6 @@ def test_multi_nan_indexing(self): Index(['C1', 'C2', 'C3', 'C4'], name='b')]) tm.assert_frame_equal(result, expected) - def test_iloc_panel_issue(self): - - # GH 3617 - p = Panel(randn(4, 4, 4)) - - self.assertEqual(p.iloc[:3, :3, :3].shape, (3, 3, 3)) - self.assertEqual(p.iloc[1, :3, :3].shape, (3, 3)) - self.assertEqual(p.iloc[:3, 1, :3].shape, (3, 3)) - self.assertEqual(p.iloc[:3, :3, 1].shape, (3, 3)) - self.assertEqual(p.iloc[1, 1, :3].shape, (3, )) - self.assertEqual(p.iloc[1, :3, 1].shape, (3, )) - self.assertEqual(p.iloc[:3, 1, 1].shape, (3, )) - - def test_panel_getitem(self): - # GH4016, date selection returns a frame when a partial string - # selection - ind = date_range(start="2000", freq="D", periods=1000) - df = DataFrame( - np.random.randn( - len(ind), 5), index=ind, columns=list('ABCDE')) - panel = Panel(dict([('frame_' + c, df) for c in list('ABC')])) - - test2 = panel.ix[:, "2002":"2002-12-31"] - test1 = panel.ix[:, "2002"] - tm.assert_panel_equal(test1, test2) - - # GH8710 - # multi-element getting with a list - panel = tm.makePanel() - - expected = panel.iloc[[0, 1]] - - result = panel.loc[['ItemA', 'ItemB']] - tm.assert_panel_equal(result, expected) - - result = panel.loc[['ItemA', 'ItemB'], :, :] - tm.assert_panel_equal(result, expected) - - result = panel[['ItemA', 'ItemB']] - tm.assert_panel_equal(result, expected) - - result = panel.loc['ItemA':'ItemB'] - tm.assert_panel_equal(result, expected) - - result = panel.ix['ItemA':'ItemB'] - tm.assert_panel_equal(result, expected) - - result = panel.ix[['ItemA', 'ItemB']] - tm.assert_panel_equal(result, expected) - - # with an object-like - # GH 9140 - class TestObject: - - def __str__(self): - return "TestObject" - - obj = TestObject() - - p = Panel(np.random.randn(1, 5, 4), items=[obj], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - - expected = p.iloc[0] - result = p[obj] - tm.assert_frame_equal(result, expected) - - def test_panel_setitem(self): - - # GH 7763 - # loc and setitem have setting differences - np.random.seed(0) - index = range(3) - columns = list('abc') - - panel = Panel({'A': DataFrame(np.random.randn(3, 3), - index=index, columns=columns), - 'B': DataFrame(np.random.randn(3, 3), - index=index, columns=columns), - 'C': DataFrame(np.random.randn(3, 3), - index=index, columns=columns)}) - - replace = DataFrame(np.eye(3, 3), index=range(3), columns=columns) - expected = Panel({'A': replace, 'B': replace, 'C': replace}) - - p = panel.copy() - for idx in list('ABC'): - p[idx] = replace - tm.assert_panel_equal(p, expected) - - p = panel.copy() - for idx in list('ABC'): - p.loc[idx, :, :] = replace - tm.assert_panel_equal(p, expected) - - def test_panel_setitem_with_multiindex(self): - - # 10360 - # failing with a multi-index - arr = np.array([[[1, 2, 3], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]]], - dtype=np.float64) - - # reg index - axes = dict(items=['A', 'B'], major_axis=[0, 1], - minor_axis=['X', 'Y', 'Z']) - p1 = Panel(0., **axes) - p1.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p1, expected) - - # multi-indexes - axes['items'] = pd.MultiIndex.from_tuples([('A', 'a'), ('B', 'b')]) - p2 = Panel(0., **axes) - p2.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p2, expected) - - axes['major_axis'] = 
pd.MultiIndex.from_tuples([('A', 1), ('A', 2)]) - p3 = Panel(0., **axes) - p3.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p3, expected) - - axes['minor_axis'] = pd.MultiIndex.from_product([['X'], range(3)]) - p4 = Panel(0., **axes) - p4.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p4, expected) - - arr = np.array( - [[[1, 0, 0], [2, 0, 0]], [[0, 0, 0], [0, 0, 0]]], dtype=np.float64) - p5 = Panel(0., **axes) - p5.iloc[0, :, 0] = [1, 2] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p5, expected) - - def test_panel_assignment(self): - # GH3777 - wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - wp2 = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - - # TODO: unused? - # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] - - def f(): - wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = wp2.loc[ - ['Item1', 'Item2'], :, ['A', 'B']] - - self.assertRaises(NotImplementedError, f) - - # to_assign = wp2.loc[['Item1', 'Item2'], :, ['A', 'B']] - # wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = to_assign - # result = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] - # tm.assert_panel_equal(result,expected) - - def test_multiindex_assignment(self): - - # GH3777 part 2 - - # mixed dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) - df['d'] = np.nan - arr = np.array([0., 1.]) - - df.ix[4, 'd'] = arr - tm.assert_series_equal(df.ix[4, 'd'], - Series(arr, index=[8, 10], name='d')) - - # single dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) - - df.ix[4, 'c'] = arr - exp = Series(arr, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) - - # scalar ok - df.ix[4, 'c'] = 10 - exp = Series(10, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) - - # invalid assignments - def f(): - df.ix[4, 'c'] = [0, 1, 2, 3] - - self.assertRaises(ValueError, f) - - def f(): - df.ix[4, 'c'] = [0] - - self.assertRaises(ValueError, f) - - # groupby example - NUM_ROWS = 100 - NUM_COLS = 10 - col_names = ['A' + num for num in - map(str, np.arange(NUM_COLS).tolist())] - index_cols = col_names[:5] - - df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), - dtype=np.int64, columns=col_names) - df = df.set_index(index_cols).sort_index() - grp = df.groupby(level=index_cols[:4]) - df['new_col'] = np.nan - - f_index = np.arange(5) - - def f(name, df2): - return Series(np.arange(df2.shape[0]), - name=df2.index.values[0]).reindex(f_index) - - # TODO(wesm): unused? 
- # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T - - # we are actually operating on a copy here - # but in this case, that's ok - for name, df2 in grp: - new_vals = np.arange(df2.shape[0]) - df.ix[name, 'new_col'] = new_vals - def test_multi_assign(self): # GH 3626, an assignement of a sub-df to a df @@ -4069,36 +2606,6 @@ def f(): dtype='float64') tm.assert_frame_equal(df, exp) - def test_partial_setting_with_datetimelike_dtype(self): - - # GH9478 - # a datetimeindex alignment issue with partial setting - df = pd.DataFrame(np.arange(6.).reshape(3, 2), columns=list('AB'), - index=pd.date_range('1/1/2000', periods=3, - freq='1H')) - expected = df.copy() - expected['C'] = [expected.index[0]] + [pd.NaT, pd.NaT] - - mask = df.A < 1 - df.loc[mask, 'C'] = df.loc[mask].index - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_datetime(self): - - # GH 9516 - dt1 = Timestamp('20130101 09:00:00') - dt2 = Timestamp('20130101 10:00:00') - - for conv in [lambda x: x, lambda x: x.to_datetime64(), - lambda x: x.to_pydatetime(), lambda x: np.datetime64(x)]: - - df = pd.DataFrame() - df.loc[conv(dt1), 'one'] = 100 - df.loc[conv(dt2), 'one'] = 200 - - expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2]) - tm.assert_frame_equal(df, expected) - def test_series_partial_set(self): # partial set with new index # Regression from GH4825 @@ -4233,54 +2740,6 @@ def test_series_partial_set_with_name(self): result = ser.iloc[[1, 1, 0, 0]] tm.assert_series_equal(result, expected, check_index_type=True) - def test_series_partial_set_datetime(self): - # GH 11497 - - idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') - - result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] - exp = Series([0.1, 0.2], index=idx, name='s') - tm.assert_series_equal(result, exp, check_index_type=True) - - keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), - Timestamp('2011-01-01')] - exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), - name='s') - tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - - keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), - Timestamp('2011-01-03')] - exp = Series([np.nan, 0.2, np.nan], - index=pd.DatetimeIndex(keys, name='idx'), name='s') - tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - - def test_series_partial_set_period(self): - # GH 11497 - - idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') - - result = ser.loc[[pd.Period('2011-01-01', freq='D'), - pd.Period('2011-01-02', freq='D')]] - exp = Series([0.1, 0.2], index=idx, name='s') - tm.assert_series_equal(result, exp, check_index_type=True) - - keys = [pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-01', freq='D')] - exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), - name='s') - tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - - keys = [pd.Period('2011-01-03', freq='D'), - pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-03', freq='D')] - exp = Series([np.nan, 0.2, np.nan], - index=pd.PeriodIndex(keys, name='idx'), name='s') - result = ser.loc[keys] - tm.assert_series_equal(result, exp) - def test_partial_set_invalid(self): # GH 4940 @@ -4566,509 +3025,6 @@ def test_cache_updating(self): expected = Series([0, 0, 0, 2, 0], name='f') tm.assert_series_equal(df.f, expected) - def 
test_slice_consolidate_invalidate_item_cache(self): - - # this is chained assignment, but will 'work' - with option_context('chained_assignment', None): - - # #3970 - df = DataFrame({"aa": lrange(5), "bb": [2.2] * 5}) - - # Creates a second float block - df["cc"] = 0.0 - - # caches a reference to the 'bb' series - df["bb"] - - # repr machinery triggers consolidation - repr(df) - - # Assignment to wrong series - df['bb'].iloc[0] = 0.17 - df._clear_item_cache() - self.assertAlmostEqual(df['bb'][0], 0.17) - - def test_setitem_cache_updating(self): - # GH 5424 - cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven'] - - for do_ref in [False, False]: - df = DataFrame({'a': cont, - "b": cont[3:] + cont[:3], - 'c': np.arange(7)}) - - # ref the cache - if do_ref: - df.ix[0, "c"] - - # set it - df.ix[7, 'c'] = 1 - - self.assertEqual(df.ix[0, 'c'], 0.0) - self.assertEqual(df.ix[7, 'c'], 1.0) - - # GH 7084 - # not updating cache on series setting with slices - expected = DataFrame({'A': [600, 600, 600]}, - index=date_range('5/7/2014', '5/9/2014')) - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) - df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]}) - - # loop through df to update out - six = Timestamp('5/7/2014') - eix = Timestamp('5/9/2014') - for ix, row in df.iterrows(): - out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D'] - - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) - - # try via a chain indexing - # this actually works - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) - for ix, row in df.iterrows(): - v = out[row['C']][six:eix] + row['D'] - out[row['C']][six:eix] = v - - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) - - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) - for ix, row in df.iterrows(): - out.loc[six:eix, row['C']] += row['D'] - - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) - - def test_setitem_chained_setfault(self): - - # GH6026 - # setfaults under numpy 1.7.1 (ok on 1.8) - data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout'] - mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none'] - - df = DataFrame({'response': np.array(data)}) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata})) - - recarray = np.rec.fromarrays([data], names=['response']) - df = DataFrame(recarray) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata})) - - df = DataFrame({'response': data, 'response1': data}) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata, - 'response1': data})) - - # GH 6056 - expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar'])) - df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) - df['A'].iloc[0] = np.nan - result = df.head() - tm.assert_frame_equal(result, expected) - - df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) - df.A.iloc[0] = np.nan - result = df.head() - tm.assert_frame_equal(result, expected) - - def test_detect_chained_assignment(self): - - pd.set_option('chained_assignment', 'raise') - - # work with the chain - expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB')) - df = DataFrame(np.arange(4).reshape(2, 2), - columns=list('AB'), 
dtype='int64') - self.assertIsNone(df.is_copy) - df['A'][0] = -5 - df['A'][1] = -6 - tm.assert_frame_equal(df, expected) - - # test with the chaining - df = DataFrame({'A': Series(range(2), dtype='int64'), - 'B': np.array(np.arange(2, 4), dtype=np.float64)}) - self.assertIsNone(df.is_copy) - - def f(): - df['A'][0] = -5 - - self.assertRaises(com.SettingWithCopyError, f) - - def f(): - df['A'][1] = np.nan - - self.assertRaises(com.SettingWithCopyError, f) - self.assertIsNone(df['A'].is_copy) - - # using a copy (the chain), fails - df = DataFrame({'A': Series(range(2), dtype='int64'), - 'B': np.array(np.arange(2, 4), dtype=np.float64)}) - - def f(): - df.loc[0]['A'] = -5 - - self.assertRaises(com.SettingWithCopyError, f) - - # doc example - df = DataFrame({'a': ['one', 'one', 'two', 'three', - 'two', 'one', 'six'], - 'c': Series(range(7), dtype='int64')}) - self.assertIsNone(df.is_copy) - expected = DataFrame({'a': ['one', 'one', 'two', 'three', - 'two', 'one', 'six'], - 'c': [42, 42, 2, 3, 4, 42, 6]}) - - def f(): - indexer = df.a.str.startswith('o') - df[indexer]['c'] = 42 - - self.assertRaises(com.SettingWithCopyError, f) - - expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]}) - df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]}) - - def f(): - df['A'][0] = 111 - - self.assertRaises(com.SettingWithCopyError, f) - - def f(): - df.loc[0]['A'] = 111 - - self.assertRaises(com.SettingWithCopyError, f) - - df.loc[0, 'A'] = 111 - tm.assert_frame_equal(df, expected) - - # make sure that is_copy is picked up reconstruction - # GH5475 - df = DataFrame({"A": [1, 2]}) - self.assertIsNone(df.is_copy) - with tm.ensure_clean('__tmp__pickle') as path: - df.to_pickle(path) - df2 = pd.read_pickle(path) - df2["B"] = df2["A"] - df2["B"] = df2["A"] - - # a suprious raise as we are setting the entire column here - # GH5597 - from string import ascii_letters as letters - - def random_text(nobs=100): - df = [] - for i in range(nobs): - idx = np.random.randint(len(letters), size=2) - idx.sort() - df.append([letters[idx[0]:idx[1]]]) - - return DataFrame(df, columns=['letters']) - - df = random_text(100000) - - # always a copy - x = df.iloc[[0, 1, 2]] - self.assertIsNotNone(x.is_copy) - x = df.iloc[[0, 1, 2, 4]] - self.assertIsNotNone(x.is_copy) - - # explicity copy - indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.ix[indexer].copy() - self.assertIsNone(df.is_copy) - df['letters'] = df['letters'].apply(str.lower) - - # implicity take - df = random_text(100000) - indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.ix[indexer] - self.assertIsNotNone(df.is_copy) - df['letters'] = df['letters'].apply(str.lower) - - # implicity take 2 - df = random_text(100000) - indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.ix[indexer] - self.assertIsNotNone(df.is_copy) - df.loc[:, 'letters'] = df['letters'].apply(str.lower) - - # should be ok even though it's a copy! 
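# A minimal illustrative sketch (assumptions: pandas' default
# chained_assignment='warn' mode; the toy frame below is invented here) of
# the behaviour these chained-assignment tests pin down: writing through a
# chained getitem may land on a temporary copy, which pandas flags with
# SettingWithCopyWarning/-Error, while a single .loc call writes reliably.
import pandas as pd

df = pd.DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
df[df['B'] > 1]['A'] = 'x'       # chained: may warn and mutate only a copy
df.loc[df['B'] > 1, 'A'] = 'x'   # single indexing call: mutates df itself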
- self.assertIsNone(df.is_copy) - df['letters'] = df['letters'].apply(str.lower) - self.assertIsNone(df.is_copy) - - df = random_text(100000) - indexer = df.letters.apply(lambda x: len(x) > 10) - df.ix[indexer, 'letters'] = df.ix[indexer, 'letters'].apply(str.lower) - - # an identical take, so no copy - df = DataFrame({'a': [1]}).dropna() - self.assertIsNone(df.is_copy) - df['a'] += 1 - - # inplace ops - # original from: - # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug - a = [12, 23] - b = [123, None] - c = [1234, 2345] - d = [12345, 23456] - tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'), - ('ears', 'right')] - events = {('eyes', 'left'): a, - ('eyes', 'right'): b, - ('ears', 'left'): c, - ('ears', 'right'): d} - multiind = MultiIndex.from_tuples(tuples, names=['part', 'side']) - zed = DataFrame(events, index=['a', 'b'], columns=multiind) - - def f(): - zed['eyes']['right'].fillna(value=555, inplace=True) - - self.assertRaises(com.SettingWithCopyError, f) - - df = DataFrame(np.random.randn(10, 4)) - s = df.iloc[:, 0].sort_values() - tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) - tm.assert_series_equal(s, df[0].sort_values()) - - # false positives GH6025 - df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]}) - str(df) - df['column1'] = df['column1'] + 'b' - str(df) - df = df[df['column2'] != 8] - str(df) - df['column1'] = df['column1'] + 'c' - str(df) - - # from SO: - # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc - df = DataFrame(np.arange(0, 9), columns=['count']) - df['group'] = 'b' - - def f(): - df.iloc[0:5]['group'] = 'a' - - self.assertRaises(com.SettingWithCopyError, f) - - # mixed type setting - # same dtype & changing dtype - df = DataFrame(dict(A=date_range('20130101', periods=5), - B=np.random.randn(5), - C=np.arange(5, dtype='int64'), - D=list('abcde'))) - - def f(): - df.ix[2]['D'] = 'foo' - - self.assertRaises(com.SettingWithCopyError, f) - - def f(): - df.ix[2]['C'] = 'foo' - - self.assertRaises(com.SettingWithCopyError, f) - - def f(): - df['C'][2] = 'foo' - - self.assertRaises(com.SettingWithCopyError, f) - - def test_setting_with_copy_bug(self): - - # operating on a copy - df = pd.DataFrame({'a': list(range(4)), - 'b': list('ab..'), - 'c': ['a', 'b', np.nan, 'd']}) - mask = pd.isnull(df.c) - - def f(): - df[['c']][mask] = df[['b']][mask] - - self.assertRaises(com.SettingWithCopyError, f) - - # invalid warning as we are returning a new object - # GH 8730 - df1 = DataFrame({'x': Series(['a', 'b', 'c']), - 'y': Series(['d', 'e', 'f'])}) - df2 = df1[['x']] - - # this should not raise - df2['y'] = ['g', 'h', 'i'] - - def test_detect_chained_assignment_warnings(self): - - # warnings - with option_context('chained_assignment', 'warn'): - df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]}) - with tm.assert_produces_warning( - expected_warning=com.SettingWithCopyWarning): - df.loc[0]['A'] = 111 - - def test_float64index_slicing_bug(self): - # GH 5557, related to slicing a float index - ser = {256: 2321.0, - 1: 78.0, - 2: 2716.0, - 3: 0.0, - 4: 369.0, - 5: 0.0, - 6: 269.0, - 7: 0.0, - 8: 0.0, - 9: 0.0, - 10: 3536.0, - 11: 0.0, - 12: 24.0, - 13: 0.0, - 14: 931.0, - 15: 0.0, - 16: 101.0, - 17: 78.0, - 18: 9643.0, - 19: 0.0, - 20: 0.0, - 21: 0.0, - 22: 63761.0, - 23: 0.0, - 24: 446.0, - 25: 0.0, - 26: 34773.0, - 27: 0.0, - 28: 729.0, - 29: 78.0, - 30: 0.0, - 31: 0.0, - 32: 3374.0, - 33: 0.0, - 34: 1391.0, - 35: 
0.0, - 36: 361.0, - 37: 0.0, - 38: 61808.0, - 39: 0.0, - 40: 0.0, - 41: 0.0, - 42: 6677.0, - 43: 0.0, - 44: 802.0, - 45: 0.0, - 46: 2691.0, - 47: 0.0, - 48: 3582.0, - 49: 0.0, - 50: 734.0, - 51: 0.0, - 52: 627.0, - 53: 70.0, - 54: 2584.0, - 55: 0.0, - 56: 324.0, - 57: 0.0, - 58: 605.0, - 59: 0.0, - 60: 0.0, - 61: 0.0, - 62: 3989.0, - 63: 10.0, - 64: 42.0, - 65: 0.0, - 66: 904.0, - 67: 0.0, - 68: 88.0, - 69: 70.0, - 70: 8172.0, - 71: 0.0, - 72: 0.0, - 73: 0.0, - 74: 64902.0, - 75: 0.0, - 76: 347.0, - 77: 0.0, - 78: 36605.0, - 79: 0.0, - 80: 379.0, - 81: 70.0, - 82: 0.0, - 83: 0.0, - 84: 3001.0, - 85: 0.0, - 86: 1630.0, - 87: 7.0, - 88: 364.0, - 89: 0.0, - 90: 67404.0, - 91: 9.0, - 92: 0.0, - 93: 0.0, - 94: 7685.0, - 95: 0.0, - 96: 1017.0, - 97: 0.0, - 98: 2831.0, - 99: 0.0, - 100: 2963.0, - 101: 0.0, - 102: 854.0, - 103: 0.0, - 104: 0.0, - 105: 0.0, - 106: 0.0, - 107: 0.0, - 108: 0.0, - 109: 0.0, - 110: 0.0, - 111: 0.0, - 112: 0.0, - 113: 0.0, - 114: 0.0, - 115: 0.0, - 116: 0.0, - 117: 0.0, - 118: 0.0, - 119: 0.0, - 120: 0.0, - 121: 0.0, - 122: 0.0, - 123: 0.0, - 124: 0.0, - 125: 0.0, - 126: 67744.0, - 127: 22.0, - 128: 264.0, - 129: 0.0, - 260: 197.0, - 268: 0.0, - 265: 0.0, - 269: 0.0, - 261: 0.0, - 266: 1198.0, - 267: 0.0, - 262: 2629.0, - 258: 775.0, - 257: 0.0, - 263: 0.0, - 259: 0.0, - 264: 163.0, - 250: 10326.0, - 251: 0.0, - 252: 1228.0, - 253: 0.0, - 254: 2769.0, - 255: 0.0} - - # smoke test for the repr - s = Series(ser) - result = s.value_counts() - str(result) - def test_set_ix_out_of_bounds_axis_0(self): df = pd.DataFrame( randn(2, 5), index=["row%s" % i for i in range(2)], @@ -5281,34 +3237,6 @@ def assert_slices_equivalent(l_slc, i_slc): assert_slices_equivalent(SLC[idx[13]:idx[9]:-1], SLC[13:8:-1]) assert_slices_equivalent(SLC[idx[9]:idx[13]:-1], SLC[:0]) - def test_multiindex_label_slicing_with_negative_step(self): - s = Series(np.arange(20), - MultiIndex.from_product([list('abcde'), np.arange(4)])) - SLC = pd.IndexSlice - - def assert_slices_equivalent(l_slc, i_slc): - tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) - tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) - - assert_slices_equivalent(SLC[::-1], SLC[::-1]) - - assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) - assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1]) - - assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) - assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1]) - - assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['d':('b', ):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) - - assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) - assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) - assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) - def test_slice_with_zero_step_raises(self): s = Series(np.arange(20), index=_mklbl('A', 20)) self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', @@ -5390,23 +3318,6 @@ def test_maybe_numeric_slice(self): expected = [1] self.assertEqual(result, expected) - def test_multiindex_slice_first_level(self): - # GH 12697 - freq = ['a', 'b', 'c', 'd'] - idx = pd.MultiIndex.from_product([freq, np.arange(500)]) - df = pd.DataFrame(list(range(2000)), index=idx, columns=['Test']) - df_slice = df.loc[pd.IndexSlice[:, 30:70], :] - result = df_slice.loc['a'] - expected = pd.DataFrame(list(range(30, 71)), 
- columns=['Test'], - index=range(30, 71)) - tm.assert_frame_equal(result, expected) - result = df_slice.loc['d'] - expected = pd.DataFrame(list(range(1530, 1571)), - columns=['Test'], - index=range(30, 71)) - tm.assert_frame_equal(result, expected) - class TestSeriesNoneCoercion(tm.TestCase): EXPECTED_RESULTS = [ @@ -5511,22 +3422,3 @@ def test_none_coercion_mixed_dtypes(self): datetime(2000, 1, 3)], 'd': [None, 'b', 'c']}) tm.assert_frame_equal(start_dataframe, exp) - - -class TestTimedeltaIndexing(tm.TestCase): - - def test_boolean_indexing(self): - # GH 14946 - df = pd.DataFrame({'x': range(10)}) - df.index = pd.to_timedelta(range(10), unit='s') - conditions = [df['x'] > 3, df['x'] == 3, df['x'] < 3] - expected_data = [[0, 1, 2, 3, 10, 10, 10, 10, 10, 10], - [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], - [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]] - for cond, data in zip(conditions, expected_data): - result = df.copy() - result.loc[cond, 'x'] = 10 - expected = pd.DataFrame(data, - index=pd.to_timedelta(range(10), unit='s'), - columns=['x']) - tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py new file mode 100644 index 0000000000000..1e6ecbbcdc756 --- /dev/null +++ b/pandas/tests/indexing/test_multiindex.py @@ -0,0 +1,1206 @@ +from warnings import catch_warnings +import pytest +import numpy as np +import pandas as pd +from pandas import (Panel, Series, MultiIndex, DataFrame, + Timestamp, Index, date_range) +from pandas.util import testing as tm +from pandas.core.common import PerformanceWarning, UnsortedIndexError +from pandas.tests.indexing.common import _mklbl + + +class TestMultiIndexBasic(tm.TestCase): + + def test_iloc_getitem_multiindex2(self): + # TODO(wesm): fix this + pytest.skip('this test was being suppressed, ' + 'needs to be fixed') + + arr = np.random.randn(3, 3) + df = DataFrame(arr, columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]]) + + rs = df.iloc[2] + xp = Series(arr[2], index=df.columns) + tm.assert_series_equal(rs, xp) + + rs = df.iloc[:, 2] + xp = Series(arr[:, 2], index=df.index) + tm.assert_series_equal(rs, xp) + + rs = df.iloc[2, 2] + xp = df.values[2, 2] + self.assertEqual(rs, xp) + + # for multiple items + # GH 5528 + rs = df.iloc[[0, 1]] + xp = df.xs(4, drop_level=False) + tm.assert_frame_equal(rs, xp) + + tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) + index = MultiIndex.from_tuples(tup) + df = DataFrame(np.random.randn(4, 4), index=index) + rs = df.iloc[[2, 3]] + xp = df.xs('b', drop_level=False) + tm.assert_frame_equal(rs, xp) + + def test_setitem_multiindex(self): + for index_fn in ('ix', 'loc'): + + def check(target, indexers, value, compare_fn, expected=None): + fn = getattr(target, index_fn) + fn.__setitem__(indexers, value) + result = fn.__getitem__(indexers) + if expected is None: + expected = value + compare_fn(result, expected) + # GH7190 + index = pd.MultiIndex.from_product([np.arange(0, 100), + np.arange(0, 80)], + names=['time', 'firm']) + t, n = 0, 2 + df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=0, + compare_fn=self.assertEqual) + + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=1, + compare_fn=self.assertEqual) + + df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=2, + 
                  compare_fn=self.assertEqual)
+
+            # GH 7218, assigning with 0-dim arrays
+            df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x',
+                                          'X', 'd', 'profit'],
+                           index=index)
+            check(target=df,
+                  indexers=((t, n), 'X'),
+                  value=np.array(3),
+                  compare_fn=self.assertEqual,
+                  expected=3, )
+
+            # GH5206
+            df = pd.DataFrame(np.arange(25).reshape(5, 5),
+                              columns='A,B,C,D,E'.split(','), dtype=float)
+            df['F'] = 99
+            row_selection = df['A'] % 2 == 0
+            col_selection = ['B', 'C']
+            with catch_warnings(record=True):
+                df.ix[row_selection, col_selection] = df['F']
+            output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C'])
+            with catch_warnings(record=True):
+                tm.assert_frame_equal(df.ix[row_selection, col_selection],
+                                      output)
+            check(target=df,
+                  indexers=(row_selection, col_selection),
+                  value=df['F'],
+                  compare_fn=tm.assert_frame_equal,
+                  expected=output, )
+
+            # GH11372
+            idx = pd.MultiIndex.from_product([
+                ['A', 'B', 'C'],
+                pd.date_range('2015-01-01', '2015-04-01', freq='MS')])
+            cols = pd.MultiIndex.from_product([
+                ['foo', 'bar'],
+                pd.date_range('2016-01-01', '2016-02-01', freq='MS')])
+
+            df = pd.DataFrame(np.random.random((12, 4)),
+                              index=idx, columns=cols)
+
+            subidx = pd.MultiIndex.from_tuples(
+                [('A', pd.Timestamp('2015-01-01')),
+                 ('A', pd.Timestamp('2015-02-01'))])
+            subcols = pd.MultiIndex.from_tuples(
+                [('foo', pd.Timestamp('2016-01-01')),
+                 ('foo', pd.Timestamp('2016-02-01'))])
+
+            vals = pd.DataFrame(np.random.random((2, 2)),
+                                index=subidx, columns=subcols)
+            check(target=df,
+                  indexers=(subidx, subcols),
+                  value=vals,
+                  compare_fn=tm.assert_frame_equal, )
+            # set all columns
+            vals = pd.DataFrame(
+                np.random.random((2, 4)), index=subidx, columns=cols)
+            check(target=df,
+                  indexers=(subidx, slice(None, None, None)),
+                  value=vals,
+                  compare_fn=tm.assert_frame_equal, )
+            # identity
+            copy = df.copy()
+            check(target=df, indexers=(df.index, df.columns), value=df,
+                  compare_fn=tm.assert_frame_equal, expected=copy)
+
+    def test_loc_getitem_series(self):
+        # GH14730
+        # passing a series as a key with a MultiIndex
+        index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
+        x = Series(index=index, data=range(9), dtype=np.float64)
+        y = Series([1, 3])
+        expected = Series(
+            data=[0, 1, 2, 6, 7, 8],
+            index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]),
+            dtype=np.float64)
+        result = x.loc[y]
+        tm.assert_series_equal(result, expected)
+
+        result = x.loc[[1, 3]]
+        tm.assert_series_equal(result, expected)
+
+        empty = Series(data=[], dtype=np.float64)
+        expected = Series([], index=MultiIndex(
+            levels=index.levels, labels=[[], []], dtype=np.float64))
+        result = x.loc[empty]
+        tm.assert_series_equal(result, expected)
+
+    def test_iloc_getitem_multiindex(self):
+        mi_labels = DataFrame(np.random.randn(4, 3),
+                              columns=[['i', 'i', 'j'], ['A', 'A', 'B']],
+                              index=[['i', 'i', 'j', 'k'],
+                                     ['X', 'X', 'Y', 'Y']])
+
+        mi_int = DataFrame(np.random.randn(3, 3),
+                           columns=[[2, 2, 4], [6, 8, 10]],
+                           index=[[4, 4, 8], [8, 10, 12]])
+
+        # the first row
+        rs = mi_int.iloc[0]
+        with catch_warnings(record=True):
+            xp = mi_int.ix[4].ix[8]
+        tm.assert_series_equal(rs, xp, check_names=False)
+        self.assertEqual(rs.name, (4, 8))
+        self.assertEqual(xp.name, 8)
+
+        # 2nd (last) columns
+        rs = mi_int.iloc[:, 2]
+        with catch_warnings(record=True):
+            xp = mi_int.ix[:, 2]
+        tm.assert_series_equal(rs, xp)
+
+        # corner column
+        rs = mi_int.iloc[2, 2]
+        with catch_warnings(record=True):
+            xp = mi_int.ix[:, 2].ix[2]
+        self.assertEqual(rs, xp)
+
+        # this is basically regular indexing
+        rs = mi_labels.iloc[2, 2]
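# A short hedged aside (the frame below is a fresh toy, not the random one
# used in the test above): .iloc is purely positional, so the integer level
# labels on a MultiIndex, like the 4/8/10/12 rows exercised here, never take
# part in the lookup.
import numpy as np
import pandas as pd

mi_int = pd.DataFrame(np.arange(9).reshape(3, 3),
                      index=[[4, 4, 8], [8, 10, 12]],
                      columns=[[2, 2, 4], [6, 8, 10]])
first_row = mi_int.iloc[0]   # positional row 0, whose label happens to be (4, 8)
corner = mi_int.iloc[2, 2]   # scalar at positional (row 2, column 2)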
+        with catch_warnings(record=True):
+            xp = mi_labels.ix['j'].ix[:, 'j'].ix[0, 0]
+        self.assertEqual(rs, xp)
+
+    def test_loc_multiindex(self):
+
+        mi_labels = DataFrame(np.random.randn(3, 3),
+                              columns=[['i', 'i', 'j'], ['A', 'A', 'B']],
+                              index=[['i', 'i', 'j'], ['X', 'X', 'Y']])
+
+        mi_int = DataFrame(np.random.randn(3, 3),
+                           columns=[[2, 2, 4], [6, 8, 10]],
+                           index=[[4, 4, 8], [8, 10, 12]])
+
+        # the first row
+        rs = mi_labels.loc['i']
+        with catch_warnings(record=True):
+            xp = mi_labels.ix['i']
+        tm.assert_frame_equal(rs, xp)
+
+        # 2nd (last) columns
+        rs = mi_labels.loc[:, 'j']
+        with catch_warnings(record=True):
+            xp = mi_labels.ix[:, 'j']
+        tm.assert_frame_equal(rs, xp)
+
+        # corner column
+        rs = mi_labels.loc['j'].loc[:, 'j']
+        with catch_warnings(record=True):
+            xp = mi_labels.ix['j'].ix[:, 'j']
+        tm.assert_frame_equal(rs, xp)
+
+        # with a tuple
+        rs = mi_labels.loc[('i', 'X')]
+        with catch_warnings(record=True):
+            xp = mi_labels.ix[('i', 'X')]
+        tm.assert_frame_equal(rs, xp)
+
+        rs = mi_int.loc[4]
+        with catch_warnings(record=True):
+            xp = mi_int.ix[4]
+        tm.assert_frame_equal(rs, xp)
+
+    def test_loc_multiindex_indexer_none(self):
+
+        # GH6788
+        # multi-index indexer is None (meaning take all)
+        attributes = ['Attribute' + str(i) for i in range(1)]
+        attribute_values = ['Value' + str(i) for i in range(5)]
+
+        index = MultiIndex.from_product([attributes, attribute_values])
+        df = 0.1 * np.random.randn(10, 1 * 5) + 0.5
+        df = DataFrame(df, columns=index)
+        result = df[attributes]
+        tm.assert_frame_equal(result, df)
+
+        # GH 7349
+        # loc with a multi-index seems to be doing fallback
+        df = DataFrame(np.arange(12).reshape(-1, 1),
+                       index=pd.MultiIndex.from_product([[1, 2, 3, 4],
+                                                         [1, 2, 3]]))
+
+        expected = df.loc[([1, 2], ), :]
+        result = df.loc[[1, 2]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_loc_multiindex_incomplete(self):
+
+        # GH 7399
+        # incomplete indexers
+        s = pd.Series(np.arange(15, dtype='int64'),
+                      MultiIndex.from_product([range(5), ['a', 'b', 'c']]))
+        expected = s.loc[:, 'a':'c']
+
+        result = s.loc[0:4, 'a':'c']
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
+
+        result = s.loc[:4, 'a':'c']
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
+
+        result = s.loc[0:, 'a':'c']
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
+
+        # GH 7400
+        # multiindexer getitem with list of indexers skips wrong element
+        s = pd.Series(np.arange(15, dtype='int64'),
+                      MultiIndex.from_product([range(5), ['a', 'b', 'c']]))
+        expected = s.iloc[[6, 7, 8, 12, 13, 14]]
+        result = s.loc[2:4:2, 'a':'c']
+        tm.assert_series_equal(result, expected)
+
+    def test_multiindex_perf_warn(self):
+
+        df = DataFrame({'jim': [0, 0, 1, 1],
+                        'joe': ['x', 'x', 'z', 'y'],
+                        'jolie': np.random.rand(4)}).set_index(['jim', 'joe'])
+
+        with tm.assert_produces_warning(PerformanceWarning,
+                                        clear=[pd.core.index]):
+            df.loc[(1, 'z')]
+
+        df = df.iloc[[2, 1, 3, 0]]
+        with tm.assert_produces_warning(PerformanceWarning):
+            df.loc[(0, )]
+
+    def test_series_getitem_multiindex(self):
+
+        # GH 6018
+        # series regression getitem with a multi-index
+
+        s = Series([1, 2, 3])
+        s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)])
+
+        result = s[:, 0]
+        expected = Series([1], index=[0])
+        tm.assert_series_equal(result, expected)
+
+        result = s.loc[:, 1]
+        expected = Series([2, 3], index=[1, 2])
+        tm.assert_series_equal(result, expected)
+
+        # xs
+        result = s.xs(0, level=0)
+        expected = Series([1], index=[0])
+        tm.assert_series_equal(result, expected)
+
+        result = s.xs(1, level=1)
+        expected = Series([2, 3], index=[1, 2])
+        tm.assert_series_equal(result, expected)
+
+        # GH6258
+        dt = list(date_range('20130903', periods=3))
+        idx = MultiIndex.from_product([list('AB'), dt])
+        s = Series([1, 3, 4, 1, 3, 4], index=idx)
+
+        result = s.xs('20130903', level=1)
+        expected = Series([1, 1], index=list('AB'))
+        tm.assert_series_equal(result, expected)
+
+        # GH5684
+        idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'),
+                                      ('b', 'two')])
+        s = Series([1, 2, 3, 4], index=idx)
+        s.index.set_names(['L1', 'L2'], inplace=True)
+        result = s.xs('one', level='L2')
+        expected = Series([1, 3], index=['a', 'b'])
+        expected.index.set_names(['L1'], inplace=True)
+        tm.assert_series_equal(result, expected)
+
+    def test_xs_multiindex(self):
+
+        # GH2903
+        columns = MultiIndex.from_tuples(
+            [('a', 'foo'), ('a', 'bar'), ('b', 'hello'),
+             ('b', 'world')], names=['lvl0', 'lvl1'])
+        df = DataFrame(np.random.randn(4, 4), columns=columns)
+        df.sort_index(axis=1, inplace=True)
+        result = df.xs('a', level='lvl0', axis=1)
+        expected = df.iloc[:, 0:2].loc[:, 'a']
+        tm.assert_frame_equal(result, expected)
+
+        result = df.xs('foo', level='lvl1', axis=1)
+        expected = df.iloc[:, 1:2].copy()
+        expected.columns = expected.columns.droplevel('lvl1')
+        tm.assert_frame_equal(result, expected)
+
+    def test_multiindex_setitem(self):
+
+        # GH 3738
+        # setting with a multi-index right hand side
+        arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']),
+                  np.array(['one', 'two', 'one', 'one', 'two', 'one']),
+                  np.arange(0, 6, 1)]
+
+        df_orig = pd.DataFrame(np.random.randn(6, 3),
+                               index=arrays,
+                               columns=['A', 'B', 'C']).sort_index()
+
+        expected = df_orig.loc[['bar']] * 2
+        df = df_orig.copy()
+        df.loc[['bar']] *= 2
+        tm.assert_frame_equal(df.loc[['bar']], expected)
+
+        # raise because these have differing levels
+        def f():
+            df.loc['bar'] *= 2
+
+        self.assertRaises(TypeError, f)
+
+        # from SO
+        # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation
+        df_orig = DataFrame.from_dict({'price': {
+            ('DE', 'Coal', 'Stock'): 2,
+            ('DE', 'Gas', 'Stock'): 4,
+            ('DE', 'Elec', 'Demand'): 1,
+            ('FR', 'Gas', 'Stock'): 5,
+            ('FR', 'Solar', 'SupIm'): 0,
+            ('FR', 'Wind', 'SupIm'): 0
+        }})
+        df_orig.index = MultiIndex.from_tuples(df_orig.index,
+                                               names=['Sit', 'Com', 'Type'])
+
+        expected = df_orig.copy()
+        expected.iloc[[0, 2, 3]] *= 2
+
+        idx = pd.IndexSlice
+        df = df_orig.copy()
+        df.loc[idx[:, :, 'Stock'], :] *= 2
+        tm.assert_frame_equal(df, expected)
+
+        df = df_orig.copy()
+        df.loc[idx[:, :, 'Stock'], 'price'] *= 2
+        tm.assert_frame_equal(df, expected)
+
+    def test_getitem_multiindex(self):
+        # GH 5725 the 'A' happens to be a valid Timestamp so this doesn't raise
+        # the appropriate error, only in PY3 of course!
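# A hedged illustration of the GH 5725 gotcha noted above (the toy index
# below is invented here): when a missing string label could also be parsed
# as a Timestamp, the lookup must still surface a plain KeyError rather than
# a date-parsing artifact.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([['D', 'B'], [1, 2]], names=['tag', 'day'])
df = pd.DataFrame({'val': np.arange(4)}, index=idx)
df.val['D']        # present first-level label: returns the 'D' block
try:
    df.val['A']    # absent label; should raise KeyError, not be date-parsed
except KeyError:
    pass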
+ index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + arr = np.random.randn(len(index), 1) + df = DataFrame(arr, index=index, columns=['val']) + result = df.val['D'] + expected = Series(arr.ravel()[0:3], name='val', index=Index( + [26, 37, 57], name='day')) + tm.assert_series_equal(result, expected) + + def f(): + df.val['A'] + + self.assertRaises(KeyError, f) + + def f(): + df.val['X'] + + self.assertRaises(KeyError, f) + + # A is treated as a special Timestamp + index = MultiIndex(levels=[['A', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + df = DataFrame(arr, index=index, columns=['val']) + result = df.val['A'] + expected = Series(arr.ravel()[0:3], name='val', index=Index( + [26, 37, 57], name='day')) + tm.assert_series_equal(result, expected) + + def f(): + df.val['X'] + + self.assertRaises(KeyError, f) + + # GH 7866 + # multi-index slicing with missing indexers + idx = pd.MultiIndex.from_product([['A', 'B', 'C'], + ['foo', 'bar', 'baz']], + names=['one', 'two']) + s = pd.Series(np.arange(9, dtype='int64'), index=idx).sort_index() + + exp_idx = pd.MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], + names=['one', 'two']) + expected = pd.Series(np.arange(3, dtype='int64'), + index=exp_idx).sort_index() + + result = s.loc[['A']] + tm.assert_series_equal(result, expected) + result = s.loc[['A', 'D']] + tm.assert_series_equal(result, expected) + + # not any values found + self.assertRaises(KeyError, lambda: s.loc[['D']]) + + # empty ok + result = s.loc[[]] + expected = s.iloc[[]] + tm.assert_series_equal(result, expected) + + idx = pd.IndexSlice + expected = pd.Series([0, 3, 6], index=pd.MultiIndex.from_product( + [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() + + result = s.loc[idx[:, ['foo']]] + tm.assert_series_equal(result, expected) + result = s.loc[idx[:, ['foo', 'bah']]] + tm.assert_series_equal(result, expected) + + # GH 8737 + # empty indexer + multi_index = pd.MultiIndex.from_product((['foo', 'bar', 'baz'], + ['alpha', 'beta'])) + df = DataFrame( + np.random.randn(5, 6), index=range(5), columns=multi_index) + df = df.sort_index(level=0, axis=1) + + expected = DataFrame(index=range(5), + columns=multi_index.reindex([])[0]) + result1 = df.loc[:, ([], slice(None))] + result2 = df.loc[:, (['foo'], [])] + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # regression from < 0.14.0 + # GH 7914 + df = DataFrame([[np.mean, np.median], ['mean', 'median']], + columns=MultiIndex.from_tuples([('functs', 'mean'), + ('functs', 'median')]), + index=['function', 'name']) + result = df.loc['function', ('functs', 'mean')] + self.assertEqual(result, np.mean) + + def test_multiindex_assignment(self): + + # GH3777 part 2 + + # mixed dtype + df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list('abc'), + index=[[4, 4, 8], [8, 10, 12]]) + df['d'] = np.nan + arr = np.array([0., 1.]) + + df.ix[4, 'd'] = arr + tm.assert_series_equal(df.ix[4, 'd'], + Series(arr, index=[8, 10], name='d')) + + # single dtype + df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list('abc'), + index=[[4, 4, 8], [8, 10, 12]]) + + df.ix[4, 'c'] = arr + exp = Series(arr, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) + + # scalar ok + df.ix[4, 'c'] = 10 + 
exp = Series(10, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) + + # invalid assignments + def f(): + df.ix[4, 'c'] = [0, 1, 2, 3] + + self.assertRaises(ValueError, f) + + def f(): + df.ix[4, 'c'] = [0] + + self.assertRaises(ValueError, f) + + # groupby example + NUM_ROWS = 100 + NUM_COLS = 10 + col_names = ['A' + num for num in + map(str, np.arange(NUM_COLS).tolist())] + index_cols = col_names[:5] + + df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, columns=col_names) + df = df.set_index(index_cols).sort_index() + grp = df.groupby(level=index_cols[:4]) + df['new_col'] = np.nan + + f_index = np.arange(5) + + def f(name, df2): + return Series(np.arange(df2.shape[0]), + name=df2.index.values[0]).reindex(f_index) + + # TODO(wesm): unused? + # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T + + # we are actually operating on a copy here + # but in this case, that's ok + for name, df2 in grp: + new_vals = np.arange(df2.shape[0]) + df.ix[name, 'new_col'] = new_vals + + def test_multiindex_label_slicing_with_negative_step(self): + s = Series(np.arange(20), + MultiIndex.from_product([list('abcde'), np.arange(4)])) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + + assert_slices_equivalent(SLC[::-1], SLC[::-1]) + + assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) + assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1]) + + assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1]) + + assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC['d':('b', ):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) + + assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) + assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) + assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) + + def test_multiindex_slice_first_level(self): + # GH 12697 + freq = ['a', 'b', 'c', 'd'] + idx = pd.MultiIndex.from_product([freq, np.arange(500)]) + df = pd.DataFrame(list(range(2000)), index=idx, columns=['Test']) + df_slice = df.loc[pd.IndexSlice[:, 30:70], :] + result = df_slice.loc['a'] + expected = pd.DataFrame(list(range(30, 71)), + columns=['Test'], + index=range(30, 71)) + tm.assert_frame_equal(result, expected) + result = df_slice.loc['d'] + expected = pd.DataFrame(list(range(1530, 1571)), + columns=['Test'], + index=range(30, 71)) + tm.assert_frame_equal(result, expected) + + +class TestMultiIndexSlicers(tm.TestCase): + + def test_per_axis_per_level_getitem(self): + + # GH6134 + # example test case + ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( + 'C', 4), _mklbl('D', 2)]) + df = DataFrame(np.arange(len(ix.get_values())), index=ix) + + result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C2' or c == 'C3')]] + result = 
df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] + tm.assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A', 1), ('A', 2), + ('A', 3), ('B', 1)], + names=['one', 'two']) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + + df = DataFrame( + np.arange(16, dtype='int64').reshape( + 4, 4), index=index, columns=columns) + df = df.sort_index(axis=0).sort_index(axis=1) + + # identity + result = df.loc[(slice(None), slice(None)), :] + tm.assert_frame_equal(result, df) + result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + result = df.loc[:, (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), 1), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # columns + result = df.loc[:, (slice(None), ['foo'])] + expected = df.iloc[:, [1, 3]] + tm.assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None), 1), (slice(None), ['foo'])] + expected = df.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc['A', 'a'] + expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), + index=Index([1, 2, 3], name='two'), + columns=Index(['bar', 'foo'], name='lvl1')) + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), [1, 2]), :] + expected = df.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + # multi-level series + s = Series(np.arange(len(ix.get_values())), index=ix) + result = s.loc['A1':'A3', :, ['C1', 'C3']] + expected = s.loc[[tuple([a, b, c, d]) + for a, b, c, d in s.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_series_equal(result, expected) + + # boolean indexers + result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[(slice(None), np.array([True, False])), :] + + self.assertRaises(ValueError, f) + + # ambiguous cases + # these can be multiply interpreted (e.g. 
in this case + # as df.loc[slice(None), [1]] as well) + self.assertRaises(KeyError, lambda: df.loc[slice(None), [1]]) + + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # not lexsorted + self.assertEqual(df.index.lexsort_depth, 2) + df = df.sort_index(level=1, axis=0) + self.assertEqual(df.index.lexsort_depth, 0) + with tm.assertRaisesRegexp( + UnsortedIndexError, + 'MultiIndex Slicing requires the index to be fully ' + r'lexsorted tuple len \(2\), lexsort depth \(0\)'): + df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + + def test_multiindex_slicers_non_unique(self): + + # GH 7106 + # non-unique mi index support + df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], + B=['a', 'a', 'a', 'a'], + C=[1, 2, 1, 3], + D=[1, 2, 3, 4])) + .set_index(['A', 'B', 'C']).sort_index()) + self.assertFalse(df.index.is_unique) + expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], + C=[1, 1], D=[1, 3])) + .set_index(['A', 'B', 'C']).sort_index()) + result = df.loc[(slice(None), slice(None), 1), :] + tm.assert_frame_equal(result, expected) + + # this is the equivalent of an xs expression + result = df.xs(1, level=2, drop_level=False) + tm.assert_frame_equal(result, expected) + + df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], + B=['a', 'a', 'a', 'a'], + C=[1, 2, 1, 2], + D=[1, 2, 3, 4])) + .set_index(['A', 'B', 'C']).sort_index()) + self.assertFalse(df.index.is_unique) + expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], + C=[1, 1], D=[1, 3])) + .set_index(['A', 'B', 'C']).sort_index()) + result = df.loc[(slice(None), slice(None), 1), :] + self.assertFalse(result.index.is_unique) + tm.assert_frame_equal(result, expected) + + # GH12896 + # numpy-implementation dependent bug + ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, + 17, 18, 19, 200000, 200000] + n = len(ints) + idx = MultiIndex.from_arrays([['a'] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + tm.assert_series_equal(result, expected) + + def test_multiindex_slicers_datetimelike(self): + + # GH 7429 + # buggy/inconsistent behavior when slicing with datetime-like + import datetime + dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) + + datetime.timedelta(days=i) for i in range(6)] + freq = [1, 2] + index = MultiIndex.from_product( + [dates, freq], names=['date', 'frequency']) + + df = DataFrame( + np.arange(6 * 2 * 4, dtype='int64').reshape( + -1, 4), index=index, columns=list('ABCD')) + + # multi-axis slicing + idx = pd.IndexSlice + expected = df.iloc[[0, 2, 4], [0, 1]] + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), + slice(1, 1)), slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( + '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), 1), + slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + # with strings + result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), + slice(1, 1)), slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), + idx['A', 'B']] + tm.assert_frame_equal(result, expected) + + def 
test_multiindex_slicers_edges(self): + # GH 8132 + # various edge cases + df = DataFrame( + {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5, + 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3, + 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", + "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", + "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", + "2013-07-09", "2013-08-06", "2013-09-03"], + 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]}) + + df['DATE'] = pd.to_datetime(df['DATE']) + df1 = df.set_index(['A', 'B', 'DATE']) + df1 = df1.sort_index() + + # A1 - Get all values under "A0" and "A1" + result = df1.loc[(slice('A1')), :] + expected = df1.iloc[0:10] + tm.assert_frame_equal(result, expected) + + # A2 - Get all values from the start to "A2" + result = df1.loc[(slice('A2')), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # A3 - Get all values under "B1" or "B2" + result = df1.loc[(slice(None), slice('B1', 'B2')), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] + tm.assert_frame_equal(result, expected) + + # A4 - Get all values between 2013-07-02 and 2013-07-09 + result = df1.loc[(slice(None), slice(None), + slice('20130702', '20130709')), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + # B1 - Get all values in B0 that are also under A0, A1 and A2 + result = df1.loc[(slice('A2'), slice('B0')), :] + expected = df1.iloc[[0, 1, 5, 6, 10, 11]] + tm.assert_frame_equal(result, expected) + + # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for + # the As) + result = df1.loc[(slice(None), slice('B2')), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # B3 - Get all values from B1 to B2 and up to 2013-08-06 + result = df1.loc[(slice(None), slice('B1', 'B2'), + slice('2013-08-06')), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] + tm.assert_frame_equal(result, expected) + + # B4 - Same as A4 but the start of the date slice is not a key. 
+ # shows indexing on a partial selection slice + result = df1.loc[(slice(None), slice(None), + slice('20130701', '20130709')), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + def test_per_axis_per_level_doc_examples(self): + + # test index maker + idx = pd.IndexSlice + + # from indexing.rst / advanced + index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), + _mklbl('C', 4), _mklbl('D', 2)]) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') + .reshape((len(index), len(columns))), + index=index, columns=columns) + result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + result = df.loc[idx[:, :, ['C1', 'C3']], :] + tm.assert_frame_equal(result, expected) + + # not sorted + def f(): + df.loc['A1', (slice(None), 'foo')] + + self.assertRaises(UnsortedIndexError, f) + df = df.sort_index(axis=1) + + # slicing + df.loc['A1', (slice(None), 'foo')] + df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')] + + # setitem + df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + + def test_loc_axis_arguments(self): + + index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), + _mklbl('C', 4), _mklbl('D', 2)]) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') + .reshape((len(index), len(columns))), + index=index, + columns=columns).sort_index().sort_index(axis=1) + + # axis 0 + result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis='index')[:, :, ['C1', 'C3']] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:, 'foo'] + expected = df.loc[:, (slice(None), 'foo')] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis='columns')[:, 'foo'] + expected = df.loc[:, (slice(None), 'foo')] + tm.assert_frame_equal(result, expected) + + # invalid axis + def f(): + df.loc(axis=-1)[:, :, ['C1', 'C3']] + + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis=2)[:, :, ['C1', 'C3']] + + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis='foo')[:, :, ['C1', 'C3']] + + self.assertRaises(ValueError, f) + + def test_per_axis_per_level_setitem(self): + + # test index maker + idx = pd.IndexSlice + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A', 1), ('A', 2), + ('A', 3), ('B', 1)], + names=['one', 'two']) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + + df_orig = 
DataFrame( + np.arange(16, dtype='int64').reshape( + 4, 4), index=index, columns=columns) + df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None), slice(None)), :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:, (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None), [1]), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, 1] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:, (slice(None), ['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[:, [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, 1], idx[:, ['foo']]] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc['A', 'a'] = 100 + expected = df_orig.copy() + expected.iloc[0:3, 0:2] = 100 + tm.assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( + [[100, 100], [100, 100]], dtype='int64') + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + + def f(): + df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( + [[100], [100, 100]], dtype='int64') + + self.assertRaises(ValueError, f) + + def f(): + df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( + [100, 100, 100, 100], dtype='int64') + + self.assertRaises(ValueError, f) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice( + None), 1), (slice(None), ['foo'])] * 5 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( + None), 1), (slice(None), ['foo'])] + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() + rhs.loc[:, ('c', 'bah')] = 10 + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + +class TestMultiIndexPanel(tm.TestCase): + + def 
test_iloc_getitem_panel_multiindex(self): + # GH 7199 + # Panel with multi-index + multi_index = pd.MultiIndex.from_tuples([('ONE', 'one'), + ('TWO', 'two'), + ('THREE', 'three')], + names=['UPPER', 'lower']) + + simple_index = [x[0] for x in multi_index] + wd1 = Panel(items=['First', 'Second'], major_axis=['a', 'b', 'c', 'd'], + minor_axis=multi_index) + + wd2 = Panel(items=['First', 'Second'], major_axis=['a', 'b', 'c', 'd'], + minor_axis=simple_index) + + expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] + result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG + tm.assert_frame_equal(result1, expected1) + + expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] + result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] + tm.assert_frame_equal(result2, expected2) + + expected1 = DataFrame(index=['a'], columns=multi_index, + dtype='float64') + result1 = wd1.iloc[0, [0], [0, 1, 2]] + tm.assert_frame_equal(result1, expected1) + + expected2 = DataFrame(index=['a'], columns=simple_index, + dtype='float64') + result2 = wd2.iloc[0, [0], [0, 1, 2]] + tm.assert_frame_equal(result2, expected2) + + # GH 7516 + mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) + p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), + items=['a', 'b', 'c'], major_axis=mi, + minor_axis=['u', 'v', 'w']) + result = p.iloc[:, 1, 0] + expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u') + tm.assert_series_equal(result, expected) + + result = p.loc[:, (1, 'y'), 'u'] + tm.assert_series_equal(result, expected) + + def test_panel_setitem_with_multiindex(self): + + # 10360 + # failing with a multi-index + arr = np.array([[[1, 2, 3], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]]], + dtype=np.float64) + + # reg index + axes = dict(items=['A', 'B'], major_axis=[0, 1], + minor_axis=['X', 'Y', 'Z']) + p1 = Panel(0., **axes) + p1.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p1, expected) + + # multi-indexes + axes['items'] = pd.MultiIndex.from_tuples([('A', 'a'), ('B', 'b')]) + p2 = Panel(0., **axes) + p2.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p2, expected) + + axes['major_axis'] = pd.MultiIndex.from_tuples([('A', 1), ('A', 2)]) + p3 = Panel(0., **axes) + p3.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p3, expected) + + axes['minor_axis'] = pd.MultiIndex.from_product([['X'], range(3)]) + p4 = Panel(0., **axes) + p4.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p4, expected) + + arr = np.array( + [[[1, 0, 0], [2, 0, 0]], [[0, 0, 0], [0, 0, 0]]], dtype=np.float64) + p5 = Panel(0., **axes) + p5.iloc[0, :, 0] = [1, 2] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p5, expected) diff --git a/pandas/tests/indexing/test_panel.py b/pandas/tests/indexing/test_panel.py new file mode 100644 index 0000000000000..5ec3076af599a --- /dev/null +++ b/pandas/tests/indexing/test_panel.py @@ -0,0 +1,209 @@ +import numpy as np +from pandas.util import testing as tm +from pandas import Panel, date_range, DataFrame + + +class TestPanel(tm.TestCase): + + def test_iloc_getitem_panel(self): + + # GH 7189 + p = Panel(np.arange(4 * 3 * 2).reshape(4, 3, 2), + items=['A', 'B', 'C', 'D'], + major_axis=['a', 'b', 'c'], + minor_axis=['one', 'two']) + + result = p.iloc[1] + expected = p.loc['B'] + tm.assert_frame_equal(result, expected) + + result = p.iloc[1, 1] + expected = p.loc['B', 'b'] + tm.assert_series_equal(result, expected) + + result = 
p.iloc[1, 1, 1] + expected = p.loc['B', 'b', 'two'] + self.assertEqual(result, expected) + + # slice + result = p.iloc[1:3] + expected = p.loc[['B', 'C']] + tm.assert_panel_equal(result, expected) + + result = p.iloc[:, 0:2] + expected = p.loc[:, ['a', 'b']] + tm.assert_panel_equal(result, expected) + + # list of integers + result = p.iloc[[0, 2]] + expected = p.loc[['A', 'C']] + tm.assert_panel_equal(result, expected) + + # negative indices + result = p.iloc[[-1, 1], [-1, 1]] + expected = p.loc[['D', 'B'], ['c', 'b']] + tm.assert_panel_equal(result, expected) + + # duplicate indices + result = p.iloc[[-1, -1, 1], [-1, 1]] + expected = p.loc[['D', 'D', 'B'], ['c', 'b']] + tm.assert_panel_equal(result, expected) + + # combined + result = p.iloc[0, [True, True], [0, 1]] + expected = p.loc['A', ['a', 'b'], ['one', 'two']] + tm.assert_frame_equal(result, expected) + + # out-of-bounds exception + self.assertRaises(IndexError, p.iloc.__getitem__, tuple([10, 5])) + + def f(): + p.iloc[0, [True, True], [0, 1, 2]] + + self.assertRaises(IndexError, f) + + # trying to use a label + self.assertRaises(ValueError, p.iloc.__getitem__, tuple(['j', 'D'])) + + # GH + p = Panel( + np.random.rand(4, 3, 2), items=['A', 'B', 'C', 'D'], + major_axis=['U', 'V', 'W'], minor_axis=['X', 'Y']) + expected = p['A'] + + result = p.iloc[0, :, :] + tm.assert_frame_equal(result, expected) + + result = p.iloc[0, [True, True, True], :] + tm.assert_frame_equal(result, expected) + + result = p.iloc[0, [True, True, True], [0, 1]] + tm.assert_frame_equal(result, expected) + + def f(): + p.iloc[0, [True, True, True], [0, 1, 2]] + + self.assertRaises(IndexError, f) + + def f(): + p.iloc[0, [True, True, True], [2]] + + self.assertRaises(IndexError, f) + + def test_iloc_panel_issue(self): + + # GH 3617 + p = Panel(np.random.randn(4, 4, 4)) + + self.assertEqual(p.iloc[:3, :3, :3].shape, (3, 3, 3)) + self.assertEqual(p.iloc[1, :3, :3].shape, (3, 3)) + self.assertEqual(p.iloc[:3, 1, :3].shape, (3, 3)) + self.assertEqual(p.iloc[:3, :3, 1].shape, (3, 3)) + self.assertEqual(p.iloc[1, 1, :3].shape, (3, )) + self.assertEqual(p.iloc[1, :3, 1].shape, (3, )) + self.assertEqual(p.iloc[:3, 1, 1].shape, (3, )) + + def test_panel_getitem(self): + # GH4016, date selection returns a frame when given a partial string + # selection + ind = date_range(start="2000", freq="D", periods=1000) + df = DataFrame( + np.random.randn( + len(ind), 5), index=ind, columns=list('ABCDE')) + panel = Panel(dict([('frame_' + c, df) for c in list('ABC')])) + + test2 = panel.ix[:, "2002":"2002-12-31"] + test1 = panel.ix[:, "2002"] + tm.assert_panel_equal(test1, test2) + + # GH8710 + # multi-element getting with a list + panel = tm.makePanel() + + expected = panel.iloc[[0, 1]] + + result = panel.loc[['ItemA', 'ItemB']] + tm.assert_panel_equal(result, expected) + + result = panel.loc[['ItemA', 'ItemB'], :, :] + tm.assert_panel_equal(result, expected) + + result = panel[['ItemA', 'ItemB']] + tm.assert_panel_equal(result, expected) + + result = panel.loc['ItemA':'ItemB'] + tm.assert_panel_equal(result, expected) + + result = panel.ix['ItemA':'ItemB'] + tm.assert_panel_equal(result, expected) + + result = panel.ix[['ItemA', 'ItemB']] + tm.assert_panel_equal(result, expected) + + # with an object-like + # GH 9140 + class TestObject: + + def __str__(self): + return "TestObject" + + obj = TestObject() + + p = Panel(np.random.randn(1, 5, 4), items=[obj], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + + expected = p.iloc[0] + result = p[obj] + 
tm.assert_frame_equal(result, expected) + + def test_panel_setitem(self): + + # GH 7763 + # loc and setitem have setting differences + np.random.seed(0) + index = range(3) + columns = list('abc') + + panel = Panel({'A': DataFrame(np.random.randn(3, 3), + index=index, columns=columns), + 'B': DataFrame(np.random.randn(3, 3), + index=index, columns=columns), + 'C': DataFrame(np.random.randn(3, 3), + index=index, columns=columns)}) + + replace = DataFrame(np.eye(3, 3), index=range(3), columns=columns) + expected = Panel({'A': replace, 'B': replace, 'C': replace}) + + p = panel.copy() + for idx in list('ABC'): + p[idx] = replace + tm.assert_panel_equal(p, expected) + + p = panel.copy() + for idx in list('ABC'): + p.loc[idx, :, :] = replace + tm.assert_panel_equal(p, expected) + + def test_panel_assignment(self): + # GH3777 + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + wp2 = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + + # TODO: unused? + # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] + + def f(): + wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = wp2.loc[ + ['Item1', 'Item2'], :, ['A', 'B']] + + self.assertRaises(NotImplementedError, f) + + # to_assign = wp2.loc[['Item1', 'Item2'], :, ['A', 'B']] + # wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = to_assign + # result = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] + # tm.assert_panel_equal(result,expected) diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py new file mode 100644 index 0000000000000..e5ccd72cac20a --- /dev/null +++ b/pandas/tests/indexing/test_timedelta.py @@ -0,0 +1,21 @@ +import pandas as pd +from pandas.util import testing as tm + + +class TestTimedeltaIndexing(tm.TestCase): + + def test_boolean_indexing(self): + # GH 14946 + df = pd.DataFrame({'x': range(10)}) + df.index = pd.to_timedelta(range(10), unit='s') + conditions = [df['x'] > 3, df['x'] == 3, df['x'] < 3] + expected_data = [[0, 1, 2, 3, 10, 10, 10, 10, 10, 10], + [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], + [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]] + for cond, data in zip(conditions, expected_data): + result = df.copy() + result.loc[cond, 'x'] = 10 + expected = pd.DataFrame(data, + index=pd.to_timedelta(range(10), unit='s'), + columns=['x']) + tm.assert_frame_equal(expected, result) From a4e8018b1c63a56f6f83ff3755fad7cea31e6a95 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 11 Feb 2017 23:02:13 +0100 Subject: [PATCH 129/338] DOC: fix py3 compat (change lost in FAQ-gotchas merge) --- doc/source/advanced.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 21ae9f1eb8409..8833d73cb0a84 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -880,7 +880,7 @@ normal Python ``list``. Monotonicity of an index can be tested with the ``is_mon .. ipython:: python - df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=range(5)) + df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5))) df.index.is_monotonic_increasing # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4: @@ -894,7 +894,7 @@ On the other hand, if the index is not monotonic, then both slice bounds must be .. 
ipython:: python - df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=range(6)) + df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6))) df.index.is_monotonic_increasing # OK because 2 and 4 are in the index From 5dabbaff7fd6fc212f8fe5eaa0a00ecfd5eeb7a2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 11 Feb 2017 18:56:19 -0500 Subject: [PATCH 130/338] TST: remove nose TST: raise nose.SkipTest -> pytest.skip TST: remove KnownFailure (unused), should be replaced by pytest.xfail anyhow xref #15341 Author: Jeff Reback Closes #15368 from jreback/skip and squashes the following commits: afdb5f9 [Jeff Reback] TST: raise nose.SkipTest -> pytest.skip --- .travis.yml | 2 +- ci/install_test.sh | 1 - ci/install_travis.sh | 13 +- ci/lint.sh | 2 +- ci/requirements_all.txt | 2 +- ci/requirements_dev.txt | 2 +- ci/script.sh | 6 +- doc/source/contributing.rst | 2 +- pandas/computation/tests/test_compat.py | 8 +- pandas/computation/tests/test_eval.py | 19 ++- pandas/io/tests/json/test_pandas.py | 6 +- pandas/io/tests/json/test_ujson.py | 12 +- pandas/io/tests/parser/c_parser_only.py | 4 +- pandas/io/tests/parser/common.py | 8 +- pandas/io/tests/parser/compression.py | 8 +- pandas/io/tests/parser/converters.py | 6 +- pandas/io/tests/parser/parse_dates.py | 8 +- pandas/io/tests/parser/python_parser_only.py | 6 +- pandas/io/tests/parser/test_network.py | 4 +- pandas/io/tests/parser/test_read_fwf.py | 8 +- pandas/io/tests/parser/usecols.py | 4 +- pandas/io/tests/test_clipboard.py | 7 +- pandas/io/tests/test_excel.py | 42 +++--- pandas/io/tests/test_feather.py | 9 +- pandas/io/tests/test_gbq.py | 26 ++-- pandas/io/tests/test_html.py | 12 +- pandas/io/tests/test_packers.py | 28 ++-- pandas/io/tests/test_pickle.py | 4 +- pandas/io/tests/test_pytables.py | 29 ++-- pandas/io/tests/test_sql.py | 38 ++--- pandas/io/tests/test_stata.py | 4 +- pandas/sparse/tests/test_indexing.py | 2 +- pandas/sparse/tests/test_libsparse.py | 4 +- pandas/sparse/tests/test_series.py | 4 +- pandas/tests/formats/test_format.py | 12 +- pandas/tests/formats/test_printing.py | 2 +- pandas/tests/formats/test_style.py | 18 +-- pandas/tests/frame/test_analytics.py | 6 +- pandas/tests/frame/test_constructors.py | 6 +- pandas/tests/frame/test_missing.py | 4 +- pandas/tests/frame/test_operators.py | 4 +- pandas/tests/frame/test_quantile.py | 6 +- pandas/tests/frame/test_query_eval.py | 8 +- pandas/tests/groupby/test_misc.py | 4 +- pandas/tests/indexes/datetimes/test_tools.py | 6 +- pandas/tests/indexes/test_multi.py | 6 +- pandas/tests/indexing/test_coercion.py | 12 +- pandas/tests/plotting/common.py | 4 +- pandas/tests/plotting/test_boxplot_method.py | 4 +- pandas/tests/plotting/test_datetimelike.py | 6 +- pandas/tests/plotting/test_frame.py | 4 +- pandas/tests/series/test_analytics.py | 16 +- pandas/tests/series/test_missing.py | 8 +- pandas/tests/series/test_quantile.py | 8 +- pandas/tests/test_base.py | 4 +- pandas/tests/test_expressions.py | 96 ++++++------ pandas/tests/test_generic.py | 6 +- pandas/tests/test_internals.py | 4 +- pandas/tests/test_msgpack/test_unpack.py | 4 +- pandas/tests/test_multilevel.py | 4 +- pandas/tests/test_panel.py | 12 +- pandas/tests/test_panel4d.py | 4 +- pandas/tests/test_testing.py | 33 +--- pandas/tests/test_window.py | 16 +- pandas/tests/tseries/test_converter.py | 7 +- pandas/tests/tseries/test_offsets.py | 8 +- pandas/tools/tests/test_util.py | 16 +- pandas/util/decorators.py | 59 -------- pandas/util/print_versions.py | 2 +- pandas/util/testing.py | 150 
++++++++----------- setup.cfg | 2 +- 71 files changed, 389 insertions(+), 522 deletions(-) diff --git a/.travis.yml b/.travis.yml index b38c99e3a5be9..2ff5d508d0371 100644 --- a/.travis.yml +++ b/.travis.yml @@ -331,5 +331,5 @@ after_script: - echo "after_script start" - ci/install_test.sh - source activate pandas && python -c "import pandas; pandas.show_versions();" - - ci/print_skipped.py /tmp/nosetests.xml + - ci/print_skipped.py /tmp/pytest.xml - echo "after_script done" diff --git a/ci/install_test.sh b/ci/install_test.sh index cbb84d8fa4b65..9ace633d7f39d 100755 --- a/ci/install_test.sh +++ b/ci/install_test.sh @@ -8,7 +8,6 @@ if [ "$INSTALL_TEST" ]; then conda uninstall cython || exit 1 python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1 pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1 - # nosetests --exe -A "$TEST_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml pytest pandas/tests/test_series.py --junitxml=/tmp/pytest_install.xml else echo "Skipping installation test." diff --git a/ci/install_travis.sh b/ci/install_travis.sh index f65176fb1147c..ad804b96a0d82 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -92,12 +92,7 @@ if [ -e ${INSTALL} ]; then time bash $INSTALL || exit 1 else # create new env - time conda create -n pandas python=$PYTHON_VERSION nose pytest || exit 1 - - if [ "$LINT" ]; then - conda install flake8 - pip install cpplint - fi + time conda create -n pandas python=$PYTHON_VERSION pytest || exit 1 fi # build deps @@ -116,6 +111,12 @@ fi source activate pandas +pip install pytest-xdist +if [ "$LINT" ]; then + conda install flake8 + pip install cpplint +fi + if [ "$COVERAGE" ]; then pip install coverage pytest-cov fi diff --git a/ci/lint.sh b/ci/lint.sh index 2cbfdadf486b8..2ffc68e5eb139 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -55,7 +55,7 @@ if [ "$LINT" ]; then echo "Linting *.c and *.h DONE" echo "Check for invalid testing" - grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas + grep -r -E --include '*.py' --exclude testing.py '(numpy|np)\.testing' pandas if [ $? = "0" ]; then RET=1 fi diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt index b64143fcd4ecd..4ff80a478f247 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements_all.txt @@ -1,6 +1,6 @@ -nose pytest pytest-cov +pytest-xdist flake8 sphinx ipython diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index b8af9d035de98..b0a8adc8df5cb 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -2,7 +2,7 @@ python-dateutil pytz numpy cython -nose pytest pytest-cov +pytest-xdist flake8 diff --git a/ci/script.sh b/ci/script.sh index 3eac3002d6805..c52fa0fdb33a3 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -18,10 +18,10 @@ if [ -n "$LOCALE_OVERRIDE" ]; then fi if [ "$BUILD_TEST" ]; then - echo "We are not running nosetests as this is simply a build test." + echo "We are not running pytest as this is simply a build test." 
elif [ "$COVERAGE" ]; then - echo pytest -s --cov=pandas --cov-report xml:/tmp/nosetests.xml $TEST_ARGS pandas - pytest -s --cov=pandas --cov-report xml:/tmp/nosetests.xml $TEST_ARGS pandas + echo pytest -s --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + pytest -s --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas else echo pytest $TEST_ARGS pandas pytest $TEST_ARGS pandas # TODO: doctest diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 3ef9ed8962a23..5c2bb9b73d618 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -734,7 +734,7 @@ gbq integration tests on a forked repository: the status by visiting your Travis branches page which exists at the following location: https://travis-ci.org/your-user-name/pandas/branches . Click on a build job for your branch. Expand the following line in the - build log: ``ci/print_skipped.py /tmp/nosetests.xml`` . Search for the + build log: ``ci/print_skipped.py /tmp/pytest.xml`` . Search for the term ``test_gbq`` and confirm that gbq integration tests are not skipped. Running the vbench performance test suite (phasing out) diff --git a/pandas/computation/tests/test_compat.py b/pandas/computation/tests/test_compat.py index 900dd2c28b4c5..599d0c10336dc 100644 --- a/pandas/computation/tests/test_compat.py +++ b/pandas/computation/tests/test_compat.py @@ -1,7 +1,7 @@ # flake8: noqa -import nose +import pytest from itertools import product from distutils.version import LooseVersion @@ -31,7 +31,7 @@ def test_compat(): assert _NUMEXPR_INSTALLED except ImportError: - raise nose.SkipTest("not testing numexpr version compat") + pytest.skip("not testing numexpr version compat") def test_invalid_numexpr_version(): @@ -49,14 +49,14 @@ def testit(): try: import numexpr as ne except ImportError: - raise nose.SkipTest("no numexpr") + pytest.skip("no numexpr") else: if ne.__version__ < LooseVersion('2.1'): with tm.assertRaisesRegexp(ImportError, "'numexpr' version is " ".+, must be >= 2.1"): testit() elif ne.__version__ == LooseVersion('2.4.4'): - raise nose.SkipTest("numexpr version==2.4.4") + pytest.skip("numexpr version==2.4.4") else: testit() else: diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index a4bb81ce7263c..ada714c8ac52e 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -6,8 +6,7 @@ from itertools import product from distutils.version import LooseVersion -import nose -from nose.tools import assert_raises +import pytest from numpy.random import randn, rand, randint import numpy as np @@ -319,7 +318,7 @@ def get_expected_pow_result(self, lhs, rhs): except ValueError as e: if str(e).startswith('negative number cannot be raised to a fractional power'): if self.engine == 'python': - raise nose.SkipTest(str(e)) + pytest.skip(str(e)) else: expected = np.nan else: @@ -1174,13 +1173,15 @@ def test_bool_ops_with_constants(self): def test_panel_fails(self): x = Panel(randn(3, 4, 5)) y = Series(randn(10)) - assert_raises(NotImplementedError, self.eval, 'x + y', + with pytest.raises(NotImplementedError): + self.eval('x + y', local_dict={'x': x, 'y': y}) def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) - assert_raises(NotImplementedError, self.eval, 'x + y', + with pytest.raises(NotImplementedError): + self.eval('x + y', local_dict={'x': x, 'y': y}) def test_constant(self): @@ -1705,7 +1706,7 @@ def test_result_types(self): def test_result_types2(self): # xref 
https://github.com/pandas-dev/pandas/issues/12293 - raise nose.SkipTest("unreliable tests on complex128") + pytest.skip("unreliable tests on complex128") # Did not test complex64 because DataFrame is converting it to # complex128. Due to https://github.com/pandas-dev/pandas/issues/10952 @@ -1822,7 +1823,8 @@ def check_disallowed_nodes(engine, parser): inst = VisitorClass('x + 1', engine, parser) for ops in uns_ops: - assert_raises(NotImplementedError, getattr(inst, ops)) + with pytest.raises(NotImplementedError): + getattr(inst, ops)() def test_disallowed_nodes(): @@ -1833,7 +1835,8 @@ def test_disallowed_nodes(): def check_syntax_error_exprs(engine, parser): tm.skip_if_no_ne(engine) e = 's +' - assert_raises(SyntaxError, pd.eval, e, engine=engine, parser=parser) + with pytest.raises(SyntaxError): + pd.eval(e, engine=engine, parser=parser) def test_syntax_error_exprs(): diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 440f5c13d5121..c298b3841096c 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=W0612,E1101 -import nose +import pytest from pandas.compat import range, lrange, StringIO, OrderedDict import os @@ -1009,8 +1009,8 @@ def test_latin_encoding(self): return # GH 13774 - raise nose.SkipTest("encoding not implemented in .to_json(), " - "xref #13774") + pytest.skip("encoding not implemented in .to_json(), " + "xref #13774") values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], [b'E\xc9, 17', b'a', b'b', b'c'], diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 3da61b7696fdc..6a986710ae444 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -7,7 +7,7 @@ except ImportError: import simplejson as json import math -import nose +import pytest import platform import sys import time @@ -28,7 +28,7 @@ def _skip_if_python_ver(skip_major, skip_minor=None): major, minor = sys.version_info[:2] if major == skip_major and (skip_minor is None or minor == skip_minor): - raise nose.SkipTest("skipping Python version %d.%d" % (major, minor)) + pytest.skip("skipping Python version %d.%d" % (major, minor)) json_unicode = (json.dumps if compat.PY3 @@ -95,7 +95,7 @@ def test_encodeNonCLocale(self): try: locale.setlocale(locale.LC_NUMERIC, 'Italian_Italy') except: - raise nose.SkipTest('Could not set locale for testing') + pytest.skip('Could not set locale for testing') self.assertEqual(ujson.loads(ujson.dumps(4.78e60)), 4.78e60) self.assertEqual(ujson.loads('4.78', precise_float=True), 4.78) locale.setlocale(locale.LC_NUMERIC, savedlocale) @@ -113,7 +113,7 @@ def test_decimalDecodeTestPrecise(self): def test_encodeDoubleTinyExponential(self): if compat.is_platform_windows() and not compat.PY3: - raise nose.SkipTest("buggy on win-64 for py2") + pytest.skip("buggy on win-64 for py2") num = 1e-40 self.assertEqual(num, ujson.decode(ujson.encode(num))) @@ -393,8 +393,8 @@ def test_nat(self): def test_npy_nat(self): from distutils.version import LooseVersion if LooseVersion(np.__version__) < '1.7.0': - raise nose.SkipTest("numpy version < 1.7.0, is " - "{0}".format(np.__version__)) + pytest.skip("numpy version < 1.7.0, is " + "{0}".format(np.__version__)) input = np.datetime64('NaT') assert ujson.encode(input) == 'null', "Expected null" diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 11073f3f108ba..ffbd904843bfc 100644 --- 
a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -7,7 +7,7 @@ further arguments when parsing. """ -import nose +import pytest import numpy as np import pandas as pd @@ -159,7 +159,7 @@ def error(val): def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: - raise nose.SkipTest( + pytest.skip( "segfaults on win-64, only when all tests are run") data = """\ diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 9655c481b763a..0671901fc170a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -9,7 +9,7 @@ import sys from datetime import datetime -import nose +import pytest import numpy as np from pandas.lib import Timestamp @@ -635,8 +635,8 @@ def test_file(self): url_table = self.read_table('file://localhost/' + localtable) except URLError: # fails on some systems - raise nose.SkipTest("failing on %s" % - ' '.join(platform.uname()).strip()) + pytest.skip("failing on %s" % + ' '.join(platform.uname()).strip()) tm.assert_frame_equal(url_table, local_table) @@ -1262,7 +1262,7 @@ def test_verbose_import(self): def test_iteration_open_handle(self): if PY3: - raise nose.SkipTest( + pytest.skip( "won't work in Python 3 {0}".format(sys.version_info)) with tm.ensure_clean() as path: diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index 308ca6e8a5a2c..bdcd10fc64aa5 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -5,7 +5,7 @@ of the parsers defined in parsers.py """ -import nose +import pytest import pandas.util.testing as tm @@ -16,7 +16,7 @@ def test_zip(self): try: import zipfile except ImportError: - raise nose.SkipTest('need zipfile to run') + pytest.skip('need zipfile to run') with open(self.csv1, 'rb') as data_file: data = data_file.read() @@ -67,7 +67,7 @@ def test_gzip(self): try: import gzip except ImportError: - raise nose.SkipTest('need gzip to run') + pytest.skip('need gzip to run') with open(self.csv1, 'rb') as data_file: data = data_file.read() @@ -96,7 +96,7 @@ def test_bz2(self): try: import bz2 except ImportError: - raise nose.SkipTest('need bz2 to run') + pytest.skip('need bz2 to run') with open(self.csv1, 'rb') as data_file: data = data_file.read() diff --git a/pandas/io/tests/parser/converters.py b/pandas/io/tests/parser/converters.py index 2ceaff9291e7e..859d2e19bd56a 100644 --- a/pandas/io/tests/parser/converters.py +++ b/pandas/io/tests/parser/converters.py @@ -7,7 +7,7 @@ from datetime import datetime -import nose +import pytest import numpy as np import pandas as pd @@ -84,8 +84,8 @@ def test_converter_return_string_bug(self): def test_converters_corner_with_nas(self): # skip aberration observed on Win64 Python 3.2.2 if hash(np.int64(-1)) != -2: - raise nose.SkipTest("skipping because of windows hash on Python" - " 3.2.2") + pytest.skip("skipping because of windows hash on Python" + " 3.2.2") data = """id,score,days 1,2,12 diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index ad3d5f2382a49..6197d07d4eafa 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -8,7 +8,7 @@ from distutils.version import LooseVersion from datetime import datetime -import nose +import pytest import numpy as np import pandas.lib as lib from pandas.lib import Timestamp @@ -268,9 +268,9 @@ def test_yy_format_with_yearfirst(self): # See gh-217 import dateutil if dateutil.__version__ 
>= LooseVersion('2.5.0'): - raise nose.SkipTest("testing yearfirst=True not-support" - "on datetutil < 2.5.0 this works but" - "is wrong") + pytest.skip("testing yearfirst=True not-support" + "on datetutil < 2.5.0 this works but" + "is wrong") rs = self.read_csv(StringIO(data), index_col=0, parse_dates=[['date', 'time']]) diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 283ff366b5efd..bd76070933c47 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -9,7 +9,7 @@ import csv import sys -import nose +import pytest import pandas.util.testing as tm from pandas import DataFrame, Index @@ -79,7 +79,7 @@ def test_sniff_delimiter(self): def test_BytesIO_input(self): if not compat.PY3: - raise nose.SkipTest( + pytest.skip( "Bytes-related test - only needs to work on Python 3") data = BytesIO("שלום::1234\n562::123".encode('cp1255')) @@ -130,7 +130,7 @@ def test_decompression_regex_sep(self): import gzip import bz2 except ImportError: - raise nose.SkipTest('need gzip and bz2 to run') + pytest.skip('need gzip and bz2 to run') with open(self.csv1, 'rb') as f: data = f.read() diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index 533b7733bde28..4d75b59b09560 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -6,7 +6,7 @@ """ import os -import nose +import pytest import functools from itertools import product @@ -59,7 +59,7 @@ def setUp(self): try: import s3fs # noqa except ImportError: - raise nose.SkipTest("s3fs not installed") + pytest.skip("s3fs not installed") @tm.network def test_parse_public_s3_bucket(self): diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py index a423355081ac3..dccae06afe4d1 100644 --- a/pandas/io/tests/parser/test_read_fwf.py +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -8,7 +8,7 @@ from datetime import datetime -import nose +import pytest import numpy as np import pandas as pd import pandas.util.testing as tm @@ -75,7 +75,7 @@ def test_fwf(self): def test_BytesIO_input(self): if not compat.PY3: - raise nose.SkipTest( + pytest.skip( "Bytes-related test - only needs to work on Python 3") result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[ @@ -192,7 +192,7 @@ def test_fwf_compression(self): import gzip import bz2 except ImportError: - raise nose.SkipTest("Need gzip and bz2 to run this test") + pytest.skip("Need gzip and bz2 to run this test") data = """1111111111 2222222222 @@ -333,7 +333,7 @@ def test_multiple_delimiters(self): def test_variable_width_unicode(self): if not compat.PY3: - raise nose.SkipTest( + pytest.skip( 'Bytes-related test - only needs to work on Python 3') test = """ שלום שלום diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 4875282067fb3..95df077dae997 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -5,7 +5,7 @@ for all of the parsers defined in parsers.py """ -import nose +import pytest import numpy as np import pandas.util.testing as tm @@ -377,7 +377,7 @@ def test_usecols_with_multibyte_characters(self): tm.assert_frame_equal(df, expected) def test_usecols_with_multibyte_unicode_characters(self): - raise nose.SkipTest('TODO: see gh-13253') + pytest.skip('TODO: see gh-13253') s = '''あああ,いい,ううう,ええええ 0.056674973,8,True,a diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index 
98a4152754b55..3abd1093362f4 100644
--- a/pandas/io/tests/test_clipboard.py
+++ b/pandas/io/tests/test_clipboard.py
@@ -2,7 +2,7 @@
 import numpy as np
 from numpy.random import randint
-import nose
+import pytest
 import pandas as pd
 from pandas import DataFrame
@@ -15,10 +15,13 @@ try:
     DataFrame({'A': [1, 2]}).to_clipboard()
+    _DEPS_INSTALLED = 1
 except PyperclipException:
-    raise nose.SkipTest("clipboard primitives not installed")
+    _DEPS_INSTALLED = 0
+@pytest.mark.skipif(not _DEPS_INSTALLED,
+                    reason="clipboard primitives not installed")
 class TestClipboard(tm.TestCase):
     @classmethod
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index 2791e397d5b86..a22c89184f20d 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -9,7 +9,7 @@
 import warnings
 import operator
 import functools
-import nose
+import pytest
 from numpy import nan
 import numpy as np
@@ -32,30 +32,30 @@ def _skip_if_no_xlrd():
     try:
         import xlrd
         ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2]))
        if ver < (0, 9):
-            raise nose.SkipTest('xlrd < 0.9, skipping')
+            pytest.skip('xlrd < 0.9, skipping')
     except ImportError:
-        raise nose.SkipTest('xlrd not installed, skipping')
+        pytest.skip('xlrd not installed, skipping')
 def _skip_if_no_xlwt():
     try:
         import xlwt  # NOQA
     except ImportError:
-        raise nose.SkipTest('xlwt not installed, skipping')
+        pytest.skip('xlwt not installed, skipping')
 def _skip_if_no_openpyxl():
     try:
         import openpyxl  # NOQA
     except ImportError:
-        raise nose.SkipTest('openpyxl not installed, skipping')
+        pytest.skip('openpyxl not installed, skipping')
 def _skip_if_no_xlsxwriter():
     try:
         import xlsxwriter  # NOQA
     except ImportError:
-        raise nose.SkipTest('xlsxwriter not installed, skipping')
+        pytest.skip('xlsxwriter not installed, skipping')
 def _skip_if_no_excelsuite():
@@ -68,7 +68,7 @@ def _skip_if_no_s3fs():
     try:
         import s3fs  # noqa
     except ImportError:
-        raise nose.SkipTest('s3fs not installed, skipping')
+        pytest.skip('s3fs not installed, skipping')
 _seriesd = tm.getSeriesData()
@@ -600,7 +600,7 @@ def test_read_from_file_url(self):
         # FILE
         if sys.version_info[:2] < (2, 6):
-            raise nose.SkipTest("file:// not supported with Python < 2.6")
+            pytest.skip("file:// not supported with Python < 2.6")
         localtable = os.path.join(self.dirpath, 'test1' + self.ext)
         local_table = read_excel(localtable)
@@ -610,8 +610,8 @@ def test_read_from_file_url(self):
         except URLError:
             # fails on some systems
             import platform
-            raise nose.SkipTest("failing on %s" %
-                                ' '.join(platform.uname()).strip())
+            pytest.skip("failing on %s" %
+                        ' '.join(platform.uname()).strip())
         tm.assert_frame_equal(url_table, local_table)
@@ -1513,7 +1513,7 @@ def test_to_excel_unicode_filename(self):
         try:
             f = open(filename, 'wb')
         except UnicodeEncodeError:
-            raise nose.SkipTest('no unicode file names on this system')
+            pytest.skip('no unicode file names on this system')
         else:
             f.close()
@@ -1555,7 +1555,7 @@ def test_to_excel_unicode_filename(self):
     #     import xlwt
     #     import xlrd
     # except ImportError:
-    #     raise nose.SkipTest
+    #     pytest.skip
     # filename = '__tmp_to_excel_header_styling_xls__.xls'
     # pdf.to_excel(filename, 'test1')
@@ -1601,9 +1601,9 @@ def test_to_excel_unicode_filename(self):
     #     import openpyxl
     #     from openpyxl.cell import get_column_letter
     # except ImportError:
-    #     raise nose.SkipTest
+    #     pytest.skip
     # if openpyxl.__version__ < '1.6.1':
-    #     raise nose.SkipTest
+    #     pytest.skip
     # # test xlsx_styling
     # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx'
     # pdf.to_excel(filename, 'test1')
@@ -1635,7 +1635,7 @@ def test_excel_010_hemstring(self):
         _skip_if_no_xlrd()
         if self.merge_cells:
-            raise nose.SkipTest('Skip tests for merged MI format.')
+            pytest.skip('Skip tests for merged MI format.')
         from pandas.util.testing import makeCustomDataframe as mkdf
         # ensure limited functionality in 0.10
@@ -1690,7 +1690,7 @@ def test_excel_010_hemstring_raises_NotImplementedError(self):
         _skip_if_no_xlrd()
         if self.merge_cells:
-            raise nose.SkipTest('Skip tests for merged MI format.')
+            pytest.skip('Skip tests for merged MI format.')
         from pandas.util.testing import makeCustomDataframe as mkdf
         # ensure limited functionality in 0.10
@@ -1873,7 +1873,7 @@ class OpenpyxlTests(ExcelWriterBase, tm.TestCase):
     def test_to_excel_styleconverter(self):
         _skip_if_no_openpyxl()
         if not openpyxl_compat.is_compat(major_ver=1):
-            raise nose.SkipTest('incompatiable openpyxl version')
+            pytest.skip('incompatible openpyxl version')
         import openpyxl
@@ -1910,7 +1910,7 @@ def setUpClass(cls):
             ver = openpyxl.__version__
             if (not (LooseVersion(ver) >= LooseVersion('2.0.0') and
                      LooseVersion(ver) < LooseVersion('2.2.0'))):
-                raise nose.SkipTest("openpyxl %s >= 2.2" % str(ver))
+                pytest.skip("openpyxl %s >= 2.2" % str(ver))
         cls.setUpClass = setUpClass
         return cls
@@ -2026,7 +2026,7 @@ def setUpClass(cls):
             import openpyxl
             ver = openpyxl.__version__
             if LooseVersion(ver) < LooseVersion('2.2.0'):
-                raise nose.SkipTest("openpyxl %s < 2.2" % str(ver))
+                pytest.skip("openpyxl %s < 2.2" % str(ver))
         cls.setUpClass = setUpClass
         return cls
@@ -2095,7 +2095,7 @@ def test_to_excel_styleconverter(self):
     def test_write_cells_merge_styled(self):
         if not openpyxl_compat.is_compat(major_ver=2):
-            raise nose.SkipTest('incompatiable openpyxl version')
+            pytest.skip('incompatible openpyxl version')
         from pandas.formats.format import ExcelCell
@@ -2278,7 +2278,7 @@ def test_ExcelWriter_dispatch(self):
         except ImportError:
             _skip_if_no_openpyxl()
             if not openpyxl_compat.is_compat(major_ver=1):
-                raise nose.SkipTest('incompatible openpyxl version')
+                pytest.skip('incompatible openpyxl version')
             writer_klass = _Openpyxl1Writer
         with ensure_clean('.xlsx') as path:
diff --git a/pandas/io/tests/test_feather.py b/pandas/io/tests/test_feather.py
index 218175e5ef527..6e2c28a0f68de 100644
--- a/pandas/io/tests/test_feather.py
+++ b/pandas/io/tests/test_feather.py
@@ -1,17 +1,12 @@
 """ test feather-format compat """
-import nose
+import pytest
+feather = pytest.importorskip('feather')
 import numpy as np
 import pandas as pd
-
 from pandas.io.feather_format import to_feather, read_feather
-try:
-    import feather  # noqa
-except ImportError:
-    raise nose.SkipTest('no feather-format installed')
-
 from feather import FeatherError
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal, ensure_clean
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 1157482d7ae67..0868edd2147b5 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -1,6 +1,6 @@
 import re
 from datetime import datetime
-import nose
+import pytest
 import pytz
 import platform
 from time import sleep
@@ -42,25 +42,25 @@ def _skip_if_no_project_id():
     if not _get_project_id():
-        raise nose.SkipTest(
+        pytest.skip(
             "Cannot run integration tests without a project id")
 def _skip_local_auth_if_in_travis_env():
     if _in_travis_environment():
-        raise nose.SkipTest("Cannot run local auth in travis environment")
+        pytest.skip("Cannot run local auth in travis environment")
 def _skip_if_no_private_key_path():
     if not _get_private_key_path():
-        raise nose.SkipTest("Cannot run integration tests without a "
-                            "private key json file path")
+        pytest.skip("Cannot run integration tests without a "
+                    "private key json file path")
 def _skip_if_no_private_key_contents():
     if not _get_private_key_contents():
-        raise nose.SkipTest("Cannot run integration tests without a "
-                            "private key json contents")
+        pytest.skip("Cannot run integration tests without a "
+                    "private key json contents")
 def _in_travis_environment():
@@ -184,7 +184,7 @@ def _setup_common():
     try:
         _test_imports()
     except (ImportError, NotImplementedError) as import_exception:
-        raise nose.SkipTest(import_exception)
+        pytest.skip(import_exception)
     if _in_travis_environment():
         logging.getLogger('oauth2client').setLevel(logging.ERROR)
@@ -284,15 +284,15 @@ def test_should_be_able_to_get_results_from_query(self):
     def test_get_application_default_credentials_does_not_throw_error(self):
         if _check_if_can_get_correct_default_credentials():
-            raise nose.SkipTest("Can get default_credentials "
-                                "from the environment!")
+            pytest.skip("Can get default_credentials "
+                        "from the environment!")
         credentials = self.sut.get_application_default_credentials()
         self.assertIsNone(credentials)
     def test_get_application_default_credentials_returns_credentials(self):
         if not _check_if_can_get_correct_default_credentials():
-            raise nose.SkipTest("Cannot get default_credentials "
-                                "from the environment!")
+            pytest.skip("Cannot get default_credentials "
+                        "from the environment!")
         from oauth2client.client import GoogleCredentials
         credentials = self.sut.get_application_default_credentials()
         self.assertTrue(isinstance(credentials, GoogleCredentials))
@@ -1015,7 +1015,7 @@ def test_upload_data_if_table_exists_append(self):
     def test_upload_data_if_table_exists_replace(self):
-        raise nose.SkipTest("buggy test")
+        pytest.skip("buggy test")
         destination_table = DESTINATION_TABLE + "4"
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 356adb92829c6..232e68a87f16e 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -12,7 +12,7 @@
 from distutils.version import LooseVersion
-import nose
+import pytest
 import numpy as np
 from numpy.random import rand
@@ -39,7 +39,7 @@ def _have_module(module_name):
 def _skip_if_no(module_name):
     if not _have_module(module_name):
-        raise nose.SkipTest("{0!r} not found".format(module_name))
+        pytest.skip("{0!r} not found".format(module_name))
 def _skip_if_none_of(module_names):
@@ -48,16 +48,16 @@ def _skip_if_none_of(module_names):
     if module_names == 'bs4':
         import bs4
         if bs4.__version__ == LooseVersion('4.2.0'):
-            raise nose.SkipTest("Bad version of bs4: 4.2.0")
+            pytest.skip("Bad version of bs4: 4.2.0")
     else:
         not_found = [module_name for module_name in module_names if not
                      _have_module(module_name)]
         if set(not_found) & set(module_names):
-            raise nose.SkipTest("{0!r} not found".format(not_found))
+            pytest.skip("{0!r} not found".format(not_found))
         if 'bs4' in module_names:
             import bs4
             if bs4.__version__ == LooseVersion('4.2.0'):
-                raise nose.SkipTest("Bad version of bs4: 4.2.0")
+                pytest.skip("Bad version of bs4: 4.2.0")
 DATA_PATH = tm.get_data_path()
@@ -685,7 +685,7 @@ def test_decimal_rows(self):
 ''')
         expected = DataFrame(data={'Header': 1100.101}, index=[0])
         result = self.read_html(data, decimal='#')[0]
-        nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64'))
+        assert result['Header'].dtype == np.dtype('float64')
         tm.assert_frame_equal(result, expected)
     def test_bool_header_arg(self):
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
index 2ee36d85f674c..4bb6f4a69bab3 100644
--- a/pandas/io/tests/test_packers.py
+++ b/pandas/io/tests/test_packers.py
@@ -1,4 +1,4 @@
-import nose
+import pytest
 import os
 import datetime
@@ -168,7 +168,7 @@ def test_list_numpy_float(self):
     def test_list_numpy_float_complex(self):
         if not hasattr(np, 'complex128'):
-            raise nose.SkipTest('numpy cant handle complex128')
+            pytest.skip('numpy cannot handle complex128')
         x = [np.float32(np.random.rand()) for i in range(5)] + \
             [np.complex128(np.random.rand() + 1j * np.random.rand())
@@ -261,7 +261,7 @@ def test_datetimes(self):
         # fails under 2.6/win32 (np.datetime64 seems broken)
         if LooseVersion(sys.version) < '2.7':
-            raise nose.SkipTest('2.6 with np.datetime64 is broken')
+            pytest.skip('2.6 with np.datetime64 is broken')
         for i in [datetime.datetime(2013, 1, 1),
                   datetime.datetime(2013, 1, 1, 5, 1),
@@ -589,12 +589,12 @@ def _test_compression(self, compress):
     def test_compression_zlib(self):
         if not _ZLIB_INSTALLED:
-            raise nose.SkipTest('no zlib')
+            pytest.skip('no zlib')
         self._test_compression('zlib')
     def test_compression_blosc(self):
         if not _BLOSC_INSTALLED:
-            raise nose.SkipTest('no blosc')
+            pytest.skip('no blosc')
         self._test_compression('blosc')
     def _test_compression_warns_when_decompress_caches(self, compress):
@@ -653,12 +653,12 @@ def decompress(ob):
     def test_compression_warns_when_decompress_caches_zlib(self):
         if not _ZLIB_INSTALLED:
-            raise nose.SkipTest('no zlib')
+            pytest.skip('no zlib')
         self._test_compression_warns_when_decompress_caches('zlib')
     def test_compression_warns_when_decompress_caches_blosc(self):
         if not _BLOSC_INSTALLED:
-            raise nose.SkipTest('no blosc')
+            pytest.skip('no blosc')
         self._test_compression_warns_when_decompress_caches('blosc')
     def _test_small_strings_no_warn(self, compress):
@@ -690,18 +690,18 @@ def _test_small_strings_no_warn(self, compress):
     def test_small_strings_no_warn_zlib(self):
         if not _ZLIB_INSTALLED:
-            raise nose.SkipTest('no zlib')
+            pytest.skip('no zlib')
         self._test_small_strings_no_warn('zlib')
     def test_small_strings_no_warn_blosc(self):
         if not _BLOSC_INSTALLED:
-            raise nose.SkipTest('no blosc')
+            pytest.skip('no blosc')
         self._test_small_strings_no_warn('blosc')
     def test_readonly_axis_blosc(self):
         # GH11880
         if not _BLOSC_INSTALLED:
-            raise nose.SkipTest('no blosc')
+            pytest.skip('no blosc')
         df1 = DataFrame({'A': list('abcd')})
         df2 = DataFrame(df1, index=[1., 2., 3., 4.])
         self.assertTrue(1 in self.encode_decode(df1['A'], compress='blosc'))
@@ -717,9 +717,9 @@ def test_readonly_axis_zlib(self):
     def test_readonly_axis_blosc_to_sql(self):
         # GH11880
         if not _BLOSC_INSTALLED:
-            raise nose.SkipTest('no blosc')
+            pytest.skip('no blosc')
         if not self._SQLALCHEMY_INSTALLED:
-            raise nose.SkipTest('no sqlalchemy')
+            pytest.skip('no sqlalchemy')
         expected = DataFrame({'A': list('abcd')})
         df = self.encode_decode(expected, compress='blosc')
         eng = self._create_sql_engine("sqlite:///:memory:")
@@ -731,9 +731,9 @@ def test_readonly_axis_blosc_to_sql(self):
     def test_readonly_axis_zlib_to_sql(self):
         # GH11880
         if not _ZLIB_INSTALLED:
-            raise nose.SkipTest('no zlib')
+            pytest.skip('no zlib')
         if not self._SQLALCHEMY_INSTALLED:
-            raise nose.SkipTest('no sqlalchemy')
+            pytest.skip('no sqlalchemy')
         expected = DataFrame({'A': list('abcd')})
         df = self.encode_decode(expected, compress='zlib')
         eng = self._create_sql_engine("sqlite:///:memory:")
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
index 89827817a85fb..588b2d5f04888 100644
--- a/pandas/io/tests/test_pickle.py
+++ b/pandas/io/tests/test_pickle.py
@@ -2,7 +2,7 @@
 """ manage legacy pickle tests """
-import nose
+import pytest
 import os
 from distutils.version import LooseVersion
@@ -172,7 +172,7 @@ def compare_sp_frame_float(self, result, expected, typ, version):
     def read_pickles(self, version):
         if not is_platform_little_endian():
-            raise nose.SkipTest("known failure on non-little endian")
+            pytest.skip("known failure on non-little endian")
         pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version)))
         n = 0
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 501e744ad308c..3fa0eb2ef52dc 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -1,4 +1,4 @@
-import nose
+import pytest
 import sys
 import os
 import warnings
@@ -17,17 +17,14 @@
 from pandas.compat import is_platform_windows, PY3, PY35
 from pandas.formats.printing import pprint_thing
-from pandas.io.pytables import _tables, TableIterator
-try:
-    _tables()
-except ImportError as e:
-    raise nose.SkipTest(e)
-
+tables = pytest.importorskip('tables')
+from pandas.io.pytables import TableIterator
 from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
                                 IncompatibilityWarning, PerformanceWarning,
                                 AttributeConflictWarning, DuplicateWarning,
                                 PossibleDataLossError, ClosedFileError)
+
 from pandas.io import pytables as pytables
 import pandas.util.testing as tm
 from pandas.util.testing import (assert_panel4d_equal,
@@ -43,7 +40,7 @@
 try:
     import tables
 except ImportError:
-    raise nose.SkipTest('no pytables')
+    pytest.skip('no pytables')
 from distutils.version import LooseVersion
@@ -738,7 +735,7 @@ def test_put_compression(self):
     def test_put_compression_blosc(self):
         tm.skip_if_no_package('tables', '2.2', app='blosc support')
         if skip_compression:
-            raise nose.SkipTest("skipping on windows/PY3")
+            pytest.skip("skipping on windows/PY3")
         df = tm.makeTimeDataFrame()
@@ -968,7 +965,7 @@ def check(format, index):
     def test_encoding(self):
         if sys.byteorder != 'little':
-            raise nose.SkipTest('system byteorder is not little')
+            pytest.skip('system byteorder is not little')
         with ensure_clean_store(self.path) as store:
             df = DataFrame(dict(A='foo', B='bar'), index=range(5))
@@ -2830,14 +2827,14 @@ def test_index_types(self):
     def test_timeseries_preepoch(self):
         if sys.version_info[0] == 2 and sys.version_info[1] < 7:
-            raise nose.SkipTest("won't work on Python < 2.7")
+            pytest.skip("won't work on Python < 2.7")
         dr = bdate_range('1/1/1940', '1/1/1960')
         ts = Series(np.random.randn(len(dr)), index=dr)
         try:
             self._check_roundtrip(ts, tm.assert_series_equal)
         except OverflowError:
-            raise nose.SkipTest('known failer on some windows platforms')
+            pytest.skip('known failure on some windows platforms')
     def test_frame(self):
@@ -4202,8 +4199,8 @@ def test_nan_selection_bug_4858(self):
         # GH 4858; nan selection bug, only works for pytables >= 3.1
         if LooseVersion(tables.__version__) < '3.1.0':
-            raise nose.SkipTest('tables version does not support fix for nan '
-                                'selection bug: GH 4858')
+            pytest.skip('tables version does not support fix for nan '
+                        'selection bug: GH 4858')
         with ensure_clean_store(self.path) as store:
@@ -4453,7 +4450,7 @@ def test_pytables_native_read(self):
     def test_pytables_native2_read(self):
         # fails on win/3.5 oddly
         if PY35 and is_platform_windows():
-            raise nose.SkipTest("native2 read fails oddly on windows / 3.5")
+            pytest.skip("native2 read fails oddly on windows / 3.5")
         with ensure_clean_store(
                tm.get_data_path('legacy_hdf/pytables_native2.h5'),
@@ -4585,7 +4582,7 @@ def do_copy(f=None, new_f=None, keys=None,
             safe_remove(path)
     def test_legacy_table_write(self):
-        raise nose.SkipTest("cannot write legacy tables")
+        pytest.skip("cannot write legacy tables")
         store = HDFStore(tm.get_data_path(
             'legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a')
diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index ddda65c5bafc8..a6f4d96001021 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -24,7 +24,7 @@
 import os
 import sys
-import nose
+import pytest
 import warnings
 import numpy as np
 import pandas as pd
@@ -854,7 +854,7 @@ def connect(self):
         if SQLALCHEMY_INSTALLED:
             return sqlalchemy.create_engine('sqlite:///:memory:')
         else:
-            raise nose.SkipTest('SQLAlchemy not installed')
+            pytest.skip('SQLAlchemy not installed')
     def test_read_table_columns(self):
         # test columns argument in read_table
@@ -1063,7 +1063,7 @@ def test_con_string_import_error(self):
             self.assertRaises(ImportError, sql.read_sql, "SELECT * FROM iris",
                               conn)
         else:
-            raise nose.SkipTest('SQLAlchemy is installed')
+            pytest.skip('SQLAlchemy is installed')
     def test_read_sql_delegate(self):
         iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn)
@@ -1128,7 +1128,7 @@ def setUpClass(cls):
             conn.connect()
         except sqlalchemy.exc.OperationalError:
             msg = "{0} - can't connect to {1} server".format(cls, cls.flavor)
-            raise nose.SkipTest(msg)
+            pytest.skip(msg)
     def setUp(self):
         self.setup_connect()
@@ -1141,7 +1141,7 @@ def setUp(self):
     def setup_import(cls):
         # Skip this test if SQLAlchemy not available
         if not SQLALCHEMY_INSTALLED:
-            raise nose.SkipTest('SQLAlchemy not installed')
+            pytest.skip('SQLAlchemy not installed')
     @classmethod
     def setup_driver(cls):
@@ -1158,7 +1158,7 @@ def setup_connect(self):
             # to test if connection can be made:
             self.conn.connect()
         except sqlalchemy.exc.OperationalError:
-            raise nose.SkipTest(
+            pytest.skip(
                 "Can't connect to {0} server".format(self.flavor))
     def test_aread_sql(self):
@@ -1304,7 +1304,7 @@ def check(col):
         # GH11216
         df = pd.read_sql_query("select * from types_test_data", self.conn)
         if not hasattr(df, 'DateColWithTz'):
-            raise nose.SkipTest("no column with datetime with time zone")
+            pytest.skip("no column with datetime with time zone")
         # this is parsed on Travis (linux), but not on macosx for some reason
         # even with the same versions of psycopg2 & sqlalchemy, possibly a
@@ -1319,7 +1319,7 @@ def check(col):
         df = pd.read_sql_query("select * from types_test_data", self.conn,
                                parse_dates=['DateColWithTz'])
         if not hasattr(df, 'DateColWithTz'):
-            raise nose.SkipTest("no column with datetime with time zone")
+            pytest.skip("no column with datetime with time zone")
         check(df.DateColWithTz)
         df = pd.concat(list(pd.read_sql_query("select * from types_test_data",
@@ -1665,7 +1665,7 @@ class Temporary(Base):
 class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy):
     def test_transactions(self):
-        raise nose.SkipTest(
+        pytest.skip(
             "Nested transactions rollbacks don't work with Pandas")
@@ -1739,7 +1739,7 @@ def setup_driver(cls):
             import pymysql  # noqa
             cls.driver = 'pymysql'
         except ImportError:
-            raise nose.SkipTest('pymysql not installed')
+            pytest.skip('pymysql not installed')
     def test_default_type_conversion(self):
         df = sql.read_sql_table("types_test_data", self.conn)
@@ -1808,7 +1808,7 @@ def setup_driver(cls):
             import psycopg2  # noqa
             cls.driver = 'psycopg2'
         except ImportError:
-            raise nose.SkipTest('psycopg2 not installed')
+            pytest.skip('psycopg2 not installed')
     def test_schema_support(self):
         # only test this for postgresql (schema's not supported in
@@ -2007,7 +2007,7 @@ def test_to_sql_save_index(self):
     def test_transactions(self):
         if PY36:
-            raise nose.SkipTest("not working on python > 3.5")
+            pytest.skip("not working on python > 3.5")
         self._transaction_test()
     def _get_sqlite_column_type(self, table, column):
@@ -2019,7 +2019,7 @@ def _get_sqlite_column_type(self, table, column):
     def test_dtype(self):
         if self.flavor == 'mysql':
-            raise nose.SkipTest('Not applicable to MySQL legacy')
+            pytest.skip('Not applicable to MySQL legacy')
         cols = ['A', 'B']
         data = [(0.8, True), (0.9, None)]
@@ -2045,7 +2045,7 @@ def test_dtype(self):
     def test_notnull_dtype(self):
         if self.flavor == 'mysql':
-            raise nose.SkipTest('Not applicable to MySQL legacy')
+            pytest.skip('Not applicable to MySQL legacy')
         cols = {'Bool': Series([True, None]),
                 'Date': Series([datetime(2012, 5, 1), None]),
@@ -2130,7 +2130,7 @@ def _skip_if_no_pymysql():
     try:
         import pymysql  # noqa
     except ImportError:
-        raise nose.SkipTest('pymysql not installed, skipping')
+        pytest.skip('pymysql not installed, skipping')
 class TestXSQLite(SQLiteMixIn, tm.TestCase):
@@ -2389,12 +2389,12 @@ def setUpClass(cls):
         try:
             pymysql.connect(read_default_group='pandas')
         except pymysql.ProgrammingError:
-            raise nose.SkipTest(
+            pytest.skip(
                 "Create a group of connection parameters under the heading "
                 "[pandas] in your system's mysql default file, "
                 "typically located at ~/.my.cnf or /etc/.my.cnf. ")
        except pymysql.Error:
-            raise nose.SkipTest(
+            pytest.skip(
                 "Cannot connect to database. "
                 "Create a group of connection parameters under the heading "
                 "[pandas] in your system's mysql default file, "
@@ -2415,12 +2415,12 @@ def setUp(self):
         try:
             self.conn = pymysql.connect(read_default_group='pandas')
         except pymysql.ProgrammingError:
-            raise nose.SkipTest(
+            pytest.skip(
                 "Create a group of connection parameters under the heading "
                 "[pandas] in your system's mysql default file, "
                 "typically located at ~/.my.cnf or /etc/.my.cnf. ")
         except pymysql.Error:
-            raise nose.SkipTest(
+            pytest.skip(
                 "Cannot connect to database. "
                 "Create a group of connection parameters under the heading "
                 "[pandas] in your system's mysql default file, "
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index fcb935925e61f..ae09e671dbca3 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -9,7 +9,7 @@
 from datetime import datetime
 from distutils.version import LooseVersion
-import nose
+import pytest
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
@@ -128,7 +128,7 @@ def test_read_dta1(self):
     def test_read_dta2(self):
         if LooseVersion(sys.version) < '2.7':
-            raise nose.SkipTest('datetime interp under 2.6 is faulty')
+            pytest.skip('datetime interp under 2.6 is faulty')
         expected = DataFrame.from_records(
             [
diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py
index c400b68c8a7d8..357a7103f4027 100644
--- a/pandas/sparse/tests/test_indexing.py
+++ b/pandas/sparse/tests/test_indexing.py
@@ -1,6 +1,6 @@
 # pylint: disable-msg=E1101,W0612
-import nose  # noqa
+import pytest  # noqa
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py
index 491005db2ae79..4d5a93d77cf14 100644
--- a/pandas/sparse/tests/test_libsparse.py
+++ b/pandas/sparse/tests/test_libsparse.py
@@ -1,6 +1,6 @@
 from pandas import Series
-import nose
+import pytest
 import numpy as np
 import operator
 import pandas.util.testing as tm
@@ -213,7 +213,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
                            longer_index.to_int_index())
         if compat.is_platform_windows():
-            raise nose.SkipTest("segfaults on win-64 when all tests are run")
+            pytest.skip("segfaults on win-64 when all tests are run")
         check_cases(_check_case)
     def test_intersect_empty(self):
diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py
index db6ae14b096d3..d4543b97af4dd 100644
--- a/pandas/sparse/tests/test_series.py
+++ b/pandas/sparse/tests/test_series.py
@@ -577,8 +577,8 @@ def check(a, b):
     def test_binary_operators(self):
         # skipping for now #####
-        import nose
-        raise nose.SkipTest("skipping sparse binary operators test")
+        import pytest
+        pytest.skip("skipping sparse binary operators test")
         def _check_inplace_op(iop, op):
             tmp = self.bseries.copy()
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index 9a24ae332f7c5..476c6a636ae5a 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -44,7 +44,7 @@
     reset_option)
 from datetime import datetime
-import nose
+import pytest
 use_32bit_repr = is_platform_windows() or is_platform_32bit()
@@ -287,7 +287,7 @@ def test_repr_non_interactive(self):
     def test_repr_max_columns_max_rows(self):
         term_width, term_height = get_terminal_size()
         if term_width < 10 or term_height < 10:
-            raise nose.SkipTest("terminal size too small, "
+            pytest.skip("terminal size too small, "
                         "{0} x {1}".format(term_width, term_height))
     def mkframe(n):
@@ -1871,7 +1871,7 @@ def test_to_html_regression_GH6098(self):
         df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_()
     def test_to_html_truncate(self):
-        raise nose.SkipTest("unreliable on travis")
+        pytest.skip("unreliable on travis")
         index = pd.DatetimeIndex(start='20010101', freq='D', periods=20)
         df = DataFrame(index=index, columns=range(20))
         fmt.set_option('display.max_rows', 8)
@@ -1972,7 +1972,7 @@ def test_to_html_truncate(self):
         self.assertEqual(result, expected)
     def test_to_html_truncate_multi_index(self):
-        raise nose.SkipTest("unreliable on travis")
+        pytest.skip("unreliable on travis")
         arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                   ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
         df = DataFrame(index=arrays, columns=arrays)
@@ -2089,7 +2089,7 @@ def test_to_html_truncate_multi_index(self):
         self.assertEqual(result, expected)
     def test_to_html_truncate_multi_index_sparse_off(self):
-        raise nose.SkipTest("unreliable on travis")
+        pytest.skip("unreliable on travis")
         arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                   ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
         df = DataFrame(index=arrays, columns=arrays)
@@ -2250,7 +2250,7 @@ def test_pprint_thing(self):
         from pandas.formats.printing import pprint_thing as pp_t
         if PY3:
-            raise nose.SkipTest("doesn't work on Python 3")
+            pytest.skip("doesn't work on Python 3")
         self.assertEqual(pp_t('a'), u('a'))
         self.assertEqual(pp_t(u('a')), u('a'))
diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py
index 1e6794c1c9c69..52f3e06c6cbd0 100644
--- a/pandas/tests/formats/test_printing.py
+++ b/pandas/tests/formats/test_printing.py
@@ -126,7 +126,7 @@ def test_ambiguous_width(self):
 #     common.console_encode should encode things as utf-8.
 #     """
 #     if compat.PY3:
-#         raise nose.SkipTest
+#         pytest.skip
 #     with tm.stdin_encoding(encoding=None):
 #         result = printing.console_encode(u"\u05d0")
diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py
index eaa209178b2e9..53bb3f9010f7e 100644
--- a/pandas/tests/formats/test_style.py
+++ b/pandas/tests/formats/test_style.py
@@ -1,5 +1,4 @@
-import os
-from nose import SkipTest
+import pytest
 import copy
 import numpy as np
@@ -8,20 +7,7 @@
 from pandas.util.testing import TestCase
 import pandas.util.testing as tm
-# Getting failures on a python 2.7 build with
-# whenever we try to import jinja, whether it's installed or not.
-# so we're explicitly skipping that one *before* we try to import
-# jinja. We still need to export the imports as globals,
-# since importing Styler tries to import jinja2.
-job_name = os.environ.get('JOB_NAME', None)
-if job_name == '27_slow_nnet_LOCALE':
-    raise SkipTest("No jinja")
-try:
-    # Do try except on just jinja, so the only reason
-    # We skip is if jinja can't import, not something else
-    import jinja2  # noqa
-except ImportError:
-    raise SkipTest("No Jinja2")
+jinja2 = pytest.importorskip('jinja2')
 from pandas.formats.style import Styler, _get_level_lengths  # noqa
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index a55d2cfb2fb2b..1f0d16e959cd7 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -5,7 +5,7 @@
 from datetime import timedelta, datetime
 from distutils.version import LooseVersion
 import sys
-import nose
+import pytest
 from numpy import nan
 from numpy.random import randn
@@ -2066,8 +2066,8 @@ def test_round_issue(self):
     def test_built_in_round(self):
         if not compat.PY3:
-            raise nose.SkipTest("build in round cannot be overriden "
-                                "prior to Python 3")
+            pytest.skip("built-in round cannot be overridden "
+                        "prior to Python 3")
         # GH11763
         # Here's the test frame we'll be working with
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 66a235e1260bd..76eb61bd81110 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -6,7 +6,7 @@
 import functools
 import itertools
-import nose
+import pytest
 from numpy.random import randn
 import numpy as np
@@ -1702,7 +1702,7 @@ def test_from_records_with_datetimes(self):
         # this may fail on certain platforms because of a numpy issue
         # related GH6140
         if not is_platform_little_endian():
-            raise nose.SkipTest("known failure of test on non-little endian")
+            pytest.skip("known failure of test on non-little endian")
         # construction with a null in a recarray
         # GH 6140
@@ -1714,7 +1714,7 @@ def test_from_records_with_datetimes(self):
         try:
             recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
         except (ValueError):
-            raise nose.SkipTest("known failure of numpy rec array creation")
+            pytest.skip("known failure of numpy rec array creation")
         result = DataFrame.from_records(recarray)
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py
index ef800f0dface3..80ea01d3a05aa 100644
--- a/pandas/tests/frame/test_missing.py
+++ b/pandas/tests/frame/test_missing.py
@@ -23,8 +23,8 @@ def _skip_if_no_pchip():
     try:
         from scipy.interpolate import pchip_interpolate  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest('scipy.interpolate.pchip missing')
+        import pytest
+        pytest.skip('scipy.interpolate.pchip missing')
 class TestDataFrameMissingData(tm.TestCase, TestData):
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py
index ec73689088035..d6a3592446fd5 100644
--- a/pandas/tests/frame/test_operators.py
+++ b/pandas/tests/frame/test_operators.py
@@ -5,7 +5,7 @@
 from datetime import datetime
 import operator
-import nose
+import pytest
 from numpy import nan, random
 import numpy as np
@@ -323,7 +323,7 @@ def test_logical_typeerror(self):
             self.assertRaises(TypeError, self.frame.__gt__, 'foo')
             self.assertRaises(TypeError, self.frame.__ne__, 'foo')
         else:
-            raise nose.SkipTest('test_logical_typeerror not tested on PY3')
+            pytest.skip('test_logical_typeerror not tested on PY3')
     def test_logical_with_nas(self):
         d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py
index 400ead788aa7c..909a1a6a4c917 100644
--- a/pandas/tests/frame/test_quantile.py
+++ b/pandas/tests/frame/test_quantile.py
@@ -3,7 +3,7 @@
 from __future__ import print_function
-import nose
+import pytest
 import numpy as np
 from pandas import (DataFrame, Series, Timestamp, _np_version_under1p11)
@@ -106,7 +106,7 @@ def test_quantile_axis_parameter(self):
     def test_quantile_interpolation(self):
         # GH #10174
         if _np_version_under1p9:
-            raise nose.SkipTest("Numpy version under 1.9")
+            pytest.skip("Numpy version under 1.9")
         from numpy import percentile
@@ -171,7 +171,7 @@ def test_quantile_interpolation(self):
     def test_quantile_interpolation_np_lt_1p9(self):
         # GH #10174
         if not _np_version_under1p9:
-            raise nose.SkipTest("Numpy version is greater than 1.9")
+            pytest.skip("Numpy version is greater than 1.9")
         from numpy import percentile
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index aed02b7323f85..647af92b42273 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -3,7 +3,7 @@
 from __future__ import print_function
 import operator
-import nose
+import pytest
 from itertools import product
 from pandas.compat import (zip, range, lrange, StringIO)
@@ -30,14 +30,14 @@
 def skip_if_no_pandas_parser(parser):
     if parser != 'pandas':
-        raise nose.SkipTest("cannot evaluate with parser {0!r}".format(parser))
+        pytest.skip("cannot evaluate with parser {0!r}".format(parser))
 def skip_if_no_ne(engine='numexpr'):
     if engine == 'numexpr':
         if not _NUMEXPR_INSTALLED:
-            raise nose.SkipTest("cannot query engine numexpr when numexpr not "
-                                "installed")
+            pytest.skip("cannot query engine numexpr when numexpr not "
+                        "installed")
 class TestCompat(tm.TestCase):
diff --git a/pandas/tests/groupby/test_misc.py b/pandas/tests/groupby/test_misc.py
index c9d8ad4231cfb..9395304385681 100644
--- a/pandas/tests/groupby/test_misc.py
+++ b/pandas/tests/groupby/test_misc.py
@@ -1,6 +1,6 @@
 """ misc non-groupby routines, as they are defined in core/groupby.py """
-import nose
+import pytest
 import numpy as np
 from numpy import nan
 from pandas.util import testing as tm
@@ -42,7 +42,7 @@ def test_nargsort(self):
             np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i'))
             np.argsort(items2, kind='mergesort')
         except TypeError:
-            raise nose.SkipTest('requested sort not available for type')
+            pytest.skip('requested sort not available for type')
         # mergesort is the most difficult to get right because we want it to be
        # stable.
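The hunks in this series reduce to a handful of mechanical pytest idioms. The
following minimal sketch illustrates them side by side; the module name
'tables' mirrors the test_pytables hunk above, while the class and test names
are hypothetical and appear nowhere in the patches themselves:

import sys

import pytest

# (1) Module-level guard: pytest.importorskip() imports the module and skips
#     collection of the whole file when the import fails. This replaces the
#     old "try: import x / except ImportError: raise nose.SkipTest(...)"
#     blocks, which pytest would otherwise report as collection errors.
tables = pytest.importorskip('tables')


class TestMigrationSketch(object):  # hypothetical class, for illustration

    def test_imperative_skip(self):
        # (2) In-test skip: pytest.skip() raises pytest's own Skipped
        #     exception internally, so "raise nose.SkipTest(msg)" becomes a
        #     plain call with no raise statement.
        if sys.platform == 'win32':
            pytest.skip('example: not exercised on windows')
        assert 1 + 1 == 2

    def test_expected_exception(self):
        # (3) Expected errors: the pytest.raises context manager replaces
        #     nose.tools.assert_raises, as in the test_offsets and
        #     test_window hunks below.
        with pytest.raises(ValueError):
            int('not a number')

The decorator form @pytest.mark.skipif(condition, reason=...), used in the
test_clipboard hunk, is the declarative variant of (2): the test is marked up
front and skipped before its body runs.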
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index af749963146c6..1b67ffce63b10 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -1,7 +1,7 @@
 """ test to_datetime """
 import sys
-import nose
+import pytest
 import locale
 import calendar
 import numpy as np
@@ -139,7 +139,7 @@ def test_to_datetime_with_non_exact(self):
         # 8904
         # exact kw
         if sys.version_info < (2, 7):
-            raise nose.SkipTest('on python version < 2.7')
+            pytest.skip('on python version < 2.7')
         s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00',
                     '19MAY11 00:00:00Z'])
@@ -277,7 +277,7 @@ def test_to_datetime_tz_psycopg2(self):
         try:
             import psycopg2
         except ImportError:
-            raise nose.SkipTest("no psycopg2 installed")
+            pytest.skip("no psycopg2 installed")
         # misc cases
         tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 365236f72e80e..702c4758da245 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -6,7 +6,7 @@
 from datetime import timedelta
 from itertools import product
-import nose
+import pytest
 import numpy as np
@@ -988,8 +988,8 @@ def test_iter(self):
     def test_legacy_pickle(self):
         if PY3:
-            raise nose.SkipTest("testing for legacy pickles not "
-                                "support on py3")
+            pytest.skip("testing for legacy pickles not "
+                        "supported on py3")
         path = tm.get_data_path('multiindex_v1.pickle')
         obj = pd.read_pickle(path)
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
index b9a746cd25c7a..38f8bb5355a69 100644
--- a/pandas/tests/indexing/test_coercion.py
+++ b/pandas/tests/indexing/test_coercion.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-import nose
+import pytest
 import numpy as np
 import pandas as pd
@@ -1172,8 +1172,8 @@ def _assert_replace_conversion(self, from_key, to_key, how):
         if (from_key == 'bool' and
                 to_key == 'int64' and
                 tm.is_platform_windows()):
-            raise nose.SkipTest("windows platform buggy: {0} -> {1}".format
-                                (from_key, to_key))
+            pytest.skip("windows platform buggy: {0} -> {1}".format
+                        (from_key, to_key))
         if ((from_key == 'float64' and
             to_key in ('bool', 'int64')) or
@@ -1189,8 +1189,8 @@ def _assert_replace_conversion(self, from_key, to_key, how):
             # buggy on 32-bit
             if tm.is_platform_32bit():
-                raise nose.SkipTest("32-bit platform buggy: {0} -> {1}".format
-                                    (from_key, to_key))
+                pytest.skip("32-bit platform buggy: {0} -> {1}".format
+                            (from_key, to_key))
             # Expected: do not downcast by replacement
             exp = pd.Series(self.rep[to_key], index=index,
@@ -1243,7 +1243,7 @@ def test_replace_series_bool(self):
         if compat.PY3:
             # doesn't work in PY3, though ...dict_from_bool works fine
-            raise nose.SkipTest("doesn't work as in PY3")
+            pytest.skip("doesn't work as in PY3")
         self._assert_replace_conversion(from_key, to_key, how='series')
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 9fe1d7cacd38f..92e2dc7b5d934 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # coding: utf-8
-import nose
+import pytest
 import os
 import warnings
@@ -28,7 +28,7 @@ def _skip_if_no_scipy_gaussian_kde():
     try:
         from scipy.stats import gaussian_kde  # noqa
     except ImportError:
-        raise nose.SkipTest("scipy version doesn't support gaussian_kde")
+        pytest.skip("scipy version doesn't support gaussian_kde")
 def _ok_for_gaussian_kde(kind):
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index f7fd6a8519533..31c150bc1e64f 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -1,6 +1,6 @@
 # coding: utf-8
-import nose
+import pytest
 import itertools
 import string
 from distutils.version import LooseVersion
@@ -28,7 +28,7 @@ def _skip_if_mpl_14_or_dev_boxplot():
     # Don't need try / except since that's done at class level
     import matplotlib
     if str(matplotlib.__version__) >= LooseVersion('1.4'):
-        raise nose.SkipTest("Matplotlib Regression in 1.4 and current dev.")
+        pytest.skip("Matplotlib Regression in 1.4 and current dev.")
 @tm.mplskip
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index bcc9c7ceea8b5..25568f7eb61dc 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -2,7 +2,7 @@
 from datetime import datetime, timedelta, date, time
-import nose
+import pytest
 from pandas.compat import lrange, zip
 import numpy as np
@@ -161,8 +161,8 @@ def check_format_of_first_point(ax, expected_string):
                 self.assertEqual(expected_string,
                                  ax.format_coord(first_x, first_y))
             except (ValueError):
-                raise nose.SkipTest("skipping test because issue forming "
-                                    "test comparison GH7664")
+                pytest.skip("skipping test because issue forming "
+                            "test comparison GH7664")
         annual = Series(1, index=date_range('2014-01-01', periods=3,
                                             freq='A-DEC'))
diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index 81a54bd38b3f8..48af366f24ea4 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -2,7 +2,7 @@
 """ Test cases for DataFrame.plot """
-import nose
+import pytest
 import string
 import warnings
@@ -1275,7 +1275,7 @@ def test_kde_missing_vals(self):
     def test_hist_df(self):
         from matplotlib.patches import Rectangle
         if self.mpl_le_1_2_1:
-            raise nose.SkipTest("not supported in matplotlib <= 1.2.x")
+            pytest.skip("not supported in matplotlib <= 1.2.x")
         df = DataFrame(randn(100, 4))
         series = df[0]
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 52b85c89a7009..222165e9d3633 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -4,7 +4,7 @@
 from itertools import product
 from distutils.version import LooseVersion
-import nose
+import pytest
 from numpy import nan
 import numpy as np
@@ -476,8 +476,8 @@ def test_cummax_timedelta64(self):
         self.assert_series_equal(expected, result)
     def test_npdiff(self):
-        raise nose.SkipTest("skipping due to Series no longer being an "
-                            "ndarray")
+        pytest.skip("skipping due to Series no longer being an "
+                    "ndarray")
         # no longer works as the return type of np.diff is now nd.array
         s = Series(np.arange(5))
@@ -622,7 +622,7 @@ def test_numpy_round(self):
     def test_built_in_round(self):
         if not compat.PY3:
-            raise nose.SkipTest(
+            pytest.skip(
                 'build in round cannot be overriden prior to Python 3')
         s = Series([1.123, 2.123, 3.123], index=lrange(3))
@@ -785,8 +785,8 @@ def test_corr_rank(self):
         # these methods got rewritten in 0.8
         if scipy.__version__ < LooseVersion('0.9'):
-            raise nose.SkipTest("skipping corr rank because of scipy version "
-                                "{0}".format(scipy.__version__))
+            pytest.skip("skipping corr rank because of scipy version "
+                        "{0}".format(scipy.__version__))
         # results from R
         A = Series(
@@ -1063,8 +1063,8 @@ def test_rank_signature(self):
         self.assertRaises(ValueError, s.rank, 'average')
     def test_rank_inf(self):
-        raise nose.SkipTest('DataFrame.rank does not currently rank '
-                            'np.inf and -np.inf properly')
+        pytest.skip('DataFrame.rank does not currently rank '
+                    'np.inf and -np.inf properly')
         values = np.array(
             [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20,
              1e-10,
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index 702fa2acb5106..405d6c98a5d37 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -22,16 +22,16 @@ def _skip_if_no_pchip():
     try:
         from scipy.interpolate import pchip_interpolate  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest('scipy.interpolate.pchip missing')
+        import pytest
+        pytest.skip('scipy.interpolate.pchip missing')
 def _skip_if_no_akima():
     try:
         from scipy.interpolate import Akima1DInterpolator  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest('scipy.interpolate.Akima1DInterpolator missing')
+        import pytest
+        pytest.skip('scipy.interpolate.Akima1DInterpolator missing')
 def _simple_ts(start, end, freq='D'):
diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py
index 76db6c90a685f..b8d1b92081858 100644
--- a/pandas/tests/series/test_quantile.py
+++ b/pandas/tests/series/test_quantile.py
@@ -1,7 +1,7 @@
 # coding=utf-8
 # pylint: disable-msg=E1101,W0612
-import nose
+import pytest
 import numpy as np
 import pandas as pd
@@ -73,7 +73,7 @@ def test_quantile_multi(self):
     def test_quantile_interpolation(self):
         # GH #10174
         if _np_version_under1p9:
-            raise nose.SkipTest("Numpy version is under 1.9")
+            pytest.skip("Numpy version is under 1.9")
         from numpy import percentile
@@ -89,7 +89,7 @@ def test_quantile_interpolation(self):
     def test_quantile_interpolation_dtype(self):
         # GH #10174
         if _np_version_under1p9:
-            raise nose.SkipTest("Numpy version is under 1.9")
+            pytest.skip("Numpy version is under 1.9")
         from numpy import percentile
@@ -105,7 +105,7 @@ def test_quantile_interpolation_dtype(self):
     def test_quantile_interpolation_np_lt_1p9(self):
         # GH #10174
         if not _np_version_under1p9:
-            raise nose.SkipTest("Numpy version is greater than 1.9")
+            pytest.skip("Numpy version is greater than 1.9")
         from numpy import percentile
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 1d1ef1a08859c..473f1d81c9532 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -4,7 +4,7 @@
 import re
 import sys
 from datetime import datetime, timedelta
-import nose
+import pytest
 import numpy as np
 import pandas as pd
@@ -32,7 +32,7 @@ def test_string_methods_dont_fail(self):
     def test_tricky_container(self):
         if not hasattr(self, 'unicode_container'):
-            raise nose.SkipTest('Need unicode_container to test with this')
+            pytest.skip('Need unicode_container to test with this')
         repr(self.unicode_container)
         str(self.unicode_container)
         bytes(self.unicode_container)
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index 136786ecff0a0..0318757f76a11 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -4,7 +4,7 @@
 import re
 import operator
-import nose
+import pytest
 from numpy.random import randn
@@ -21,13 +21,7 @@
 if not expr._USE_NUMEXPR:
-    try:
-        import numexpr  # noqa
-    except ImportError:
-        msg = "don't have"
-    else:
-        msg = "not using"
-    raise nose.SkipTest("{0} numexpr".format(msg))
+    numexpr = pytest.importorskip('numexpr')
 _frame = DataFrame(randn(10000, 4), columns=list('ABCD'),
                   dtype='float64')
 _frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64')
@@ -70,9 +64,8 @@ def setUp(self):
     def tearDown(self):
         expr._MIN_ELEMENTS = self._MIN_ELEMENTS
-    @nose.tools.nottest
-    def run_arithmetic_test(self, df, other, assert_func, check_dtype=False,
-                            test_flex=True):
+    def run_arithmetic(self, df, other, assert_func, check_dtype=False,
+                       test_flex=True):
         expr._MIN_ELEMENTS = 0
         operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv', 'pow']
         if not compat.PY3:
@@ -109,15 +102,14 @@ def run_arithmetic_test(self, df, other, assert_func, check_dtype=False,
                 raise
     def test_integer_arithmetic(self):
-        self.run_arithmetic_test(self.integer, self.integer,
-                                 assert_frame_equal)
-        self.run_arithmetic_test(self.integer.iloc[:, 0],
-                                 self.integer.iloc[:, 0], assert_series_equal,
-                                 check_dtype=True)
-
-    @nose.tools.nottest
-    def run_binary_test(self, df, other, assert_func, test_flex=False,
-                        numexpr_ops=set(['gt', 'lt', 'ge', 'le', 'eq', 'ne'])):
+        self.run_arithmetic(self.integer, self.integer,
+                            assert_frame_equal)
+        self.run_arithmetic(self.integer.iloc[:, 0],
+                            self.integer.iloc[:, 0], assert_series_equal,
+                            check_dtype=True)
+
+    def run_binary(self, df, other, assert_func, test_flex=False,
+                   numexpr_ops=set(['gt', 'lt', 'ge', 'le', 'eq', 'ne'])):
         """
         tests solely that the result is the same whether or not numexpr is
         enabled.  Need to test whether the function does the correct thing
@@ -151,46 +143,46 @@ def run_binary_test(self, df, other, assert_func, test_flex=False,
     def run_frame(self, df, other, binary_comp=None, run_binary=True,
                   **kwargs):
-        self.run_arithmetic_test(df, other, assert_frame_equal,
-                                 test_flex=False, **kwargs)
-        self.run_arithmetic_test(df, other, assert_frame_equal, test_flex=True,
-                                 **kwargs)
+        self.run_arithmetic(df, other, assert_frame_equal,
+                            test_flex=False, **kwargs)
+        self.run_arithmetic(df, other, assert_frame_equal, test_flex=True,
+                            **kwargs)
         if run_binary:
             if binary_comp is None:
                 expr.set_use_numexpr(False)
                 binary_comp = other + 1
                 expr.set_use_numexpr(True)
-            self.run_binary_test(df, binary_comp, assert_frame_equal,
-                                 test_flex=False, **kwargs)
-            self.run_binary_test(df, binary_comp, assert_frame_equal,
-                                 test_flex=True, **kwargs)
+            self.run_binary(df, binary_comp, assert_frame_equal,
+                            test_flex=False, **kwargs)
+            self.run_binary(df, binary_comp, assert_frame_equal,
+                            test_flex=True, **kwargs)
     def run_series(self, ser, other, binary_comp=None, **kwargs):
-        self.run_arithmetic_test(ser, other, assert_series_equal,
-                                 test_flex=False, **kwargs)
-        self.run_arithmetic_test(ser, other, assert_almost_equal,
-                                 test_flex=True, **kwargs)
+        self.run_arithmetic(ser, other, assert_series_equal,
+                            test_flex=False, **kwargs)
+        self.run_arithmetic(ser, other, assert_almost_equal,
+                            test_flex=True, **kwargs)
         # series doesn't uses vec_compare instead of numexpr...
         # if binary_comp is None:
         #     binary_comp = other + 1
-        # self.run_binary_test(ser, binary_comp, assert_frame_equal,
+        # self.run_binary(ser, binary_comp, assert_frame_equal,
         #                      test_flex=False, **kwargs)
-        # self.run_binary_test(ser, binary_comp, assert_frame_equal,
+        # self.run_binary(ser, binary_comp, assert_frame_equal,
         #                      test_flex=True, **kwargs)
     def run_panel(self, panel, other, binary_comp=None, run_binary=True,
                   assert_func=assert_panel_equal, **kwargs):
-        self.run_arithmetic_test(panel, other, assert_func, test_flex=False,
-                                 **kwargs)
-        self.run_arithmetic_test(panel, other, assert_func, test_flex=True,
-                                 **kwargs)
+        self.run_arithmetic(panel, other, assert_func, test_flex=False,
+                            **kwargs)
+        self.run_arithmetic(panel, other, assert_func, test_flex=True,
+                            **kwargs)
         if run_binary:
             if binary_comp is None:
                 binary_comp = other + 1
-            self.run_binary_test(panel, binary_comp, assert_func,
-                                 test_flex=False, **kwargs)
-            self.run_binary_test(panel, binary_comp, assert_func,
-                                 test_flex=True, **kwargs)
+            self.run_binary(panel, binary_comp, assert_func,
+                            test_flex=False, **kwargs)
+            self.run_binary(panel, binary_comp, assert_func,
+                            test_flex=True, **kwargs)
     def test_integer_arithmetic_frame(self):
         self.run_frame(self.integer, self.integer)
@@ -234,22 +226,22 @@ def test_mixed_panel(self):
                        binary_comp=-2)
     def test_float_arithemtic(self):
-        self.run_arithmetic_test(self.frame, self.frame, assert_frame_equal)
-        self.run_arithmetic_test(self.frame.iloc[:, 0], self.frame.iloc[:, 0],
-                                 assert_series_equal, check_dtype=True)
+        self.run_arithmetic(self.frame, self.frame, assert_frame_equal)
+        self.run_arithmetic(self.frame.iloc[:, 0], self.frame.iloc[:, 0],
+                            assert_series_equal, check_dtype=True)
     def test_mixed_arithmetic(self):
-        self.run_arithmetic_test(self.mixed, self.mixed, assert_frame_equal)
+        self.run_arithmetic(self.mixed, self.mixed, assert_frame_equal)
         for col in self.mixed.columns:
-            self.run_arithmetic_test(self.mixed[col], self.mixed[col],
-                                     assert_series_equal)
+            self.run_arithmetic(self.mixed[col], self.mixed[col],
+                                assert_series_equal)
     def test_integer_with_zeros(self):
         self.integer *= np.random.randint(0, 2, size=np.shape(self.integer))
-        self.run_arithmetic_test(self.integer, self.integer,
-                                 assert_frame_equal)
-        self.run_arithmetic_test(self.integer.iloc[:, 0],
-                                 self.integer.iloc[:, 0], assert_series_equal)
+        self.run_arithmetic(self.integer, self.integer,
+                            assert_frame_equal)
+        self.run_arithmetic(self.integer.iloc[:, 0],
+                            self.integer.iloc[:, 0], assert_series_equal)
     def test_invalid(self):
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index e84e2d6809e7b..28f1dc61533c1 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -2,7 +2,7 @@
 # pylint: disable-msg=E1101,W0612
 from operator import methodcaller
-import nose
+import pytest
 import numpy as np
 from numpy import nan
 import pandas as pd
@@ -367,7 +367,7 @@ def test_head_tail(self):
         try:
             o.head()
         except (NotImplementedError):
-            raise nose.SkipTest('not implemented on {0}'.format(
+            pytest.skip('not implemented on {0}'.format(
                 o.__class__.__name__))
         self._compare(o.head(), o.iloc[:5])
@@ -1567,7 +1567,7 @@ class TestPanel4D(tm.TestCase, Generic):
     _comparator = lambda self, x, y: assert_panel4d_equal(x, y, by_blocks=True)
     def test_sample(self):
-        raise nose.SkipTest("sample on Panel4D")
+        pytest.skip("sample on Panel4D")
     def test_to_xarray(self):
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index 1dfea168c067c..f086935df6dc8 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -3,7 +3,7 @@
 from datetime import datetime, date
-import nose
+import pytest
 import numpy as np
 import re
@@ -276,7 +276,7 @@ def test_split_block_at(self):
         # with dup column support this method was taken out
         # GH3679
-        raise nose.SkipTest("skipping for now")
+        pytest.skip("skipping for now")
         bs = list(self.fblock.split_block_at('a'))
         self.assertEqual(len(bs), 1)
diff --git a/pandas/tests/test_msgpack/test_unpack.py b/pandas/tests/test_msgpack/test_unpack.py
index a182c676adb3b..ae8227ab276fb 100644
--- a/pandas/tests/test_msgpack/test_unpack.py
+++ b/pandas/tests/test_msgpack/test_unpack.py
@@ -2,7 +2,7 @@
 import sys
 from pandas.msgpack import Unpacker, packb, OutOfData, ExtType
 import pandas.util.testing as tm
-import nose
+import pytest
 class TestUnpack(tm.TestCase):
@@ -19,7 +19,7 @@ def test_unpack_array_header_from_file(self):
     def test_unpacker_hook_refcnt(self):
         if not hasattr(sys, 'getrefcount'):
-            raise nose.SkipTest('no sys.getrefcount()')
+            pytest.skip('no sys.getrefcount()')
         result = []
         def hook(x):
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 1fe2d701f5a41..8e0628eefa392 100755
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -3,7 +3,7 @@
 from warnings import catch_warnings
 import datetime
 import itertools
-import nose
+import pytest
 from numpy.random import randn
 import numpy as np
@@ -1733,7 +1733,7 @@ def test_getitem_lowerdim_corner(self):
     # AMBIGUOUS CASES!
     def test_partial_ix_missing(self):
-        raise nose.SkipTest("skipping for now")
+        pytest.skip("skipping for now")
         result = self.ymd.loc[2000, 0]
         expected = self.ymd.loc[2000]['A']
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 4f56419b1323a..2f329f241a5b8 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 import operator
-import nose
+import pytest
 import numpy as np
 import pandas as pd
@@ -97,7 +97,7 @@ def test_skew(self):
         try:
             from scipy.stats import skew
         except ImportError:
-            raise nose.SkipTest("no scipy.stats.skew")
+            pytest.skip("no scipy.stats.skew")
         def this_skew(x):
             if len(x) < 3:
@@ -2059,7 +2059,7 @@ def test_to_excel(self):
             import openpyxl  # noqa
             from pandas.io.excel import ExcelFile
         except ImportError:
-            raise nose.SkipTest("need xlwt xlrd openpyxl")
+            pytest.skip("need xlwt xlrd openpyxl")
         for ext in ['xls', 'xlsx']:
             with ensure_clean('__tmp__.' + ext) as path:
@@ -2067,7 +2067,7 @@ def test_to_excel(self):
             try:
                 reader = ExcelFile(path)
             except ImportError:
-                raise nose.SkipTest("need xlwt xlrd openpyxl")
+                pytest.skip("need xlwt xlrd openpyxl")
             for item, df in self.panel.iteritems():
                 recdf = reader.parse(str(item), index_col=0)
@@ -2079,14 +2079,14 @@ def test_to_excel_xlsxwriter(self):
             import xlsxwriter  # noqa
             from pandas.io.excel import ExcelFile
         except ImportError:
-            raise nose.SkipTest("Requires xlrd and xlsxwriter. Skipping test.")
+            pytest.skip("Requires xlrd and xlsxwriter. Skipping test.")
         with ensure_clean('__tmp__.xlsx') as path:
             self.panel.to_excel(path, engine='xlsxwriter')
             try:
                 reader = ExcelFile(path)
             except ImportError as e:
-                raise nose.SkipTest("cannot write excel file: %s" % e)
+                pytest.skip("cannot write excel file: %s" % e)
             for item, df in self.panel.iteritems():
                 recdf = reader.parse(str(item), index_col=0)
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index 96864c626ba7f..902b42e7d77d7 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -2,7 +2,7 @@
 from datetime import datetime
 from pandas.compat import range, lrange
 import operator
-import nose
+import pytest
 import numpy as np
@@ -66,7 +66,7 @@ def test_skew(self):
         try:
             from scipy.stats import skew
         except ImportError:
-            raise nose.SkipTest("no scipy.stats.skew")
+            pytest.skip("no scipy.stats.skew")
         def this_skew(x):
             if len(x) < 3:
diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index 466e9ee5a30b8..07bfdc8fc9078 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import pandas as pd
 import unittest
-import nose
+import pytest
 import numpy as np
 import sys
 from pandas import Series, DataFrame
@@ -10,8 +10,7 @@
                                  raise_with_traceback, assert_index_equal,
                                  assert_series_equal, assert_frame_equal,
                                  assert_numpy_array_equal,
-                                 RNGContext, assertRaises,
-                                 skip_if_no_package_deco)
+                                 RNGContext)
 from pandas.compat import is_platform_windows
 # let's get meta.
@@ -167,8 +166,8 @@ class TestAssertNumpyArrayEqual(tm.TestCase):
     def test_numpy_array_equal_message(self):
         if is_platform_windows():
-            raise nose.SkipTest("windows has incomparable line-endings "
-                                "and uses L on the shape")
+            pytest.skip("windows has incomparable line-endings "
+                        "and uses L on the shape")
         expected = """numpy array are different
@@ -295,8 +294,8 @@ def test_numpy_array_equal_message(self):
     def test_numpy_array_equal_object_message(self):
         if is_platform_windows():
-            raise nose.SkipTest("windows has incomparable line-endings "
-                                "and uses L on the shape")
+            pytest.skip("windows has incomparable line-endings "
+                        "and uses L on the shape")
         a = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-01')])
         b = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')])
@@ -772,27 +771,9 @@ class TestLocale(tm.TestCase):
     def test_locale(self):
         if sys.platform == 'win32':
-            raise nose.SkipTest(
+            pytest.skip(
                 "skipping on win platforms as locale not available")
         # GH9744
         locales = tm.get_locales()
         self.assertTrue(len(locales) >= 1)
-
-
-def test_skiptest_deco():
-    from nose import SkipTest
-
-    @skip_if_no_package_deco("fakepackagename")
-    def f():
-        pass
-    with assertRaises(SkipTest):
-        f()
-
-    @skip_if_no_package_deco("numpy")
-    def f():
-        pass
-    # hack to ensure that SkipTest is *not* raised
-    with assertRaises(ValueError):
-        f()
-        raise ValueError
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 48861fc6a9528..3add568c1ea99 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -1,9 +1,8 @@
 from itertools import product
-import nose
+import pytest
 import sys
 import warnings
-from nose.tools import assert_raises
 from datetime import datetime
 from numpy.random import randn
 import numpy as np
@@ -726,7 +725,8 @@ def check_dtypes(self, f, f_name, d, d_name, exp):
         else:
             # other methods not Implemented ATM
-            assert_raises(NotImplementedError, f, roll)
+            with pytest.raises(NotImplementedError):
+                f(roll)
 class TestDtype_timedelta(DatetimeLike):
@@ -741,8 +741,8 @@ class TestDtype_datetime64UTC(DatetimeLike):
     dtype = 'datetime64[ns, UTC]'
     def _create_data(self):
-        raise nose.SkipTest("direct creation of extension dtype "
-                            "datetime64[ns, UTC] is not supported ATM")
+        pytest.skip("direct creation of extension dtype "
+                    "datetime64[ns, UTC] is not supported ATM")
 class TestMoments(Base):
@@ -1160,7 +1160,7 @@ def test_rolling_skew(self):
         try:
             from scipy.stats import skew
         except ImportError:
-            raise nose.SkipTest('no scipy')
+            pytest.skip('no scipy')
         self._check_moment_func(mom.rolling_skew,
                                 lambda x: skew(x, bias=False), name='skew')
@@ -1168,14 +1168,14 @@ def test_rolling_kurt(self):
         try:
             from scipy.stats import kurtosis
         except ImportError:
-            raise nose.SkipTest('no scipy')
+            pytest.skip('no scipy')
         self._check_moment_func(mom.rolling_kurt,
                                 lambda x: kurtosis(x, bias=False), name='kurt')
     def test_fperr_robustness(self):
         # TODO: remove this once python 2.5 out of picture
         if PY3:
-            raise nose.SkipTest("doesn't work on python 3")
+            pytest.skip("doesn't work on python 3")
         # #2114
         data = '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a@\xaa\xaa\xaa\xaa\xaa\xaa\x02@8\x8e\xe38\x8e\xe3\xe8?z\t\xed%\xb4\x97\xd0?\xa2\x0c<\xdd\x9a\x1f\xb6?\x82\xbb\xfa&y\x7f\x9d?\xac\'\xa7\xc4P\xaa\x83?\x90\xdf\xde\xb0k8j?`\xea\xe9u\xf2zQ?*\xe37\x9d\x98N7?\xe2.\xf5&v\x13\x1f?\xec\xc9\xf8\x19\xa4\xb7\x04?\x90b\xf6w\x85\x9f\xeb>\xb5A\xa4\xfaXj\xd2>F\x02\xdb\xf8\xcb\x8d\xb8>.\xac<\xfb\x87^\xa0>\xe8:\xa6\xf9_\xd3\x85>\xfb?\xe2cUU\xfd?\xfc\x7fA\xed8\x8e\xe3?\xa5\xaa\xac\x91\xf6\x12\xca?n\x1cs\xb6\xf9a\xb1?\xe8%D\xf3L-\x97?5\xddZD\x11\xe7~?#>\xe7\x82\x0b\x9ad?\xd9R4Y\x0fxK?;7x;\nP2?N\xf4JO\xb8j\x18?4\xf81\x8a%G\x00?\x9a\xf5\x97\r2\xb4\xe5>\xcd\x9c\xca\xbcB\xf0\xcc>3\x13\x87(\xd7J\xb3>\x99\x19\xb4\xe0\x1e\xb9\x99>ff\xcd\x95\x14&\x81>\x88\x88\xbc\xc7p\xddf>`\x0b\xa6_\x96|N>@\xb2n\xea\x0eS4>U\x98\x938i\x19\x1b>\x8eeb\xd0\xf0\x10\x02>\xbd\xdc-k\x96\x16\xe8=(\x93\x1e\xf2\x0e\x0f\xd0=\xe0n\xd3Bii\xb5=*\xe9\x19Y\x8c\x8c\x9c=\xc6\xf0\xbb\x90]\x08\x83=]\x96\xfa\xc0|`i=>d\xfc\xd5\xfd\xeaP=R0\xfb\xc7\xa7\x8e6=\xc2\x95\xf9_\x8a\x13\x1e=\xd6c\xa6\xea\x06\r\x04=r\xda\xdd8\t\xbc\xea<\xf6\xe6\x93\xd0\xb0\xd2\xd1<\x9d\xdeok\x96\xc3\xb7<&~\xea9s\xaf\x9f\xb8\x02@\xc6\xd2&\xfd\xa8\xf5\xe8?\xd9\xe1\x19\xfe\xc5\xa3\xd0?v\x82"\xa8\xb2/\xb6?\x9dX\x835\xee\x94\x9d?h\x90W\xce\x9e\xb8\x83?\x8a\xc0th~Kj?\\\x80\xf8\x9a\xa9\x87Q?%\xab\xa0\xce\x8c_7?1\xe4\x80\x13\x11*\x1f? \x98\x00\r\xb6\xc6\x04?\x80u\xabf\x9d\xb3\xeb>UNrD\xbew\xd2>\x1c\x13C[\xa8\x9f\xb8>\x12b\xd7m-\x1fQ@\xe3\x85>\xe6\x91)l\x00/m>Da\xc6\xf2\xaatS>\x05\xd7]\xee\xe3\xf09>'  # noqa
diff --git a/pandas/tests/tseries/test_converter.py b/pandas/tests/tseries/test_converter.py
index b934aaed7d41f..5351e26f0e62b 100644
--- a/pandas/tests/tseries/test_converter.py
+++ b/pandas/tests/tseries/test_converter.py
@@ -1,3 +1,4 @@
+import pytest
 from datetime import datetime, date
 import numpy as np
@@ -7,11 +8,7 @@
 from pandas.tseries.offsets import Second, Milli, Micro, Day
 from pandas.compat.numpy import np_datetime64_compat
-try:
-    import pandas.tseries.converter as converter
-except ImportError:
-    import nose
-    raise nose.SkipTest("no pandas.tseries.converter, skipping")
+converter = pytest.importorskip('pandas.tseries.converter')
 def test_timtetonum_accepts_unicode():
diff --git a/pandas/tests/tseries/test_offsets.py b/pandas/tests/tseries/test_offsets.py
index 7c5a4c3df28b2..dfa1e94e4dc11 100644
--- a/pandas/tests/tseries/test_offsets.py
+++ b/pandas/tests/tseries/test_offsets.py
@@ -3,8 +3,7 @@
 from datetime import date, datetime, timedelta
 from dateutil.relativedelta import relativedelta
-import nose
-from nose.tools import assert_raises
+import pytest
 from pandas.compat import range, iteritems
 from pandas import compat
@@ -59,7 +58,8 @@ def test_ole2datetime():
     actual = ole2datetime(60000)
     assert actual == datetime(2064, 4, 8)
-    assert_raises(ValueError, ole2datetime, 60)
+    with pytest.raises(ValueError):
+        ole2datetime(60)
 def test_to_datetime1():
@@ -159,7 +159,7 @@ def test_apply_out_of_range(self):
         except (tslib.OutOfBoundsDatetime):
             raise
         except (ValueError, KeyError) as e:
-            raise nose.SkipTest(
+            pytest.skip(
                 "cannot create out_of_range offset: {0} {1}".format(
                     str(self).split('.')[-1], e))
diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
index 0716a13fac3fe..2672db13a959f 100644
diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
index 0716a13fac3fe..2672db13a959f 100644
--- a/pandas/tools/tests/test_util.py
+++ b/pandas/tools/tests/test_util.py
@@ -1,7 +1,7 @@
 import os
 import locale
 import codecs
-import nose
+import pytest
 import decimal

 import numpy as np
@@ -68,7 +68,7 @@ def setUpClass(cls):
         cls.locales = tm.get_locales()

         if not cls.locales:
-            raise nose.SkipTest("No locales found")
+            pytest.skip("No locales found")

         tm._skip_if_windows()
@@ -83,20 +83,20 @@ def test_get_locales(self):

     def test_get_locales_prefix(self):
         if len(self.locales) == 1:
-            raise nose.SkipTest("Only a single locale found, no point in "
-                                "trying to test filtering locale prefixes")
+            pytest.skip("Only a single locale found, no point in "
+                        "trying to test filtering locale prefixes")
         first_locale = self.locales[0]
         assert len(tm.get_locales(prefix=first_locale[:2])) > 0

     def test_set_locale(self):
         if len(self.locales) == 1:
-            raise nose.SkipTest("Only a single locale found, no point in "
-                                "trying to test setting another locale")
+            pytest.skip("Only a single locale found, no point in "
+                        "trying to test setting another locale")

         if all(x is None for x in CURRENT_LOCALE):
             # Not sure why, but on some travis runs with pytest,
             # getlocale() returned (None, None).
-            raise nose.SkipTest("CURRENT_LOCALE is not set.")
+            pytest.skip("CURRENT_LOCALE is not set.")

         if LOCALE_OVERRIDE is None:
             lang, enc = 'it_CH', 'UTF-8'
@@ -456,7 +456,7 @@ def test_downcast_limits(self):
         # Test the limits of each downcast. Bug: #14401.
         # Check to make sure numpy is new enough to run this test.
         if _np_version_under1p9:
-            raise nose.SkipTest("Numpy version is under 1.9")
+            pytest.skip("Numpy version is under 1.9")

         i = 'integer'
         u = 'unsigned'
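The locale tests above exercise helpers from pandas.util.testing; roughly, they are used like this (a sketch under the assumption that get_locales and set_locale behave as in this era of the codebase, where set_locale is a context manager):

    import pandas.util.testing as tm

    locales = tm.get_locales(prefix='en')  # available locales, optionally filtered
    if locales:
        with tm.set_locale(locales[0]):    # temporarily switch the locale
            pass                           # locale-sensitive assertions go here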
diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py
index 85d77c2f6f57c..1b501eb1d9bda 100644
--- a/pandas/util/decorators.py
+++ b/pandas/util/decorators.py
@@ -206,65 +206,6 @@ def wrapped(*args, **kwargs):
     return wrapped


-class KnownFailureTest(Exception):
-    """Raise this exception to mark a test as a known failing test."""
-    pass
-
-
-def knownfailureif(fail_condition, msg=None):
-    """
-    Make function raise KnownFailureTest exception if given condition is true.
-
-    If the condition is a callable, it is used at runtime to dynamically
-    make the decision. This is useful for tests that may require costly
-    imports, to delay the cost until the test suite is actually executed.
-
-    Parameters
-    ----------
-    fail_condition : bool or callable
-        Flag to determine whether to mark the decorated test as a known
-        failure (if True) or not (if False).
-    msg : str, optional
-        Message to give on raising a KnownFailureTest exception.
-        Default is None.
-
-    Returns
-    -------
-    decorator : function
-        Decorator, which, when applied to a function, causes SkipTest
-        to be raised when `skip_condition` is True, and the function
-        to be called normally otherwise.
-
-    Notes
-    -----
-    The decorator itself is decorated with the ``nose.tools.make_decorator``
-    function in order to transmit function name, and various other metadata.
-
-    """
-    if msg is None:
-        msg = 'Test skipped due to known failure'
-
-    # Allow for both boolean or callable known failure conditions.
-    if callable(fail_condition):
-        fail_val = fail_condition
-    else:
-        fail_val = lambda: fail_condition
-
-    def knownfail_decorator(f):
-        # Local import to avoid a hard nose dependency and only incur the
-        # import time overhead at actual test-time.
-        import nose
-
-        def knownfailer(*args, **kwargs):
-            if fail_val():
-                raise KnownFailureTest(msg)
-            else:
-                return f(*args, **kwargs)
-        return nose.tools.make_decorator(f)(knownfailer)
-
-    return knownfail_decorator
-
-
 def make_signature(func):
     """
     Returns a string repr of the arg list of a func call, with any defaults
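Nothing replaces knownfailureif in this patch because pytest ships the equivalent behaviour as a built-in marker. A rough sketch of the same "known failure" semantics under pytest (illustrative only, not part of the patch):

    import sys

    import pytest


    @pytest.mark.xfail(sys.platform == 'win32',
                       reason='Test skipped due to known failure')
    def test_known_to_fail():
        # Reported as xfail (expected failure) instead of a hard failure
        # when the condition holds.
        assert sys.platform != 'win32'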
diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py
index c3962ad9c823c..7c5148caf7e74 100644
--- a/pandas/util/print_versions.py
+++ b/pandas/util/print_versions.py
@@ -63,7 +63,7 @@ def show_versions(as_json=False):
     deps = [
         # (MODULE_NAME, f(mod) -> mod version)
         ("pandas", lambda mod: mod.__version__),
-        ("nose", lambda mod: mod.__version__),
+        ("pytest", lambda mod: mod.__version__),
         ("pip", lambda mod: mod.__version__),
         ("setuptools", lambda mod: mod.__version__),
         ("Cython", lambda mod: mod.__version__),
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index c3633c945f60a..566ceec027b2b 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -248,9 +248,9 @@ def close(fignum=None):


 def _skip_if_32bit():
-    import nose
+    import pytest
     if is_platform_32bit():
-        raise nose.SkipTest("skipping for 32 bit")
+        pytest.skip("skipping for 32 bit")


 def mplskip(cls):
@@ -262,8 +262,8 @@ def setUpClass(cls):
             import matplotlib as mpl
             mpl.use("Agg", warn=False)
         except ImportError:
-            import nose
-            raise nose.SkipTest("matplotlib not installed")
+            import pytest
+            pytest.skip("matplotlib not installed")
     cls.setUpClass = setUpClass
     return cls
@@ -273,102 +273,102 @@ def _skip_if_no_mpl():
     try:
         import matplotlib  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest("matplotlib not installed")
+        import pytest
+        pytest.skip("matplotlib not installed")


 def _skip_if_mpl_1_5():
     import matplotlib
     v = matplotlib.__version__
     if v > LooseVersion('1.4.3') or v[0] == '0':
-        import nose
-        raise nose.SkipTest("matplotlib 1.5")
+        import pytest
+        pytest.skip("matplotlib 1.5")


 def _skip_if_no_scipy():
     try:
         import scipy.stats  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest("no scipy.stats module")
+        import pytest
+        pytest.skip("no scipy.stats module")
     try:
         import scipy.interpolate  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest('scipy.interpolate missing')
+        import pytest
+        pytest.skip('scipy.interpolate missing')


 def _skip_if_scipy_0_17():
     import scipy
     v = scipy.__version__
     if v >= LooseVersion("0.17.0"):
-        import nose
-        raise nose.SkipTest("scipy 0.17")
+        import pytest
+        pytest.skip("scipy 0.17")


 def _skip_if_no_lzma():
     try:
         return compat.import_lzma()
     except ImportError:
-        import nose
-        raise nose.SkipTest('need backports.lzma to run')
+        import pytest
+        pytest.skip('need backports.lzma to run')


 def _skip_if_no_xarray():
     try:
         import xarray
     except ImportError:
-        import nose
-        raise nose.SkipTest("xarray not installed")
+        import pytest
+        pytest.skip("xarray not installed")

     v = xarray.__version__
     if v < LooseVersion('0.7.0'):
-        import nose
-        raise nose.SkipTest("xarray not version is too low: {0}".format(v))
+        import pytest
+        pytest.skip("xarray version is too low: {0}".format(v))


 def _skip_if_no_pytz():
     try:
         import pytz  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest("pytz not installed")
+        import pytest
+        pytest.skip("pytz not installed")


 def _skip_if_no_dateutil():
     try:
         import dateutil  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest("dateutil not installed")
+        import pytest
+        pytest.skip("dateutil not installed")


 def _skip_if_windows_python_3():
     if PY3 and is_platform_windows():
-        import nose
-        raise nose.SkipTest("not used on python 3/win32")
+        import pytest
+        pytest.skip("not used on python 3/win32")


 def _skip_if_windows():
     if is_platform_windows():
-        import nose
-        raise nose.SkipTest("Running on Windows")
+        import pytest
+        pytest.skip("Running on Windows")


 def _skip_if_no_pathlib():
     try:
         from pathlib import Path  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest("pathlib not available")
+        import pytest
+        pytest.skip("pathlib not available")


 def _skip_if_no_localpath():
     try:
         from py.path import local as LocalPath  # noqa
     except ImportError:
-        import nose
-        raise nose.SkipTest("py.path not installed")
+        import pytest
+        pytest.skip("py.path not installed")


 def _incompat_bottleneck_version(method):
@@ -392,27 +392,27 @@ def skip_if_no_ne(engine='numexpr'):

     if engine == 'numexpr':
         if not _USE_NUMEXPR:
-            import nose
-            raise nose.SkipTest("numexpr enabled->{enabled}, "
-                                "installed->{installed}".format(
-                                    enabled=_USE_NUMEXPR,
-                                    installed=_NUMEXPR_INSTALLED))
+            import pytest
+            pytest.skip("numexpr enabled->{enabled}, "
+                        "installed->{installed}".format(
+                            enabled=_USE_NUMEXPR,
+                            installed=_NUMEXPR_INSTALLED))


 def _skip_if_has_locale():
     import locale
     lang, _ = locale.getlocale()
     if lang is not None:
-        import nose
-        raise nose.SkipTest("Specific locale is set {0}".format(lang))
+        import pytest
+        pytest.skip("Specific locale is set {0}".format(lang))


 def _skip_if_not_us_locale():
     import locale
     lang, _ = locale.getlocale()
     if lang != 'en_US':
-        import nose
-        raise nose.SkipTest("Specific locale is set {0}".format(lang))
+        import pytest
+        pytest.skip("Specific locale is set {0}".format(lang))

 # -----------------------------------------------------------------------------
 # locale utilities
@@ -662,8 +662,8 @@ def ensure_clean(filename=None, return_filelike=False):
         try:
             fd, filename = tempfile.mkstemp(suffix=filename)
         except UnicodeEncodeError:
-            import nose
-            raise nose.SkipTest('no unicode file names on this system')
+            import pytest
+            pytest.skip('no unicode file names on this system')

         try:
             yield filename
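Note that the calling convention of these private guards is unchanged by the migration: a test still invokes the helper at its top, and the skip now fires via pytest.skip() inside the helper. Roughly (illustrative; the leading underscore marks these as pandas-internal):

    import pandas.util.testing as tm


    def test_interpolation_needs_scipy():
        tm._skip_if_no_scipy()  # skips this test from inside the helper
        pass                    # scipy-dependent assertions go here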
@@ -1997,9 +1997,7 @@ def __init__(self, *args, **kwargs):

 # Dependency checks. Copied this from Nipy/Nipype (Copyright of
 # respective developers, license: BSD-3)
-def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion,
-                  exc_failed_import=ImportError,
-                  exc_failed_check=RuntimeError):
+def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion):
     """Check that the minimal version of the required package is installed.

     Parameters
@@ -2015,10 +2013,6 @@ def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion,
     checker : object, optional
         The class that will perform the version checking. Default is
         distutils.version.LooseVersion.
-    exc_failed_import : Exception, optional
-        Class of the exception to be thrown if import failed.
-    exc_failed_check : Exception, optional
-        Class of the exception to be thrown if version check failed.

     Examples
     --------
@@ -2027,6 +2021,7 @@

     """
+    import pytest
     if app:
         msg = '%s requires %s' % (app, pkg_name)
     else:
@@ -2036,46 +2031,24 @@
     try:
         mod = __import__(pkg_name)
     except ImportError:
-        raise exc_failed_import(msg)
-    if not version:
-        return
+        mod = None
     try:
         have_version = mod.__version__
     except AttributeError:
-        raise exc_failed_check('Cannot find version for %s' % pkg_name)
-    if checker(have_version) < checker(version):
-        raise exc_failed_check(msg)
+        pytest.skip('Cannot find version for %s' % pkg_name)
+    if version and checker(have_version) < checker(version):
+        pytest.skip(msg)


 def skip_if_no_package(*args, **kwargs):
-    """Raise SkipTest if package_check fails
+    """pytest.skip() if package_check fails

     Parameters
     ----------
     *args Positional parameters passed to `package_check`
     *kwargs Keyword parameters passed to `package_check`
     """
-    from nose import SkipTest
-    package_check(exc_failed_import=SkipTest,
-                  exc_failed_check=SkipTest,
-                  *args, **kwargs)
-
-
-def skip_if_no_package_deco(pkg_name, version=None, app='pandas'):
-    from nose import SkipTest
-
-    def deco(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            package_check(pkg_name, version=version, app=app,
-                          exc_failed_import=SkipTest,
-                          exc_failed_check=SkipTest)
-            return func(*args, **kwargs)
-        return wrapper
-    return deco
-#
-# Additional tags decorators for nose
-#
+    package_check(*args, **kwargs)


 def optional_args(decorator):
@@ -2255,18 +2228,17 @@ def network(t, url="http://www.google.com",
     >>> test_something()
     Traceback (most recent call last):
         ...
-    SkipTest

     Errors not related to networking will always be raised.
     """
-    from nose import SkipTest
+    from pytest import skip
     t.network = True

     @wraps(t)
     def wrapper(*args, **kwargs):
         if check_before_test and not raise_on_error:
             if not can_connect(url, error_classes):
-                raise SkipTest
+                skip()
         try:
             return t(*args, **kwargs)
         except Exception as e:
@@ -2275,8 +2247,8 @@ def wrapper(*args, **kwargs):
                 errno = getattr(e.reason, 'errno', None)

             if errno in skip_errnos:
-                raise SkipTest("Skipping test due to known errno"
-                               " and error %s" % e)
+                skip("Skipping test due to known errno"
+                     " and error %s" % e)

             try:
                 e_str = traceback.format_exc(e)
@@ -2284,8 +2256,8 @@ def wrapper(*args, **kwargs):
                 e_str = str(e)

             if any([m.lower() in e_str.lower() for m in _skip_on_messages]):
-                raise SkipTest("Skipping test because exception "
-                               "message is known and error %s" % e)
+                skip("Skipping test because exception "
+                     "message is known and error %s" % e)

             if not isinstance(e, error_classes):
                 raise
@@ -2293,8 +2265,8 @@ def wrapper(*args, **kwargs):
             if raise_on_error or can_connect(url, error_classes):
                 raise
             else:
-                raise SkipTest("Skipping test due to lack of connectivity"
-                               " and error %s" % e)
+                skip("Skipping test due to lack of connectivity"
+                     " and error %s" % e)

     return wrapper
@@ -2775,8 +2747,8 @@ def set_timezone(tz):
     'EDT'
     """
     if is_platform_windows():
-        import nose
-        raise nose.SkipTest("timezone setting not supported on windows")
+        import pytest
+        pytest.skip("timezone setting not supported on windows")

     import os
     import time
diff --git a/setup.cfg b/setup.cfg
index 143470f7ee350..45d98dd733f1f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ tag_prefix = v
 parentdir_prefix = pandas-

 [flake8]
-ignore = E731
+ignore = E731,E402

 [yapf]
 based_on_style = pep8
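The network decorator keeps its public behaviour across this change: connectivity failures become skips, everything else still fails the test. A usage sketch (illustrative; the URL is a placeholder):

    import pandas as pd
    import pandas.util.testing as tm


    @tm.network
    def test_read_remote_csv():
        # Connection errors are turned into pytest skips by the decorator;
        # assertion errors and other bugs are raised as usual.
        df = pd.read_csv('http://example.com/data.csv')
        assert not df.empty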
From 54b1f9cb9ad05c14f34aee48b62066ec51b054c5 Mon Sep 17 00:00:00 2001
From: "John W. O'Brien"
Date: Sat, 11 Feb 2017 21:21:56 -0500
Subject: [PATCH 131/338] BUG: Avoid grafting missing examples directory
 (#15373)

---
 MANIFEST.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 2d26fbfd6adaf..b7a7e6039ac9a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -7,7 +7,6 @@ include setup.py

 graft doc
 prune doc/build
-graft examples
 graft pandas

 global-exclude *.so
From 97cdbf0a06901dffdaec73d7ddc2508455fe4ed8 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 11 Feb 2017 21:53:44 -0500
Subject: [PATCH 132/338] CLN: remove pandas/io/auth.py, from ga.py (now
 removed) (#15374)

---
 pandas/io/auth.py | 126 ----------------------------------------------
 1 file changed, 126 deletions(-)
 delete mode 100644 pandas/io/auth.py

diff --git a/pandas/io/auth.py b/pandas/io/auth.py
deleted file mode 100644
index e42df6a7309b7..0000000000000
--- a/pandas/io/auth.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from __future__ import print_function
-# see LICENSES directory for copyright and license
-import os
-import sys
-import logging
-
-import httplib2
-
-import apiclient.discovery as gapi
-import gflags
-import oauth2client.file as auth_file
-import oauth2client.client as oauth
-import oauth2client.tools as tools
-OOB_CALLBACK_URN = oauth.OOB_CALLBACK_URN
-
-
-class AuthenticationConfigError(ValueError):
-    pass
-
-FLOWS = {}
-FLAGS = gflags.FLAGS
-DEFAULT_SECRETS = os.path.join(
-    os.path.dirname(__file__), 'client_secrets.json')
-DEFAULT_SCOPE = 'https://www.googleapis.com/auth/analytics.readonly'
-DEFAULT_TOKEN_FILE = os.path.join(os.path.dirname(__file__), 'analytics.dat')
-MISSING_CLIENT_MSG = """
-WARNING: Please configure OAuth 2.0
-
-You need to populate the client_secrets.json file found at:
-
-   %s
-
-with information from the APIs Console
-.
-
-"""
-DOC_URL = ('https://developers.google.com/api-client-library/python/guide/'
-           'aaa_client_secrets')
-
-gflags.DEFINE_enum('logging_level', 'ERROR',
-                   ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
-                   'Set the level of logging detail.')
-
-# Name of file that will store the access and refresh tokens to access
-# the API without having to login each time. Make sure this file is in
-# a secure place.
-
-
-def process_flags(flags=None):
-    """Uses the command-line flags to set the logging level.
-
-    Args:
-        argv: List of command line arguments passed to the python script.
-    """
-    if flags is None:
-        flags = []
-
-    # Let the gflags module process the command-line arguments.
-    try:
-        FLAGS(flags)
-    except gflags.FlagsError as e:
-        print('%s\nUsage: %s ARGS\n%s' % (e, str(flags), FLAGS))
-        sys.exit(1)
-
-    # Set the logging according to the command-line flag.
-    logging.getLogger().setLevel(getattr(logging, FLAGS.logging_level))
-
-
-def get_flow(secret, scope, redirect):
-    """
-    Retrieve an authentication flow object based on the given
-    configuration in the secret file name, the authentication scope,
-    and a redirect URN
-    """
-    key = (secret, scope, redirect)
-    flow = FLOWS.get(key, None)
-    if flow is None:
-        msg = MISSING_CLIENT_MSG % secret
-        if not os.path.exists(secret):
-            raise AuthenticationConfigError(msg)
-        flow = oauth.flow_from_clientsecrets(secret, scope,
-                                             redirect_uri=redirect,
-                                             message=msg)
-        FLOWS[key] = flow
-    return flow
-
-
-def make_token_store(fpath=None):
-    """create token storage from give file name"""
-    if fpath is None:
-        fpath = DEFAULT_TOKEN_FILE
-    return auth_file.Storage(fpath)
-
-
-def authenticate(flow, storage=None):
-    """
-    Try to retrieve a valid set of credentials from the token store if possible
-    Otherwise use the given authentication flow to obtain new credentials
-    and return an authenticated http object
-
-    Parameters
-    ----------
-    flow : authentication workflow
-    storage: token storage, default None
-    """
-    http = httplib2.Http()
-
-    # Prepare credentials, and authorize HTTP object with them.
-    credentials = storage.get()
-    if credentials is None or credentials.invalid:
-        credentials = tools.run(flow, storage)
-
-    http = credentials.authorize(http)
-    return http
-
-
-def init_service(http):
-    """
-    Use the given http object to build the analytics service object
-    """
-    return gapi.build('analytics', 'v3', http=http)
-
-
-def reset_default_token_store():
-    import os
-    os.remove(DEFAULT_TOKEN_FILE)
From c059b0a55504b52d6be195dcecc42a0a866b2b6e Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sun, 12 Feb 2017 09:50:55 -0500
Subject: [PATCH 133/338] TST: consolidate remaining tests under pandas.tests

move all remaining tests so that ALL tests are now under pandas/tests

Author: Jeff Reback

Closes #15371 from jreback/tests and squashes the following commits:

43039e4 [Jeff Reback] add in data
118127b [Jeff Reback] wip
bfa6a9c [Jeff Reback] fix data locations
79a79e6 [Jeff Reback] fix import
57437bf [Jeff Reback] fixes
b407586 [Jeff Reback] move io
e13bfe3 [Jeff Reback] move tools
0194e31 [Jeff Reback] move computation
0e6bcb4 [Jeff Reback] rename test_msgpack -> msgpack
c5e4ab8 [Jeff Reback] move sparse
42e60e2 [Jeff Reback] move api tests
---
 pandas/{api/tests => tests/api}/__init__.py | 0 pandas/{api/tests => tests/api}/test_api.py | 2 +- .../tests => tests/computation}/__init__.py | 0 .../computation}/test_compat.py | 0 .../tests => tests/computation}/test_eval.py | 0 pandas/tests/indexes/datetimes/test_ops.py | 3 +- pandas/{io/tests => tests/io}/__init__.py | 0 .../{io/tests => tests/io}/data/S4_EDUC1.dta | Bin .../{io/tests => tests/io}/data/banklist.csv | 0 .../{io/tests => tests/io}/data/banklist.html | 0 pandas/{io/tests => tests/io}/data/blank.xls | Bin pandas/{io/tests => tests/io}/data/blank.xlsm | Bin pandas/{io/tests => tests/io}/data/blank.xlsx | Bin .../io}/data/blank_with_header.xls | Bin .../io}/data/blank_with_header.xlsm | Bin .../io}/data/blank_with_header.xlsx | Bin .../io}/data/categorical_0_14_1.pickle | 0 .../io}/data/categorical_0_15_2.pickle | Bin .../io}/data/computer_sales_page.html | 0 .../tests => tests/io}/data/gbq_fake_job.txt | 0 .../data/html_encoding/chinese_utf-16.html | Bin .../data/html_encoding/chinese_utf-32.html | Bin .../io}/data/html_encoding/chinese_utf-8.html | 0 .../io}/data/html_encoding/letz_latin1.html | 0 pandas/{io/tests => tests/io}/data/iris.csv | 0
.../io}/data/legacy_hdf/datetimetz_object.h5 | Bin .../io}/data/legacy_hdf/legacy.h5 | Bin .../io}/data/legacy_hdf/legacy_0.10.h5 | Bin .../io}/data/legacy_hdf/legacy_table.h5 | Bin .../io}/data/legacy_hdf/legacy_table_0.11.h5 | Bin .../io}/data/legacy_hdf/pytables_native.h5 | Bin .../io}/data/legacy_hdf/pytables_native2.h5 | Bin .../0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack | Bin .../0.16.2_AMD64_windows_2.7.10.msgpack | Bin .../0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack | Bin .../0.16.2_x86_64_darwin_2.7.10.msgpack | Bin .../0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack | Bin .../0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack | Bin .../0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack | Bin .../0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack | Bin .../0.17.0_AMD64_windows_2.7.11.msgpack | Bin .../0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack | Bin .../0.17.0_x86_64_darwin_2.7.11.msgpack | Bin .../0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack | Bin .../0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack | Bin .../0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack | Bin .../0.17.1_AMD64_windows_2.7.11.msgpack | Bin .../0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack | Bin .../0.17.1_AMD64_windows_2.7.11.msgpack | Bin .../0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack | Bin .../0.17.1_x86_64_darwin_2.7.11.msgpack | Bin .../0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack | Bin .../0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack | Bin .../0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack | Bin .../0.18.0_AMD64_windows_2.7.11.msgpack | Bin .../0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack | Bin .../0.18.0_x86_64_darwin_2.7.11.msgpack | Bin .../0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack | Bin .../0.18.1_x86_64_darwin_2.7.12.msgpack | Bin .../0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack | Bin .../0.10.1/AMD64_windows_2.7.3.pickle | Bin .../0.10.1/x86_64_linux_2.7.3.pickle | Bin .../0.11.0/0.11.0_x86_64_linux_3.3.0.pickle | Bin .../0.11.0/x86_64_linux_2.7.3.pickle | Bin .../0.11.0/x86_64_linux_3.3.0.pickle | Bin .../0.12.0/0.12.0_AMD64_windows_2.7.3.pickle | Bin .../0.12.0/0.12.0_x86_64_linux_2.7.3.pickle | Bin .../0.13.0/0.13.0_AMD64_windows_2.7.3.pickle | Bin .../0.13.0/0.13.0_i686_linux_2.6.5.pickle | Bin .../0.13.0/0.13.0_i686_linux_2.7.3.pickle | Bin .../0.13.0/0.13.0_i686_linux_3.2.3.pickle | Bin .../0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle | Bin .../0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle | Bin .../0.13.0/0.13.0_x86_64_linux_2.7.3.pickle | Bin .../0.13.0/0.13.0_x86_64_linux_2.7.8.pickle | Bin .../0.13.0/0.13.0_x86_64_linux_3.3.0.pickle | Bin .../0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle | Bin .../0.14.0/0.14.0_x86_64_linux_2.7.8.pickle | Bin .../0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle | Bin .../0.14.1/0.14.1_x86_64_linux_2.7.8.pickle | Bin .../0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle | Bin .../0.15.0/0.15.0_x86_64_linux_2.7.8.pickle | Bin .../0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle | Bin .../0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle | Bin .../0.16.2/0.16.2_AMD64_windows_2.7.10.pickle | Bin .../0.16.2/0.16.2_AMD64_windows_3.4.3.pickle | Bin .../0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle | Bin .../0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle | Bin .../0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle | Bin .../0.16.2/0.16.2_x86_64_linux_2.7.10.pickle | Bin .../0.16.2/0.16.2_x86_64_linux_3.4.3.pickle | Bin .../0.17.0/0.17.0_AMD64_windows_2.7.11.pickle | Bin .../0.17.0/0.17.0_AMD64_windows_3.4.4.pickle | Bin .../0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle | Bin .../0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle | Bin .../0.17.0/0.17.0_x86_64_linux_2.7.11.pickle | Bin 
.../0.17.0/0.17.0_x86_64_linux_3.4.4.pickle | Bin .../0.17.0/0.17.1_AMD64_windows_2.7.11.pickle | Bin .../0.17.1/0.17.1_AMD64_windows_2.7.11.pickle | Bin .../0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle | Bin .../0.18.0/0.18.0_AMD64_windows_2.7.11.pickle | Bin .../0.18.0/0.18.0_AMD64_windows_3.5.1.pickle | Bin .../0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle | Bin .../0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle | Bin .../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin .../0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle | Bin pandas/{io/tests => tests/io}/data/macau.html | 0 .../{io/tests => tests/io}/data/nyse_wsj.html | 0 pandas/{io/tests => tests/io}/data/spam.html | 0 .../tests => tests/io}/data/stata10_115.dta | Bin .../tests => tests/io}/data/stata10_117.dta | Bin .../tests => tests/io}/data/stata11_115.dta | Bin .../tests => tests/io}/data/stata11_117.dta | Bin .../tests => tests/io}/data/stata12_117.dta | Bin .../tests => tests/io}/data/stata14_118.dta | Bin .../{io/tests => tests/io}/data/stata15.dta | Bin .../tests => tests/io}/data/stata1_114.dta | Bin .../tests => tests/io}/data/stata1_117.dta | Bin .../io}/data/stata1_encoding.dta | Bin .../tests => tests/io}/data/stata2_113.dta | Bin .../tests => tests/io}/data/stata2_114.dta | Bin .../tests => tests/io}/data/stata2_115.dta | Bin .../tests => tests/io}/data/stata2_117.dta | Bin pandas/{io/tests => tests/io}/data/stata3.csv | 0 .../tests => tests/io}/data/stata3_113.dta | Bin .../tests => tests/io}/data/stata3_114.dta | Bin .../tests => tests/io}/data/stata3_115.dta | Bin .../tests => tests/io}/data/stata3_117.dta | Bin .../tests => tests/io}/data/stata4_113.dta | Bin .../tests => tests/io}/data/stata4_114.dta | Bin .../tests => tests/io}/data/stata4_115.dta | Bin .../tests => tests/io}/data/stata4_117.dta | Bin pandas/{io/tests => tests/io}/data/stata5.csv | 0 .../tests => tests/io}/data/stata5_113.dta | Bin .../tests => tests/io}/data/stata5_114.dta | Bin .../tests => tests/io}/data/stata5_115.dta | Bin .../tests => tests/io}/data/stata5_117.dta | Bin pandas/{io/tests => tests/io}/data/stata6.csv | 0 .../tests => tests/io}/data/stata6_113.dta | Bin .../tests => tests/io}/data/stata6_114.dta | Bin .../tests => tests/io}/data/stata6_115.dta | Bin .../tests => tests/io}/data/stata6_117.dta | Bin .../tests => tests/io}/data/stata7_111.dta | Bin .../tests => tests/io}/data/stata7_115.dta | Bin .../tests => tests/io}/data/stata7_117.dta | Bin .../tests => tests/io}/data/stata8_113.dta | Bin .../tests => tests/io}/data/stata8_115.dta | Bin .../tests => tests/io}/data/stata8_117.dta | Bin .../tests => tests/io}/data/stata9_115.dta | Bin .../tests => tests/io}/data/stata9_117.dta | Bin pandas/{io/tests => tests/io}/data/test1.csv | 0 pandas/{io/tests => tests/io}/data/test1.xls | Bin pandas/{io/tests => tests/io}/data/test1.xlsm | Bin pandas/{io/tests => tests/io}/data/test1.xlsx | Bin pandas/{io/tests => tests/io}/data/test2.xls | Bin pandas/{io/tests => tests/io}/data/test2.xlsm | Bin pandas/{io/tests => tests/io}/data/test2.xlsx | Bin pandas/{io/tests => tests/io}/data/test3.xls | Bin pandas/{io/tests => tests/io}/data/test3.xlsm | Bin pandas/{io/tests => tests/io}/data/test3.xlsx | Bin pandas/{io/tests => tests/io}/data/test4.xls | Bin pandas/{io/tests => tests/io}/data/test4.xlsm | Bin pandas/{io/tests => tests/io}/data/test4.xlsx | Bin pandas/{io/tests => tests/io}/data/test5.xls | Bin pandas/{io/tests => tests/io}/data/test5.xlsm | Bin pandas/{io/tests => tests/io}/data/test5.xlsx | Bin .../io}/data/test_converters.xls | Bin 
.../io}/data/test_converters.xlsm | Bin .../io}/data/test_converters.xlsx | Bin .../io}/data/test_index_name_pre17.xls | Bin .../io}/data/test_index_name_pre17.xlsm | Bin .../io}/data/test_index_name_pre17.xlsx | Bin .../{io/tests => tests/io}/data/test_mmap.csv | 0 .../io}/data/test_multisheet.xls | Bin .../io}/data/test_multisheet.xlsm | Bin .../io}/data/test_multisheet.xlsx | Bin .../tests => tests/io}/data/test_squeeze.xls | Bin .../tests => tests/io}/data/test_squeeze.xlsm | Bin .../tests => tests/io}/data/test_squeeze.xlsx | Bin .../tests => tests/io}/data/test_types.xls | Bin .../tests => tests/io}/data/test_types.xlsm | Bin .../tests => tests/io}/data/test_types.xlsx | Bin .../io}/data/testdateoverflow.xls | Bin .../io}/data/testdateoverflow.xlsm | Bin .../io}/data/testdateoverflow.xlsx | Bin .../{io/tests => tests/io}/data/testdtype.xls | Bin .../tests => tests/io}/data/testdtype.xlsm | Bin .../tests => tests/io}/data/testdtype.xlsx | Bin .../io}/data/testmultiindex.xls | Bin .../io}/data/testmultiindex.xlsm | Bin .../io}/data/testmultiindex.xlsx | Bin .../tests => tests/io}/data/testskiprows.xls | Bin .../tests => tests/io}/data/testskiprows.xlsm | Bin .../tests => tests/io}/data/testskiprows.xlsx | Bin .../tests => tests/io}/data/times_1900.xls | Bin .../tests => tests/io}/data/times_1900.xlsm | Bin .../tests => tests/io}/data/times_1900.xlsx | Bin .../tests => tests/io}/data/times_1904.xls | Bin .../tests => tests/io}/data/times_1904.xlsm | Bin .../tests => tests/io}/data/times_1904.xlsx | Bin pandas/{io/tests => tests/io}/data/tips.csv | 0 .../tests => tests/io}/data/valid_markup.html | 0 .../io}/data/wikipedia_states.html | 0 .../io}/generate_legacy_storage_files.py | 0 .../{io/tests => tests/io}/json/__init__.py | 0 .../io}/json/data/tsframe_iso_v012.json | 0 .../io}/json/data/tsframe_v012.json | 0 .../tests => tests/io}/json/test_normalize.py | 0 .../tests => tests/io}/json/test_pandas.py | 0 .../{io/tests => tests/io}/json/test_ujson.py | 0 .../{io/tests => tests/io}/parser/__init__.py | 0 .../io}/parser/c_parser_only.py | 0 .../{io/tests => tests/io}/parser/comment.py | 0 .../{io/tests => tests/io}/parser/common.py | 0 .../tests => tests/io}/parser/compression.py | 0 .../tests => tests/io}/parser/converters.py | 0 .../tests => tests/io}/parser/data/iris.csv | 0 .../io}/parser/data/salaries.csv | 0 .../io}/parser/data/salaries.csv.bz2 | Bin .../io}/parser/data/salaries.csv.gz | Bin .../io}/parser/data/salaries.csv.xz | Bin .../io}/parser/data/salaries.csv.zip | Bin .../io}/parser/data/sauron.SHIFT_JIS.csv | 0 .../tests => tests/io}/parser/data/test1.csv | 0 .../io}/parser/data/test1.csv.bz2 | Bin .../io}/parser/data/test1.csv.gz | Bin .../tests => tests/io}/parser/data/test2.csv | 0 .../io}/parser/data/test_mmap.csv | 0 .../tests => tests/io}/parser/data/tips.csv | 0 .../io}/parser/data/unicode_series.csv | 0 .../io}/parser/data/utf16_ex.txt | Bin .../{io/tests => tests/io}/parser/dialect.py | 0 .../{io/tests => tests/io}/parser/dtypes.py | 0 .../{io/tests => tests/io}/parser/header.py | 0 .../tests => tests/io}/parser/index_col.py | 0 .../tests => tests/io}/parser/multithread.py | 0 .../tests => tests/io}/parser/na_values.py | 0 .../tests => tests/io}/parser/parse_dates.py | 0 .../io}/parser/python_parser_only.py | 0 .../{io/tests => tests/io}/parser/quoting.py | 0 .../{io/tests => tests/io}/parser/skiprows.py | 0 .../tests => tests/io}/parser/test_network.py | 0 .../tests => tests/io}/parser/test_parsers.py | 0 .../io}/parser/test_read_fwf.py | 0 
.../io}/parser/test_textreader.py | 0 .../io}/parser/test_unsupported.py | 0 .../{io/tests => tests/io}/parser/usecols.py | 0 .../tests => tests/io}/sas/data/DEMO_G.csv | 0 .../tests => tests/io}/sas/data/DEMO_G.xpt | Bin .../tests => tests/io}/sas/data/DRXFCD_G.csv | 0 .../tests => tests/io}/sas/data/DRXFCD_G.xpt | Bin .../tests => tests/io}/sas/data/SSHSV1_A.csv | 0 .../tests => tests/io}/sas/data/SSHSV1_A.xpt | Bin .../tests => tests/io}/sas/data/airline.csv | 0 .../io}/sas/data/airline.sas7bdat | Bin .../io}/sas/data/paxraw_d_short.csv | 0 .../io}/sas/data/paxraw_d_short.xpt | Bin .../io}/sas/data/productsales.csv | 0 .../io}/sas/data/productsales.sas7bdat | Bin .../io}/sas/data/test1.sas7bdat | Bin .../io}/sas/data/test10.sas7bdat | Bin .../io}/sas/data/test11.sas7bdat | Bin .../io}/sas/data/test12.sas7bdat | Bin .../io}/sas/data/test13.sas7bdat | Bin .../io}/sas/data/test14.sas7bdat | Bin .../io}/sas/data/test15.sas7bdat | Bin .../io}/sas/data/test16.sas7bdat | Bin .../io}/sas/data/test2.sas7bdat | Bin .../io}/sas/data/test3.sas7bdat | Bin .../io}/sas/data/test4.sas7bdat | Bin .../io}/sas/data/test5.sas7bdat | Bin .../io}/sas/data/test6.sas7bdat | Bin .../io}/sas/data/test7.sas7bdat | Bin .../io}/sas/data/test8.sas7bdat | Bin .../io}/sas/data/test9.sas7bdat | Bin .../io}/sas/data/test_12659.csv | 0 .../io}/sas/data/test_12659.sas7bdat | Bin .../io}/sas/data/test_sas7bdat_1.csv | 0 .../io}/sas/data/test_sas7bdat_2.csv | 0 pandas/{io/tests => tests/io}/sas/test_sas.py | 0 .../tests => tests/io}/sas/test_sas7bdat.py | 0 .../{io/tests => tests/io}/sas/test_xport.py | 0 .../{io/tests => tests/io}/test_clipboard.py | 0 pandas/{io/tests => tests/io}/test_common.py | 0 .../io}/test_date_converters.py | 0 pandas/{io/tests => tests/io}/test_excel.py | 0 pandas/{io/tests => tests/io}/test_feather.py | 0 pandas/{io/tests => tests/io}/test_gbq.py | 0 pandas/{io/tests => tests/io}/test_html.py | 0 pandas/{io/tests => tests/io}/test_packers.py | 2 +- pandas/{io/tests => tests/io}/test_pickle.py | 2 +- .../{io/tests => tests/io}/test_pytables.py | 0 pandas/{io/tests => tests/io}/test_s3.py | 0 pandas/{io/tests => tests/io}/test_sql.py | 0 pandas/{io/tests => tests/io}/test_stata.py | 0 .../tests => tests/msgpack}/__init__.py | 0 .../{test_msgpack => msgpack}/test_buffer.py | 0 .../{test_msgpack => msgpack}/test_case.py | 0 .../{test_msgpack => msgpack}/test_except.py | 0 .../test_extension.py | 0 .../{test_msgpack => msgpack}/test_format.py | 0 .../{test_msgpack => msgpack}/test_limits.py | 0 .../{test_msgpack => msgpack}/test_newspec.py | 0 .../{test_msgpack => msgpack}/test_obj.py | 0 .../{test_msgpack => msgpack}/test_pack.py | 0 .../test_read_size.py | 0 .../{test_msgpack => msgpack}/test_seq.py | 0 .../test_sequnpack.py | 0 .../{test_msgpack => msgpack}/test_subtype.py | 0 .../{test_msgpack => msgpack}/test_unpack.py | 0 .../test_unpack_raw.py | 0 .../{test_msgpack => sparse}/__init__.py | 0 .../sparse}/test_arithmetics.py | 0 .../tests => tests/sparse}/test_array.py | 0 .../sparse}/test_combine_concat.py | 0 .../tests => tests/sparse}/test_format.py | 0 .../tests => tests/sparse}/test_frame.py | 0 .../tests => tests/sparse}/test_groupby.py | 0 .../tests => tests/sparse}/test_indexing.py | 0 .../tests => tests/sparse}/test_libsparse.py | 0 .../tests => tests/sparse}/test_list.py | 0 .../tests => tests/sparse}/test_pivot.py | 0 .../tests => tests/sparse}/test_series.py | 0 .../{tools/tests => tests/tools}/__init__.py | 0 .../tools}/data/allow_exact_matches.csv | 0 
.../allow_exact_matches_and_tolerance.csv | 0 .../tests => tests/tools}/data/asof.csv | 0 .../tests => tests/tools}/data/asof2.csv | 0 .../tests => tests/tools}/data/cut_data.csv | 0 .../tests => tests/tools}/data/quotes.csv | 0 .../tests => tests/tools}/data/quotes2.csv | 0 .../tests => tests/tools}/data/tolerance.csv | 0 .../tests => tests/tools}/data/trades.csv | 0 .../tests => tests/tools}/data/trades2.csv | 0 .../tests => tests/tools}/test_concat.py | 0 .../tests => tests/tools}/test_hashing.py | 0 .../{tools/tests => tests/tools}/test_join.py | 2 +- .../tests => tests/tools}/test_merge.py | 0 .../tests => tests/tools}/test_merge_asof.py | 0 .../tools}/test_merge_ordered.py | 0 .../tests => tests/tools}/test_pivot.py | 0 .../{tools/tests => tests/tools}/test_tile.py | 0 .../{tools/tests => tests/tools}/test_util.py | 0 setup.py | 64 +++++++++--------- 344 files changed, 38 insertions(+), 37 deletions(-) rename pandas/{api/tests => tests/api}/__init__.py (100%) rename pandas/{api/tests => tests/api}/test_api.py (99%) rename pandas/{computation/tests => tests/computation}/__init__.py (100%) rename pandas/{computation/tests => tests/computation}/test_compat.py (100%) rename pandas/{computation/tests => tests/computation}/test_eval.py (100%) rename pandas/{io/tests => tests/io}/__init__.py (100%) rename pandas/{io/tests => tests/io}/data/S4_EDUC1.dta (100%) rename pandas/{io/tests => tests/io}/data/banklist.csv (100%) rename pandas/{io/tests => tests/io}/data/banklist.html (100%) rename pandas/{io/tests => tests/io}/data/blank.xls (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank.xlsm (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank.xlsx (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank_with_header.xls (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank_with_header.xlsm (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank_with_header.xlsx (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/categorical_0_14_1.pickle (100%) rename pandas/{io/tests => tests/io}/data/categorical_0_15_2.pickle (100%) rename pandas/{io/tests => tests/io}/data/computer_sales_page.html (100%) rename pandas/{io/tests => tests/io}/data/gbq_fake_job.txt (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/chinese_utf-16.html (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/chinese_utf-32.html (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/chinese_utf-8.html (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/letz_latin1.html (100%) rename pandas/{io/tests => tests/io}/data/iris.csv (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/datetimetz_object.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy_0.10.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy_table.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy_table_0.11.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/pytables_native.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/pytables_native2.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack (100%) rename pandas/{io/tests => 
tests/io}/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle (100%) rename 
pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle (100%) rename pandas/{io/tests => 
tests/io}/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle (100%) rename pandas/{io/tests => tests/io}/data/macau.html (100%) rename pandas/{io/tests => tests/io}/data/nyse_wsj.html (100%) rename pandas/{io/tests => tests/io}/data/spam.html (100%) rename pandas/{io/tests => tests/io}/data/stata10_115.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata10_117.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata11_115.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata11_117.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata12_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata14_118.dta (100%) rename pandas/{io/tests => tests/io}/data/stata15.dta (100%) rename pandas/{io/tests => tests/io}/data/stata1_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata1_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata1_encoding.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3.csv (100%) rename pandas/{io/tests => tests/io}/data/stata3_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5.csv (100%) rename pandas/{io/tests => tests/io}/data/stata5_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6.csv (100%) rename pandas/{io/tests => tests/io}/data/stata6_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata7_111.dta (100%) rename pandas/{io/tests => tests/io}/data/stata7_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata7_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata8_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata8_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata8_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata9_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata9_117.dta (100%) rename pandas/{io/tests => tests/io}/data/test1.csv (100%) rename pandas/{io/tests => tests/io}/data/test1.xls (100%) rename pandas/{io/tests => tests/io}/data/test1.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test1.xlsx (100%) rename 
pandas/{io/tests => tests/io}/data/test2.xls (100%) rename pandas/{io/tests => tests/io}/data/test2.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test2.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test3.xls (100%) rename pandas/{io/tests => tests/io}/data/test3.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test3.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test4.xls (100%) rename pandas/{io/tests => tests/io}/data/test4.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test4.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test5.xls (100%) rename pandas/{io/tests => tests/io}/data/test5.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test5.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_converters.xls (100%) rename pandas/{io/tests => tests/io}/data/test_converters.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_converters.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_index_name_pre17.xls (100%) rename pandas/{io/tests => tests/io}/data/test_index_name_pre17.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_index_name_pre17.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_mmap.csv (100%) rename pandas/{io/tests => tests/io}/data/test_multisheet.xls (100%) rename pandas/{io/tests => tests/io}/data/test_multisheet.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_multisheet.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_squeeze.xls (100%) rename pandas/{io/tests => tests/io}/data/test_squeeze.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_squeeze.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_types.xls (100%) rename pandas/{io/tests => tests/io}/data/test_types.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_types.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testdateoverflow.xls (100%) rename pandas/{io/tests => tests/io}/data/testdateoverflow.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testdateoverflow.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testdtype.xls (100%) rename pandas/{io/tests => tests/io}/data/testdtype.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testdtype.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testmultiindex.xls (100%) rename pandas/{io/tests => tests/io}/data/testmultiindex.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testmultiindex.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testskiprows.xls (100%) rename pandas/{io/tests => tests/io}/data/testskiprows.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testskiprows.xlsx (100%) rename pandas/{io/tests => tests/io}/data/times_1900.xls (100%) rename pandas/{io/tests => tests/io}/data/times_1900.xlsm (100%) rename pandas/{io/tests => tests/io}/data/times_1900.xlsx (100%) rename pandas/{io/tests => tests/io}/data/times_1904.xls (100%) rename pandas/{io/tests => tests/io}/data/times_1904.xlsm (100%) rename pandas/{io/tests => tests/io}/data/times_1904.xlsx (100%) rename pandas/{io/tests => tests/io}/data/tips.csv (100%) rename pandas/{io/tests => tests/io}/data/valid_markup.html (100%) rename pandas/{io/tests => tests/io}/data/wikipedia_states.html (100%) rename pandas/{io/tests => tests/io}/generate_legacy_storage_files.py (100%) rename pandas/{io/tests => tests/io}/json/__init__.py (100%) rename pandas/{io/tests => tests/io}/json/data/tsframe_iso_v012.json (100%) rename pandas/{io/tests => tests/io}/json/data/tsframe_v012.json (100%) rename pandas/{io/tests => tests/io}/json/test_normalize.py 
(100%) rename pandas/{io/tests => tests/io}/json/test_pandas.py (100%) rename pandas/{io/tests => tests/io}/json/test_ujson.py (100%) rename pandas/{io/tests => tests/io}/parser/__init__.py (100%) rename pandas/{io/tests => tests/io}/parser/c_parser_only.py (100%) rename pandas/{io/tests => tests/io}/parser/comment.py (100%) rename pandas/{io/tests => tests/io}/parser/common.py (100%) rename pandas/{io/tests => tests/io}/parser/compression.py (100%) rename pandas/{io/tests => tests/io}/parser/converters.py (100%) rename pandas/{io/tests => tests/io}/parser/data/iris.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.bz2 (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.gz (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.xz (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.zip (100%) rename pandas/{io/tests => tests/io}/parser/data/sauron.SHIFT_JIS.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/test1.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/test1.csv.bz2 (100%) rename pandas/{io/tests => tests/io}/parser/data/test1.csv.gz (100%) rename pandas/{io/tests => tests/io}/parser/data/test2.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/test_mmap.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/tips.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/unicode_series.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/utf16_ex.txt (100%) rename pandas/{io/tests => tests/io}/parser/dialect.py (100%) rename pandas/{io/tests => tests/io}/parser/dtypes.py (100%) rename pandas/{io/tests => tests/io}/parser/header.py (100%) rename pandas/{io/tests => tests/io}/parser/index_col.py (100%) rename pandas/{io/tests => tests/io}/parser/multithread.py (100%) rename pandas/{io/tests => tests/io}/parser/na_values.py (100%) rename pandas/{io/tests => tests/io}/parser/parse_dates.py (100%) rename pandas/{io/tests => tests/io}/parser/python_parser_only.py (100%) rename pandas/{io/tests => tests/io}/parser/quoting.py (100%) rename pandas/{io/tests => tests/io}/parser/skiprows.py (100%) rename pandas/{io/tests => tests/io}/parser/test_network.py (100%) rename pandas/{io/tests => tests/io}/parser/test_parsers.py (100%) rename pandas/{io/tests => tests/io}/parser/test_read_fwf.py (100%) rename pandas/{io/tests => tests/io}/parser/test_textreader.py (100%) rename pandas/{io/tests => tests/io}/parser/test_unsupported.py (100%) rename pandas/{io/tests => tests/io}/parser/usecols.py (100%) rename pandas/{io/tests => tests/io}/sas/data/DEMO_G.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/DEMO_G.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/DRXFCD_G.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/DRXFCD_G.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/SSHSV1_A.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/SSHSV1_A.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/airline.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/airline.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/paxraw_d_short.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/paxraw_d_short.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/productsales.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/productsales.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test1.sas7bdat (100%) rename pandas/{io/tests => 
tests/io}/sas/data/test10.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test11.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test12.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test13.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test14.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test15.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test16.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test2.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test3.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test4.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test5.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test6.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test7.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test8.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test9.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test_12659.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/test_12659.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test_sas7bdat_1.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/test_sas7bdat_2.csv (100%) rename pandas/{io/tests => tests/io}/sas/test_sas.py (100%) rename pandas/{io/tests => tests/io}/sas/test_sas7bdat.py (100%) rename pandas/{io/tests => tests/io}/sas/test_xport.py (100%) rename pandas/{io/tests => tests/io}/test_clipboard.py (100%) rename pandas/{io/tests => tests/io}/test_common.py (100%) rename pandas/{io/tests => tests/io}/test_date_converters.py (100%) rename pandas/{io/tests => tests/io}/test_excel.py (100%) rename pandas/{io/tests => tests/io}/test_feather.py (100%) rename pandas/{io/tests => tests/io}/test_gbq.py (100%) rename pandas/{io/tests => tests/io}/test_html.py (100%) rename pandas/{io/tests => tests/io}/test_packers.py (99%) rename pandas/{io/tests => tests/io}/test_pickle.py (99%) rename pandas/{io/tests => tests/io}/test_pytables.py (100%) rename pandas/{io/tests => tests/io}/test_s3.py (100%) rename pandas/{io/tests => tests/io}/test_sql.py (100%) rename pandas/{io/tests => tests/io}/test_stata.py (100%) rename pandas/{sparse/tests => tests/msgpack}/__init__.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_buffer.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_case.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_except.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_extension.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_format.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_limits.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_newspec.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_obj.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_pack.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_read_size.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_seq.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_sequnpack.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_subtype.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_unpack.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_unpack_raw.py (100%) rename pandas/tests/{test_msgpack => sparse}/__init__.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_arithmetics.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_array.py (100%) rename 
pandas/{sparse/tests => tests/sparse}/test_combine_concat.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_format.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_frame.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_groupby.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_indexing.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_libsparse.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_list.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_pivot.py (100%)
 rename pandas/{sparse/tests => tests/sparse}/test_series.py (100%)
 rename pandas/{tools/tests => tests/tools}/__init__.py (100%)
 rename pandas/{tools/tests => tests/tools}/data/allow_exact_matches.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/allow_exact_matches_and_tolerance.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/asof.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/asof2.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/cut_data.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/quotes.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/quotes2.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/tolerance.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/trades.csv (100%)
 rename pandas/{tools/tests => tests/tools}/data/trades2.csv (100%)
 rename pandas/{tools/tests => tests/tools}/test_concat.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_hashing.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_join.py (99%)
 rename pandas/{tools/tests => tests/tools}/test_merge.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_merge_asof.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_merge_ordered.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_pivot.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_tile.py (100%)
 rename pandas/{tools/tests => tests/tools}/test_util.py (100%)

diff --git a/pandas/api/tests/__init__.py b/pandas/tests/api/__init__.py
similarity index 100%
rename from pandas/api/tests/__init__.py
rename to pandas/tests/api/__init__.py
diff --git a/pandas/api/tests/test_api.py b/pandas/tests/api/test_api.py
similarity index 99%
rename from pandas/api/tests/test_api.py
rename to pandas/tests/api/test_api.py
index 05cf5dc4b7e7b..90a0c1d5c9347 100644
--- a/pandas/api/tests/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -133,7 +133,7 @@ def test_api(self):
 
 class TestApi(Base, tm.TestCase):
 
-    allowed = ['tests', 'types']
+    allowed = ['types']
 
     def test_api(self):
 
diff --git a/pandas/computation/tests/__init__.py b/pandas/tests/computation/__init__.py
similarity index 100%
rename from pandas/computation/tests/__init__.py
rename to pandas/tests/computation/__init__.py
diff --git a/pandas/computation/tests/test_compat.py b/pandas/tests/computation/test_compat.py
similarity index 100%
rename from pandas/computation/tests/test_compat.py
rename to pandas/tests/computation/test_compat.py
diff --git a/pandas/computation/tests/test_eval.py b/pandas/tests/computation/test_eval.py
similarity index 100%
rename from pandas/computation/tests/test_eval.py
rename to pandas/tests/computation/test_eval.py
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
index 63bf07ec041d3..9a968a42c4247 100644
--- a/pandas/tests/indexes/datetimes/test_ops.py
+++ b/pandas/tests/indexes/datetimes/test_ops.py
@@ -1245,7 +1245,8 @@ def test_shift(self):
         self.assertEqual(shifted[0], self.rng[0])
         self.assertEqual(shifted.offset, self.rng.offset)
 
-        with tm.assert_produces_warning(PerformanceWarning):
+        # PerformanceWarning
+        with warnings.catch_warnings(record=True):
             rng = date_range(START, END, freq=BMonthEnd())
             shifted = rng.shift(1, freq=CDay())
             self.assertEqual(shifted[0], rng[0] + CDay())
diff --git a/pandas/io/tests/__init__.py b/pandas/tests/io/__init__.py
similarity index 100%
rename from pandas/io/tests/__init__.py
rename to pandas/tests/io/__init__.py
diff --git a/pandas/io/tests/data/S4_EDUC1.dta b/pandas/tests/io/data/S4_EDUC1.dta
similarity index 100%
rename from pandas/io/tests/data/S4_EDUC1.dta
rename to pandas/tests/io/data/S4_EDUC1.dta
diff --git a/pandas/io/tests/data/banklist.csv b/pandas/tests/io/data/banklist.csv
similarity index 100%
rename from pandas/io/tests/data/banklist.csv
rename to pandas/tests/io/data/banklist.csv
diff --git a/pandas/io/tests/data/banklist.html b/pandas/tests/io/data/banklist.html
similarity index 100%
rename from pandas/io/tests/data/banklist.html
rename to pandas/tests/io/data/banklist.html
diff --git a/pandas/io/tests/data/blank.xls b/pandas/tests/io/data/blank.xls
old mode 100755
new mode 100644
similarity index 100%
rename from pandas/io/tests/data/blank.xls
rename to pandas/tests/io/data/blank.xls
diff --git a/pandas/io/tests/data/blank.xlsm b/pandas/tests/io/data/blank.xlsm
old mode 100755
new mode 100644
similarity index 100%
rename from pandas/io/tests/data/blank.xlsm
rename to pandas/tests/io/data/blank.xlsm
diff --git a/pandas/io/tests/data/blank.xlsx b/pandas/tests/io/data/blank.xlsx
old mode 100755
new mode 100644
similarity index 100%
rename from pandas/io/tests/data/blank.xlsx
rename to pandas/tests/io/data/blank.xlsx
diff --git a/pandas/io/tests/data/blank_with_header.xls b/pandas/tests/io/data/blank_with_header.xls
old mode 100755
new mode 100644
similarity index 100%
rename from pandas/io/tests/data/blank_with_header.xls
rename to pandas/tests/io/data/blank_with_header.xls
diff --git a/pandas/io/tests/data/blank_with_header.xlsm b/pandas/tests/io/data/blank_with_header.xlsm
old mode 100755
new mode 100644
similarity index 100%
rename from pandas/io/tests/data/blank_with_header.xlsm
rename to pandas/tests/io/data/blank_with_header.xlsm
diff --git a/pandas/io/tests/data/blank_with_header.xlsx b/pandas/tests/io/data/blank_with_header.xlsx
old mode 100755
new mode 100644
similarity index 100%
rename from pandas/io/tests/data/blank_with_header.xlsx
rename to pandas/tests/io/data/blank_with_header.xlsx
diff --git a/pandas/io/tests/data/categorical_0_14_1.pickle b/pandas/tests/io/data/categorical_0_14_1.pickle
similarity index 100%
rename from pandas/io/tests/data/categorical_0_14_1.pickle
rename to pandas/tests/io/data/categorical_0_14_1.pickle
diff --git a/pandas/io/tests/data/categorical_0_15_2.pickle b/pandas/tests/io/data/categorical_0_15_2.pickle
similarity index 100%
rename from pandas/io/tests/data/categorical_0_15_2.pickle
rename to pandas/tests/io/data/categorical_0_15_2.pickle
diff --git a/pandas/io/tests/data/computer_sales_page.html b/pandas/tests/io/data/computer_sales_page.html
similarity index 100%
rename from pandas/io/tests/data/computer_sales_page.html
rename to pandas/tests/io/data/computer_sales_page.html
diff --git a/pandas/io/tests/data/gbq_fake_job.txt b/pandas/tests/io/data/gbq_fake_job.txt
similarity index 100%
rename from pandas/io/tests/data/gbq_fake_job.txt
rename to pandas/tests/io/data/gbq_fake_job.txt
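The test_ops.py hunk above replaces the strict tm.assert_produces_warning(PerformanceWarning)
assertion, which fails unless that exact warning is raised, with a permissive
warnings.catch_warnings(record=True) block that merely captures whatever is emitted.
Below is a minimal standard-library sketch of the idiom the hunk switches to;
noisy_shift and the local PerformanceWarning class are hypothetical stand-ins for
the pandas call and warning type, not code from this patch:

    import warnings

    class PerformanceWarning(Warning):
        """Hypothetical stand-in for pandas' PerformanceWarning."""

    def noisy_shift():
        # Stand-in for an operation such as rng.shift(1, freq=CDay())
        # that may or may not emit a PerformanceWarning.
        warnings.warn("falling back to a slow path", PerformanceWarning)
        return "shifted"

    # The idiom the hunk adopts: record warnings so the test neither
    # prints them nor fails, whether or not the warning is raised.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # deliver every warning to the list
        result = noisy_shift()

    assert result == "shifted"
    assert any(issubclass(w.category, PerformanceWarning) for w in caught)

The trade-off is looser coverage: assert_produces_warning verifies the warning
fires, while catch_warnings(record=True) only keeps it from failing the test.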
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf-16.html b/pandas/tests/io/data/html_encoding/chinese_utf-16.html
similarity index 100%
rename from pandas/io/tests/data/html_encoding/chinese_utf-16.html
rename to pandas/tests/io/data/html_encoding/chinese_utf-16.html
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf-32.html b/pandas/tests/io/data/html_encoding/chinese_utf-32.html
similarity index 100%
rename from pandas/io/tests/data/html_encoding/chinese_utf-32.html
rename to pandas/tests/io/data/html_encoding/chinese_utf-32.html
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf-8.html b/pandas/tests/io/data/html_encoding/chinese_utf-8.html
similarity index 100%
rename from pandas/io/tests/data/html_encoding/chinese_utf-8.html
rename to pandas/tests/io/data/html_encoding/chinese_utf-8.html
diff --git a/pandas/io/tests/data/html_encoding/letz_latin1.html b/pandas/tests/io/data/html_encoding/letz_latin1.html
similarity index 100%
rename from pandas/io/tests/data/html_encoding/letz_latin1.html
rename to pandas/tests/io/data/html_encoding/letz_latin1.html
diff --git a/pandas/io/tests/data/iris.csv b/pandas/tests/io/data/iris.csv
similarity index 100%
rename from pandas/io/tests/data/iris.csv
rename to pandas/tests/io/data/iris.csv
diff --git a/pandas/io/tests/data/legacy_hdf/datetimetz_object.h5 b/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/datetimetz_object.h5
rename to pandas/tests/io/data/legacy_hdf/datetimetz_object.h5
diff --git a/pandas/io/tests/data/legacy_hdf/legacy.h5 b/pandas/tests/io/data/legacy_hdf/legacy.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/legacy.h5
rename to pandas/tests/io/data/legacy_hdf/legacy.h5
diff --git a/pandas/io/tests/data/legacy_hdf/legacy_0.10.h5 b/pandas/tests/io/data/legacy_hdf/legacy_0.10.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/legacy_0.10.h5
rename to pandas/tests/io/data/legacy_hdf/legacy_0.10.h5
diff --git a/pandas/io/tests/data/legacy_hdf/legacy_table.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/legacy_table.h5
rename to pandas/tests/io/data/legacy_hdf/legacy_table.h5
diff --git a/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5
rename to pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5
diff --git a/pandas/io/tests/data/legacy_hdf/pytables_native.h5 b/pandas/tests/io/data/legacy_hdf/pytables_native.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/pytables_native.h5
rename to pandas/tests/io/data/legacy_hdf/pytables_native.h5
diff --git a/pandas/io/tests/data/legacy_hdf/pytables_native2.h5 b/pandas/tests/io/data/legacy_hdf/pytables_native2.h5
similarity index 100%
rename from pandas/io/tests/data/legacy_hdf/pytables_native2.h5
rename to pandas/tests/io/data/legacy_hdf/pytables_native2.h5
diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack
similarity index 100%
rename from pandas/io/tests/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack
rename to pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack
diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack
similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack diff 
--git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack similarity index 100% rename from 
pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle 
b/pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle rename to pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle rename to pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle 
b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle b/pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle rename to pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle diff --git 
a/pandas/io/tests/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle rename to pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle rename to 
pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle similarity index 100% 
rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle b/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle diff --git a/pandas/io/tests/data/macau.html b/pandas/tests/io/data/macau.html similarity index 100% rename from pandas/io/tests/data/macau.html rename to pandas/tests/io/data/macau.html diff --git a/pandas/io/tests/data/nyse_wsj.html b/pandas/tests/io/data/nyse_wsj.html similarity index 100% rename from pandas/io/tests/data/nyse_wsj.html rename to pandas/tests/io/data/nyse_wsj.html diff --git a/pandas/io/tests/data/spam.html b/pandas/tests/io/data/spam.html similarity index 100% rename from pandas/io/tests/data/spam.html rename to pandas/tests/io/data/spam.html diff --git a/pandas/io/tests/data/stata10_115.dta b/pandas/tests/io/data/stata10_115.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata10_115.dta rename to pandas/tests/io/data/stata10_115.dta diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/tests/io/data/stata10_117.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata10_117.dta rename to pandas/tests/io/data/stata10_117.dta diff --git a/pandas/io/tests/data/stata11_115.dta b/pandas/tests/io/data/stata11_115.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata11_115.dta rename to pandas/tests/io/data/stata11_115.dta diff --git a/pandas/io/tests/data/stata11_117.dta b/pandas/tests/io/data/stata11_117.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata11_117.dta rename to pandas/tests/io/data/stata11_117.dta diff --git a/pandas/io/tests/data/stata12_117.dta b/pandas/tests/io/data/stata12_117.dta similarity index 100% rename from pandas/io/tests/data/stata12_117.dta rename to pandas/tests/io/data/stata12_117.dta diff --git a/pandas/io/tests/data/stata14_118.dta b/pandas/tests/io/data/stata14_118.dta similarity index 100% rename from pandas/io/tests/data/stata14_118.dta rename to 
pandas/tests/io/data/stata14_118.dta diff --git a/pandas/io/tests/data/stata15.dta b/pandas/tests/io/data/stata15.dta similarity index 100% rename from pandas/io/tests/data/stata15.dta rename to pandas/tests/io/data/stata15.dta diff --git a/pandas/io/tests/data/stata1_114.dta b/pandas/tests/io/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1_114.dta rename to pandas/tests/io/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_117.dta b/pandas/tests/io/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_117.dta rename to pandas/tests/io/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata1_encoding.dta b/pandas/tests/io/data/stata1_encoding.dta similarity index 100% rename from pandas/io/tests/data/stata1_encoding.dta rename to pandas/tests/io/data/stata1_encoding.dta diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/tests/io/data/stata2_113.dta similarity index 100% rename from pandas/io/tests/data/stata2_113.dta rename to pandas/tests/io/data/stata2_113.dta diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/tests/io/data/stata2_114.dta similarity index 100% rename from pandas/io/tests/data/stata2_114.dta rename to pandas/tests/io/data/stata2_114.dta diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/tests/io/data/stata2_115.dta similarity index 100% rename from pandas/io/tests/data/stata2_115.dta rename to pandas/tests/io/data/stata2_115.dta diff --git a/pandas/io/tests/data/stata2_117.dta b/pandas/tests/io/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_117.dta rename to pandas/tests/io/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3.csv b/pandas/tests/io/data/stata3.csv similarity index 100% rename from pandas/io/tests/data/stata3.csv rename to pandas/tests/io/data/stata3.csv diff --git a/pandas/io/tests/data/stata3_113.dta b/pandas/tests/io/data/stata3_113.dta similarity index 100% rename from pandas/io/tests/data/stata3_113.dta rename to pandas/tests/io/data/stata3_113.dta diff --git a/pandas/io/tests/data/stata3_114.dta b/pandas/tests/io/data/stata3_114.dta similarity index 100% rename from pandas/io/tests/data/stata3_114.dta rename to pandas/tests/io/data/stata3_114.dta diff --git a/pandas/io/tests/data/stata3_115.dta b/pandas/tests/io/data/stata3_115.dta similarity index 100% rename from pandas/io/tests/data/stata3_115.dta rename to pandas/tests/io/data/stata3_115.dta diff --git a/pandas/io/tests/data/stata3_117.dta b/pandas/tests/io/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_117.dta rename to pandas/tests/io/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4_113.dta b/pandas/tests/io/data/stata4_113.dta similarity index 100% rename from pandas/io/tests/data/stata4_113.dta rename to pandas/tests/io/data/stata4_113.dta diff --git a/pandas/io/tests/data/stata4_114.dta b/pandas/tests/io/data/stata4_114.dta similarity index 100% rename from pandas/io/tests/data/stata4_114.dta rename to pandas/tests/io/data/stata4_114.dta diff --git a/pandas/io/tests/data/stata4_115.dta b/pandas/tests/io/data/stata4_115.dta similarity index 100% rename from pandas/io/tests/data/stata4_115.dta rename to pandas/tests/io/data/stata4_115.dta diff --git a/pandas/io/tests/data/stata4_117.dta b/pandas/tests/io/data/stata4_117.dta similarity index 100% rename from pandas/io/tests/data/stata4_117.dta rename to pandas/tests/io/data/stata4_117.dta diff --git a/pandas/io/tests/data/stata5.csv 
b/pandas/tests/io/data/stata5.csv similarity index 100% rename from pandas/io/tests/data/stata5.csv rename to pandas/tests/io/data/stata5.csv diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/tests/io/data/stata5_113.dta similarity index 100% rename from pandas/io/tests/data/stata5_113.dta rename to pandas/tests/io/data/stata5_113.dta diff --git a/pandas/io/tests/data/stata5_114.dta b/pandas/tests/io/data/stata5_114.dta similarity index 100% rename from pandas/io/tests/data/stata5_114.dta rename to pandas/tests/io/data/stata5_114.dta diff --git a/pandas/io/tests/data/stata5_115.dta b/pandas/tests/io/data/stata5_115.dta similarity index 100% rename from pandas/io/tests/data/stata5_115.dta rename to pandas/tests/io/data/stata5_115.dta diff --git a/pandas/io/tests/data/stata5_117.dta b/pandas/tests/io/data/stata5_117.dta similarity index 100% rename from pandas/io/tests/data/stata5_117.dta rename to pandas/tests/io/data/stata5_117.dta diff --git a/pandas/io/tests/data/stata6.csv b/pandas/tests/io/data/stata6.csv similarity index 100% rename from pandas/io/tests/data/stata6.csv rename to pandas/tests/io/data/stata6.csv diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/tests/io/data/stata6_113.dta similarity index 100% rename from pandas/io/tests/data/stata6_113.dta rename to pandas/tests/io/data/stata6_113.dta diff --git a/pandas/io/tests/data/stata6_114.dta b/pandas/tests/io/data/stata6_114.dta similarity index 100% rename from pandas/io/tests/data/stata6_114.dta rename to pandas/tests/io/data/stata6_114.dta diff --git a/pandas/io/tests/data/stata6_115.dta b/pandas/tests/io/data/stata6_115.dta similarity index 100% rename from pandas/io/tests/data/stata6_115.dta rename to pandas/tests/io/data/stata6_115.dta diff --git a/pandas/io/tests/data/stata6_117.dta b/pandas/tests/io/data/stata6_117.dta similarity index 100% rename from pandas/io/tests/data/stata6_117.dta rename to pandas/tests/io/data/stata6_117.dta diff --git a/pandas/io/tests/data/stata7_111.dta b/pandas/tests/io/data/stata7_111.dta similarity index 100% rename from pandas/io/tests/data/stata7_111.dta rename to pandas/tests/io/data/stata7_111.dta diff --git a/pandas/io/tests/data/stata7_115.dta b/pandas/tests/io/data/stata7_115.dta similarity index 100% rename from pandas/io/tests/data/stata7_115.dta rename to pandas/tests/io/data/stata7_115.dta diff --git a/pandas/io/tests/data/stata7_117.dta b/pandas/tests/io/data/stata7_117.dta similarity index 100% rename from pandas/io/tests/data/stata7_117.dta rename to pandas/tests/io/data/stata7_117.dta diff --git a/pandas/io/tests/data/stata8_113.dta b/pandas/tests/io/data/stata8_113.dta similarity index 100% rename from pandas/io/tests/data/stata8_113.dta rename to pandas/tests/io/data/stata8_113.dta diff --git a/pandas/io/tests/data/stata8_115.dta b/pandas/tests/io/data/stata8_115.dta similarity index 100% rename from pandas/io/tests/data/stata8_115.dta rename to pandas/tests/io/data/stata8_115.dta diff --git a/pandas/io/tests/data/stata8_117.dta b/pandas/tests/io/data/stata8_117.dta similarity index 100% rename from pandas/io/tests/data/stata8_117.dta rename to pandas/tests/io/data/stata8_117.dta diff --git a/pandas/io/tests/data/stata9_115.dta b/pandas/tests/io/data/stata9_115.dta similarity index 100% rename from pandas/io/tests/data/stata9_115.dta rename to pandas/tests/io/data/stata9_115.dta diff --git a/pandas/io/tests/data/stata9_117.dta b/pandas/tests/io/data/stata9_117.dta similarity index 100% rename from pandas/io/tests/data/stata9_117.dta rename to 
pandas/tests/io/data/stata9_117.dta diff --git a/pandas/io/tests/data/test1.csv b/pandas/tests/io/data/test1.csv similarity index 100% rename from pandas/io/tests/data/test1.csv rename to pandas/tests/io/data/test1.csv diff --git a/pandas/io/tests/data/test1.xls b/pandas/tests/io/data/test1.xls similarity index 100% rename from pandas/io/tests/data/test1.xls rename to pandas/tests/io/data/test1.xls diff --git a/pandas/io/tests/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm similarity index 100% rename from pandas/io/tests/data/test1.xlsm rename to pandas/tests/io/data/test1.xlsm diff --git a/pandas/io/tests/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx similarity index 100% rename from pandas/io/tests/data/test1.xlsx rename to pandas/tests/io/data/test1.xlsx diff --git a/pandas/io/tests/data/test2.xls b/pandas/tests/io/data/test2.xls similarity index 100% rename from pandas/io/tests/data/test2.xls rename to pandas/tests/io/data/test2.xls diff --git a/pandas/io/tests/data/test2.xlsm b/pandas/tests/io/data/test2.xlsm similarity index 100% rename from pandas/io/tests/data/test2.xlsm rename to pandas/tests/io/data/test2.xlsm diff --git a/pandas/io/tests/data/test2.xlsx b/pandas/tests/io/data/test2.xlsx similarity index 100% rename from pandas/io/tests/data/test2.xlsx rename to pandas/tests/io/data/test2.xlsx diff --git a/pandas/io/tests/data/test3.xls b/pandas/tests/io/data/test3.xls similarity index 100% rename from pandas/io/tests/data/test3.xls rename to pandas/tests/io/data/test3.xls diff --git a/pandas/io/tests/data/test3.xlsm b/pandas/tests/io/data/test3.xlsm similarity index 100% rename from pandas/io/tests/data/test3.xlsm rename to pandas/tests/io/data/test3.xlsm diff --git a/pandas/io/tests/data/test3.xlsx b/pandas/tests/io/data/test3.xlsx similarity index 100% rename from pandas/io/tests/data/test3.xlsx rename to pandas/tests/io/data/test3.xlsx diff --git a/pandas/io/tests/data/test4.xls b/pandas/tests/io/data/test4.xls similarity index 100% rename from pandas/io/tests/data/test4.xls rename to pandas/tests/io/data/test4.xls diff --git a/pandas/io/tests/data/test4.xlsm b/pandas/tests/io/data/test4.xlsm similarity index 100% rename from pandas/io/tests/data/test4.xlsm rename to pandas/tests/io/data/test4.xlsm diff --git a/pandas/io/tests/data/test4.xlsx b/pandas/tests/io/data/test4.xlsx similarity index 100% rename from pandas/io/tests/data/test4.xlsx rename to pandas/tests/io/data/test4.xlsx diff --git a/pandas/io/tests/data/test5.xls b/pandas/tests/io/data/test5.xls similarity index 100% rename from pandas/io/tests/data/test5.xls rename to pandas/tests/io/data/test5.xls diff --git a/pandas/io/tests/data/test5.xlsm b/pandas/tests/io/data/test5.xlsm similarity index 100% rename from pandas/io/tests/data/test5.xlsm rename to pandas/tests/io/data/test5.xlsm diff --git a/pandas/io/tests/data/test5.xlsx b/pandas/tests/io/data/test5.xlsx similarity index 100% rename from pandas/io/tests/data/test5.xlsx rename to pandas/tests/io/data/test5.xlsx diff --git a/pandas/io/tests/data/test_converters.xls b/pandas/tests/io/data/test_converters.xls similarity index 100% rename from pandas/io/tests/data/test_converters.xls rename to pandas/tests/io/data/test_converters.xls diff --git a/pandas/io/tests/data/test_converters.xlsm b/pandas/tests/io/data/test_converters.xlsm similarity index 100% rename from pandas/io/tests/data/test_converters.xlsm rename to pandas/tests/io/data/test_converters.xlsm diff --git a/pandas/io/tests/data/test_converters.xlsx b/pandas/tests/io/data/test_converters.xlsx 
similarity index 100% rename from pandas/io/tests/data/test_converters.xlsx rename to pandas/tests/io/data/test_converters.xlsx diff --git a/pandas/io/tests/data/test_index_name_pre17.xls b/pandas/tests/io/data/test_index_name_pre17.xls similarity index 100% rename from pandas/io/tests/data/test_index_name_pre17.xls rename to pandas/tests/io/data/test_index_name_pre17.xls diff --git a/pandas/io/tests/data/test_index_name_pre17.xlsm b/pandas/tests/io/data/test_index_name_pre17.xlsm similarity index 100% rename from pandas/io/tests/data/test_index_name_pre17.xlsm rename to pandas/tests/io/data/test_index_name_pre17.xlsm diff --git a/pandas/io/tests/data/test_index_name_pre17.xlsx b/pandas/tests/io/data/test_index_name_pre17.xlsx similarity index 100% rename from pandas/io/tests/data/test_index_name_pre17.xlsx rename to pandas/tests/io/data/test_index_name_pre17.xlsx diff --git a/pandas/io/tests/data/test_mmap.csv b/pandas/tests/io/data/test_mmap.csv similarity index 100% rename from pandas/io/tests/data/test_mmap.csv rename to pandas/tests/io/data/test_mmap.csv diff --git a/pandas/io/tests/data/test_multisheet.xls b/pandas/tests/io/data/test_multisheet.xls similarity index 100% rename from pandas/io/tests/data/test_multisheet.xls rename to pandas/tests/io/data/test_multisheet.xls diff --git a/pandas/io/tests/data/test_multisheet.xlsm b/pandas/tests/io/data/test_multisheet.xlsm similarity index 100% rename from pandas/io/tests/data/test_multisheet.xlsm rename to pandas/tests/io/data/test_multisheet.xlsm diff --git a/pandas/io/tests/data/test_multisheet.xlsx b/pandas/tests/io/data/test_multisheet.xlsx similarity index 100% rename from pandas/io/tests/data/test_multisheet.xlsx rename to pandas/tests/io/data/test_multisheet.xlsx diff --git a/pandas/io/tests/data/test_squeeze.xls b/pandas/tests/io/data/test_squeeze.xls similarity index 100% rename from pandas/io/tests/data/test_squeeze.xls rename to pandas/tests/io/data/test_squeeze.xls diff --git a/pandas/io/tests/data/test_squeeze.xlsm b/pandas/tests/io/data/test_squeeze.xlsm similarity index 100% rename from pandas/io/tests/data/test_squeeze.xlsm rename to pandas/tests/io/data/test_squeeze.xlsm diff --git a/pandas/io/tests/data/test_squeeze.xlsx b/pandas/tests/io/data/test_squeeze.xlsx similarity index 100% rename from pandas/io/tests/data/test_squeeze.xlsx rename to pandas/tests/io/data/test_squeeze.xlsx diff --git a/pandas/io/tests/data/test_types.xls b/pandas/tests/io/data/test_types.xls similarity index 100% rename from pandas/io/tests/data/test_types.xls rename to pandas/tests/io/data/test_types.xls diff --git a/pandas/io/tests/data/test_types.xlsm b/pandas/tests/io/data/test_types.xlsm similarity index 100% rename from pandas/io/tests/data/test_types.xlsm rename to pandas/tests/io/data/test_types.xlsm diff --git a/pandas/io/tests/data/test_types.xlsx b/pandas/tests/io/data/test_types.xlsx similarity index 100% rename from pandas/io/tests/data/test_types.xlsx rename to pandas/tests/io/data/test_types.xlsx diff --git a/pandas/io/tests/data/testdateoverflow.xls b/pandas/tests/io/data/testdateoverflow.xls similarity index 100% rename from pandas/io/tests/data/testdateoverflow.xls rename to pandas/tests/io/data/testdateoverflow.xls diff --git a/pandas/io/tests/data/testdateoverflow.xlsm b/pandas/tests/io/data/testdateoverflow.xlsm similarity index 100% rename from pandas/io/tests/data/testdateoverflow.xlsm rename to pandas/tests/io/data/testdateoverflow.xlsm diff --git a/pandas/io/tests/data/testdateoverflow.xlsx 
b/pandas/tests/io/data/testdateoverflow.xlsx similarity index 100% rename from pandas/io/tests/data/testdateoverflow.xlsx rename to pandas/tests/io/data/testdateoverflow.xlsx diff --git a/pandas/io/tests/data/testdtype.xls b/pandas/tests/io/data/testdtype.xls similarity index 100% rename from pandas/io/tests/data/testdtype.xls rename to pandas/tests/io/data/testdtype.xls diff --git a/pandas/io/tests/data/testdtype.xlsm b/pandas/tests/io/data/testdtype.xlsm similarity index 100% rename from pandas/io/tests/data/testdtype.xlsm rename to pandas/tests/io/data/testdtype.xlsm diff --git a/pandas/io/tests/data/testdtype.xlsx b/pandas/tests/io/data/testdtype.xlsx similarity index 100% rename from pandas/io/tests/data/testdtype.xlsx rename to pandas/tests/io/data/testdtype.xlsx diff --git a/pandas/io/tests/data/testmultiindex.xls b/pandas/tests/io/data/testmultiindex.xls similarity index 100% rename from pandas/io/tests/data/testmultiindex.xls rename to pandas/tests/io/data/testmultiindex.xls diff --git a/pandas/io/tests/data/testmultiindex.xlsm b/pandas/tests/io/data/testmultiindex.xlsm similarity index 100% rename from pandas/io/tests/data/testmultiindex.xlsm rename to pandas/tests/io/data/testmultiindex.xlsm diff --git a/pandas/io/tests/data/testmultiindex.xlsx b/pandas/tests/io/data/testmultiindex.xlsx similarity index 100% rename from pandas/io/tests/data/testmultiindex.xlsx rename to pandas/tests/io/data/testmultiindex.xlsx diff --git a/pandas/io/tests/data/testskiprows.xls b/pandas/tests/io/data/testskiprows.xls similarity index 100% rename from pandas/io/tests/data/testskiprows.xls rename to pandas/tests/io/data/testskiprows.xls diff --git a/pandas/io/tests/data/testskiprows.xlsm b/pandas/tests/io/data/testskiprows.xlsm similarity index 100% rename from pandas/io/tests/data/testskiprows.xlsm rename to pandas/tests/io/data/testskiprows.xlsm diff --git a/pandas/io/tests/data/testskiprows.xlsx b/pandas/tests/io/data/testskiprows.xlsx similarity index 100% rename from pandas/io/tests/data/testskiprows.xlsx rename to pandas/tests/io/data/testskiprows.xlsx diff --git a/pandas/io/tests/data/times_1900.xls b/pandas/tests/io/data/times_1900.xls similarity index 100% rename from pandas/io/tests/data/times_1900.xls rename to pandas/tests/io/data/times_1900.xls diff --git a/pandas/io/tests/data/times_1900.xlsm b/pandas/tests/io/data/times_1900.xlsm similarity index 100% rename from pandas/io/tests/data/times_1900.xlsm rename to pandas/tests/io/data/times_1900.xlsm diff --git a/pandas/io/tests/data/times_1900.xlsx b/pandas/tests/io/data/times_1900.xlsx similarity index 100% rename from pandas/io/tests/data/times_1900.xlsx rename to pandas/tests/io/data/times_1900.xlsx diff --git a/pandas/io/tests/data/times_1904.xls b/pandas/tests/io/data/times_1904.xls similarity index 100% rename from pandas/io/tests/data/times_1904.xls rename to pandas/tests/io/data/times_1904.xls diff --git a/pandas/io/tests/data/times_1904.xlsm b/pandas/tests/io/data/times_1904.xlsm similarity index 100% rename from pandas/io/tests/data/times_1904.xlsm rename to pandas/tests/io/data/times_1904.xlsm diff --git a/pandas/io/tests/data/times_1904.xlsx b/pandas/tests/io/data/times_1904.xlsx similarity index 100% rename from pandas/io/tests/data/times_1904.xlsx rename to pandas/tests/io/data/times_1904.xlsx diff --git a/pandas/io/tests/data/tips.csv b/pandas/tests/io/data/tips.csv similarity index 100% rename from pandas/io/tests/data/tips.csv rename to pandas/tests/io/data/tips.csv diff --git a/pandas/io/tests/data/valid_markup.html 
b/pandas/tests/io/data/valid_markup.html similarity index 100% rename from pandas/io/tests/data/valid_markup.html rename to pandas/tests/io/data/valid_markup.html diff --git a/pandas/io/tests/data/wikipedia_states.html b/pandas/tests/io/data/wikipedia_states.html similarity index 100% rename from pandas/io/tests/data/wikipedia_states.html rename to pandas/tests/io/data/wikipedia_states.html diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py similarity index 100% rename from pandas/io/tests/generate_legacy_storage_files.py rename to pandas/tests/io/generate_legacy_storage_files.py diff --git a/pandas/io/tests/json/__init__.py b/pandas/tests/io/json/__init__.py similarity index 100% rename from pandas/io/tests/json/__init__.py rename to pandas/tests/io/json/__init__.py diff --git a/pandas/io/tests/json/data/tsframe_iso_v012.json b/pandas/tests/io/json/data/tsframe_iso_v012.json similarity index 100% rename from pandas/io/tests/json/data/tsframe_iso_v012.json rename to pandas/tests/io/json/data/tsframe_iso_v012.json diff --git a/pandas/io/tests/json/data/tsframe_v012.json b/pandas/tests/io/json/data/tsframe_v012.json similarity index 100% rename from pandas/io/tests/json/data/tsframe_v012.json rename to pandas/tests/io/json/data/tsframe_v012.json diff --git a/pandas/io/tests/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py similarity index 100% rename from pandas/io/tests/json/test_normalize.py rename to pandas/tests/io/json/test_normalize.py diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py similarity index 100% rename from pandas/io/tests/json/test_pandas.py rename to pandas/tests/io/json/test_pandas.py diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py similarity index 100% rename from pandas/io/tests/json/test_ujson.py rename to pandas/tests/io/json/test_ujson.py diff --git a/pandas/io/tests/parser/__init__.py b/pandas/tests/io/parser/__init__.py similarity index 100% rename from pandas/io/tests/parser/__init__.py rename to pandas/tests/io/parser/__init__.py diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py similarity index 100% rename from pandas/io/tests/parser/c_parser_only.py rename to pandas/tests/io/parser/c_parser_only.py diff --git a/pandas/io/tests/parser/comment.py b/pandas/tests/io/parser/comment.py similarity index 100% rename from pandas/io/tests/parser/comment.py rename to pandas/tests/io/parser/comment.py diff --git a/pandas/io/tests/parser/common.py b/pandas/tests/io/parser/common.py similarity index 100% rename from pandas/io/tests/parser/common.py rename to pandas/tests/io/parser/common.py diff --git a/pandas/io/tests/parser/compression.py b/pandas/tests/io/parser/compression.py similarity index 100% rename from pandas/io/tests/parser/compression.py rename to pandas/tests/io/parser/compression.py diff --git a/pandas/io/tests/parser/converters.py b/pandas/tests/io/parser/converters.py similarity index 100% rename from pandas/io/tests/parser/converters.py rename to pandas/tests/io/parser/converters.py diff --git a/pandas/io/tests/parser/data/iris.csv b/pandas/tests/io/parser/data/iris.csv similarity index 100% rename from pandas/io/tests/parser/data/iris.csv rename to pandas/tests/io/parser/data/iris.csv diff --git a/pandas/io/tests/parser/data/salaries.csv b/pandas/tests/io/parser/data/salaries.csv similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv rename to 
pandas/tests/io/parser/data/salaries.csv diff --git a/pandas/io/tests/parser/data/salaries.csv.bz2 b/pandas/tests/io/parser/data/salaries.csv.bz2 similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.bz2 rename to pandas/tests/io/parser/data/salaries.csv.bz2 diff --git a/pandas/io/tests/parser/data/salaries.csv.gz b/pandas/tests/io/parser/data/salaries.csv.gz similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.gz rename to pandas/tests/io/parser/data/salaries.csv.gz diff --git a/pandas/io/tests/parser/data/salaries.csv.xz b/pandas/tests/io/parser/data/salaries.csv.xz similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.xz rename to pandas/tests/io/parser/data/salaries.csv.xz diff --git a/pandas/io/tests/parser/data/salaries.csv.zip b/pandas/tests/io/parser/data/salaries.csv.zip similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.zip rename to pandas/tests/io/parser/data/salaries.csv.zip diff --git a/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv b/pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv similarity index 100% rename from pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv rename to pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv diff --git a/pandas/io/tests/parser/data/test1.csv b/pandas/tests/io/parser/data/test1.csv similarity index 100% rename from pandas/io/tests/parser/data/test1.csv rename to pandas/tests/io/parser/data/test1.csv diff --git a/pandas/io/tests/parser/data/test1.csv.bz2 b/pandas/tests/io/parser/data/test1.csv.bz2 similarity index 100% rename from pandas/io/tests/parser/data/test1.csv.bz2 rename to pandas/tests/io/parser/data/test1.csv.bz2 diff --git a/pandas/io/tests/parser/data/test1.csv.gz b/pandas/tests/io/parser/data/test1.csv.gz similarity index 100% rename from pandas/io/tests/parser/data/test1.csv.gz rename to pandas/tests/io/parser/data/test1.csv.gz diff --git a/pandas/io/tests/parser/data/test2.csv b/pandas/tests/io/parser/data/test2.csv similarity index 100% rename from pandas/io/tests/parser/data/test2.csv rename to pandas/tests/io/parser/data/test2.csv diff --git a/pandas/io/tests/parser/data/test_mmap.csv b/pandas/tests/io/parser/data/test_mmap.csv similarity index 100% rename from pandas/io/tests/parser/data/test_mmap.csv rename to pandas/tests/io/parser/data/test_mmap.csv diff --git a/pandas/io/tests/parser/data/tips.csv b/pandas/tests/io/parser/data/tips.csv similarity index 100% rename from pandas/io/tests/parser/data/tips.csv rename to pandas/tests/io/parser/data/tips.csv diff --git a/pandas/io/tests/parser/data/unicode_series.csv b/pandas/tests/io/parser/data/unicode_series.csv similarity index 100% rename from pandas/io/tests/parser/data/unicode_series.csv rename to pandas/tests/io/parser/data/unicode_series.csv diff --git a/pandas/io/tests/parser/data/utf16_ex.txt b/pandas/tests/io/parser/data/utf16_ex.txt similarity index 100% rename from pandas/io/tests/parser/data/utf16_ex.txt rename to pandas/tests/io/parser/data/utf16_ex.txt diff --git a/pandas/io/tests/parser/dialect.py b/pandas/tests/io/parser/dialect.py similarity index 100% rename from pandas/io/tests/parser/dialect.py rename to pandas/tests/io/parser/dialect.py diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py similarity index 100% rename from pandas/io/tests/parser/dtypes.py rename to pandas/tests/io/parser/dtypes.py diff --git a/pandas/io/tests/parser/header.py b/pandas/tests/io/parser/header.py similarity index 100% rename from pandas/io/tests/parser/header.py 
rename to pandas/tests/io/parser/header.py diff --git a/pandas/io/tests/parser/index_col.py b/pandas/tests/io/parser/index_col.py similarity index 100% rename from pandas/io/tests/parser/index_col.py rename to pandas/tests/io/parser/index_col.py diff --git a/pandas/io/tests/parser/multithread.py b/pandas/tests/io/parser/multithread.py similarity index 100% rename from pandas/io/tests/parser/multithread.py rename to pandas/tests/io/parser/multithread.py diff --git a/pandas/io/tests/parser/na_values.py b/pandas/tests/io/parser/na_values.py similarity index 100% rename from pandas/io/tests/parser/na_values.py rename to pandas/tests/io/parser/na_values.py diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py similarity index 100% rename from pandas/io/tests/parser/parse_dates.py rename to pandas/tests/io/parser/parse_dates.py diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py similarity index 100% rename from pandas/io/tests/parser/python_parser_only.py rename to pandas/tests/io/parser/python_parser_only.py diff --git a/pandas/io/tests/parser/quoting.py b/pandas/tests/io/parser/quoting.py similarity index 100% rename from pandas/io/tests/parser/quoting.py rename to pandas/tests/io/parser/quoting.py diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/tests/io/parser/skiprows.py similarity index 100% rename from pandas/io/tests/parser/skiprows.py rename to pandas/tests/io/parser/skiprows.py diff --git a/pandas/io/tests/parser/test_network.py b/pandas/tests/io/parser/test_network.py similarity index 100% rename from pandas/io/tests/parser/test_network.py rename to pandas/tests/io/parser/test_network.py diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py similarity index 100% rename from pandas/io/tests/parser/test_parsers.py rename to pandas/tests/io/parser/test_parsers.py diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py similarity index 100% rename from pandas/io/tests/parser/test_read_fwf.py rename to pandas/tests/io/parser/test_read_fwf.py diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py similarity index 100% rename from pandas/io/tests/parser/test_textreader.py rename to pandas/tests/io/parser/test_textreader.py diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py similarity index 100% rename from pandas/io/tests/parser/test_unsupported.py rename to pandas/tests/io/parser/test_unsupported.py diff --git a/pandas/io/tests/parser/usecols.py b/pandas/tests/io/parser/usecols.py similarity index 100% rename from pandas/io/tests/parser/usecols.py rename to pandas/tests/io/parser/usecols.py diff --git a/pandas/io/tests/sas/data/DEMO_G.csv b/pandas/tests/io/sas/data/DEMO_G.csv similarity index 100% rename from pandas/io/tests/sas/data/DEMO_G.csv rename to pandas/tests/io/sas/data/DEMO_G.csv diff --git a/pandas/io/tests/sas/data/DEMO_G.xpt b/pandas/tests/io/sas/data/DEMO_G.xpt similarity index 100% rename from pandas/io/tests/sas/data/DEMO_G.xpt rename to pandas/tests/io/sas/data/DEMO_G.xpt diff --git a/pandas/io/tests/sas/data/DRXFCD_G.csv b/pandas/tests/io/sas/data/DRXFCD_G.csv similarity index 100% rename from pandas/io/tests/sas/data/DRXFCD_G.csv rename to pandas/tests/io/sas/data/DRXFCD_G.csv diff --git a/pandas/io/tests/sas/data/DRXFCD_G.xpt b/pandas/tests/io/sas/data/DRXFCD_G.xpt similarity index 100% rename from 
pandas/io/tests/sas/data/DRXFCD_G.xpt rename to pandas/tests/io/sas/data/DRXFCD_G.xpt diff --git a/pandas/io/tests/sas/data/SSHSV1_A.csv b/pandas/tests/io/sas/data/SSHSV1_A.csv similarity index 100% rename from pandas/io/tests/sas/data/SSHSV1_A.csv rename to pandas/tests/io/sas/data/SSHSV1_A.csv diff --git a/pandas/io/tests/sas/data/SSHSV1_A.xpt b/pandas/tests/io/sas/data/SSHSV1_A.xpt similarity index 100% rename from pandas/io/tests/sas/data/SSHSV1_A.xpt rename to pandas/tests/io/sas/data/SSHSV1_A.xpt diff --git a/pandas/io/tests/sas/data/airline.csv b/pandas/tests/io/sas/data/airline.csv similarity index 100% rename from pandas/io/tests/sas/data/airline.csv rename to pandas/tests/io/sas/data/airline.csv diff --git a/pandas/io/tests/sas/data/airline.sas7bdat b/pandas/tests/io/sas/data/airline.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/airline.sas7bdat rename to pandas/tests/io/sas/data/airline.sas7bdat diff --git a/pandas/io/tests/sas/data/paxraw_d_short.csv b/pandas/tests/io/sas/data/paxraw_d_short.csv similarity index 100% rename from pandas/io/tests/sas/data/paxraw_d_short.csv rename to pandas/tests/io/sas/data/paxraw_d_short.csv diff --git a/pandas/io/tests/sas/data/paxraw_d_short.xpt b/pandas/tests/io/sas/data/paxraw_d_short.xpt similarity index 100% rename from pandas/io/tests/sas/data/paxraw_d_short.xpt rename to pandas/tests/io/sas/data/paxraw_d_short.xpt diff --git a/pandas/io/tests/sas/data/productsales.csv b/pandas/tests/io/sas/data/productsales.csv similarity index 100% rename from pandas/io/tests/sas/data/productsales.csv rename to pandas/tests/io/sas/data/productsales.csv diff --git a/pandas/io/tests/sas/data/productsales.sas7bdat b/pandas/tests/io/sas/data/productsales.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/productsales.sas7bdat rename to pandas/tests/io/sas/data/productsales.sas7bdat diff --git a/pandas/io/tests/sas/data/test1.sas7bdat b/pandas/tests/io/sas/data/test1.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test1.sas7bdat rename to pandas/tests/io/sas/data/test1.sas7bdat diff --git a/pandas/io/tests/sas/data/test10.sas7bdat b/pandas/tests/io/sas/data/test10.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test10.sas7bdat rename to pandas/tests/io/sas/data/test10.sas7bdat diff --git a/pandas/io/tests/sas/data/test11.sas7bdat b/pandas/tests/io/sas/data/test11.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test11.sas7bdat rename to pandas/tests/io/sas/data/test11.sas7bdat diff --git a/pandas/io/tests/sas/data/test12.sas7bdat b/pandas/tests/io/sas/data/test12.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test12.sas7bdat rename to pandas/tests/io/sas/data/test12.sas7bdat diff --git a/pandas/io/tests/sas/data/test13.sas7bdat b/pandas/tests/io/sas/data/test13.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test13.sas7bdat rename to pandas/tests/io/sas/data/test13.sas7bdat diff --git a/pandas/io/tests/sas/data/test14.sas7bdat b/pandas/tests/io/sas/data/test14.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test14.sas7bdat rename to pandas/tests/io/sas/data/test14.sas7bdat diff --git a/pandas/io/tests/sas/data/test15.sas7bdat b/pandas/tests/io/sas/data/test15.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test15.sas7bdat rename to pandas/tests/io/sas/data/test15.sas7bdat diff --git a/pandas/io/tests/sas/data/test16.sas7bdat b/pandas/tests/io/sas/data/test16.sas7bdat similarity 
index 100% rename from pandas/io/tests/sas/data/test16.sas7bdat rename to pandas/tests/io/sas/data/test16.sas7bdat diff --git a/pandas/io/tests/sas/data/test2.sas7bdat b/pandas/tests/io/sas/data/test2.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test2.sas7bdat rename to pandas/tests/io/sas/data/test2.sas7bdat diff --git a/pandas/io/tests/sas/data/test3.sas7bdat b/pandas/tests/io/sas/data/test3.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test3.sas7bdat rename to pandas/tests/io/sas/data/test3.sas7bdat diff --git a/pandas/io/tests/sas/data/test4.sas7bdat b/pandas/tests/io/sas/data/test4.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test4.sas7bdat rename to pandas/tests/io/sas/data/test4.sas7bdat diff --git a/pandas/io/tests/sas/data/test5.sas7bdat b/pandas/tests/io/sas/data/test5.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test5.sas7bdat rename to pandas/tests/io/sas/data/test5.sas7bdat diff --git a/pandas/io/tests/sas/data/test6.sas7bdat b/pandas/tests/io/sas/data/test6.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test6.sas7bdat rename to pandas/tests/io/sas/data/test6.sas7bdat diff --git a/pandas/io/tests/sas/data/test7.sas7bdat b/pandas/tests/io/sas/data/test7.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test7.sas7bdat rename to pandas/tests/io/sas/data/test7.sas7bdat diff --git a/pandas/io/tests/sas/data/test8.sas7bdat b/pandas/tests/io/sas/data/test8.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test8.sas7bdat rename to pandas/tests/io/sas/data/test8.sas7bdat diff --git a/pandas/io/tests/sas/data/test9.sas7bdat b/pandas/tests/io/sas/data/test9.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test9.sas7bdat rename to pandas/tests/io/sas/data/test9.sas7bdat diff --git a/pandas/io/tests/sas/data/test_12659.csv b/pandas/tests/io/sas/data/test_12659.csv similarity index 100% rename from pandas/io/tests/sas/data/test_12659.csv rename to pandas/tests/io/sas/data/test_12659.csv diff --git a/pandas/io/tests/sas/data/test_12659.sas7bdat b/pandas/tests/io/sas/data/test_12659.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test_12659.sas7bdat rename to pandas/tests/io/sas/data/test_12659.sas7bdat diff --git a/pandas/io/tests/sas/data/test_sas7bdat_1.csv b/pandas/tests/io/sas/data/test_sas7bdat_1.csv similarity index 100% rename from pandas/io/tests/sas/data/test_sas7bdat_1.csv rename to pandas/tests/io/sas/data/test_sas7bdat_1.csv diff --git a/pandas/io/tests/sas/data/test_sas7bdat_2.csv b/pandas/tests/io/sas/data/test_sas7bdat_2.csv similarity index 100% rename from pandas/io/tests/sas/data/test_sas7bdat_2.csv rename to pandas/tests/io/sas/data/test_sas7bdat_2.csv diff --git a/pandas/io/tests/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py similarity index 100% rename from pandas/io/tests/sas/test_sas.py rename to pandas/tests/io/sas/test_sas.py diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py similarity index 100% rename from pandas/io/tests/sas/test_sas7bdat.py rename to pandas/tests/io/sas/test_sas7bdat.py diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py similarity index 100% rename from pandas/io/tests/sas/test_xport.py rename to pandas/tests/io/sas/test_xport.py diff --git a/pandas/io/tests/test_clipboard.py b/pandas/tests/io/test_clipboard.py similarity index 100% rename from pandas/io/tests/test_clipboard.py rename to 
pandas/tests/io/test_clipboard.py diff --git a/pandas/io/tests/test_common.py b/pandas/tests/io/test_common.py similarity index 100% rename from pandas/io/tests/test_common.py rename to pandas/tests/io/test_common.py diff --git a/pandas/io/tests/test_date_converters.py b/pandas/tests/io/test_date_converters.py similarity index 100% rename from pandas/io/tests/test_date_converters.py rename to pandas/tests/io/test_date_converters.py diff --git a/pandas/io/tests/test_excel.py b/pandas/tests/io/test_excel.py similarity index 100% rename from pandas/io/tests/test_excel.py rename to pandas/tests/io/test_excel.py diff --git a/pandas/io/tests/test_feather.py b/pandas/tests/io/test_feather.py similarity index 100% rename from pandas/io/tests/test_feather.py rename to pandas/tests/io/test_feather.py diff --git a/pandas/io/tests/test_gbq.py b/pandas/tests/io/test_gbq.py similarity index 100% rename from pandas/io/tests/test_gbq.py rename to pandas/tests/io/test_gbq.py diff --git a/pandas/io/tests/test_html.py b/pandas/tests/io/test_html.py similarity index 100% rename from pandas/io/tests/test_html.py rename to pandas/tests/io/test_html.py diff --git a/pandas/io/tests/test_packers.py b/pandas/tests/io/test_packers.py similarity index 99% rename from pandas/io/tests/test_packers.py rename to pandas/tests/io/test_packers.py index 4bb6f4a69bab3..911cd8164571d 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -795,7 +795,7 @@ class TestMsgpack(): @classmethod def setup_class(cls): - from pandas.io.tests.generate_legacy_storage_files import ( + from pandas.tests.io.generate_legacy_storage_files import ( create_msgpack_data, create_data) cls.data = create_msgpack_data() cls.all_data = create_data() diff --git a/pandas/io/tests/test_pickle.py b/pandas/tests/io/test_pickle.py similarity index 99% rename from pandas/io/tests/test_pickle.py rename to pandas/tests/io/test_pickle.py index 588b2d5f04888..5445c506b050c 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -33,7 +33,7 @@ class TestPickle(): @classmethod def setup_class(cls): - from pandas.io.tests.generate_legacy_storage_files import ( + from pandas.tests.io.generate_legacy_storage_files import ( create_pickle_data) cls.data = create_pickle_data() cls.path = u('__%s__.pickle' % tm.rands(10)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/tests/io/test_pytables.py similarity index 100% rename from pandas/io/tests/test_pytables.py rename to pandas/tests/io/test_pytables.py diff --git a/pandas/io/tests/test_s3.py b/pandas/tests/io/test_s3.py similarity index 100% rename from pandas/io/tests/test_s3.py rename to pandas/tests/io/test_s3.py diff --git a/pandas/io/tests/test_sql.py b/pandas/tests/io/test_sql.py similarity index 100% rename from pandas/io/tests/test_sql.py rename to pandas/tests/io/test_sql.py diff --git a/pandas/io/tests/test_stata.py b/pandas/tests/io/test_stata.py similarity index 100% rename from pandas/io/tests/test_stata.py rename to pandas/tests/io/test_stata.py diff --git a/pandas/sparse/tests/__init__.py b/pandas/tests/msgpack/__init__.py similarity index 100% rename from pandas/sparse/tests/__init__.py rename to pandas/tests/msgpack/__init__.py diff --git a/pandas/tests/test_msgpack/test_buffer.py b/pandas/tests/msgpack/test_buffer.py similarity index 100% rename from pandas/tests/test_msgpack/test_buffer.py rename to pandas/tests/msgpack/test_buffer.py diff --git a/pandas/tests/test_msgpack/test_case.py b/pandas/tests/msgpack/test_case.py similarity index 
100% rename from pandas/tests/test_msgpack/test_case.py rename to pandas/tests/msgpack/test_case.py diff --git a/pandas/tests/test_msgpack/test_except.py b/pandas/tests/msgpack/test_except.py similarity index 100% rename from pandas/tests/test_msgpack/test_except.py rename to pandas/tests/msgpack/test_except.py diff --git a/pandas/tests/test_msgpack/test_extension.py b/pandas/tests/msgpack/test_extension.py similarity index 100% rename from pandas/tests/test_msgpack/test_extension.py rename to pandas/tests/msgpack/test_extension.py diff --git a/pandas/tests/test_msgpack/test_format.py b/pandas/tests/msgpack/test_format.py similarity index 100% rename from pandas/tests/test_msgpack/test_format.py rename to pandas/tests/msgpack/test_format.py diff --git a/pandas/tests/test_msgpack/test_limits.py b/pandas/tests/msgpack/test_limits.py similarity index 100% rename from pandas/tests/test_msgpack/test_limits.py rename to pandas/tests/msgpack/test_limits.py diff --git a/pandas/tests/test_msgpack/test_newspec.py b/pandas/tests/msgpack/test_newspec.py similarity index 100% rename from pandas/tests/test_msgpack/test_newspec.py rename to pandas/tests/msgpack/test_newspec.py diff --git a/pandas/tests/test_msgpack/test_obj.py b/pandas/tests/msgpack/test_obj.py similarity index 100% rename from pandas/tests/test_msgpack/test_obj.py rename to pandas/tests/msgpack/test_obj.py diff --git a/pandas/tests/test_msgpack/test_pack.py b/pandas/tests/msgpack/test_pack.py similarity index 100% rename from pandas/tests/test_msgpack/test_pack.py rename to pandas/tests/msgpack/test_pack.py diff --git a/pandas/tests/test_msgpack/test_read_size.py b/pandas/tests/msgpack/test_read_size.py similarity index 100% rename from pandas/tests/test_msgpack/test_read_size.py rename to pandas/tests/msgpack/test_read_size.py diff --git a/pandas/tests/test_msgpack/test_seq.py b/pandas/tests/msgpack/test_seq.py similarity index 100% rename from pandas/tests/test_msgpack/test_seq.py rename to pandas/tests/msgpack/test_seq.py diff --git a/pandas/tests/test_msgpack/test_sequnpack.py b/pandas/tests/msgpack/test_sequnpack.py similarity index 100% rename from pandas/tests/test_msgpack/test_sequnpack.py rename to pandas/tests/msgpack/test_sequnpack.py diff --git a/pandas/tests/test_msgpack/test_subtype.py b/pandas/tests/msgpack/test_subtype.py similarity index 100% rename from pandas/tests/test_msgpack/test_subtype.py rename to pandas/tests/msgpack/test_subtype.py diff --git a/pandas/tests/test_msgpack/test_unpack.py b/pandas/tests/msgpack/test_unpack.py similarity index 100% rename from pandas/tests/test_msgpack/test_unpack.py rename to pandas/tests/msgpack/test_unpack.py diff --git a/pandas/tests/test_msgpack/test_unpack_raw.py b/pandas/tests/msgpack/test_unpack_raw.py similarity index 100% rename from pandas/tests/test_msgpack/test_unpack_raw.py rename to pandas/tests/msgpack/test_unpack_raw.py diff --git a/pandas/tests/test_msgpack/__init__.py b/pandas/tests/sparse/__init__.py similarity index 100% rename from pandas/tests/test_msgpack/__init__.py rename to pandas/tests/sparse/__init__.py diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/tests/sparse/test_arithmetics.py similarity index 100% rename from pandas/sparse/tests/test_arithmetics.py rename to pandas/tests/sparse/test_arithmetics.py diff --git a/pandas/sparse/tests/test_array.py b/pandas/tests/sparse/test_array.py similarity index 100% rename from pandas/sparse/tests/test_array.py rename to pandas/tests/sparse/test_array.py diff --git 
a/pandas/sparse/tests/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py similarity index 100% rename from pandas/sparse/tests/test_combine_concat.py rename to pandas/tests/sparse/test_combine_concat.py diff --git a/pandas/sparse/tests/test_format.py b/pandas/tests/sparse/test_format.py similarity index 100% rename from pandas/sparse/tests/test_format.py rename to pandas/tests/sparse/test_format.py diff --git a/pandas/sparse/tests/test_frame.py b/pandas/tests/sparse/test_frame.py similarity index 100% rename from pandas/sparse/tests/test_frame.py rename to pandas/tests/sparse/test_frame.py diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/tests/sparse/test_groupby.py similarity index 100% rename from pandas/sparse/tests/test_groupby.py rename to pandas/tests/sparse/test_groupby.py diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/tests/sparse/test_indexing.py similarity index 100% rename from pandas/sparse/tests/test_indexing.py rename to pandas/tests/sparse/test_indexing.py diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/tests/sparse/test_libsparse.py similarity index 100% rename from pandas/sparse/tests/test_libsparse.py rename to pandas/tests/sparse/test_libsparse.py diff --git a/pandas/sparse/tests/test_list.py b/pandas/tests/sparse/test_list.py similarity index 100% rename from pandas/sparse/tests/test_list.py rename to pandas/tests/sparse/test_list.py diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/tests/sparse/test_pivot.py similarity index 100% rename from pandas/sparse/tests/test_pivot.py rename to pandas/tests/sparse/test_pivot.py diff --git a/pandas/sparse/tests/test_series.py b/pandas/tests/sparse/test_series.py similarity index 100% rename from pandas/sparse/tests/test_series.py rename to pandas/tests/sparse/test_series.py diff --git a/pandas/tools/tests/__init__.py b/pandas/tests/tools/__init__.py similarity index 100% rename from pandas/tools/tests/__init__.py rename to pandas/tests/tools/__init__.py diff --git a/pandas/tools/tests/data/allow_exact_matches.csv b/pandas/tests/tools/data/allow_exact_matches.csv similarity index 100% rename from pandas/tools/tests/data/allow_exact_matches.csv rename to pandas/tests/tools/data/allow_exact_matches.csv diff --git a/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv b/pandas/tests/tools/data/allow_exact_matches_and_tolerance.csv similarity index 100% rename from pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv rename to pandas/tests/tools/data/allow_exact_matches_and_tolerance.csv diff --git a/pandas/tools/tests/data/asof.csv b/pandas/tests/tools/data/asof.csv similarity index 100% rename from pandas/tools/tests/data/asof.csv rename to pandas/tests/tools/data/asof.csv diff --git a/pandas/tools/tests/data/asof2.csv b/pandas/tests/tools/data/asof2.csv similarity index 100% rename from pandas/tools/tests/data/asof2.csv rename to pandas/tests/tools/data/asof2.csv diff --git a/pandas/tools/tests/data/cut_data.csv b/pandas/tests/tools/data/cut_data.csv similarity index 100% rename from pandas/tools/tests/data/cut_data.csv rename to pandas/tests/tools/data/cut_data.csv diff --git a/pandas/tools/tests/data/quotes.csv b/pandas/tests/tools/data/quotes.csv similarity index 100% rename from pandas/tools/tests/data/quotes.csv rename to pandas/tests/tools/data/quotes.csv diff --git a/pandas/tools/tests/data/quotes2.csv b/pandas/tests/tools/data/quotes2.csv similarity index 100% rename from pandas/tools/tests/data/quotes2.csv rename to 
pandas/tests/tools/data/quotes2.csv diff --git a/pandas/tools/tests/data/tolerance.csv b/pandas/tests/tools/data/tolerance.csv similarity index 100% rename from pandas/tools/tests/data/tolerance.csv rename to pandas/tests/tools/data/tolerance.csv diff --git a/pandas/tools/tests/data/trades.csv b/pandas/tests/tools/data/trades.csv similarity index 100% rename from pandas/tools/tests/data/trades.csv rename to pandas/tests/tools/data/trades.csv diff --git a/pandas/tools/tests/data/trades2.csv b/pandas/tests/tools/data/trades2.csv similarity index 100% rename from pandas/tools/tests/data/trades2.csv rename to pandas/tests/tools/data/trades2.csv diff --git a/pandas/tools/tests/test_concat.py b/pandas/tests/tools/test_concat.py similarity index 100% rename from pandas/tools/tests/test_concat.py rename to pandas/tests/tools/test_concat.py diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tests/tools/test_hashing.py similarity index 100% rename from pandas/tools/tests/test_hashing.py rename to pandas/tests/tools/test_hashing.py diff --git a/pandas/tools/tests/test_join.py b/pandas/tests/tools/test_join.py similarity index 99% rename from pandas/tools/tests/test_join.py rename to pandas/tests/tools/test_join.py index fe5821a637205..ab42b1212301b 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tests/tools/test_join.py @@ -11,7 +11,7 @@ import pandas._join as _join import pandas.util.testing as tm -from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS +from pandas.tests.tools.test_merge import get_test_data, N, NGROUPS a_ = np.array diff --git a/pandas/tools/tests/test_merge.py b/pandas/tests/tools/test_merge.py similarity index 100% rename from pandas/tools/tests/test_merge.py rename to pandas/tests/tools/test_merge.py diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tests/tools/test_merge_asof.py similarity index 100% rename from pandas/tools/tests/test_merge_asof.py rename to pandas/tests/tools/test_merge_asof.py diff --git a/pandas/tools/tests/test_merge_ordered.py b/pandas/tests/tools/test_merge_ordered.py similarity index 100% rename from pandas/tools/tests/test_merge_ordered.py rename to pandas/tests/tools/test_merge_ordered.py diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tests/tools/test_pivot.py similarity index 100% rename from pandas/tools/tests/test_pivot.py rename to pandas/tests/tools/test_pivot.py diff --git a/pandas/tools/tests/test_tile.py b/pandas/tests/tools/test_tile.py similarity index 100% rename from pandas/tools/tests/test_tile.py rename to pandas/tests/tools/test_tile.py diff --git a/pandas/tools/tests/test_util.py b/pandas/tests/tools/test_util.py similarity index 100% rename from pandas/tools/tests/test_util.py rename to pandas/tests/tools/test_util.py diff --git a/setup.py b/setup.py index edec53e9cefb0..cbcadce459c67 100755 --- a/setup.py +++ b/setup.py @@ -622,12 +622,10 @@ def pxd(name): version=versioneer.get_version(), packages=['pandas', 'pandas.api', - 'pandas.api.tests', 'pandas.api.types', 'pandas.compat', 'pandas.compat.numpy', 'pandas.computation', - 'pandas.computation.tests', 'pandas.core', 'pandas.indexes', 'pandas.io', @@ -635,59 +633,61 @@ def pxd(name): 'pandas.io.sas', 'pandas.formats', 'pandas.sparse', - 'pandas.sparse.tests', 'pandas.stats', 'pandas.util', 'pandas.tests', + 'pandas.tests.api', + 'pandas.tests.computation', 'pandas.tests.frame', 'pandas.tests.indexes', 'pandas.tests.indexes.datetimes', 'pandas.tests.indexes.timedeltas', 'pandas.tests.indexes.period', + 'pandas.tests.io', + 
'pandas.tests.io.json', + 'pandas.tests.io.parser', + 'pandas.tests.io.sas', 'pandas.tests.groupby', 'pandas.tests.series', 'pandas.tests.formats', + 'pandas.tests.msgpack', 'pandas.tests.scalar', + 'pandas.tests.sparse', 'pandas.tests.tseries', + 'pandas.tests.tools', 'pandas.tests.types', - 'pandas.tests.test_msgpack', 'pandas.tests.plotting', 'pandas.tools', - 'pandas.tools.tests', 'pandas.tseries', 'pandas.types', - 'pandas.io.tests', - 'pandas.io.tests.json', - 'pandas.io.tests.parser', - 'pandas.io.tests.sas', 'pandas.msgpack', 'pandas.util.clipboard' ], - package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', - 'tests/data/legacy_pickle/*/*.pickle', - 'tests/data/legacy_msgpack/*/*.msgpack', - 'tests/data/*.csv*', - 'tests/data/*.dta', - 'tests/data/*.pickle', - 'tests/data/*.txt', - 'tests/data/*.xls', - 'tests/data/*.xlsx', - 'tests/data/*.xlsm', - 'tests/data/*.table', - 'tests/parser/data/*.csv', - 'tests/parser/data/*.gz', - 'tests/parser/data/*.bz2', - 'tests/parser/data/*.txt', - 'tests/sas/data/*.csv', - 'tests/sas/data/*.xpt', - 'tests/sas/data/*.sas7bdat', - 'tests/data/*.html', - 'tests/data/html_encoding/*.html', - 'tests/json/data/*.json'], - 'pandas.tools': ['tests/data/*.csv'], - 'pandas.tests': ['data/*.csv'], + package_data={'pandas.tests': ['data/*.csv'], 'pandas.tests.formats': ['data/*.csv'], 'pandas.tests.indexes': ['data/*.pickle'], + 'pandas.tests.io': ['data/legacy_hdf/*.h5', + 'data/legacy_pickle/*/*.pickle', + 'data/legacy_msgpack/*/*.msgpack', + 'data/*.csv*', + 'data/*.dta', + 'data/*.pickle', + 'data/*.txt', + 'data/*.xls', + 'data/*.xlsx', + 'data/*.xlsm', + 'data/*.table', + 'parser/data/*.csv', + 'parser/data/*.gz', + 'parser/data/*.bz2', + 'parser/data/*.txt', + 'sas/data/*.csv', + 'sas/data/*.xpt', + 'sas/data/*.sas7bdat', + 'data/*.html', + 'data/html_encoding/*.html', + 'json/data/*.json'], + 'pandas.tests.tools': ['data/*.csv'], 'pandas.tests.tseries': ['data/*.pickle'] }, ext_modules=extensions, From 1cad3873b2a7099932b1050d94c715a11615cb66 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 10:09:27 -0500 Subject: [PATCH 134/338] TST: fix locations for github based url tests --- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/test_excel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 0671901fc170a..b667eed346355 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -617,7 +617,7 @@ def test_read_csv_parse_simple_list(self): def test_url(self): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/io/tests/parser/data/salaries.csv') + 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salaries.csv') diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index a22c89184f20d..0c2b443cffe52 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -581,7 +581,7 @@ def test_read_xlrd_Book(self): @tm.network def test_read_from_http_url(self): url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/io/tests/data/test1' + self.ext) + 'pandas/tests/io/data/test1' + self.ext) url_table = read_excel(url) local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) From a9adfb6ad26a0fbc8e5735c5df5219bd9cfb7f51 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 10:31:47 -0500 Subject: [PATCH 
135/338] DOC: fix path in whatsnew --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9f86c777c665d..aa620bce0df59 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -91,7 +91,7 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). url = 'https://github.com/{repo}/raw/{branch}/{path}'.format( repo = 'pandas-dev/pandas', branch = 'master', - path = 'pandas/io/tests/parser/data/salaries.csv.bz2', + path = 'pandas/tests/io/parser/data/salaries.csv.bz2', ) df = pd.read_table(url, compression='infer') # default, infer compression df = pd.read_table(url, compression='bz2') # explicitly specify compression From 0a4773a63b807de7039c042f1696986eed17ff5d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 11 Feb 2017 16:17:27 -0500 Subject: [PATCH 136/338] TST: use xdist for multiple cpu testing closes #15369 --- .travis.yml | 3 +- ci/script_multi.sh | 32 +++ ci/{script.sh => script_single.sh} | 10 +- pandas/tests/indexes/datetimes/test_ops.py | 244 +++++++++++---------- pandas/tests/io/test_clipboard.py | 1 + pandas/tests/io/test_pytables.py | 7 +- pandas/tests/io/test_sql.py | 19 +- pandas/tests/test_window.py | 83 ++++--- setup.cfg | 2 + test_fast.sh | 3 +- 10 files changed, 223 insertions(+), 181 deletions(-) create mode 100755 ci/script_multi.sh rename ci/{script.sh => script_single.sh} (63%) diff --git a/.travis.yml b/.travis.yml index 2ff5d508d0371..6b90e49b336b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -320,7 +320,8 @@ before_script: script: - echo "script start" - ci/run_build_docs.sh - - ci/script.sh + - ci/script_single.sh + - ci/script_multi.sh - ci/lint.sh - echo "script done" diff --git a/ci/script_multi.sh b/ci/script_multi.sh new file mode 100755 index 0000000000000..83f8427cc57ad --- /dev/null +++ b/ci/script_multi.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +echo "[script multi]" + +source activate pandas + +# don't run the tests for the doc build +if [ x"$DOC_BUILD" != x"" ]; then + exit 0 +fi + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE"; + echo "Setting LC_ALL to $LOCALE_OVERRIDE" + + pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' + python -c "$pycmd" +fi + +if [ "$BUILD_TEST" ]; then + echo "We are not running pytest as this is simply a build test." +elif [ "$COVERAGE" ]; then + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas +else + echo pytest -n 2 -m "not single" $TEST_ARGS pandas + pytest -n 2 -m "not single" $TEST_ARGS pandas # TODO: doctest +fi + +RET="$?" + +exit "$RET" diff --git a/ci/script.sh b/ci/script_single.sh similarity index 63% rename from ci/script.sh rename to ci/script_single.sh index c52fa0fdb33a3..38021fcac5721 100755 --- a/ci/script.sh +++ b/ci/script_single.sh @@ -1,6 +1,6 @@ #!/bin/bash -echo "inside $0" +echo "[script_single]" source activate pandas @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." 
elif [ "$COVERAGE" ]; then - echo pytest -s --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas - pytest -s --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas else - echo pytest $TEST_ARGS pandas - pytest $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" $TEST_ARGS pandas + pytest -m "single" $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 9a968a42c4247..8eb9128d8d1c8 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -1,7 +1,9 @@ +import pytest import warnings import numpy as np from datetime import timedelta +from itertools import product import pandas as pd import pandas.tslib as tslib import pandas.util.testing as tm @@ -958,134 +960,134 @@ def test_second(self): tm.assert_index_equal(r1, r2) -class TestDatetimeIndex(tm.TestCase): - - # GH 10699 - def test_datetime64_with_DateOffset(self): - for klass, assert_func in zip([Series, DatetimeIndex], - [self.assert_series_equal, - tm.assert_index_equal]): - s = klass(date_range('2000-01-01', '2000-01-31'), name='a') - result = s + pd.DateOffset(years=1) - result2 = pd.DateOffset(years=1) + s - exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') +# GH 10699 +@pytest.mark.parametrize('klass,assert_func', zip([Series, DatetimeIndex], + [tm.assert_series_equal, + tm.assert_index_equal])) +def test_datetime64_with_DateOffset(klass, assert_func): + s = klass(date_range('2000-01-01', '2000-01-31'), name='a') + result = s + pd.DateOffset(years=1) + result2 = pd.DateOffset(years=1) + s + exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') + assert_func(result, exp) + assert_func(result2, exp) + + result = s - pd.DateOffset(years=1) + exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') + assert_func(result, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.Day() + result2 = pd.offsets.Day() + s + exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), + Timestamp('2000-02-16', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.MonthEnd() + result2 = pd.offsets.MonthEnd() + s + exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), + Timestamp('2000-02-29', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + # array of offsets - valid for Series only + if klass is Series: + with tm.assert_produces_warning(PerformanceWarning): + s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.MonthEnd()]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') + ]) assert_func(result, exp) - assert_func(result2, exp) - result = s - pd.DateOffset(years=1) - exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') + # same offset + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.DateOffset(years=1)]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) assert_func(result, exp) - s = klass([Timestamp('2000-01-15 00:15:00', 
tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')], name='a') - result = s + pd.offsets.Day() - result2 = pd.offsets.Day() + s - exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), - Timestamp('2000-02-16', tz='US/Central')], name='a') - assert_func(result, exp) - assert_func(result2, exp) - - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')], name='a') - result = s + pd.offsets.MonthEnd() - result2 = pd.offsets.MonthEnd() + s - exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], name='a') - assert_func(result, exp) - assert_func(result2, exp) - - # array of offsets - valid for Series only - if klass is Series: - with tm.assert_produces_warning(PerformanceWarning): - s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) - result = s + Series([pd.offsets.DateOffset(years=1), - pd.offsets.MonthEnd()]) - exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') - ]) - assert_func(result, exp) - - # same offset - result = s + Series([pd.offsets.DateOffset(years=1), - pd.offsets.DateOffset(years=1)]) - exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) - assert_func(result, exp) - - s = klass([Timestamp('2000-01-05 00:15:00'), + s = klass([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-03-31'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31'), + Timestamp('2000-05-15'), + Timestamp('2001-06-15')]) + + # DateOffset relativedelta fastpath + relative_kwargs = [('years', 2), ('months', 5), ('days', 3), + ('hours', 5), ('minutes', 10), ('seconds', 2), + ('microseconds', 5)] + for i, kwd in enumerate(relative_kwargs): + op = pd.DateOffset(**dict([kwd])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + + # assert these are equal on a piecewise basis + offsets = ['YearBegin', ('YearBegin', {'month': 5}), + 'YearEnd', ('YearEnd', {'month': 5}), + 'MonthBegin', 'MonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', + 'Week', ('Week', {'weekday': 3}), + 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', + 'CustomBusinessDay', 'CDay', 'CBMonthEnd', + 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', + 'BusinessHour', 'BYearBegin', 'BYearEnd', + 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}), + ('FY5253Quarter', {'qtr_with_extra_week': 1, + 'startingMonth': 1, + 'weekday': 2, + 'variation': 'nearest'}), + ('FY5253', {'weekday': 0, + 'startingMonth': 2, + 'variation': + 'nearest'}), + ('WeekOfMonth', {'weekday': 2, + 'week': 2}), + 'Easter', ('DateOffset', {'day': 4}), + ('DateOffset', {'month': 5})] + + with warnings.catch_warnings(record=True): + for normalize in (True, False): + for do in offsets: + if isinstance(do, tuple): + do, kwargs = do + else: + do = do + kwargs = {} + + for n in [0, 5]: + if (do in ['WeekOfMonth', 'LastWeekOfMonth', + 'FY5253Quarter', 'FY5253'] and n == 0): + continue + op = getattr(pd.offsets, do)(n, + normalize=normalize, + **kwargs) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + assert_func(klass([op + x for x in s]), op + s) + + +@pytest.mark.parametrize('years,months', product([-1, 0, 1], [-2, 0, 2])) +def test_shift_months(years, months): + s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), 
Timestamp('2000-01-31 00:23:00'), Timestamp('2000-01-01'), - Timestamp('2000-03-31'), Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) - - # DateOffset relativedelta fastpath - relative_kwargs = [('years', 2), ('months', 5), ('days', 3), - ('hours', 5), ('minutes', 10), ('seconds', 2), - ('microseconds', 5)] - for i, kwd in enumerate(relative_kwargs): - op = pd.DateOffset(**dict([kwd])) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - - # assert these are equal on a piecewise basis - offsets = ['YearBegin', ('YearBegin', {'month': 5}), 'YearEnd', - ('YearEnd', {'month': 5}), 'MonthBegin', 'MonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'Week', ('Week', { - 'weekday': 3 - }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', - 'CustomBusinessDay', 'CDay', 'CBMonthEnd', - 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', - 'BusinessHour', 'BYearBegin', 'BYearEnd', - 'BQuarterBegin', ('LastWeekOfMonth', { - 'weekday': 2 - }), ('FY5253Quarter', {'qtr_with_extra_week': 1, - 'startingMonth': 1, - 'weekday': 2, - 'variation': 'nearest'}), - ('FY5253', {'weekday': 0, - 'startingMonth': 2, - 'variation': - 'nearest'}), ('WeekOfMonth', {'weekday': 2, - 'week': 2}), - 'Easter', ('DateOffset', {'day': 4}), - ('DateOffset', {'month': 5})] - - with warnings.catch_warnings(record=True): - for normalize in (True, False): - for do in offsets: - if isinstance(do, tuple): - do, kwargs = do - else: - do = do - kwargs = {} - - for n in [0, 5]: - if (do in ['WeekOfMonth', 'LastWeekOfMonth', - 'FY5253Quarter', 'FY5253'] and n == 0): - continue - op = getattr(pd.offsets, do)(n, - normalize=normalize, - **kwargs) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - assert_func(klass([op + x for x in s]), op + s) - - def test_shift_months(self): - s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp( - '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( - '2000-02-29'), Timestamp('2000-12-31')]) - for years in [-1, 0, 1]: - for months in [-2, 0, 2]: - actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + - months)) - expected = DatetimeIndex([x + offsets.DateOffset( - years=years, months=months) for x in s]) - tm.assert_index_equal(actual, expected) + Timestamp('2000-12-31')]) + actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + + months)) + expected = DatetimeIndex([x + offsets.DateOffset( + years=years, months=months) for x in s]) + tm.assert_index_equal(actual, expected) class TestBusinessDatetimeIndex(tm.TestCase): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3abd1093362f4..2e701143357e3 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -20,6 +20,7 @@ _DEPS_INSTALLED = 0 +@pytest.mark.single @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") class TestClipboard(tm.TestCase): diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 3fa0eb2ef52dc..a840ff46aa845 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -36,12 +36,6 @@ from pandas import concat, Timestamp from pandas import compat from pandas.compat import range, lrange, u - -try: - import tables -except ImportError: - 
pytest.skip('no pytables') - from distutils.version import LooseVersion _default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' @@ -165,6 +159,7 @@ def tearDown(self): pass +@pytest.mark.single class TestHDFStore(Base, tm.TestCase): def test_factory_fun(self): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a6f4d96001021..78560611da7aa 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,13 +18,13 @@ """ from __future__ import print_function +import pytest import unittest import sqlite3 import csv import os import sys -import pytest import warnings import numpy as np import pandas as pd @@ -839,6 +839,7 @@ def test_unicode_column_name(self): df.to_sql('test_unicode', self.conn, index=False) +@pytest.mark.single class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi, unittest.TestCase): """ Test the public API as it would be used directly @@ -1024,10 +1025,12 @@ def tearDown(self): super(_EngineToConnMixin, self).tearDown() +@pytest.mark.single class TestSQLApiConn(_EngineToConnMixin, TestSQLApi, unittest.TestCase): pass +@pytest.mark.single class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi, unittest.TestCase): """ Test the public sqlite connection fallback API @@ -1875,30 +1878,36 @@ def test_schema_support(self): tm.assert_frame_equal(res1, res2) +@pytest.mark.single class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy, unittest.TestCase): pass +@pytest.mark.single class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn, unittest.TestCase): pass +@pytest.mark.single class TestPostgreSQLAlchemy(_TestPostgreSQLAlchemy, _TestSQLAlchemy, unittest.TestCase): pass +@pytest.mark.single class TestPostgreSQLAlchemyConn(_TestPostgreSQLAlchemy, _TestSQLAlchemyConn, unittest.TestCase): pass +@pytest.mark.single class TestSQLiteAlchemy(_TestSQLiteAlchemy, _TestSQLAlchemy, unittest.TestCase): pass +@pytest.mark.single class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn, unittest.TestCase): pass @@ -1907,6 +1916,7 @@ class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn, # ----------------------------------------------------------------------------- # -- Test Sqlite / MySQL fallback +@pytest.mark.single class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest, unittest.TestCase): """ Test the fallback mode against an in-memory sqlite database. 
@@ -2133,6 +2143,7 @@ def _skip_if_no_pymysql(): pytest.skip('pymysql not installed, skipping') +@pytest.mark.single class TestXSQLite(SQLiteMixIn, tm.TestCase): def setUp(self): @@ -2343,6 +2354,7 @@ def clean_up(test_table_to_drop): clean_up(table_name) +@pytest.mark.single class TestSQLFlavorDeprecation(tm.TestCase): """ gh-13611: test that the 'flavor' parameter @@ -2367,8 +2379,9 @@ def test_deprecated_flavor(self): getattr(sql, func)(self.con, flavor='sqlite') -@unittest.skip("gh-13611: there is no support for MySQL " - "if SQLAlchemy is not installed") +@pytest.mark.single +@pytest.mark.skip(reason="gh-13611: there is no support for MySQL " + "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn, tm.TestCase): @classmethod diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3add568c1ea99..1bb1f91423a9d 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2,6 +2,7 @@ import pytest import sys import warnings +from warnings import catch_warnings from datetime import datetime from numpy.random import randn @@ -291,8 +292,7 @@ def test_how_compat(self): for op in ['mean', 'sum', 'std', 'var', 'kurt', 'skew']: for t in ['rolling', 'expanding']: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): dfunc = getattr(pd, "{0}_{1}".format(t, op)) if dfunc is None: @@ -526,7 +526,7 @@ def setUp(self): def test_deprecations(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): mom.rolling_mean(np.ones(10), 3, center=True, axis=0) mom.rolling_mean(Series(np.ones(10)), 3, center=True, axis=0) @@ -791,7 +791,7 @@ def test_cmov_mean(self): xp = np.array([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, 12.818, 12.952, np.nan, np.nan]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_mean(vals, 5, center=True) tm.assert_almost_equal(xp, rs) @@ -808,7 +808,7 @@ def test_cmov_window(self): xp = np.array([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, 12.818, 12.952, np.nan, np.nan]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) tm.assert_almost_equal(xp, rs) @@ -823,19 +823,19 @@ def test_cmov_window_corner(self): # all nan vals = np.empty(10, dtype=float) vals.fill(np.nan) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) self.assertTrue(np.isnan(rs).all()) # empty vals = np.array([]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) self.assertEqual(len(rs), 0) # shorter than window vals = np.random.randn(5) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 10, 'boxcar') self.assertTrue(np.isnan(rs).all()) self.assertEqual(len(rs), 5) @@ -1014,16 +1014,16 @@ def test_cmov_window_special_linear_range(self): tm.assert_series_equal(xp, rs) def test_rolling_median(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_moment_func(mom.rolling_median, np.median, name='median') def test_rolling_min(self): - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_moment_func(mom.rolling_min, np.min, name='min') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): a = np.array([1, 2, 3, 4, 5]) b = mom.rolling_min(a, window=100, min_periods=1) tm.assert_almost_equal(b, np.ones(len(a))) @@ -1033,10 +1033,10 @@ def test_rolling_min(self): def test_rolling_max(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_moment_func(mom.rolling_max, np.max, name='max') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): a = np.array([1, 2, 3, 4, 5], dtype=np.float64) b = mom.rolling_max(a, window=100, min_periods=1) tm.assert_almost_equal(a, b) @@ -1102,11 +1102,11 @@ def test_rolling_apply_out_of_bounds(self): arr = np.arange(4) # it works! - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_apply(arr, 10, np.sum) self.assertTrue(isnull(result).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_apply(arr, 10, np.sum, min_periods=1) tm.assert_almost_equal(result, result) @@ -1117,19 +1117,19 @@ def test_rolling_std(self): name='std', ddof=0) def test_rolling_std_1obs(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1) expected = np.array([np.nan] * 5) tm.assert_almost_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1, ddof=0) expected = np.zeros(5) tm.assert_almost_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), 3, min_periods=2) self.assertTrue(np.isnan(result[2])) @@ -1142,11 +1142,11 @@ def test_rolling_std_neg_sqrt(self): a = np.array([0.0011448196318903589, 0.00028718669878572767, 0.00028718669878572767, 0.00028718669878572767, 0.00028718669878572767]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): b = mom.rolling_std(a, window=3) self.assertTrue(np.isfinite(b[2:]).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): b = mom.ewmstd(a, span=3) self.assertTrue(np.isfinite(b[2:]).all()) @@ -1184,25 +1184,25 @@ def test_fperr_robustness(self): if sys.byteorder != "little": arr = arr.byteswap().newbyteorder() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_sum(arr, 2) self.assertTrue((result[1:] >= 0).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_mean(arr, 2) self.assertTrue((result[1:] >= 0).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_var(arr, 2) self.assertTrue((result[1:] >= 0).all()) # #2527, ugh arr = np.array([0.00012456, 0.0003, 0]) - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_mean(arr, 1) self.assertTrue(result[-1] >= 0) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_mean(-arr, 1) self.assertTrue(result[-1] <= 0) @@ -1327,15 +1327,13 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): # catch a freq deprecation warning if freq is provided and not # None - w = FutureWarning if freq is not None else None - with tm.assert_produces_warning(w, check_stacklevel=False): + with catch_warnings(record=True): r = obj.rolling(window=window, min_periods=min_periods, freq=freq, center=center) return getattr(r, name)(**kwargs) # check via the moments API - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): return f(obj, window=window, min_periods=min_periods, freq=freq, center=center, **kwargs) @@ -1419,7 +1417,7 @@ def test_ewma(self): arr = np.zeros(1000) arr[5] = 1 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.ewma(arr, span=100, adjust=False).sum() self.assertTrue(np.abs(result - 1) < 1e-2) @@ -1506,7 +1504,7 @@ def test_ewmvol(self): self._check_ew(mom.ewmvol, name='vol') def test_ewma_span_com_args(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) tm.assert_almost_equal(A, B) @@ -1515,7 +1513,7 @@ def test_ewma_span_com_args(self): self.assertRaises(ValueError, mom.ewma, self.arr) def test_ewma_halflife_arg(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): A = mom.ewma(self.arr, com=13.932726172912965) B = mom.ewma(self.arr, halflife=10.0) tm.assert_almost_equal(A, B) @@ -1530,7 +1528,7 @@ def test_ewma_halflife_arg(self): def test_ewma_alpha_old_api(self): # GH 10789 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): a = mom.ewma(self.arr, alpha=0.61722699889169674) b = mom.ewma(self.arr, com=0.62014947789973052) c = mom.ewma(self.arr, span=2.240298955799461) @@ -1541,7 +1539,7 @@ def test_ewma_alpha_old_api(self): def test_ewma_alpha_arg_old_api(self): # GH 10789 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.assertRaises(ValueError, mom.ewma, self.arr) self.assertRaises(ValueError, mom.ewma, self.arr, com=10.0, alpha=0.5) @@ -1598,13 +1596,12 @@ def test_ew_empty_arrays(self): funcs = [mom.ewma, mom.ewmvol, mom.ewmvar] for f in funcs: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): result = f(arr, 3) tm.assert_almost_equal(result, arr) def _check_ew(self, func, name=None): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_ew_ndarray(func, name=name) self._check_ew_structures(func, name=name) @@ -2870,7 +2867,7 @@ def test_rolling_max_gh6297(self): expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max() tm.assert_series_equal(expected, x) @@ -2889,14 +2886,14 @@ 
def test_rolling_max_how_resample(self): # Default how should be max expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max() tm.assert_series_equal(expected, x) # Now specify median (10.0) expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max(how='median') tm.assert_series_equal(expected, x) @@ -2904,7 +2901,7 @@ def test_rolling_max_how_resample(self): v = (4.0 + 10.0 + 20.0) / 3.0 expected = Series([0.0, 1.0, 2.0, 3.0, v], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max(how='mean') tm.assert_series_equal(expected, x) @@ -2923,7 +2920,7 @@ def test_rolling_min_how_resample(self): # Default how should be min expected = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): r = series.rolling(window=1, freq='D') tm.assert_series_equal(expected, r.min()) @@ -2942,7 +2939,7 @@ def test_rolling_median_how_resample(self): # Default how should be median expected = Series([0.0, 1.0, 2.0, 3.0, 10], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').median() tm.assert_series_equal(expected, x) diff --git a/setup.cfg b/setup.cfg index 45d98dd733f1f..b9de7a3532209 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,3 +25,5 @@ split_penalty_logical_operator = 30 # Silencing the warning until then addopts = --disable-pytest-warnings testpaths = pandas +markers = + single: mark a test as single cpu only diff --git a/test_fast.sh b/test_fast.sh index 0b394cffa3d74..43eb376f879cd 100755 --- a/test_fast.sh +++ b/test_fast.sh @@ -1,2 +1 @@ -# nosetests -A "not slow and not network" pandas --with-id $* -pytest pandas --skip-slow +pytest pandas --skip-slow --skip-network -m "not single" -n 4 From 1d9569dbf7f8cd742fcacfe762c9a10a34fbaf18 Mon Sep 17 00:00:00 2001 From: Andrew Kittredge Date: Sun, 12 Feb 2017 12:37:13 -0500 Subject: [PATCH 137/338] Typo (#15377) --- doc/source/advanced.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 8833d73cb0a84..b6f015c15606d 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -59,7 +59,7 @@ Creating a MultiIndex (hierarchical index) object The ``MultiIndex`` object is the hierarchical analogue of the standard ``Index`` object which typically stores the axis labels in pandas objects. You -can think of ``MultiIndex`` an array of tuples where each tuple is unique. A +can think of ``MultiIndex`` as an array of tuples where each tuple is unique. 
A ``MultiIndex`` can be created from a list of arrays (using ``MultiIndex.from_arrays``), an array of tuples (using ``MultiIndex.from_tuples``), or a crossed set of iterables (using From 7949cec049ae6bff366683d162a8fe225cf07ed0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 11:46:48 -0500 Subject: [PATCH 138/338] TST: control skipping of numexpr tests if its installed / used --- pandas/tests/test_expressions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 0318757f76a11..3032a288032a2 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -20,9 +20,6 @@ import pandas.util.testing as tm -if not expr._USE_NUMEXPR: - numexpr = pytest.importorskip('numexpr') - _frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') _frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64') _mixed = DataFrame({'A': _frame['A'].copy(), @@ -50,6 +47,7 @@ _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) +@pytest.mark.skipif(not expr._USE_NUMEXPR, reason='not using numexpr') class TestExpressions(tm.TestCase): def setUp(self): From f239129128663f646e7315cd98126697c28a97a2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 12:51:11 -0500 Subject: [PATCH 139/338] TST: make test_gbq single cpu --- pandas/tests/io/test_gbq.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 0868edd2147b5..0317ebc49ad2c 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -253,6 +253,7 @@ def test_generate_bq_schema_deprecated(): gbq.generate_bq_schema(df) +@pytest.mark.single class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): @@ -298,6 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) +@pytest.mark.single class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common() @@ -329,6 +331,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) +@pytest.mark.single class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() @@ -360,6 +363,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) +@pytest.mark.single class GBQUnitTests(tm.TestCase): def setUp(self): @@ -446,6 +450,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self): private_key=re.sub('[a-z]', '9', _get_private_key_contents())) +@pytest.mark.single class TestReadGBQIntegration(tm.TestCase): @classmethod @@ -499,6 +504,7 @@ def test_should_read_as_service_account_with_key_contents(self): tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) +@pytest.mark.single class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): @classmethod @@ -901,6 +907,7 @@ def test_configuration_without_query(self): configuration=config) +@pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. 
@@ -1215,6 +1222,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') +@pytest.mark.single class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1272,6 +1280,7 @@ def test_upload_data(self): self.assertEqual(result['num_rows'][0], test_size) +@pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. From 29666cc402597085340677f481f4cf8b224a85e4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 21:43:50 -0500 Subject: [PATCH 140/338] ENH: expose Int64VectorData in hashtable.pxd --- pandas/hashtable.pxd | 14 ++++++++++++++ pandas/src/hashtable_class_helper.pxi.in | 12 +++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index cd06b938310a8..cabfa43a76f26 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -1,5 +1,6 @@ from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t, int64_t, float64_t) +from numpy cimport ndarray # prototypes for sharing @@ -35,3 +36,16 @@ cdef class StringHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) + +cdef struct Int64VectorData: + int64_t *data + size_t n, m + +cdef class Int64Vector: + cdef Int64VectorData *data + cdef ndarray ao + + cdef resize(self) + cpdef to_array(self) + cdef inline void append(self, int64_t x) + cdef extend(self, int64_t[:] x) diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 74c38dfdb393e..ef385ba7dca1c 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -24,10 +24,14 @@ dtypes = [('Float64', 'float64', 'float64_t'), {{for name, dtype, arg in dtypes}} +{{if dtype != 'int64'}} + ctypedef struct {{name}}VectorData: {{arg}} *data size_t n, m +{{endif}} + @cython.wraparound(False) @cython.boundscheck(False) @@ -65,9 +69,11 @@ dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), cdef class {{name}}Vector: + {{if dtype != 'int64'}} cdef: {{name}}VectorData *data ndarray ao + {{endif}} def __cinit__(self): self.data = <{{name}}VectorData *>PyMem_Malloc( @@ -92,7 +98,7 @@ cdef class {{name}}Vector: def __len__(self): return self.data.n - def to_array(self): + cpdef to_array(self): self.ao.resize(self.data.n) self.data.m = self.data.n return self.ao @@ -104,6 +110,10 @@ cdef class {{name}}Vector: append_data_{{dtype}}(self.data, x) + cdef extend(self, {{arg}}[:] x): + for i in range(len(x)): + self.append(x[i]) + {{endfor}} cdef class StringVector: From 0c38886281b1eb691c5ea2896a0dc458b7e87b0a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 21:54:11 -0500 Subject: [PATCH 141/338] TST: xfail most test_gbq tests for now --- pandas/tests/io/test_gbq.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 0317ebc49ad2c..316afaf306011 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated(): gbq.generate_bq_schema(df) -@pytest.mark.single +@pytest.mark.xfail(run=False, 
reason="flaky tests") class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): @@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common() @@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() @@ -363,7 +363,6 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) -@pytest.mark.single class GBQUnitTests(tm.TestCase): def setUp(self): @@ -450,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self): private_key=re.sub('[a-z]', '9', _get_private_key_contents())) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestReadGBQIntegration(tm.TestCase): @classmethod @@ -504,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self): tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): @classmethod @@ -907,7 +906,7 @@ def test_configuration_without_query(self): configuration=config) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1022,8 +1021,6 @@ def test_upload_data_if_table_exists_append(self): def test_upload_data_if_table_exists_replace(self): - pytest.skip("buggy test") - destination_table = DESTINATION_TABLE + "4" test_size = 10 @@ -1222,7 +1219,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1280,7 +1277,7 @@ def test_upload_data(self): self.assertEqual(result['num_rows'][0], test_size) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="flaky tests") class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. From 8eda10b16889d69d4a0b67ebfb3190356abcfad7 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 14 Feb 2017 08:29:18 -0500 Subject: [PATCH 142/338] TST: Fix gbq integration tests. gbq._Dataset.dataset() would not return full results This PR resolves an issue where `gbq._Dataset.datasets()` would not return all datasets under a Google BigQuery project. If `'nextPageToken'` is populated, then another `datasets().list()` request should be sent with `'pageToken'` set to collect more results. 
In the past few days, additional datasets were added under the Google BigQuery project id used by pandas as part of the following github project : https://github.com/pydata/pandas-gbq . The addition of datasets caused many gbq unit tests to fail because in function `clean_gbq_environment()`, we check to see if the dataset exists using the incomplete results from `gbq._Dataset.datasets()` before we attempt to delete it. Author: Anthonios Partheniou Closes #15381 from parthea/fix-broken-gbq-unit-tests and squashes the following commits: 61bc1e7 [Anthonios Partheniou] TST: Fix gbq tests. gbq.dataset()/gbq.tables would not return full results. --- pandas/io/gbq.py | 67 ++++++++++++++++++++++++------------- pandas/tests/io/test_gbq.py | 16 ++++----- 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 169a2b1df9b4c..0ffb6b4bf8c05 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1056,21 +1056,32 @@ def datasets(self): List of datasets under the specific project """ - try: - list_dataset_response = self.service.datasets().list( - projectId=self.project_id).execute().get('datasets', None) + dataset_list = [] + next_page_token = None + first_query = True - if not list_dataset_response: - return [] + while first_query or next_page_token: + first_query = False - dataset_list = list() + try: + list_dataset_response = self.service.datasets().list( + projectId=self.project_id, + pageToken=next_page_token).execute() - for row_num, raw_row in enumerate(list_dataset_response): - dataset_list.append(raw_row['datasetReference']['datasetId']) + dataset_response = list_dataset_response.get('datasets') + next_page_token = list_dataset_response.get('nextPageToken') - return dataset_list - except self.http_error as ex: - self.process_http_error(ex) + if not dataset_response: + return dataset_list + + for row_num, raw_row in enumerate(dataset_response): + dataset_list.append( + raw_row['datasetReference']['datasetId']) + + except self.http_error as ex: + self.process_http_error(ex) + + return dataset_list def create(self, dataset_id): """ Create a dataset in Google BigQuery @@ -1140,19 +1151,29 @@ def tables(self, dataset_id): List of tables under the specific dataset """ - try: - list_table_response = self.service.tables().list( - projectId=self.project_id, - datasetId=dataset_id).execute().get('tables', None) + table_list = [] + next_page_token = None + first_query = True - if not list_table_response: - return [] + while first_query or next_page_token: + first_query = False - table_list = list() + try: + list_table_response = self.service.tables().list( + projectId=self.project_id, + datasetId=dataset_id, + pageToken=next_page_token).execute() - for row_num, raw_row in enumerate(list_table_response): - table_list.append(raw_row['tableReference']['tableId']) + table_response = list_table_response.get('tables') + next_page_token = list_table_response.get('nextPageToken') - return table_list - except self.http_error as ex: - self.process_http_error(ex) + if not table_response: + return table_list + + for row_num, raw_row in enumerate(table_response): + table_list.append(raw_row['tableReference']['tableId']) + + except self.http_error as ex: + self.process_http_error(ex) + + return table_list diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 316afaf306011..dfbf3ca69b111 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated(): 
gbq.generate_bq_schema(df) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): @@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common() @@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() @@ -449,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self): private_key=re.sub('[a-z]', '9', _get_private_key_contents())) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestReadGBQIntegration(tm.TestCase): @classmethod @@ -503,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self): tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): @classmethod @@ -906,7 +906,7 @@ def test_configuration_without_query(self): configuration=config) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1219,7 +1219,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1277,7 +1277,7 @@ def test_upload_data(self): self.assertEqual(result['num_rows'][0], test_size) -@pytest.mark.xfail(run=False, reason="flaky tests") +@pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. 
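The `single` marker these test classes are flipped back to is the one registered under `markers` in setup.cfg earlier in the series; test_fast.sh then deselects it via `-m "not single"` so that only parallel-safe tests run under pytest-xdist. A minimal, hypothetical example of that mechanism (the test names are illustrative, not tests from the pandas suite):

    import pytest

    @pytest.mark.single        # deselected by `pytest -m "not single" -n 4`
    def test_uses_shared_resource():
        # must run serially, e.g. because it touches one shared account
        assert True

    def test_parallel_safe():  # unmarked, may run on any xdist worker
        assert True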
From 4b098f95eae111940a72fd171d2d36c8216df7f8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 14 Feb 2017 08:33:34 -0500 Subject: [PATCH 143/338] Bug: Raise ValueError with interpolate & fillna limit = 0 (#9217) closes #9217 Author: Matt Roeschke Closes #14994 from mroeschke/fix_9217 and squashes the following commits: c1790ee [Matt Roeschke] Unify ValueError message and correct cython limits 6f041e6 [Matt Roeschke] Bug: Raise ValueError with interpolate limit = 0 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/generic.py | 6 ++--- pandas/core/internals.py | 4 +++ pandas/core/missing.py | 8 ++++-- pandas/src/algos_common_helper.pxi.in | 36 ++++++++++++++++++--------- pandas/tests/series/test_missing.py | 18 ++++++++++++++ 6 files changed, 56 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index aa620bce0df59..d76e33caffbf1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -421,6 +421,7 @@ Other API Changes - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) +- ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 228dd2acd2124..20e6e027dbf09 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3262,7 +3262,7 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, a gap with more than this number of consecutive NaNs, it will only be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be - filled. + filled. Must be greater than 0 if not None. downcast : dict, default is None a dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate @@ -3281,6 +3281,7 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None): inplace = validate_bool_kwarg(inplace, 'inplace') + if isinstance(value, (list, tuple)): raise TypeError('"value" parameter must be a scalar or dict, but ' 'you passed a "{0}"'.format(type(value).__name__)) @@ -3292,7 +3293,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, axis = 0 axis = self._get_axis_number(axis) method = missing.clean_fill_method(method) - from pandas import DataFrame if value is None: if method is None: @@ -3687,7 +3687,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * 0: fill column-by-column * 1: fill row-by-row limit : int, default None. - Maximum number of consecutive NaNs to fill. + Maximum number of consecutive NaNs to fill. Must be greater than 0. 
limit_direction : {'forward', 'backward', 'both'}, default 'forward' If limit is specified, consecutive NaNs will be filled in this direction. diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f0b1516d786c6..6cd5eceed5f2a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -372,6 +372,10 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, original_value = value mask = isnull(self.values) if limit is not None: + if not is_integer(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') if self.ndim > 2: raise NotImplementedError("number of dimensions for 'fillna' " "is currently limited to 2") diff --git a/pandas/core/missing.py b/pandas/core/missing.py index e83a0518d97f6..ffd0423572f5e 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -12,7 +12,7 @@ is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_integer_dtype, _ensure_float64, is_scalar, - needs_i8_conversion) + needs_i8_conversion, is_integer) from pandas.types.missing import isnull @@ -169,7 +169,11 @@ def _interp_limit(invalid, fw_limit, bw_limit): # the beginning (see issues #9218 and #10420) violate_limit = sorted(start_nans) - if limit: + if limit is not None: + if not is_integer(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') if limit_direction == 'forward': violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 5e87528943005..42089f9520ab6 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -83,8 +83,10 @@ def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, if limit is None: lim = nright else: - if limit < 0: - raise ValueError('Limit must be non-negative') + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') lim = limit if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: @@ -146,8 +148,10 @@ def pad_inplace_{{name}}(ndarray[{{c_type}}] values, if limit is None: lim = N else: - if limit < 0: - raise ValueError('Limit must be non-negative') + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') lim = limit val = values[0] @@ -180,8 +184,10 @@ def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, if limit is None: lim = N else: - if limit < 0: - raise ValueError('Limit must be non-negative') + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') lim = limit for j in range(K): @@ -240,8 +246,10 @@ def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, if limit is None: lim = nright else: - if limit < 0: - raise ValueError('Limit must be non-negative') + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') lim = limit if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: @@ -304,8 +312,10 @@ def backfill_inplace_{{name}}(ndarray[{{c_type}}] values, if limit is None: lim = N else: - if limit < 0: - raise ValueError('Limit must be non-negative') + if not util.is_integer_object(limit): + raise ValueError('Limit 
must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') lim = limit val = values[N - 1] @@ -338,8 +348,10 @@ def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, if limit is None: lim = N else: - if limit < 0: - raise ValueError('Limit must be non-negative') + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') lim = limit for j in range(K): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 405d6c98a5d37..23eb6a40f5f1d 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -295,6 +295,13 @@ def test_fillna_raise(self): self.assertRaises(TypeError, s.fillna, [1, 2]) self.assertRaises(TypeError, s.fillna, (1, 2)) + # related GH 9217, make sure limit is an int and greater than 0 + s = Series([1, 2, 3, None]) + for limit in [-1, 0, 1., 2.]: + for method in ['backfill', 'bfill', 'pad', 'ffill', None]: + with tm.assertRaises(ValueError): + s.fillna(1, limit=limit, method=method) + def test_fillna_nat(self): series = Series([0, 1, 2, tslib.iNaT], dtype='M8[ns]') @@ -865,6 +872,17 @@ def test_interp_limit(self): result = s.interpolate(method='linear', limit=2) assert_series_equal(result, expected) + # GH 9217, make sure limit is an int and greater than 0 + methods = ['linear', 'time', 'index', 'values', 'nearest', 'zero', + 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', + 'polynomial', 'spline', 'piecewise_polynomial', None, + 'from_derivatives', 'pchip', 'akima'] + s = pd.Series([1, 2, np.nan, np.nan, 5]) + for limit in [-1, 0, 1., 2.]: + for method in methods: + with tm.assertRaises(ValueError): + s.interpolate(limit=limit, method=method) + def test_interp_limit_forward(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) From 811e8c1dbce69eacdc9a59a55aff1416ffc11eb2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 17:35:05 -0500 Subject: [PATCH 144/338] CLN: create core/sorting.py just a small reorg to put sorting / grouping utilities into a separate area Author: Jeff Reback Closes #15402 from jreback/sorting and squashes the following commits: fdcf9a1 [Jeff Reback] change a couple of sorting.py functions to be non-private (public to pandas internals) 90ff22d [Jeff Reback] split up some value_counts groupby tests a bit 18ea902 [Jeff Reback] CLN: create core/sorting.py 92dcb07 [Jeff Reback] CLN: remove numpy_groupby as not used --- pandas/core/frame.py | 26 +- pandas/core/groupby.py | 376 +--------------------- pandas/core/reshape.py | 13 +- pandas/core/series.py | 10 +- pandas/core/sorting.py | 357 ++++++++++++++++++++ pandas/indexes/multi.py | 12 +- pandas/tests/groupby/test_filters.py | 21 -- pandas/tests/groupby/test_groupby.py | 169 ---------- pandas/tests/groupby/test_misc.py | 101 ------ pandas/tests/groupby/test_value_counts.py | 60 ++++ pandas/tests/test_sorting.py | 339 +++++++++++++++++++ pandas/tests/tools/test_merge.py | 135 +------- pandas/tools/merge.py | 4 +- 13 files changed, 802 insertions(+), 821 deletions(-) create mode 100644 pandas/core/sorting.py delete mode 100644 pandas/tests/groupby/test_misc.py create mode 100644 pandas/tests/groupby/test_value_counts.py create mode 100644 pandas/tests/test_sorting.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aa03bfb9a54b9..16f8d4658dc20 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3141,7 +3141,7 @@ def duplicated(self, subset=None, 
keep='first'): ------- duplicated : Series """ - from pandas.core.groupby import get_group_index + from pandas.core.sorting import get_group_index from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT def f(vals): @@ -3179,7 +3179,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, raise ValueError('Length of ascending (%d) != length of by (%d)' % (len(ascending), len(by))) if len(by) > 1: - from pandas.core.groupby import _lexsort_indexer + from pandas.core.sorting import lexsort_indexer def trans(v): if needs_i8_conversion(v): @@ -3193,11 +3193,11 @@ def trans(v): raise ValueError('Cannot sort by duplicate column %s' % str(x)) keys.append(trans(k)) - indexer = _lexsort_indexer(keys, orders=ascending, - na_position=na_position) + indexer = lexsort_indexer(keys, orders=ascending, + na_position=na_position) indexer = _ensure_platform_int(indexer) else: - from pandas.core.groupby import _nargsort + from pandas.core.sorting import nargsort by = by[0] k = self.xs(by, axis=other_axis).values @@ -3214,8 +3214,8 @@ def trans(v): if isinstance(ascending, (tuple, list)): ascending = ascending[0] - indexer = _nargsort(k, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort(k, kind=kind, ascending=ascending, + na_position=na_position) new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), @@ -3300,17 +3300,17 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, sort_remaining=sort_remaining) elif isinstance(labels, MultiIndex): - from pandas.core.groupby import _lexsort_indexer + from pandas.core.sorting import lexsort_indexer # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer if not labels.is_lexsorted(): labels = MultiIndex.from_tuples(labels.values) - indexer = _lexsort_indexer(labels.labels, orders=ascending, - na_position=na_position) + indexer = lexsort_indexer(labels.labels, orders=ascending, + na_position=na_position) else: - from pandas.core.groupby import _nargsort + from pandas.core.sorting import nargsort # GH11080 - Check monotonic-ness before sort an index # if monotonic (already sorted), return None or copy() according @@ -3322,8 +3322,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return self.copy() - indexer = _nargsort(labels, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort(labels, kind=kind, ascending=ascending, + na_position=na_position) new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a228861270aea..23c835318b0e6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -7,7 +7,7 @@ import copy from pandas.compat import ( - zip, range, long, lzip, + zip, range, lzip, callable, map ) from pandas import compat @@ -47,6 +47,9 @@ from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel +from pandas.core.sorting import (get_group_index_sorter, get_group_index, + compress_group_index, get_flattened_iterator, + decons_obs_group_ids, get_indexer_dict) from pandas.util.decorators import (cache_readonly, Substitution, Appender, make_signature, deprecate_kwarg) from pandas.formats.printing import pprint_thing @@ -59,7 +62,6 @@ from pandas.lib import Timestamp import pandas.tslib as tslib import pandas.algos as _algos -import pandas.hashtable as _hash _doc_template = """ @@ -729,7 +731,7 @@ def 
_cumcount_array(self, ascending=True): (though the default is sort=True) for groupby in general """ ids, _, ngroups = self.grouper.group_info - sorter = _get_group_index_sorter(ids, ngroups) + sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) if count == 0: @@ -1616,9 +1618,12 @@ def _get_group_keys(self): return self.levels[0] else: comp_ids, _, ngroups = self.group_info + # provide "flattened" iterator for multi-group setting - mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels) - return [mapper.get_key(i) for i in range(ngroups)] + return get_flattened_iterator(comp_ids, + ngroups, + self.levels, + self.labels) def apply(self, f, data, axis=0): mutated = self.mutated @@ -1662,7 +1667,7 @@ def indices(self): label_list = [ping.labels for ping in self.groupings] keys = [_values_from_object(ping.group_index) for ping in self.groupings] - return _get_indices_dict(label_list, keys) + return get_indexer_dict(label_list, keys) @property def labels(self): @@ -1726,7 +1731,7 @@ def _get_compressed_labels(self): if len(all_labels) > 1: group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) - return _compress_group_index(group_index, sort=self.sort) + return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] return ping.labels, np.arange(len(ping.group_index)) @@ -2027,7 +2032,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)).to_dense() - indexer = _get_group_index_sorter(group_index, ngroups) + indexer = get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer, convert=False) group_index = algos.take_nd(group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, @@ -2424,7 +2429,6 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, a BaseGrouper. """ - group_axis = obj._get_axis(axis) # validate that the passed level is compatible with the passed @@ -4206,7 +4210,7 @@ def slabels(self): @cache_readonly def sort_idx(self): # Counting sort indexer - return _get_group_index_sorter(self.labels, self.ngroups) + return get_group_index_sorter(self.labels, self.ngroups) def __iter__(self): sdata = self._get_sorted_data() @@ -4302,355 +4306,3 @@ def get_splitter(data, *args, **kwargs): klass = NDFrameSplitter return klass(data, *args, **kwargs) - - -# ---------------------------------------------------------------------- -# Misc utilities - - -def get_group_index(labels, shape, sort, xnull): - """ - For the particular label_list, gets the offsets into the hypothetical list - representing the totally ordered cartesian product of all possible label - combinations, *as long as* this space fits within int64 bounds; - otherwise, though group indices identify unique combinations of - labels, they cannot be deconstructed. - - If `sort`, rank of returned ids preserve lexical ranks of labels. - i.e. returned id's can be used to do lexical sort on labels; - - If `xnull` nulls (-1 labels) are passed through. - - Parameters - ---------- - labels: sequence of arrays - Integers identifying levels at each location - shape: sequence of ints same length as labels - Number of unique levels at each location - sort: boolean - If the ranks of returned ids should match lexical ranks of labels - xnull: boolean - If true nulls are excluded. i.e. 
-1 values in the labels are - passed through - Returns - ------- - An array of type int64 where two elements are equal if their corresponding - labels are equal at all location. - """ - def _int64_cut_off(shape): - acc = long(1) - for i, mul in enumerate(shape): - acc *= long(mul) - if not acc < _INT64_MAX: - return i - return len(shape) - - def loop(labels, shape): - # how many levels can be done without overflow: - nlev = _int64_cut_off(shape) - - # compute flat ids for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype='i8') - out = stride * labels[0].astype('i8', subok=False, copy=False) - - for i in range(1, nlev): - if shape[i] == 0: - stride = 0 - else: - stride //= shape[i] - out += labels[i] * stride - - if xnull: # exclude nulls - mask = labels[0] == -1 - for lab in labels[1:nlev]: - mask |= lab == -1 - out[mask] = -1 - - if nlev == len(shape): # all levels done! - return out - - # compress what has been done so far in order to avoid overflow - # to retain lexical ranks, obs_ids should be sorted - comp_ids, obs_ids = _compress_group_index(out, sort=sort) - - labels = [comp_ids] + labels[nlev:] - shape = [len(obs_ids)] + shape[nlev:] - - return loop(labels, shape) - - def maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - - labels = map(_ensure_int64, labels) - if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) - - return loop(list(labels), list(shape)) - - -_INT64_MAX = np.iinfo(np.int64).max - - -def _int64_overflow_possible(shape): - the_prod = long(1) - for x in shape: - the_prod *= long(x) - - return the_prod >= _INT64_MAX - - -def decons_group_index(comp_labels, shape): - # reconstruct labels - if _int64_overflow_possible(shape): - # at some point group indices are factorized, - # and may not be deconstructed here! wrong path! - raise ValueError('cannot deconstruct factorized group indices!') - - label_list = [] - factor = 1 - y = 0 - x = comp_labels - for i in reversed(range(len(shape))): - labels = (x - y) % (factor * shape[i]) // factor - np.putmask(labels, comp_labels < 0, -1) - label_list.append(labels) - y = labels * factor - factor *= shape[i] - return label_list[::-1] - - -def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): - """ - reconstruct labels from observed group ids - - Parameters - ---------- - xnull: boolean, - if nulls are excluded; i.e. -1 labels are passed through - """ - from pandas.hashtable import unique_label_indices - - if not xnull: - lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8') - shape = np.asarray(shape, dtype='i8') + lift - - if not _int64_overflow_possible(shape): - # obs ids are deconstructable! take the fast route! 
- out = decons_group_index(obs_ids, shape) - return out if xnull or not lift.any() \ - else [x - y for x, y in zip(out, lift)] - - i = unique_label_indices(comp_ids) - i8copy = lambda a: a.astype('i8', subok=False, copy=True) - return [i8copy(lab[i]) for lab in labels] - - -def _indexer_from_factorized(labels, shape, compress=True): - ids = get_group_index(labels, shape, sort=True, xnull=False) - - if not compress: - ngroups = (ids.size and ids.max()) + 1 - else: - ids, obs = _compress_group_index(ids, sort=True) - ngroups = len(obs) - - return _get_group_index_sorter(ids, ngroups) - - -def _lexsort_indexer(keys, orders=None, na_position='last'): - labels = [] - shape = [] - if isinstance(orders, bool): - orders = [orders] * len(keys) - elif orders is None: - orders = [True] * len(keys) - - for key, order in zip(keys, orders): - - # we are already a Categorical - if is_categorical_dtype(key): - c = key - - # create the Categorical - else: - c = Categorical(key, ordered=True) - - if na_position not in ['last', 'first']: - raise ValueError('invalid na_position: {!r}'.format(na_position)) - - n = len(c.categories) - codes = c.codes.copy() - - mask = (c.codes == -1) - if order: # ascending - if na_position == 'last': - codes = np.where(mask, n, codes) - elif na_position == 'first': - codes += 1 - else: # not order means descending - if na_position == 'last': - codes = np.where(mask, n, n - codes - 1) - elif na_position == 'first': - codes = np.where(mask, 0, n - codes) - if mask.any(): - n += 1 - - shape.append(n) - labels.append(codes) - - return _indexer_from_factorized(labels, shape) - - -def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): - """ - This is intended to be a drop-in replacement for np.argsort which - handles NaNs. It adds ascending and na_position parameters. - GH #6399, #5231 - """ - - # specially handle Categorical - if is_categorical_dtype(items): - return items.argsort(ascending=ascending) - - items = np.asanyarray(items) - idx = np.arange(len(items)) - mask = isnull(items) - non_nans = items[~mask] - non_nan_idx = idx[~mask] - nan_idx = np.nonzero(mask)[0] - if not ascending: - non_nans = non_nans[::-1] - non_nan_idx = non_nan_idx[::-1] - indexer = non_nan_idx[non_nans.argsort(kind=kind)] - if not ascending: - indexer = indexer[::-1] - # Finally, place the NaNs at the end or the beginning according to - # na_position - if na_position == 'last': - indexer = np.concatenate([indexer, nan_idx]) - elif na_position == 'first': - indexer = np.concatenate([nan_idx, indexer]) - else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) - return indexer - - -class _KeyMapper(object): - - """ - Ease my suffering. 
Map compressed group id -> key tuple - """ - - def __init__(self, comp_ids, ngroups, labels, levels): - self.levels = levels - self.labels = labels - self.comp_ids = comp_ids.astype(np.int64) - - self.k = len(labels) - self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)] - - self._populate_tables() - - def _populate_tables(self): - for labs, table in zip(self.labels, self.tables): - table.map(self.comp_ids, labs.astype(np.int64)) - - def get_key(self, comp_id): - return tuple(level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels)) - - -def _get_indices_dict(label_list, keys): - shape = list(map(len, keys)) - - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - ngroups = ((group_index.size and group_index.max()) + 1) \ - if _int64_overflow_possible(shape) \ - else np.prod(shape, dtype='i8') - - sorter = _get_group_index_sorter(group_index, ngroups) - - sorted_labels = [lab.take(sorter) for lab in label_list] - group_index = group_index.take(sorter) - - return lib.indices_fast(sorter, group_index, keys, sorted_labels) - - -# ---------------------------------------------------------------------- -# sorting levels...cleverly? - -def _get_group_index_sorter(group_index, ngroups): - """ - _algos.groupsort_indexer implements `counting sort` and it is at least - O(ngroups), where - ngroups = prod(shape) - shape = map(len, keys) - that is, linear in the number of combinations (cartesian product) of unique - values of groupby keys. This can be huge when doing multi-key groupby. - np.argsort(kind='mergesort') is O(count x log(count)) where count is the - length of the data-frame; - Both algorithms are `stable` sort and that is necessary for correctness of - groupby operations. e.g. consider: - df.groupby(key)[col].transform('first') - """ - count = len(group_index) - alpha = 0.0 # taking complexities literally; there may be - beta = 1.0 # some room for fine-tuning these parameters - do_groupsort = (count > 0 and ((alpha + beta * ngroups) < - (count * np.log(count)))) - if do_groupsort: - sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), - ngroups) - return _ensure_platform_int(sorter) - else: - return group_index.argsort(kind='mergesort') - - -def _compress_group_index(group_index, sort=True): - """ - Group_index is offsets into cartesian product of all possible labels. This - space can be huge, so this function compresses it, by computing offsets - (comp_ids) into the list of unique labels (obs_group_ids). 
- """ - - size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) - table = _hash.Int64HashTable(size_hint) - - group_index = _ensure_int64(group_index) - - # note, group labels come out ascending (ie, 1,2,3 etc) - comp_ids, obs_group_ids = table.get_labels_groupby(group_index) - - if sort and len(obs_group_ids) > 0: - obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - - return comp_ids, obs_group_ids - - -def _reorder_by_uniques(uniques, labels): - # sorter is index where elements ought to go - sorter = uniques.argsort() - - # reverse_indexer is where elements came from - reverse_indexer = np.empty(len(sorter), dtype=np.int64) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = labels < 0 - - # move labels to right locations (ie, unsort ascending labels) - labels = algos.take_nd(reverse_indexer, labels, allow_fill=False) - np.putmask(labels, mask, -1) - - # sort observed ids - uniques = algos.take_nd(uniques, sorter, allow_fill=False) - - return uniques, labels - - -def numpy_groupby(data, labels, axis=0): - s = np.argsort(labels) - keys, inv = np.unique(labels, return_inverse=True) - i = inv.take(s) - groups_at = np.where(i != np.concatenate(([-1], i[:-1])))[0] - ordered_data = data.take(s, axis=axis) - group_sums = np.add.reduceat(ordered_data, groups_at, axis=axis) - - return group_sums diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index cebaf4e3fd89b..5fc0d590a6885 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -20,7 +20,8 @@ from pandas._sparse import IntIndex from pandas.core.categorical import Categorical, _factorize_from_iterable -from pandas.core.groupby import get_group_index, _compress_group_index +from pandas.core.sorting import (get_group_index, compress_group_index, + decons_obs_group_ids) import pandas.core.algorithms as algos import pandas.algos as _algos @@ -156,7 +157,7 @@ def get_result(self): # filter out missing levels if values.shape[1] > 0: - col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1]) + col_inds, obs_ids = compress_group_index(self.sorted_labels[-1]) # rare case, level values not observed if len(obs_ids) < self.full_shape[1]: inds = (value_mask.sum(0) > 0).nonzero()[0] @@ -245,8 +246,6 @@ def get_new_index(self): def _unstack_multiple(data, clocs): - from pandas.core.groupby import decons_obs_group_ids - if len(clocs) == 0: return data @@ -268,7 +267,7 @@ def _unstack_multiple(data, clocs): shape = [len(x) for x in clevels] group_index = get_group_index(clabels, shape, sort=False, xnull=False) - comp_ids, obs_ids = _compress_group_index(group_index, sort=False) + comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, xnull=False) @@ -459,10 +458,8 @@ def _unstack_frame(obj, level, fill_value=None): def get_compressed_ids(labels, sizes): - from pandas.core.groupby import get_group_index - ids = get_group_index(labels, sizes, sort=True, xnull=False) - return _compress_group_index(ids, sort=True) + return compress_group_index(ids, sort=True) def stack(frame, level=-1, dropna=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index e1eac8f66017e..da47ab5dfb003 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1786,12 +1786,12 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): - from pandas.core.groupby import 
_lexsort_indexer - indexer = _lexsort_indexer(index.labels, orders=ascending) + from pandas.core.sorting import lexsort_indexer + indexer = lexsort_indexer(index.labels, orders=ascending) else: - from pandas.core.groupby import _nargsort - indexer = _nargsort(index, kind=kind, ascending=ascending, - na_position=na_position) + from pandas.core.sorting import nargsort + indexer = nargsort(index, kind=kind, ascending=ascending, + na_position=na_position) indexer = _ensure_platform_int(indexer) new_index = index.take(indexer) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py new file mode 100644 index 0000000000000..71314da7745c0 --- /dev/null +++ b/pandas/core/sorting.py @@ -0,0 +1,357 @@ +""" miscellaneous sorting / groupby utilities """ + +import numpy as np +from pandas.compat import long +from pandas.core.categorical import Categorical +from pandas.types.common import (_ensure_platform_int, + _ensure_int64, + is_categorical_dtype) +from pandas.types.missing import isnull +import pandas.core.algorithms as algos +import pandas.algos as _algos +import pandas.hashtable as _hash +from pandas import lib + + +_INT64_MAX = np.iinfo(np.int64).max + + +def get_group_index(labels, shape, sort, xnull): + """ + For the particular label_list, gets the offsets into the hypothetical list + representing the totally ordered cartesian product of all possible label + combinations, *as long as* this space fits within int64 bounds; + otherwise, though group indices identify unique combinations of + labels, they cannot be deconstructed. + - If `sort`, rank of returned ids preserve lexical ranks of labels. + i.e. returned id's can be used to do lexical sort on labels; + - If `xnull` nulls (-1 labels) are passed through. + + Parameters + ---------- + labels: sequence of arrays + Integers identifying levels at each location + shape: sequence of ints same length as labels + Number of unique levels at each location + sort: boolean + If the ranks of returned ids should match lexical ranks of labels + xnull: boolean + If true nulls are excluded. i.e. -1 values in the labels are + passed through + Returns + ------- + An array of type int64 where two elements are equal if their corresponding + labels are equal at all location. + """ + def _int64_cut_off(shape): + acc = long(1) + for i, mul in enumerate(shape): + acc *= long(mul) + if not acc < _INT64_MAX: + return i + return len(shape) + + def loop(labels, shape): + # how many levels can be done without overflow: + nlev = _int64_cut_off(shape) + + # compute flat ids for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype='i8') + out = stride * labels[0].astype('i8', subok=False, copy=False) + + for i in range(1, nlev): + if shape[i] == 0: + stride = 0 + else: + stride //= shape[i] + out += labels[i] * stride + + if xnull: # exclude nulls + mask = labels[0] == -1 + for lab in labels[1:nlev]: + mask |= lab == -1 + out[mask] = -1 + + if nlev == len(shape): # all levels done! 
+            return out
+
+        # compress what has been done so far in order to avoid overflow
+        # to retain lexical ranks, obs_ids should be sorted
+        comp_ids, obs_ids = compress_group_index(out, sort=sort)
+
+        labels = [comp_ids] + labels[nlev:]
+        shape = [len(obs_ids)] + shape[nlev:]
+
+        return loop(labels, shape)
+
+    def maybe_lift(lab, size):  # promote nan values
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = map(_ensure_int64, labels)
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+    return loop(list(labels), list(shape))
+
+
+def is_int64_overflow_possible(shape):
+    the_prod = long(1)
+    for x in shape:
+        the_prod *= long(x)
+
+    return the_prod >= _INT64_MAX
+
+
+def decons_group_index(comp_labels, shape):
+    # reconstruct labels
+    if is_int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
+    label_list = []
+    factor = 1
+    y = 0
+    x = comp_labels
+    for i in reversed(range(len(shape))):
+        labels = (x - y) % (factor * shape[i]) // factor
+        np.putmask(labels, comp_labels < 0, -1)
+        label_list.append(labels)
+        y = labels * factor
+        factor *= shape[i]
+    return label_list[::-1]
+
+
+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
+    """
+    reconstruct labels from observed group ids
+
+    Parameters
+    ----------
+    xnull: boolean,
+        if nulls are excluded; i.e. -1 labels are passed through
+    """
+    from pandas.hashtable import unique_label_indices
+
+    if not xnull:
+        lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
+        shape = np.asarray(shape, dtype='i8') + lift
+
+    if not is_int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+ out = decons_group_index(obs_ids, shape) + return out if xnull or not lift.any() \ + else [x - y for x, y in zip(out, lift)] + + i = unique_label_indices(comp_ids) + i8copy = lambda a: a.astype('i8', subok=False, copy=True) + return [i8copy(lab[i]) for lab in labels] + + +def indexer_from_factorized(labels, shape, compress=True): + ids = get_group_index(labels, shape, sort=True, xnull=False) + + if not compress: + ngroups = (ids.size and ids.max()) + 1 + else: + ids, obs = compress_group_index(ids, sort=True) + ngroups = len(obs) + + return get_group_index_sorter(ids, ngroups) + + +def lexsort_indexer(keys, orders=None, na_position='last'): + labels = [] + shape = [] + if isinstance(orders, bool): + orders = [orders] * len(keys) + elif orders is None: + orders = [True] * len(keys) + + for key, order in zip(keys, orders): + + # we are already a Categorical + if is_categorical_dtype(key): + c = key + + # create the Categorical + else: + c = Categorical(key, ordered=True) + + if na_position not in ['last', 'first']: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + n = len(c.categories) + codes = c.codes.copy() + + mask = (c.codes == -1) + if order: # ascending + if na_position == 'last': + codes = np.where(mask, n, codes) + elif na_position == 'first': + codes += 1 + else: # not order means descending + if na_position == 'last': + codes = np.where(mask, n, n - codes - 1) + elif na_position == 'first': + codes = np.where(mask, 0, n - codes) + if mask.any(): + n += 1 + + shape.append(n) + labels.append(codes) + + return indexer_from_factorized(labels, shape) + + +def nargsort(items, kind='quicksort', ascending=True, na_position='last'): + """ + This is intended to be a drop-in replacement for np.argsort which + handles NaNs. It adds ascending and na_position parameters. + GH #6399, #5231 + """ + + # specially handle Categorical + if is_categorical_dtype(items): + return items.argsort(ascending=ascending) + + items = np.asanyarray(items) + idx = np.arange(len(items)) + mask = isnull(items) + non_nans = items[~mask] + non_nan_idx = idx[~mask] + nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + if not ascending: + indexer = indexer[::-1] + # Finally, place the NaNs at the end or the beginning according to + # na_position + if na_position == 'last': + indexer = np.concatenate([indexer, nan_idx]) + elif na_position == 'first': + indexer = np.concatenate([nan_idx, indexer]) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + return indexer + + +class _KeyMapper(object): + + """ + Ease my suffering. 
Map compressed group id -> key tuple
+    """
+
+    def __init__(self, comp_ids, ngroups, levels, labels):
+        self.levels = levels
+        self.labels = labels
+        self.comp_ids = comp_ids.astype(np.int64)
+
+        self.k = len(labels)
+        self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)]
+
+        self._populate_tables()
+
+    def _populate_tables(self):
+        for labs, table in zip(self.labels, self.tables):
+            table.map(self.comp_ids, labs.astype(np.int64))
+
+    def get_key(self, comp_id):
+        return tuple(level[table.get_item(comp_id)]
+                     for table, level in zip(self.tables, self.levels))
+
+
+def get_flattened_iterator(comp_ids, ngroups, levels, labels):
+    # provide "flattened" iterator for multi-group setting
+    mapper = _KeyMapper(comp_ids, ngroups, levels, labels)
+    return [mapper.get_key(i) for i in range(ngroups)]
+
+
+def get_indexer_dict(label_list, keys):
+    """ return a dictionary of {labels} -> {indexers} """
+    shape = list(map(len, keys))
+
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if is_int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')
+
+    sorter = get_group_index_sorter(group_index, ngroups)
+
+    sorted_labels = [lab.take(sorter) for lab in label_list]
+    group_index = group_index.take(sorter)
+
+    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
+
+
+# ----------------------------------------------------------------------
+# sorting levels...cleverly?
+
+def get_group_index_sorter(group_index, ngroups):
+    """
+    _algos.groupsort_indexer implements `counting sort` and it is at least
+    O(ngroups), where
+        ngroups = prod(shape)
+        shape = map(len, keys)
+    that is, linear in the number of combinations (cartesian product) of unique
+    values of groupby keys. This can be huge when doing multi-key groupby.
+    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
+    length of the data-frame;
+    Both algorithms are `stable` sort and that is necessary for correctness of
+    groupby operations. e.g. consider:
+        df.groupby(key)[col].transform('first')
+    """
+    count = len(group_index)
+    alpha = 0.0  # taking complexities literally; there may be
+    beta = 1.0  # some room for fine-tuning these parameters
+    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
+                                   (count * np.log(count))))
+    if do_groupsort:
+        sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
+                                             ngroups)
+        return _ensure_platform_int(sorter)
+    else:
+        return group_index.argsort(kind='mergesort')
+
+
+def compress_group_index(group_index, sort=True):
+    """
+    Group_index is offsets into cartesian product of all possible labels. This
+    space can be huge, so this function compresses it, by computing offsets
+    (comp_ids) into the list of unique labels (obs_group_ids).
+ """ + + size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) + table = _hash.Int64HashTable(size_hint) + + group_index = _ensure_int64(group_index) + + # note, group labels come out ascending (ie, 1,2,3 etc) + comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + + if sort and len(obs_group_ids) > 0: + obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) + + return comp_ids, obs_group_ids + + +def _reorder_by_uniques(uniques, labels): + # sorter is index where elements ought to go + sorter = uniques.argsort() + + # reverse_indexer is where elements came from + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + + # move labels to right locations (ie, unsort ascending labels) + labels = algos.take_nd(reverse_indexer, labels, allow_fill=False) + np.putmask(labels, mask, -1) + + # sort observed ids + uniques = algos.take_nd(uniques, sorter, allow_fill=False) + + return uniques, labels diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 9ab07d87fd13b..653ba1fee5691 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -663,7 +663,7 @@ def is_unique(self): False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) def duplicated(self, keep='first'): - from pandas.core.groupby import get_group_index + from pandas.core.sorting import get_group_index from pandas.hashtable import duplicated_int64 shape = map(len, self.levels) @@ -1405,7 +1405,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Indices of output values in original index """ - from pandas.core.groupby import _indexer_from_factorized + from pandas.core.sorting import indexer_from_factorized if isinstance(level, (compat.string_types, int)): level = [level] @@ -1417,8 +1417,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): if not len(level) == len(ascending): raise ValueError("level must have same length as ascending") - from pandas.core.groupby import _lexsort_indexer - indexer = _lexsort_indexer(self.labels, orders=ascending) + from pandas.core.sorting import lexsort_indexer + indexer = lexsort_indexer(self.labels, orders=ascending) # level ordering else: @@ -1436,8 +1436,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): else: sortorder = level[0] - indexer = _indexer_from_factorized(primary, primshp, - compress=False) + indexer = indexer_from_factorized(primary, primshp, + compress=False) if not ascending: indexer = indexer[::-1] diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 1640858802047..46ddb5a5318fb 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -616,24 +616,3 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): expected = f(df.groupby(tups)[field]) for k, v in compat.iteritems(expected): assert (result[k] == v) - - -def test_decons(): - from pandas.core.groupby import decons_group_index, get_group_index - - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) - - for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) - - shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] - testit(label_list, shape) - - shape = (10000, 10000) - label_list = 
[np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] - testit(label_list, shape) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d625fa07d932c..3a6a9eaaa8e72 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1510,59 +1510,6 @@ def check_nunique(df, keys, as_index=True): check_nunique(frame, ['jim'], as_index=False) check_nunique(frame, ['jim', 'joe'], as_index=False) - def test_series_groupby_value_counts(self): - from itertools import product - np.random.seed(1234) - - def rebuild_index(df): - arr = list(map(df.index.get_level_values, range(df.index.nlevels))) - df.index = MultiIndex.from_arrays(arr, names=df.index.names) - return df - - def check_value_counts(df, keys, bins): - for isort, normalize, sort, ascending, dropna \ - in product((False, True), repeat=5): - - kwargs = dict(normalize=normalize, sort=sort, - ascending=ascending, dropna=dropna, bins=bins) - - gr = df.groupby(keys, sort=isort) - left = gr['3rd'].value_counts(**kwargs) - - gr = df.groupby(keys, sort=isort) - right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] - - # have to sort on index because of unstable sort on values - left, right = map(rebuild_index, (left, right)) # xref GH9212 - assert_series_equal(left.sort_index(), right.sort_index()) - - def loop(df): - bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) - keys = '1st', '2nd', ('1st', '2nd') - for k, b in product(keys, bins): - check_value_counts(df, k, b) - - days = date_range('2015-08-24', periods=10) - - for n, m in product((100, 1000), (5, 20)): - frame = DataFrame({ - '1st': np.random.choice( - list('abcd'), n), - '2nd': np.random.choice(days, n), - '3rd': np.random.randint(1, m + 1, n) - }) - - loop(frame) - - frame.loc[1::11, '1st'] = nan - frame.loc[3::17, '2nd'] = nan - frame.loc[7::19, '3rd'] = nan - frame.loc[8::19, '3rd'] = nan - frame.loc[9::19, '3rd'] = nan - - loop(frame) - def test_multiindex_passthru(self): # GH 7997 @@ -3071,22 +3018,6 @@ def test_panel_groupby(self): agged = grouped.mean() self.assert_index_equal(agged.minor_axis, Index([0, 1])) - def test_numpy_groupby(self): - from pandas.core.groupby import numpy_groupby - - data = np.random.randn(100, 100) - labels = np.random.randint(0, 10, size=100) - - df = DataFrame(data) - - result = df.groupby(labels).sum().values - expected = numpy_groupby(data, labels) - assert_almost_equal(result, expected) - - result = df.groupby(labels, axis=1).sum().values - expected = numpy_groupby(data, labels, axis=1) - assert_almost_equal(result, expected) - def test_groupby_2d_malformed(self): d = DataFrame(index=lrange(2)) d['group'] = ['g1', 'g2'] @@ -3112,85 +3043,6 @@ def test_int32_overflow(self): right = df.groupby(['D', 'C', 'B', 'A']).sum() self.assertEqual(len(left), len(right)) - def test_int64_overflow(self): - from pandas.core.groupby import _int64_overflow_possible - - B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) - A = np.arange(2500) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': A, - 'F': B, - 'G': A, - 'H': B, - 'values': np.random.randn(2500)}) - - lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) - rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) - - left = lg.sum()['values'] - right = rg.sum()['values'] - - exp_index, _ = left.index.sortlevel() - self.assert_index_equal(left.index, exp_index) - - exp_index, _ = right.index.sortlevel(0) - 
self.assert_index_equal(right.index, exp_index) - - tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' - ]].values)) - tups = com._asarray_tuplesafe(tups) - - expected = df.groupby(tups).sum()['values'] - - for k, v in compat.iteritems(expected): - self.assertEqual(left[k], right[k[::-1]]) - self.assertEqual(left[k], v) - self.assertEqual(len(left), len(right)) - - # GH9096 - values = range(55109) - data = pd.DataFrame.from_dict({'a': values, - 'b': values, - 'c': values, - 'd': values}) - grouped = data.groupby(['a', 'b', 'c', 'd']) - self.assertEqual(len(grouped), len(values)) - - arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) - i = np.random.choice(len(arr), len(arr) * 4) - arr = np.vstack((arr, arr[i])) # add sume duplicate rows - - i = np.random.permutation(len(arr)) - arr = arr[i] # shuffle rows - - df = DataFrame(arr, columns=list('abcde')) - df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 - gr = df.groupby(list('abcde')) - - # verify this is testing what it is supposed to test! - self.assertTrue(_int64_overflow_possible(gr.grouper.shape)) - - # mannually compute groupings - jim, joe = defaultdict(list), defaultdict(list) - for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']): - jim[key].append(a) - joe[key].append(b) - - self.assertEqual(len(gr), len(jim)) - mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde')) - - def aggr(func): - f = lambda a: np.fromiter(map(func, a), dtype='f8') - arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=['jim', 'joe'], index=mi) - return res.sort_index() - - assert_frame_equal(gr.mean(), aggr(np.mean)) - assert_frame_equal(gr.median(), aggr(np.median)) - def test_groupby_sort_multi(self): df = DataFrame({'a': ['foo', 'bar', 'baz'], 'b': [3, 2, 1], @@ -4451,24 +4303,3 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): expected = f(df.groupby(tups)[field]) for k, v in compat.iteritems(expected): assert (result[k] == v) - - -def test_decons(): - from pandas.core.groupby import decons_group_index, get_group_index - - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) - - for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) - - shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] - testit(label_list, shape) - - shape = (10000, 10000) - label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] - testit(label_list, shape) diff --git a/pandas/tests/groupby/test_misc.py b/pandas/tests/groupby/test_misc.py deleted file mode 100644 index 9395304385681..0000000000000 --- a/pandas/tests/groupby/test_misc.py +++ /dev/null @@ -1,101 +0,0 @@ -""" misc non-groupby routines, as they are defined in core/groupby.py """ - -import pytest -import numpy as np -from numpy import nan -from pandas.util import testing as tm -from pandas.core.groupby import _nargsort, _lexsort_indexer - - -class TestSorting(tm.TestCase): - - def test_lexsort_indexer(self): - keys = [[nan] * 5 + list(range(100)) + [nan] * 5] - # orders=True, na_position='last' - result = _lexsort_indexer(keys, orders=True, na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=True, na_position='first' - result = _lexsort_indexer(keys, orders=True, 
na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='last' - result = _lexsort_indexer(keys, orders=False, na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='first' - result = _lexsort_indexer(keys, orders=False, na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - def test_nargsort(self): - # np.argsort(items) places NaNs last - items = [nan] * 5 + list(range(100)) + [nan] * 5 - # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype='O') - - try: - # GH 2785; due to a regression in NumPy1.6.2 - np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) - np.argsort(items2, kind='mergesort') - except TypeError: - pytest.skip('requested sort not available for type') - - # mergesort is the most difficult to get right because we want it to be - # stable. - - # According to numpy/core/tests/test_multiarray, """The number of - # sorted items must be greater than ~50 to check the actual algorithm - # because quick and merge sort fall over to insertion sort for small - # arrays.""" - - # mergesort, ascending=True, na_position='last' - result = _nargsort(items, kind='mergesort', ascending=True, - na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = _nargsort(items, kind='mergesort', ascending=True, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = _nargsort(items, kind='mergesort', ascending=False, - na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = _nargsort(items, kind='mergesort', ascending=False, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='last' - result = _nargsort(items2, kind='mergesort', ascending=True, - na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = _nargsort(items2, kind='mergesort', ascending=True, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = _nargsort(items2, kind='mergesort', ascending=False, - na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = _nargsort(items2, kind='mergesort', ascending=False, - na_position='first') - exp = list(range(5)) + list(range(105, 
110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py new file mode 100644 index 0000000000000..801d0da070112 --- /dev/null +++ b/pandas/tests/groupby/test_value_counts.py @@ -0,0 +1,60 @@ +import pytest + +from itertools import product +import numpy as np + +from pandas.util import testing as tm +from pandas import MultiIndex, DataFrame, Series, date_range + + +@pytest.mark.parametrize("n,m", product((100, 1000), (5, 20))) +def test_series_groupby_value_counts(n, m): + np.random.seed(1234) + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + def check_value_counts(df, keys, bins): + for isort, normalize, sort, ascending, dropna \ + in product((False, True), repeat=5): + + kwargs = dict(normalize=normalize, sort=sort, + ascending=ascending, dropna=dropna, bins=bins) + + gr = df.groupby(keys, sort=isort) + left = gr['3rd'].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr['3rd'].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ['3rd'] + + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) + + def loop(df): + bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) + keys = '1st', '2nd', ('1st', '2nd') + for k, b in product(keys, bins): + check_value_counts(df, k, b) + + days = date_range('2015-08-24', periods=10) + + frame = DataFrame({ + '1st': np.random.choice( + list('abcd'), n), + '2nd': np.random.choice(days, n), + '3rd': np.random.randint(1, m + 1, n) + }) + + loop(frame) + + frame.loc[1::11, '1st'] = np.nan + frame.loc[3::17, '2nd'] = np.nan + frame.loc[7::19, '3rd'] = np.nan + frame.loc[8::19, '3rd'] = np.nan + frame.loc[9::19, '3rd'] = np.nan + + loop(frame) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py new file mode 100644 index 0000000000000..99361695b2371 --- /dev/null +++ b/pandas/tests/test_sorting.py @@ -0,0 +1,339 @@ +import pytest +from itertools import product +from collections import defaultdict + +import numpy as np +from numpy import nan +import pandas as pd +from pandas.core import common as com +from pandas import DataFrame, MultiIndex, merge, concat, Series, compat +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.core.sorting import (is_int64_overflow_possible, + decons_group_index, + get_group_index, + nargsort, + lexsort_indexer) + + +class TestSorting(tm.TestCase): + + def test_int64_overflow(self): + + B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) + A = np.arange(2500) + df = DataFrame({'A': A, + 'B': B, + 'C': A, + 'D': B, + 'E': A, + 'F': B, + 'G': A, + 'H': B, + 'values': np.random.randn(2500)}) + + lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) + rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) + + left = lg.sum()['values'] + right = rg.sum()['values'] + + exp_index, _ = left.index.sortlevel() + self.assert_index_equal(left.index, exp_index) + + exp_index, _ = right.index.sortlevel(0) + self.assert_index_equal(right.index, exp_index) + + tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' + ]].values)) + tups = com._asarray_tuplesafe(tups) + 
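+        # cross-check against a groupby keyed on an array of tuples; tuple
+        # keys are hashed whole, so that code path cannot hit int64 overflow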
+        expected = df.groupby(tups).sum()['values']
+
+        for k, v in compat.iteritems(expected):
+            self.assertEqual(left[k], right[k[::-1]])
+            self.assertEqual(left[k], v)
+        self.assertEqual(len(left), len(right))
+
+        # GH9096
+        values = range(55109)
+        data = pd.DataFrame.from_dict({'a': values,
+                                       'b': values,
+                                       'c': values,
+                                       'd': values})
+        grouped = data.groupby(['a', 'b', 'c', 'd'])
+        self.assertEqual(len(grouped), len(values))
+
+        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
+        i = np.random.choice(len(arr), len(arr) * 4)
+        arr = np.vstack((arr, arr[i]))  # add some duplicate rows
+
+        i = np.random.permutation(len(arr))
+        arr = arr[i]  # shuffle rows
+
+        df = DataFrame(arr, columns=list('abcde'))
+        df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
+        gr = df.groupby(list('abcde'))
+
+        # verify this is testing what it is supposed to test!
+        self.assertTrue(is_int64_overflow_possible(gr.grouper.shape))
+
+        # manually compute groupings
+        jim, joe = defaultdict(list), defaultdict(list)
+        for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
+            jim[key].append(a)
+            joe[key].append(b)
+
+        self.assertEqual(len(gr), len(jim))
+        mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))
+
+        def aggr(func):
+            f = lambda a: np.fromiter(map(func, a), dtype='f8')
+            arr = np.vstack((f(jim.values()), f(joe.values()))).T
+            res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
+            return res.sort_index()
+
+        assert_frame_equal(gr.mean(), aggr(np.mean))
+        assert_frame_equal(gr.median(), aggr(np.median))
+
+    def test_lexsort_indexer(self):
+        keys = [[nan] * 5 + list(range(100)) + [nan] * 5]
+        # orders=True, na_position='last'
+        result = lexsort_indexer(keys, orders=True, na_position='last')
+        exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+        # orders=True, na_position='first'
+        result = lexsort_indexer(keys, orders=True, na_position='first')
+        exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+        # orders=False, na_position='last'
+        result = lexsort_indexer(keys, orders=False, na_position='last')
+        exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+        # orders=False, na_position='first'
+        result = lexsort_indexer(keys, orders=False, na_position='first')
+        exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+    def test_nargsort(self):
+        # np.argsort(items) places NaNs last
+        items = [nan] * 5 + list(range(100)) + [nan] * 5
+        # np.argsort(items2) may not place NaNs first
+        items2 = np.array(items, dtype='O')
+
+        try:
+            # GH 2785; due to a regression in NumPy 1.6.2
+            np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i'))
+            np.argsort(items2, kind='mergesort')
+        except TypeError:
+            pytest.skip('requested sort not available for type')
+
+        # mergesort is the most difficult to get right because we want it to
+        # be stable.
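+        # (a stable sort keeps equal elements in their original relative
+        # order, so results on data containing ties are deterministic)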
+ + # According to numpy/core/tests/test_multiarray, """The number of + # sorted items must be greater than ~50 to check the actual algorithm + # because quick and merge sort fall over to insertion sort for small + # arrays.""" + + # mergesort, ascending=True, na_position='last' + result = nargsort(items, kind='mergesort', ascending=True, + na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = nargsort(items, kind='mergesort', ascending=True, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = nargsort(items, kind='mergesort', ascending=False, + na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = nargsort(items, kind='mergesort', ascending=False, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='last' + result = nargsort(items2, kind='mergesort', ascending=True, + na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = nargsort(items2, kind='mergesort', ascending=True, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = nargsort(items2, kind='mergesort', ascending=False, + na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = nargsort(items2, kind='mergesort', ascending=False, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + +class TestMerge(tm.TestCase): + + @pytest.mark.slow + def test_int64_overflow_issues(self): + + # #2690, combinatorial explosion + df1 = DataFrame(np.random.randn(1000, 7), + columns=list('ABCDEF') + ['G1']) + df2 = DataFrame(np.random.randn(1000, 7), + columns=list('ABCDEF') + ['G2']) + + # it works! 
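+        # the join keys are random floats, so matches across the two frames
+        # are effectively impossible and an outer merge should simply stack
+        # both frames (1000 + 1000 rows)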
+ result = merge(df1, df2, how='outer') + self.assertTrue(len(result) == 2000) + + low, high, n = -1 << 10, 1 << 10, 1 << 20 + left = DataFrame(np.random.randint(low, high, (n, 7)), + columns=list('ABCDEFG')) + left['left'] = left.sum(axis=1) + + # one-2-one match + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + right.columns = right.columns[:-1].tolist() + ['right'] + right.index = np.arange(len(right)) + right['right'] *= -1 + + out = merge(left, right, how='outer') + self.assertEqual(len(out), len(left)) + assert_series_equal(out['left'], - out['right'], check_names=False) + result = out.iloc[:, :-2].sum(axis=1) + assert_series_equal(out['left'], result, check_names=False) + self.assertTrue(result.name is None) + + out.sort_values(out.columns.tolist(), inplace=True) + out.index = np.arange(len(out)) + for how in ['left', 'right', 'outer', 'inner']: + assert_frame_equal(out, merge(left, right, how=how, sort=True)) + + # check that left merge w/ sort=False maintains left frame order + out = merge(left, right, how='left', sort=False) + assert_frame_equal(left, out[left.columns.tolist()]) + + out = merge(right, left, how='left', sort=False) + assert_frame_equal(right, out[right.columns.tolist()]) + + # one-2-many/none match + n = 1 << 11 + left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), + columns=list('ABCDEFG')) + + # confirm that this is checking what it is supposed to check + shape = left.apply(Series.nunique).values + self.assertTrue(is_int64_overflow_possible(shape)) + + # add duplicates to left frame + left = concat([left, left], ignore_index=True) + + right = DataFrame(np.random.randint(low, high, (n // 2, 7)) + .astype('int64'), + columns=list('ABCDEFG')) + + # add duplicates & overlap with left to the right frame + i = np.random.choice(len(left), n) + right = concat([right, right, left.iloc[i]], ignore_index=True) + + left['left'] = np.random.randn(len(left)) + right['right'] = np.random.randn(len(right)) + + # shuffle left & right frames + i = np.random.permutation(len(left)) + left = left.iloc[i].copy() + left.index = np.arange(len(left)) + + i = np.random.permutation(len(right)) + right = right.iloc[i].copy() + right.index = np.arange(len(right)) + + # manually compute outer merge + ldict, rdict = defaultdict(list), defaultdict(list) + + for idx, row in left.set_index(list('ABCDEFG')).iterrows(): + ldict[idx].append(row['left']) + + for idx, row in right.set_index(list('ABCDEFG')).iterrows(): + rdict[idx].append(row['right']) + + vals = [] + for k, lval in ldict.items(): + rval = rdict.get(k, [np.nan]) + for lv, rv in product(lval, rval): + vals.append(k + tuple([lv, rv])) + + for k, rval in rdict.items(): + if k not in ldict: + for rv in rval: + vals.append(k + tuple([np.nan, rv])) + + def align(df): + df = df.sort_values(df.columns.tolist()) + df.index = np.arange(len(df)) + return df + + def verify_order(df): + kcols = list('ABCDEFG') + assert_frame_equal(df[kcols].copy(), + df[kcols].sort_values(kcols, kind='mergesort')) + + out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) + out = align(out) + + jmask = {'left': out['left'].notnull(), + 'right': out['right'].notnull(), + 'inner': out['left'].notnull() & out['right'].notnull(), + 'outer': np.ones(len(out), dtype='bool')} + + for how in 'left', 'right', 'outer', 'inner': + mask = jmask[how] + frame = align(out[mask].copy()) + self.assertTrue(mask.all() ^ mask.any() or how == 'outer') + + for sort in [False, True]: + res = merge(left, right, how=how, sort=sort) + if 
sort: + verify_order(res) + + # as in GH9092 dtypes break with outer/right join + assert_frame_equal(frame, align(res), + check_dtype=how not in ('right', 'outer')) + + +def test_decons(): + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index d66cd793ec0be..472d8674f9f8d 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -10,9 +10,7 @@ from pandas.compat import lrange, lzip from pandas.tools.concat import concat from pandas.tools.merge import merge, MergeError -from pandas.util.testing import (assert_frame_equal, - assert_series_equal, - slow) +from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm @@ -1092,137 +1090,6 @@ def test_merge_na_keys(self): tm.assert_frame_equal(result, expected) - @slow - def test_int64_overflow_issues(self): - from itertools import product - from collections import defaultdict - from pandas.core.groupby import _int64_overflow_possible - - # #2690, combinatorial explosion - df1 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G1']) - df2 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G2']) - - # it works! 
- result = merge(df1, df2, how='outer') - self.assertTrue(len(result) == 2000) - - low, high, n = -1 << 10, 1 << 10, 1 << 20 - left = DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) - left['left'] = left.sum(axis=1) - - # one-2-one match - i = np.random.permutation(len(left)) - right = left.iloc[i].copy() - right.columns = right.columns[:-1].tolist() + ['right'] - right.index = np.arange(len(right)) - right['right'] *= -1 - - out = merge(left, right, how='outer') - self.assertEqual(len(out), len(left)) - assert_series_equal(out['left'], - out['right'], check_names=False) - result = out.iloc[:, :-2].sum(axis=1) - assert_series_equal(out['left'], result, check_names=False) - self.assertTrue(result.name is None) - - out.sort_values(out.columns.tolist(), inplace=True) - out.index = np.arange(len(out)) - for how in ['left', 'right', 'outer', 'inner']: - assert_frame_equal(out, merge(left, right, how=how, sort=True)) - - # check that left merge w/ sort=False maintains left frame order - out = merge(left, right, how='left', sort=False) - assert_frame_equal(left, out[left.columns.tolist()]) - - out = merge(right, left, how='left', sort=False) - assert_frame_equal(right, out[right.columns.tolist()]) - - # one-2-many/none match - n = 1 << 11 - left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), - columns=list('ABCDEFG')) - - # confirm that this is checking what it is supposed to check - shape = left.apply(Series.nunique).values - self.assertTrue(_int64_overflow_possible(shape)) - - # add duplicates to left frame - left = concat([left, left], ignore_index=True) - - right = DataFrame(np.random.randint(low, high, (n // 2, 7)) - .astype('int64'), - columns=list('ABCDEFG')) - - # add duplicates & overlap with left to the right frame - i = np.random.choice(len(left), n) - right = concat([right, right, left.iloc[i]], ignore_index=True) - - left['left'] = np.random.randn(len(left)) - right['right'] = np.random.randn(len(right)) - - # shuffle left & right frames - i = np.random.permutation(len(left)) - left = left.iloc[i].copy() - left.index = np.arange(len(left)) - - i = np.random.permutation(len(right)) - right = right.iloc[i].copy() - right.index = np.arange(len(right)) - - # manually compute outer merge - ldict, rdict = defaultdict(list), defaultdict(list) - - for idx, row in left.set_index(list('ABCDEFG')).iterrows(): - ldict[idx].append(row['left']) - - for idx, row in right.set_index(list('ABCDEFG')).iterrows(): - rdict[idx].append(row['right']) - - vals = [] - for k, lval in ldict.items(): - rval = rdict.get(k, [np.nan]) - for lv, rv in product(lval, rval): - vals.append(k + tuple([lv, rv])) - - for k, rval in rdict.items(): - if k not in ldict: - for rv in rval: - vals.append(k + tuple([np.nan, rv])) - - def align(df): - df = df.sort_values(df.columns.tolist()) - df.index = np.arange(len(df)) - return df - - def verify_order(df): - kcols = list('ABCDEFG') - assert_frame_equal(df[kcols].copy(), - df[kcols].sort_values(kcols, kind='mergesort')) - - out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) - out = align(out) - - jmask = {'left': out['left'].notnull(), - 'right': out['right'].notnull(), - 'inner': out['left'].notnull() & out['right'].notnull(), - 'outer': np.ones(len(out), dtype='bool')} - - for how in 'left', 'right', 'outer', 'inner': - mask = jmask[how] - frame = align(out[mask].copy()) - self.assertTrue(mask.all() ^ mask.any() or how == 'outer') - - for sort in [False, True]: - res = merge(left, right, how=how, sort=sort) - if 
sort: - verify_order(res) - - # as in GH9092 dtypes break with outer/right join - assert_frame_equal(frame, align(res), - check_dtype=how not in ('right', 'outer')) - def test_join_multi_levels(self): # GH 3662 diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d938c2eeacbef..e82e702cb6e55 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -34,6 +34,7 @@ concatenate_block_managers) from pandas.util.decorators import Appender, Substitution +from pandas.core.sorting import is_int64_overflow_possible import pandas.core.algorithms as algos import pandas.core.common as com @@ -1397,10 +1398,9 @@ def _sort_labels(uniques, left, right): def _get_join_keys(llab, rlab, shape, sort): - from pandas.core.groupby import _int64_overflow_possible # how many levels can be done without overflow - pred = lambda i: not _int64_overflow_possible(shape[:i]) + pred = lambda i: not is_int64_overflow_possible(shape[:i]) nlev = next(filter(pred, range(len(shape), 0, -1))) # get keys for the first `nlev` levels From a4ddb8f0a6465307ea0a88b433e26496abdd52a1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 19:57:51 -0500 Subject: [PATCH 145/338] TST: disable gbq tests again --- pandas/tests/io/test_gbq.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index dfbf3ca69b111..0a76267054ee6 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated(): gbq.generate_bq_schema(df) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): @@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common() @@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() @@ -449,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self): private_key=re.sub('[a-z]', '9', _get_private_key_contents())) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestReadGBQIntegration(tm.TestCase): @classmethod @@ -503,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self): tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): @classmethod @@ -906,7 +906,7 @@ def test_configuration_without_query(self): configuration=config) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. 
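For context, ``xfail(run=False)`` makes pytest report a test as an expected
failure without executing it at all; a minimal sketch of the pattern used
throughout this patch (the test name here is hypothetical):

    @pytest.mark.xfail(run=False, reason="intermittent failures")
    def test_gbq_roundtrip():
        ...  # never executed; reported as 'x' in the test summary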
@@ -1219,7 +1219,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1277,7 +1277,7 @@ def test_upload_data(self): self.assertEqual(result['num_rows'][0], test_size) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. From 7c625ecc6abfa2a17f27c37706fef462474d4b8f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 20:15:20 -0500 Subject: [PATCH 146/338] TST: fix incorrect url in compressed url network tests in parser --- pandas/tests/io/parser/test_network.py | 53 ++++++++++---------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 4d75b59b09560..6e762368f82c5 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -7,7 +7,6 @@ import os import pytest -import functools from itertools import product import pandas.util.testing as tm @@ -15,42 +14,32 @@ from pandas.io.parsers import read_csv, read_table -class TestCompressedUrl(object): +@pytest.fixture(scope='module') +def salaries_table(): + path = os.path.join(tm.get_data_path(), 'salaries.csv') + return read_table(path) - compression_to_extension = { - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', - } - def setup(self): - path = os.path.join(tm.get_data_path(), 'salaries.csv') - self.local_table = read_table(path) - self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/' - 'pandas/io/tests/parser/data/salaries.csv') +@tm.network +@pytest.mark.parametrize( + "compression,extension", [('gzip', '.gz'), ('bz2', '.bz2'), + ('zip', '.zip'), ('xz', '.xz')]) +def test_compressed_urls(salaries_table, compression, extension): + # test reading compressed urls with various engines and + # extension inference + base_url = ('https://github.com/pandas-dev/pandas/raw/master/' + 'pandas/tests/io/parser/data/salaries.csv') + + url = base_url + extension + + # args is a (compression, engine) tuple + for (c, engine) in product([compression, 'infer'], ['python', 'c']): - @tm.network - def test_compressed_urls(self): - # Test reading compressed tables from URL. - msg = ('Test reading {}-compressed tables from URL: ' - 'compression="{}", engine="{}"') - - for compression, extension in self.compression_to_extension.items(): - url = self.base_url + extension - # args is a (compression, engine) tuple - for args in product([compression, 'infer'], ['python', 'c']): - # test_fxn is a workaround for more descriptive nose reporting. - # See http://stackoverflow.com/a/37393684/4651668. 
-            test_fxn = functools.partial(self.check_table)
-            test_fxn.description = msg.format(compression, *args)
-            yield (test_fxn, url) + args
-
-    def check_table(self, url, compression, engine):
         if url.endswith('.xz'):
             tm._skip_if_no_lzma()
-        url_table = read_table(url, compression=compression, engine=engine)
-        tm.assert_frame_equal(url_table, self.local_table)
+
+        url_table = read_table(url, compression=c, engine=engine)
+        tm.assert_frame_equal(url_table, salaries_table)
 
 
 class TestS3(tm.TestCase):

From 2f108007ef232d68940d8d3a7a9d5c9c77bb6f2c Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 14 Feb 2017 20:44:44 -0500
Subject: [PATCH 147/338] TST: incorrect skip when --skip-network is run

closes #15407
---
 pandas/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index b3683de3a173b..623feb99e9cdc 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -17,5 +17,5 @@ def pytest_runtest_setup(item):
     if 'slow' not in item.keywords and item.config.getoption("--only-slow"):
         pytest.skip("skipping due to --only-slow")
-    if 'skip' in item.keywords and item.config.getoption("--skip-network"):
+    if 'network' in item.keywords and item.config.getoption("--skip-network"):
         pytest.skip("skipping due to --skip-network")

From 65a6be55e7dc23b8eeece95bbd7caeac7ac9df60 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 14 Feb 2017 22:25:23 -0500
Subject: [PATCH 148/338] TST: fix test_network.py fixture under py27

---
 pandas/tests/io/parser/test_network.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 6e762368f82c5..721d447262149 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -20,11 +20,15 @@ def salaries_table():
     return read_table(path)
 
 
-@tm.network
 @pytest.mark.parametrize(
     "compression,extension", [('gzip', '.gz'), ('bz2', '.bz2'),
                               ('zip', '.zip'), ('xz', '.xz')])
 def test_compressed_urls(salaries_table, compression, extension):
+    check_compressed_urls(salaries_table, compression, extension)
+
+
+@tm.network
+def check_compressed_urls(salaries_table, compression, extension):
     # test reading compressed urls with various engines and
     # extension inference
     base_url = ('https://github.com/pandas-dev/pandas/raw/master/'

From 49f0c75a1d465f019b3131ef91e23f02dedba038 Mon Sep 17 00:00:00 2001
From: Francesc Alted
Date: Wed, 15 Feb 2017 10:20:27 -0500
Subject: [PATCH 149/338] BLD: Numexpr 2.4.6 required

closes #15213

Author: Francesc Alted

Closes #15383 from FrancescAlted/numexpr-2.4.6 and squashes the following commits:

c417fe2 [Francesc Alted] Simplify and remove UserWarning testing on numexpr import
e1b34a9 [Francesc Alted] Force a reload of pd.computation for actually triggering the UserWarning
c081199 [Francesc Alted] Relax the exact message for the ImportError
73f0319 [Francesc Alted] numexpr requisite raised to 2.4.6
0d4ab9a [Francesc Alted] Restored the old numexpr version dependencies to adjust for old requirements
c1aae19 [Francesc Alted] Fixed a lint error
7575ba2 [Francesc Alted] Using constants instead of literals for numexpr version
7a275ce [Francesc Alted] Fixed a typo
93f54aa [Francesc Alted] numexpr section moved to Other API changes section
3b6e58b [Francesc Alted] Removed recomendation for numexpr 2.6.2
f225598 [Francesc Alted] Updated test_compat for numexpr 2.4.6
8bd4ed1 [Francesc Alted] numexpr 2.4.6 requirement moved to other enhancements section
e45b742 [Francesc Alted] Moved pinned versions in CI folder to 2.4.6 6e12e29 [Francesc Alted] Added a notice on the recommended numexpr version ac62653 [Francesc Alted] Require numexpr 2.4.6 ab79c54 [Francesc Alted] Require numexpr 2.6.2 --- ci/requirements-3.4_SLOW.run | 2 +- doc/source/install.rst | 2 +- doc/source/whatsnew/v0.20.0.txt | 4 +++- pandas/computation/__init__.py | 17 +++++------------ pandas/tests/computation/test_compat.py | 15 ++++----------- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/ci/requirements-3.4_SLOW.run b/ci/requirements-3.4_SLOW.run index 39018439a1223..90156f62c6e71 100644 --- a/ci/requirements-3.4_SLOW.run +++ b/ci/requirements-3.4_SLOW.run @@ -9,7 +9,7 @@ html5lib patsy beautiful-soup scipy -numexpr=2.4.4 +numexpr=2.4.6 pytables matplotlib lxml diff --git a/doc/source/install.rst b/doc/source/install.rst index 1c7cbc9326614..80a5d7e7d375b 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -226,7 +226,7 @@ Recommended Dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.1 or higher (excluding a buggy 2.4.4). Version 2.4.6 or higher is highly recommended. + If installed, must be Version 2.4.6 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d76e33caffbf1..26006083d81b4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1,6 +1,6 @@ .. _whatsnew_0200: -v0.20.0 (????, 2016) +v0.20.0 (????, 2017) -------------------- This is a major release from 0.19 and includes a small number of API changes, several new features, @@ -158,6 +158,7 @@ Other enhancements .. _whatsnew_0200.api_breaking: + Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -429,6 +430,7 @@ Other API Changes - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`). - Reorganization of timeseries development tests (:issue:`14854`) +- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`). .. 
_whatsnew_0200.deprecations: diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py index 9e94215eecf62..e13faf890d1f8 100644 --- a/pandas/computation/__init__.py +++ b/pandas/computation/__init__.py @@ -3,26 +3,19 @@ from distutils.version import LooseVersion _NUMEXPR_INSTALLED = False +_MIN_NUMEXPR_VERSION = "2.4.6" try: import numexpr as ne ver = ne.__version__ - _NUMEXPR_INSTALLED = ver >= LooseVersion('2.1') + _NUMEXPR_INSTALLED = ver >= LooseVersion(_MIN_NUMEXPR_VERSION) - # we specifically disallow 2.4.4 as - # has some hard-to-diagnose bugs - if ver == LooseVersion('2.4.4'): - _NUMEXPR_INSTALLED = False - warnings.warn( - "The installed version of numexpr {ver} is not supported " - "in pandas and will be not be used\n".format(ver=ver), - UserWarning) - - elif not _NUMEXPR_INSTALLED: + if not _NUMEXPR_INSTALLED: warnings.warn( "The installed version of numexpr {ver} is not supported " "in pandas and will be not be used\nThe minimum supported " - "version is 2.1\n".format(ver=ver), UserWarning) + "version is {min_ver}\n".format( + ver=ver, min_ver=_MIN_NUMEXPR_VERSION), UserWarning) except ImportError: # pragma: no cover pass diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index 599d0c10336dc..77994ac6d2f53 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -10,6 +10,7 @@ from pandas.computation.engines import _engines import pandas.computation.expr as expr +from pandas.computation import _MIN_NUMEXPR_VERSION ENGINES_PARSERS = list(product(_engines, expr._parsers)) @@ -21,15 +22,10 @@ def test_compat(): try: import numexpr as ne ver = ne.__version__ - if ver == LooseVersion('2.4.4'): + if ver < LooseVersion(_MIN_NUMEXPR_VERSION): assert not _NUMEXPR_INSTALLED - elif ver < LooseVersion('2.1'): - with tm.assert_produces_warning(UserWarning, - check_stacklevel=False): - assert not _NUMEXPR_INSTALLED else: assert _NUMEXPR_INSTALLED - except ImportError: pytest.skip("not testing numexpr version compat") @@ -51,12 +47,9 @@ def testit(): except ImportError: pytest.skip("no numexpr") else: - if ne.__version__ < LooseVersion('2.1'): - with tm.assertRaisesRegexp(ImportError, "'numexpr' version is " - ".+, must be >= 2.1"): + if ne.__version__ < LooseVersion(_MIN_NUMEXPR_VERSION): + with tm.assertRaises(ImportError): testit() - elif ne.__version__ == LooseVersion('2.4.4'): - pytest.skip("numexpr version==2.4.4") else: testit() else: From d6adeec977197f0631a4a439defd3c391a27c81c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 10:23:36 -0500 Subject: [PATCH 150/338] TST: print skipped tests files xref #15341 Author: Jeff Reback Closes #15408 from jreback/skip and squashes the following commits: 547bee6 [Jeff Reback] TST: print skipped tests files --- .travis.yml | 3 ++- ci/install_travis.sh | 1 + ci/print_skipped.py | 7 ++++--- ci/script_multi.sh | 8 ++++---- ci/script_single.sh | 8 ++++---- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6b90e49b336b2..6245213cec06f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -332,5 +332,6 @@ after_script: - echo "after_script start" - ci/install_test.sh - source activate pandas && python -c "import pandas; pandas.show_versions();" - - ci/print_skipped.py /tmp/pytest.xml + - ci/print_skipped.py /tmp/single.xml + - ci/print_skipped.py /tmp/multiple.xml - echo "after_script done" diff --git a/ci/install_travis.sh b/ci/install_travis.sh index ad804b96a0d82..802d8c9f6b776 100755 
--- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -112,6 +112,7 @@ fi source activate pandas pip install pytest-xdist + if [ "$LINT" ]; then conda install flake8 pip install cpplint diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 9fb05df64bcea..dd2180f6eeb19 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -30,20 +30,21 @@ def parse_results(filename): i += 1 assert i - 1 == len(skipped) assert i - 1 == len(skipped) - assert len(skipped) == int(root.attrib['skip']) + # assert len(skipped) == int(root.attrib['skip']) return '\n'.join(skipped) def main(args): print('SKIPPED TESTS:') - print(parse_results(args.filename)) + for fn in args.filename: + print(parse_results(fn)) return 0 def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument('filename', help='XUnit file to parse') + parser.add_argument('filename', nargs='+', help='XUnit file to parse') return parser.parse_args() diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 83f8427cc57ad..f5fbcbbc12f83 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/cov.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/cov.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas else - echo pytest -n 2 -m "not single" $TEST_ARGS pandas - pytest -n 2 -m "not single" $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/ci/script_single.sh b/ci/script_single.sh index 38021fcac5721..2d7962352842b 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas - pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" $TEST_ARGS pandas - pytest -m "single" $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" From 06ad2960efff779e1e672787a0174d6b2f58d998 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 10:24:45 -0500 Subject: [PATCH 151/338] PERF: high memory in MI closes #13904 Creates an efficient MultiIndexHashTable in cython. This allows us to efficiently store a multi-index for fast indexing (.get_loc() and .get_indexer()), with the current tuple-based (and gil holding) use of the PyObject Hash Table. This uses the pandas.tools.hashing routines to hash each of the 'values' of a MI to a single uint64. 
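For example (hypothetical interactive session; the individual hash values
are not meaningful, only their dtype and one-value-per-entry shape):

    In [1]: mi = pd.MultiIndex.from_product([list('ab'), range(3)])

    In [2]: from pandas.tools.hashing import hash_tuples

    In [3]: hash_tuples(mi).dtype
    Out[3]: dtype('uint64')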
So this makes MI more memory friendly and much more efficient. You get these speedups, because the creation of the hashtable is now much more efficient. Author: Jeff Reback Closes #15245 from jreback/mi and squashes the following commits: 7df6c34 [Jeff Reback] PERF: high memory in MI --- asv_bench/benchmarks/indexing.py | 30 +++- asv_bench/benchmarks/reindex.py | 4 +- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/core/algorithms.py | 3 +- pandas/core/frame.py | 3 +- pandas/hashtable.pxd | 8 + pandas/index.pyx | 39 ++++- pandas/indexes/base.py | 5 +- pandas/indexes/multi.py | 203 ++++++++++++++++++---- pandas/io/pytables.py | 4 +- pandas/src/algos_common_helper.pxi.in | 4 +- pandas/src/hashtable_class_helper.pxi.in | 152 +++++++++++++--- pandas/tests/frame/test_mutate_columns.py | 29 +++- pandas/tests/frame/test_repr_info.py | 32 ++++ pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/indexes/test_multi.py | 136 +++++++++++++-- pandas/tests/indexing/test_multiindex.py | 3 +- pandas/tests/test_multilevel.py | 4 +- pandas/tests/tools/test_hashing.py | 12 ++ pandas/tests/tools/test_join.py | 6 +- pandas/tools/hashing.py | 44 +++-- pandas/types/cast.py | 3 +- 22 files changed, 605 insertions(+), 125 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 27cd320c661e0..d938cc6a6dc4d 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -88,7 +88,7 @@ def setup(self): def time_getitem_scalar(self): self.ts[self.dt] - + class DataFrameIndexing(object): goal_time = 0.2 @@ -189,6 +189,15 @@ def setup(self): self.eps_C = 5 self.eps_D = 5000 self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() + self.miint = MultiIndex.from_product( + [np.arange(1000), + np.arange(1000)], names=['one', 'two']) + + import string + self.mistring = MultiIndex.from_product( + [np.arange(1000), + np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) def time_series_xs_mi_ix(self): self.s.ix[999] @@ -197,7 +206,24 @@ def time_frame_xs_mi_ix(self): self.df.ix[999] def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + self.mdt2.loc[self.idx[ + (self.test_A - self.eps_A):(self.test_A + self.eps_A), + (self.test_B - self.eps_B):(self.test_B + self.eps_B), + (self.test_C - self.eps_C):(self.test_C + self.eps_C), + (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + + def time_multiindex_get_indexer(self): + self.miint.get_indexer( + np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object)) + + def time_multiindex_string_get_loc(self): + self.mistring.get_loc((999, 19, 'Z')) + + def time_is_monotonic(self): + self.miint.is_monotonic class PanelIndexing(object): diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8db0cd7629332..6fe6c32a96df9 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -16,8 +16,8 @@ def setup(self): data=np.random.rand(10000, 30), columns=range(30)) # multi-index - N = 1000 - K = 20 + N = 5000 + K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) level2 = np.tile(tm.makeStringIndex(K).values, N) index = MultiIndex.from_arrays([level1, level2]) diff --git a/doc/source/whatsnew/v0.20.0.txt 
b/doc/source/whatsnew/v0.20.0.txt index 26006083d81b4..4708abe4d592e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -472,7 +472,7 @@ Performance Improvements - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - +- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. @@ -502,6 +502,8 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) + +- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 05cfb1bd9ec27..c922ac21e12eb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = _ensure_int64(indexer) + indexer = _ensure_int64(indexer, copy=False) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False @@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 16f8d4658dc20..9c66f6dbb273e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1752,7 +1752,8 @@ def _sizeof_fmt(num, size_qualifier): # all cases (e.g., it misses categorical data even with object # categories) deep = False - if 'object' in counts or is_object_dtype(self.index): + if ('object' in counts or + self.index._is_memory_usage_qualified()): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index cabfa43a76f26..9b352ae1c003b 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -31,6 +31,14 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) +cdef class MultiIndexHashTable(HashTable): + cdef: + kh_uint64_t *table + object mi + + cpdef get_item(self, object val) + cpdef set_item(self, object key, Py_ssize_t val) + cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/index.pyx b/pandas/index.pyx index 0c975d1775a03..37fe7d90bebe0 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -182,7 +182,7 @@ cdef class IndexEngine: Py_ssize_t i, n int last_true - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) n = len(values) result = np.empty(n, dtype=bool) @@ -284,7 +284,6 @@ cdef class IndexEngine: if not self.is_mapping_populated: values = self._get_index_values() - self.mapping = 
self._make_hash_table(len(values)) self.mapping.map_locations(values) @@ -322,7 +321,7 @@ cdef class IndexEngine: Py_ssize_t i, j, n, n_t, n_alloc self._ensure_mapping_populated() - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) stargets = set(targets) n = len(values) n_t = len(targets) @@ -554,5 +553,39 @@ cdef inline bint _is_utc(object tz): return tz is UTC or isinstance(tz, _du_utc) +cdef class MultiIndexEngine(IndexEngine): + + def _call_monotonic(self, object mi): + # defer these back to the mi itself + return (mi.is_monotonic_increasing, + mi.is_monotonic_decreasing, + mi.is_unique) + + def get_backfill_indexer(self, other, limit=None): + # we coerce to ndarray-of-tuples + values = np.array(self._get_index_values()) + return algos.backfill_object(values, other, limit=limit) + + def get_pad_indexer(self, other, limit=None): + # we coerce to ndarray-of-tuples + values = np.array(self._get_index_values()) + return algos.pad_object(values, other, limit=limit) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError("'{val}' is an invalid key".format(val=val)) + + self._ensure_mapping_populated() + if not self.unique: + return self._get_loc_duplicates(val) + + try: + return self.mapping.get_item(val) + except TypeError: + raise KeyError(val) + + cdef _make_hash_table(self, n): + return _hash.MultiIndexHashTable(n) + # Generated from template. include "index_class_helper.pxi" diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index bb2941a121452..c483fb0764a4c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1431,6 +1431,10 @@ def inferred_type(self): """ return a string of the type inferred from the values """ return lib.infer_dtype(self) + def _is_memory_usage_qualified(self): + """ return a boolean if we need a qualified .info display """ + return self.is_object() + def is_type_compatible(self, kind): return kind == self.inferred_type @@ -2446,7 +2450,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): 'if index and target are monotonic' % method) side = 'left' if method == 'pad' else 'right' - target = np.asarray(target) # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 653ba1fee5691..57739548a17d6 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -14,7 +14,6 @@ from pandas.compat.numpy import function as nv from pandas import compat - from pandas.types.common import (_ensure_int64, _ensure_platform_int, is_object_dtype, @@ -73,6 +72,7 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() _comparables = ['names'] + _engine_type = _index.MultiIndexEngine rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, @@ -114,7 +114,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, result._verify_integrity() if _set_identity: result._reset_identity() - return result def _verify_integrity(self, labels=None, levels=None): @@ -429,6 +428,12 @@ def _shallow_copy(self, values=None, **kwargs): def dtype(self): return np.dtype('O') + def _is_memory_usage_qualified(self): + """ return a boolean if we need a qualified .info display """ + def f(l): + return 'mixed' in l or 'string' in l or 'unicode' in l + return any([f(l) for l in self._inferred_type_levels]) + @Appender(Index.memory_usage.__doc__) def memory_usage(self, deep=False): # we are overwriting
our base class to avoid @@ -619,6 +624,10 @@ def _get_level_number(self, level): _tuples = None + @cache_readonly + def _engine(self): + return self._engine_type(lambda: self, len(self)) + @property def values(self): if self._tuples is not None: @@ -655,10 +664,95 @@ def _has_complex_internals(self): # to disable groupby tricks return True + @cache_readonly + def is_monotonic(self): + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + return self.is_monotonic_increasing + + @cache_readonly + def is_monotonic_increasing(self): + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + + # reversed() because lexsort() wants the most significant key last. + values = [self._get_level_values(i) + for i in reversed(range(len(self.levels)))] + try: + sort_order = np.lexsort(values) + return Index(sort_order).is_monotonic + except TypeError: + + # we have mixed types and np.lexsort is not happy + return Index(self.values).is_monotonic + + @property + def is_monotonic_decreasing(self): + """ + return if the index is monotonic decreasing (only equal or + decreasing) values. + """ + return False + @cache_readonly + def is_unique(self): + return not self.duplicated().any() + + @cache_readonly + def _have_mixed_levels(self): + """ return a boolean list indicating if we have mixed levels """ + return ['mixed' in l for l in self._inferred_type_levels] + + @cache_readonly + def _inferred_type_levels(self): + """ return a list of the inferred types, one for each level """ + return [i.inferred_type for i in self.levels] + + @cache_readonly + def _hashed_values(self): + """ return a uint64 ndarray of my hashed values """ + from pandas.tools.hashing import hash_tuples + return hash_tuples(self) + + def _hashed_indexing_key(self, key): + """ + validate and return the hash for the provided key + + *this is internal, for use by the cython routines* + + Parameters + ---------- + key : string or tuple + + Returns + ------- + np.uint64 + + Notes + ----- + we need to stringify if we have mixed levels + + """ + from pandas.tools.hashing import hash_tuples + + if not isinstance(key, tuple): + return hash_tuples(key) + + if not len(key) == self.nlevels: + raise KeyError + + def f(k, stringify): + if stringify and not isinstance(k, compat.string_types): + k = str(k) + return k + key = tuple([f(k, stringify) + for k, stringify in zip(key, self._have_mixed_levels)]) + return hash_tuples(key) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) @@ -748,26 +842,44 @@ def _try_mi(k): raise InvalidIndexError(key) - def get_level_values(self, level): + def _get_level_values(self, level): """ - Return vector of label values for requested level, equal to the length - of the index + Return vector of label values for requested level, + equal to the length of the index + + **this is an internal method** Parameters ---------- - level : int or level name + level : int level Returns ------- values : ndarray """ - num = self._get_level_number(level) - unique = self.levels[num] # .values - labels = self.labels[num] - filled = algos.take_1d(unique.values, labels, + + unique = self.levels[level] + labels = self.labels[level] + filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) - values = unique._shallow_copy(filled) - return values + return filled + + def get_level_values(self, level): + """ + Return vector of label values for requested level,
equal to the length of the index + + Parameters + ---------- + level : int or level name + + Returns + ------- + values : Index + """ + level = self._get_level_number(level) + values = self._get_level_values(level) + return self.levels[level]._shallow_copy(values) def format(self, space=2, sparsify=None, adjoin=True, names=False, na_rep=None, formatter=None): @@ -852,7 +964,8 @@ def to_frame(self, index=True): from pandas import DataFrame result = DataFrame({(name or level): self.get_level_values(level) for name, level in - zip(self.names, range(len(self.levels)))}) + zip(self.names, range(len(self.levels)))}, + copy=False) if index: result.index = self return result @@ -1482,29 +1595,41 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = _ensure_index(target) - target_index = target - if isinstance(target, MultiIndex): - target_index = target._tuple_index + # empty indexer + if is_list_like(target) and not len(target): + return _ensure_platform_int(np.array([])) + + if not isinstance(target, MultiIndex): + try: + target = MultiIndex.from_tuples(target) + except (TypeError, ValueError): - if not is_object_dtype(target_index.dtype): - return np.ones(len(target_index)) * -1 + # let's instead try with a straight Index + if method is None: + return Index(self.values).get_indexer(target, + method=method, + limit=limit, + tolerance=tolerance) if not self.is_unique: raise Exception('Reindexing only valid with uniquely valued Index ' 'objects') - self_index = self._tuple_index - if method == 'pad' or method == 'backfill': if tolerance is not None: raise NotImplementedError("tolerance not implemented yet " 'for MultiIndex') - indexer = self_index._get_fill_indexer(target, method, limit) + indexer = self._get_fill_indexer(target, method, limit) elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for MultiIndex; see GitHub issue 9365') else: - indexer = self_index._engine.get_indexer(target._values) + # we may not compare equally because of hashing if we + # don't have the same dtypes + if self._inferred_type_levels != target._inferred_type_levels: + return Index(self.values).get_indexer(target.values) + + indexer = self._engine.get_indexer(target) return _ensure_platform_int(indexer) @@ -1571,17 +1696,6 @@ def reindex(self, target, method=None, level=None, limit=None, return target, indexer - @cache_readonly - def _tuple_index(self): - """ - Convert MultiIndex to an Index of tuples - - Returns - ------- - index : Index - """ - return Index(self._values) - def get_slice_bound(self, label, side, kind): if not isinstance(label, tuple): @@ -1828,8 +1942,9 @@ def partial_selection(key, indexer=None): key = tuple(self[indexer].tolist()[0]) - return (self._engine.get_loc(_values_from_object(key)), - None) + return (self._engine.get_loc( + _values_from_object(key)), None) + else: return partial_selection(key) else: @@ -2115,10 +2230,24 @@ def equals(self, other): return False for i in range(self.nlevels): + slabels = self.labels[i] + slabels = slabels[slabels != -1] svalues = algos.take_nd(np.asarray(self.levels[i]._values), - self.labels[i], allow_fill=False) + slabels, allow_fill=False) + + olabels = other.labels[i] + olabels = olabels[olabels != -1] ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - other.labels[i], allow_fill=False) + olabels, allow_fill=False) + + # since we use NaT both datetime64 and timedelta64 + # we can have a situation where a level is typed say 
+ # timedelta64 in self (IOW it has other values than NaT) + # but typed datetime64 in other (where it's all NaT) + # but these are equivalent + if len(svalues) == 0 and len(ovalues) == 0: + continue + if not array_equivalent(svalues, ovalues): return False diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7661cd19e4d14..31895c2ac54c3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3790,9 +3790,9 @@ def read(self, where=None, columns=None, **kwargs): lp = DataFrame(c.data, index=long_index, columns=c.values) # need a better algorithm - tuple_index = long_index._tuple_index + tuple_index = long_index.values - unique_tuples = lib.fast_unique(tuple_index.values) + unique_tuples = lib.fast_unique(tuple_index) unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 42089f9520ab6..b83dec1d26242 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -579,12 +579,12 @@ def get_dispatch(dtypes): {{for name, c_type, dtype in get_dispatch(dtypes)}} -cpdef ensure_{{name}}(object arr): +cpdef ensure_{{name}}(object arr, copy=True): if util.is_array(arr): if (<ndarray> arr).descr.type_num == NPY_{{c_type}}: return arr else: - return arr.astype(np.{{dtype}}) + return arr.astype(np.{{dtype}}, copy=copy) else: return np.array(arr, dtype=np.{{dtype}}) diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index ef385ba7dca1c..3ce82dace40a9 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -262,13 +262,6 @@ cdef class {{name}}HashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, {{dtype}}_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): cdef: khiter_t k @@ -501,18 +494,6 @@ cdef class StringHashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef: - Py_ssize_t i, val - char *v - - v = util.get_c_string(key) - - for i in range(iterations): - k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k @@ -755,15 +736,6 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k @@ -874,3 +846,127 @@ cdef class PyObjectHashTable(HashTable): count += 1 return np.asarray(labels) + + +cdef class MultiIndexHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_uint64() + self.mi = None + kh_resize_uint64(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + kh_destroy_uint64(self.table) + self.table = NULL + + def __len__(self): + return self.table.size + + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(uint64_t) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + + def
_check_for_collisions(self, int64_t[:] locs, object mi): + # validate that the locs map to the actual values + # provided in the mi + # we can only check if we *don't* have any missing values + # :< + cdef: + ndarray[int64_t] alocs + + alocs = np.asarray(locs) + if (alocs != -1).all(): + + result = self.mi.take(locs) + if isinstance(mi, tuple): + from pandas import Index + mi = Index([mi]) + if not result.equals(mi): + raise AssertionError( + "hash collision\nlocs:\n{}\n" + "result:\n{}\nmi:\n{}".format(alocs, result, mi)) + + def __contains__(self, object key): + try: + self.get_item(key) + return True + except (KeyError, ValueError, TypeError): + return False + + cpdef get_item(self, object key): + cdef: + khiter_t k + uint64_t value + int64_t[:] locs + Py_ssize_t loc + + value = self.mi._hashed_indexing_key(key) + k = kh_get_uint64(self.table, value) + if k != self.table.n_buckets: + loc = self.table.vals[k] + locs = np.array([loc], dtype=np.int64) + self._check_for_collisions(locs, key) + return loc + else: + raise KeyError(key) + + cpdef set_item(self, object key, Py_ssize_t val): + raise NotImplementedError + + @cython.boundscheck(False) + def map_locations(self, object mi): + cdef: + Py_ssize_t i, n + ndarray[uint64_t] values + uint64_t val + int ret = 0 + khiter_t k + + self.mi = mi + n = len(mi) + values = mi._hashed_values + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_uint64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, object mi): + # look up with a target mi + cdef: + Py_ssize_t i, n + ndarray[uint64_t] values + int ret = 0 + uint64_t val + khiter_t k + int64_t[:] locs + + n = len(mi) + values = mi._hashed_values + + locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_uint64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + self._check_for_collisions(locs, mi) + return np.asarray(locs) + + def unique(self, object mi): + raise NotImplementedError + + def get_labels(self, object mi, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + raise NotImplementedError diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 6b4c56747c981..fe3f3c554a9b5 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import print_function - +import pytest from pandas.compat import range, lrange import numpy as np -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, MultiIndex from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -165,6 +165,31 @@ def test_delitem(self): del self.frame['A'] self.assertNotIn('A', self.frame) + def test_delitem_multiindex(self): + midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + df = DataFrame(np.random.randn(4, 4), columns=midx) + assert len(df.columns) == 4 + assert ('A', ) in df.columns + assert 'A' in df.columns + + result = df['A'] + assert isinstance(result, DataFrame) + del df['A'] + + assert len(df.columns) == 2 + + # A still in the levels, BUT get a KeyError if trying + # to delete + assert ('A', ) not in df.columns + with pytest.raises(KeyError): + del df[('A',)] + + # xref: https://github.com/pandas-dev/pandas/issues/2770 + # the 'A' is STILL in the columns! 
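# A rough pure-Python sketch of the idea behind MultiIndexHashTable above:
# every row of a MultiIndex is reduced to a single uint64 via hash_tuples,
# the hash -> position mapping lives in one flat table, and each probe is
# re-checked against the real labels to guard against hash collisions
# (cf. _check_for_collisions). Illustrative only -- this is not the cython
# implementation; the import path matches the pandas tree in this patch.
from pandas import MultiIndex
from pandas.tools.hashing import hash_tuples

mi = MultiIndex.from_product([range(3), list('ab')])
table = {h: i for i, h in enumerate(hash_tuples(mi))}  # cf. map_locations

key = (2, 'b')
loc = table[hash_tuples([key])[0]]  # probe by the row hash, cf. get_item
assert mi[loc] == key               # verify against the actual labels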
+ assert 'A' in df.columns + with pytest.raises(KeyError): + del df['A'] + def test_pop(self): self.frame.columns.name = 'baz' diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 2df297d03bcdf..024e11e63a924 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -301,10 +301,12 @@ def test_info_memory_usage(self): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) buf = StringIO() + # display memory usage case df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() self.assertTrue("memory usage: " in res[-1]) + # do not display memory usage case df.info(buf=buf, memory_usage=False) res = buf.getvalue().splitlines() @@ -312,11 +314,13 @@ def test_info_memory_usage(self): df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() + # memory usage is a lower bound, so print it as XYZ+ MB self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) df.iloc[:, :5].info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() + # excluded column with object dtype, so estimate is accurate self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1])) @@ -380,6 +384,34 @@ def test_info_memory_usage(self): diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df) self.assertTrue(abs(diff) < 100) + def test_info_memory_usage_qualified(self): + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=[1, 2, 3]) + df.info(buf=buf) + self.assertFalse('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=list('ABC')) + df.info(buf=buf) + self.assertTrue('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=pd.MultiIndex.from_product( + [range(3), range(3)])) + df.info(buf=buf) + self.assertFalse('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=pd.MultiIndex.from_product( + [range(3), ['foo', 'bar']])) + df.info(buf=buf) + self.assertTrue('+' in buf.getvalue()) + def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 # memory usage introspection should not materialize .values diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3a6a9eaaa8e72..d53446870beb1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1588,7 +1588,7 @@ def test_groupby_as_index_cython(self): result = grouped.mean() expected = data.groupby(['A', 'B']).mean() - arrays = lzip(*expected.index._tuple_index) + arrays = lzip(*expected.index.values) expected.insert(0, 'A', arrays[0]) expected.insert(1, 'B', arrays[1]) expected.index = np.arange(len(expected)) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 702c4758da245..5611492b4af1b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1046,6 +1046,21 @@ def test_contains(self): self.assertNotIn(('bar', 'two'), self.index) self.assertNotIn(None, self.index) + def test_contains_top_level(self): + midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + assert 'A' in midx + assert 'A' not in midx._engine + + def test_contains_with_nat(self): + # MI with a NaT + mi = MultiIndex(levels=[['C'], + pd.date_range('2012-01-01', periods=5)], + labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, 'B']) + self.assertTrue(('C', pd.Timestamp('2012-01-01')) in mi) + for val in mi.values: + self.assertTrue(val in mi) + def test_is_all_dates(self):
self.assertFalse(self.index.is_all_dates) @@ -1102,6 +1117,17 @@ def test_get_loc_duplicates(self): xp = 0 assert (rs == xp) + def test_get_value_duplicates(self): + index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + + assert index.get_loc('D') == slice(0, 3) + with pytest.raises(KeyError): + index._engine.get_value(np.array([]), 'D') + def test_get_loc_level(self): index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( @@ -1294,7 +1320,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rbfill1) # pass non-MultiIndex - r1 = idx1.get_indexer(idx2._tuple_index) + r1 = idx1.get_indexer(idx2.values) rexp1 = idx1.get_indexer(idx2) assert_almost_equal(r1, rexp1) @@ -1316,6 +1342,19 @@ def test_get_indexer_nearest(self): with tm.assertRaises(NotImplementedError): midx.get_indexer(['a'], method='pad', tolerance=2) + def test_hash_collisions(self): + # non-smoke test that we don't get hash collisions + + index = MultiIndex.from_product([np.arange(1000), np.arange(1000)], + names=['one', 'two']) + result = index.get_indexer(index.values) + self.assert_numpy_array_equal(result, + np.arange(len(index), dtype='int64')) + + for i in [0, 1, len(index) - 2, len(index) - 1]: + result = index.get_loc(index[i]) + self.assertEqual(result, i) + def test_format(self): self.index.format() self.index[:0].format() @@ -1420,12 +1459,13 @@ def test_bounds(self): self.index._bounds def test_equals_multi(self): - self.assertTrue(self.index.equals(self.index)) - self.assertTrue(self.index.equal_levels(self.index)) - - self.assertFalse(self.index.equals(self.index[:-1])) + assert self.index.equals(self.index) + assert not self.index.equals(self.index.values) + assert self.index.equals(Index(self.index.values)) - self.assertTrue(self.index.equals(self.index._tuple_index)) + assert self.index.equal_levels(self.index) + assert not self.index.equals(self.index[:-1]) + assert not self.index.equals(self.index[-1]) # different number of levels index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( @@ -1433,8 +1473,8 @@ def test_equals_multi(self): [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) index2 = MultiIndex(levels=index.levels[:-1], labels=index.labels[:-1]) - self.assertFalse(index.equals(index2)) - self.assertFalse(index.equal_levels(index2)) + assert not index.equals(index2) + assert not index.equal_levels(index2) # levels are different major_axis = Index(lrange(4)) @@ -1445,8 +1485,8 @@ def test_equals_multi(self): index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) - self.assertFalse(self.index.equals(index)) - self.assertFalse(self.index.equal_levels(index)) + assert not self.index.equals(index) + assert not self.index.equal_levels(index) # some of the labels are different major_axis = Index(['foo', 'bar', 'baz', 'qux']) @@ -1457,7 +1497,16 @@ def test_equals_multi(self): index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) - self.assertFalse(self.index.equals(index)) + assert not self.index.equals(index) + + def test_equals_missing_values(self): + # make sure take is not using -1 + i = pd.MultiIndex.from_tuples([(0, pd.NaT), + (0, pd.Timestamp('20130101'))]) + result = i[0:1].equals(i[0]) + self.assertFalse(result) + result = i[1:2].equals(i[1]) + self.assertFalse(result) def 
test_identical(self): mi = self.index.copy() @@ -1510,7 +1559,7 @@ def test_union(self): the_union = piece1 | piece2 - tups = sorted(self.index._tuple_index) + tups = sorted(self.index.values) expected = MultiIndex.from_tuples(tups) self.assertTrue(the_union.equals(expected)) @@ -1523,7 +1572,7 @@ def test_union(self): self.assertIs(the_union, self.index) # won't work in python 3 - # tuples = self.index._tuple_index + # tuples = self.index.values # result = self.index[:4] | tuples[4:] # self.assertTrue(result.equals(tuples)) @@ -1543,7 +1592,7 @@ def test_intersection(self): piece2 = self.index[3:] the_int = piece1 & piece2 - tups = sorted(self.index[3:5]._tuple_index) + tups = sorted(self.index[3:5].values) expected = MultiIndex.from_tuples(tups) self.assertTrue(the_int.equals(expected)) @@ -1557,7 +1606,7 @@ def test_intersection(self): self.assertTrue(empty.equals(expected)) # can't do in python 3 - # tuples = self.index._tuple_index + # tuples = self.index.values # result = self.index & tuples # self.assertTrue(result.equals(tuples)) @@ -1616,7 +1665,7 @@ def test_difference(self): self.assertEqual(len(result), 0) # raise Exception called with non-MultiIndex - result = first.difference(first._tuple_index) + result = first.difference(first.values) self.assertTrue(result.equals(first[:0])) # name from empty array @@ -1642,7 +1691,7 @@ def test_from_tuples(self): def test_argsort(self): result = self.index.argsort() - expected = self.index._tuple_index.argsort() + expected = self.index.values.argsort() tm.assert_numpy_array_equal(result, expected) def test_sortlevel(self): @@ -2297,11 +2346,60 @@ def test_level_setting_resets_attributes(self): ind = MultiIndex.from_arrays([ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] ]) - assert ind.is_monotonic + self.assertTrue(ind.is_monotonic) ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], inplace=True) + # if this fails, probably didn't reset the cache correctly. 
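# A sketch of the lexsort-based check behind the new
# MultiIndex.is_monotonic_increasing (see pandas/indexes/multi.py above),
# using plain numpy arrays in place of the per-level label values;
# illustrative only.
import numpy as np

level0 = np.array([0, 0, 1, 1])  # most significant level
level1 = np.array([0, 1, 0, 1])

# np.lexsort treats its *last* key as the most significant one, hence the
# reversed() in the real method.
order = np.lexsort([level1, level0])

# lexsort is stable, so the index is monotonic iff the sort order is
# already the identity permutation.
assert (order == np.arange(len(order))).all()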
- assert not ind.is_monotonic + self.assertFalse(ind.is_monotonic) + + def test_is_monotonic(self): + i = MultiIndex.from_product([np.arange(10), + np.arange(10)], names=['one', 'two']) + self.assertTrue(i.is_monotonic) + self.assertTrue(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([np.arange(10, 0, -1), + np.arange(10)], names=['one', 'two']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([np.arange(10), + np.arange(10, 0, -1)], + names=['one', 'two']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + # string ordering + i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], + ['mom', 'next', 'zenith']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.assertTrue(i.is_monotonic) + self.assertTrue(Index(i.values).is_monotonic) + + # mixed levels, hits the TypeError + i = MultiIndex( + levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237', + 'nl0000289783', + 'nl0000289965', 'nl0000301109']], + labels=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + names=['household_id', 'asset_id']) + + self.assertFalse(i.is_monotonic) def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 1e6ecbbcdc756..b6b9ac93b234c 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -413,9 +413,10 @@ def f(): df.loc[idx[:, :, 'Stock'], 'price'] *= 2 tm.assert_frame_equal(df, expected) - def test_getitem_multiindex(self): + def test_getitem_duplicates_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so it doesn't raise # the appropriate error, only in PY3 of course!
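# The "mixed levels, hits the TypeError" case above, sketched: on Python 3
# np.lexsort cannot order int against str keys, so is_monotonic_increasing
# catches the TypeError and falls back to Index(self.values).is_monotonic
# over the materialized tuples. Illustrative only.
import numpy as np

mixed = np.array([1, 'gb00b03mlx29', 2], dtype=object)
try:
    np.lexsort([mixed])
except TypeError:
    pass  # the fallback path: compare whole tuples instead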
+ index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8e0628eefa392..0f36af2c8c4e7 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1469,7 +1469,7 @@ def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' - arrays = [np.array(x) for x in zip(*df.columns._tuple_index)] + arrays = [np.array(x) for x in zip(*df.columns.values)] result = df['foo'] result2 = df.loc[:, 'foo'] @@ -1493,7 +1493,7 @@ def test_series_getitem_not_sorted(self): index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) - arrays = [np.array(x) for x in zip(*index._tuple_index)] + arrays = [np.array(x) for x in zip(*index.values)] result = s['qux'] result2 = s.loc['qux'] diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 05a352f259e8b..9bed0d428bc41 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -152,6 +152,18 @@ def test_categorical_consistency(self): tm.assert_series_equal(h1, h2) tm.assert_series_equal(h1, h3) + def test_categorical_with_nan_consistency(self): + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], + categories=pd.date_range('2012-01-01', periods=5, name='B')) + expected = hash_array(c, categorize=False) + c = pd.Categorical.from_codes( + [-1, 0], + categories=[pd.Timestamp('2012-01-01')]) + result = hash_array(c, categorize=False) + assert result[0] in expected + assert result[1] in expected + def test_pandas_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: diff --git a/pandas/tests/tools/test_join.py b/pandas/tests/tools/test_join.py index ab42b1212301b..ee6b3d57b852d 100644 --- a/pandas/tests/tools/test_join.py +++ b/pandas/tests/tools/test_join.py @@ -7,7 +7,7 @@ from pandas.compat import lrange import pandas.compat as compat from pandas.util.testing import assert_frame_equal -from pandas import DataFrame, MultiIndex, Series, merge, concat +from pandas import DataFrame, MultiIndex, Series, Index, merge, concat import pandas._join as _join import pandas.util.testing as tm @@ -368,7 +368,7 @@ def test_join_multiindex(self): df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index.union(index2._tuple_index) + ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) @@ -378,7 +378,7 @@ def test_join_multiindex(self): df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) - ex_index = index1._tuple_index.union(index2._tuple_index) + ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 800e0b8815443..ef863510cdd87 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -5,7 +5,6 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex -import pandas.core.algorithms as algos from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, @@ -142,20 +141,18 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): if not 
isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) - # create a list-of-ndarrays - def get_level_values(num): - unique = vals.levels[num] # .values - labels = vals.labels[num] - filled = algos.take_1d(unique._values, labels, - fill_value=unique._na_value) - return filled - - vals = [get_level_values(level) + # create a list-of-Categoricals + vals = [Categorical(vals.labels[level], + vals.levels[level], + ordered=False, + fastpath=True) for level in range(vals.nlevels)] # hash the list-of-ndarrays - hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) - for l in vals) + hashes = (_hash_categorical(cat, + encoding=encoding, + hash_key=hash_key) + for cat in vals) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] @@ -178,9 +175,26 @@ def _hash_categorical(c, encoding, hash_key): ------- ndarray of hashed values array, same size as len(c) """ - cat_hashed = hash_array(c.categories.values, encoding, hash_key, - categorize=False).astype(np.uint64, copy=False) - return c.rename_categories(cat_hashed).astype(np.uint64, copy=False) + hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False) + + # we have uint64, as we don't directly support missing values + # we don't want to use take_nd which will coerce to float + # instead, directly construct the result with a + # max(np.uint64) as the missing value indicator + # + # TODO: GH 15362 + + mask = c.isnull() + if len(hashed): + result = hashed.take(c.codes) + else: + result = np.zeros(len(mask), dtype='uint64') + + if mask.any(): + result[mask] = np.iinfo(np.uint64).max + + return result def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 6b1c3f9c00351..b1a17df64aecf 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -12,7 +12,8 @@ is_datetime64tz_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_dtype_equal, is_float_dtype, is_complex_dtype, - is_integer_dtype, is_datetime_or_timedelta_dtype, + is_integer_dtype, + is_datetime_or_timedelta_dtype, is_bool_dtype, is_scalar, _string_dtypes, _coerce_to_dtype, From af89a0433d495e48fbaf46dc49c7779c0fc17ecb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 12:59:11 -0500 Subject: [PATCH 152/338] STYLE: flake8 upgraded to 3.3 on conda (#15412) fixes for E305, 2 blank lines after a class definition --- pandas/compat/numpy/__init__.py | 1 + pandas/compat/numpy/function.py | 7 +++++++ pandas/computation/expr.py | 1 + pandas/core/algorithms.py | 2 ++ pandas/core/config.py | 1 + pandas/core/config_init.py | 2 ++ pandas/core/frame.py | 2 +- pandas/core/indexing.py | 2 ++ pandas/formats/format.py | 2 +- pandas/indexes/numeric.py | 3 +++ pandas/indexes/range.py | 1 + pandas/io/common.py | 2 ++ pandas/io/excel.py | 5 +++++ pandas/io/gbq.py | 1 + pandas/io/packers.py | 2 ++ pandas/io/parsers.py | 2 ++ pandas/io/pytables.py | 4 ++++ pandas/io/sql.py | 1 + pandas/io/stata.py | 1 + pandas/msgpack/__init__.py | 1 + pandas/sparse/frame.py | 1 + pandas/sparse/series.py | 1 + pandas/stats/moments.py | 3 +++ pandas/tests/sparse/test_libsparse.py | 2 +- pandas/tests/test_generic.py | 1 + pandas/tools/merge.py | 5 +++++ pandas/tools/plotting.py | 1 + pandas/tseries/frequencies.py | 2 ++ pandas/tseries/holiday.py | 2 ++ pandas/tseries/index.py | 1 + pandas/tseries/interval.py | 3 --- pandas/tseries/offsets.py | 4 ++++ pandas/tseries/resample.py | 4 ++++ pandas/tseries/timedeltas.py | 1 + pandas/types/generic.py | 1 + pandas/util/print_versions.py | 1 +
pandas/util/terminal.py | 1 + pandas/util/testing.py | 3 +++ 38 files changed, 74 insertions(+), 6 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index bfd770d7af2c6..4a9a2647ece0f 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -67,6 +67,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs): return np.array(arr, *args, **kwargs) + __all__ = ['np', '_np_version_under1p8', '_np_version_under1p9', diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index eb9e9ecc359b2..4053994efa005 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -55,6 +55,7 @@ def __call__(self, args, kwargs, fname=None, raise ValueError("invalid validation method " "'{method}'".format(method=method)) + ARGMINMAX_DEFAULTS = dict(out=None) validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin', method='both', max_fname_arg_count=1) @@ -97,6 +98,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): validate_argmax(args, kwargs) return skipna + ARGSORT_DEFAULTS = OrderedDict() ARGSORT_DEFAULTS['axis'] = -1 ARGSORT_DEFAULTS['kind'] = 'quicksort' @@ -121,6 +123,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs): validate_argsort(args, kwargs, max_fname_arg_count=1) return ascending + CLIP_DEFAULTS = dict(out=None) validate_clip = CompatValidator(CLIP_DEFAULTS, fname='clip', method='both', max_fname_arg_count=3) @@ -141,6 +144,7 @@ def validate_clip_with_axis(axis, args, kwargs): validate_clip(args, kwargs) return axis + COMPRESS_DEFAULTS = OrderedDict() COMPRESS_DEFAULTS['axis'] = None COMPRESS_DEFAULTS['out'] = None @@ -170,6 +174,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): validate_cum_func(args, kwargs, fname=name) return skipna + LOGICAL_FUNC_DEFAULTS = dict(out=None) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') @@ -236,6 +241,7 @@ def validate_take_with_convert(convert, args, kwargs): validate_take(args, kwargs, max_fname_arg_count=3, method='both') return convert + TRANSPOSE_DEFAULTS = dict(axes=None) validate_transpose = CompatValidator(TRANSPOSE_DEFAULTS, fname='transpose', method='both', max_fname_arg_count=0) @@ -318,6 +324,7 @@ def validate_groupby_func(name, args, kwargs, allowed=None): "with groupby. Use .groupby(...)." 
"{func}() instead".format(func=name))) + RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod', 'mean', 'std', 'var') diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f1cf210754d12..a782287175327 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -669,6 +669,7 @@ def visitor(x, y): operands = node.values return reduce(visitor, operands) + # ast.Call signature changed on 3.5, # conditionally change which methods is named # visit_Call depending on Python version, #11097 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c922ac21e12eb..4ae46fe33a5cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -926,6 +926,7 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr): else: return inds + _dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'} @@ -959,6 +960,7 @@ def _hashtable_algo(f, values, return_dtype=None): # use Object return f(htable.PyObjectHashTable, _ensure_object) + _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), diff --git a/pandas/core/config.py b/pandas/core/config.py index ed63c865ebfb4..1c0eb60b8ec2f 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -804,6 +804,7 @@ def inner(x): return inner + # common type validators, for convenience # usage: register_option(... , validator = is_int) is_int = is_type_factory(int) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index fe47391c9ff81..d3db633f3aa04 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -278,6 +278,7 @@ def mpl_style_cb(key): return val + with cf.config_prefix('display'): cf.register_option('precision', 6, pc_precision_doc, validator=is_int) cf.register_option('float_format', None, float_format_doc, @@ -380,6 +381,7 @@ def use_inf_as_null_cb(key): from pandas.types.missing import _use_inf_as_null _use_inf_as_null(key) + with cf.config_prefix('mode'): cf.register_option('use_inf_as_null', False, use_inf_as_null_doc, cb=use_inf_as_null_cb) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c66f6dbb273e..f7c306ea7ce95 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5741,9 +5741,9 @@ def _from_nested_dict(data): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) + # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - DataFrame.plot = base.AccessorProperty(gfx.FramePlotMethods, gfx.FramePlotMethods) DataFrame.hist = gfx.hist_frame diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6bb2d1c479844..66510a7708e64 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -36,6 +36,7 @@ def get_indexers_list(): ('iat', _iAtIndexer), ] + # "null slice" _NS = slice(None, None) @@ -1850,6 +1851,7 @@ def _convert_key(self, key, is_setter=False): "indexers") return key + # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 1a7a06199ad8a..6b235b5e1bc33 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2479,9 +2479,9 @@ def _has_names(index): else: return index.name is not None + # ----------------------------------------------------------------------------- # Global formatting options - _initial_defencoding = None diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 0b9b337731d7f..00ddf5b0c918d 100644 --- a/pandas/indexes/numeric.py 
+++ b/pandas/indexes/numeric.py @@ -159,6 +159,7 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + Int64Index._add_numeric_methods() Int64Index._add_logical_methods() @@ -238,6 +239,7 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + UInt64Index._add_numeric_methods() UInt64Index._add_logical_methods() @@ -391,5 +393,6 @@ def isin(self, values, level=None): return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) + Float64Index._add_numeric_methods() Float64Index._add_logical_methods_disabled() diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 7a7902b503bd6..cc78361f843bf 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -652,5 +652,6 @@ def _evaluate_numeric_binop(self, other): reversed=True, step=operator.div) + RangeIndex._add_numeric_methods() RangeIndex._add_logical_methods() diff --git a/pandas/io/common.py b/pandas/io/common.py index c2e4301a23731..0630311620e20 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -82,6 +82,7 @@ class ParserError(ValueError): """ pass + # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -123,6 +124,7 @@ def __iter__(self): def __next__(self): raise AbstractMethodError(self) + if not compat.PY3: BaseIterator.next = lambda self: self.__next__() diff --git a/pandas/io/excel.py b/pandas/io/excel.py index f34ba65cf7b51..2821983213646 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -886,12 +886,14 @@ def _convert_to_style(cls, style_dict): return xls_style + register_writer(_Openpyxl1Writer) class _OpenpyxlWriter(_Openpyxl1Writer): engine = 'openpyxl' + register_writer(_OpenpyxlWriter) @@ -1368,6 +1370,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for k, v in style_kwargs.items(): setattr(xcell, k, v) + register_writer(_Openpyxl22Writer) @@ -1491,6 +1494,7 @@ def _convert_to_style(cls, style_dict, num_format_str=None): return style + register_writer(_XlwtWriter) @@ -1603,4 +1607,5 @@ def _convert_to_style(self, style_dict, num_format_str=None): return xl_format + register_writer(_XlsxWriter) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 0ffb6b4bf8c05..a5558866937cf 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -58,6 +58,7 @@ def _test_google_api_imports(): raise ImportError("Missing module required for Google BigQuery " "support: {0}".format(str(e))) + logger = logging.getLogger('pandas.io.gbq') logger.setLevel(logging.ERROR) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index ab44e46c96b77..3f4be6ad459d8 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -217,6 +217,7 @@ def read(fh): raise ValueError('path_or_buf needs to be a string file path or file-like') + dtype_dict = {21: np.dtype('M8[ns]'), u('datetime64[ns]'): np.dtype('M8[ns]'), u('datetime64[us]'): np.dtype('M8[us]'), @@ -237,6 +238,7 @@ def dtype_for(t): return dtype_dict[t] return np.typeDict.get(t, t) + c2f_dict = {'complex': np.float64, 'complex128': np.float64, 'complex64': np.float32} diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8905dfa315c4..88d0c6c12c04f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -409,6 +409,7 @@ def _read(filepath_or_buffer, kwds): return data + _parser_defaults = { 'delimiter': None, @@ -655,6 +656,7 @@ def parser_f(filepath_or_buffer, return parser_f + read_csv = _make_parser_function('read_csv', sep=',') 
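# What flake8 3.3's new E305 check enforces, in miniature (the hunks in
# this patch all add the second blank line): top-level code that follows a
# class or function definition must be preceded by two blank lines.
def make_table():
    return {}


table = make_table()  # needs two blank lines above, else flake8 flags E305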
read_csv = Appender(_read_csv_doc)(read_csv) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 31895c2ac54c3..e962cff9913eb 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -74,6 +74,7 @@ def _ensure_encoding(encoding): encoding = _default_encoding return encoding + Term = Expr @@ -112,6 +113,7 @@ class ClosedFileError(Exception): class IncompatibilityWarning(Warning): pass + incompatibility_doc = """ where criteria is being ignored as this version [%s] is too old (or not-defined), read the file in and write it out to a new file to upgrade (with @@ -122,6 +124,7 @@ class IncompatibilityWarning(Warning): class AttributeConflictWarning(Warning): pass + attribute_conflict_doc = """ the [%s] attribute of the existing index is [%s] which conflicts with the new [%s], resetting the attribute to None @@ -131,6 +134,7 @@ class AttributeConflictWarning(Warning): class DuplicateWarning(Warning): pass + duplicate_doc = """ duplicate entries in table, taking most recently appended """ diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9fa01c413aca8..55e145b493dd9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -495,6 +495,7 @@ def has_table(table_name, con, flavor=None, schema=None): pandas_sql = pandasSQL_builder(con, flavor=flavor, schema=schema) return pandas_sql.has_table(table_name) + table_exists = has_table diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2be7657883e88..1698ade4c0102 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -459,6 +459,7 @@ class PossiblePrecisionLoss(Warning): class ValueLabelTypeMismatch(Warning): pass + value_label_mismatch_doc = """ Stata value labels (pandas categories) must be strings. Column {0} contains non-string labels which will be converted to strings. Please check that the diff --git a/pandas/msgpack/__init__.py b/pandas/msgpack/__init__.py index 33d60a12ef0a3..4d6e241171281 100644 --- a/pandas/msgpack/__init__.py +++ b/pandas/msgpack/__init__.py @@ -41,6 +41,7 @@ def packb(o, **kwargs): """ return Packer(**kwargs).pack(o) + # alias for compatibility to simplejson/marshal/pickle. 
load = unpack loads = unpackb diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 1fc93a967bdbb..61b8434b0ea09 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -863,6 +863,7 @@ def homogenize(series_dict): return output + # use unaccelerated ops for sparse objects ops.add_flex_arithmetic_methods(SparseDataFrame, use_numexpr=False, **ops.frame_flex_funcs) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 2d3a9effe6939..dfdbb3c89814a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -832,6 +832,7 @@ def from_coo(cls, A, dense_index=False): """ return _coo_to_sparse_series(A, dense_index=dense_index) + # overwrite series methods with unaccelerated versions ops.add_special_arithmetic_methods(SparseSeries, use_numexpr=False, **ops.series_special_funcs) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 95b209aee0b0c..914c4c08863a2 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -385,6 +385,7 @@ def ewmstd(arg, com=None, span=None, halflife=None, alpha=None, min_periods=0, bias=bias, func_kw=['bias']) + ewmvol = ewmstd @@ -476,6 +477,7 @@ def f(arg, window, min_periods=None, freq=None, center=False, **kwargs) return f + rolling_max = _rolling_func('max', 'Moving maximum.', how='max') rolling_min = _rolling_func('min', 'Moving minimum.', how='min') rolling_sum = _rolling_func('sum', 'Moving sum.') @@ -683,6 +685,7 @@ def f(arg, min_periods=1, freq=None, **kwargs): **kwargs) return f + expanding_max = _expanding_func('max', 'Expanding maximum.') expanding_min = _expanding_func('min', 'Expanding minimum.') expanding_sum = _expanding_func('sum', 'Expanding sum.') diff --git a/pandas/tests/sparse/test_libsparse.py b/pandas/tests/sparse/test_libsparse.py index 4d5a93d77cf14..0435b732911da 100644 --- a/pandas/tests/sparse/test_libsparse.py +++ b/pandas/tests/sparse/test_libsparse.py @@ -560,8 +560,8 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): check_cases(_check_case) -# too cute? oh but how I abhor code duplication +# too cute? 
oh but how I abhor code duplication check_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 28f1dc61533c1..b087ca21d3c25 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1588,6 +1588,7 @@ def test_to_xarray(self): # non-convertible self.assertRaises(ValueError, lambda: result.to_pandas()) + # run all the tests, but wrap each in a warning catcher for t in ['test_rename', 'test_rename_axis', 'test_get_numeric_data', 'test_get_default', 'test_nonzero', diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index e82e702cb6e55..ba53d42fccec7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -53,6 +53,7 @@ def wrapper(*args, **kwargs): return pd.concat(*args, **kwargs) return wrapper + concat = concat_wrap() @@ -66,6 +67,8 @@ def merge(left, right, how='inner', on=None, left_on=None, right_on=None, right_index=right_index, sort=sort, suffixes=suffixes, copy=copy, indicator=indicator) return op.get_result() + + if __debug__: merge.__doc__ = _merge_doc % '\nleft : DataFrame' @@ -264,6 +267,7 @@ def _merger(x, y): result = _merger(left, right) return result + ordered_merge.__doc__ = merge_ordered.__doc__ @@ -1334,6 +1338,7 @@ def _right_outer_join(x, y, max_groups): right_indexer, left_indexer = _join.left_outer_join(y, x, max_groups) return left_indexer, right_indexer + _join_functions = { 'inner': _join.inner_join, 'left': _join.left_outer_join, diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 0b1ced97d2b81..b2050d7d8d81e 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -149,6 +149,7 @@ def _mpl_ge_2_0_0(): except ImportError: return False + if _mpl_ge_1_5_0(): # Compat with mpl 1.5, which uses cycler.
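# The version-gate pattern used by pandas/tools/plotting.py above, sketched
# (a hypothetical minimal helper mirroring _mpl_ge_2_0_0 in the hunk, not
# the exact pandas code):
from distutils.version import LooseVersion

def _mpl_ge_1_5_0():
    try:
        import matplotlib
        return LooseVersion(matplotlib.__version__) >= LooseVersion('1.5')
    except ImportError:
        return False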
import cycler diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e0c602bf5a037..957a934d13f09 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -660,6 +660,7 @@ def get_standard_freq(freq): warnings.warn(msg, FutureWarning, stacklevel=2) return to_offset(freq).rule_code + # --------------------------------------------------------------------- # Period codes @@ -795,6 +796,7 @@ def infer_freq(index, warn=True): inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() + _ONE_MICRO = long(1000) _ONE_MILLI = _ONE_MICRO * 1000 _ONE_SECOND = _ONE_MILLI * 1000 diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 31e40c6bcbb2c..d3d936693c266 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -286,6 +286,7 @@ def _apply_rule(self, dates): dates += offset return dates + holiday_calendars = {} @@ -461,6 +462,7 @@ def merge(self, other, inplace=False): else: return holidays + USMemorialDay = Holiday('MemorialDay', month=5, day=31, offset=DateOffset(weekday=MO(-1))) USLaborDay = Holiday('Labor Day', month=9, day=1, diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 6cbb696783e09..5f00e8b648689 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -138,6 +138,7 @@ def _ensure_datetime64(other): return other raise TypeError('%s type object %s' % (type(other), str(other))) + _midnight = time(0, 0) diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py index 6698c7e924758..22801318a1853 100644 --- a/pandas/tseries/interval.py +++ b/pandas/tseries/interval.py @@ -33,6 +33,3 @@ def __new__(self, starts, ends): def dtype(self): return self.values.dtype - -if __name__ == '__main__': - pass diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 370dd00762896..79227f6de90a5 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1652,6 +1652,7 @@ class WeekDay(object): SAT = 5 SUN = 6 + _int_to_weekday = { WeekDay.MON: 'MON', WeekDay.TUE: 'TUE', @@ -1924,6 +1925,7 @@ def onOffset(self, dt): modMonth = (dt.month - self.startingMonth) % 3 return BMonthEnd().onOffset(dt) and modMonth == 0 + _int_to_month = tslib._MONTH_ALIASES _month_to_int = dict((v, k) for k, v in _int_to_month.items()) @@ -2799,6 +2801,7 @@ def _delta_to_tick(delta): else: # pragma: no cover return Nano(nanos) + _delta_to_nanoseconds = tslib._delta_to_nanoseconds @@ -2931,6 +2934,7 @@ def generate_range(start=None, end=None, periods=None, raise ValueError('Offset %s did not decrement date' % offset) cur = next_date + prefix_mapping = dict((offset._prefix, offset) for offset in [ YearBegin, # 'AS' YearEnd, # 'A' diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 5692d6c5cabde..a6a10c08966d6 100755 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -552,6 +552,8 @@ def var(self, ddof=1, *args, **kwargs): """ nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) + + Resampler._deprecated_valids += dir(Resampler) # downsample methods @@ -969,6 +971,8 @@ def resample(obj, kind=None, **kwds): """ create a TimeGrouper and return our resampler """ tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) + + resample.__doc__ = Resampler.__doc__ diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 9bf39652a4e00..5a5d1533bfa91 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -87,6 +87,7 @@ def to_timedelta(arg, 
unit='ns', box=True, errors='raise'): return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors) + _unit_map = { 'Y': 'Y', 'y': 'Y', diff --git a/pandas/types/generic.py b/pandas/types/generic.py index 756fb47596700..e7b54ccc6f25e 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -57,4 +57,5 @@ class _ABCGeneric(type): def __instancecheck__(cls, inst): return hasattr(inst, "_data") + ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index 7c5148caf7e74..b0f5d3994ed64 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -153,5 +153,6 @@ def main(): return 0 + if __name__ == "__main__": sys.exit(main()) diff --git a/pandas/util/terminal.py b/pandas/util/terminal.py index 6b8428ff75806..dadd09ae74ea4 100644 --- a/pandas/util/terminal.py +++ b/pandas/util/terminal.py @@ -115,6 +115,7 @@ def ioctl_GWINSZ(fd): return None return int(cr[1]), int(cr[0]) + if __name__ == "__main__": sizex, sizey = get_terminal_size() print('width = %s height = %s' % (sizex, sizey)) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 566ceec027b2b..cda386781e2ec 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -74,6 +74,7 @@ def reset_testing_mode(): if 'deprecate' in testing_mode: warnings.simplefilter('ignore', _testing_mode_warnings) + set_testing_mode() @@ -1381,6 +1382,7 @@ def assert_panelnd_equal(left, right, for i, item in enumerate(right._get_axis(0)): assert item in left, "non-matching item (left) '%s'" % item + # TODO: strangely check_names fails in py3 ? _panel_frame_equal = partial(assert_frame_equal, check_names=False) assert_panel_equal = partial(assert_panelnd_equal, @@ -2076,6 +2078,7 @@ def dec(f): return wrapper + # skip tests on exceptions with this message _network_error_messages = ( # 'urlopen error timed out', From a07db74c04d4bd783d5581500f511de01b5f2716 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 13:00:36 -0500 Subject: [PATCH 153/338] DOC: use shared_docs for Index.get_indexer, get_indexer_non_unique (#15411) * STYLE: flake8 upgraded to 3.3 on conda fixes for E305, 2 blank lines after a class definition * DOC: use shared_docs for Index.get_indexer, get_indexer_non_unique fix non-populated doc-strings for some methods in Index (take) --- pandas/indexes/base.py | 41 +++++++++++++++++++++++++++++--------- pandas/indexes/category.py | 40 +++++++------------------------------ pandas/indexes/multi.py | 40 ++++++++++--------------------------- pandas/tseries/period.py | 5 +++++ 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index c483fb0764a4c..e51824e72a2a0 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -65,6 +65,7 @@ _unsortable_types = frozenset(('mixed', 'mixed-integer')) _index_doc_kwargs = dict(klass='Index', inplace='', + target_klass='Index', unique='Index', duplicated='np.ndarray') _index_shared_docs = dict() @@ -1605,7 +1606,7 @@ def _append_same_dtype(self, to_concat, name): numpy.ndarray.take """ - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) @@ -2350,15 +2351,14 @@ def get_level_values(self, level): self._validate_index_level(level) return self - def get_indexer(self, target, method=None, limit=None, tolerance=None): - """ + 
_index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. Parameters ---------- - target : Index + target : %(target_klass)s method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -2387,6 +2387,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): positions matches the corresponding target values. Missing values in the target are marked by -1. """ + + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = _ensure_index(target) if tolerance is not None: @@ -2496,11 +2499,28 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): indexer = np.where(distance <= tolerance, indexer, -1) return indexer + _index_shared_docs['get_indexer_non_unique'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + missing : ndarray of int + An indexer into the target of the values not found. + These correspond to the -1 in the indexer array + """ + + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): - """ return an indexer suitable for taking from a non unique index - return the labels in the same order as the target, and - return a missing indexer into the target (missing are marked as -1 - in the indexer); target must be an iterable """ target = _ensure_index(target) pself, ptarget = self._possibly_promote(target) if pself is not self or ptarget is not target: @@ -2516,7 +2536,10 @@ def get_indexer_non_unique(self, target): return Index(indexer), missing def get_indexer_for(self, target, **kwargs): - """ guaranteed return of an indexer even when non-unique """ + """ + guaranteed return of an indexer even when non-unique + This dispatches to get_indexer or get_indexer_nonunique as appropriate + """ if self.is_unique: return self.get_indexer(target, **kwargs) indexer, _ = self.get_indexer_non_unique(target, **kwargs) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index e2e0fd056b111..acb2758641a62 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -18,6 +18,8 @@ import pandas.core.base as base import pandas.core.missing as missing import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) class CategoricalIndex(Index, base.PandasDelegate): @@ -289,7 +291,7 @@ def _engine(self): def is_unique(self): return not self.duplicated().any() - @Appender(base._shared_docs['unique'] % ibase._index_doc_kwargs) + @Appender(base._shared_docs['unique'] % _index_doc_kwargs) def unique(self): result = base.IndexOpsMixin.unique(self) # CategoricalIndex._shallow_copy uses keeps original categories @@ -299,7 +301,7 @@ def unique(self): @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', 
False: 'first'}) - @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 codes = self.codes.astype('i8') @@ -425,34 +427,8 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): - """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. The mask determines whether labels are - found or not in the current index - - Parameters - ---------- - target : MultiIndex or Index (of tuples) - method : {'pad', 'ffill', 'backfill', 'bfill'} - pad / ffill: propagate LAST valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - - Notes - ----- - This is a low-level method and probably should be used at your own risk - - Examples - -------- - >>> indexer, mask = index.get_indexer(new_index) - >>> new_values = cur_values.take(indexer) - >>> new_values[-mask] = np.nan - - Returns - ------- - (indexer, mask) : (ndarray, ndarray) - """ method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) @@ -472,10 +448,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _ensure_platform_int(indexer) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): - """ this is the same for a CategoricalIndex for get_indexer; the API - returns the missing values as well - """ target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): @@ -497,7 +471,7 @@ def _convert_list_indexer(self, keyarr, kind=None): return None - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 57739548a17d6..18e1da7303d6d 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -43,6 +43,10 @@ _get_na_value, InvalidIndexError, _index_shared_docs) import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(klass='MultiIndex', + target_klass='MultiIndex or list of tuples')) class MultiIndex(Index): @@ -755,7 +759,7 @@ def f(k, stringify): @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) - @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.core.sorting import get_group_index from pandas.hashtable import duplicated_int64 @@ -1244,7 +1248,7 @@ def __getitem__(self, key): names=self.names, sortorder=sortorder, verify_integrity=False) - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) @@ -1564,34 +1568,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): return new_index, indexer + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, 
tolerance=None): - """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. The mask determines whether labels are - found or not in the current index - - Parameters - ---------- - target : MultiIndex or Index (of tuples) - method : {'pad', 'ffill', 'backfill', 'bfill'} - pad / ffill: propagate LAST valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - - Notes - ----- - This is a low-level method and probably should be used at your own risk - - Examples - -------- - >>> indexer, mask = index.get_indexer(new_index) - >>> new_values = cur_values.take(indexer) - >>> new_values[-mask] = np.nan - - Returns - ------- - (indexer, mask) : (ndarray, ndarray) - """ method = missing.clean_reindex_fill_method(method) target = _ensure_index(target) @@ -1633,6 +1611,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _ensure_platform_int(indexer) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + return super(MultiIndex, self).get_indexer_non_unique(target) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 98151d5b6130c..8a6b0c153bb50 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -44,6 +44,10 @@ from pandas.lib import infer_dtype import pandas.tslib as tslib from pandas.compat import zip, u +import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(target_klass='PeriodIndex or list of Periods')) def _field_accessor(name, alias, docstring=None): @@ -759,6 +763,7 @@ def get_value(self, series, key): return com._maybe_box(self, self._engine.get_value(s, key), series, key) + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = _ensure_index(target) From 9f91c151e1b1b2def92805f70fcc4ff91e999d65 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 15:20:52 -0500 Subject: [PATCH 154/338] BLD: use latest conda version with latest miniconda installer on appveyor change 3.6 build to use numpy=1.12 & add back xlwt (was not on defaults for a while) Author: Jeff Reback Closes #15415 from jreback/appveyor and squashes the following commits: 2019f37 [Jeff Reback] force numpy version f82877b [Jeff Reback] remove extra conda list 3ace9f2 [Jeff Reback] CI: use numpy=1.12 on appveyor 6855a7b [Jeff Reback] BLD: use latest conda version with latest miniconda installer on appveyor --- appveyor.yml | 15 ++++++--------- ci/requirements-3.5-64.run | 2 +- ci/requirements-3.6-64.run | 4 ++-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 42c3be13af809..d96e1dfcf76de 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,19 +18,19 @@ environment: matrix: - - CONDA_ROOT: "C:\\Miniconda3.5_64" + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" - CONDA_NPY: "111" + CONDA_NPY: "112" - - CONDA_ROOT: "C:\\Miniconda3.5_64" + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" CONDA_PY: "27" CONDA_NPY: "110" - - CONDA_ROOT: "C:\\Miniconda3.5_64" + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "3.5" PYTHON_ARCH: "64" CONDA_PY: "35" @@ -66,8 +66,7 @@ install: # 
install our build environment - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false - #- cmd: conda update -q conda - - cmd: conda install conda=4.2.15 + - cmd: conda update -q conda - cmd: conda config --set ssl_verify false # add the pandas channel *before* defaults to have defaults take priority @@ -83,7 +82,7 @@ install: - cmd: '%CMD_IN_ENV% conda build ci\appveyor.recipe -q' # create our env - - cmd: conda create -q -n pandas python=%PYTHON_VERSION% nose pytest + - cmd: conda create -q -n pandas python=%PYTHON_VERSION% pytest - cmd: activate pandas - SET REQ=ci\requirements-%PYTHON_VERSION%-%PYTHON_ARCH%.run - cmd: echo "installing requirements from %REQ%" @@ -95,7 +94,5 @@ install: test_script: # tests - cmd: activate pandas - - cmd: conda list - cmd: cd \ - cmd: python -c "import pandas; pandas.test(['--skip-slow', '--skip-network'])" - diff --git a/ci/requirements-3.5-64.run b/ci/requirements-3.5-64.run index 905c2ff3625bd..ad66f578d702a 100644 --- a/ci/requirements-3.5-64.run +++ b/ci/requirements-3.5-64.run @@ -1,6 +1,6 @@ python-dateutil pytz -numpy +numpy=1.11* openpyxl xlsxwriter xlrd diff --git a/ci/requirements-3.6-64.run b/ci/requirements-3.6-64.run index 58ba103504b2c..840d2867e9297 100644 --- a/ci/requirements-3.6-64.run +++ b/ci/requirements-3.6-64.run @@ -1,10 +1,10 @@ python-dateutil pytz -numpy +numpy=1.12* openpyxl xlsxwriter xlrd -#xlwt +xlwt scipy feather-format numexpr From 0c7e906b5580786d13b2daf167357ed2b4d71807 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Feb 2017 09:12:16 -0500 Subject: [PATCH 155/338] TST: convert yield based test_pickle.py to parametrized to remove warnings xref #15341 Author: Jeff Reback Closes #15416 from jreback/warn and squashes the following commits: a6af576 [Jeff Reback] TST: convert yield based test_pickle.py to parametrized to remove warnings xref #15341 --- pandas/tests/io/test_pickle.py | 535 +++++++++++++++++---------------- 1 file changed, 277 insertions(+), 258 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 5445c506b050c..1e3816c1556f6 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -1,6 +1,17 @@ # pylint: disable=E1101,E1103,W0232 -""" manage legacy pickle tests """ +""" +manage legacy pickle tests + +How to add pickle tests: + +1. Install pandas version intended to output the pickle. + +2. Execute "generate_legacy_storage_files.py" to create the pickle. +$ python generate_legacy_storage_files.py pickle + +3. Move the created pickle to "data/legacy_pickle/" directory. 
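+
+Nothing in the test code needs editing for a new version:
+test_pickles() below is parametrized over every version directory
+found under "data/legacy_pickle/", so new files are picked up
+automatically.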
+""" import pytest import os @@ -9,277 +20,285 @@ import pandas as pd from pandas import Index -from pandas.compat import u, is_platform_little_endian +from pandas.compat import is_platform_little_endian import pandas import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd -class TestPickle(): - """ - How to add pickle tests: +@pytest.fixture(scope='module') +def current_pickle_data(): + # our current version pickle data + from pandas.tests.io.generate_legacy_storage_files import ( + create_pickle_data) + return create_pickle_data() + + +# --------------------- +# comparision functions +# --------------------- +def compare_element(result, expected, typ, version=None): + if isinstance(expected, Index): + tm.assert_index_equal(expected, result) + return + + if typ.startswith('sp_'): + comparator = getattr(tm, "assert_%s_equal" % typ) + comparator(result, expected, exact_indices=False) + elif typ == 'timestamp': + if expected is pd.NaT: + assert result is pd.NaT + else: + tm.assert_equal(result, expected) + tm.assert_equal(result.freq, expected.freq) + else: + comparator = getattr(tm, "assert_%s_equal" % + typ, tm.assert_almost_equal) + comparator(result, expected) + + +def compare(data, vf, version): + + # py3 compat when reading py2 pickle + try: + data = pandas.read_pickle(vf) + except (ValueError) as e: + if 'unsupported pickle protocol:' in str(e): + # trying to read a py3 pickle in py2 + return + else: + raise + + m = globals() + for typ, dv in data.items(): + for dt, result in dv.items(): + try: + expected = data[typ][dt] + except (KeyError): + if version in ('0.10.1', '0.11.0') and dt == 'reg': + break + else: + raise + + # use a specific comparator + # if available + comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) + + comparator = m.get(comparator, m['compare_element']) + comparator(result, expected, typ, version) + return data + + +def compare_sp_series_ts(res, exp, typ, version): + # SparseTimeSeries integrated into SparseSeries in 0.12.0 + # and deprecated in 0.17.0 + if version and LooseVersion(version) <= "0.12.0": + tm.assert_sp_series_equal(res, exp, check_series_type=False) + else: + tm.assert_sp_series_equal(res, exp) + + +def compare_series_ts(result, expected, typ, version): + # GH 7748 + tm.assert_series_equal(result, expected) + tm.assert_equal(result.index.freq, expected.index.freq) + tm.assert_equal(result.index.freq.normalize, False) + tm.assert_series_equal(result > 0, expected > 0) + + # GH 9291 + freq = result.index.freq + tm.assert_equal(freq + Day(1), Day(2)) + + res = freq + pandas.Timedelta(hours=1) + tm.assert_equal(isinstance(res, pandas.Timedelta), True) + tm.assert_equal(res, pandas.Timedelta(days=1, hours=1)) + + res = freq + pandas.Timedelta(nanoseconds=1) + tm.assert_equal(isinstance(res, pandas.Timedelta), True) + tm.assert_equal(res, pandas.Timedelta(days=1, nanoseconds=1)) + + +def compare_series_dt_tz(result, expected, typ, version): + # 8260 + # dtype is object < 0.17.0 + if LooseVersion(version) < '0.17.0': + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + else: + tm.assert_series_equal(result, expected) - 1. Install pandas version intended to output the pickle. - 2. Execute "generate_legacy_storage_files.py" to create the pickle. 
- $ python generate_legacy_storage_files.py pickle +def compare_series_cat(result, expected, typ, version): + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_series_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': + tm.assert_series_equal(result, expected, check_categorical=False) + else: + tm.assert_series_equal(result, expected) - 3. Move the created pickle to "data/legacy_pickle/" directory. - NOTE: TestPickle can't be a subclass of tm.Testcase to use test generator. - http://stackoverflow.com/questions/6689537/ - nose-test-generators-inside-class - """ +def compare_frame_dt_mixed_tzs(result, expected, typ, version): + # 8260 + # dtype is object < 0.17.0 + if LooseVersion(version) < '0.17.0': + expected = expected.astype(object) + tm.assert_frame_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) - @classmethod - def setup_class(cls): - from pandas.tests.io.generate_legacy_storage_files import ( - create_pickle_data) - cls.data = create_pickle_data() - cls.path = u('__%s__.pickle' % tm.rands(10)) - def compare_element(self, result, expected, typ, version=None): - if isinstance(expected, Index): - tm.assert_index_equal(expected, result) - return +def compare_frame_cat_onecol(result, expected, typ, version): + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_frame_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': + tm.assert_frame_equal(result, expected, check_categorical=False) + else: + tm.assert_frame_equal(result, expected) - if typ.startswith('sp_'): - comparator = getattr(tm, "assert_%s_equal" % typ) - comparator(result, expected, exact_indices=False) - elif typ == 'timestamp': - if expected is pd.NaT: - assert result is pd.NaT - else: - tm.assert_equal(result, expected) - tm.assert_equal(result.freq, expected.freq) - else: - comparator = getattr(tm, "assert_%s_equal" % - typ, tm.assert_almost_equal) - comparator(result, expected) - - def compare(self, vf, version): - - # py3 compat when reading py2 pickle - try: - data = pandas.read_pickle(vf) - except (ValueError) as e: - if 'unsupported pickle protocol:' in str(e): - # trying to read a py3 pickle in py2 - return - else: - raise - - for typ, dv in data.items(): - for dt, result in dv.items(): - try: - expected = self.data[typ][dt] - except (KeyError): - if version in ('0.10.1', '0.11.0') and dt == 'reg': - break - else: - raise - - # use a specific comparator - # if available - comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) - comparator = getattr(self, comparator, self.compare_element) - comparator(result, expected, typ, version) - return data - - def compare_sp_series_ts(self, res, exp, typ, version): - # SparseTimeSeries integrated into SparseSeries in 0.12.0 - # and deprecated in 0.17.0 - if version and LooseVersion(version) <= "0.12.0": - tm.assert_sp_series_equal(res, exp, check_series_type=False) - else: - tm.assert_sp_series_equal(res, exp) - def compare_series_ts(self, result, expected, typ, version): - # GH 7748 - tm.assert_series_equal(result, expected) - tm.assert_equal(result.index.freq, expected.index.freq) - tm.assert_equal(result.index.freq.normalize, False) - tm.assert_series_equal(result > 0, expected > 0) - - # GH 9291 - freq = result.index.freq - tm.assert_equal(freq + Day(1), Day(2)) - - res = freq + 
pandas.Timedelta(hours=1) - tm.assert_equal(isinstance(res, pandas.Timedelta), True) - tm.assert_equal(res, pandas.Timedelta(days=1, hours=1)) - - res = freq + pandas.Timedelta(nanoseconds=1) - tm.assert_equal(isinstance(res, pandas.Timedelta), True) - tm.assert_equal(res, pandas.Timedelta(days=1, nanoseconds=1)) - - def compare_series_dt_tz(self, result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < '0.17.0': - expected = expected.astype(object) - tm.assert_series_equal(result, expected) - else: - tm.assert_series_equal(result, expected) - - def compare_series_cat(self, result, expected, typ, version): - # Categorical dtype is added in 0.15.0 - # ordered is changed in 0.16.0 - if LooseVersion(version) < '0.15.0': - tm.assert_series_equal(result, expected, check_dtype=False, - check_categorical=False) - elif LooseVersion(version) < '0.16.0': - tm.assert_series_equal(result, expected, check_categorical=False) - else: - tm.assert_series_equal(result, expected) - - def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < '0.17.0': - expected = expected.astype(object) - tm.assert_frame_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) - - def compare_frame_cat_onecol(self, result, expected, typ, version): - # Categorical dtype is added in 0.15.0 - # ordered is changed in 0.16.0 - if LooseVersion(version) < '0.15.0': - tm.assert_frame_equal(result, expected, check_dtype=False, - check_categorical=False) - elif LooseVersion(version) < '0.16.0': - tm.assert_frame_equal(result, expected, check_categorical=False) - else: - tm.assert_frame_equal(result, expected) - - def compare_frame_cat_and_float(self, result, expected, typ, version): - self.compare_frame_cat_onecol(result, expected, typ, version) - - def compare_index_period(self, result, expected, typ, version): - tm.assert_index_equal(result, expected) - tm.assertIsInstance(result.freq, MonthEnd) - tm.assert_equal(result.freq, MonthEnd()) - tm.assert_equal(result.freqstr, 'M') - tm.assert_index_equal(result.shift(2), expected.shift(2)) - - def compare_sp_frame_float(self, result, expected, typ, version): - if LooseVersion(version) <= '0.18.1': - tm.assert_sp_frame_equal(result, expected, exact_indices=False, - check_dtype=False) - else: - tm.assert_sp_frame_equal(result, expected) - - def read_pickles(self, version): - if not is_platform_little_endian(): - pytest.skip("known failure on non-little endian") - - pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version))) - n = 0 - for f in os.listdir(pth): - vf = os.path.join(pth, f) - data = self.compare(vf, version) - - if data is None: - continue - n += 1 - assert n > 0, 'Pickle files are not tested' - - def test_pickles(self): - pickle_path = tm.get_data_path('legacy_pickle') - n = 0 - for v in os.listdir(pickle_path): - pth = os.path.join(pickle_path, v) - if os.path.isdir(pth): - yield self.read_pickles, v - n += 1 - assert n > 0, 'Pickle files are not tested' - - def test_round_trip_current(self): - - try: - import cPickle as c_pickle - - def c_pickler(obj, path): - with open(path, 'wb') as fh: - c_pickle.dump(obj, fh, protocol=-1) - - def c_unpickler(path): - with open(path, 'rb') as fh: - fh.seek(0) - return c_pickle.load(fh) - except: - c_pickler = None - c_unpickler = None - - import pickle as python_pickle - - def python_pickler(obj, path): +def compare_frame_cat_and_float(result, expected, typ, version): + 
compare_frame_cat_onecol(result, expected, typ, version) + + +def compare_index_period(result, expected, typ, version): + tm.assert_index_equal(result, expected) + tm.assertIsInstance(result.freq, MonthEnd) + tm.assert_equal(result.freq, MonthEnd()) + tm.assert_equal(result.freqstr, 'M') + tm.assert_index_equal(result.shift(2), expected.shift(2)) + + +def compare_sp_frame_float(result, expected, typ, version): + if LooseVersion(version) <= '0.18.1': + tm.assert_sp_frame_equal(result, expected, exact_indices=False, + check_dtype=False) + else: + tm.assert_sp_frame_equal(result, expected) + + +# --------------------- +# tests +# --------------------- +def legacy_pickle_versions(): + # yield the pickle versions + pickle_path = tm.get_data_path('legacy_pickle') + for v in os.listdir(pickle_path): + pth = os.path.join(pickle_path, v) + if os.path.isdir(pth): + yield v + + +@pytest.mark.parametrize('version', legacy_pickle_versions()) +def test_pickles(current_pickle_data, version): + if not is_platform_little_endian(): + pytest.skip("known failure on non-little endian") + + pth = tm.get_data_path('legacy_pickle/{0}'.format(version)) + n = 0 + for f in os.listdir(pth): + vf = os.path.join(pth, f) + data = compare(current_pickle_data, vf, version) + + if data is None: + continue + n += 1 + assert n > 0, 'Pickle files are not tested' + + +def test_round_trip_current(current_pickle_data): + + try: + import cPickle as c_pickle + + def c_pickler(obj, path): with open(path, 'wb') as fh: - python_pickle.dump(obj, fh, protocol=-1) + c_pickle.dump(obj, fh, protocol=-1) - def python_unpickler(path): + def c_unpickler(path): with open(path, 'rb') as fh: fh.seek(0) - return python_pickle.load(fh) - - for typ, dv in self.data.items(): - for dt, expected in dv.items(): - - for writer in [pd.to_pickle, c_pickler, python_pickler]: - if writer is None: - continue - - with tm.ensure_clean(self.path) as path: - - # test writing with each pickler - writer(expected, path) - - # test reading with each unpickler - result = pd.read_pickle(path) - self.compare_element(result, expected, typ) - - if c_unpickler is not None: - result = c_unpickler(path) - self.compare_element(result, expected, typ) - - result = python_unpickler(path) - self.compare_element(result, expected, typ) - - def test_pickle_v0_14_1(self): - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') - # This code was executed once on v0.14.1 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - - def test_pickle_v0_15_2(self): - # ordered -> _ordered - # GH 9347 - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') - # This code was executed once on v0.15.2 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + return 
c_pickle.load(fh) + except: + c_pickler = None + c_unpickler = None + + import pickle as python_pickle + + def python_pickler(obj, path): + with open(path, 'wb') as fh: + python_pickle.dump(obj, fh, protocol=-1) + + def python_unpickler(path): + with open(path, 'rb') as fh: + fh.seek(0) + return python_pickle.load(fh) + + data = current_pickle_data + for typ, dv in data.items(): + for dt, expected in dv.items(): + + for writer in [pd.to_pickle, c_pickler, python_pickler]: + if writer is None: + continue + + with tm.ensure_clean() as path: + + # test writing with each pickler + writer(expected, path) + + # test reading with each unpickler + result = pd.read_pickle(path) + compare_element(result, expected, typ) + + if c_unpickler is not None: + result = c_unpickler(path) + compare_element(result, expected, typ) + + result = python_unpickler(path) + compare_element(result, expected, typ) + + +def test_pickle_v0_14_1(): + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + +def test_pickle_v0_15_2(): + # ordered -> _ordered + # GH 9347 + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) From 0460a72b2512076c7480ecf0c712f11b97fba626 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Thu, 16 Feb 2017 09:13:42 -0500 Subject: [PATCH 156/338] TST: Parametrize simple yield tests xref #15341 Author: Elliott Sales de Andrade Closes #15406 from QuLogic/pytest-simple-yield and squashes the following commits: b002752 [Elliott Sales de Andrade] TST: Set PYTHONHASHSEED so xdist doesn't break. 8368772 [Elliott Sales de Andrade] TST: Use fixtures for engine/parser where possible. c6cd346 [Elliott Sales de Andrade] TST: Parametrize remaining simple yield tests. 47bf1a1 [Elliott Sales de Andrade] TST: Replace ENGINES_PARSERS by parametrize. 
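For reference, the shape of the conversion applied here (and in the
test_pickle.py patch above) is sketched below; check_add, test_add_yield
and test_add are hypothetical names, while _engines and expr._parsers are
the real engine/parser tables these tests iterate over:

    import pytest
    from itertools import product
    import pandas as pd
    import pandas.computation.expr as expr
    from pandas.computation.engines import _engines

    # before: nose-style yield test -- pytest still runs these but flags
    # them as deprecated, which is where the warnings came from
    def check_add(engine, parser):
        assert pd.eval('1 + 1', engine=engine, parser=parser) == 2

    def test_add_yield():
        for engine, parser in product(_engines, expr._parsers):
            yield check_add, engine, parser

    # after: one first-class test per (engine, parser) combination,
    # individually collected, selectable and reportable by pytest
    @pytest.mark.parametrize('engine', _engines)
    @pytest.mark.parametrize('parser', expr._parsers)
    def test_add(engine, parser):
        assert pd.eval('1 + 1', engine=engine, parser=parser) == 2

Where several tests share the same setup, the equivalent is a params
fixture, as done for engine/parser in test_eval.py in this patch.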
--- ci/script_multi.sh | 6 + pandas/tests/computation/test_compat.py | 11 +- pandas/tests/computation/test_eval.py | 233 ++++++------------------ pandas/tests/io/parser/test_network.py | 26 +-- pandas/util/testing.py | 15 +- 5 files changed, 92 insertions(+), 199 deletions(-) diff --git a/ci/script_multi.sh b/ci/script_multi.sh index f5fbcbbc12f83..41f71fd21f63f 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -17,6 +17,12 @@ if [ -n "$LOCALE_OVERRIDE" ]; then python -c "$pycmd" fi +# Workaround for pytest-xdist flaky collection order +# https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +echo PYTHONHASHSEED=$PYTHONHASHSEED + if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." elif [ "$COVERAGE" ]; then diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index 77994ac6d2f53..59bdde83aedd8 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -12,8 +12,6 @@ import pandas.computation.expr as expr from pandas.computation import _MIN_NUMEXPR_VERSION -ENGINES_PARSERS = list(product(_engines, expr._parsers)) - def test_compat(): # test we have compat with our version of nu @@ -30,12 +28,9 @@ def test_compat(): pytest.skip("not testing numexpr version compat") -def test_invalid_numexpr_version(): - for engine, parser in ENGINES_PARSERS: - yield check_invalid_numexpr_version, engine, parser - - -def check_invalid_numexpr_version(engine, parser): +@pytest.mark.parametrize('engine', _engines) +@pytest.mark.parametrize('parser', expr._parsers) +def test_invalid_numexpr_version(engine, parser): def testit(): a, b = 1, 2 res = pd.eval('a + b', engine=engine, parser=parser) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index ada714c8ac52e..b42f79fe5009b 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -20,6 +20,7 @@ from pandas.computation import pytables from pandas.computation.engines import _engines, NumExprClobberingError from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor +from pandas.computation.expressions import _USE_NUMEXPR, _NUMEXPR_INSTALLED from pandas.computation.ops import (_binary_ops_dict, _special_case_arith_ops_syms, _arith_ops_syms, _bool_ops_syms, @@ -38,6 +39,23 @@ _scalar_skip = 'in', 'not in' +@pytest.fixture(params=( + pytest.mark.skipif(engine == 'numexpr' and not _USE_NUMEXPR, + reason='numexpr enabled->{enabled}, ' + 'installed->{installed}'.format( + enabled=_USE_NUMEXPR, + installed=_NUMEXPR_INSTALLED))(engine) + for engine in _engines +)) +def engine(request): + return request.param + + +@pytest.fixture(params=expr._parsers) +def parser(request): + return request.param + + def engine_has_neg_frac(engine): return _engines[engine].has_neg_frac @@ -774,17 +792,17 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): f = lambda *args, **kwargs: np.random.randn() -ENGINES_PARSERS = list(product(_engines, expr._parsers)) - #------------------------------------- # typecasting rules consistency with python # issue #12388 class TestTypeCasting(object): - - def check_binop_typecasting(self, engine, parser, op, dt): - tm.skip_if_no_ne(engine) + @pytest.mark.parametrize('op', ['+', '-', '*', '**', '/']) + # maybe someday... 
numexpr has too many upcasting rules now + # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) + @pytest.mark.parametrize('dt', [np.float32, np.float64]) + def test_binop_typecasting(self, engine, parser, op, dt): df = mkdf(5, 3, data_gen_f=f, dtype=dt) s = 'df {} 3'.format(op) res = pd.eval(s, engine=engine, parser=parser) @@ -798,15 +816,6 @@ def check_binop_typecasting(self, engine, parser, op, dt): assert res.values.dtype == dt assert_frame_equal(res, eval(s)) - def test_binop_typecasting(self): - for engine, parser in ENGINES_PARSERS: - for op in ['+', '-', '*', '**', '/']: - # maybe someday... numexpr has too many upcasting rules now - # for dt in chain(*(np.sctypes[x] for x in ['uint', 'int', - # 'float'])): - for dt in [np.float32, np.float64]: - yield self.check_binop_typecasting, engine, parser, op, dt - #------------------------------------- # basic and complex alignment @@ -826,19 +835,13 @@ class TestAlignment(object): index_types = 'i', 'u', 'dt' lhs_index_types = index_types + ('s',) # 'p' - def check_align_nested_unary_op(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_align_nested_unary_op(self, engine, parser): s = 'df * ~2' df = mkdf(5, 3, data_gen_f=f) res = pd.eval(s, engine=engine, parser=parser) assert_frame_equal(res, df * ~2) - def test_align_nested_unary_op(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_align_nested_unary_op, engine, parser - - def check_basic_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_basic_frame_alignment(self, engine, parser): args = product(self.lhs_index_types, self.index_types, self.index_types) with warnings.catch_warnings(record=True): @@ -856,12 +859,7 @@ def check_basic_frame_alignment(self, engine, parser): res = pd.eval('df + df2', engine=engine, parser=parser) assert_frame_equal(res, df + df2) - def test_basic_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_basic_frame_alignment, engine, parser - - def check_frame_comparison(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_frame_comparison(self, engine, parser): args = product(self.lhs_index_types, repeat=2) for r_idx_type, c_idx_type in args: df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, @@ -874,12 +872,8 @@ def check_frame_comparison(self, engine, parser): res = pd.eval('df < df3', engine=engine, parser=parser) assert_frame_equal(res, df < df3) - def test_frame_comparison(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_frame_comparison, engine, parser - - def check_medium_complex_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) + @slow + def test_medium_complex_frame_alignment(self, engine, parser): args = product(self.lhs_index_types, self.index_types, self.index_types, self.index_types) @@ -899,14 +893,7 @@ def check_medium_complex_frame_alignment(self, engine, parser): engine=engine, parser=parser) assert_frame_equal(res, df + df2 + df3) - @slow - def test_medium_complex_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_medium_complex_frame_alignment, engine, parser - - def check_basic_frame_series_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) - + def test_basic_frame_series_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) @@ -932,13 +919,7 @@ def testit(r_idx_type, c_idx_type, index_name): for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, 
c_idx_type, index_name) - def test_basic_frame_series_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_basic_frame_series_alignment, engine, parser - - def check_basic_series_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) - + def test_basic_series_frame_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) @@ -968,12 +949,7 @@ def testit(r_idx_type, c_idx_type, index_name): for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, c_idx_type, index_name) - def test_basic_series_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_basic_series_frame_alignment, engine, parser - - def check_series_frame_commutativity(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_series_frame_commutativity(self, engine, parser): args = product(self.lhs_index_types, self.index_types, ('+', '*'), ('index', 'columns')) @@ -1000,13 +976,8 @@ def check_series_frame_commutativity(self, engine, parser): if engine == 'numexpr': assert_frame_equal(a, b) - def test_series_frame_commutativity(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_series_frame_commutativity, engine, parser - - def check_complex_series_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) - + @slow + def test_complex_series_frame_alignment(self, engine, parser): import random args = product(self.lhs_index_types, self.index_types, self.index_types, self.index_types) @@ -1050,13 +1021,7 @@ def check_complex_series_frame_alignment(self, engine, parser): tm.assert_equal(res.shape, expected.shape) assert_frame_equal(res, expected) - @slow - def test_complex_series_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_complex_series_frame_alignment, engine, parser - - def check_performance_warning_for_poor_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_performance_warning_for_poor_alignment(self, engine, parser): df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) if engine == 'numexpr': @@ -1098,11 +1063,6 @@ def check_performance_warning_for_poor_alignment(self, engine, parser): "".format(1, 'df', np.log10(s.size - df.shape[1]))) tm.assert_equal(msg, expected) - def test_performance_warning_for_poor_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield (self.check_performance_warning_for_poor_alignment, engine, - parser) - #------------------------------------ # slightly more complex ops @@ -1762,18 +1722,12 @@ def setUpClass(cls): class TestScope(object): - def check_global_scope(self, e, engine, parser): - tm.skip_if_no_ne(engine) + def test_global_scope(self, engine, parser): + e = '_var_s * 2' tm.assert_numpy_array_equal(_var_s * 2, pd.eval(e, engine=engine, parser=parser)) - def test_global_scope(self): - e = '_var_s * 2' - for engine, parser in product(_engines, expr._parsers): - yield self.check_global_scope, e, engine, parser - - def check_no_new_locals(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_no_new_locals(self, engine, parser): x = 1 lcls = locals().copy() pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) @@ -1781,22 +1735,13 @@ def check_no_new_locals(self, engine, parser): lcls2.pop('lcls') tm.assert_equal(lcls, lcls2) - def test_no_new_locals(self): - for engine, parser in product(_engines, expr._parsers): - yield self.check_no_new_locals, engine, parser - - def check_no_new_globals(self, engine, 
parser): - tm.skip_if_no_ne(engine) + def test_no_new_globals(self, engine, parser): x = 1 gbls = globals().copy() pd.eval('x + 1', engine=engine, parser=parser) gbls2 = globals().copy() tm.assert_equal(gbls, gbls2) - def test_no_new_globals(self): - for engine, parser in product(_engines, expr._parsers): - yield self.check_no_new_globals, engine, parser - def test_invalid_engine(): tm.skip_if_no_ne() @@ -1816,7 +1761,9 @@ def test_invalid_parser(): 'pandas': PandasExprVisitor} -def check_disallowed_nodes(engine, parser): +@pytest.mark.parametrize('engine', _parsers) +@pytest.mark.parametrize('parser', _parsers) +def test_disallowed_nodes(engine, parser): tm.skip_if_no_ne(engine) VisitorClass = _parsers[parser] uns_ops = VisitorClass.unsupported_nodes @@ -1827,38 +1774,19 @@ def check_disallowed_nodes(engine, parser): getattr(inst, ops)() -def test_disallowed_nodes(): - for engine, visitor in product(_parsers, repeat=2): - yield check_disallowed_nodes, engine, visitor - - -def check_syntax_error_exprs(engine, parser): - tm.skip_if_no_ne(engine) +def test_syntax_error_exprs(engine, parser): e = 's +' with pytest.raises(SyntaxError): pd.eval(e, engine=engine, parser=parser) -def test_syntax_error_exprs(): - for engine, parser in ENGINES_PARSERS: - yield check_syntax_error_exprs, engine, parser - - -def check_name_error_exprs(engine, parser): - tm.skip_if_no_ne(engine) +def test_name_error_exprs(engine, parser): e = 's + t' with tm.assertRaises(NameError): pd.eval(e, engine=engine, parser=parser) -def test_name_error_exprs(): - for engine, parser in ENGINES_PARSERS: - yield check_name_error_exprs, engine, parser - - -def check_invalid_local_variable_reference(engine, parser): - tm.skip_if_no_ne(engine) - +def test_invalid_local_variable_reference(engine, parser): a, b = 1, 2 exprs = 'a + @b', '@a + b', '@a + @b' for expr in exprs: @@ -1870,13 +1798,7 @@ def check_invalid_local_variable_reference(engine, parser): pd.eval(exprs, engine=engine, parser=parser) -def test_invalid_local_variable_reference(): - for engine, parser in ENGINES_PARSERS: - yield check_invalid_local_variable_reference, engine, parser - - -def check_numexpr_builtin_raises(engine, parser): - tm.skip_if_no_ne(engine) +def test_numexpr_builtin_raises(engine, parser): sin, dotted_line = 1, 2 if engine == 'numexpr': with tm.assertRaisesRegexp(NumExprClobberingError, @@ -1887,51 +1809,35 @@ def check_numexpr_builtin_raises(engine, parser): tm.assert_equal(res, sin + dotted_line) -def test_numexpr_builtin_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_numexpr_builtin_raises, engine, parser - - -def check_bad_resolver_raises(engine, parser): - tm.skip_if_no_ne(engine) +def test_bad_resolver_raises(engine, parser): cannot_resolve = 42, 3.0 with tm.assertRaisesRegexp(TypeError, 'Resolver of type .+'): pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, parser=parser) -def test_bad_resolver_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_bad_resolver_raises, engine, parser - - -def check_empty_string_raises(engine, parser): +def test_empty_string_raises(engine, parser): # GH 13139 - tm.skip_if_no_ne(engine) with tm.assertRaisesRegexp(ValueError, 'expr cannot be an empty string'): pd.eval('', engine=engine, parser=parser) -def test_empty_string_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_empty_string_raises, engine, parser - - -def check_more_than_one_expression_raises(engine, parser): - tm.skip_if_no_ne(engine) +def test_more_than_one_expression_raises(engine, parser): with 
tm.assertRaisesRegexp(SyntaxError, 'only a single expression is allowed'): pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) -def test_more_than_one_expression_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_more_than_one_expression_raises, engine, parser +@pytest.mark.parametrize('cmp', ('and', 'or')) +@pytest.mark.parametrize('lhs', (int, float)) +@pytest.mark.parametrize('rhs', (int, float)) +def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): + gen = {int: lambda: np.random.randint(10), float: np.random.randn} + mid = gen[lhs]() + lhs = gen[lhs]() + rhs = gen[rhs]() -def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): - tm.skip_if_no_ne(engine) - mid = gen[type(lhs)]() ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) @@ -1940,32 +1846,14 @@ def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): pd.eval(ex, engine=engine, parser=parser) -def test_bool_ops_fails_on_scalars(): - _bool_ops_syms = 'and', 'or' - dtypes = int, float - gen = {int: lambda: np.random.randint(10), float: np.random.randn} - for engine, parser, dtype1, cmp, dtype2 in product(_engines, expr._parsers, - dtypes, _bool_ops_syms, - dtypes): - yield (check_bool_ops_fails_on_scalars, gen, gen[dtype1](), cmp, - gen[dtype2](), engine, parser) - - -def check_inf(engine, parser): - tm.skip_if_no_ne(engine) +def test_inf(engine, parser): s = 'inf + 1' expected = np.inf result = pd.eval(s, engine=engine, parser=parser) tm.assert_equal(result, expected) -def test_inf(): - for engine, parser in ENGINES_PARSERS: - yield check_inf, engine, parser - - -def check_negate_lt_eq_le(engine, parser): - tm.skip_if_no_ne(engine) +def test_negate_lt_eq_le(engine, parser): df = pd.DataFrame([[0, 10], [1, 20]], columns=['cat', 'count']) expected = df[~(df.cat > 0)] @@ -1980,11 +1868,6 @@ def check_negate_lt_eq_le(engine, parser): tm.assert_frame_equal(result, expected) -def test_negate_lt_eq_le(): - for engine, parser in product(_engines, expr._parsers): - yield check_negate_lt_eq_le, engine, parser - - class TestValidate(tm.TestCase): def test_validate_bool_args(self): diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 721d447262149..4d6b6c7daa3c6 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -7,7 +7,6 @@ import os import pytest -from itertools import product import pandas.util.testing as tm from pandas import DataFrame @@ -21,14 +20,18 @@ def salaries_table(): @pytest.mark.parametrize( - "compression,extension", [('gzip', '.gz'), ('bz2', '.bz2'), - ('zip', '.zip'), ('xz', '.xz')]) -def test_compressed_urls(salaries_table, compression, extension): - check_compressed_urls(salaries_table, compression, extension) + "compression,extension", + [('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'), + tm._mark_skipif_no_lzma(('xz', '.xz'))]) +@pytest.mark.parametrize('mode', ['explicit', 'infer']) +@pytest.mark.parametrize('engine', ['python', 'c']) +def test_compressed_urls(salaries_table, compression, extension, mode, engine): + check_compressed_urls(salaries_table, compression, extension, mode, engine) @tm.network -def check_compressed_urls(salaries_table, compression, extension): +def check_compressed_urls(salaries_table, compression, extension, mode, + engine): # test reading compressed urls with various engines and # extension inference base_url = 
('https://github.com/pandas-dev/pandas/raw/master/' @@ -36,14 +39,11 @@ def check_compressed_urls(salaries_table, compression, extension): url = base_url + extension - # args is a (compression, engine) tuple - for (c, engine) in product([compression, 'infer'], ['python', 'c']): + if mode != 'explicit': + compression = mode - if url.endswith('.xz'): - tm._skip_if_no_lzma() - - url_table = read_table(url, compression=c, engine=engine) - tm.assert_frame_equal(url_table, salaries_table) + url_table = read_table(url, compression=compression, engine=engine) + tm.assert_frame_equal(url_table, salaries_table) class TestS3(tm.TestCase): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cda386781e2ec..1bd539469dbe3 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -307,12 +307,21 @@ def _skip_if_scipy_0_17(): pytest.skip("scipy 0.17") -def _skip_if_no_lzma(): +def _check_if_lzma(): try: return compat.import_lzma() except ImportError: - import pytest - pytest.skip('need backports.lzma to run') + return False + + +def _skip_if_no_lzma(): + return _check_if_lzma() or pytest.skip('need backports.lzma to run') + + +_mark_skipif_no_lzma = pytest.mark.skipif( + not _check_if_lzma(), + reason='need backports.lzma to run' +) def _skip_if_no_xarray(): From bd9c1d22571b56f1577267a3078613371e488079 Mon Sep 17 00:00:00 2001 From: Diego Fernandez Date: Thu, 16 Feb 2017 09:21:03 -0500 Subject: [PATCH 157/338] BUG: Ensure the right values are set in SeriesGroupBy.nunique closes #13453 Author: Diego Fernandez Closes #15418 from aiguofer/gh_13453 and squashes the following commits: c53bd70 [Diego Fernandez] Add test for #13453 in test_resample and add note to whatsnew 0daab80 [Diego Fernandez] Ensure the right values are set in SeriesGroupBy.nunique --- doc/source/whatsnew/v0.20.0.txt | 7 ++++--- pandas/core/groupby.py | 2 +- pandas/tests/groupby/test_groupby.py | 13 +++++++++++++ pandas/tests/tseries/test_resample.py | 20 ++++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4708abe4d592e..09551cfc0bcf8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -418,6 +418,7 @@ New Behavior: Other API Changes ^^^^^^^^^^^^^^^^^ +- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`). - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) @@ -428,9 +429,8 @@ Other API Changes - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) -- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. 
Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`). +- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer casted to ``int64`` which also caused precision loss (:issue:`14064`, :issue:`14305`). - Reorganization of timeseries development tests (:issue:`14854`) -- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`). .. _whatsnew_0200.deprecations: @@ -473,7 +473,7 @@ Performance Improvements (or with ``compat_x=True``) (:issue:`15073`). - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) -- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. +- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`) @@ -553,6 +553,7 @@ Bug Fixes - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) +- Bug in ``groupby().nunique()`` with a datetimelike-grouper where bins counts were incorrect (:issue:`13453`) - Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`) - Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 23c835318b0e6..ba2de295fa0a9 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3032,7 +3032,7 @@ def nunique(self, dropna=True): # we might have duplications among the bins if len(res) != len(ri): res, out = np.zeros(len(ri), dtype=out.dtype), res - res[ids] = out + res[ids[idx]] = out return Series(res, index=ri, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d53446870beb1..59cbcab23b9e7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4159,6 +4159,19 @@ def test_nunique_with_empty_series(self): expected = pd.Series(name='name', dtype='int64') tm.assert_series_equal(result, expected) + def test_nunique_with_timegrouper(self): + # GH 13453 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + Timestamp('2016-06-28 16:09:30'), + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}).set_index('time') + result = test.groupby(pd.TimeGrouper(freq='h'))['data'].nunique() + expected = test.groupby( + pd.TimeGrouper(freq='h') + )['data'].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + def test_numpy_compat(self): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py index afb44887fe7d1..45bbc88ef711d 100755 --- a/pandas/tests/tseries/test_resample.py +++ b/pandas/tests/tseries/test_resample.py @@ -1939,6 +1939,26 @@ def test_resample_nunique(self): result = 
df.ID.groupby(pd.Grouper(freq='D')).nunique() assert_series_equal(result, expected) + def test_resample_nunique_with_date_gap(self): + # GH 13453 + index = pd.date_range('1-1-2000', '2-15-2000', freq='h') + index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h') + index3 = index.append(index2) + s = pd.Series(range(len(index3)), index=index3) + r = s.resample('M') + + # Since all elements are unique, these should all be the same + results = [ + r.count(), + r.nunique(), + r.agg(pd.Series.nunique), + r.agg('nunique') + ] + + assert_series_equal(results[0], results[1]) + assert_series_equal(results[0], results[2]) + assert_series_equal(results[0], results[3]) + def test_resample_group_info(self): # GH10914 for n, k in product((10000, 100000), (10, 100, 1000)): dr = date_range(start='2015-08-27', periods=n // 10, freq='T') From 23401c1c96d6d830f969956df050d3a1109afb9d Mon Sep 17 00:00:00 2001 From: abaldenko Date: Thu, 16 Feb 2017 12:39:27 -0500 Subject: [PATCH 158/338] BUG: Concat with inner join and empty DataFrame closes #15328 Author: abaldenko Closes #15397 from abaldenko/concat_empty_dataframe and squashes the following commits: 47c8735 [abaldenko] BUG: Concat with inner join and empty DataFrame fc473b7 [abaldenko] BUG: Concat with inner join and empty DataFrame b86dcb6 [abaldenko] BUG: Concat with inner join and empty DataFrame --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/tools/test_concat.py | 10 ++++++++++ pandas/tests/tools/test_merge.py | 8 ++++++++ pandas/tools/concat.py | 4 +++- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 09551cfc0bcf8..ddb9088035d89 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -576,7 +576,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - +- Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 87a0dda34a525..2a28fccdc9b94 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1825,6 +1825,16 @@ def test_concat_bug_3602(self): result = concat([df1, df2], axis=1) assert_frame_equal(result, expected) + def test_concat_inner_join_empty(self): + # GH 15328 + df_empty = pd.DataFrame() + df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') + df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + + for how, expected in [('inner', df_expected), ('outer', df_a)]: + result = pd.concat([df_a, df_empty], axis=1, join=how) + assert_frame_equal(result, expected) + def test_concat_series_axis1_same_names_ignore_index(self): dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] s1 = Series(randn(len(dates)), index=dates, name='value') diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index 472d8674f9f8d..b3b5e7e29319b 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -52,6 +52,14 @@ def setUp(self): self.right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) + def test_merge_inner_join_empty(self): + # GH 15328 + df_empty = pd.DataFrame() + df_a = pd.DataFrame({'a': [1, 
From 23401c1c96d6d830f969956df050d3a1109afb9d Mon Sep 17 00:00:00 2001
From: abaldenko
Date: Thu, 16 Feb 2017 12:39:27 -0500
Subject: [PATCH 158/338] BUG: Concat with inner join and empty DataFrame

closes #15328

Author: abaldenko

Closes #15397 from abaldenko/concat_empty_dataframe and squashes the following commits:

47c8735 [abaldenko] BUG: Concat with inner join and empty DataFrame
fc473b7 [abaldenko] BUG: Concat with inner join and empty DataFrame
b86dcb6 [abaldenko] BUG: Concat with inner join and empty DataFrame
---
 doc/source/whatsnew/v0.20.0.txt   |  2 +-
 pandas/tests/tools/test_concat.py | 10 ++++++++++
 pandas/tests/tools/test_merge.py  |  8 ++++++++
 pandas/tools/concat.py            |  4 +++-
 4 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 09551cfc0bcf8..ddb9088035d89 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -576,7 +576,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
 - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produced incorrectly formatted files for some locales (:issue:`13856`)
 
-
+- Bug in ``pd.concat()`` in which concatenating with an empty ``DataFrame`` with ``join='inner'`` was being improperly handled (:issue:`15328`)
 
diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py
index 87a0dda34a525..2a28fccdc9b94 100644
--- a/pandas/tests/tools/test_concat.py
+++ b/pandas/tests/tools/test_concat.py
@@ -1825,6 +1825,16 @@ def test_concat_bug_3602(self):
         result = concat([df1, df2], axis=1)
         assert_frame_equal(result, expected)
 
+    def test_concat_inner_join_empty(self):
+        # GH 15328
+        df_empty = pd.DataFrame()
+        df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
+        df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
+
+        for how, expected in [('inner', df_expected), ('outer', df_a)]:
+            result = pd.concat([df_a, df_empty], axis=1, join=how)
+            assert_frame_equal(result, expected)
+
     def test_concat_series_axis1_same_names_ignore_index(self):
         dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
         s1 = Series(randn(len(dates)), index=dates, name='value')
diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py
index 472d8674f9f8d..b3b5e7e29319b 100644
--- a/pandas/tests/tools/test_merge.py
+++ b/pandas/tests/tools/test_merge.py
@@ -52,6 +52,14 @@ def setUp(self):
         self.right = DataFrame({'v2': np.random.randn(4)},
                                index=['d', 'b', 'c', 'a'])
 
+    def test_merge_inner_join_empty(self):
+        # GH 15328
+        df_empty = pd.DataFrame()
+        df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
+        result = pd.merge(df_empty, df_a, left_index=True, right_index=True)
+        expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
+        assert_frame_equal(result, expected)
+
     def test_merge_common(self):
         joined = merge(self.df, self.df2)
         exp = merge(self.df, self.df2, on=['key1', 'key2'])
diff --git a/pandas/tools/concat.py b/pandas/tools/concat.py
index dbbc831b19d1d..31d7a9eb9a01a 100644
--- a/pandas/tools/concat.py
+++ b/pandas/tools/concat.py
@@ -284,7 +284,9 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
                            if sum(obj.shape) > 0 or isinstance(obj, Series)]
 
             if (len(non_empties) and (keys is None and names is None and
-                                      levels is None and join_axes is None)):
+                                      levels is None and
+                                      join_axes is None and
+                                      not self.intersect)):
                 objs = non_empties
                 sample = objs[0]
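
A rough sketch of the fixed behaviour (assuming a build with this change):
an inner concat against an empty frame now yields an empty, correctly typed
result instead of the join semantics being dropped.

    import pandas as pd

    df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
    empty = pd.DataFrame()

    inner = pd.concat([df_a, empty], axis=1, join='inner')   # empty result
    outer = pd.concat([df_a, empty], axis=1, join='outer')   # df_a unchanged
    assert len(inner) == 0
    assert outer.equals(df_a)
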
From 43aba1750a9e90d7585520d3fa1acacc1c7ff0f7 Mon Sep 17 00:00:00 2001
From: Jeff Carey
Date: Thu, 16 Feb 2017 12:45:29 -0500
Subject: [PATCH 159/338] ENH: Added ability to freeze panes from DataFrame.to_excel() (#15160)

closes #15160

Author: Jeff Carey

Closes #15291 from jeffcarey/enh-15160 and squashes the following commits:

cef8fce [Jeff Carey] ENH: Added ability to freeze panes from DataFrame.to_excel()
---
 doc/source/io.rst               | 13 +++++++++++++
 doc/source/whatsnew/v0.20.0.txt |  1 +
 pandas/core/frame.py            | 19 ++++++++++++++++--
 pandas/core/generic.py          |  7 ++++++-
 pandas/io/excel.py              | 34 ++++++++++++++++++++++++++-------
 pandas/tests/io/test_excel.py   | 12 ++++++++++--
 6 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 22eac33a715ba..2d6ddf98437e5 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2777,6 +2777,7 @@ Added support for Openpyxl >= 2.2
 ``'xlsxwriter'`` will produce an Excel 2007-format workbook (xlsx). If
 omitted, an Excel 2007-formatted workbook is produced.
 
+
 .. _io.excel.writers:
 
 Excel writer engines
@@ -2823,6 +2824,18 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
 
    df.to_excel('path_to_file.xlsx', sheet_name='Sheet1')
 
+.. _io.excel.style:
+
+Style and Formatting
+''''''''''''''''''''
+
+The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method.
+
+- ``float_format`` : Format string for floating point numbers (default None)
+- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will
+freeze the first row and first column (default None)
+
+
 .. _io.clipboard:
 
 Clipboard
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ddb9088035d89..75a8752c9bfa4 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -153,6 +153,7 @@ Other enhancements
 - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
 - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
 - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)
+- ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f7c306ea7ce95..3ebdf72a5cde9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1390,7 +1390,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
     def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                  float_format=None, columns=None, header=True, index=True,
                  index_label=None, startrow=0, startcol=0, engine=None,
-                 merge_cells=True, encoding=None, inf_rep='inf', verbose=True):
+                 merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
+                 freeze_panes=None):
         from pandas.io.excel import ExcelWriter
         need_save = False
         if encoding is None:
@@ -1406,12 +1407,26 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                                        index_label=index_label,
                                        merge_cells=merge_cells,
                                        inf_rep=inf_rep)
+
         formatted_cells = formatter.get_formatted_cells()
+        freeze_panes = self._validate_freeze_panes(freeze_panes)
         excel_writer.write_cells(formatted_cells, sheet_name,
-                                 startrow=startrow, startcol=startcol)
+                                 startrow=startrow, startcol=startcol,
+                                 freeze_panes=freeze_panes)
         if need_save:
             excel_writer.save()
 
+    def _validate_freeze_panes(self, freeze_panes):
+        if freeze_panes is not None:
+            if (
+                len(freeze_panes) == 2 and
+                all(isinstance(item, int) for item in freeze_panes)
+            ):
+                return freeze_panes
+
+            raise ValueError("freeze_panes must be of form (row, column)"
+                             " where row and column are integers")
+
     def to_stata(self, fname, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 20e6e027dbf09..204cd91ebfab0 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1033,7 +1033,7 @@ def __setstate__(self, state):
     # I/O Methods
 
     _shared_docs['to_excel'] = """
-    Write %(klass)s to a excel sheet
+    Write %(klass)s to an excel sheet
     %(versionadded_to_excel)s
 
     Parameters
     ----------
@@ -1072,6 +1072,11 @@ def __setstate__(self, state):
     inf_rep : string, default 'inf'
         Representation for infinity (there is no native representation for
         infinity in Excel)
+    freeze_panes : tuple of integer (length 2), default None
+        Specifies the bottommost row and rightmost column that
+        is to be frozen
+
+        .. versionadded:: 0.20.0
 
     Notes
     -----
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
index 2821983213646..37a61b7dc9ab5 100644
--- a/pandas/io/excel.py
+++ b/pandas/io/excel.py
@@ -693,7 +693,8 @@ def engine(self):
         pass
 
     @abc.abstractmethod
-    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+                    freeze_panes=None):
         """
         Write given formatted cells into an Excel sheet
 
@@ -705,6 +706,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
             Name of Excel sheet, if None, then use self.cur_sheet
         startrow: upper left cell row to dump data frame
         startcol: upper left cell column to dump data frame
+        freeze_panes: integer tuple of length 2
+            contains the bottom-most row and right-most column to freeze
         """
         pass
 
@@ -804,7 +807,8 @@ def save(self):
         """
         return self.book.save(self.path)
 
-    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+                    freeze_panes=None):
         # Write the frame cells using openpyxl.
         from openpyxl.cell import get_column_letter
@@ -904,7 +908,8 @@ class _Openpyxl20Writer(_Openpyxl1Writer):
     engine = 'openpyxl20'
     openpyxl_majorver = 2
 
-    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+                    freeze_panes=None):
         # Write the frame cells using openpyxl.
         from openpyxl.cell import get_column_letter
@@ -1311,7 +1316,8 @@ class _Openpyxl22Writer(_Openpyxl20Writer):
     engine = 'openpyxl22'
     openpyxl_majorver = 2
 
-    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+                    freeze_panes=None):
         # Write the frame cells using openpyxl.
         sheet_name = self._get_sheet_name(sheet_name)
@@ -1324,6 +1330,10 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
             wks.title = sheet_name
             self.sheets[sheet_name] = wks
 
+        if freeze_panes is not None:
+            wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1,
+                                        column=freeze_panes[1] + 1)
+
         for cell in cells:
             xcell = wks.cell(
                 row=startrow + cell.row + 1,
@@ -1396,7 +1406,8 @@ def save(self):
         """
         return self.book.save(self.path)
 
-    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+                    freeze_panes=None):
         # Write the frame cells using xlwt.
 
         sheet_name = self._get_sheet_name(sheet_name)
@@ -1407,6 +1418,11 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
             wks = self.book.add_sheet(sheet_name)
             self.sheets[sheet_name] = wks
 
+        if freeze_panes is not None:
+            wks.set_panes_frozen(True)
+            wks.set_horz_split_pos(freeze_panes[0])
+            wks.set_vert_split_pos(freeze_panes[1])
+
         style_dict = {}
 
         for cell in cells:
@@ -1518,11 +1534,12 @@ def save(self):
         """
         Save workbook to disk.
         """
+
         return self.book.close()
 
-    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+                    freeze_panes=None):
         # Write the frame cells using xlsxwriter.
-
         sheet_name = self._get_sheet_name(sheet_name)
 
         if sheet_name in self.sheets:
@@ -1533,6 +1550,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
 
         style_dict = {}
 
+        if freeze_panes is not None:
+            wks.freeze_panes(*(freeze_panes))
+
         for cell in cells:
             val = _conv_value(cell.val)
 
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
index 0c2b443cffe52..b66cb24bf44d8 100644
--- a/pandas/tests/io/test_excel.py
+++ b/pandas/tests/io/test_excel.py
@@ -1836,6 +1836,14 @@ def test_true_and_false_value_options(self):
                                false_values=['bar'])
             tm.assert_frame_equal(read_frame, expected)
 
+    def test_freeze_panes(self):
+        # GH15160
+        expected = DataFrame([[1, 2], [3, 4]], columns=['col1', 'col2'])
+        with ensure_clean(self.ext) as path:
+            expected.to_excel(path, "Sheet1", freeze_panes=(1, 1))
+            result = read_excel(path)
+            tm.assert_frame_equal(expected, result)
+
 
 def raise_wrapper(major_ver):
     def versioned_raise_wrapper(orig_method):
@@ -1873,7 +1881,7 @@ class OpenpyxlTests(ExcelWriterBase, tm.TestCase):
     def test_to_excel_styleconverter(self):
         _skip_if_no_openpyxl()
         if not openpyxl_compat.is_compat(major_ver=1):
-            pytest.skip('incompatiable openpyxl version')
+            pytest.skip('incompatible openpyxl version')
 
         import openpyxl
 
@@ -2095,7 +2103,7 @@ def test_to_excel_styleconverter(self):
 
     def test_write_cells_merge_styled(self):
         if not openpyxl_compat.is_compat(major_ver=2):
-            pytest.skip('incompatiable openpyxl version')
+            pytest.skip('incompatible openpyxl version')
 
         from pandas.formats.format import ExcelCell
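
A minimal sketch of the new keyword (an Excel engine such as openpyxl or
xlsxwriter is assumed to be installed, and the output path is only
illustrative):

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]], columns=['col1', 'col2'])
    # (1, 1) is one-based: freeze the first row and the first column.
    df.to_excel('frozen.xlsx', sheet_name='Sheet1', freeze_panes=(1, 1))

Anything other than a length-2 tuple of integers raises ``ValueError``
through the validation added above.
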
From 3a74d95ad946b332320309206e9ecccfa3fde796 Mon Sep 17 00:00:00 2001
From: Jeff Carey
Date: Fri, 17 Feb 2017 00:17:38 -0800
Subject: [PATCH 160/338] Documents touch-up for DataFrame.to_excel() freeze_panes option (#15436)

---
 doc/source/io.rst      | 4 ++--
 pandas/core/generic.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 2d6ddf98437e5..55ef2c09d43e4 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2832,8 +2832,8 @@ Style and Formatting
 The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method.
 
 - ``float_format`` : Format string for floating point numbers (default None)
-- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will
-freeze the first row and first column (default None)
+- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default None)
+
 
 .. _io.clipboard:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 204cd91ebfab0..26b9a880dd2c7 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1073,7 +1073,7 @@ def __setstate__(self, state):
         Representation for infinity (there is no native representation for
         infinity in Excel)
     freeze_panes : tuple of integer (length 2), default None
-        Specifies the bottommost row and rightmost column that
+        Specifies the one-based bottommost row and rightmost column that
         is to be frozen
 
         .. versionadded:: 0.20.0

From a7ba5e871cae5fd254c13101e528bd6db96b8fb3 Mon Sep 17 00:00:00 2001
From: Peter
Date: Fri, 17 Feb 2017 13:09:20 +0000
Subject: [PATCH 161/338] BUG: to_sql convert index name to string (#15404) (#15423)

* Converted index name to string to fix issue #15404 - BUG: to_sql errors
  with numeric index name - needs conversion to string

* Additional int to string conversion added. Associated test cases added.

* PEP 8 compliance edits

* Removed extraneous brackets
---
 pandas/io/sql.py            |  5 +++--
 pandas/tests/io/test_sql.py | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 55e145b493dd9..bace43e785dff 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -750,7 +750,8 @@ def _get_column_names_and_types(self, dtype_mapper):
             for i, idx_label in enumerate(self.index):
                 idx_type = dtype_mapper(
                     self.frame.index.get_level_values(i))
-                column_names_and_types.append((idx_label, idx_type, True))
+                column_names_and_types.append((text_type(idx_label),
+                                               idx_type, True))
 
         column_names_and_types += [
             (text_type(self.frame.columns[i]),
@@ -1220,7 +1221,7 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None):
 
 def _get_unicode_name(name):
     try:
-        uname = name.encode("utf-8", "strict").decode("utf-8")
+        uname = text_type(name).encode("utf-8", "strict").decode("utf-8")
     except UnicodeError:
         raise ValueError("Cannot convert identifier to UTF-8: '%s'" % name)
     return uname
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 78560611da7aa..890f52e8c65e9 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -709,6 +709,21 @@ def test_to_sql_index_label(self):
         self.assertEqual(frame.columns[0], 'other_label',
                          "Specified index_label not written to database")
 
+        # index name is integer
+        temp_frame.index.name = 0
+        sql.to_sql(temp_frame, 'test_index_label', self.conn,
+                   if_exists='replace')
+        frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+        self.assertEqual(frame.columns[0], '0',
+                         "Integer index label not written to database")
+
+        temp_frame.index.name = None
+        sql.to_sql(temp_frame, 'test_index_label', self.conn,
+                   if_exists='replace', index_label=0)
+        frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+        self.assertEqual(frame.columns[0], '0',
+                         "Integer index label not written to database")
+
     def test_to_sql_index_label_multiindex(self):
         temp_frame = DataFrame({'col1': range(4)},
                                index=MultiIndex.from_product(

From f8adc5d804fa053e3eae662795d988a86b09ed37 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 17 Feb 2017 14:12:01 +0100
Subject: [PATCH 162/338] DOC: add whatsnew for #15423

---
 doc/source/whatsnew/v0.20.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 75a8752c9bfa4..c68af842a4f0c 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -522,7 +522,7 @@ Bug Fixes
 
 - Bug in ``.groupby(...).rolling(...)`` when ``on`` is specified and using a ``DatetimeIndex`` (:issue:`15130`)
 
-
+- Bug in ``to_sql`` when writing a DataFrame with numeric index names (:issue:`15404`).
 
 - Bug in ``Series.iloc`` where a ``Categorical`` object for list-like indexes input was returned, where a ``Series`` was expected. (:issue:`14580`)
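
A sketch of the case this fix covers, using an in-memory SQLite engine
(SQLAlchemy assumed installed; the table name is only illustrative):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('sqlite:///:memory:')
    df = pd.DataFrame({'col1': range(4)})
    df.index.name = 0                     # a numeric index name used to error

    df.to_sql('test_index_label', engine, if_exists='replace')
    back = pd.read_sql_query('SELECT * FROM test_index_label', engine)
    assert back.columns[0] == '0'         # index name written as a string
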
From 68c1d3068220f271ea45d90b456c935e5f6d3ca2 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Feb 2017 09:51:46 -0500
Subject: [PATCH 163/338] TST: remove yielding tests from test_msgpacks.py (#15427)

---
 pandas/tests/io/test_packers.py | 88 ++++++++++++++++++---------------
 pandas/tests/io/test_pickle.py  |  8 +--
 2 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py
index 911cd8164571d..097c03937ca68 100644
--- a/pandas/tests/io/test_packers.py
+++ b/pandas/tests/io/test_packers.py
@@ -41,6 +41,22 @@
     _ZLIB_INSTALLED = True
 
 
+@pytest.fixture(scope='module')
+def current_packers_data():
+    # our current version packers data
+    from pandas.tests.io.generate_legacy_storage_files import (
+        create_msgpack_data)
+    return create_msgpack_data()
+
+
+@pytest.fixture(scope='module')
+def all_packers_data():
+    # all of our current version packers data
+    from pandas.tests.io.generate_legacy_storage_files import (
+        create_data)
+    return create_data()
+
+
 def check_arbitrary(a, b):
 
     if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
@@ -778,7 +794,16 @@ def test_default_encoding(self):
             assert_frame_equal(result, frame)
 
 
-class TestMsgpack():
+def legacy_packers_versions():
+    # yield the packers versions
+    path = tm.get_data_path('legacy_msgpack')
+    for v in os.listdir(path):
+        p = os.path.join(path, v)
+        if os.path.isdir(p):
+            yield v
+
+
+class TestMsgpack(object):
     """
     How to add msgpack tests:
 
@@ -788,48 +813,38 @@ class TestMsgpack():
     $ python generate_legacy_storage_files.py msgpack
 
    3. Move the created pickle to "data/legacy_msgpack/" directory.
-
-    NOTE: TestMsgpack can't be a subclass of tm.Testcase to use test generator.
-    http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class
     """
 
-    @classmethod
-    def setup_class(cls):
-        from pandas.tests.io.generate_legacy_storage_files import (
-            create_msgpack_data, create_data)
-        cls.data = create_msgpack_data()
-        cls.all_data = create_data()
-        cls.path = u('__%s__.msgpack' % tm.rands(10))
-        cls.minimum_structure = {'series': ['float', 'int', 'mixed',
-                                            'ts', 'mi', 'dup'],
-                                 'frame': ['float', 'int', 'mixed', 'mi'],
-                                 'panel': ['float'],
-                                 'index': ['int', 'date', 'period'],
-                                 'mi': ['reg2']}
-
-    def check_min_structure(self, data):
+    minimum_structure = {'series': ['float', 'int', 'mixed',
+                                    'ts', 'mi', 'dup'],
+                         'frame': ['float', 'int', 'mixed', 'mi'],
+                         'panel': ['float'],
+                         'index': ['int', 'date', 'period'],
+                         'mi': ['reg2']}
+
+    def check_min_structure(self, data, version):
         for typ, v in self.minimum_structure.items():
             assert typ in data, '"{0}" not found in unpacked data'.format(typ)
             for kind in v:
                 msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
                 assert kind in data[typ], msg
 
-    def compare(self, vf, version):
+    def compare(self, current_data, all_data, vf, version):
 
         # GH12277 encoding default used to be latin-1, now utf-8
         if LooseVersion(version) < '0.18.0':
             data = read_msgpack(vf, encoding='latin-1')
         else:
             data = read_msgpack(vf)
-        self.check_min_structure(data)
+        self.check_min_structure(data, version)
         for typ, dv in data.items():
-            assert typ in self.all_data, ('unpacked data contains '
-                                          'extra key "{0}"'
-                                          .format(typ))
+            assert typ in all_data, ('unpacked data contains '
+                                     'extra key "{0}"'
+                                     .format(typ))
 
             for dt, result in dv.items():
-                assert dt in self.all_data[typ], ('data["{0}"] contains extra '
-                                                  'key "{1}"'.format(typ, dt))
+                assert dt in current_data[typ], ('data["{0}"] contains extra '
+                                                 'key "{1}"'.format(typ, dt))
                 try:
-                    expected = self.data[typ][dt]
+                    expected = current_data[typ][dt]
                 except KeyError:
                     continue
@@ -862,9 +877,11 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
         else:
             tm.assert_frame_equal(result, expected)
 
-    def read_msgpacks(self, version):
+    @pytest.mark.parametrize('version', legacy_packers_versions())
+    def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
+                             version):
 
-        pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version)))
+        pth = tm.get_data_path('legacy_msgpack/{0}'.format(version))
         n = 0
         for f in os.listdir(pth):
             # GH12142 0.17 files packed in P2 can't be read in P3
@@ -873,19 +890,10 @@ def read_msgpacks(self, version):
                 continue
             vf = os.path.join(pth, f)
             try:
-                self.compare(vf, version)
+                self.compare(current_packers_data, all_packers_data,
+                             vf, version)
             except ImportError:
                 # blosc not installed
                 continue
             n += 1
         assert n > 0, 'Msgpack files are not tested'
-
-    def test_msgpack(self):
-        msgpack_path = tm.get_data_path('legacy_msgpack')
-        n = 0
-        for v in os.listdir(msgpack_path):
-            pth = os.path.join(msgpack_path, v)
-            if os.path.isdir(pth):
-                yield self.read_msgpacks, v
-                n += 1
-        assert n > 0, 'Msgpack files are not tested'
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 1e3816c1556f6..c736ec829808a 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -187,10 +187,10 @@ def compare_sp_frame_float(result, expected, typ, version):
 # ---------------------
 def legacy_pickle_versions():
     # yield the pickle versions
-    pickle_path = tm.get_data_path('legacy_pickle')
-    for v in os.listdir(pickle_path):
-        pth = os.path.join(pickle_path, v)
-        if os.path.isdir(pth):
+    path = tm.get_data_path('legacy_pickle')
+    for v in os.listdir(path):
+        p = os.path.join(path, v)
+        if os.path.isdir(p):
             yield v

From 0630ba8e0c53f66f49399c83f872cfa5c597a984 Mon Sep 17 00:00:00 2001
From: Elliott Sales de Andrade
Date: Fri, 17 Feb 2017 10:07:11 -0500
Subject: [PATCH 164/338] ENH: Don't add rowspan/colspan if it's 1.

Just a small thing I noticed in a [footnote here](https://danluu.com/web-bloat/#appendix-irony).

Probably can't do much about the extra classes, but rowspan/colspan seem like easy fixes to save a few bytes per row/col and it's already done in the other code path.

Author: Elliott Sales de Andrade

Closes #15403 from QuLogic/no-extra-span and squashes the following commits:

9a8fcee [Elliott Sales de Andrade] Don't add rowspan/colspan if it's 1.
---
 doc/source/whatsnew/v0.20.0.txt    |  1 +
 pandas/formats/style.py            | 55 ++++++++++++++++--------------
 pandas/tests/formats/test_style.py | 38 +++++++--------------
 3 files changed, 43 insertions(+), 51 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index c68af842a4f0c..8e48dbbb083e8 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -154,6 +154,7 @@ Other enhancements
 - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
 - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)
 - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
+- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
diff --git a/pandas/formats/style.py b/pandas/formats/style.py
index b3e0f0f6c7462..89712910a22e1 100644
--- a/pandas/formats/style.py
+++ b/pandas/formats/style.py
@@ -251,21 +251,23 @@ def format_attr(pair):
                                "class": " ".join(cs),
                                "is_visible": True})
 
-            for c in range(len(clabels[0])):
+            for c, value in enumerate(clabels[r]):
                 cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c]
                 cs.extend(cell_context.get(
                     "col_headings", {}).get(r, {}).get(c, []))
-                value = clabels[r][c]
-                row_es.append({"type": "th",
-                               "value": value,
-                               "display_value": value,
-                               "class": " ".join(cs),
-                               "is_visible": _is_visible(c, r, col_lengths),
-                               "attributes": [
-                                   format_attr({"key": "colspan",
-                                                "value": col_lengths.get(
-                                                    (r, c), 1)})
-                               ]})
+                es = {
+                    "type": "th",
+                    "value": value,
+                    "display_value": value,
+                    "class": " ".join(cs),
+                    "is_visible": _is_visible(c, r, col_lengths),
+                }
+                colspan = col_lengths.get((r, c), 0)
+                if colspan > 1:
+                    es["attributes"] = [
+                        format_attr({"key": "colspan", "value": colspan})
+                    ]
+                row_es.append(es)
             head.append(row_es)
 
         if self.data.index.names and not all(x is None
@@ -289,19 +291,22 @@ def format_attr(pair):
 
         body = []
         for r, idx in enumerate(self.data.index):
-            # cs.extend(
-            #     cell_context.get("row_headings", {}).get(r, {}).get(c, []))
-            row_es = [{"type": "th",
-                       "is_visible": _is_visible(r, c, idx_lengths),
-                       "attributes": [
-                           format_attr({"key": "rowspan",
-                                        "value": idx_lengths.get((c, r), 1)})
-                       ],
-                       "value": rlabels[r][c],
-                       "class": " ".join([ROW_HEADING_CLASS, "level%s" % c,
-                                          "row%s" % r]),
-                       "display_value": rlabels[r][c]}
-                      for c in range(len(rlabels[r]))]
+            row_es = []
+            for c, value in enumerate(rlabels[r]):
+                es = {
+                    "type": "th",
+                    "is_visible": _is_visible(r, c, idx_lengths),
+                    "value": value,
+                    "display_value": value,
+                    "class": " ".join([ROW_HEADING_CLASS, "level%s" % c,
+                                       "row%s" % r]),
+                }
+                rowspan = idx_lengths.get((c, r), 0)
+                if rowspan > 1:
+                    es["attributes"] = [
+                        format_attr({"key": "rowspan", "value": rowspan})
+                    ]
+                row_es.append(es)
 
             for c, col in enumerate(self.data.columns):
                 cs = [DATA_CLASS, "row%s" % r, "col%s" % c]
diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py
index 53bb3f9010f7e..44af0b8ebb085 100644
--- a/pandas/tests/formats/test_style.py
+++ b/pandas/tests/formats/test_style.py
@@ -141,21 +141,18 @@ def test_empty_index_name_doesnt_display(self):
                      'type': 'th',
                      'value': 'A',
                      'is_visible': True,
-                     'attributes': ["colspan=1"],
                      },
                     {'class': 'col_heading level0 col1',
                      'display_value': 'B',
                      'type': 'th',
                      'value': 'B',
                      'is_visible': True,
-                     'attributes': ["colspan=1"],
                      },
                     {'class': 'col_heading level0 col2',
                      'display_value': 'C',
                      'type': 'th',
                     'value': 'C',
                      'is_visible': True,
-                     'attributes': ["colspan=1"],
                      }]]
 
         self.assertEqual(result['head'], expected)
@@ -168,11 +165,9 @@ def test_index_name(self):
         expected = [[{'class': 'blank level0', 'type': 'th', 'value': '',
                       'display_value': '', 'is_visible': True},
                      {'class': 'col_heading level0 col0', 'type': 'th',
-                      'value': 'B', 'display_value': 'B',
-                      'is_visible': True, 'attributes': ['colspan=1']},
+                      'value': 'B', 'display_value': 'B', 'is_visible': True},
                      {'class': 'col_heading level0 col1', 'type': 'th',
-                      'value': 'C', 'display_value': 'C',
-                      'is_visible': True, 'attributes': ['colspan=1']}],
+                      'value': 'C', 'display_value': 'C', 'is_visible': True}],
                     [{'class': 'index_name level0', 'type': 'th',
                       'value': 'A'},
                      {'class': 'blank', 'type': 'th', 'value': ''},
@@ -191,9 +186,7 @@ def test_multiindex_name(self):
                      {'class': 'blank level0', 'type': 'th', 'value': '',
                       'display_value': '', 'is_visible': True},
                      {'class': 'col_heading level0 col0', 'type': 'th',
-                      'value': 'C', 'display_value': 'C',
-                      'is_visible': True, 'attributes': ['colspan=1'],
-                      }],
+                      'value': 'C', 'display_value': 'C', 'is_visible': True}],
                     [{'class': 'index_name level0', 'type': 'th',
                       'value': 'A'},
                      {'class': 'index_name level1', 'type': 'th',
@@ -618,16 +611,14 @@ def test_mi_sparse(self):
         body_1 = result['body'][0][1]
         expected_1 = {
             "value": 0, "display_value": 0, "is_visible": True,
-            "type": "th", "attributes": ["rowspan=1"],
-            "class": "row_heading level1 row0",
+            "type": "th", "class": "row_heading level1 row0",
         }
         tm.assert_dict_equal(body_1, expected_1)
 
         body_10 = result['body'][1][0]
         expected_10 = {
             "value": 'a', "display_value": 'a', "is_visible": False,
-            "type": "th", "attributes": ["rowspan=1"],
-            "class": "row_heading level0 row1",
+            "type": "th", "class": "row_heading level0 row1",
         }
         tm.assert_dict_equal(body_10, expected_10)
 
@@ -637,9 +628,8 @@ def test_mi_sparse(self):
                      'is_visible': True, "display_value": ''},
                     {'type': 'th', 'class': 'blank level0', 'value': '',
                      'is_visible': True, 'display_value': ''},
-                    {'attributes': ['colspan=1'], 'class': 'col_heading level0 col0',
-                     'is_visible': True, 'type': 'th', 'value': 'A',
-                     'display_value': 'A'}]
+                    {'type': 'th', 'class': 'col_heading level0 col0', 'value': 'A',
+                     'is_visible': True, 'display_value': 'A'}]
         self.assertEqual(head, expected)
 
     def test_mi_sparse_disabled(self):
@@ -650,7 +640,7 @@ def test_mi_sparse_disabled(self):
         result = df.style._translate()
         body = result['body']
         for row in body:
-            self.assertEqual(row[0]['attributes'], ['rowspan=1'])
+            assert 'attributes' not in row[0]
 
     def test_mi_sparse_index_names(self):
         df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays(
@@ -686,28 +676,24 @@ def test_mi_sparse_column_names(self):
                      'type': 'th', 'is_visible': True},
                    {'class': 'index_name level1', 'value': 'col_1',
                      'display_value': 'col_1', 'is_visible': True, 'type': 'th'},
-                    {'attributes': ['colspan=1'],
-                     'class': 'col_heading level1 col0',
+                    {'class': 'col_heading level1 col0',
                      'display_value': 1,
                     'is_visible': True,
                      'type': 'th',
                      'value': 1},
-                    {'attributes': ['colspan=1'],
-                     'class': 'col_heading level1 col1',
+                    {'class': 'col_heading level1 col1',
                     'display_value': 0,
                      'is_visible': True,
                      'type': 'th',
                      'value': 0},
-                    {'attributes': ['colspan=1'],
-                     'class': 'col_heading level1 col2',
+                    {'class': 'col_heading level1 col2',
                     'display_value': 1,
                      'is_visible': True,
                      'type': 'th',
                      'value': 1},
-                    {'attributes': ['colspan=1'],
-                     'class': 'col_heading level1 col3',
+                    {'class': 'col_heading level1 col3',
                     'display_value': 0,
                      'is_visible': True,
                      'type': 'th',
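
A rough illustration of the effect on rendered output (``Styler.render``
is the 0.20-era entry point; the exact markup may differ slightly):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2]},
                      index=pd.MultiIndex.from_arrays([['a', 'a'], [0, 1]]))
    html = df.style.render()
    # Only the 'a' label, which actually spans two rows, should carry a
    # rowspan attribute; single-span headers emit no rowspan/colspan=1.
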
 Transferring R data sets into Python
 ------------------------------------
 
-The ``pandas2ri.ri2py`` function retrieves an R data set and converts it to the
-appropriate pandas object (most likely a DataFrame):
+Once the pandas conversion is activated (``pandas2ri.activate()``), many conversions
+of R to pandas objects will be done automatically. For example, to obtain the 'iris' dataset as a pandas DataFrame:
 
 .. ipython:: python
 
    r.data('iris')
-   df_iris = pandas2ri.ri2py(r['iris'])
-   df_iris.head()
+   r['iris'].head()
 
+If the pandas conversion was not activated, the above could also be accomplished
+by explicitly converting it with the ``pandas2ri.ri2py`` function
+(``pandas2ri.ri2py(r['iris'])``).
 
 Converting DataFrames into R objects
 ------------------------------------
@@ -65,7 +67,6 @@ DataFrames into the equivalent R object (that is, **data.frame**):
    print(type(r_dataframe))
    print(r_dataframe)
 
-
 The DataFrame's index is stored as the ``rownames`` attribute of the
 data.frame instance.

From 815aca785a0748f718d32539b7a3ebba8e6d1f12 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Sat, 18 Feb 2017 04:08:54 -0800
Subject: [PATCH 166/338] BUG: rolling not accepting Timedelta-like window args (#15443)

Remove unnecessary pd.Timedelta
---
 doc/source/whatsnew/v0.20.0.txt |  1 +
 pandas/core/window.py           |  4 +++-
 pandas/tests/test_window.py     | 20 +++++++++++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8e48dbbb083e8..ae4a3d3c3d97f 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -551,6 +551,7 @@ Bug Fixes
 - Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)
 - Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)
 - Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`)
+- Bug in ``.rolling()`` where ``pd.Timedelta`` or ``datetime.timedelta`` was not accepted as a ``window`` argument (:issue:`15440`)
 
 - Bug in ``DataFrame.resample().median()`` if duplicate column names are present (:issue:`14233`)
 
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 50de6b84d7cba..3f9aa2b0ff392 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -10,6 +10,7 @@
 import warnings
 import numpy as np
 from collections import defaultdict
+from datetime import timedelta
 
 from pandas.types.generic import (ABCSeries,
                                   ABCDataFrame,
@@ -1014,7 +1015,8 @@ def validate(self):
 
         # we allow rolling on a datetimelike index
         if (self.is_datetimelike and
-                isinstance(self.window, (compat.string_types, DateOffset))):
+                isinstance(self.window, (compat.string_types, DateOffset,
+                                         timedelta))):
 
             self._validate_monotonic()
             freq = self._validate_freq()
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 1bb1f91423a9d..452e8999ab13f 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -4,7 +4,7 @@
 import warnings
 from warnings import catch_warnings
 
-from datetime import datetime
+from datetime import datetime, timedelta
 from numpy.random import randn
 import numpy as np
 from distutils.version import LooseVersion
@@ -401,6 +401,24 @@ def test_constructor_with_win_type(self):
             with self.assertRaises(ValueError):
                 c(-1, win_type='boxcar')
 
+    def test_constructor_with_timedelta_window(self):
+        # GH 15440
+        n = 10
+        df = pd.DataFrame({'value': np.arange(n)},
+                          index=pd.date_range('2015-12-24',
+                                              periods=n,
+                                              freq="D"))
+        expected_data = np.append([0., 1.], np.arange(3., 27., 3))
+        for window in [timedelta(days=3), pd.Timedelta(days=3)]:
+            result = df.rolling(window=window).sum()
+            expected = pd.DataFrame({'value': expected_data},
+                                    index=pd.date_range('2015-12-24',
+                                                        periods=n,
+                                                        freq="D"))
+            tm.assert_frame_equal(result, expected)
+            expected = df.rolling('3D').sum()
+            tm.assert_frame_equal(result, expected)
+
     def test_numpy_compat(self):
         # see gh-12811
         r = rwindow.Rolling(Series([2, 4, 6]), window=2)
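
A short sketch of the now-accepted window arguments, assuming a
datetime-indexed frame and a build that includes this fix:

    import pandas as pd
    from datetime import timedelta

    df = pd.DataFrame({'value': range(10)},
                      index=pd.date_range('2015-12-24', periods=10, freq='D'))

    r1 = df.rolling(window=timedelta(days=3)).sum()
    r2 = df.rolling(window=pd.Timedelta(days=3)).sum()
    r3 = df.rolling('3D').sum()            # the string form, already supported
    assert r1.equals(r2) and r2.equals(r3)
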
From 4f2ef7ddbf69cd44517c5eb0d65c0fce92490844 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 18 Feb 2017 12:04:48 -0500
Subject: [PATCH 167/338] BUG: testing on windows

- we are passing builds which actually have an error
- fix the small dtype issues

Author: Jeff Reback

Closes #15445 from jreback/windows and squashes the following commits:

a5b7fb3 [Jeff Reback] change integer to power comparisons
eab15c4 [Jeff Reback] don't force remove pandas
cf3b9bd [Jeff Reback] more windows fixing
efe6a76 [Jeff Reback] add cython to build
8194e63 [Jeff Reback] don't use appveyor recipe, just build inplace
e064825 [Jeff Reback] TST: resample dtype issue xref #15418
10d9b26 [Jeff Reback] TST: run windows tests so failures show up in appveyor
---
 appveyor.yml                            |  12 ++++----
 ci/appveyor.recipe/bld.bat              |   2 --
 ci/appveyor.recipe/build.sh             |   2 --
 ci/appveyor.recipe/meta.yaml            |  37 ----------------------
 pandas/tests/indexing/test_timedelta.py |   3 +-
 pandas/tests/test_expressions.py        |  10 +++----
 pandas/tests/tseries/test_resample.py   |   2 +-
 test.bat                                |   3 +-
 8 files changed, 13 insertions(+), 58 deletions(-)
 delete mode 100644 ci/appveyor.recipe/bld.bat
 delete mode 100644 ci/appveyor.recipe/build.sh
 delete mode 100644 ci/appveyor.recipe/meta.yaml

diff --git a/appveyor.yml b/appveyor.yml
index d96e1dfcf76de..1c14698430996 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -78,21 +78,19 @@ install:
   # this is now the downloaded conda...
   - cmd: conda info -a
 
-  # build em using the local source checkout in the correct windows env
-  - cmd: '%CMD_IN_ENV% conda build ci\appveyor.recipe -q'
-
   # create our env
-  - cmd: conda create -q -n pandas python=%PYTHON_VERSION% pytest
+  - cmd: conda create -q -n pandas python=%PYTHON_VERSION% cython pytest
   - cmd: activate pandas
   - SET REQ=ci\requirements-%PYTHON_VERSION%-%PYTHON_ARCH%.run
   - cmd: echo "installing requirements from %REQ%"
   - cmd: conda install -n pandas -q --file=%REQ%
   - cmd: conda list -n pandas
   - cmd: echo "installing requirements from %REQ% - done"
-  - ps: conda install -n pandas (conda build ci\appveyor.recipe -q --output)
+
+  # build em using the local source checkout in the correct windows env
+  - cmd: '%CMD_IN_ENV% python setup.py build_ext --inplace'
 
 test_script:
   # tests
   - cmd: activate pandas
-  - cmd: cd \
-  - cmd: python -c "import pandas; pandas.test(['--skip-slow', '--skip-network'])"
+  - cmd: test.bat
 
diff --git a/ci/appveyor.recipe/bld.bat b/ci/appveyor.recipe/bld.bat
deleted file mode 100644
index 284926fae8c04..0000000000000
--- a/ci/appveyor.recipe/bld.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-%PYTHON% setup.py install
diff --git a/ci/appveyor.recipe/build.sh b/ci/appveyor.recipe/build.sh
deleted file mode 100644
index f341bce6fcf96..0000000000000
--- a/ci/appveyor.recipe/build.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-$PYTHON setup.py install
diff --git a/ci/appveyor.recipe/meta.yaml b/ci/appveyor.recipe/meta.yaml
deleted file mode 100644
index 777fd9d682d48..0000000000000
--- a/ci/appveyor.recipe/meta.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-package:
-  name: pandas
-  version: 0.20.0
-
-build:
-  number: {{environ.get('APPVEYOR_BUILD_NUMBER', 0)}}    # [win]
-  string: np{{ environ.get('CONDA_NPY') }}py{{ environ.get('CONDA_PY') }}_{{ environ.get('APPVEYOR_BUILD_NUMBER', 0) }}    # [win]
-
-source:
-
-  # conda-build needs a full clone
-  # rather than a shallow git_url type clone
-  # https://github.com/conda/conda-build/issues/780
-  path: ../../
-
-requirements:
-  build:
-    - python
-    - cython
-    - numpy x.x
-    - setuptools
-    - pytz
-    - python-dateutil
-
-  run:
-    - python
-    - numpy x.x
-    - python-dateutil
-    - pytz
-
-test:
-  imports:
-    - pandas
-
-about:
-  home: http://pandas.pydata.org
-  license: BSD
diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py
index e5ccd72cac20a..5f0088382ce57 100644
--- a/pandas/tests/indexing/test_timedelta.py
+++ b/pandas/tests/indexing/test_timedelta.py
@@ -13,8 +13,7 @@ def test_boolean_indexing(self):
                          [0, 1, 2, 10, 4, 5, 6, 7, 8, 9],
                          [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]]
         for cond, data in zip(conditions, expected_data):
-            result = df.copy()
-            result.loc[cond, 'x'] = 10
+            result = df.assign(x=df.mask(cond, 10).astype('int64'))
             expected = pd.DataFrame(data,
                                     index=pd.to_timedelta(range(10), unit='s'),
                                     columns=['x'])
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index 3032a288032a2..f669ebe371f9d 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -12,7 +12,7 @@
 
 from pandas.core.api import DataFrame, Panel
 from pandas.computation import expressions as expr
-from pandas import compat, _np_version_under1p12
+from pandas import compat, _np_version_under1p11
 from pandas.util.testing import (assert_almost_equal,
                                  assert_series_equal,
                                  assert_frame_equal,
                                  assert_panel_equal,
                                  assert_panel4d_equal,
                                  slow)
@@ -70,10 +70,10 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False,
             operations.append('div')
 
         for arith in operations:
-            # numpy >= 1.12 doesn't handle integers
+            # numpy >= 1.11 doesn't handle integers
             # raised to integer powers
             # https://github.com/pandas-dev/pandas/issues/15363
-            if arith == 'pow' and not _np_version_under1p12:
+            if arith == 'pow' and not _np_version_under1p11:
                 continue
 
             operator_name = arith
@@ -272,10 +272,10 @@ def testit():
                 for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'),
                                    ('div', '/'), ('pow', '**')]:
-                    # numpy >= 1.12 doesn't handle integers
+                    # numpy >= 1.11 doesn't handle integers
                     # raised to integer powers
                     # https://github.com/pandas-dev/pandas/issues/15363
-                    if op == 'pow' and not _np_version_under1p12:
+                    if op == 'pow' and not _np_version_under1p11:
                         continue
 
                     if op == 'div':
diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py
index 45bbc88ef711d..6e999c5b1d276 100755
--- a/pandas/tests/tseries/test_resample.py
+++ b/pandas/tests/tseries/test_resample.py
@@ -1944,7 +1944,7 @@ def test_resample_nunique_with_date_gap(self):
         index = pd.date_range('1-1-2000', '2-15-2000', freq='h')
         index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h')
         index3 = index.append(index2)
-        s = pd.Series(range(len(index3)), index=index3)
+        s = pd.Series(range(len(index3)), index=index3, dtype='int64')
         r = s.resample('M')
 
         # Since all elements are unique, these should all be the same
diff --git a/test.bat b/test.bat
index 7f9244abb2bc8..2c5f25c24a637 100644
--- a/test.bat
+++ b/test.bat
@@ -1,4 +1,3 @@
 :: test on windows
 
-:: nosetests --exe -A "not slow and not network and not disabled" pandas %*
-pytest pandas
+pytest --skip-slow --skip-network pandas

From 54e9bf3021161c305cb67db0192e29e24c78d986 Mon Sep 17 00:00:00 2001
From: Pietro Battiston
Date: Mon, 20 Feb 2017 09:36:19 -0500
Subject: [PATCH 168/338] BUG: MultiIndex indexing with passed Series/DataFrame/ndarray as indexers

closes #15424
closes #15434

Author: Pietro Battiston

Closes #15425 from toobaz/mi_indexing and squashes the following commits:

2ba2d5d [Pietro Battiston] Updated comment
900e3ce [Pietro Battiston] whatsnew
8467b57 [Pietro Battiston] Tests for previous commit
17209f3 [Pietro Battiston] BUG: support indexing MultiIndex with 1-D array
7606114 [Pietro Battiston] Whatsnew
0b719f5 [Pietro Battiston] Test for previous commit
1f2f385 [Pietro Battiston] BUG: Fix indexing MultiIndex with Series with 0 not index
---
 doc/source/whatsnew/v0.20.0.txt          |  3 ++-
 pandas/core/indexing.py                  | 25 ++++++++++++-----
 pandas/tests/indexing/test_multiindex.py | 34 ++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ae4a3d3c3d97f..9e71b9a11c8eb 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -501,7 +501,8 @@ Bug Fixes
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
 
-- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
+- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`)
+- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`)
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 66510a7708e64..6f490875742ca 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1521,15 +1521,28 @@ def _getitem_axis(self, key, axis=0):
             return self._getbool_axis(key, axis=axis)
         elif is_list_like_indexer(key):
 
-            # GH 7349
-            # possibly convert a list-like into a nested tuple
-            # but don't convert a list-like of tuples
+            # convert various list-like indexers
+            # to a list of keys
+            # we will use the *values* of the object
+            # and NOT the index if its a PandasObject
             if isinstance(labels, MultiIndex):
+
+                if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1:
+                    # Series, or 0,1 ndim ndarray
+                    # GH 14730
+                    key = list(key)
+                elif isinstance(key, ABCDataFrame):
+                    # GH 15438
+                    raise NotImplementedError("Indexing a MultiIndex with a "
+                                              "DataFrame key is not "
+                                              "implemented")
+                elif hasattr(key, 'ndim') and key.ndim > 1:
+                    raise NotImplementedError("Indexing a MultiIndex with a "
+                                              "multidimensional key is not "
+                                              "implemented")
+
                 if (not isinstance(key, tuple) and len(key) > 1 and
                         not isinstance(key[0], tuple)):
-                    if isinstance(key, ABCSeries):
-                        # GH 14730
-                        key = list(key)
                     key = tuple([key])
 
             # an iterable multi-selection
diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py
index b6b9ac93b234c..b40f0b8cd9976 100644
--- a/pandas/tests/indexing/test_multiindex.py
+++ b/pandas/tests/indexing/test_multiindex.py
@@ -158,12 +158,46 @@ def test_loc_getitem_series(self):
         result = x.loc[[1, 3]]
         tm.assert_series_equal(result, expected)
 
+        # GH15424
+        y1 = Series([1, 3], index=[1, 2])
+        result = x.loc[y1]
+        tm.assert_series_equal(result, expected)
+
         empty = Series(data=[], dtype=np.float64)
         expected = Series([], index=MultiIndex(
             levels=index.levels, labels=[[], []], dtype=np.float64))
         result = x.loc[empty]
         tm.assert_series_equal(result, expected)
 
+    def test_loc_getitem_array(self):
+        # GH15434
+        # passing an array as a key with a MultiIndex
+        index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
+        x = Series(index=index, data=range(9), dtype=np.float64)
+        y = np.array([1, 3])
+        expected = Series(
+            data=[0, 1, 2, 6, 7, 8],
+            index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]),
+            dtype=np.float64)
+        result = x.loc[y]
+        tm.assert_series_equal(result, expected)
+
+        # empty array:
+        empty = np.array([])
+        expected = Series([], index=MultiIndex(
+            levels=index.levels, labels=[[], []], dtype=np.float64))
+        result = x.loc[empty]
+        tm.assert_series_equal(result, expected)
+
+        # 0-dim array (scalar):
+        scalar = np.int64(1)
+        expected = Series(
+            data=[0, 1, 2],
+            index=['A', 'B', 'C'],
+            dtype=np.float64)
+        result = x.loc[scalar]
+        tm.assert_series_equal(result, expected)
+
     def test_iloc_getitem_multiindex(self):
         mi_labels = DataFrame(np.random.randn(4, 3),
                               columns=[['i', 'i', 'j'], ['A', 'A', 'B']],
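
A compact sketch of the indexing forms this enables on a Series with a
``MultiIndex`` (assuming a build with this change):

    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
    x = pd.Series(range(9), index=index, dtype=np.float64)

    x.loc[np.array([1, 3])]    # 1-D array: select first-level labels 1 and 3
    x.loc[pd.Series([1, 3])]   # Series: its *values* are used, not its index

Two-dimensional keys, including ``DataFrame`` keys, now raise
``NotImplementedError`` instead of failing obscurely.
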
From 01b112619d59d05852a4b2fa35d85664d4e48968 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 20 Feb 2017 10:13:38 -0500
Subject: [PATCH 169/338] TST: make sure test_fast uses the same seed for launching processes

---
 test_fast.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test_fast.sh b/test_fast.sh
index 43eb376f879cd..30ac7f84cbe8b 100755
--- a/test_fast.sh
+++ b/test_fast.sh
@@ -1 +1,8 @@
+#!/bin/bash
+
+# Workaround for pytest-xdist flaky collection order
+# https://github.com/pytest-dev/pytest/issues/920
+# https://github.com/pytest-dev/pytest/issues/1075
+export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
+
 pytest pandas --skip-slow --skip-network -m "not single" -n 4

From 270fb3e8132883e662287a44ae0db051d6635cf1 Mon Sep 17 00:00:00 2001
From: Brian McFee
Date: Mon, 20 Feb 2017 14:12:28 -0500
Subject: [PATCH 170/338] ENH: Add __copy__ and __deepcopy__ to NDFrame

closes #15370

Author: Brian McFee
Author: Jeff Reback

Closes #15444 from bmcfee/deepcopy-ndframe and squashes the following commits:

bf36f35 [Jeff Reback] TST: skip the panel4d deepcopy tests
d58b1f6 [Brian McFee] added tests for copy and deepcopy
35f3e0f [Brian McFee] relocated Index.__deepcopy__ to live near __copy__
1aea940 [Brian McFee] switched deepcopy test to using generic comparator
7e67e7d [Brian McFee] ndframe and index __copy__ are now proper methods
820664c [Brian McFee] moved deepcopy test to generic.py
9721041 [Brian McFee] added copy/deepcopy to ndframe, fixes #15370
---
 doc/source/whatsnew/v0.20.0.txt |  3 ++-
 pandas/core/generic.py          |  8 ++++++++
 pandas/indexes/base.py          | 13 +++++++------
 pandas/tests/test_generic.py    | 24 ++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9e71b9a11c8eb..40b068547c360 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -433,6 +433,7 @@ Other API Changes
 - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
 - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64`` which also caused precision loss (:issue:`14064`, :issue:`14305`).
 - Reorganization of timeseries development tests (:issue:`14854`)
+- Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`)
 
 .. _whatsnew_0200.deprecations:
 
@@ -500,7 +501,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
-
+- Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`)
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`)
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 26b9a880dd2c7..76fbb9884753d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3161,6 +3161,14 @@ def copy(self, deep=True):
         data = self._data.copy(deep=deep)
         return self._constructor(data).__finalize__(self)
 
+    def __copy__(self, deep=True):
+        return self.copy(deep=deep)
+
+    def __deepcopy__(self, memo=None):
+        if memo is None:
+            memo = {}
+        return self.copy(deep=True)
+
     def _convert(self, datetime=False, numeric=False, timedelta=False,
                  coerce=False, copy=True):
         """
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index e51824e72a2a0..f1f37622b2a74 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -724,7 +724,13 @@ def copy(self, name=None, deep=False, dtype=None, **kwargs):
             new_index = new_index.astype(dtype)
         return new_index
 
-    __copy__ = copy
+    def __copy__(self, **kwargs):
+        return self.copy(**kwargs)
+
+    def __deepcopy__(self, memo=None):
+        if memo is None:
+            memo = {}
+        return self.copy(deep=True)
 
     def _validate_names(self, name=None, names=None, deep=False):
         """
@@ -1480,11 +1486,6 @@ def __setstate__(self, state):
 
     _unpickle_compat = __setstate__
 
-    def __deepcopy__(self, memo=None):
-        if memo is None:
-            memo = {}
-        return self.copy(deep=True)
-
     def __nonzero__(self):
         raise ValueError("The truth value of a {0} is ambiguous. "
                          "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index b087ca21d3c25..40cdbe083acd7 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -2,6 +2,7 @@
 # pylint: disable-msg=E1101,W0612
 
 from operator import methodcaller
+from copy import copy, deepcopy
 import pytest
 import numpy as np
 from numpy import nan
@@ -675,6 +676,18 @@ def test_validate_bool_args(self):
             with self.assertRaises(ValueError):
                 super(DataFrame, df).mask(cond=df.a > 2, inplace=value)
 
+    def test_copy_and_deepcopy(self):
+        # GH 15444
+        for shape in [0, 1, 2]:
+            obj = self._construct(shape)
+            for func in [copy,
+                         deepcopy,
+                         lambda x: x.copy(deep=False),
+                         lambda x: x.copy(deep=True)]:
+                obj_copy = func(obj)
+                self.assertIsNot(obj_copy, obj)
+                self._compare(obj_copy, obj)
+
 
 class TestSeries(tm.TestCase, Generic):
     _typ = Series
@@ -1539,6 +1552,14 @@ def test_to_xarray(self):
                                    expected,
                                    check_index_type=False)
 
+    def test_deepcopy_empty(self):
+        # This test covers empty frame copying with non-empty column sets
+        # as reported in issue GH15370
+        empty_frame = DataFrame(data=[], index=[], columns=['A'])
+        empty_frame_copy = deepcopy(empty_frame)
+
+        self._compare(empty_frame_copy, empty_frame)
+
 
 class TestPanel(tm.TestCase, Generic):
     _typ = Panel
@@ -1569,6 +1590,9 @@ class TestPanel4D(tm.TestCase, Generic):
     def test_sample(self):
         pytest.skip("sample on Panel4D")
 
+    def test_copy_and_deepcopy(self):
+        pytest.skip("copy_and_deepcopy on Panel4D")
+
     def test_to_xarray(self):
         tm._skip_if_no_xarray()
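
A minimal sketch of the protocol this adds (an empty frame with non-empty
columns was the originally reported failure in GH15370):

    import pandas as pd
    from copy import copy, deepcopy

    df = pd.DataFrame(data=[], index=[], columns=['A'])
    shallow = copy(df)
    deep = deepcopy(df)
    assert shallow is not df and deep is not df
    assert deep.equals(df)
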
are still on Travis ATM. They are built/tested simultaneously on CircleCI (as we get 4 containers for open-source). Author: Jeff Reback Closes #15464 from jreback/circle and squashes the following commits: 3756674 [Jeff Reback] CI: add circle ci support --- .travis.yml | 2 +- ci/install_circle.sh | 88 ++++++++++++++++++++++ ci/install_db_circle.sh | 8 ++ ci/{install_db.sh => install_db_travis.sh} | 0 ci/run_circle.sh | 9 +++ ci/show_circle.sh | 8 ++ circle.yml | 35 +++++++++ 7 files changed, 149 insertions(+), 1 deletion(-) create mode 100755 ci/install_circle.sh create mode 100755 ci/install_db_circle.sh rename ci/{install_db.sh => install_db_travis.sh} (100%) create mode 100755 ci/run_circle.sh create mode 100755 ci/show_circle.sh create mode 100644 circle.yml diff --git a/.travis.yml b/.travis.yml index 6245213cec06f..bb96ab210c088 100644 --- a/.travis.yml +++ b/.travis.yml @@ -315,7 +315,7 @@ install: before_script: - source activate pandas && pip install codecov - - ci/install_db.sh + - ci/install_db_travis.sh script: - echo "script start" diff --git a/ci/install_circle.sh b/ci/install_circle.sh new file mode 100755 index 0000000000000..485586e9d4f49 --- /dev/null +++ b/ci/install_circle.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +home_dir=$(pwd) +echo "[home_dir: $home_dir]" + +echo "[ls -ltr]" +ls -ltr + +echo "[Using clean Miniconda install]" +rm -rf "$MINICONDA_DIR" + +# install miniconda +wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 +bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 + +export PATH="$MINICONDA_DIR/bin:$PATH" + +echo "[update conda]" +conda config --set ssl_verify false || exit 1 +conda config --set always_yes true --set changeps1 false || exit 1 +conda update -q conda + +# add the pandas channel to take priority +# to add extra packages +echo "[add channels]" +conda config --add channels pandas || exit 1 +conda config --remove channels defaults || exit 1 +conda config --add channels defaults || exit 1 + +# Useful for debugging any issues with conda +conda info -a || exit 1 + +# support env variables passed +export ENVS_FILE=".envs" + +# make sure that the .envs file exists. 
it is ok if it is empty +touch $ENVS_FILE + +# assume all command line arguments are environmental variables +for var in "$@" +do + echo "export $var" >> $ENVS_FILE +done + +echo "[environmental variable file]" +cat $ENVS_FILE +source $ENVS_FILE + +export REQ_BUILD=ci/requirements-${PYTHON_VERSION}${JOB_TAG}.build +export REQ_RUN=ci/requirements-${PYTHON_VERSION}${JOB_TAG}.run +export REQ_PIP=ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip + +# edit the locale override if needed +if [ -n "$LOCALE_OVERRIDE" ]; then + echo "[Adding locale to the first line of pandas/__init__.py]" + rm -f pandas/__init__.pyc + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sed -i "$sedc" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" + head -4 pandas/__init__.py + echo +fi + +# create new env +echo "[create env]" +time conda create -q -n pandas python=${PYTHON_VERSION} pytest || exit 1 + +source activate pandas + +# build deps +echo "[build installs: ${REQ_BUILD}]" +time conda install -q --file=${REQ_BUILD} || exit 1 + +# build but don't install +echo "[build em]" +time python setup.py build_ext --inplace || exit 1 + +# we may have run installations +echo "[conda installs: ${REQ_RUN}]" +if [ -e ${REQ_RUN} ]; then + time conda install -q --file=${REQ_RUN} || exit 1 +fi + +# we may have additional pip installs +echo "[pip installs: ${REQ_PIP}]" +if [ -e ${REQ_PIP} ]; then + pip install -q -r $REQ_PIP +fi diff --git a/ci/install_db_circle.sh b/ci/install_db_circle.sh new file mode 100755 index 0000000000000..a00f74f009f54 --- /dev/null +++ b/ci/install_db_circle.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo "installing dbs" +mysql -e 'create database pandas_nosetest;' +psql -c 'create database pandas_nosetest;' -U postgres + +echo "done" +exit 0 diff --git a/ci/install_db.sh b/ci/install_db_travis.sh similarity index 100% rename from ci/install_db.sh rename to ci/install_db_travis.sh diff --git a/ci/run_circle.sh b/ci/run_circle.sh new file mode 100755 index 0000000000000..0e46d28ab6fc4 --- /dev/null +++ b/ci/run_circle.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +echo "[running tests]" +export PATH="$MINICONDA_DIR/bin:$PATH" + +source activate pandas + +echo "pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" +pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/show_circle.sh b/ci/show_circle.sh new file mode 100755 index 0000000000000..bfaa65c1d84f2 --- /dev/null +++ b/ci/show_circle.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +echo "[installed versions]" + +export PATH="$MINICONDA_DIR/bin:$PATH" +source activate pandas + +python -c "import pandas; pandas.show_versions();" diff --git a/circle.yml b/circle.yml new file mode 100644 index 0000000000000..97136d368ae6f --- /dev/null +++ b/circle.yml @@ -0,0 +1,35 @@ +machine: + environment: + # these are globally set + MINICONDA_DIR: /home/ubuntu/miniconda3 + +database: + override: + - ./ci/install_db_circle.sh + +checkout: + post: + # since circleci does a shallow fetch + # we need to populate our tags + - git fetch --depth=1000 + - git fetch --tags + +dependencies: + override: + - > + case $CIRCLE_NODE_INDEX in + 0) + sudo apt-get install language-pack-it && ./ci/install_circle.sh PYTHON_VERSION=2.7 JOB_TAG="_COMPAT" LOCALE_OVERRIDE="it_IT.UTF-8" ;; + 1) + sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh PYTHON_VERSION=3.4 JOB_TAG="_SLOW" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; + 2) + sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh PYTHON_VERSION=3.4 
JOB_TAG="" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; + 3) + ./ci/install_circle.sh PYTHON_VERSION=3.5 JOB_TAG="_ASCII" LOCALE_OVERRIDE="C" ;; + esac + - ./ci/show_circle.sh + +test: + override: + - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: + parallel: true From b297503e5c880388cee036f87e43f5a7f343b7bc Mon Sep 17 00:00:00 2001 From: tzinckgraf Date: Tue, 21 Feb 2017 08:29:55 -0500 Subject: [PATCH 172/338] BUG: Bug on reset_index for a MultiIndex of all NaNs closes #6322 Author: tzinckgraf Closes #15466 from tzinckgraf/GH6322 and squashes the following commits: 35f97f4 [tzinckgraf] GH6322, Bug on reset_index for a MultiIndex of all NaNs --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 15 +++++++++++---- pandas/tests/frame/test_alter_axes.py | 27 +++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 40b068547c360..86f916bc0acfb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -575,6 +575,7 @@ Bug Fixes - Incorrect dtyped ``Series`` was returned by comparison methods (e.g., ``lt``, ``gt``, ...) against a constant for an empty ``DataFrame`` (:issue:`15077`) - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) +- Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3ebdf72a5cde9..bfef2cfbd0d51 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2973,10 +2973,17 @@ def _maybe_casted_values(index, labels=None): # if we have the labels, extract the values with a mask if labels is not None: mask = labels == -1 - values = values.take(labels) - if mask.any(): - values, changed = _maybe_upcast_putmask(values, mask, - np.nan) + + # we can have situations where the whole mask is -1, + # meaning there is nothing found in labels, so make all nan's + if mask.all(): + values = np.empty(len(mask)) + values.fill(np.nan) + else: + values = values.take(labels) + if mask.any(): + values, changed = _maybe_upcast_putmask(values, mask, + np.nan) return values new_index = _default_index(len(new_obj)) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e84bb6407fafc..e52bfdbd4f837 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -624,6 +624,33 @@ def test_reset_index_multiindex_col(self): ['a', 'mean', 'median', 'mean']]) assert_frame_equal(rs, xp) + def test_reset_index_multiindex_nan(self): + # GH6322, testing reset_index on MultiIndexes + # when we have a nan or all nan + df = pd.DataFrame({'A': ['a', 'b', 'c'], + 'B': [0, 1, np.nan], + 'C': np.random.rand(3)}) + rs = df.set_index(['A', 'B']).reset_index() + assert_frame_equal(rs, df) + + df = pd.DataFrame({'A': [np.nan, 'b', 'c'], + 'B': [0, 1, 2], + 'C': np.random.rand(3)}) + rs = df.set_index(['A', 'B']).reset_index() + assert_frame_equal(rs, df) + + df = pd.DataFrame({'A': ['a', 'b', 'c'], + 'B': [0, 1, 2], + 'C': [np.nan, 1.1, 2.2]}) + rs = df.set_index(['A', 'B']).reset_index() + assert_frame_equal(rs, df) + + df = 
pd.DataFrame({'A': ['a', 'b', 'c'],
+                           'B': [np.nan, np.nan, np.nan],
+                           'C': np.random.rand(3)})
+        rs = df.set_index(['A', 'B']).reset_index()
+        assert_frame_equal(rs, df)
+
     def test_reset_index_with_datetimeindex_cols(self):
         # GH5818
         #

From 717163bd8664069a74553d95243d1f53ce51f4aa Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Tue, 21 Feb 2017 17:26:03 -0500
Subject: [PATCH 173/338] DOC: Link CONTRIBUTING.md to contributing.rst (#15451)

Previously, we were trying to maintain two different copies of the
documentation, one in the ".github" directory, and the other in the
"docs/," which just imposes greater maintenance burden in the long-run.
We now use the ".github" to refer to different portions of the "docs/"
version with short summaries for ease of navigation.

Closes gh-15349.
---
 .github/CONTRIBUTING.md | 519 ++--------------------------------------
 1 file changed, 14 insertions(+), 505 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 7898822e0e11d..95729f845ff5c 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -1,515 +1,24 @@
 Contributing to pandas
 ======================

-Where to start?
----------------
-
-All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
-
-If you are simply looking to start working with the *pandas* codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out.
-
-Or maybe through using *pandas* you have an idea of your own or are looking for something in the documentation and thinking 'this can be improved'...you can do something about it!
-
-Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
-
-Bug reports and enhancement requests
-------------------------------------
-
-Bug reports are an important part of making *pandas* more stable. Having a complete bug report will allow others to reproduce the bug and provide insight into fixing. Because many versions of *pandas* are supported, knowing version information will also identify improvements made since previous versions. Trying the bug-producing code out on the *master* branch is often a worthwhile exercise to confirm the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed.
-
-Bug reports must:
-
-1. Include a short, self-contained Python snippet reproducing the problem. You can format the code nicely by using [GitHub Flavored Markdown](http://github.github.com/github-flavored-markdown/):
-
-    ```python
-    >>> from pandas import DataFrame
-    >>> df = DataFrame(...)
-    ...
-    ```
-
-2. Include the full version string of *pandas* and its dependencies. In versions of *pandas* after 0.12 you can use a built in function:
-
-        >>> from pandas.util.print_versions import show_versions
-        >>> show_versions()
-
-    and in *pandas* 0.13.1 onwards:
-
-        >>> pd.show_versions()
-
-3. Explain why the current behavior is wrong/not desired and what you expect instead.
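Putting those pieces together, a complete minimal report might look like the following sketch (the failing call here is a hypothetical placeholder, not a real pandas method):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2, 3]})
    >>> df.some_buggy_method()  # hypothetical call that misbehaves
    >>> pd.show_versions()      # paste this output into the report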
-
-The issue will then show up to the *pandas* community and be open to comments/ideas from others.
-
-Working with the code
----------------------
-
-Now that you have an issue you want to fix, enhancement to add, or documentation to improve, you need to learn how to work with GitHub and the *pandas* code base.
-
-### Version control, Git, and GitHub
-
-To the new user, working with Git is one of the more daunting aspects of contributing to *pandas*. It can very quickly become overwhelming, but sticking to the guidelines below will help keep the process straightforward and mostly trouble free. As always, if you are having difficulties please feel free to ask for help.
-
-The code is hosted on [GitHub](https://www.github.com/pandas-dev/pandas). To contribute you will need to sign up for a [free GitHub account](https://github.com/signup/free). We use [Git](http://git-scm.com/) for version control to allow many people to work together on the project.
-
-Some great resources for learning Git:
-
-- the [GitHub help pages](http://help.github.com/).
-- the [NumPy documentation](http://docs.scipy.org/doc/numpy/dev/index.html).
-- Matthew Brett's [Pydagogue](http://matthew-brett.github.com/pydagogue/).
-
-### Getting started with Git
-
-[GitHub has instructions](http://help.github.com/set-up-git-redirect) for installing git, setting up your SSH key, and configuring git. All these steps need to be completed before you can work seamlessly between your local repository and GitHub.
-
-### Forking
-
-You will need your own fork to work on the code. Go to the [pandas project page](https://github.com/pandas-dev/pandas) and hit the `Fork` button. You will want to clone your fork to your machine:
-
-    git clone git@github.com:your-user-name/pandas.git pandas-yourname
-    cd pandas-yourname
-    git remote add upstream git://github.com/pandas-dev/pandas.git
-
-This creates the directory pandas-yourname and connects your repository to the upstream (main project) *pandas* repository.
-
-The testing suite will run automatically on Travis-CI once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then Travis-CI needs to be hooked up to your GitHub repository. Instructions for doing so are [here](http://about.travis-ci.org/docs/user/getting-started/).
-
-### Creating a branch
-
-You want your master branch to reflect only production-ready code, so create a feature branch for making your changes. For example:
-
-    git branch shiny-new-feature
-    git checkout shiny-new-feature
-
-The above can be simplified to:
-
-    git checkout -b shiny-new-feature
-
-This changes your working directory to the shiny-new-feature branch. Keep any changes in this branch specific to one bug or feature so it is clear what the branch brings to *pandas*. You can have many shiny-new-features and switch in between them using the git checkout command.
-
-To update this branch, you need to retrieve the changes from the master branch:
-
-    git fetch upstream
-    git rebase upstream/master
-
-This will replay your commits on top of the latest pandas git master. If this leads to merge conflicts, you must resolve these before submitting your pull request. If you have uncommitted changes, you will need to `stash` them prior to updating. This will effectively store your changes and they can be reapplied after updating.
-
-### Creating a development environment
-
-An easy way to create a *pandas* development environment is as follows.
-
-- Install either Anaconda <install.anaconda> or miniconda <install.miniconda>
-- Make sure that you have cloned the repository <contributing.forking>
-- `cd` to the *pandas* source directory
-
-Tell conda to create a new environment, named `pandas_dev`, or any other name you would like for this environment, by running:
-
-    conda create -n pandas_dev --file ci/requirements_dev.txt
-
-For a python 3 environment:
-
-    conda create -n pandas_dev python=3 --file ci/requirements_dev.txt
-
-If you are on Windows, then you will also need to install the compiler linkages:
-
-    conda install -n pandas_dev libpython
-
-This will create the new environment, and not touch any of your existing environments, nor any existing python installation. It will install all of the basic dependencies of *pandas*, as well as the development and testing tools. If you would like to install other dependencies, you can install them as follows:
-
-    conda install -n pandas_dev -c pandas pytables scipy
-
-To install *all* pandas dependencies you can do the following:
-
-    conda install -n pandas_dev -c pandas --file ci/requirements_all.txt
-
-To work in this environment, Windows users should `activate` it as follows:
-
-    activate pandas_dev
-
-Mac OSX and Linux users should use:
-
-    source activate pandas_dev
-
-You will then see a confirmation message to indicate you are in the new development environment.
-
-To view your environments:
-
-    conda info -e
-
-To return to your home root environment:
-
-    deactivate
-
-See the full conda docs [here](http://conda.pydata.org/docs).
-
-At this point you can easily do an *in-place* install, as detailed in the next section.
-
-### Making changes
-
-Before making your code changes, it is often necessary to build the code that was just checked out. There are two primary methods of doing this.
-
-1. The best way to develop *pandas* is to build the C extensions in-place by running:
-
-        python setup.py build_ext --inplace
-
-    If you start up the Python interpreter in the *pandas* source directory you will call the built C extensions.
-
-2. Another very common option is to do a `develop` install of *pandas*:
-
-        python setup.py develop
-
-    This makes a symbolic link that tells the Python interpreter to import *pandas* from your development directory. Thus, you can always be using the development version on your system without being inside the clone directory.
-
-Contributing to the documentation
----------------------------------
-
-If you're not the developer type, contributing to the documentation is still of huge value. You don't even have to be an expert on *pandas* to do so! Something as simple as rewriting small passages for clarity as you reference the docs is a simple but effective way to contribute. The next person to read that passage will be in your debt!
-
-In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a simple way to ensure it will help the next person.
-
-### About the *pandas* documentation
-
-The documentation is written in **reStructuredText**, which is almost like writing in plain English, and built using [Sphinx](http://sphinx.pocoo.org/). The Sphinx Documentation has an excellent [introduction to reST](http://sphinx.pocoo.org/rest.html). Review the Sphinx docs to perform more complex changes to the documentation as well.
-
-Some other important things to know about the docs:
-
-- The *pandas* documentation consists of two parts: the docstrings in the code itself and the docs in this folder `pandas/doc/`.
-
-    The docstrings provide a clear explanation of the usage of the individual functions, while the documentation in this folder consists of tutorial-like overviews per topic together with some other information (what's new, installation, etc).
-
-- The docstrings follow the **Numpy Docstring Standard**, which is used widely in the Scientific Python community. This standard specifies the format of the different sections of the docstring. See [this document](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt) for a detailed explanation, or look at some of the existing functions to extend it in a similar manner.
-- The tutorials make heavy use of the [ipython directive](http://matplotlib.org/sampledoc/ipython_directive.html) sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example:
-
-        .. ipython:: python
-
-            x = 2
-            x**3
-
-    will be rendered as:
-
-        In [1]: x = 2
-
-        In [2]: x**3
-        Out[2]: 8
-
-    Almost all code examples in the docs are run (and the output saved) during the doc build. This approach means that code examples will always be up to date, but it does make the doc building a bit more complex.
-
-> **note**
->
-> The `.rst` files are used to automatically generate Markdown and HTML versions of the docs. For this reason, please do not edit `CONTRIBUTING.md` directly, but instead make any changes to `doc/source/contributing.rst`. Then, to generate `CONTRIBUTING.md`, use [pandoc](http://johnmacfarlane.net/pandoc/) with the following command:
->
->     pandoc doc/source/contributing.rst -t markdown_github > CONTRIBUTING.md
-
-The utility script `scripts/api_rst_coverage.py` can be used to compare the list of methods documented in `doc/source/api.rst` (which is used to generate the [API Reference](http://pandas.pydata.org/pandas-docs/stable/api.html) page) and the actual public methods. This will identify methods documented in `doc/source/api.rst` that are not actually class methods, and existing methods that are not documented in `doc/source/api.rst`.
-
-### How to build the *pandas* documentation
-
-#### Requirements
-
-To build the *pandas* docs there are some extra requirements: you will need to have `sphinx` and `ipython` installed. [numpydoc](https://github.com/numpy/numpydoc) is used to parse the docstrings that follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of numpydoc is included in the *pandas* source code.
-
-It is easiest to create a development environment <contributing.dev\_env>, then install:
-
-    conda install -n pandas_dev sphinx ipython
-
-Furthermore, it is recommended to have all [optional dependencies](http://pandas.pydata.org/pandas-docs/dev/install.html#optional-dependencies) installed. This is not strictly necessary, but be aware that you will see some error messages when building the docs. This happens because all the code in the documentation is executed during the doc build, and so code examples using optional dependencies will generate errors. Run `pd.show_versions()` to get an overview of the installed version of all dependencies.
-
-> **warning**
->
-> You need to have `sphinx` version 1.2.2 or newer, but older than version 1.3. Versions before 1.1.3 should also work.
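A quick way to check which version you have installed, as a minimal snippet (using the standard `sphinx.__version__` attribute):

    >>> import sphinx
    >>> print(sphinx.__version__)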
-
-#### Building the documentation
-
-So how do you build the docs? Navigate to your local `pandas/doc/` directory in the console and run:
-
-    python make.py html
-
-Then you can find the HTML output in the folder `pandas/doc/build/html/`.
-
-The first time you build the docs, it will take quite a while because it has to run all the code examples and build all the generated docstring pages. In subsequent invocations, sphinx will try to only build the pages that have been modified.
-
-If you want to do a full clean build, do:
-
-    python make.py clean
-    python make.py build
-
-Starting with *pandas* 0.13.1 you can tell `make.py` to compile only a single section of the docs, greatly reducing the turn-around time for checking your changes. You will be prompted to delete `.rst` files that aren't required. This is okay because the prior versions of these files can be checked out from git. However, you must make sure not to commit the file deletions to your Git repository!
-
-    # omit autosummary and API section
-    python make.py clean
-    python make.py --no-api
-
-    # compile the docs with only a single
-    # section, that which is in indexing.rst
-    python make.py clean
-    python make.py --single indexing
-
-For comparison, a full documentation build may take 10 minutes, a `--no-api` build may take 3 minutes and a single section may take 15 seconds. Subsequent builds, which only process portions you have changed, will be faster. Open the following file in a web browser to see the full documentation you just built:
-
-    pandas/docs/build/html/index.html
+Whether you are a novice or experienced software developer, all contributions and suggestions are welcome!

-And you'll have the satisfaction of seeing your new and improved documentation!
+Our main contribution docs can be found [here](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst), but if you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant places in the docs for further information.

-#### Building master branch documentation
-
-When pull requests are merged into the *pandas* `master` branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted [here](http://pandas-docs.github.io/pandas-docs-travis).
-
-Contributing to the code base
------------------------------
-
-### Code standards
-
-*pandas* uses the [PEP8](http://www.python.org/dev/peps/pep-0008/) standard. There are several tools to ensure you abide by this standard.
-
-We've written a tool to check that your commits are PEP8 great, [pip install pep8radius](https://github.com/hayd/pep8radius). Look at PEP8 fixes in your branch vs master with:
-
-    pep8radius master --diff
-
-and make these changes with:
-
-    pep8radius master --diff --in-place
-
-Alternatively, use the [flake8](http://pypi.python.org/pypi/flake8) tool for checking the style of your code. Additional standards are outlined on the [code style wiki page](https://github.com/pandas-dev/pandas/wiki/Code-Style-and-Conventions).
-
-Please try to maintain backward compatibility. *pandas* has lots of users with lots of existing code, so don't break it if at all possible. If you think breakage is required, clearly state why as part of the pull request. Also, be careful when changing method signatures and add deprecation warnings where needed.
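As an illustration, a minimal sketch of how a deprecated keyword is commonly handled with a warning (the method and keyword names here are hypothetical):

    import warnings

    def some_method(self, new_kwarg=None, old_kwarg=None):
        # hypothetical deprecation: warn the user, then fall back
        # to the new keyword name so existing code keeps working
        if old_kwarg is not None:
            warnings.warn("the 'old_kwarg' keyword is deprecated, "
                          "use 'new_kwarg' instead", FutureWarning,
                          stacklevel=2)
            new_kwarg = old_kwarg
        return new_kwarg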
-
-### Test-driven development/code writing
-
-*pandas* is serious about testing and strongly encourages contributors to embrace [test-driven development (TDD)](http://en.wikipedia.org/wiki/Test-driven_development). This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired improvement or new function, then produces the minimum amount of code to pass that test." So, before actually writing any code, you should write your tests. Often the test can be taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests.
-
-Adding tests is one of the most common requests after code is pushed to *pandas*. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue.
-
-Like many packages, *pandas* uses the [Nose testing system](https://nose.readthedocs.io/en/latest/index.html) and the convenient extensions in [numpy.testing](http://docs.scipy.org/doc/numpy/reference/routines.testing.html).
-
-#### Writing tests
-
-All tests should go into the `tests` subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for inspiration. If your test requires working with files or network connectivity, there is more information on the [testing page](https://github.com/pandas-dev/pandas/wiki/Testing) of the wiki.
-
-The `pandas.util.testing` module has many special `assert` functions that make it easier to make statements about whether Series or DataFrame objects are equivalent. The easiest way to verify that your code is correct is to explicitly construct the result you expect, then compare the actual result to the expected correct result:
-
-    def test_pivot(self):
-        data = {
-            'index' : ['A', 'B', 'C', 'C', 'B', 'A'],
-            'columns' : ['One', 'One', 'One', 'Two', 'Two', 'Two'],
-            'values' : [1., 2., 3., 3., 2., 1.]
-        }
-
-        frame = DataFrame(data)
-        pivoted = frame.pivot(index='index', columns='columns', values='values')
-
-        expected = DataFrame({
-            'One' : {'A' : 1., 'B' : 2., 'C' : 3.},
-            'Two' : {'A' : 1., 'B' : 2., 'C' : 3.}
-        })
-
-        assert_frame_equal(pivoted, expected)
-
-#### Running the test suite
-
-The tests can then be run directly inside your Git clone (without having to install *pandas*) by typing:
-
-    nosetests pandas
-
-The test suite is exhaustive and takes around 20 minutes to run. Often it is worth running only a subset of tests around your changes first, before running the entire suite. This is done using one of the following constructs:
-
-    nosetests pandas/tests/[test-module].py
-    nosetests pandas/tests/[test-module].py:[TestClass]
-    nosetests pandas/tests/[test-module].py:[TestClass].[test_method]
-
-#### Running the performance test suite
-
-Performance matters and it is worth considering whether your code has introduced performance regressions. *pandas* is in the process of migrating to the [asv library](https://github.com/spacetelescope/asv) to enable easy monitoring of the performance of critical *pandas* operations. These benchmarks are all found in the `pandas/asv_bench` directory. asv supports both python2 and python3.
-
-> **note**
->
-> The asv benchmark suite was translated from the previous framework, vbench, so many stylistic issues are likely a result of automated transformation of the code.
-
-To use asv you will need either `conda` or `virtualenv`.
For more details please check the [asv installation webpage](https://asv.readthedocs.io/en/latest/installing.html). - -To install asv: - - pip install git+https://github.com/spacetelescope/asv - -If you need to run a benchmark, change your directory to `/asv_bench/` and run the following if you have been developing on `master`: - - asv continuous master - -If you are working on another branch, either of the following can be used: - - asv continuous master HEAD - asv continuous master your_branch - -This will check out the master revision and run the suite on both master and your commit. Running the full test suite can take up to one hour and use up to 3GB of RAM. Usually it is sufficient to paste only a subset of the results into the pull request to show that the committed changes do not cause unexpected performance regressions. - -You can run specific benchmarks using the `-b` flag, which takes a regular expression. For example, this will only run tests from a `pandas/asv_bench/benchmarks/groupby.py` file: - - asv continuous master -b groupby - -If you want to only run a specific group of tests from a file, you can do it using `.` as a separator. For example: - - asv continuous master -b groupby.groupby_agg_builtins1 - -will only run a `groupby_agg_builtins1` test defined in a `groupby` file. - -It can also be useful to run tests in your current environment. You can simply do it by: - - asv dev - -This command is equivalent to: - - asv run --quick --show-stderr --python=same - -This will launch every test only once, display stderr from the benchmarks, and use your local `python` that comes from your `$PATH`. - -Information on how to write a benchmark can be found in the [asv documentation](https://asv.readthedocs.io/en/latest/writing_benchmarks.html). - -#### Running the vbench performance test suite (phasing out) - -Historically, *pandas* used [vbench library](https://github.com/pydata/vbench) to enable easy monitoring of the performance of critical *pandas* operations. These benchmarks are all found in the `pandas/vb_suite` directory. vbench currently only works on python2. - -To install vbench: - - pip install git+https://github.com/pydata/vbench - -Vbench also requires `sqlalchemy`, `gitpython`, and `psutil`, which can all be installed using pip. If you need to run a benchmark, change your directory to the *pandas* root and run: - - ./test_perf.sh -b master -t HEAD - -This will check out the master revision and run the suite on both master and your commit. Running the full test suite can take up to one hour and use up to 3GB of RAM. Usually it is sufficient to paste a subset of the results into the Pull Request to show that the committed changes do not cause unexpected performance regressions. - -You can run specific benchmarks using the `-r` flag, which takes a regular expression. - -See the [performance testing wiki](https://github.com/pandas-dev/pandas/wiki/Performance-Testing) for information on how to write a benchmark. - -### Documenting your code - -Changes should be reflected in the release notes located in `doc/source/whatsnew/vx.y.z.txt`. This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using `` :issue:`1234` `` where 1234 is the issue/pull request number). 
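For example, a typical entry might look like the following (the method name and issue number here are placeholders):

    - Bug in ``DataFrame.some_method`` when called on an empty frame (:issue:`1234`)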
-
-If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section regarding documentation above <contributing.documentation>. Further, to let users know when this feature was added, the `versionadded` directive is used. The sphinx syntax for that is:
-
-``` sourceCode
-.. versionadded:: 0.17.0
-```
-
-This will put the text *New in version 0.17.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method ([example](https://github.com/pandas-dev/pandas/blob/v0.16.2/pandas/core/generic.py#L1959)) or a new keyword argument ([example](https://github.com/pandas-dev/pandas/blob/v0.16.2/pandas/core/frame.py#L1171)).
-
-Contributing your changes to *pandas*
--------------------------------------
-
-### Committing your code
-
-Keep style fixes to a separate commit to make your pull request more readable.
-
-Once you've made changes, you can see them by typing:
-
-    git status
-
-If you have created a new file, it is not being tracked by git. Add it by typing:
-
-    git add path/to/file-to-be-added.py
-
-Doing 'git status' again should give something like:
-
-    # On branch shiny-new-feature
-    #
-    #       modified:   /relative/path/to/file-you-added.py
-    #
-
-Finally, commit your changes to your local repository with an explanatory message. *Pandas* uses a convention for commit message prefixes and layout. Here are some common prefixes along with general guidelines for when to use them:
-
-> - ENH: Enhancement, new functionality
-> - BUG: Bug fix
-> - DOC: Additions/updates to documentation
-> - TST: Additions/updates to tests
-> - BLD: Updates to the build process/scripts
-> - PERF: Performance improvement
-> - CLN: Code cleanup
-
-The following defines how a commit message should be structured. Please reference the relevant GitHub issues in your commit message using GH1234 or \#1234. Either style is fine, but the former is generally preferred:
-
-> - a subject line with < 80 chars.
-> - One blank line.
-> - Optionally, a commit message body.
-
-Now you can commit your changes in your local repository:
-
-    git commit -m "your commit message"
-
-### Combining commits
-
-If you have multiple commits, you may want to combine them into one commit, often referred to as "squashing" or "rebasing". This is a common request by package maintainers when submitting a pull request as it maintains a more compact commit history. To rebase your commits:
-
-    git rebase -i HEAD~#
-
-Where \# is the number of commits you want to combine. Then you can pick the relevant commit message and discard others.
-
-To squash to the master branch do:
-
-    git rebase -i master
-
-Use the `s` option on a commit to `squash`, meaning to keep the commit messages, or `f` to `fixup`, meaning to merge the commit messages.
-
-Then you will need to push the branch (see below) forcefully to replace the current commits with the new ones:
-
-    git push origin shiny-new-feature -f
-
-### Pushing your changes
-
-When you want your changes to appear publicly on your GitHub page, push your forked feature branch's commits:
-
-    git push origin shiny-new-feature
-
-Here `origin` is the default name given to your remote repository on GitHub.
You can see the remote repositories:
-
-    git remote -v
-
-If you added the upstream repository as described above you will see something like:
-
-    origin  git@github.com:yourname/pandas.git (fetch)
-    origin  git@github.com:yourname/pandas.git (push)
-    upstream        git://github.com/pandas-dev/pandas.git (fetch)
-    upstream        git://github.com/pandas-dev/pandas.git (push)
-
-Now your code is on GitHub, but it is not yet a part of the *pandas* project. For that to happen, a pull request needs to be submitted on GitHub.
-
-### Review your code
-
-When you're ready to ask for a code review, file a pull request. Before you do, once again make sure that you have followed all the guidelines outlined in this document regarding code style, tests, performance tests, and documentation. You should also double check your branch changes against the branch it was based on:
-
-1. Navigate to your repository on GitHub
-2. Click on `Branches`
-3. Click on the `Compare` button for your feature branch
-4. Select the `base` and `compare` branches, if necessary. This will be `master` and `shiny-new-feature`, respectively.
-
-### Finally, make the pull request
-
-If everything looks good, you are ready to make a pull request. A pull request is how code from a local repository becomes available to the GitHub community and can be looked at and eventually merged into the master version. This pull request and its associated changes will eventually be committed to the master branch and available in the next release. To submit a pull request:
-
-1. Navigate to your repository on GitHub
-2. Click on the `Pull Request` button
-3. You can then click on `Commits` and `Files Changed` to make sure everything looks okay one last time
-4. Write a description of your changes in the `Preview Discussion` tab
-5. Click `Send Pull Request`.
-
-This request then goes to the repository maintainers, and they will review the code. If you need to make more changes, you can make them in your branch, push them to GitHub, and the pull request will be automatically updated. Pushing them to GitHub again is done by:
-
-    git push -f origin shiny-new-feature
-
-This will automatically update your pull request with the latest code and restart the Travis-CI tests.
-
-### Delete your merged branch (optional)
-
-Once your feature branch is accepted into upstream, you'll probably want to get rid of the branch. First, merge upstream master into your branch so git knows it is safe to delete your branch:
-
-    git fetch upstream
-    git checkout master
-    git merge upstream/master
+Whether you are a novice or experienced software developer, all contributions and suggestions are welcome!

+Getting Started
+---------------
+If you are looking to contribute to the *pandas* codebase, the best place to start is the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues). This is also a great place for filing bug reports and making suggestions for ways in which we can improve the code and documentation.

-Then you can just do:
+If you have additional questions, feel free to ask them on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). Further information can also be found in the [Getting Started](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#where-to-start) section of our main contribution doc.
-    git branch -d shiny-new-feature
+Filing Issues
+-------------
+If you notice a bug in the code or in the docs, or have suggestions for how we can improve either, feel free to create an issue on the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) using [GitHub's "issue" form](https://github.com/pandas-dev/pandas/issues/new). The form contains some questions that will help us best address your issue. For more information regarding how to file issues against *pandas*, please refer to the [Bug reports and enhancement requests](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#bug-reports-and-enhancement-requests) section of our main contribution doc.

-Make sure you use a lower-case `-d`, or else git won't warn you if your feature branch has not actually been merged.
+Contributing to the Codebase
+----------------------------
+The code is hosted on [GitHub](https://www.github.com/pandas-dev/pandas), so you will need to use [Git](http://git-scm.com/) to clone the project and make changes to the codebase. Once you have obtained a copy of the code, you should create a development environment that is separate from your existing Python environment so that you can make and test changes without compromising your own work environment. For more information, please refer to the [Working with the code](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#working-with-the-code) section of our main contribution docs.

-The branch will still exist on GitHub, so to delete it there do:
+Before submitting your changes for review, make sure to check that your changes do not break any tests. More information about our test suites can be found [here](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#test-driven-development-code-writing). We also have guidelines regarding coding style that will be enforced during testing. Details about coding style can be found [here](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#code-standards).

-    git push origin --delete shiny-new-feature
+Once your changes are ready to be submitted, make sure to push your changes to GitHub before creating a pull request. Details about how to do that can be found in the [Contributing your changes to pandas](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#contributing-your-changes-to-pandas) section of our main contribution docs. We will review your changes, and you will most likely be asked to make additional changes before it is finally ready to merge. However, once it's ready, we will merge it, and you will have successfully contributed to the codebase!
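For example, a minimal test in the pandas style constructs the expected result explicitly and compares it to the actual one (a sketch only; the test name and the operation under test are illustrative):

    import pandas as pd
    import pandas.util.testing as tm

    def test_transpose_roundtrip():
        # build the input frame explicitly
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

        # transposing twice should give back the original frame
        result = df.T.T

        tm.assert_frame_equal(result, df)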
From 53c09a378e24719666909a3aeaf5460f9c612ebb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 22 Feb 2017 10:01:48 -0500 Subject: [PATCH 174/338] TST: remove 4 builds from travis that are on circleci (#15465) --- .travis.yml | 109 ---------------------------------------------------- circle.yml | 5 ++- 2 files changed, 4 insertions(+), 110 deletions(-) diff --git a/.travis.yml b/.travis.yml index bb96ab210c088..97bf881f3b6fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -91,21 +91,6 @@ matrix: packages: - libatlas-base-dev - gfortran -# In allow_failures - - python: 2.7 - env: - - PYTHON_VERSION=2.7 - - JOB_NAME: "27_nslow_nnet_COMPAT" - - TEST_ARGS="--skip-slow --skip-network" - - LOCALE_OVERRIDE="it_IT.UTF-8" - - INSTALL_TEST=true - - JOB_TAG=_COMPAT - - CACHE_NAME="27_nslow_nnet_COMPAT" - - USE_CACHE=true - addons: - apt: - packages: - - language-pack-it # In allow_failures - python: 2.7 env: @@ -127,37 +112,6 @@ matrix: - BUILD_TEST=true - CACHE_NAME="27_build_test_conda" - USE_CACHE=true -# In allow_failures - - python: 3.4 - env: - - PYTHON_VERSION=3.4 - - JOB_NAME: "34_nslow" - - LOCALE_OVERRIDE="zh_CN.UTF-8" - - TEST_ARGS="--skip-slow" - - FULL_DEPS=true - - CLIPBOARD=xsel - - CACHE_NAME="34_nslow" - - USE_CACHE=true - addons: - apt: - packages: - - xsel - - language-pack-zh-hans -# In allow_failures - - python: 3.4 - env: - - PYTHON_VERSION=3.4 - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - TEST_ARGS="--only-slow --skip-network" - - FULL_DEPS=true - - CLIPBOARD=xsel - - CACHE_NAME="34_slow" - - USE_CACHE=true - addons: - apt: - packages: - - xsel # In allow_failures - python: 3.5 env: @@ -173,16 +127,6 @@ matrix: packages: - libatlas-base-dev - gfortran -# In allow_failures - - python: 3.5 - env: - - PYTHON_VERSION=3.5 - - JOB_NAME: "35_ascii" - - JOB_TAG=_ASCII - - TEST_ARGS="--skip-slow --skip-network" - - LOCALE_OVERRIDE="C" - - CACHE_NAME="35_ascii" - - USE_CACHE=true # In allow_failures - python: 3.5 env: @@ -203,20 +147,6 @@ matrix: - FULL_DEPS=true - CACHE_NAME="27_slow" - USE_CACHE=true - - python: 3.4 - env: - - PYTHON_VERSION=3.4 - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - TEST_ARGS="--only-slow --skip-network" - - FULL_DEPS=true - - CLIPBOARD=xsel - - CACHE_NAME="34_slow" - - USE_CACHE=true - addons: - apt: - packages: - - xsel - python: 2.7 env: - PYTHON_VERSION=2.7 @@ -227,21 +157,6 @@ matrix: - BUILD_TEST=true - CACHE_NAME="27_build_test_conda" - USE_CACHE=true - - python: 3.4 - env: - - PYTHON_VERSION=3.4 - - JOB_NAME: "34_nslow" - - LOCALE_OVERRIDE="zh_CN.UTF-8" - - TEST_ARGS="--skip-slow" - - FULL_DEPS=true - - CLIPBOARD=xsel - - CACHE_NAME="34_nslow" - - USE_CACHE=true - addons: - apt: - packages: - - xsel - - language-pack-zh-hans - python: 3.5 env: - PYTHON_VERSION=3.5 @@ -256,29 +171,6 @@ matrix: packages: - libatlas-base-dev - gfortran - - python: 2.7 - env: - - PYTHON_VERSION=2.7 - - JOB_NAME: "27_nslow_nnet_COMPAT" - - TEST_ARGS="--skip-slow --skip-network" - - LOCALE_OVERRIDE="it_IT.UTF-8" - - INSTALL_TEST=true - - JOB_TAG=_COMPAT - - CACHE_NAME="27_nslow_nnet_COMPAT" - - USE_CACHE=true - addons: - apt: - packages: - - language-pack-it - - python: 3.5 - env: - - PYTHON_VERSION=3.5 - - JOB_NAME: "35_ascii" - - JOB_TAG=_ASCII - - TEST_ARGS="--skip-slow --skip-network" - - LOCALE_OVERRIDE="C" - - CACHE_NAME="35_ascii" - - USE_CACHE=true - python: 3.5 env: - PYTHON_VERSION=3.5 @@ -299,7 +191,6 @@ before_install: - pwd - uname -a - python -V -# git info & get tags - git --version - git tag - ci/before_install_travis.sh diff --git a/circle.yml b/circle.yml 
index 97136d368ae6f..046af6e9e1389 100644 --- a/circle.yml +++ b/circle.yml @@ -3,16 +3,18 @@ machine: # these are globally set MINICONDA_DIR: /home/ubuntu/miniconda3 + database: override: - ./ci/install_db_circle.sh + checkout: post: # since circleci does a shallow fetch # we need to populate our tags - git fetch --depth=1000 - - git fetch --tags + dependencies: override: @@ -29,6 +31,7 @@ dependencies: esac - ./ci/show_circle.sh + test: override: - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: From 47a91f3c2e015e4f3f93d5537e30a93ffad56ab4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 22 Feb 2017 10:40:47 -0500 Subject: [PATCH 175/338] update README.md for badges (circleci and fix anaconda cloud pointer) --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4293d7294d5e0..195b76f64b37f 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ - + @@ -30,6 +30,15 @@ + + + + + From 4c015e92f67d7115181188716bdc0fd7e7d34cac Mon Sep 17 00:00:00 2001 From: Peter Csizsek Date: Thu, 23 Feb 2017 08:23:11 -0500 Subject: [PATCH 183/338] BUG: The roll_quantile function now throws an exception instead of causing a segfault when quantile is out of range closes #15463 Author: Peter Csizsek Closes #15476 from csizsek/fix-rolling-quantile-segfault and squashes the following commits: e31e5be [Peter Csizsek] Correctly catching exception in the test for Rolling.quantile. 4eea34a [Peter Csizsek] Refactored and moved exception throwing test to a new function for Rolling.quantile(). 8b1e020 [Peter Csizsek] Added a note about the Rolling.quantile bug fix to the changelog. f39b122 [Peter Csizsek] Added a new test case to roll_quantile_test to trigger a TypeError when called with a string. 
f736ca2 [Peter Csizsek] The roll_quantile function in window.pyx now raises a ValueError when the quantile value is not in [0.0, 1.0] --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/test_window.py | 14 +++++++++++++- pandas/window.pyx | 7 +++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e65276fe51fe8..fa24c973a7549 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -539,7 +539,7 @@ Bug Fixes - Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`) - +- Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 452e8999ab13f..3f2973a9834ca 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1063,7 +1063,7 @@ def test_rolling_max(self): window=3, min_periods=5) def test_rolling_quantile(self): - qs = [.1, .5, .9] + qs = [0.0, .1, .5, .9, 1.0] def scoreatpercentile(a, per): values = np.sort(a, axis=0) @@ -1084,6 +1084,18 @@ def alt(x): self._check_moment_func(f, alt, name='quantile', quantile=q) + def test_rolling_quantile_param(self): + ser = Series([0.0, .1, .5, .9, 1.0]) + + with self.assertRaises(ValueError): + ser.rolling(3).quantile(-0.1) + + with self.assertRaises(ValueError): + ser.rolling(3).quantile(10.0) + + with self.assertRaises(TypeError): + ser.rolling(3).quantile('foo') + def test_rolling_apply(self): # suppress warnings about empty slices, as we are deliberately testing # with a 0-length Series diff --git a/pandas/window.pyx b/pandas/window.pyx index 8235d68e2a88b..005d42c9f68be 100644 --- a/pandas/window.pyx +++ b/pandas/window.pyx @@ -134,8 +134,8 @@ cdef class WindowIndexer: bint is_variable def get_data(self): - return (self.start, self.end, self.N, - self.win, self.minp, + return (self.start, self.end, self.N, + self.win, self.minp, self.is_variable) @@ -1285,6 +1285,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, ndarray[int64_t] start, end ndarray[double_t] output + if quantile < 0.0 or quantile > 1.0: + raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) + # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs start, end, N, win, minp, is_variable = get_window_indexer( From 28105bd3d3756d73586b9f8131e53b1f86037779 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 24 Feb 2017 07:05:11 -0500 Subject: [PATCH 184/338] TST: add pytest to asv conf --- asv_bench/asv.conf.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 155deb5bdbd1f..4fc6f9f634426 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -50,7 +50,8 @@ "openpyxl": [], "xlsxwriter": [], "xlrd": [], - "xlwt": [] + "xlwt": [], + "pytest": [], }, // Combinations of libraries/python versions can be excluded/included From 6045e7976f7fc5fa68bba59942e9f392485bab00 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 24 Feb 2017 
07:13:46 -0500 Subject: [PATCH 185/338] CLN: split off frozen (immutable) data structures into pandas/indexes/frozen.py should make it a bit easier to work with these; and no reason to be in pandas/core/base.py Author: Jeff Reback Closes #15477 from jreback/frozen and squashes the following commits: 2a64a4f [Jeff Reback] CLN: split off frozen (immutable) data structures into pandas/indexes/frozen.py --- pandas/compat/pickle_compat.py | 34 +++++++- pandas/core/base.py | 105 ----------------------- pandas/indexes/base.py | 13 +-- pandas/indexes/frozen.py | 126 ++++++++++++++++++++++++++++ pandas/indexes/multi.py | 6 +- pandas/tests/indexes/test_frozen.py | 68 +++++++++++++++ pandas/tests/test_base.py | 68 +-------------- 7 files changed, 231 insertions(+), 189 deletions(-) create mode 100644 pandas/indexes/frozen.py create mode 100644 pandas/tests/indexes/test_frozen.py diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 1cdf8afd563c6..240baa848adbc 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -52,12 +52,40 @@ def load_reduce(self): stack[-1] = value + +# if classes are moved, provide compat here +_class_locations_map = { + + # 15477 + ('pandas.core.base', 'FrozenNDArray'): ('pandas.indexes.frozen', 'FrozenNDArray'), + ('pandas.core.base', 'FrozenList'): ('pandas.indexes.frozen', 'FrozenList') + } + + +# our Unpickler sub-class to override methods and some dispatcher +# functions for compat + if compat.PY3: class Unpickler(pkl._Unpickler): - pass + + def find_class(self, module, name): + # override superclass + key = (module, name) + module, name = _class_locations_map.get(key, key) + return super(Unpickler, self).find_class(module, name) + else: + class Unpickler(pkl.Unpickler): - pass + + def find_class(self, module, name): + # override superclass + key = (module, name) + module, name = _class_locations_map.get(key, key) + __import__(module) + mod = sys.modules[module] + klass = getattr(mod, name) + return klass Unpickler.dispatch = copy.copy(Unpickler.dispatch) Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce @@ -76,8 +104,6 @@ def load_newobj(self): self.stack[-1] = obj Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj -# py3 compat - def load_newobj_ex(self): kwargs = self.stack.pop() diff --git a/pandas/core/base.py b/pandas/core/base.py index 92ec6bb3d73e6..55149198b0dbf 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -17,7 +17,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError -from pandas.formats.printing import pprint_thing _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', @@ -694,110 +693,6 @@ def _gotitem(self, key, ndim, subset=None): return self -class FrozenList(PandasObject, list): - - """ - Container that doesn't allow setting item *but* - because it's technically non-hashable, will be used - for lookups, appropriately, etc. 
- """ - # Sidenote: This has to be of type list, otherwise it messes up PyTables - # typechecks - - def __add__(self, other): - if isinstance(other, tuple): - other = list(other) - return self.__class__(super(FrozenList, self).__add__(other)) - - __iadd__ = __add__ - - # Python 2 compat - def __getslice__(self, i, j): - return self.__class__(super(FrozenList, self).__getslice__(i, j)) - - def __getitem__(self, n): - # Python 3 compat - if isinstance(n, slice): - return self.__class__(super(FrozenList, self).__getitem__(n)) - return super(FrozenList, self).__getitem__(n) - - def __radd__(self, other): - if isinstance(other, tuple): - other = list(other) - return self.__class__(other + list(self)) - - def __eq__(self, other): - if isinstance(other, (tuple, FrozenList)): - other = list(other) - return super(FrozenList, self).__eq__(other) - - __req__ = __eq__ - - def __mul__(self, other): - return self.__class__(super(FrozenList, self).__mul__(other)) - - __imul__ = __mul__ - - def __reduce__(self): - return self.__class__, (list(self),) - - def __hash__(self): - return hash(tuple(self)) - - def _disabled(self, *args, **kwargs): - """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % - self.__class__.__name__) - - def __unicode__(self): - return pprint_thing(self, quote_strings=True, - escape_chars=('\t', '\r', '\n')) - - def __repr__(self): - return "%s(%s)" % (self.__class__.__name__, - str(self)) - - __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled - pop = append = extend = remove = sort = insert = _disabled - - -class FrozenNDArray(PandasObject, np.ndarray): - - # no __array_finalize__ for now because no metadata - def __new__(cls, data, dtype=None, copy=False): - if copy is None: - copy = not isinstance(data, FrozenNDArray) - res = np.array(data, dtype=dtype, copy=copy).view(cls) - return res - - def _disabled(self, *args, **kwargs): - """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % - self.__class__) - - __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled - put = itemset = fill = _disabled - - def _shallow_copy(self): - return self.view() - - def values(self): - """returns *copy* of underlying array""" - arr = self.view(np.ndarray).copy() - return arr - - def __unicode__(self): - """ - Return a string representation for this object. - - Invoked by unicode(df) in py2 only. Yields a Unicode String in both - py2/py3. 
- """ - prepr = pprint_thing(self, escape_chars=('\t', '\r', '\n'), - quote_strings=True) - return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) - - class IndexOpsMixin(object): """ common ops mixin to support a unified inteface / docs for Series / Index diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index f1f37622b2a74..4837fc0d7438c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -35,16 +35,15 @@ needs_i8_conversion, is_iterator, is_list_like, is_scalar) -from pandas.types.cast import _coerce_indexer_dtype from pandas.core.common import (is_bool_indexer, _values_from_object, _asarray_tuplesafe) -from pandas.core.base import (PandasObject, FrozenList, FrozenNDArray, - IndexOpsMixin) +from pandas.core.base import PandasObject, IndexOpsMixin import pandas.core.base as base from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate, deprecate_kwarg) +from pandas.indexes.frozen import FrozenList import pandas.core.common as com import pandas.types.concat as _concat import pandas.core.missing as missing @@ -3844,14 +3843,6 @@ def _get_na_value(dtype): np.timedelta64: tslib.NaT}.get(dtype, np.nan) -def _ensure_frozen(array_like, categories, copy=False): - array_like = _coerce_indexer_dtype(array_like, categories) - array_like = array_like.view(FrozenNDArray) - if copy: - array_like = array_like.copy() - return array_like - - def _ensure_has_len(seq): """If seq is an iterator, put its values into a list.""" try: diff --git a/pandas/indexes/frozen.py b/pandas/indexes/frozen.py new file mode 100644 index 0000000000000..e043ba64bbad7 --- /dev/null +++ b/pandas/indexes/frozen.py @@ -0,0 +1,126 @@ +""" +frozen (immutable) data structures to support MultiIndexing + +These are used for: + +- .names (FrozenList) +- .levels & .labels (FrozenNDArray) + +""" + +import numpy as np +from pandas.core.base import PandasObject +from pandas.types.cast import _coerce_indexer_dtype +from pandas.formats.printing import pprint_thing + + +class FrozenList(PandasObject, list): + + """ + Container that doesn't allow setting item *but* + because it's technically non-hashable, will be used + for lookups, appropriately, etc. + """ + # Sidenote: This has to be of type list, otherwise it messes up PyTables + # typechecks + + def __add__(self, other): + if isinstance(other, tuple): + other = list(other) + return self.__class__(super(FrozenList, self).__add__(other)) + + __iadd__ = __add__ + + # Python 2 compat + def __getslice__(self, i, j): + return self.__class__(super(FrozenList, self).__getslice__(i, j)) + + def __getitem__(self, n): + # Python 3 compat + if isinstance(n, slice): + return self.__class__(super(FrozenList, self).__getitem__(n)) + return super(FrozenList, self).__getitem__(n) + + def __radd__(self, other): + if isinstance(other, tuple): + other = list(other) + return self.__class__(other + list(self)) + + def __eq__(self, other): + if isinstance(other, (tuple, FrozenList)): + other = list(other) + return super(FrozenList, self).__eq__(other) + + __req__ = __eq__ + + def __mul__(self, other): + return self.__class__(super(FrozenList, self).__mul__(other)) + + __imul__ = __mul__ + + def __reduce__(self): + return self.__class__, (list(self),) + + def __hash__(self): + return hash(tuple(self)) + + def _disabled(self, *args, **kwargs): + """This method will not function because object is immutable.""" + raise TypeError("'%s' does not support mutable operations." 
% + self.__class__.__name__) + + def __unicode__(self): + return pprint_thing(self, quote_strings=True, + escape_chars=('\t', '\r', '\n')) + + def __repr__(self): + return "%s(%s)" % (self.__class__.__name__, + str(self)) + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + pop = append = extend = remove = sort = insert = _disabled + + +class FrozenNDArray(PandasObject, np.ndarray): + + # no __array_finalize__ for now because no metadata + def __new__(cls, data, dtype=None, copy=False): + if copy is None: + copy = not isinstance(data, FrozenNDArray) + res = np.array(data, dtype=dtype, copy=copy).view(cls) + return res + + def _disabled(self, *args, **kwargs): + """This method will not function because object is immutable.""" + raise TypeError("'%s' does not support mutable operations." % + self.__class__) + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + put = itemset = fill = _disabled + + def _shallow_copy(self): + return self.view() + + def values(self): + """returns *copy* of underlying array""" + arr = self.view(np.ndarray).copy() + return arr + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + prepr = pprint_thing(self, escape_chars=('\t', '\r', '\n'), + quote_strings=True) + return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + + +def _ensure_frozen(array_like, categories, copy=False): + array_like = _coerce_indexer_dtype(array_like, categories) + array_like = array_like.view(FrozenNDArray) + if copy: + array_like = array_like.copy() + return array_like diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 18e1da7303d6d..ec30d2c44efd7 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -28,7 +28,6 @@ UnsortedIndexError) -from pandas.core.base import FrozenList import pandas.core.base as base from pandas.util.decorators import (Appender, cache_readonly, deprecate, deprecate_kwarg) @@ -39,9 +38,10 @@ from pandas.core.config import get_option -from pandas.indexes.base import (Index, _ensure_index, _ensure_frozen, +from pandas.indexes.base import (Index, _ensure_index, _get_na_value, InvalidIndexError, _index_shared_docs) +from pandas.indexes.frozen import FrozenNDArray, FrozenList, _ensure_frozen import pandas.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( @@ -1276,7 +1276,7 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, for new_label in taken: label_values = new_label.values() label_values[mask] = na_value - masked.append(base.FrozenNDArray(label_values)) + masked.append(FrozenNDArray(label_values)) taken = masked else: taken = [lab.take(indices) for lab in self.labels] diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py new file mode 100644 index 0000000000000..a82409fbf9513 --- /dev/null +++ b/pandas/tests/indexes/test_frozen.py @@ -0,0 +1,68 @@ +import numpy as np +from pandas.util import testing as tm +from pandas.tests.test_base import CheckImmutable, CheckStringMixin +from pandas.indexes.frozen import FrozenList, FrozenNDArray +from pandas.compat import u + + +class TestFrozenList(CheckImmutable, CheckStringMixin, tm.TestCase): + mutable_methods = ('extend', 'pop', 'remove', 'insert') + unicode_container = FrozenList([u("\u05d0"), u("\u05d1"), "c"]) + + def setUp(self): + self.lst = [1, 2, 3, 4, 5] + self.container = FrozenList(self.lst) + self.klass = 
FrozenList + + def test_add(self): + result = self.container + (1, 2, 3) + expected = FrozenList(self.lst + [1, 2, 3]) + self.check_result(result, expected) + + result = (1, 2, 3) + self.container + expected = FrozenList([1, 2, 3] + self.lst) + self.check_result(result, expected) + + def test_inplace(self): + q = r = self.container + q += [5] + self.check_result(q, self.lst + [5]) + # other shouldn't be mutated + self.check_result(r, self.lst) + + +class TestFrozenNDArray(CheckImmutable, CheckStringMixin, tm.TestCase): + mutable_methods = ('put', 'itemset', 'fill') + unicode_container = FrozenNDArray([u("\u05d0"), u("\u05d1"), "c"]) + + def setUp(self): + self.lst = [3, 5, 7, -2] + self.container = FrozenNDArray(self.lst) + self.klass = FrozenNDArray + + def test_shallow_copying(self): + original = self.container.copy() + self.assertIsInstance(self.container.view(), FrozenNDArray) + self.assertFalse(isinstance( + self.container.view(np.ndarray), FrozenNDArray)) + self.assertIsNot(self.container.view(), self.container) + self.assert_numpy_array_equal(self.container, original) + # shallow copy should be the same too + self.assertIsInstance(self.container._shallow_copy(), FrozenNDArray) + + # setting should not be allowed + def testit(container): + container[0] = 16 + + self.check_mutable_error(testit, self.container) + + def test_values(self): + original = self.container.view(np.ndarray).copy() + n = original[0] + 15 + vals = self.container.values() + self.assert_numpy_array_equal(original, vals) + self.assertIsNot(original, vals) + vals[0] = n + self.assertIsInstance(self.container, FrozenNDArray) + self.assert_numpy_array_equal(self.container.values(), original) + self.assertEqual(vals[0], n) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 473f1d81c9532..8264ad33950f9 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -14,10 +14,9 @@ import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta) -from pandas.compat import u, StringIO +from pandas.compat import StringIO from pandas.compat.numpy import np_array_datetime64_compat -from pandas.core.base import (FrozenList, FrozenNDArray, PandasDelegate, - NoNewAttributesMixin) +from pandas.core.base import PandasDelegate, NoNewAttributesMixin from pandas.tseries.base import DatetimeIndexOpsMixin @@ -83,69 +82,6 @@ def check_result(self, result, expected, klass=None): self.assertEqual(result, expected) -class TestFrozenList(CheckImmutable, CheckStringMixin, tm.TestCase): - mutable_methods = ('extend', 'pop', 'remove', 'insert') - unicode_container = FrozenList([u("\u05d0"), u("\u05d1"), "c"]) - - def setUp(self): - self.lst = [1, 2, 3, 4, 5] - self.container = FrozenList(self.lst) - self.klass = FrozenList - - def test_add(self): - result = self.container + (1, 2, 3) - expected = FrozenList(self.lst + [1, 2, 3]) - self.check_result(result, expected) - - result = (1, 2, 3) + self.container - expected = FrozenList([1, 2, 3] + self.lst) - self.check_result(result, expected) - - def test_inplace(self): - q = r = self.container - q += [5] - self.check_result(q, self.lst + [5]) - # other shouldn't be mutated - self.check_result(r, self.lst) - - -class TestFrozenNDArray(CheckImmutable, CheckStringMixin, tm.TestCase): - mutable_methods = ('put', 'itemset', 'fill') - unicode_container = FrozenNDArray([u("\u05d0"), u("\u05d1"), "c"]) - - def setUp(self): - self.lst = [3, 5, 7, -2] - self.container = FrozenNDArray(self.lst) - self.klass = 
FrozenNDArray - - def test_shallow_copying(self): - original = self.container.copy() - self.assertIsInstance(self.container.view(), FrozenNDArray) - self.assertFalse(isinstance( - self.container.view(np.ndarray), FrozenNDArray)) - self.assertIsNot(self.container.view(), self.container) - self.assert_numpy_array_equal(self.container, original) - # shallow copy should be the same too - self.assertIsInstance(self.container._shallow_copy(), FrozenNDArray) - - # setting should not be allowed - def testit(container): - container[0] = 16 - - self.check_mutable_error(testit, self.container) - - def test_values(self): - original = self.container.view(np.ndarray).copy() - n = original[0] + 15 - vals = self.container.values() - self.assert_numpy_array_equal(original, vals) - self.assertIsNot(original, vals) - vals[0] = n - self.assertIsInstance(self.container, pd.core.base.FrozenNDArray) - self.assert_numpy_array_equal(self.container.values(), original) - self.assertEqual(vals[0], n) - - class TestPandasDelegate(tm.TestCase): class Delegator(object): From f8c5a24080dfbc48e4a01799d36961e4bce5d626 Mon Sep 17 00:00:00 2001 From: Fumito Hamamura Date: Sat, 25 Feb 2017 04:53:43 +0900 Subject: [PATCH 186/338] DOC: Fix to docstrings of is_type_factory and is_instance_factory (#15499) Closes #15485 --- pandas/core/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/config.py b/pandas/core/config.py index 1c0eb60b8ec2f..39ed2f9545266 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -747,8 +747,8 @@ def is_type_factory(_type): Returns ------- - validator - a function of a single argument x , which returns the - True if type(x) is equal to `_type` + validator - a function of a single argument x , which raises + ValueError if type(x) is not equal to `_type` """ @@ -768,8 +768,8 @@ def is_instance_factory(_type): Returns ------- - validator - a function of a single argument x , which returns the - True if x is an instance of `_type` + validator - a function of a single argument x , which raises + ValueError if x is not an instance of `_type` """ if isinstance(_type, (tuple, list)): From 28f8c8f88905e949c9198b62abc04df6b866774c Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 24 Feb 2017 14:56:09 -0500 Subject: [PATCH 187/338] BUG: incorrect ranking in an ordered categorical check for categorical, and then pass the underlying integer codes. 
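To make the fix concrete, here is a minimal sketch of the behaviour this patch targets (the data are illustrative; the new test_rank_categorical below exercises the same cases):

    import pandas as pd

    # Category order deliberately disagrees with the lexical order
    # of the strings ('high' < 'low' < 'medium' alphabetically).
    s = pd.Series(['low', 'medium', 'high']).astype('category')
    s = s.cat.set_categories(['low', 'medium', 'high'], ordered=True)

    # Ranking on the underlying integer codes (0, 1, 2) honours the
    # declared category order and yields [1.0, 2.0, 3.0]; ranking the
    # strings as plain objects would yield [2.0, 3.0, 1.0] instead.
    print(s.rank())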
closes #15420 Author: Prasanjit Prakash Closes #15422 from ikilledthecat/rank_categorical and squashes the following commits: a7e573b [Prasanjit Prakash] moved test for categorical, in rank, to top 3ba4e3a [Prasanjit Prakash] corrections after rebasing c43a029 [Prasanjit Prakash] using if/else construct to pick sorting function for categoricals f8ec019 [Prasanjit Prakash] ask Categorical for ranking function 40d88c1 [Prasanjit Prakash] return values for rank from categorical object 049c0fc [Prasanjit Prakash] GH#15420 added support for na_option when ranking categorical 5e5bbeb [Prasanjit Prakash] BUG: GH#15420 rank for categoricals ef999c3 [Prasanjit Prakash] merged with upstream master fbaba1b [Prasanjit Prakash] return values for rank from categorical object fa0b4c2 [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical 9a6b5cd [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical 4220e56 [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical 6b70921 [Prasanjit Prakash] GH#15420 move rank inside categoricals bf4e36c [Prasanjit Prakash] GH#15420 added support for na_option when ranking categorical ce90207 [Prasanjit Prakash] BUG: GH#15420 rank for categoricals 85b267a [Prasanjit Prakash] Added support for categorical datatype in rank - issue#15420 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 5 +- pandas/core/categorical.py | 22 ++++++++ pandas/tests/series/test_analytics.py | 78 +++++++++++++++++++++++++++ 4 files changed, 105 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index fa24c973a7549..0b501adba5039 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -578,6 +578,7 @@ Bug Fixes +- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4ae46fe33a5cc..b11927a80fb2e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -973,6 +973,10 @@ def _hashtable_algo(f, values, return_dtype=None): def _get_data_algo(values, func_map): f = None + + if is_categorical_dtype(values): + values = values._values_for_rank() + if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -988,7 +992,6 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) - else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b6898f11ffa74..b88a6b171b316 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1404,6 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + def _values_for_rank(self): + """ + For correctly ranking ordered categorical data. See GH#15420 + + Ordered categorical data should be ranked on the basis of + codes with -1 translated to NaN. + + Returns + ------- + numpy array + + """ + if self.ordered: + values = self.codes + mask = values == -1 + if mask.any(): + values = values.astype('float64') + values[mask] = np.nan + else: + values = np.array(self) + return values + def order(self, inplace=False, ascending=True, na_position='last'): """ DEPRECATED: use :meth:`Categorical.sort_values`. 
That function diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 222165e9d3633..b092e4f084767 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1057,6 +1057,84 @@ def test_rank(self): iranks = iseries.rank() assert_series_equal(iranks, exp) + def test_rank_categorical(self): + # GH issue #15420 rank incorrectly orders ordered categories + + # Test ascending/descending ranking for ordered categoricals + exp = pd.Series([1., 2., 3., 4., 5., 6.]) + exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) + ordered = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] + ).astype('category', ).cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ordered=True + ) + assert_series_equal(ordered.rank(), exp) + assert_series_equal(ordered.rank(ascending=False), exp_desc) + + # Unordered categoricals should be ranked as objects + unordered = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype('category').cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ordered=False + ) + exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.]) + res = unordered.rank() + assert_series_equal(res, exp_unordered) + + # Test na_option for rank data + na_ser = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] + ).astype('category', ).cat.set_categories( + [ + 'first', 'second', 'third', 'fourth', + 'fifth', 'sixth', 'seventh' + ], + ordered=True + ) + + exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.]) + exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.]) + exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top'), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) + + # Test na_option for rank data with ascending False + exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.]) + exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.]) + exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN]) + + assert_series_equal( + na_ser.rank(na_option='top', ascending=False), + exp_top + ) + assert_series_equal( + na_ser.rank(na_option='bottom', ascending=False), + exp_bot + ) + assert_series_equal( + na_ser.rank(na_option='keep', ascending=False), + exp_keep + ) + + # Test with pct=True + na_ser = pd.Series( + ['first', 'second', 'third', 'fourth', np.NaN], + ).astype('category').cat.set_categories( + ['first', 'second', 'third', 'fourth'], + ordered=True + ) + exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2]) + exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.]) + exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + def test_rank_signature(self): s = Series([0, 1]) s.rank(method='average') From f1ea8b105628c3d72d03fe4723fdeaa8737541a8 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 24 Feb 2017 14:59:04 -0500 Subject: [PATCH 188/338] BUG: Accept Generic Array-Like for .where Author: gfyoung Closes #15414 from gfyoung/generic-where-gen-array and squashes the following commits: 5037932 [gfyoung] BUG: Accept generic array-like in .where --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/generic.py | 42 +++++----- pandas/indexes/base.py | 2 +- pandas/tests/frame/test_indexing.py | 89 
++++++++++++++++++++++ pandas/tests/indexes/common.py | 12 +++ pandas/tests/indexes/period/test_period.py | 11 ++- pandas/tests/indexes/test_category.py | 12 ++- pandas/tests/indexes/test_multi.py | 9 +++ pandas/tests/series/test_indexing.py | 54 +++++++++++++ 9 files changed, 211 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0b501adba5039..4b3a65780f939 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -548,6 +548,7 @@ Bug Fixes +- Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) - Bug in ``Series`` construction with a datetimetz (:issue:`14928`) - Bug in compat for passing long integers to ``Timestamp.replace`` (:issue:`15030`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 76fbb9884753d..921fa2fb1bd48 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4726,25 +4726,37 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, """ inplace = validate_bool_kwarg(inplace, 'inplace') + # align the cond to same shape as myself cond = com._apply_if_callable(cond, self) - if isinstance(cond, NDFrame): cond, _ = cond.align(self, join='right', broadcast_axis=1) else: if not hasattr(cond, 'shape'): - raise ValueError('where requires an ndarray like object for ' - 'its condition') + cond = np.asanyarray(cond) if cond.shape != self.shape: raise ValueError('Array conditional must be same shape as ' 'self') cond = self._constructor(cond, **self._construct_axes_dict()) - if inplace: - cond = -(cond.fillna(True).astype(bool)) + # make sure we are boolean + fill_value = True if inplace else False + cond = cond.fillna(fill_value) + + msg = "Boolean array expected for the condition, not {dtype}" + + if not isinstance(cond, pd.DataFrame): + # This is a single-dimensional object. + if not is_bool_dtype(cond): + raise ValueError(msg.format(dtype=cond.dtype)) else: - cond = cond.fillna(False).astype(bool) + for dt in cond.dtypes: + if not is_bool_dtype(dt): + raise ValueError(msg.format(dtype=dt)) - # try to align + cond = cond.astype(bool, copy=False) + cond = -cond if inplace else cond + + # try to align with other try_quick = True if hasattr(other, 'align'): @@ -4891,26 +4903,20 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, Parameters ---------- - cond : boolean %(klass)s, array or callable + cond : boolean %(klass)s, array-like, or callable If cond is callable, it is computed on the %(klass)s and - should return boolean %(klass)s or array. - The callable must not change input %(klass)s - (though pandas doesn't check it). + should return boolean %(klass)s or array. The callable must + not change input %(klass)s (though pandas doesn't check it). .. versionadded:: 0.18.1 - A callable can be used as cond. - other : scalar, %(klass)s, or callable If other is callable, it is computed on the %(klass)s and - should return scalar or %(klass)s. - The callable must not change input %(klass)s - (though pandas doesn't check it). + should return scalar or %(klass)s. The callable must not + change input %(klass)s (though pandas doesn't check it). .. versionadded:: 0.18.1 - A callable can be used as other. 
- inplace : boolean, default False Whether to perform the operation in place on the data axis : alignment axis if needed, default None diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 4837fc0d7438c..dcbcccdfcd610 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -573,7 +573,7 @@ def repeat(self, repeats, *args, **kwargs): Parameters ---------- - cond : boolean same length as self + cond : boolean array-like with the same length as self other : scalar, or array-like """ diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index c06faa75ed346..18fb17b98570a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2479,6 +2479,95 @@ def _check_set(df, cond, check_dtypes=True): expected = df[df['a'] == 1].reindex(df.index) assert_frame_equal(result, expected) + def test_where_array_like(self): + # see gh-15414 + klasses = [list, tuple, np.array] + + df = DataFrame({'a': [1, 2, 3]}) + cond = [[False], [True], [True]] + expected = DataFrame({'a': [np.nan, 2, 3]}) + + for klass in klasses: + result = df.where(klass(cond)) + assert_frame_equal(result, expected) + + df['b'] = 2 + expected['b'] = [2, np.nan, 2] + cond = [[False, True], [True, False], [True, True]] + + for klass in klasses: + result = df.where(klass(cond)) + assert_frame_equal(result, expected) + + def test_where_invalid_input(self): + # see gh-15414: only boolean arrays accepted + df = DataFrame({'a': [1, 2, 3]}) + msg = "Boolean array expected for the condition" + + conds = [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({'a': [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], + [pd.NaT], [Timestamp("2017-01-02")]] + ] + + for cond in conds: + with tm.assertRaisesRegexp(ValueError, msg): + df.where(cond) + + df['b'] = 2 + conds = [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], + ["True", "True"]], + DataFrame({'a': [2, 5, 7], 'b': [4, 8, 9]}), + [[pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")]] + ] + + for cond in conds: + with tm.assertRaisesRegexp(ValueError, msg): + df.where(cond) + + def test_where_dataframe_col_match(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + cond = DataFrame([[True, False, True], [False, False, True]]) + + out = df.where(cond) + expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) + tm.assert_frame_equal(out, expected) + + cond.columns = ["a", "b", "c"] # Columns no longer match. 
+ msg = "Boolean array expected for the condition" + with tm.assertRaisesRegexp(ValueError, msg): + df.where(cond) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + + cond = [True] + with tm.assertRaisesRegexp(ValueError, msg): + df.where(cond) + + expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + cond = np.array([False, True, False, True]) + with tm.assertRaisesRegexp(ValueError, msg): + df.where(cond) + + expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + def test_where_bug(self): # GH 2793 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 81ad0524807f3..7b39a33266ffa 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -497,6 +497,18 @@ def test_where(self): result = i.where(cond) tm.assert_index_equal(result, expected) + def test_where_array_like(self): + i = self.create_index() + + _nan = i._na_value + cond = [False] + [True] * (len(i) - 1) + klasses = [list, tuple, np.array, pd.Series] + expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) + + for klass in klasses: + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 6a8128bb8985f..b80ab6feeeb23 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -89,13 +89,22 @@ def test_where(self): expected = i tm.assert_index_equal(result, expected) - i2 = i.copy() i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq='D') result = i.where(notnull(i2)) expected = i2 tm.assert_index_equal(result, expected) + def test_where_array_like(self): + i = self.create_index() + cond = [False] + [True] * (len(i) - 1) + klasses = [list, tuple, np.array, Series] + expected = pd.PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') + + for klass in klasses: + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + def test_where_other(self): i = self.create_index() diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 6b6885c082533..64a0e71bd5ace 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -240,13 +240,23 @@ def test_where(self): expected = i tm.assert_index_equal(result, expected) - i2 = i.copy() i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(), categories=i.categories) result = i.where(notnull(i2)) expected = i2 tm.assert_index_equal(result, expected) + def test_where_array_like(self): + i = self.create_index() + cond = [False] + [True] * (len(i) - 1) + klasses = [list, tuple, np.array, pd.Series] + expected = pd.CategoricalIndex([np.nan] + i[1:].tolist(), + categories=i.categories) + + for klass in klasses: + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + def test_append(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5611492b4af1b..80ff67ab3d043 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -88,6 +88,15 @@ def f(): self.assertRaises(NotImplementedError, f) + def 
test_where_array_like(self): + i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + klasses = [list, tuple, np.array, pd.Series] + cond = [False, True] + + for klass in klasses: + f = lambda: i.where(klass(cond)) + self.assertRaises(NotImplementedError, f) + def test_repeat(self): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index a20cb8324d2a3..8a2cc53b42938 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1193,6 +1193,60 @@ def f(): expected = Series(np.nan, index=[9]) assert_series_equal(result, expected) + def test_where_array_like(self): + # see gh-15414 + s = Series([1, 2, 3]) + cond = [False, True, True] + expected = Series([np.nan, 2, 3]) + klasses = [list, tuple, np.array, Series] + + for klass in klasses: + result = s.where(klass(cond)) + assert_series_equal(result, expected) + + def test_where_invalid_input(self): + # see gh-15414: only boolean arrays accepted + s = Series([1, 2, 3]) + msg = "Boolean array expected for the condition" + + conds = [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), + pd.NaT, Timestamp("2017-01-02")] + ] + + for cond in conds: + with tm.assertRaisesRegexp(ValueError, msg): + s.where(cond) + + msg = "Array conditional must be same shape as self" + with tm.assertRaisesRegexp(ValueError, msg): + s.where([True]) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + s = Series([1, 2, 3]) + + cond = [True] + with tm.assertRaisesRegexp(ValueError, msg): + s.where(cond) + + expected = Series([1, np.nan, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + cond = np.array([False, True, False, True]) + with tm.assertRaisesRegexp(ValueError, msg): + s.where(cond) + + expected = Series([np.nan, 2, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + def test_where_setitem_invalid(self): # GH 2702 From 90909f628ad47a0c9e713691892418f026121456 Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Fri, 24 Feb 2017 15:07:25 -0500 Subject: [PATCH 189/338] BUG: GH #12223, GH #15262. Allow ints for names in MultiIndex closes #12223 closes #15262 Author: Dr-Irv Closes #15478 from Dr-Irv/Issue15262 and squashes the following commits: 15d8433 [Dr-Irv] Address jreback comments 10667a3 [Dr-Irv] Fix types for test 8935068 [Dr-Irv] resolve conflicts 385ca3e [Dr-Irv] BUG: GH #12223, GH #15262. 
Allow ints for names in MultiIndex --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 6 +++--- pandas/core/groupby.py | 6 +++--- pandas/core/reshape.py | 3 ++- pandas/formats/format.py | 2 +- pandas/indexes/base.py | 10 ++++++---- pandas/indexes/multi.py | 14 ++++++++------ pandas/io/sql.py | 2 +- pandas/tests/frame/test_combine_concat.py | 18 ++++++++++++++++++ pandas/util/doctools.py | 6 +++--- 10 files changed, 46 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4b3a65780f939..7426b5ca2a69d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -550,6 +550,7 @@ Bug Fixes - Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) - Bug in ``Series`` construction with a datetimetz (:issue:`14928`) +- Bug in output formatting of a ``MultiIndex`` when names are integers (:issue:`12223`, :issue:`15262`) - Bug in compat for passing long integers to ``Timestamp.replace`` (:issue:`15030`) - Bug in ``.loc`` that would not return the correct dtype for scalar access for a DataFrame (:issue:`11617`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bfef2cfbd0d51..ce3481fc17c5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2876,7 +2876,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, names = [x for x in self.index.names] if isinstance(self.index, MultiIndex): for i in range(self.index.nlevels): - arrays.append(self.index.get_level_values(i)) + arrays.append(self.index._get_level_values(i)) else: arrays.append(self.index) @@ -2886,9 +2886,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, # append all but the last column so we don't have to modify # the end of this loop for n in range(col.nlevels - 1): - arrays.append(col.get_level_values(n)) + arrays.append(col._get_level_values(n)) - level = col.get_level_values(col.nlevels - 1) + level = col._get_level_values(col.nlevels - 1) names.extend(col.names) elif isinstance(col, Series): level = col._values diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0b3fcba1c1ba5..831ca3886773e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -291,8 +291,8 @@ def _set_grouper(self, obj, sort=False): # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - ax = Index(ax.get_level_values( - level), name=ax.names[level]) + ax = Index(ax._get_level_values(level), + name=ax.names[level]) else: if level not in (0, ax.name): @@ -761,7 +761,7 @@ def _index_with_as_index(self, b): gp = self.grouper levels = chain((gp.levels[i][gp.labels[i][b]] for i in range(len(gp.groupings))), - (original.get_level_values(i)[b] + (original._get_level_values(i)[b] for i in range(original.nlevels))) new = MultiIndex.from_arrays(list(levels)) new.names = gp.names + original.names diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 5fc0d590a6885..87cb088c2e91e 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -811,7 +811,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index - mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N) + mdata[col] = np.asanyarray(frame.columns + ._get_level_values(i)).repeat(N) return DataFrame(mdata, columns=mcolumns) diff --git a/pandas/formats/format.py 
b/pandas/formats/format.py index 6b235b5e1bc33..4c081770e0125 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1566,7 +1566,7 @@ def _save_header(self): if isinstance(index_label, list) and len(index_label) > 1: col_line.extend([''] * (len(index_label) - 1)) - col_line.extend(columns.get_level_values(i)) + col_line.extend(columns._get_level_values(i)) writer.writerow(col_line) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index dcbcccdfcd610..5d43d2d32af67 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2334,9 +2334,9 @@ def set_value(self, arr, key, value): self._engine.set_value(_values_from_object(arr), _values_from_object(key), value) - def get_level_values(self, level): + def _get_level_values(self, level): """ - Return vector of label values for requested level, equal to the length + Return an Index of values for requested level, equal to the length of the index Parameters @@ -2345,12 +2345,14 @@ def get_level_values(self, level): Returns ------- - values : ndarray + values : Index """ - # checks that level number is actually just 1 + self._validate_index_level(level) return self + get_level_values = _get_level_values + _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index ec30d2c44efd7..23a42265a149b 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -684,7 +684,7 @@ def is_monotonic_increasing(self): """ # reversed() because lexsort() wants the most significant key last. - values = [self._get_level_values(i) + values = [self._get_level_values(i).values for i in reversed(range(len(self.levels)))] try: sort_order = np.lexsort(values) @@ -866,7 +866,8 @@ def _get_level_values(self, level): labels = self.labels[level] filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) - return filled + values = unique._shallow_copy(filled) + return values def get_level_values(self, level): """ @@ -883,7 +884,7 @@ def get_level_values(self, level): """ level = self._get_level_number(level) values = self._get_level_values(level) - return self.levels[level]._shallow_copy(values) + return values def format(self, space=2, sparsify=None, adjoin=True, names=False, na_rep=None, formatter=None): @@ -966,7 +967,8 @@ def to_frame(self, index=True): """ from pandas import DataFrame - result = DataFrame({(name or level): self.get_level_values(level) + result = DataFrame({(name or level): + self._get_level_values(level) for name, level in zip(self.names, range(len(self.levels)))}, copy=False) @@ -1301,8 +1303,8 @@ def append(self, other): for o in other): arrays = [] for i in range(self.nlevels): - label = self.get_level_values(i) - appended = [o.get_level_values(i) for o in other] + label = self._get_level_values(i) + appended = [o._get_level_values(i) for o in other] arrays.append(label.append(appended)) return MultiIndex.from_arrays(arrays, names=self.names) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index bace43e785dff..2ab642b3af0c7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -749,7 +749,7 @@ def _get_column_names_and_types(self, dtype_mapper): if self.index is not None: for i, idx_label in enumerate(self.index): idx_type = dtype_mapper( - self.frame.index.get_level_values(i)) + self.frame.index._get_level_values(i)) column_names_and_types.append((text_type(idx_label), idx_type, True)) diff --git 
a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index eed4d6261d6e8..6f06a55ad065e 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -422,6 +422,24 @@ def test_concat_axis_parameter(self): with assertRaisesRegexp(ValueError, 'No axis named'): pd.concat([series1, series2], axis='something') + def test_concat_numerical_names(self): + # #15262 # #12223 + df = pd.DataFrame({'col': range(9)}, + dtype='int32', + index=(pd.MultiIndex + .from_product([['A0', 'A1', 'A2'], + ['B0', 'B1', 'B2']], + names=[1, 2]))) + result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) + expected = pd.DataFrame({'col': [0, 1, 7, 8]}, + dtype='int32', + index=pd.MultiIndex.from_tuples([('A0', 'B0'), + ('A0', 'B1'), + ('A2', 'B1'), + ('A2', 'B2')], + names=[1, 2])) + tm.assert_frame_equal(result, expected) + class TestDataFrameCombineFirst(tm.TestCase, TestData): diff --git a/pandas/util/doctools.py b/pandas/util/doctools.py index 62dcba1405581..6df6444aeafab 100644 --- a/pandas/util/doctools.py +++ b/pandas/util/doctools.py @@ -113,12 +113,12 @@ def _insert_index(self, data): else: for i in range(idx_nlevels): data.insert(i, 'Index{0}'.format(i), - data.index.get_level_values(i)) + data.index._get_level_values(i)) col_nlevels = data.columns.nlevels if col_nlevels > 1: - col = data.columns.get_level_values(0) - values = [data.columns.get_level_values(i).values + col = data.columns._get_level_values(0) + values = [data.columns._get_level_values(i).values for i in range(1, col_nlevels)] col_df = pd.DataFrame(values) data.columns = col_df.columns From 889b0dd23a3700e6ddfdfc38517c2dc54f242750 Mon Sep 17 00:00:00 2001 From: Arco Bast Date: Fri, 24 Feb 2017 15:37:18 -0500 Subject: [PATCH 190/338] BUG: msgpack supports CategoricalIndex closes #15487 Author: Arco Bast Closes #15493 from abast/CategoricalIndex_msgpack and squashes the following commits: c1c68e4 [Arco Bast] corrections 3c1f2e7 [Arco Bast] whatsnew 215c2aa [Arco Bast] improve tests cd9354f [Arco Bast] improve tests 7895c16 [Arco Bast] flake8 f3f492a [Arco Bast] fix test 91d85cb [Arco Bast] msgpack supports CategoricalIndex --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/io/packers.py | 2 +- pandas/tests/io/test_packers.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7426b5ca2a69d..c94429b469641 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -615,6 +615,7 @@ Bug Fixes - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) - Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) +- Bug in ``pd.read_msgpack`` when deserializing a ``CategoricalIndex`` (:issue:`15487`) - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) @@ -630,3 +631,4 @@ Bug Fixes - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) +- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) diff --git 
a/pandas/io/packers.py b/pandas/io/packers.py index 3f4be6ad459d8..7afe8a06b6af1 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -54,7 +54,7 @@ from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, - Categorical) + Categorical, CategoricalIndex) from pandas.tslib import NaTType from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 097c03937ca68..251c6ae8b4dec 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -311,6 +311,7 @@ def setUp(self): 'period': Index(period_range('2012-1-1', freq='M', periods=3)), 'date2': Index(date_range('2013-01-1', periods=10)), 'bdate': Index(bdate_range('2013-01-02', periods=10)), + 'cat': tm.makeCategoricalIndex(100) } self.mi = { @@ -349,6 +350,13 @@ def test_unicode(self): i_rec = self.encode_decode(i) self.assert_index_equal(i, i_rec) + def categorical_index(self): + # GH15487 + df = DataFrame(np.random.randn(10, 2)) + df = df.astype({0: 'category'}).set_index(0) + result = self.encode_decode(df) + tm.assert_frame_equal(result, df) + class TestSeries(TestPackers): From ebd124b3420de1c9bfc630e8c81e68aadd95cc94 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 25 Feb 2017 22:38:56 +0100 Subject: [PATCH 191/338] DOC: fix doc build warnings (#15505) --- doc/source/advanced.rst | 1 + doc/source/basics.rst | 2 +- doc/source/contributing.rst | 6 +++--- doc/source/install.rst | 2 +- doc/source/io.rst | 8 ++++---- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/generic.py | 1 + pandas/io/html.py | 2 +- pandas/io/json/normalize.py | 9 ++++----- pandas/io/parsers.py | 10 +++++----- 10 files changed, 22 insertions(+), 21 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index b6f015c15606d..f380070ddac79 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -965,6 +965,7 @@ The different indexing operation can potentially change the dtype of a ``Series` res .. ipython:: python + series2 = pd.Series([True]) series2.dtype res = series2.reindex_like(series1) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index f5f7c73223595..f649b3fd8a9a3 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1889,7 +1889,7 @@ gotchas Performing selection operations on ``integer`` type data can easily upcast the data to ``floating``. The dtype of the input data will be preserved in cases where ``nans`` are not introduced (starting in 0.11.0) -See also :ref:`Support for integer ``NA`` ` +See also :ref:`Support for integer NA ` .. ipython:: python diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 5c2bb9b73d618..2f838a3ab2386 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -461,7 +461,7 @@ C (cpplint) *pandas* uses the `Google `_ standard. Google provides an open source style checker called ``cpplint``, but we -use a fork of it that can be found `here `_. +use a fork of it that can be found `here `__. Here are *some* of the more common ``cpplint`` issues: - we restrict line-length to 80 characters to promote readability @@ -479,7 +479,7 @@ You can also run this command on an entire directory if necessary:: To make your commits compliant with this standard, you can install the `ClangFormat `_ tool, which can be -downloaded `here `_. 
To configure, in your home directory, +downloaded `here `__. To configure, in your home directory, run the following command:: clang-format style=google -dump-config > .clang-format @@ -611,7 +611,7 @@ Or with one of the following constructs:: pytest pandas/tests/[test-module].py::[TestClass] pytest pandas/tests/[test-module].py::[TestClass]::[test_method] -For more, see the `pytest`_ documentation. +For more, see the `pytest `_ documentation. .. versionadded:: 0.20.0 diff --git a/doc/source/install.rst b/doc/source/install.rst index 80a5d7e7d375b..8b0fec6a3dac3 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -282,7 +282,7 @@ Optional Dependencies okay.) * `BeautifulSoup4`_ and `lxml`_ * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ - * Only `lxml`_, although see :ref:`HTML Table Parsing ` + * Only `lxml`_, although see :ref:`HTML Table Parsing ` for reasons as to why you should probably **not** take this approach. .. warning:: diff --git a/doc/source/io.rst b/doc/source/io.rst index 55ef2c09d43e4..35e8b77782183 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2043,7 +2043,7 @@ Reading HTML Content .. warning:: - We **highly encourage** you to read the :ref:`HTML Table Parsing gotchas` + We **highly encourage** you to read the :ref:`HTML Table Parsing gotchas ` below regarding the issues surrounding the BeautifulSoup4/html5lib/lxml parsers. .. versionadded:: 0.12.0 @@ -4681,7 +4681,7 @@ The key functions are: Supported Data Types -++++++++++++++++++++ +'''''''''''''''''''' Pandas supports all these `BigQuery data types `__: ``STRING``, ``INTEGER`` (64bit), ``FLOAT`` (64 bit), ``BOOLEAN`` and @@ -4689,7 +4689,7 @@ Pandas supports all these `BigQuery data types `. + HTML parsing libraries `. Expect to do some cleanup after you call this function. For example, you might need to manually assign column names if the column names are diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index d684441c5974d..f29472155da17 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -106,11 +106,10 @@ def json_normalize(data, record_path=None, meta=None, path to records is ['foo', 'bar'] meta_prefix : string, default None errors : {'raise', 'ignore'}, default 'raise' - - * ignore : will ignore KeyError if keys listed in meta are not - always present - * raise : will raise KeyError if keys listed in meta are not - always present + * 'ignore' : will ignore KeyError if keys listed in meta are not + always present + * 'raise' : will raise KeyError if keys listed in meta are not + always present .. versionadded:: 0.20.0 diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 88d0c6c12c04f..78c5247818970 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -181,7 +181,7 @@ If True and parse_dates is enabled, pandas will attempt to infer the format of the datetime strings in the columns, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the - parsing speed by ~5-10x. + parsing speed by 5-10x. keep_date_col : boolean, default False If True and parse_dates specifies combining multiple columns then keep the original columns. @@ -200,10 +200,10 @@ Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. chunksize : int, default None - Return TextFileReader object for iteration. `See IO Tools docs for more - information - `_ on - ``iterator`` and ``chunksize``. + Return TextFileReader object for iteration. 
+ See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', From fdac7c3a4cbe3579c3527f628c0718754f3da866 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 25 Feb 2017 17:26:06 -0500 Subject: [PATCH 192/338] DOC: Fix versionadded for cond in .where (#15509) [ci skip] --- pandas/core/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 85c7130ca2827..cdc37e00f70e0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4910,6 +4910,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, not change input %(klass)s (though pandas doesn't check it). .. versionadded:: 0.18.1 + A callable can be used as cond. other : scalar, %(klass)s, or callable If other is callable, it is computed on the %(klass)s and @@ -4917,6 +4918,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, change input %(klass)s (though pandas doesn't check it). .. versionadded:: 0.18.1 + A callable can be used as other. inplace : boolean, default False Whether to perform the operation in place on the data From 4adfeb8b5313303be1b5a185f3894701ed11ca9a Mon Sep 17 00:00:00 2001 From: Stephen Rauch Date: Mon, 27 Feb 2017 09:39:27 -0500 Subject: [PATCH 193/338] BUG: Parse two date columns broken in read_csv with multiple headers In `io/parsers/_try_convert_dates()` when selecting columns based on a column index from a set of columns with multi- level names, the column `name` was converted to a string. This appears to be a bug since the `name` was a tuple before the conversion. 
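To make this concrete, a minimal sketch of the triggering call against the pandas of this series (mirroring the new test added below; the input data are illustrative):

    import pandas as pd
    import pandas.io.date_converters as conv
    from pandas.compat import StringIO

    # Two header rows yield tuple column names such as ('D', 'date').
    data = ("D,T,A,B\n"
            "date,time,a,b\n"
            "2001-01-05, 09:00:00, 0.0, 10.\n")

    # Before this fix, combining columns 0 and 1 into a single
    # 'date_time' column failed, because the tuple name was
    # stringified before the column lookup.
    df = pd.read_csv(StringIO(data), header=[0, 1],
                     parse_dates={'date_time': [0, 1]},
                     date_parser=conv.parse_date_time)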
This causes problems downstream when there is an attempt to use this name to lookup a column, and that lookup fails because the desired column is keyed from the tuple, not its string representation closes #15376 Author: Stephen Rauch Closes #15378 from stephenrauch/fix_read_csv_merge_datetime and squashes the following commits: 030f5ec [Stephen Rauch] BUG: Parse two date columns broken in read_csv with multiple headers --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 2 +- pandas/tests/io/parser/parse_dates.py | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 123fc346441cb..be487e165c602 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -625,6 +625,7 @@ Bug Fixes +- Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 78c5247818970..811844ec35deb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2858,7 +2858,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): if c in colset: colnames.append(c) elif isinstance(c, int) and c not in columns: - colnames.append(str(columns[c])) + colnames.append(columns[c]) else: colnames.append(c) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index 6197d07d4eafa..b1960159bb41d 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -18,6 +18,7 @@ import pandas.tseries.tools as tools import pandas.util.testing as tm +import pandas.io.date_converters as conv from pandas import DataFrame, Series, Index, DatetimeIndex from pandas import compat from pandas.compat import parse_date, StringIO, lrange @@ -491,3 +492,21 @@ def test_parse_dates_noconvert_thousands(self): result = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True, thousands='.') tm.assert_frame_equal(result, expected) + + def test_parse_date_time_multi_level_column_name(self): + data = """\ +D,T,A,B +date, time,a,b +2001-01-05, 09:00:00, 0.0, 10. +2001-01-06, 00:00:00, 1.0, 11. 
+""" + datecols = {'date_time': [0, 1]} + result = self.read_csv(StringIO(data), sep=',', header=[0, 1], + parse_dates=datecols, + date_parser=conv.parse_date_time) + + expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.], + [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]] + expected = DataFrame(expected_data, + columns=['date_time', ('A', 'a'), ('B', 'b')]) + tm.assert_frame_equal(result, expected) From 8d90d6c5d9f1fb47976b5bf477a67f95dd739476 Mon Sep 17 00:00:00 2001 From: Stephen Rauch Date: Mon, 27 Feb 2017 10:41:56 -0500 Subject: [PATCH 194/338] BUG: GH15426 timezone lost in groupby-agg with cython functions closes #15426 Author: Stephen Rauch Closes #15433 from stephenrauch/tz-lost-in-groupby-agg and squashes the following commits: 64a84ca [Stephen Rauch] BUG: GH15426 timezone lost in groupby-agg with cython functions --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/groupby/test_aggregate.py | 31 +++++++++++++++++++++++++- pandas/tests/types/test_cast.py | 12 +++++++++- pandas/types/cast.py | 3 ++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index be487e165c602..f337d4404abfc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -622,6 +622,7 @@ Bug Fixes - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) +- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index a1fc97eb8d780..cb739546a2312 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -6,7 +6,7 @@ """ from __future__ import print_function -from datetime import datetime +from datetime import datetime, timedelta from functools import partial import numpy as np @@ -738,3 +738,32 @@ def test_agg_over_numpy_arrays(self): columns=expected_column) assert_frame_equal(result, expected) + + def test_agg_timezone_round_trip(self): + # GH 15426 + ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') + df = pd.DataFrame({'a': 1, 'b': [ts + timedelta(minutes=nn) + for nn in range(10)]}) + + result1 = df.groupby('a')['b'].agg(np.min).iloc[0] + result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] + result3 = df.groupby('a')['b'].min().iloc[0] + + assert result1 == ts + assert result2 == ts + assert result3 == ts + + dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific') + for i in range(1, 5)] + df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates}) + grouped = df.groupby('A') + + ts = df['B'].iloc[0] + assert ts == grouped.nth(0)['B'].iloc[0] + assert ts == grouped.head(1)['B'].iloc[0] + assert ts == grouped.first()['B'].iloc[0] + assert ts == grouped.apply(lambda x: x.iloc[0])[0] + + ts = df['B'].iloc[2] + assert ts == grouped.last()['B'].iloc[0] + assert ts == grouped.apply(lambda x: x.iloc[-1])[0] diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 497130b117289..70f69cc7d5701 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -8,7 +8,7 @@ from datetime import datetime import numpy as np -from pandas import Timedelta, Timestamp +from pandas import Timedelta, Timestamp, DatetimeIndex from pandas.types.cast 
import (_possibly_downcast_to_dtype,
 _possibly_convert_objects,
 _infer_dtype_from_scalar,
@@ -71,6 +71,16 @@ def test_datetimelikes_nan(self):
 res = _possibly_downcast_to_dtype(arr, 'timedelta64[ns]')
 tm.assert_numpy_array_equal(res, exp)

+ def test_datetime_with_timezone(self):
+ # GH 15426
+ ts = Timestamp("2016-01-01 12:00:00", tz='US/Pacific')
+ exp = DatetimeIndex([ts, ts])
+ res = _possibly_downcast_to_dtype(exp, exp.dtype)
+ tm.assert_index_equal(res, exp)
+
+ res = _possibly_downcast_to_dtype(exp.asi8, exp.dtype)
+ tm.assert_index_equal(res, exp)
+

 class TestInferDtype(tm.TestCase):

diff --git a/pandas/types/cast.py b/pandas/types/cast.py
index b1a17df64aecf..8cc3fe41f73c8 100644
--- a/pandas/types/cast.py
+++ b/pandas/types/cast.py
@@ -133,7 +133,8 @@ def trans(x): # noqa
 if dtype.tz:
 # convert to datetime and change timezone
 from pandas import to_datetime
- result = to_datetime(result).tz_localize(dtype.tz)
+ result = to_datetime(result).tz_localize('utc')
+ result = result.tz_convert(dtype.tz)
 except:
 pass

From cd640276b0508397b850a778f926edd824de12f0 Mon Sep 17 00:00:00 2001
From: Alexis Mignon
Date: Thu, 16 Jun 2016 15:11:46 +0200
Subject: [PATCH 195/338] BUG: Fix a bug occurring when using
 DataFrame.to_records with unicode column names in python 2.

closes #11879
closes #13462
---
 doc/source/whatsnew/v0.20.0.txt | 3 ++-
 pandas/core/frame.py | 12 ++++++++----
 pandas/tests/frame/test_convert_to.py | 15 +++++++++++++++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index f337d4404abfc..947a114f1ce95 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -615,7 +615,8 @@ Bug Fixes

 - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`)
 - Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`)
-- Bug in ``pd.read_msgpack`` when deserializing a ``CategoricalIndex`` (:issue:`15487`)
+- Bug in ``pd.read_msgpack()`` when deserializing a ``CategoricalIndex`` (:issue:`15487`)
+- Bug in ``pd.DataFrame.to_records()`` which failed with unicode characters in column names (:issue:`11879`)

 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ce3481fc17c5b..adf397e63984f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1105,13 +1105,17 @@ def to_records(self, index=True, convert_datetime64=True):
 count += 1
 elif index_names[0] is None:
 index_names = ['index']
- names = lmap(str, index_names) + lmap(str, self.columns)
+ names = (lmap(compat.text_type, index_names) +
+ lmap(compat.text_type, self.columns))
 else:
 arrays = [self[c].get_values() for c in self.columns]
- names = lmap(str, self.columns)
+ names = lmap(compat.text_type, self.columns)

- dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
- return np.rec.fromarrays(arrays, dtype=dtype, names=names)
+ formats = [v.dtype for v in arrays]
+ return np.rec.fromarrays(
+ arrays,
+ dtype={'names': names, 'formats': formats}
+ )

 @classmethod
 def from_items(cls, items, columns=None, orient='columns'):
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
index 1bc8313726d0c..0dde113dd5147 100644
--- a/pandas/tests/frame/test_convert_to.py
+++ b/pandas/tests/frame/test_convert_to.py
@@ -177,3 +177,18 @@ def test_to_records_with_unicode_index(self):
 .to_records()
 expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
 tm.assert_almost_equal(result, expected)
+
+ def test_to_records_with_unicode_column_names(self):
+ # xref issue: https://github.com/numpy/numpy/issues/2407
+ # Issue #11879. to_records used to raise an exception when used
+ # with column names containing non-ascii characters in Python 2
+ result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
+
+ # Note that numpy allows for unicode field names but dtypes need
+ # to be specified using a dictionary instead of a list of tuples.
+ expected = np.rec.array(
+ [(0, 1.0)],
+ dtype={"names": ["index", u"accented_name_é"],
+ "formats": ['<i8', '<f8']}
+ )
+ tm.assert_almost_equal(result, expected)

From: jojomdt
Date: Mon, 27 Feb 2017 14:26:07 -0500
Subject: [PATCH 196/338] BUG: reindex_like after shape comparison in
 assert_frame_equal

With check_like, the former code called reindex_like before the shape
comparison. For example: if left.shape=(2,2) and right.shape=(2,0), then
after reindex_like, left.shape=(2,0) and right.shape=(2,0), so the shape
comparison will not detect that the two DataFrames are different. As a
result, assert_frame_equal will not raise an assertion error, but in fact
it should raise.

Author: jojomdt

Closes #15496 from jojomdt/master and squashes the following commits:

7b3437b [jojomdt] fix test_frame_equal_message error
0340b5c [jojomdt] change check_like description
c03e0af [jojomdt] add test for TestAssertFrameEqual
470dbaa [jojomdt] combine row and column shape comparison
ce7bd74 [jojomdt] reindex_like after shape comparison
---
 pandas/tests/test_testing.py | 32 +++++++++++++++++---------------
 pandas/util/testing.py | 25 ++++++++----------------
 2 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index 07bfdc8fc9078..2fb58ef70e3cb 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -13,8 +13,6 @@ RNGContext)
 from pandas.compat import is_platform_windows

-# let's get meta.
- class TestAssertAlmostEqual(tm.TestCase): @@ -594,6 +592,20 @@ def _assert_not_equal(self, a, b, **kwargs): self.assertRaises(AssertionError, assert_frame_equal, a, b, **kwargs) self.assertRaises(AssertionError, assert_frame_equal, b, a, **kwargs) + def test_equal_with_different_row_order(self): + # check_like=True ignores row-column orderings + df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + index=['a', 'b', 'c']) + df2 = pd.DataFrame({'A': [3, 2, 1], 'B': [6, 5, 4]}, + index=['c', 'b', 'a']) + + self._assert_equal(df1, df2, check_like=True) + self._assert_not_equal(df1, df2) + + def test_not_equal_with_different_shape(self): + self._assert_not_equal(pd.DataFrame({'A': [1, 2, 3]}), + pd.DataFrame({'A': [1, 2, 3, 4]})) + def test_index_dtype(self): df1 = DataFrame.from_records( {'a': [1, 2], 'c': ['l1', 'l2']}, index=['a']) @@ -621,19 +633,9 @@ def test_frame_equal_message(self): expected = """DataFrame are different -DataFrame shape \\(number of rows\\) are different -\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\) -\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)""" - - with assertRaisesRegexp(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A': [1, 2, 3]}), - pd.DataFrame({'A': [1, 2, 3, 4]})) - - expected = """DataFrame are different - -DataFrame shape \\(number of columns\\) are different -\\[left\\]: 2, Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) -\\[right\\]: 1, Index\\(\\[u?'A'\\], dtype='object'\\)""" +DataFrame shape mismatch +\\[left\\]: \\(3, 2\\) +\\[right\\]: \\(3, 1\\)""" with assertRaisesRegexp(AssertionError, expected): assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 1bd539469dbe3..e4b10488c69b2 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1254,7 +1254,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_categorical : bool, default True Whether to compare internal Categorical exactly. check_like : bool, default False - If true, then reindex_like operands + If true, ignore the order of rows & columns obj : str, default 'DataFrame' Specify object name being compared, internally used to show appropriate assertion message @@ -1270,25 +1270,16 @@ def assert_frame_equal(left, right, check_dtype=True, assertIsInstance(left, type(right)) # assert_class_equal(left, right, obj=obj) + # shape comparison + if left.shape != right.shape: + raise_assert_detail(obj, + 'DataFrame shape mismatch', + '({0}, {1})'.format(*left.shape), + '({0}, {1})'.format(*right.shape)) + if check_like: left, right = left.reindex_like(right), right - # shape comparison (row) - if left.shape[0] != right.shape[0]: - raise_assert_detail(obj, - 'DataFrame shape (number of rows) are different', - '{0}, {1}'.format(left.shape[0], left.index), - '{0}, {1}'.format(right.shape[0], right.index)) - # shape comparison (columns) - if left.shape[1] != right.shape[1]: - raise_assert_detail(obj, - 'DataFrame shape (number of columns) ' - 'are different', - '{0}, {1}'.format(left.shape[1], - left.columns), - '{0}, {1}'.format(right.shape[1], - right.columns)) - # index comparison assert_index_equal(left.index, right.index, exact=check_index_type, check_names=check_names, From a38a9577b079be5b15ef144f63f5f26c6b5ad7dd Mon Sep 17 00:00:00 2001 From: Aleksey Bilogur Date: Mon, 27 Feb 2017 15:26:00 -0500 Subject: [PATCH 197/338] TST: DataFrame.hist() does not get along with matplotlib.pyplot.tight_layout() (#15515) * Add unit test for #9351 * Tweaks. 
* add _check_plot_works; rm aux method * Add whatsnew entry. --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/plotting/test_hist_method.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 947a114f1ce95..f13b584a4ee13 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -629,7 +629,7 @@ Bug Fixes - Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) - +- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.0``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 4f64f66bd3c4d..22de7055e3cea 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -238,6 +238,16 @@ def test_hist_layout(self): with tm.assertRaises(ValueError): df.hist(layout=(-1, -1)) + @slow + # GH 9351 + def test_tight_layout(self): + if self.mpl_ge_2_0_0: + df = DataFrame(randn(100, 2)) + _check_plot_works(df.hist) + self.plt.tight_layout() + + tm.close() + @tm.mplskip class TestDataFrameGroupByPlots(TestPlotBase): From ff81cc984c2cf673a6e8897193f984541bca863d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Feb 2017 15:47:10 -0500 Subject: [PATCH 198/338] CLN: remove pandas/io/gbq.py and tests and replace with pandas-gbq closes #15347 Author: Jeff Reback Closes #15484 from jreback/gbq and squashes the following commits: 0fd8d06 [Jeff Reback] wip 3222de1 [Jeff Reback] CLN: remove pandas/io/gbq.py and tests and replace with pandas-gbq --- ci/requirements-2.7.pip | 5 +- ci/requirements-3.4.pip | 3 - ci/requirements-3.4_SLOW.pip | 3 - ci/requirements-3.5.pip | 1 + doc/source/io.rst | 289 +------ doc/source/whatsnew/v0.20.0.txt | 9 + pandas/core/frame.py | 8 +- pandas/io/gbq.py | 1192 +---------------------------- pandas/tests/io/test_gbq.py | 1242 +------------------------------ pandas/util/decorators.py | 38 +- pandas/util/print_versions.py | 3 +- 11 files changed, 116 insertions(+), 2677 deletions(-) delete mode 100644 ci/requirements-3.4_SLOW.pip create mode 100644 ci/requirements-3.6-64.run diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index d16b932c8be4f..08240184f2934 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -1,8 +1,5 @@ blosc -httplib2 -google-api-python-client==1.2 -python-gflags==2.0 -oauth2client==1.5.0 +pandas-gbq pathlib backports.lzma py diff --git a/ci/requirements-3.4.pip b/ci/requirements-3.4.pip index 55986a0220bf0..4e5fe52d56cf1 100644 --- a/ci/requirements-3.4.pip +++ b/ci/requirements-3.4.pip @@ -1,5 +1,2 @@ python-dateutil==2.2 blosc -httplib2 -google-api-python-client -oauth2client diff --git a/ci/requirements-3.4_SLOW.pip b/ci/requirements-3.4_SLOW.pip deleted file mode 100644 index 05c938abcbab6..0000000000000 --- a/ci/requirements-3.4_SLOW.pip +++ /dev/null @@ -1,3 +0,0 @@ -httplib2 -google-api-python-client -oauth2client diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 0d9e44cf39fa4..6e4f7b65f9728 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1 +1,2 @@ xarray==0.9.1 
+pandas-gbq diff --git a/doc/source/io.rst b/doc/source/io.rst index 35e8b77782183..b36ae8c2ed450 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4652,293 +4652,18 @@ And then issue the following queries: Google BigQuery --------------- -.. versionadded:: 0.13.0 - -The :mod:`pandas.io.gbq` module provides a wrapper for Google's BigQuery -analytics web service to simplify retrieving results from BigQuery tables -using SQL-like queries. Result sets are parsed into a pandas -DataFrame with a shape and data types derived from the source table. -Additionally, DataFrames can be inserted into new BigQuery tables or appended -to existing tables. - -.. warning:: - - To use this module, you will need a valid BigQuery account. Refer to the - `BigQuery Documentation `__ - for details on the service itself. - -The key functions are: - -.. currentmodule:: pandas.io.gbq - -.. autosummary:: - :toctree: generated/ - - read_gbq - to_gbq - -.. currentmodule:: pandas - - -Supported Data Types -'''''''''''''''''''' - -Pandas supports all these `BigQuery data types `__: -``STRING``, ``INTEGER`` (64bit), ``FLOAT`` (64 bit), ``BOOLEAN`` and -``TIMESTAMP`` (microsecond precision). Data types ``BYTES`` and ``RECORD`` -are not supported. - -Integer and boolean ``NA`` handling -''''''''''''''''''''''''''''''''''' - -.. versionadded:: 0.20 - -Since all columns in BigQuery queries are nullable, and NumPy lacks of ``NA`` -support for integer and boolean types, this module will store ``INTEGER`` or -``BOOLEAN`` columns with at least one ``NULL`` value as ``dtype=object``. -Otherwise those columns will be stored as ``dtype=int64`` or ``dtype=bool`` -respectively. - -This is opposite to default pandas behaviour which will promote integer -type to float in order to store NAs. See the :ref:`gotchas` -for detailed explaination. - -While this trade-off works well for most cases, it breaks down for storing -values greater than 2**53. Such values in BigQuery can represent identifiers -and unnoticed precision lost for identifier is what we want to avoid. - -.. _io.bigquery_deps: - -Dependencies -'''''''''''' - -This module requires following additional dependencies: - -- `httplib2 `__: HTTP client -- `google-api-python-client `__: Google's API client -- `oauth2client `__: authentication and authorization for Google's API - -.. _io.bigquery_authentication: - -Authentication -'''''''''''''' - -.. versionadded:: 0.18.0 - -Authentication to the Google ``BigQuery`` service is via ``OAuth 2.0``. -Is possible to authenticate with either user account credentials or service account credentials. - -Authenticating with user account credentials is as simple as following the prompts in a browser window -which will be automatically opened for you. You will be authenticated to the specified -``BigQuery`` account using the product name ``pandas GBQ``. It is only possible on local host. -The remote authentication using user account credentials is not currently supported in pandas. -Additional information on the authentication mechanism can be found -`here `__. - -Authentication with service account credentials is possible via the `'private_key'` parameter. This method -is particularly useful when working on remote servers (eg. jupyter iPython notebook on remote host). -Additional information on service accounts can be found -`here `__. - -Authentication via ``application default credentials`` is also possible. This is only valid -if the parameter ``private_key`` is not provided. 
This method also requires that -the credentials can be fetched from the environment the code is running in. -Otherwise, the OAuth2 client-side authentication is used. -Additional information on -`application default credentials `__. - -.. versionadded:: 0.19.0 - -.. note:: - - The `'private_key'` parameter can be set to either the file path of the service account key - in JSON format, or key contents of the service account key in JSON format. - -.. note:: - - A private key can be obtained from the Google developers console by clicking - `here `__. Use JSON key type. - -.. _io.bigquery_reader: - -Querying -'''''''' - -Suppose you want to load all data from an existing BigQuery table : `test_dataset.test_table` -into a DataFrame using the :func:`~pandas.io.gbq.read_gbq` function. - -.. code-block:: python - - # Insert your BigQuery Project ID Here - # Can be found in the Google web console - projectid = "xxxxxxxx" - - data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table', projectid) - - -You can define which column from BigQuery to use as an index in the -destination DataFrame as well as a preferred column order as follows: - -.. code-block:: python - - data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table', - index_col='index_column_name', - col_order=['col1', 'col2', 'col3'], projectid) - - -Starting with 0.20.0, you can specify the query config as parameter to use additional options of your job. -For more information about query configuration parameters see -`here `__. - -.. code-block:: python - - configuration = { - 'query': { - "useQueryCache": False - } - } - data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table', - configuration=configuration, projectid) - - -.. note:: - - You can find your project id in the `Google developers console `__. - - -.. note:: - - You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``. - -.. note:: - - The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL - or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``. For more information - on BigQuery's standard SQL, see `BigQuery SQL Reference - `__ - -.. _io.bigquery_writer: - -Writing DataFrames -'''''''''''''''''' - -Assume we want to write a DataFrame ``df`` into a BigQuery table using :func:`~pandas.DataFrame.to_gbq`. - -.. ipython:: python - - df = pd.DataFrame({'my_string': list('abc'), - 'my_int64': list(range(1, 4)), - 'my_float64': np.arange(4.0, 7.0), - 'my_bool1': [True, False, True], - 'my_bool2': [False, True, False], - 'my_dates': pd.date_range('now', periods=3)}) - - df - df.dtypes - -.. code-block:: python - - df.to_gbq('my_dataset.my_table', projectid) - -.. note:: - - The destination table and destination dataset will automatically be created if they do not already exist. - -The ``if_exists`` argument can be used to dictate whether to ``'fail'``, ``'replace'`` -or ``'append'`` if the destination table already exists. The default value is ``'fail'``. - -For example, assume that ``if_exists`` is set to ``'fail'``. The following snippet will raise -a ``TableCreationError`` if the destination table already exists. - -.. code-block:: python - - df.to_gbq('my_dataset.my_table', projectid, if_exists='fail') - -.. note:: - - If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will - be written to the table using the defined table schema and column types. The - dataframe must match the destination table in structure and data types. 
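(Editor's aside, not part of the original docs: the calls documented above keep
the same signatures after the hand-off to ``pandas-gbq`` below; a minimal usage
sketch, where ``'my-project'`` and ``my_dataset.my_table`` are placeholder names,
not real resources:)

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'a': [1, 2, 3]})
   # write, appending to the destination table if it already exists
   df.to_gbq('my_dataset.my_table', 'my-project', if_exists='append')
   # read the rows back into a DataFrame
   result = pd.read_gbq('SELECT * FROM my_dataset.my_table', 'my-project')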
- If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a - different schema, a delay of 2 minutes will be forced to ensure that the new schema - has propagated in the Google environment. See - `Google BigQuery issue 191 `__. - -Writing large DataFrames can result in errors due to size limitations being exceeded. -This can be avoided by setting the ``chunksize`` argument when calling :func:`~pandas.DataFrame.to_gbq`. -For example, the following writes ``df`` to a BigQuery table in batches of 10000 rows at a time: - -.. code-block:: python - - df.to_gbq('my_dataset.my_table', projectid, chunksize=10000) - -You can also see the progress of your post via the ``verbose`` flag which defaults to ``True``. -For example: - -.. code-block:: python - - In [8]: df.to_gbq('my_dataset.my_table', projectid, chunksize=10000, verbose=True) - - Streaming Insert is 10% Complete - Streaming Insert is 20% Complete - Streaming Insert is 30% Complete - Streaming Insert is 40% Complete - Streaming Insert is 50% Complete - Streaming Insert is 60% Complete - Streaming Insert is 70% Complete - Streaming Insert is 80% Complete - Streaming Insert is 90% Complete - Streaming Insert is 100% Complete - -.. note:: - - If an error occurs while streaming data to BigQuery, see - `Troubleshooting BigQuery Errors `__. - -.. note:: - - The BigQuery SQL query language has some oddities, see the - `BigQuery Query Reference Documentation `__. - -.. note:: - - While BigQuery uses SQL-like syntax, it has some important differences from traditional - databases both in functionality, API limitations (size and quantity of queries or uploads), - and how Google charges for use of the service. You should refer to `Google BigQuery documentation `__ - often as the service seems to be changing and evolving. BiqQuery is best for analyzing large - sets of data quickly, but it is not a direct replacement for a transactional database. - -.. _io.bigquery_create_tables: - -Creating BigQuery Tables -'''''''''''''''''''''''' - .. warning:: - As of 0.17, the function :func:`~pandas.io.gbq.generate_bq_schema` has been deprecated and will be - removed in a future version. - -As of 0.15.2, the gbq module has a function :func:`~pandas.io.gbq.generate_bq_schema` which will -produce the dictionary representation schema of the specified pandas DataFrame. - -.. code-block:: ipython - - In [10]: gbq.generate_bq_schema(df, default_type='STRING') + Starting in 0.20.0, pandas has split off Google BigQuery support into the + separate package ``pandas-gbq``. You can ``pip install pandas-gbq`` to get it. - Out[10]: {'fields': [{'name': 'my_bool1', 'type': 'BOOLEAN'}, - {'name': 'my_bool2', 'type': 'BOOLEAN'}, - {'name': 'my_dates', 'type': 'TIMESTAMP'}, - {'name': 'my_float64', 'type': 'FLOAT'}, - {'name': 'my_int64', 'type': 'INTEGER'}, - {'name': 'my_string', 'type': 'STRING'}]} - -.. note:: +The ``pandas-gbq`` package provides functionality to read/write from Google BigQuery. - If you delete and re-create a BigQuery table with the same name, but different table schema, - you must wait 2 minutes before streaming data into the table. As a workaround, consider creating - the new table with a different name. Refer to - `Google BigQuery issue 191 `__. +pandas integrates with this external package. If ``pandas-gbq`` is installed, you can +use the pandas methods ``pd.read_gbq`` and ``DataFrame.to_gbq``, which will call the +respective functions from ``pandas-gbq``. +Full documentation can be found `here `__ .. 
_io.stata: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f13b584a4ee13..f0e4176472861 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -360,6 +360,15 @@ New Behavior: In [5]: df['a']['2011-12-31 23:59:59'] Out[5]: 1 +.. _whatsnew_0200.api_breaking.gbq: + +Pandas Google BigQuery support has moved +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``pip install pandas-gbq`` to get it. +The functionality of ``pd.read_gbq()`` and ``.to_gbq()`` remains the same with the currently released version of ``pandas-gbq=0.1.2``. (:issue:`15347`) +Documentation is now hosted `here `__ + .. _whatsnew_0200.api_breaking.memory_usage: Memory Usage for Index is more Accurate diff --git a/pandas/core/frame.py b/pandas/core/frame.py index adf397e63984f..7b02926ea8837 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,7 +77,8 @@ OrderedDict, raise_with_traceback) from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import deprecate_kwarg, Appender, Substitution +from pandas.util.decorators import (deprecate_kwarg, Appender, + Substitution, docstring_wrapper) from pandas.util.validators import validate_bool_kwarg from pandas.tseries.period import PeriodIndex @@ -941,6 +942,11 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, chunksize=chunksize, verbose=verbose, reauth=reauth, if_exists=if_exists, private_key=private_key) + def _f(): + from pandas.io.gbq import _try_import + return _try_import().to_gbq.__doc__ + to_gbq = docstring_wrapper(to_gbq, _f) + @classmethod def from_records(cls, data, index=None, exclude=None, columns=None, coerce_float=False, nrows=None): diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index a5558866937cf..3407f51af5e83 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1,1180 +1,52 @@ -import warnings -from datetime import datetime -import json -import logging -from time import sleep -import uuid -import time -import sys +""" Google BigQuery support """ -import numpy as np +from pandas.util.decorators import docstring_wrapper -from distutils.version import StrictVersion -from pandas import compat, DataFrame, concat -from pandas.core.common import PandasError -from pandas.compat import lzip, bytes_to_str - - -def _check_google_client_version(): +def _try_import(): + # since pandas is a dependency of pandas-gbq + # we need to import on first use try: - import pkg_resources - + import pandas_gbq except ImportError: - raise ImportError('Could not import pkg_resources (setuptools).') - - if compat.PY3: - google_api_minimum_version = '1.4.1' - else: - google_api_minimum_version = '1.2.0' - - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution( - 'google-api-python-client').version - - if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) < - StrictVersion(google_api_minimum_version)): - raise ImportError("pandas requires google-api-python-client >= {0} " - "for Google BigQuery support, " - "current version {1}" - .format(google_api_minimum_version, - _GOOGLE_API_CLIENT_VERSION)) - - -def _test_google_api_imports(): - - try: - import httplib2 # noqa - try: - from googleapiclient.discovery import build # noqa - from googleapiclient.errors import HttpError # noqa - except: - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa - from oauth2client.client import AccessTokenRefreshError # noqa - from 
oauth2client.client import OAuth2WebServerFlow # noqa - from oauth2client.file import Storage # noqa - from oauth2client.tools import run_flow, argparser # noqa - except ImportError as e: - raise ImportError("Missing module required for Google BigQuery " - "support: {0}".format(str(e))) - - -logger = logging.getLogger('pandas.io.gbq') -logger.setLevel(logging.ERROR) - - -class InvalidPrivateKeyFormat(PandasError, ValueError): - """ - Raised when provided private key has invalid format. - """ - pass - - -class AccessDenied(PandasError, ValueError): - """ - Raised when invalid credentials are provided, or tokens have expired. - """ - pass - - -class DatasetCreationError(PandasError, ValueError): - """ - Raised when the create dataset method fails - """ - pass - - -class GenericGBQException(PandasError, ValueError): - """ - Raised when an unrecognized Google API Error occurs. - """ - pass - - -class InvalidColumnOrder(PandasError, ValueError): - """ - Raised when the provided column order for output - results DataFrame does not match the schema - returned by BigQuery. - """ - pass - - -class InvalidPageToken(PandasError, ValueError): - """ - Raised when Google BigQuery fails to return, - or returns a duplicate page token. - """ - pass - - -class InvalidSchema(PandasError, ValueError): - """ - Raised when the provided DataFrame does - not match the schema of the destination - table in BigQuery. - """ - pass - - -class NotFoundException(PandasError, ValueError): - """ - Raised when the project_id, table or dataset provided in the query could - not be found. - """ - pass - - -class StreamingInsertError(PandasError, ValueError): - """ - Raised when BigQuery reports a streaming insert error. - For more information see `Streaming Data Into BigQuery - `__ - """ - - -class TableCreationError(PandasError, ValueError): - """ - Raised when the create table method fails - """ - pass - - -class GbqConnector(object): - scope = 'https://www.googleapis.com/auth/bigquery' - - def __init__(self, project_id, reauth=False, verbose=False, - private_key=None, dialect='legacy'): - _check_google_client_version() - _test_google_api_imports() - self.project_id = project_id - self.reauth = reauth - self.verbose = verbose - self.private_key = private_key - self.dialect = dialect - self.credentials = self.get_credentials() - self.service = self.get_service() - - def get_credentials(self): - if self.private_key: - return self.get_service_account_credentials() - else: - # Try to retrieve Application Default Credentials - credentials = self.get_application_default_credentials() - if not credentials: - credentials = self.get_user_account_credentials() - return credentials - - def get_application_default_credentials(self): - """ - This method tries to retrieve the "default application credentials". - This could be useful for running code on Google Cloud Platform. - - .. versionadded:: 0.19.0 - - Parameters - ---------- - None - - Returns - ------- - - GoogleCredentials, - If the default application credentials can be retrieved - from the environment. The retrieved credentials should also - have access to the project (self.project_id) on BigQuery. - - OR None, - If default application credentials can not be retrieved - from the environment. Or, the retrieved credentials do not - have access to the project (self.project_id) on BigQuery. 
- """ - import httplib2 - try: - from googleapiclient.discovery import build - except ImportError: - from apiclient.discovery import build - try: - from oauth2client.client import GoogleCredentials - except ImportError: - return None - - try: - credentials = GoogleCredentials.get_application_default() - except: - return None - - http = httplib2.Http() - try: - http = credentials.authorize(http) - bigquery_service = build('bigquery', 'v2', http=http) - # Check if the application has rights to the BigQuery project - jobs = bigquery_service.jobs() - job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} - jobs.insert(projectId=self.project_id, body=job_data).execute() - return credentials - except: - return None - - def get_user_account_credentials(self): - from oauth2client.client import OAuth2WebServerFlow - from oauth2client.file import Storage - from oauth2client.tools import run_flow, argparser - - flow = OAuth2WebServerFlow( - client_id=('495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd' - '.apps.googleusercontent.com'), - client_secret='kOc9wMptUtxkcIFbtZCcrEAc', - scope=self.scope, - redirect_uri='urn:ietf:wg:oauth:2.0:oob') - - storage = Storage('bigquery_credentials.dat') - credentials = storage.get() - - if credentials is None or credentials.invalid or self.reauth: - credentials = run_flow(flow, storage, argparser.parse_args([])) - - return credentials - - def get_service_account_credentials(self): - # Bug fix for https://github.com/pandas-dev/pandas/issues/12572 - # We need to know that a supported version of oauth2client is installed - # Test that either of the following is installed: - # - SignedJwtAssertionCredentials from oauth2client.client - # - ServiceAccountCredentials from oauth2client.service_account - # SignedJwtAssertionCredentials is available in oauthclient < 2.0.0 - # ServiceAccountCredentials is available in oauthclient >= 2.0.0 - oauth2client_v1 = True - oauth2client_v2 = True - - try: - from oauth2client.client import SignedJwtAssertionCredentials - except ImportError: - oauth2client_v1 = False - - try: - from oauth2client.service_account import ServiceAccountCredentials - except ImportError: - oauth2client_v2 = False - - if not oauth2client_v1 and not oauth2client_v2: - raise ImportError("Missing oauth2client required for BigQuery " - "service account support") - - from os.path import isfile - - try: - if isfile(self.private_key): - with open(self.private_key) as f: - json_key = json.loads(f.read()) - else: - # ugly hack: 'private_key' field has new lines inside, - # they break json parser, but we need to preserve them - json_key = json.loads(self.private_key.replace('\n', ' ')) - json_key['private_key'] = json_key['private_key'].replace( - ' ', '\n') - - if compat.PY3: - json_key['private_key'] = bytes( - json_key['private_key'], 'UTF-8') - - if oauth2client_v1: - return SignedJwtAssertionCredentials( - json_key['client_email'], - json_key['private_key'], - self.scope, - ) - else: - return ServiceAccountCredentials.from_json_keyfile_dict( - json_key, - self.scope) - except (KeyError, ValueError, TypeError, AttributeError): - raise InvalidPrivateKeyFormat( - "Private key is missing or invalid. It should be service " - "account private key JSON (file path or string contents) " - "with at least two keys: 'client_email' and 'private_key'. " - "Can be obtained from: https://console.developers.google." 
- "com/permissions/serviceaccounts") - - def _print(self, msg, end='\n'): - if self.verbose: - sys.stdout.write(msg + end) - sys.stdout.flush() - - def _start_timer(self): - self.start = time.time() - - def get_elapsed_seconds(self): - return round(time.time() - self.start, 2) - - def print_elapsed_seconds(self, prefix='Elapsed', postfix='s.', - overlong=7): - sec = self.get_elapsed_seconds() - if sec > overlong: - self._print('{} {} {}'.format(prefix, sec, postfix)) - - # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size - @staticmethod - def sizeof_fmt(num, suffix='b'): - fmt = "%3.1f %s%s" - for unit in ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z']: - if abs(num) < 1024.0: - return fmt % (num, unit, suffix) - num /= 1024.0 - return fmt % (num, 'Y', suffix) - - def get_service(self): - import httplib2 - try: - from googleapiclient.discovery import build - except: - from apiclient.discovery import build - - http = httplib2.Http() - http = self.credentials.authorize(http) - bigquery_service = build('bigquery', 'v2', http=http) - - return bigquery_service - - @staticmethod - def process_http_error(ex): - # See `BigQuery Troubleshooting Errors - # `__ - - status = json.loads(bytes_to_str(ex.content))['error'] - errors = status.get('errors', None) - - if errors: - for error in errors: - reason = error['reason'] - message = error['message'] - - raise GenericGBQException( - "Reason: {0}, Message: {1}".format(reason, message)) - - raise GenericGBQException(errors) - - def process_insert_errors(self, insert_errors): - for insert_error in insert_errors: - row = insert_error['index'] - errors = insert_error.get('errors', None) - for error in errors: - reason = error['reason'] - message = error['message'] - location = error['location'] - error_message = ('Error at Row: {0}, Reason: {1}, ' - 'Location: {2}, Message: {3}' - .format(row, reason, location, message)) - - # Report all error messages if verbose is set - if self.verbose: - self._print(error_message) - else: - raise StreamingInsertError(error_message + - '\nEnable verbose logging to ' - 'see all errors') - - raise StreamingInsertError - - def run_query(self, query, **kwargs): - try: - from googleapiclient.errors import HttpError - except: - from apiclient.errors import HttpError - from oauth2client.client import AccessTokenRefreshError - - _check_google_client_version() - - job_collection = self.service.jobs() - - job_config = { - 'query': { - 'query': query, - 'useLegacySql': self.dialect == 'legacy' - # 'allowLargeResults', 'createDisposition', - # 'preserveNulls', destinationTable, useQueryCache - } - } - config = kwargs.get('configuration') - if config is not None: - if len(config) != 1: - raise ValueError("Only one job type must be specified, but " - "given {}".format(','.join(config.keys()))) - if 'query' in config: - if 'query' in config['query'] and query is not None: - raise ValueError("Query statement can't be specified " - "inside config while it is specified " - "as parameter") - - job_config['query'].update(config['query']) - else: - raise ValueError("Only 'query' job type is supported") - - job_data = { - 'configuration': job_config - } - - self._start_timer() - try: - self._print('Requesting query... 
', end="") - query_reply = job_collection.insert( - projectId=self.project_id, body=job_data).execute() - self._print('ok.\nQuery running...') - except (AccessTokenRefreshError, ValueError): - if self.private_key: - raise AccessDenied( - "The service account credentials are not valid") - else: - raise AccessDenied( - "The credentials have been revoked or expired, " - "please re-run the application to re-authorize") - except HttpError as ex: - self.process_http_error(ex) - - job_reference = query_reply['jobReference'] - - while not query_reply.get('jobComplete', False): - self.print_elapsed_seconds(' Elapsed', 's. Waiting...') - try: - query_reply = job_collection.getQueryResults( - projectId=job_reference['projectId'], - jobId=job_reference['jobId']).execute() - except HttpError as ex: - self.process_http_error(ex) - - if self.verbose: - if query_reply['cacheHit']: - self._print('Query done.\nCache hit.\n') - else: - bytes_processed = int(query_reply.get( - 'totalBytesProcessed', '0')) - self._print('Query done.\nProcessed: {}\n'.format( - self.sizeof_fmt(bytes_processed))) - - self._print('Retrieving results...') - - total_rows = int(query_reply['totalRows']) - result_pages = list() - seen_page_tokens = list() - current_row = 0 - # Only read schema on first page - schema = query_reply['schema'] - - # Loop through each page of data - while 'rows' in query_reply and current_row < total_rows: - page = query_reply['rows'] - result_pages.append(page) - current_row += len(page) - - self.print_elapsed_seconds( - ' Got page: {}; {}% done. Elapsed'.format( - len(result_pages), - round(100.0 * current_row / total_rows))) - - if current_row == total_rows: - break - - page_token = query_reply.get('pageToken', None) - - if not page_token and current_row < total_rows: - raise InvalidPageToken("Required pageToken was missing. 
" - "Received {0} of {1} rows" - .format(current_row, total_rows)) - - elif page_token in seen_page_tokens: - raise InvalidPageToken("A duplicate pageToken was returned") - - seen_page_tokens.append(page_token) - - try: - query_reply = job_collection.getQueryResults( - projectId=job_reference['projectId'], - jobId=job_reference['jobId'], - pageToken=page_token).execute() - except HttpError as ex: - self.process_http_error(ex) - - if current_row < total_rows: - raise InvalidPageToken() - - # print basic query stats - self._print('Got {} rows.\n'.format(total_rows)) - - return schema, result_pages - - def load_data(self, dataframe, dataset_id, table_id, chunksize): - try: - from googleapiclient.errors import HttpError - except: - from apiclient.errors import HttpError - - job_id = uuid.uuid4().hex - rows = [] - remaining_rows = len(dataframe) - - total_rows = remaining_rows - self._print("\n\n") - - for index, row in dataframe.reset_index(drop=True).iterrows(): - row_dict = dict() - row_dict['json'] = json.loads(row.to_json(force_ascii=False, - date_unit='s', - date_format='iso')) - row_dict['insertId'] = job_id + str(index) - rows.append(row_dict) - remaining_rows -= 1 - - if (len(rows) % chunksize == 0) or (remaining_rows == 0): - self._print("\rStreaming Insert is {0}% Complete".format( - ((total_rows - remaining_rows) * 100) / total_rows)) - - body = {'rows': rows} - - try: - response = self.service.tabledata().insertAll( - projectId=self.project_id, - datasetId=dataset_id, - tableId=table_id, - body=body).execute() - except HttpError as ex: - self.process_http_error(ex) - - # For streaming inserts, even if you receive a success HTTP - # response code, you'll need to check the insertErrors property - # of the response to determine if the row insertions were - # successful, because it's possible that BigQuery was only - # partially successful at inserting the rows. See the `Success - # HTTP Response Codes - # `__ - # section - - insert_errors = response.get('insertErrors', None) - if insert_errors: - self.process_insert_errors(insert_errors) - - sleep(1) # Maintains the inserts "per second" rate per API - rows = [] - - self._print("\n") - - def verify_schema(self, dataset_id, table_id, schema): - try: - from googleapiclient.errors import HttpError - except: - from apiclient.errors import HttpError - - try: - remote_schema = self.service.tables().get( - projectId=self.project_id, - datasetId=dataset_id, - tableId=table_id).execute()['schema'] - - fields_remote = set([json.dumps(field_remote) - for field_remote in remote_schema['fields']]) - fields_local = set(json.dumps(field_local) - for field_local in schema['fields']) - - return fields_remote == fields_local - except HttpError as ex: - self.process_http_error(ex) - - def delete_and_recreate_table(self, dataset_id, table_id, table_schema): - delay = 0 - - # Changes to table schema may take up to 2 minutes as of May 2015 See - # `Issue 191 - # `__ - # Compare previous schema with new schema to determine if there should - # be a 120 second delay - - if not self.verify_schema(dataset_id, table_id, table_schema): - self._print('The existing table has a different schema. Please ' - 'wait 2 minutes. 
See Google BigQuery issue #191') - delay = 120 - - table = _Table(self.project_id, dataset_id, - private_key=self.private_key) - table.delete(table_id) - table.create(table_id, table_schema) - sleep(delay) - - -def _parse_data(schema, rows): - # see: - # http://pandas.pydata.org/pandas-docs/dev/missing_data.html - # #missing-data-casting-rules-and-indexing - dtype_map = {'FLOAT': np.dtype(float), - 'TIMESTAMP': 'M8[ns]'} - - fields = schema['fields'] - col_types = [field['type'] for field in fields] - col_names = [str(field['name']) for field in fields] - col_dtypes = [dtype_map.get(field['type'], object) for field in fields] - page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes)) - for row_num, raw_row in enumerate(rows): - entries = raw_row.get('f', []) - for col_num, field_type in enumerate(col_types): - field_value = _parse_entry(entries[col_num].get('v', ''), - field_type) - page_array[row_num][col_num] = field_value - - return DataFrame(page_array, columns=col_names) + # give a nice error message + raise ImportError("Load data from Google BigQuery\n" + "\n" + "the pandas-gbq package is not installed\n" + "see the docs: https://pandas-gbq.readthedocs.io\n" + "\n" + "you can install via:\n" + "pip install pandas-gbq\n") -def _parse_entry(field_value, field_type): - if field_value is None or field_value == 'null': - return None - if field_type == 'INTEGER': - return int(field_value) - elif field_type == 'FLOAT': - return float(field_value) - elif field_type == 'TIMESTAMP': - timestamp = datetime.utcfromtimestamp(float(field_value)) - return np.datetime64(timestamp) - elif field_type == 'BOOLEAN': - return field_value == 'true' - return field_value + return pandas_gbq def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, dialect='legacy', **kwargs): - r"""Load data from Google BigQuery. + pandas_gbq = _try_import() + return pandas_gbq.read_gbq( + query, project_id=project_id, + index_col=index_col, col_order=col_order, + reauth=reauth, verbose=verbose, + private_key=private_key, + dialect=dialect, + **kwargs) - THIS IS AN EXPERIMENTAL LIBRARY - The main method a user calls to execute a Query in Google BigQuery - and read results into a pandas DataFrame. - - Google BigQuery API Client Library v2 for Python is used. - Documentation is available at - https://developers.google.com/api-client-library/python/apis/bigquery/v2 - - Authentication to the Google BigQuery service is via OAuth 2.0. - - - If "private_key" is not provided: - - By default "application default credentials" are used. - - .. versionadded:: 0.19.0 - - If default application credentials are not found or are restrictive, - user account credentials are used. In this case, you will be asked to - grant permissions for product name 'pandas GBQ'. - - - If "private_key" is provided: - - Service account credentials will be used to authenticate. - - Parameters - ---------- - query : str - SQL-Like Query to return data values - project_id : str - Google BigQuery Account project ID. - index_col : str (optional) - Name of result column to use for index in results DataFrame - col_order : list(str) (optional) - List of BigQuery column names in the desired order for results - DataFrame - reauth : boolean (default False) - Force Google BigQuery to reauthenticate the user. This is useful - if multiple accounts are used. - verbose : boolean (default True) - Verbose output - private_key : str (optional) - Service account private key in JSON format. 
Can be file path - or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) - - .. versionadded:: 0.18.1 - - dialect : {'legacy', 'standard'}, default 'legacy' - 'legacy' : Use BigQuery's legacy SQL dialect. - 'standard' : Use BigQuery's standard SQL (beta), which is - compliant with the SQL 2011 standard. For more information - see `BigQuery SQL Reference - `__ - - .. versionadded:: 0.19.0 - - **kwargs : Arbitrary keyword arguments - configuration (dict): query config parameters for job processing. - For example: - - configuration = {'query': {'useQueryCache': False}} - - For more information see `BigQuery SQL Reference - ` - - .. versionadded:: 0.20.0 - - Returns - ------- - df: DataFrame - DataFrame representing results of query - - """ - - if not project_id: - raise TypeError("Missing required parameter: project_id") - - if dialect not in ('legacy', 'standard'): - raise ValueError("'{0}' is not valid for dialect".format(dialect)) - - connector = GbqConnector(project_id, reauth=reauth, verbose=verbose, - private_key=private_key, - dialect=dialect) - schema, pages = connector.run_query(query, **kwargs) - dataframe_list = [] - while len(pages) > 0: - page = pages.pop() - dataframe_list.append(_parse_data(schema, page)) - - if len(dataframe_list) > 0: - final_df = concat(dataframe_list, ignore_index=True) - else: - final_df = _parse_data(schema, []) - - # Reindex the DataFrame on the provided column - if index_col is not None: - if index_col in final_df.columns: - final_df.set_index(index_col, inplace=True) - else: - raise InvalidColumnOrder( - 'Index column "{0}" does not exist in DataFrame.' - .format(index_col) - ) - - # Change the order of columns in the DataFrame based on provided list - if col_order is not None: - if sorted(col_order) == sorted(final_df.columns): - final_df = final_df[col_order] - else: - raise InvalidColumnOrder( - 'Column order does not match this DataFrame.' - ) - - # cast BOOLEAN and INTEGER columns from object to bool/int - # if they dont have any nulls - type_map = {'BOOLEAN': bool, 'INTEGER': int} - for field in schema['fields']: - if field['type'] in type_map and \ - final_df[field['name']].notnull().all(): - final_df[field['name']] = \ - final_df[field['name']].astype(type_map[field['type']]) - - connector.print_elapsed_seconds( - 'Total time taken', - datetime.now().strftime('s.\nFinished at %Y-%m-%d %H:%M:%S.'), - 0 - ) - - return final_df +read_gbq = docstring_wrapper(read_gbq, + lambda: _try_import().read_gbq.__doc__) def to_gbq(dataframe, destination_table, project_id, chunksize=10000, verbose=True, reauth=False, if_exists='fail', private_key=None): - """Write a DataFrame to a Google BigQuery table. - - THIS IS AN EXPERIMENTAL LIBRARY - - The main method a user calls to export pandas DataFrame contents to - Google BigQuery table. - - Google BigQuery API Client Library v2 for Python is used. - Documentation is available at - https://developers.google.com/api-client-library/python/apis/bigquery/v2 - - Authentication to the Google BigQuery service is via OAuth 2.0. - - - If "private_key" is not provided: - - By default "application default credentials" are used. - - .. versionadded:: 0.19.0 - - If default application credentials are not found or are restrictive, - user account credentials are used. In this case, you will be asked to - grant permissions for product name 'pandas GBQ'. - - - If "private_key" is provided: - - Service account credentials will be used to authenticate. 
- - Parameters - ---------- - dataframe : DataFrame - DataFrame to be written - destination_table : string - Name of table to be written, in the form 'dataset.tablename' - project_id : str - Google BigQuery Account project ID. - chunksize : int (default 10000) - Number of rows to be inserted in each chunk from the dataframe. - verbose : boolean (default True) - Show percentage complete - reauth : boolean (default False) - Force Google BigQuery to reauthenticate the user. This is useful - if multiple accounts are used. - if_exists : {'fail', 'replace', 'append'}, default 'fail' - 'fail': If table exists, do nothing. - 'replace': If table exists, drop it, recreate it, and insert data. - 'append': If table exists, insert data. Create if does not exist. - private_key : str (optional) - Service account private key in JSON format. Can be file path - or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) - """ - - if if_exists not in ('fail', 'replace', 'append'): - raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) - - if '.' not in destination_table: - raise NotFoundException( - "Invalid Table Name. Should be of the form 'datasetId.tableId' ") - - connector = GbqConnector(project_id, reauth=reauth, verbose=verbose, - private_key=private_key) - dataset_id, table_id = destination_table.rsplit('.', 1) - - table = _Table(project_id, dataset_id, reauth=reauth, - private_key=private_key) - - table_schema = _generate_bq_schema(dataframe) - - # If table exists, check if_exists parameter - if table.exists(table_id): - if if_exists == 'fail': - raise TableCreationError("Could not create the table because it " - "already exists. " - "Change the if_exists parameter to " - "append or replace data.") - elif if_exists == 'replace': - connector.delete_and_recreate_table( - dataset_id, table_id, table_schema) - elif if_exists == 'append': - if not connector.verify_schema(dataset_id, table_id, table_schema): - raise InvalidSchema("Please verify that the structure and " - "data types in the DataFrame match the " - "schema of the destination table.") - else: - table.create(table_id, table_schema) - - connector.load_data(dataframe, dataset_id, table_id, chunksize) - - -def generate_bq_schema(df, default_type='STRING'): - # deprecation TimeSeries, #11121 - warnings.warn("generate_bq_schema is deprecated and will be removed in " - "a future version", FutureWarning, stacklevel=2) - - return _generate_bq_schema(df, default_type=default_type) - - -def _generate_bq_schema(df, default_type='STRING'): - """ Given a passed df, generate the associated Google BigQuery schema. - - Parameters - ---------- - df : DataFrame - default_type : string - The default big query type in case the type of the column - does not exist in the schema. 
- """ - - type_mapping = { - 'i': 'INTEGER', - 'b': 'BOOLEAN', - 'f': 'FLOAT', - 'O': 'STRING', - 'S': 'STRING', - 'U': 'STRING', - 'M': 'TIMESTAMP' - } - - fields = [] - for column_name, dtype in df.dtypes.iteritems(): - fields.append({'name': column_name, - 'type': type_mapping.get(dtype.kind, default_type)}) - - return {'fields': fields} - - -class _Table(GbqConnector): - - def __init__(self, project_id, dataset_id, reauth=False, verbose=False, - private_key=None): - try: - from googleapiclient.errors import HttpError - except: - from apiclient.errors import HttpError - self.http_error = HttpError - self.dataset_id = dataset_id - super(_Table, self).__init__(project_id, reauth, verbose, private_key) - - def exists(self, table_id): - """ Check if a table exists in Google BigQuery - - .. versionadded:: 0.17.0 - - Parameters - ---------- - table : str - Name of table to be verified - - Returns - ------- - boolean - true if table exists, otherwise false - """ - - try: - self.service.tables().get( - projectId=self.project_id, - datasetId=self.dataset_id, - tableId=table_id).execute() - return True - except self.http_error as ex: - if ex.resp.status == 404: - return False - else: - self.process_http_error(ex) - - def create(self, table_id, schema): - """ Create a table in Google BigQuery given a table and schema - - .. versionadded:: 0.17.0 - - Parameters - ---------- - table : str - Name of table to be written - schema : str - Use the generate_bq_schema to generate your table schema from a - dataframe. - """ - - if self.exists(table_id): - raise TableCreationError( - "The table could not be created because it already exists") - - if not _Dataset(self.project_id, - private_key=self.private_key).exists(self.dataset_id): - _Dataset(self.project_id, - private_key=self.private_key).create(self.dataset_id) - - body = { - 'schema': schema, - 'tableReference': { - 'tableId': table_id, - 'projectId': self.project_id, - 'datasetId': self.dataset_id - } - } - - try: - self.service.tables().insert( - projectId=self.project_id, - datasetId=self.dataset_id, - body=body).execute() - except self.http_error as ex: - self.process_http_error(ex) - - def delete(self, table_id): - """ Delete a table in Google BigQuery - - .. versionadded:: 0.17.0 - - Parameters - ---------- - table : str - Name of table to be deleted - """ - - if not self.exists(table_id): - raise NotFoundException("Table does not exist") - - try: - self.service.tables().delete( - datasetId=self.dataset_id, - projectId=self.project_id, - tableId=table_id).execute() - except self.http_error as ex: - self.process_http_error(ex) - - -class _Dataset(GbqConnector): - - def __init__(self, project_id, reauth=False, verbose=False, - private_key=None): - try: - from googleapiclient.errors import HttpError - except: - from apiclient.errors import HttpError - self.http_error = HttpError - super(_Dataset, self).__init__(project_id, reauth, verbose, - private_key) - - def exists(self, dataset_id): - """ Check if a dataset exists in Google BigQuery - - .. versionadded:: 0.17.0 - - Parameters - ---------- - dataset_id : str - Name of dataset to be verified - - Returns - ------- - boolean - true if dataset exists, otherwise false - """ - - try: - self.service.datasets().get( - projectId=self.project_id, - datasetId=dataset_id).execute() - return True - except self.http_error as ex: - if ex.resp.status == 404: - return False - else: - self.process_http_error(ex) - - def datasets(self): - """ Return a list of datasets in Google BigQuery - - .. 
versionadded:: 0.17.0 - - Parameters - ---------- - None - - Returns - ------- - list - List of datasets under the specific project - """ - - dataset_list = [] - next_page_token = None - first_query = True - - while first_query or next_page_token: - first_query = False - - try: - list_dataset_response = self.service.datasets().list( - projectId=self.project_id, - pageToken=next_page_token).execute() - - dataset_response = list_dataset_response.get('datasets') - next_page_token = list_dataset_response.get('nextPageToken') - - if not dataset_response: - return dataset_list - - for row_num, raw_row in enumerate(dataset_response): - dataset_list.append( - raw_row['datasetReference']['datasetId']) - - except self.http_error as ex: - self.process_http_error(ex) - - return dataset_list - - def create(self, dataset_id): - """ Create a dataset in Google BigQuery - - .. versionadded:: 0.17.0 - - Parameters - ---------- - dataset : str - Name of dataset to be written - """ - - if self.exists(dataset_id): - raise DatasetCreationError( - "The dataset could not be created because it already exists") - - body = { - 'datasetReference': { - 'projectId': self.project_id, - 'datasetId': dataset_id - } - } - - try: - self.service.datasets().insert( - projectId=self.project_id, - body=body).execute() - except self.http_error as ex: - self.process_http_error(ex) - - def delete(self, dataset_id): - """ Delete a dataset in Google BigQuery - - .. versionadded:: 0.17.0 - - Parameters - ---------- - dataset : str - Name of dataset to be deleted - """ - - if not self.exists(dataset_id): - raise NotFoundException( - "Dataset {0} does not exist".format(dataset_id)) - - try: - self.service.datasets().delete( - datasetId=dataset_id, - projectId=self.project_id).execute() - - except self.http_error as ex: - self.process_http_error(ex) - - def tables(self, dataset_id): - """ List tables in the specific dataset in Google BigQuery - - .. 
versionadded:: 0.17.0 - - Parameters - ---------- - dataset : str - Name of dataset to list tables for - - Returns - ------- - list - List of tables under the specific dataset - """ - - table_list = [] - next_page_token = None - first_query = True - - while first_query or next_page_token: - first_query = False - - try: - list_table_response = self.service.tables().list( - projectId=self.project_id, - datasetId=dataset_id, - pageToken=next_page_token).execute() - - table_response = list_table_response.get('tables') - next_page_token = list_table_response.get('nextPageToken') - - if not table_response: - return table_list - - for row_num, raw_row in enumerate(table_response): - table_list.append(raw_row['tableReference']['tableId']) + pandas_gbq = _try_import() + pandas_gbq.to_gbq(dataframe, destination_table, project_id, + chunksize=chunksize, + verbose=verbose, reauth=reauth, + if_exists=if_exists, private_key=private_key) - except self.http_error as ex: - self.process_http_error(ex) - return table_list +to_gbq = docstring_wrapper(to_gbq, + lambda: _try_import().to_gbq.__doc__) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 0a76267054ee6..13529e7b54714 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -1,23 +1,18 @@ -import re -from datetime import datetime import pytest +from datetime import datetime import pytz import platform from time import sleep import os -import logging import numpy as np +import pandas as pd +from pandas import compat, DataFrame -from distutils.version import StrictVersion -from pandas import compat - -from pandas import NaT -from pandas.compat import u, range -from pandas.core.frame import DataFrame -import pandas.io.gbq as gbq +from pandas.compat import range import pandas.util.testing as tm -from pandas.compat.numpy import np_datetime64_compat + +pandas_gbq = pytest.importorskip('pandas_gbq') PROJECT_ID = None PRIVATE_KEY_JSON_PATH = None @@ -33,12 +28,6 @@ VERSION = platform.python_version() -_IMPORTS = False -_GOOGLE_API_CLIENT_INSTALLED = False -_GOOGLE_API_CLIENT_VALID_VERSION = False -_HTTPLIB2_INSTALLED = False -_SETUPTOOLS_INSTALLED = False - def _skip_if_no_project_id(): if not _get_project_id(): @@ -46,23 +35,12 @@ def _skip_if_no_project_id(): "Cannot run integration tests without a project id") -def _skip_local_auth_if_in_travis_env(): - if _in_travis_environment(): - pytest.skip("Cannot run local auth in travis environment") - - def _skip_if_no_private_key_path(): if not _get_private_key_path(): pytest.skip("Cannot run integration tests without a " "private key json file path") -def _skip_if_no_private_key_contents(): - if not _get_private_key_contents(): - pytest.skip("Cannot run integration tests without a " - "private key json contents") - - def _in_travis_environment(): return 'TRAVIS_BUILD_DIR' in os.environ and \ 'GBQ_PROJECT_ID' in os.environ @@ -83,146 +61,15 @@ def _get_private_key_path(): return PRIVATE_KEY_JSON_PATH -def _get_private_key_contents(): - if _in_travis_environment(): - with open(os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', - 'travis_gbq.json'])) as f: - return f.read() - else: - return PRIVATE_KEY_JSON_CONTENTS - - -def _test_imports(): - global _GOOGLE_API_CLIENT_INSTALLED, _GOOGLE_API_CLIENT_VALID_VERSION, \ - _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED - - try: - import pkg_resources - _SETUPTOOLS_INSTALLED = True - except ImportError: - _SETUPTOOLS_INSTALLED = False - - if compat.PY3: - google_api_minimum_version = '1.4.1' - else: - 
google_api_minimum_version = '1.2.0' - - if _SETUPTOOLS_INSTALLED: - try: - try: - from googleapiclient.discovery import build # noqa - from googleapiclient.errors import HttpError # noqa - except: - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa - - from oauth2client.client import OAuth2WebServerFlow # noqa - from oauth2client.client import AccessTokenRefreshError # noqa - - from oauth2client.file import Storage # noqa - from oauth2client.tools import run_flow # noqa - _GOOGLE_API_CLIENT_INSTALLED = True - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution( - 'google-api-python-client').version - - if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) >= - StrictVersion(google_api_minimum_version)): - _GOOGLE_API_CLIENT_VALID_VERSION = True - - except ImportError: - _GOOGLE_API_CLIENT_INSTALLED = False - - try: - import httplib2 # noqa - _HTTPLIB2_INSTALLED = True - except ImportError: - _HTTPLIB2_INSTALLED = False - - if not _SETUPTOOLS_INSTALLED: - raise ImportError('Could not import pkg_resources (setuptools).') - - if not _GOOGLE_API_CLIENT_INSTALLED: - raise ImportError('Could not import Google API Client.') - - if not _GOOGLE_API_CLIENT_VALID_VERSION: - raise ImportError("pandas requires google-api-python-client >= {0} " - "for Google BigQuery support, " - "current version {1}" - .format(google_api_minimum_version, - _GOOGLE_API_CLIENT_VERSION)) - - if not _HTTPLIB2_INSTALLED: - raise ImportError( - "pandas requires httplib2 for Google BigQuery support") - - # Bug fix for https://github.com/pandas-dev/pandas/issues/12572 - # We need to know that a supported version of oauth2client is installed - # Test that either of the following is installed: - # - SignedJwtAssertionCredentials from oauth2client.client - # - ServiceAccountCredentials from oauth2client.service_account - # SignedJwtAssertionCredentials is available in oauthclient < 2.0.0 - # ServiceAccountCredentials is available in oauthclient >= 2.0.0 - oauth2client_v1 = True - oauth2client_v2 = True - - try: - from oauth2client.client import SignedJwtAssertionCredentials # noqa - except ImportError: - oauth2client_v1 = False - - try: - from oauth2client.service_account import ServiceAccountCredentials # noqa - except ImportError: - oauth2client_v2 = False - - if not oauth2client_v1 and not oauth2client_v2: - raise ImportError("Missing oauth2client required for BigQuery " - "service account support") - - -def _setup_common(): - try: - _test_imports() - except (ImportError, NotImplementedError) as import_exception: - pytest.skip(import_exception) - - if _in_travis_environment(): - logging.getLogger('oauth2client').setLevel(logging.ERROR) - logging.getLogger('apiclient').setLevel(logging.ERROR) - - -def _check_if_can_get_correct_default_credentials(): - # Checks if "Application Default Credentials" can be fetched - # from the environment the tests are running in. 
- # See Issue #13577 - - import httplib2 - try: - from googleapiclient.discovery import build - except ImportError: - from apiclient.discovery import build - try: - from oauth2client.client import GoogleCredentials - credentials = GoogleCredentials.get_application_default() - http = httplib2.Http() - http = credentials.authorize(http) - bigquery_service = build('bigquery', 'v2', http=http) - jobs = bigquery_service.jobs() - job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} - jobs.insert(projectId=_get_project_id(), body=job_data).execute() - return True - except: - return False - - def clean_gbq_environment(private_key=None): - dataset = gbq._Dataset(_get_project_id(), private_key=private_key) + dataset = pandas_gbq.gbq._Dataset(_get_project_id(), + private_key=private_key) for i in range(1, 10): if DATASET_ID + str(i) in dataset.datasets(): dataset_id = DATASET_ID + str(i) - table = gbq._Table(_get_project_id(), dataset_id, - private_key=private_key) + table = pandas_gbq.gbq._Table(_get_project_id(), dataset_id, + private_key=private_key) for j in range(1, 20): if TABLE_ID + str(j) in dataset.tables(dataset_id): table.delete(TABLE_ID + str(j)) @@ -246,673 +93,8 @@ def make_mixed_dataframe_v2(test_size): index=range(test_size)) -def test_generate_bq_schema_deprecated(): - # 11121 Deprecation of generate_bq_schema - with tm.assert_produces_warning(FutureWarning): - df = make_mixed_dataframe_v2(10) - gbq.generate_bq_schema(df) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): - - def setUp(self): - _setup_common() - _skip_if_no_project_id() - _skip_local_auth_if_in_travis_env() - - self.sut = gbq.GbqConnector(_get_project_id()) - - def test_should_be_able_to_make_a_connector(self): - self.assertTrue(self.sut is not None, - 'Could not create a GbqConnector') - - def test_should_be_able_to_get_valid_credentials(self): - credentials = self.sut.get_credentials() - self.assertFalse(credentials.invalid, 'Returned credentials invalid') - - def test_should_be_able_to_get_a_bigquery_service(self): - bigquery_service = self.sut.get_service() - self.assertTrue(bigquery_service is not None, 'No service returned') - - def test_should_be_able_to_get_schema_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - self.assertTrue(schema is not None) - - def test_should_be_able_to_get_results_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - self.assertTrue(pages is not None) - - def test_get_application_default_credentials_does_not_throw_error(self): - if _check_if_can_get_correct_default_credentials(): - pytest.skip("Can get default_credentials " - "from the environment!") - credentials = self.sut.get_application_default_credentials() - self.assertIsNone(credentials) - - def test_get_application_default_credentials_returns_credentials(self): - if not _check_if_can_get_correct_default_credentials(): - pytest.skip("Cannot get default_credentials " - "from the environment!") - from oauth2client.client import GoogleCredentials - credentials = self.sut.get_application_default_credentials() - self.assertTrue(isinstance(credentials, GoogleCredentials)) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): - def setUp(self): - _setup_common() - - _skip_if_no_project_id() - _skip_if_no_private_key_path() - - self.sut = gbq.GbqConnector(_get_project_id(), - private_key=_get_private_key_path()) - - 
def test_should_be_able_to_make_a_connector(self): - self.assertTrue(self.sut is not None, - 'Could not create a GbqConnector') - - def test_should_be_able_to_get_valid_credentials(self): - credentials = self.sut.get_credentials() - self.assertFalse(credentials.invalid, 'Returned credentials invalid') - - def test_should_be_able_to_get_a_bigquery_service(self): - bigquery_service = self.sut.get_service() - self.assertTrue(bigquery_service is not None, 'No service returned') - - def test_should_be_able_to_get_schema_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - self.assertTrue(schema is not None) - - def test_should_be_able_to_get_results_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - self.assertTrue(pages is not None) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): - def setUp(self): - _setup_common() - - _skip_if_no_project_id() - _skip_if_no_private_key_contents() - - self.sut = gbq.GbqConnector(_get_project_id(), - private_key=_get_private_key_contents()) - - def test_should_be_able_to_make_a_connector(self): - self.assertTrue(self.sut is not None, - 'Could not create a GbqConnector') - - def test_should_be_able_to_get_valid_credentials(self): - credentials = self.sut.get_credentials() - self.assertFalse(credentials.invalid, 'Returned credentials invalid') - - def test_should_be_able_to_get_a_bigquery_service(self): - bigquery_service = self.sut.get_service() - self.assertTrue(bigquery_service is not None, 'No service returned') - - def test_should_be_able_to_get_schema_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - self.assertTrue(schema is not None) - - def test_should_be_able_to_get_results_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - self.assertTrue(pages is not None) - - -class GBQUnitTests(tm.TestCase): - - def setUp(self): - _setup_common() - - def test_import_google_api_python_client(self): - if compat.PY2: - with tm.assertRaises(ImportError): - from googleapiclient.discovery import build # noqa - from googleapiclient.errors import HttpError # noqa - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa - else: - from googleapiclient.discovery import build # noqa - from googleapiclient.errors import HttpError # noqa - - def test_should_return_bigquery_integers_as_python_ints(self): - result = gbq._parse_entry(1, 'INTEGER') - tm.assert_equal(result, int(1)) - - def test_should_return_bigquery_floats_as_python_floats(self): - result = gbq._parse_entry(1, 'FLOAT') - tm.assert_equal(result, float(1)) - - def test_should_return_bigquery_timestamps_as_numpy_datetime(self): - result = gbq._parse_entry('0e9', 'TIMESTAMP') - tm.assert_equal(result, np_datetime64_compat('1970-01-01T00:00:00Z')) - - def test_should_return_bigquery_booleans_as_python_booleans(self): - result = gbq._parse_entry('false', 'BOOLEAN') - tm.assert_equal(result, False) - - def test_should_return_bigquery_strings_as_python_strings(self): - result = gbq._parse_entry('STRING', 'STRING') - tm.assert_equal(result, 'STRING') - - def test_to_gbq_should_fail_if_invalid_table_name_passed(self): - with tm.assertRaises(gbq.NotFoundException): - gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234") - - def test_to_gbq_with_no_project_id_given_should_fail(self): - with tm.assertRaises(TypeError): - gbq.to_gbq(DataFrame(), 'dataset.tablename') - - def 
test_read_gbq_with_no_project_id_given_should_fail(self): - with tm.assertRaises(TypeError): - gbq.read_gbq('SELECT 1') - - def test_that_parse_data_works_properly(self): - test_schema = {'fields': [ - {'mode': 'NULLABLE', 'name': 'valid_string', 'type': 'STRING'}]} - test_page = [{'f': [{'v': 'PI'}]}] - - test_output = gbq._parse_data(test_schema, test_page) - correct_output = DataFrame({'valid_string': ['PI']}) - tm.assert_frame_equal(test_output, correct_output) - - def test_read_gbq_with_invalid_private_key_json_should_fail(self): - with tm.assertRaises(gbq.InvalidPrivateKeyFormat): - gbq.read_gbq('SELECT 1', project_id='x', private_key='y') - - def test_read_gbq_with_empty_private_key_json_should_fail(self): - with tm.assertRaises(gbq.InvalidPrivateKeyFormat): - gbq.read_gbq('SELECT 1', project_id='x', private_key='{}') - - def test_read_gbq_with_private_key_json_wrong_types_should_fail(self): - with tm.assertRaises(gbq.InvalidPrivateKeyFormat): - gbq.read_gbq( - 'SELECT 1', project_id='x', - private_key='{ "client_email" : 1, "private_key" : True }') - - def test_read_gbq_with_empty_private_key_file_should_fail(self): - with tm.ensure_clean() as empty_file_path: - with tm.assertRaises(gbq.InvalidPrivateKeyFormat): - gbq.read_gbq('SELECT 1', project_id='x', - private_key=empty_file_path) - - def test_read_gbq_with_corrupted_private_key_json_should_fail(self): - _skip_if_no_private_key_contents() - - with tm.assertRaises(gbq.InvalidPrivateKeyFormat): - gbq.read_gbq( - 'SELECT 1', project_id='x', - private_key=re.sub('[a-z]', '9', _get_private_key_contents())) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestReadGBQIntegration(tm.TestCase): - - @classmethod - def setUpClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* - # executing *ALL* tests described below. - - _skip_if_no_project_id() - - _setup_common() - - def setUp(self): - # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test is - # executed. - pass - - @classmethod - def tearDownClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* - # executing all tests. - pass - - def tearDown(self): - # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test is - # executed. 
- pass - - def test_should_read_as_user_account(self): - _skip_local_auth_if_in_travis_env() - - query = 'SELECT "PI" AS valid_string' - df = gbq.read_gbq(query, project_id=_get_project_id()) - tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) - - def test_should_read_as_service_account_with_key_path(self): - _skip_if_no_private_key_path() - query = 'SELECT "PI" AS valid_string' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) - - def test_should_read_as_service_account_with_key_contents(self): - _skip_if_no_private_key_contents() - query = 'SELECT "PI" AS valid_string' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_contents()) - tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): - - @classmethod - def setUpClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* - # executing *ALL* tests described below. - - _skip_if_no_project_id() - _skip_if_no_private_key_path() - - _setup_common() - - def setUp(self): - # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test is - # executed. - pass - - @classmethod - def tearDownClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* - # executing all tests. - pass - - def tearDown(self): - # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test is - # executed. - pass - - def test_should_properly_handle_valid_strings(self): - query = 'SELECT "PI" AS valid_string' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) - - def test_should_properly_handle_empty_strings(self): - query = 'SELECT "" AS empty_string' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'empty_string': [""]})) - - def test_should_properly_handle_null_strings(self): - query = 'SELECT STRING(NULL) AS null_string' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'null_string': [None]})) - - def test_should_properly_handle_valid_integers(self): - query = 'SELECT INTEGER(3) AS valid_integer' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'valid_integer': [3]})) - - def test_should_properly_handle_nullable_integers(self): - query = '''SELECT * FROM - (SELECT 1 AS nullable_integer), - (SELECT NULL AS nullable_integer)''' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal( - df, DataFrame({'nullable_integer': [1, None]}).astype(object)) - - def test_should_properly_handle_valid_longs(self): - query = 'SELECT 1 << 62 AS valid_long' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal( - df, DataFrame({'valid_long': [1 << 62]})) - - def test_should_properly_handle_nullable_longs(self): - query = '''SELECT * FROM - (SELECT 1 << 62 AS nullable_long), - (SELECT NULL AS nullable_long)''' - df = 
gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal( - df, DataFrame({'nullable_long': [1 << 62, None]}).astype(object)) - - def test_should_properly_handle_null_integers(self): - query = 'SELECT INTEGER(NULL) AS null_integer' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'null_integer': [None]})) - - def test_should_properly_handle_valid_floats(self): - from math import pi - query = 'SELECT PI() AS valid_float' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame( - {'valid_float': [pi]})) - - def test_should_properly_handle_nullable_floats(self): - from math import pi - query = '''SELECT * FROM - (SELECT PI() AS nullable_float), - (SELECT NULL AS nullable_float)''' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal( - df, DataFrame({'nullable_float': [pi, None]})) - - def test_should_properly_handle_valid_doubles(self): - from math import pi - query = 'SELECT PI() * POW(10, 307) AS valid_double' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame( - {'valid_double': [pi * 10 ** 307]})) - - def test_should_properly_handle_nullable_doubles(self): - from math import pi - query = '''SELECT * FROM - (SELECT PI() * POW(10, 307) AS nullable_double), - (SELECT NULL AS nullable_double)''' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal( - df, DataFrame({'nullable_double': [pi * 10 ** 307, None]})) - - def test_should_properly_handle_null_floats(self): - query = 'SELECT FLOAT(NULL) AS null_float' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'null_float': [np.nan]})) - - def test_should_properly_handle_timestamp_unix_epoch(self): - query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame( - {'unix_epoch': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) - - def test_should_properly_handle_arbitrary_timestamp(self): - query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({ - 'valid_timestamp': [np.datetime64('2004-09-15T05:00:00.000000Z')] - })) - - def test_should_properly_handle_null_timestamp(self): - query = 'SELECT TIMESTAMP(NULL) AS null_timestamp' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'null_timestamp': [NaT]})) - - def test_should_properly_handle_true_boolean(self): - query = 'SELECT BOOLEAN(TRUE) AS true_boolean' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'true_boolean': [True]})) - - def test_should_properly_handle_false_boolean(self): - query = 'SELECT BOOLEAN(FALSE) AS false_boolean' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'false_boolean': [False]})) - - def 
test_should_properly_handle_null_boolean(self): - query = 'SELECT BOOLEAN(NULL) AS null_boolean' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'null_boolean': [None]})) - - def test_should_properly_handle_nullable_booleans(self): - query = '''SELECT * FROM - (SELECT BOOLEAN(TRUE) AS nullable_boolean), - (SELECT NULL AS nullable_boolean)''' - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal( - df, DataFrame({'nullable_boolean': [True, None]}).astype(object)) - - def test_unicode_string_conversion_and_normalization(self): - correct_test_datatype = DataFrame( - {'unicode_string': [u("\xe9\xfc")]} - ) - - unicode_string = "\xc3\xa9\xc3\xbc" - - if compat.PY3: - unicode_string = unicode_string.encode('latin-1').decode('utf8') - - query = 'SELECT "{0}" AS unicode_string'.format(unicode_string) - - df = gbq.read_gbq(query, project_id=_get_project_id(), - private_key=_get_private_key_path()) - tm.assert_frame_equal(df, correct_test_datatype) - - def test_index_column(self): - query = "SELECT 'a' AS string_1, 'b' AS string_2" - result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - index_col="string_1", - private_key=_get_private_key_path()) - correct_frame = DataFrame( - {'string_1': ['a'], 'string_2': ['b']}).set_index("string_1") - tm.assert_equal(result_frame.index.name, correct_frame.index.name) - - def test_column_order(self): - query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" - col_order = ['string_3', 'string_1', 'string_2'] - result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - col_order=col_order, - private_key=_get_private_key_path()) - correct_frame = DataFrame({'string_1': ['a'], 'string_2': [ - 'b'], 'string_3': ['c']})[col_order] - tm.assert_frame_equal(result_frame, correct_frame) - - def test_column_order_plus_index(self): - query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" - col_order = ['string_3', 'string_2'] - result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - index_col='string_1', col_order=col_order, - private_key=_get_private_key_path()) - correct_frame = DataFrame( - {'string_1': ['a'], 'string_2': ['b'], 'string_3': ['c']}) - correct_frame.set_index('string_1', inplace=True) - correct_frame = correct_frame[col_order] - tm.assert_frame_equal(result_frame, correct_frame) - - def test_malformed_query(self): - with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", - project_id=_get_project_id(), - private_key=_get_private_key_path()) - - def test_bad_project_id(self): - with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELECT 1", project_id='001', - private_key=_get_private_key_path()) - - def test_bad_table_name(self): - with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", - project_id=_get_project_id(), - private_key=_get_private_key_path()) - - def test_download_dataset_larger_than_200k_rows(self): - test_size = 200005 - # Test for known BigQuery bug in datasets larger than 100k rows - # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results - df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] " - "GROUP EACH BY id ORDER BY id ASC LIMIT {0}" - .format(test_size), - project_id=_get_project_id(), - private_key=_get_private_key_path()) - self.assertEqual(len(df.drop_duplicates()), test_size) - - def 
test_zero_rows(self): - # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 - df = gbq.read_gbq("SELECT title, id, is_bot, " - "SEC_TO_TIMESTAMP(timestamp) ts " - "FROM [publicdata:samples.wikipedia] " - "WHERE timestamp=-9999999", - project_id=_get_project_id(), - private_key=_get_private_key_path()) - page_array = np.zeros( - (0,), dtype=[('title', object), ('id', np.dtype(int)), - ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')]) - expected_result = DataFrame( - page_array, columns=['title', 'id', 'is_bot', 'ts']) - self.assert_frame_equal(df, expected_result) - - def test_legacy_sql(self): - legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10" - - # Test that a legacy sql statement fails when - # setting dialect='standard' - with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq(legacy_sql, project_id=_get_project_id(), - dialect='standard', - private_key=_get_private_key_path()) - - # Test that a legacy sql statement succeeds when - # setting dialect='legacy' - df = gbq.read_gbq(legacy_sql, project_id=_get_project_id(), - dialect='legacy', - private_key=_get_private_key_path()) - self.assertEqual(len(df.drop_duplicates()), 10) - - def test_standard_sql(self): - standard_sql = "SELECT DISTINCT id FROM " \ - "`publicdata.samples.wikipedia` LIMIT 10" - - # Test that a standard sql statement fails when using - # the legacy SQL dialect (default value) - with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq(standard_sql, project_id=_get_project_id(), - private_key=_get_private_key_path()) - - # Test that a standard sql statement succeeds when - # setting dialect='standard' - df = gbq.read_gbq(standard_sql, project_id=_get_project_id(), - dialect='standard', - private_key=_get_private_key_path()) - self.assertEqual(len(df.drop_duplicates()), 10) - - def test_invalid_option_for_sql_dialect(self): - sql_statement = "SELECT DISTINCT id FROM " \ - "`publicdata.samples.wikipedia` LIMIT 10" - - # Test that an invalid option for `dialect` raises ValueError - with tm.assertRaises(ValueError): - gbq.read_gbq(sql_statement, project_id=_get_project_id(), - dialect='invalid', - private_key=_get_private_key_path()) - - # Test that a correct option for dialect succeeds - # to make sure ValueError was due to invalid dialect - gbq.read_gbq(sql_statement, project_id=_get_project_id(), - dialect='standard', private_key=_get_private_key_path()) - - def test_query_with_parameters(self): - sql_statement = "SELECT @param1 + @param2 AS valid_result" - config = { - 'query': { - "useLegacySql": False, - "parameterMode": "named", - "queryParameters": [ - { - "name": "param1", - "parameterType": { - "type": "INTEGER" - }, - "parameterValue": { - "value": 1 - } - }, - { - "name": "param2", - "parameterType": { - "type": "INTEGER" - }, - "parameterValue": { - "value": 2 - } - } - ] - } - } - # Test that a query that relies on parameters fails - # when parameters are not supplied via configuration - with tm.assertRaises(ValueError): - gbq.read_gbq(sql_statement, project_id=_get_project_id(), - private_key=_get_private_key_path()) - - # Test that the query is successful because we have supplied - # the correct query parameters via the 'config' option - df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) - - def test_query_inside_configuration(self): - query_no_use = 'SELECT "PI_WRONG" AS valid_string' - query = 'SELECT "PI" AS 
valid_string' - config = { - 'query': { - "query": query, - "useQueryCache": False, - } - } - # Test that it can't pass query both - # inside config and as parameter - with tm.assertRaises(ValueError): - gbq.read_gbq(query_no_use, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - - df = gbq.read_gbq(None, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) - - def test_configuration_without_query(self): - sql_statement = 'SELECT 1' - config = { - 'copy': { - "sourceTable": { - "projectId": _get_project_id(), - "datasetId": "publicdata:samples", - "tableId": "wikipedia" - }, - "destinationTable": { - "projectId": _get_project_id(), - "datasetId": "publicdata:samples", - "tableId": "wikipedia_copied" - }, - } - } - # Test that only 'query' configurations are supported - # nor 'copy','load','extract' - with tm.assertRaises(ValueError): - gbq.read_gbq(sql_statement, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") +@pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): - # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 - # As a workaround to this issue, each test should use a unique table name. - # Make sure to modify the for loop range in the tearDownClass when a new - # test is added See `Issue 191 - # `__ @classmethod def setUpClass(cls): @@ -923,24 +105,10 @@ def setUpClass(cls): _skip_if_no_project_id() _skip_if_no_private_key_path() - _setup_common() clean_gbq_environment(_get_private_key_path()) - - gbq._Dataset(_get_project_id(), - private_key=_get_private_key_path() - ).create(DATASET_ID + "1") - - def setUp(self): - # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test is - # executed. - - self.dataset = gbq._Dataset(_get_project_id(), - private_key=_get_private_key_path()) - self.table = gbq._Table(_get_project_id(), DATASET_ID + "1", - private_key=_get_private_key_path()) - self.sut = gbq.GbqConnector(_get_project_id(), - private_key=_get_private_key_path()) + pandas_gbq.gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path() + ).create(DATASET_ID + "1") @classmethod def tearDownClass(cls): @@ -950,387 +118,19 @@ def tearDownClass(cls): clean_gbq_environment(_get_private_key_path()) - def tearDown(self): - # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test is - # executed. - pass - - def test_upload_data(self): + def test_roundtrip(self): destination_table = DESTINATION_TABLE + "1" test_size = 20001 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_path()) - - sleep(30) # <- Curses Google!!! 
- - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" - .format(destination_table), - project_id=_get_project_id(), - private_key=_get_private_key_path()) - self.assertEqual(result['num_rows'][0], test_size) - - def test_upload_data_if_table_exists_fail(self): - destination_table = DESTINATION_TABLE + "2" - - test_size = 10 - df = make_mixed_dataframe_v2(test_size) - self.table.create(TABLE_ID + "2", gbq._generate_bq_schema(df)) - - # Test the default value of if_exists is 'fail' - with tm.assertRaises(gbq.TableCreationError): - gbq.to_gbq(df, destination_table, _get_project_id(), - private_key=_get_private_key_path()) - - # Test the if_exists parameter with value 'fail' - with tm.assertRaises(gbq.TableCreationError): - gbq.to_gbq(df, destination_table, _get_project_id(), - if_exists='fail', private_key=_get_private_key_path()) - - def test_upload_data_if_table_exists_append(self): - destination_table = DESTINATION_TABLE + "3" - - test_size = 10 - df = make_mixed_dataframe_v2(test_size) - df_different_schema = tm.makeMixedDataFrame() - - # Initialize table with sample data - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_path()) - - # Test the if_exists parameter with value 'append' - gbq.to_gbq(df, destination_table, _get_project_id(), - if_exists='append', private_key=_get_private_key_path()) - - sleep(30) # <- Curses Google!!! - - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" - .format(destination_table), - project_id=_get_project_id(), - private_key=_get_private_key_path()) - self.assertEqual(result['num_rows'][0], test_size * 2) - - # Try inserting with a different schema, confirm failure - with tm.assertRaises(gbq.InvalidSchema): - gbq.to_gbq(df_different_schema, destination_table, - _get_project_id(), if_exists='append', - private_key=_get_private_key_path()) - - def test_upload_data_if_table_exists_replace(self): - - destination_table = DESTINATION_TABLE + "4" - - test_size = 10 - df = make_mixed_dataframe_v2(test_size) - df_different_schema = tm.makeMixedDataFrame() - - # Initialize table with sample data - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_path()) - - # Test the if_exists parameter with the value 'replace'. - gbq.to_gbq(df_different_schema, destination_table, - _get_project_id(), if_exists='replace', - private_key=_get_private_key_path()) - - sleep(30) # <- Curses Google!!! 
- - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" - .format(destination_table), - project_id=_get_project_id(), - private_key=_get_private_key_path()) - self.assertEqual(result['num_rows'][0], 5) - - @tm.slow - def test_google_upload_errors_should_raise_exception(self): - destination_table = DESTINATION_TABLE + "5" - - test_timestamp = datetime.now(pytz.timezone('US/Arizona')) - bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0], - 'ints': [0, '1'], 'strs': ['a', 1], - 'times': [test_timestamp, test_timestamp]}, - index=range(2)) - - with tm.assertRaises(gbq.StreamingInsertError): - gbq.to_gbq(bad_df, destination_table, _get_project_id(), - verbose=True, private_key=_get_private_key_path()) - - def test_generate_schema(self): - df = tm.makeMixedDataFrame() - schema = gbq._generate_bq_schema(df) - - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - - self.assertEqual(schema, test_schema) - - def test_create_table(self): - destination_table = TABLE_ID + "6" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - self.table.create(destination_table, test_schema) - self.assertTrue(self.table.exists(destination_table), - 'Expected table to exist') - - def test_table_does_not_exist(self): - self.assertTrue(not self.table.exists(TABLE_ID + "7"), - 'Expected table not to exist') - - def test_delete_table(self): - destination_table = TABLE_ID + "8" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - self.table.create(destination_table, test_schema) - self.table.delete(destination_table) - self.assertTrue(not self.table.exists( - destination_table), 'Expected table not to exist') - - def test_list_table(self): - destination_table = TABLE_ID + "9" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - self.table.create(destination_table, test_schema) - self.assertTrue( - destination_table in self.dataset.tables(DATASET_ID + "1"), - 'Expected table list to contain table {0}' - .format(destination_table)) - - def test_verify_schema_allows_flexible_column_order(self): - destination_table = TABLE_ID + "10" - test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - - self.table.create(destination_table, test_schema_1) - self.assertTrue(self.sut.verify_schema( - DATASET_ID + "1", destination_table, test_schema_2), - 'Expected schema to match') - - def test_verify_schema_fails_different_data_type(self): - destination_table = TABLE_ID + "11" - test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'STRING'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - - self.table.create(destination_table, test_schema_1) - 
self.assertFalse(self.sut.verify_schema( - DATASET_ID + "1", destination_table, test_schema_2), - 'Expected different schema') - - def test_verify_schema_fails_different_structure(self): - destination_table = TABLE_ID + "12" - test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B2', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} - - self.table.create(destination_table, test_schema_1) - self.assertFalse(self.sut.verify_schema( - DATASET_ID + "1", destination_table, test_schema_2), - 'Expected different schema') - - def test_upload_data_flexible_column_order(self): - destination_table = DESTINATION_TABLE + "13" - - test_size = 10 - df = make_mixed_dataframe_v2(test_size) - - # Initialize table with sample data - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_path()) - - df_columns_reversed = df[df.columns[::-1]] - - gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(), - if_exists='append', private_key=_get_private_key_path()) - - def test_list_dataset(self): - dataset_id = DATASET_ID + "1" - self.assertTrue(dataset_id in self.dataset.datasets(), - 'Expected dataset list to contain dataset {0}' - .format(dataset_id)) - - def test_list_table_zero_results(self): - dataset_id = DATASET_ID + "2" - self.dataset.create(dataset_id) - table_list = gbq._Dataset(_get_project_id(), - private_key=_get_private_key_path() - ).tables(dataset_id) - self.assertEqual(len(table_list), 0, - 'Expected gbq.list_table() to return 0') - - def test_create_dataset(self): - dataset_id = DATASET_ID + "3" - self.dataset.create(dataset_id) - self.assertTrue(dataset_id in self.dataset.datasets(), - 'Expected dataset to exist') - - def test_delete_dataset(self): - dataset_id = DATASET_ID + "4" - self.dataset.create(dataset_id) - self.dataset.delete(dataset_id) - self.assertTrue(dataset_id not in self.dataset.datasets(), - 'Expected dataset not to exist') - - def test_dataset_exists(self): - dataset_id = DATASET_ID + "5" - self.dataset.create(dataset_id) - self.assertTrue(self.dataset.exists(dataset_id), - 'Expected dataset to exist') - - def create_table_data_dataset_does_not_exist(self): - dataset_id = DATASET_ID + "6" - table_id = TABLE_ID + "1" - table_with_new_dataset = gbq._Table(_get_project_id(), dataset_id) - df = make_mixed_dataframe_v2(10) - table_with_new_dataset.create(table_id, gbq._generate_bq_schema(df)) - self.assertTrue(self.dataset.exists(dataset_id), - 'Expected dataset to exist') - self.assertTrue(table_with_new_dataset.exists( - table_id), 'Expected dataset to exist') - - def test_dataset_does_not_exist(self): - self.assertTrue(not self.dataset.exists( - DATASET_ID + "_not_found"), 'Expected dataset not to exist') - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): - # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 - # As a workaround to this issue, each test should use a unique table name. - # Make sure to modify the for loop range in the tearDownClass when a new - # test is added - # See `Issue 191 - # `__ - - @classmethod - def setUpClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* - # executing *ALL* tests described below. 
- - _skip_if_no_project_id() - _skip_local_auth_if_in_travis_env() - - _setup_common() - clean_gbq_environment() - - def setUp(self): - # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test - # is executed. - pass - - @classmethod - def tearDownClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* - # executing all tests. - - clean_gbq_environment() - - def tearDown(self): - # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test - # is executed. - pass - - def test_upload_data(self): - destination_table = "{0}.{1}".format(DATASET_ID + "2", TABLE_ID + "1") - - test_size = 10 - df = make_mixed_dataframe_v2(test_size) - - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000) - - sleep(30) # <- Curses Google!!! - - result = gbq.read_gbq( - "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), - project_id=_get_project_id()) - - self.assertEqual(result['num_rows'][0], test_size) - - -@pytest.mark.xfail(run=False, reason="intermittent failures") -class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): - # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 - # As a workaround to this issue, each test should use a unique table name. - # Make sure to modify the for loop range in the tearDownClass when a new - # test is added - # See `Issue 191 - # `__ - - @classmethod - def setUpClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* - # executing *ALL* tests described below. - - _setup_common() - _skip_if_no_project_id() - _skip_if_no_private_key_contents() - - clean_gbq_environment(_get_private_key_contents()) - - def setUp(self): - # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test - # is executed. - pass - - @classmethod - def tearDownClass(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* - # executing all tests. - - clean_gbq_environment(_get_private_key_contents()) - - def tearDown(self): - # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test - # is executed. - pass - - def test_upload_data(self): - destination_table = "{0}.{1}".format(DATASET_ID + "3", TABLE_ID + "1") - - test_size = 10 - df = make_mixed_dataframe_v2(test_size) - - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_contents()) + df.to_gbq(destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! 
-        result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
-                              .format(destination_table),
-                              project_id=_get_project_id(),
-                              private_key=_get_private_key_path())
+        result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
+                             .format(destination_table),
+                             project_id=_get_project_id(),
+                             private_key=_get_private_key_path())
         self.assertEqual(result['num_rows'][0], test_size)

diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py
index 1b501eb1d9bda..d966d6b7a1b32 100644
--- a/pandas/util/decorators.py
+++ b/pandas/util/decorators.py
@@ -3,7 +3,7 @@
 import sys
 import warnings
 from textwrap import dedent
-from functools import wraps
+from functools import wraps, update_wrapper
 
 
 def deprecate(name, alternative, alt_name=None):
@@ -233,3 +233,39 @@ def make_signature(func):
     if spec.keywords:
         args.append('**' + spec.keywords)
     return args, spec.args
+
+
+class docstring_wrapper(object):
+    """
+    decorator to wrap a function,
+    provide a dynamically evaluated doc-string
+
+    Parameters
+    ----------
+    func : callable
+    creator : callable
+        return the doc-string
+    default : str, optional
+        return this doc-string on error
+    """
+    _attrs = ['__module__', '__name__',
+              '__qualname__', '__annotations__']
+
+    def __init__(self, func, creator, default=None):
+        self.func = func
+        self.creator = creator
+        self.default = default
+        update_wrapper(
+            self, func, [attr for attr in self._attrs
+                         if hasattr(func, attr)])
+
+    def __call__(self, func, *args, **kwargs):
+        return self.func(*args, **kwargs)
+
+    @property
+    def __doc__(self):
+        try:
+            return self.creator()
+        except Exception as exc:
+            msg = self.default or str(exc)
+            return msg
diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py
index b0f5d3994ed64..ca75d4d02e927 100644
--- a/pandas/util/print_versions.py
+++ b/pandas/util/print_versions.py
@@ -88,13 +88,12 @@ def show_versions(as_json=False):
         ("lxml", lambda mod: mod.etree.__version__),
         ("bs4", lambda mod: mod.__version__),
         ("html5lib", lambda mod: mod.__version__),
-        ("httplib2", lambda mod: mod.__version__),
-        ("apiclient", lambda mod: mod.__version__),
         ("sqlalchemy", lambda mod: mod.__version__),
         ("pymysql", lambda mod: mod.__version__),
         ("psycopg2", lambda mod: mod.__version__),
         ("jinja2", lambda mod: mod.__version__),
         ("s3fs", lambda mod: mod.__version__),
+        ("pandas_gbq", lambda mod: mod.__version__),
         ("pandas_datareader", lambda mod: mod.__version__)
     ]
 

From ca8c53c321cc162afb1c53fea92da9f6c2deb2f6 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 27 Feb 2017 14:51:20 -0500
Subject: [PATCH 199/338] BUG: GH15429 transform result of timedelta from
 datetime

The transform() operation needs to return a like-indexed Series. To
facilitate this, transform starts with a copy of the original series.
Then, after the computation for each group, it sets the appropriate
elements of the copied series equal to the result. At that point it
does a type comparison, and discovers that the timedelta is not
castable to a datetime.
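A minimal sketch of the case this fixes (a hypothetical interactive
session mirroring the new test added below; the printed dtype is what
the fix is expected to produce, not output from this patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': pd.Timestamp('20130101'), 'B': np.arange(5)})

    # every row shares one group key, so each group's max() - min() is a
    # zero Timedelta; transform should return a timedelta64[ns] result
    # instead of coercing it back to the original datetime64[ns] dtype
    out = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
    print(out.dtype)  # timedelta64[ns]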
closes #10972 Author: Jeff Reback Author: Stephen Rauch Closes #15430 from stephenrauch/group-by-transform-timedelta-from-datetime and squashes the following commits: c3b0dd0 [Jeff Reback] PEP fix 2f48549 [Jeff Reback] fixup slow transforms cc43503 [Stephen Rauch] BUG: GH15429 transform result of timedelta from datetime --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/groupby.py | 34 +++++++++++----------- pandas/tests/groupby/test_filters.py | 1 + pandas/tests/groupby/test_transform.py | 39 +++++++++++++++++++++++++- 4 files changed, 57 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f0e4176472861..7b32cee7f7064 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -637,6 +637,7 @@ Bug Fixes - Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) +- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`) - Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 0.2.0``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 831ca3886773e..2c61a73d6814e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2890,32 +2890,32 @@ def transform(self, func, *args, **kwargs): lambda: getattr(self, func)(*args, **kwargs)) # reg transform - dtype = self._selected_obj.dtype - result = self._selected_obj.values.copy() - + klass = self._selected_obj.__class__ + results = [] wrapper = lambda x: func(x, *args, **kwargs) - for i, (name, group) in enumerate(self): + for name, group in self: object.__setattr__(group, 'name', name) res = wrapper(group) if hasattr(res, 'values'): res = res.values - # may need to astype - try: - common_type = np.common_type(np.array(res), result) - if common_type != result.dtype: - result = result.astype(common_type) - except: - pass - indexer = self._get_index(name) - result[indexer] = res + s = klass(res, indexer) + results.append(s) - result = _possibly_downcast_to_dtype(result, dtype) - return self._selected_obj.__class__(result, - index=self._selected_obj.index, - name=self._selected_obj.name) + from pandas.tools.concat import concat + result = concat(results).sort_index() + + # we will only try to coerce the result type if + # we have a numeric dtype + dtype = self._selected_obj.dtype + if is_numeric_dtype(dtype): + result = _possibly_downcast_to_dtype(result, dtype) + + result.name = self._selected_obj.name + result.index = self._selected_obj.index + return result def _transform_fast(self, func): """ diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 46ddb5a5318fb..de6757786a363 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -216,6 +216,7 @@ def test_filter_against_workaround(self): grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index cf5e9eb26ff13..51920ec642705 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -3,7 +3,7 @@ import numpy as np 
import pandas as pd from pandas.util import testing as tm -from pandas import Series, DataFrame, Timestamp, MultiIndex, concat +from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range from pandas.types.common import _ensure_platform_int from .common import MixIn, assert_fp_equal @@ -190,6 +190,43 @@ def test_transform_bug(self): expected = Series(np.arange(5, 0, step=-1), name='B') assert_series_equal(result, expected) + def test_transform_datetime_to_timedelta(self): + # GH 15429 + # transforming a datetime to timedelta + df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) + expected = pd.Series([ + Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') + + # this does date math without changing result type in transform + base_time = df['A'][0] + result = df.groupby('A')['A'].transform( + lambda x: x.max() - x.min() + base_time) - base_time + assert_series_equal(result, expected) + + # this does date math and causes the transform to return timedelta + result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) + assert_series_equal(result, expected) + + def test_transform_datetime_to_numeric(self): + # GH 10972 + # convert dt to float + df = DataFrame({ + 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) + result = df.groupby('a').b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()) + + expected = Series([-0.5, 0.5], name='b') + assert_series_equal(result, expected) + + # convert dt to int + df = DataFrame({ + 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) + result = df.groupby('a').b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) + + expected = Series([0, 1], name='b') + assert_series_equal(result, expected) + def test_transform_multiple(self): grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) From d18d0b4727fb02a7d1ae13b8c24680737aa83da7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Feb 2017 15:07:49 -0500 Subject: [PATCH 200/338] BUG: fix groupby.aggregate resulting dtype coercion, xref #11444, #13046 make sure .size includes the name of the grouped --- doc/source/whatsnew/v0.20.0.txt | 4 ++-- pandas/core/groupby.py | 23 ++++++++++++++------ pandas/tests/groupby/test_aggregate.py | 23 ++++++++++++++++++++ pandas/tests/groupby/test_transform.py | 29 +++++++++++++++++++++++++- pandas/tests/tseries/test_resample.py | 6 ++---- 5 files changed, 72 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7b32cee7f7064..9b4e6fbe3be10 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -632,12 +632,12 @@ Bug Fixes - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) -- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`) +- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`) - Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) -- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`) +- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`, :issue:`11444`) - Bug in 
``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 0.2.0``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2c61a73d6814e..3828e5dac5729 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -767,11 +767,14 @@ def _index_with_as_index(self, b): new.names = gp.names + original.names return new - def _try_cast(self, result, obj): + def _try_cast(self, result, obj, numeric_only=False): """ try to cast the result to our obj original type, we may have roundtripped thru object in the mean-time + if numeric_only is True, then only try to cast numerics + and not datetimelikes + """ if obj.ndim > 1: dtype = obj.values.dtype @@ -779,7 +782,8 @@ def _try_cast(self, result, obj): dtype = obj.dtype if not is_scalar(result): - result = _possibly_downcast_to_dtype(result, dtype) + if numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = _possibly_downcast_to_dtype(result, dtype) return result @@ -830,7 +834,7 @@ def _python_agg_general(self, func, *args, **kwargs): for name, obj in self._iterate_slices(): try: result, counts = self.grouper.agg_series(obj, f) - output[name] = self._try_cast(result, obj) + output[name] = self._try_cast(result, obj, numeric_only=True) except TypeError: continue @@ -1117,7 +1121,11 @@ def sem(self, ddof=1): @Appender(_doc_template) def size(self): """Compute group sizes""" - return self.grouper.size() + result = self.grouper.size() + + if isinstance(self.obj, Series): + result.name = getattr(self, 'name', None) + return result sum = _groupby_function('sum', 'add', np.sum) prod = _groupby_function('prod', 'prod', np.prod) @@ -1689,7 +1697,9 @@ def size(self): ids, _, ngroup = self.group_info ids = _ensure_platform_int(ids) out = np.bincount(ids[ids != -1], minlength=ngroup or None) - return Series(out, index=self.result_index, dtype='int64') + return Series(out, + index=self.result_index, + dtype='int64') @cache_readonly def _max_groupsize(self): @@ -2908,7 +2918,8 @@ def transform(self, func, *args, **kwargs): result = concat(results).sort_index() # we will only try to coerce the result type if - # we have a numeric dtype + # we have a numeric dtype, as these are *always* udfs + # the cython take a different path (and casting) dtype = self._selected_obj.dtype if is_numeric_dtype(dtype): result = _possibly_downcast_to_dtype(result, dtype) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index cb739546a2312..52b35048b6762 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -154,6 +154,29 @@ def test_agg_dict_parameter_cast_result_dtypes(self): assert_series_equal(grouped.time.last(), exp['time']) assert_series_equal(grouped.time.agg('last'), exp['time']) + # count + exp = pd.Series([2, 2, 2, 2], + index=Index(list('ABCD'), name='class'), + name='time') + assert_series_equal(grouped.time.agg(len), exp) + assert_series_equal(grouped.time.size(), exp) + + exp = pd.Series([0, 1, 1, 2], + index=Index(list('ABCD'), name='class'), + name='time') + assert_series_equal(grouped.time.count(), exp) + + def test_agg_cast_results_dtypes(self): + # similar to GH12821 + # xref #11444 + u = [datetime(2015, x + 1, 1) for x in range(12)] + v = list('aaabbbbbbccd') + df = pd.DataFrame({'X': v, 'Y': u}) + + result = df.groupby('X')['Y'].agg(len) + expected = 
df.groupby('X')['Y'].count() + assert_series_equal(result, expected) + def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 51920ec642705..2d21eab5822fe 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -4,7 +4,8 @@ import pandas as pd from pandas.util import testing as tm from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range -from pandas.types.common import _ensure_platform_int +from pandas.types.common import _ensure_platform_int, is_timedelta64_dtype +from pandas.compat import StringIO from .common import MixIn, assert_fp_equal from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -227,6 +228,32 @@ def test_transform_datetime_to_numeric(self): expected = Series([0, 1], name='b') assert_series_equal(result, expected) + def test_transform_casting(self): + # 13046 + data = """ + idx A ID3 DATETIME + 0 B-028 b76cd912ff "2014-10-08 13:43:27" + 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" + 2 B-076 1a682034f8 "2014-10-08 14:29:01" + 3 B-023 b76cd912ff "2014-10-08 18:39:34" + 4 B-023 f88g8d7sds "2014-10-08 18:40:18" + 5 B-033 b76cd912ff "2014-10-08 18:44:30" + 6 B-032 b76cd912ff "2014-10-08 18:46:00" + 7 B-037 b76cd912ff "2014-10-08 18:52:15" + 8 B-046 db959faf02 "2014-10-08 18:59:59" + 9 B-053 b76cd912ff "2014-10-08 19:17:48" + 10 B-065 b76cd912ff "2014-10-08 19:21:38" + """ + df = pd.read_csv(StringIO(data), sep='\s+', + index_col=[0], parse_dates=['DATETIME']) + + result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) + assert is_timedelta64_dtype(result.dtype) + + result = df[['ID3', 'DATETIME']].groupby('ID3').transform( + lambda x: x.diff()) + assert is_timedelta64_dtype(result.DATETIME.dtype) + def test_transform_multiple(self): grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py index 6e999c5b1d276..1535bd665fe8b 100755 --- a/pandas/tests/tseries/test_resample.py +++ b/pandas/tests/tseries/test_resample.py @@ -757,10 +757,8 @@ def test_resample_empty_series(self): freq in ['M', 'D']): # GH12871 - TODO: name should propagate, but currently # doesn't on lower / same frequency with PeriodIndex - assert_series_equal(result, expected, check_dtype=False, - check_names=False) - # this assert will break when fixed - self.assertTrue(result.name is None) + assert_series_equal(result, expected, check_dtype=False) + else: assert_series_equal(result, expected, check_dtype=False) From 99d354448c205869c4b717b348bcf017c3747054 Mon Sep 17 00:00:00 2001 From: "Dr. 
Irv" Date: Mon, 27 Feb 2017 16:43:01 -0500 Subject: [PATCH 201/338] DOC: Update contributing for test_fast, fix doc Windows build (#15523) * DOC: Update contributing for test_fast, fix doc Windows build * add pip install for xdist --- doc/make.py | 4 ++-- doc/source/contributing.rst | 29 ++++++++++++++++++++--------- test_fast.bat | 3 +++ 3 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 test_fast.bat diff --git a/doc/make.py b/doc/make.py index d46be2611ce3d..8a6d4e5df24f0 100755 --- a/doc/make.py +++ b/doc/make.py @@ -202,8 +202,8 @@ def html(): raise SystemExit("Building HTML failed.") try: # remove stale file - os.system('rm source/html-styling.html') - os.system('cd build; rm -f html/pandas.zip;') + os.remove('source/html-styling.html') + os.remove('build/html/pandas.zip') except: pass diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 2f838a3ab2386..83f99b4f01b26 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -520,15 +520,6 @@ submitting code to run the check yourself on the diff:: git diff master | flake8 --diff -Furthermore, we've written a tool to check that your commits are PEP8 great, `pip install pep8radius -`_. Look at PEP8 fixes in your branch vs master with:: - - pep8radius master --diff - -and make these changes with:: - - pep8radius master --diff --in-place - Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -611,6 +602,26 @@ Or with one of the following constructs:: pytest pandas/tests/[test-module].py::[TestClass] pytest pandas/tests/[test-module].py::[TestClass]::[test_method] +Using `pytest-xdist `_, one can +speed up local testing on multicore machines. To use this feature, you will +need to install `pytest-xdist` via:: + + pip install pytest-xdist + +Two scripts are provided to assist with this. These scripts distribute +testing across 4 threads. + +On Unix variants, one can type:: + + test_fast.sh + +On Windows, one can type:: + + test_fast.bat + +This can significantly reduce the time it takes to locally run tests before +submitting a pull request. + For more, see the `pytest `_ documentation. .. 
versionadded:: 0.20.0

diff --git a/test_fast.bat b/test_fast.bat
new file mode 100644
index 0000000000000..17dc54b580137
--- /dev/null
+++ b/test_fast.bat
@@ -0,0 +1,3 @@
+:: test on windows
+set PYTHONHASHSEED=314159265
+pytest --skip-slow --skip-network -m "not single" -n 4 pandas

From 2d18b7fa13c9b7daecb771b68c0bcb56e04cd0e4 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 27 Feb 2017 20:18:21 -0500
Subject: [PATCH 202/338] BUG: fix to_gbq calling convention; now it's a bound
 method of DataFrame

xref #15484
---
 pandas/core/frame.py      | 16 +++++++++++-----
 pandas/util/decorators.py |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7b02926ea8837..0963d14762ce5 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -942,11 +942,6 @@ def to_gbq(self, destination_table, project_id, chunksize=10000,
                    chunksize=chunksize, verbose=verbose, reauth=reauth,
                    if_exists=if_exists, private_key=private_key)
 
-    def _f():
-        from pandas.io.gbq import _try_import
-        return _try_import().to_gbq.__doc__
-    to_gbq = docstring_wrapper(to_gbq, _f)
-
     @classmethod
     def from_records(cls, data, index=None, exclude=None, columns=None,
                      coerce_float=False, nrows=None):
@@ -5430,6 +5425,17 @@ def combineMult(self, other):
 _EMPTY_SERIES = Series([])
 
 
+# patch in the doc-string for to_gbq
+# and bind this method
+def _f():
+    from pandas.io.gbq import _try_import
+    return _try_import().to_gbq.__doc__
+
+
+DataFrame.to_gbq = types.MethodType(docstring_wrapper(DataFrame.to_gbq, _f),
+                                    DataFrame)
+
+
 def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
     """
     Segregate Series based on type and coerce into matrices.
diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py
index d966d6b7a1b32..d1ca480f7a568 100644
--- a/pandas/util/decorators.py
+++ b/pandas/util/decorators.py
@@ -259,7 +259,7 @@ def __init__(self, func, creator, default=None):
             self, func, [attr for attr in self._attrs
                          if hasattr(func, attr)])
 
-    def __call__(self, func, *args, **kwargs):
+    def __call__(self, *args, **kwargs):
return self.func(*args, **kwargs)

From afae7975b0b908739a498e89c8dc2c0e510694e5 Mon Sep 17 00:00:00 2001
From: Sam Foo
Date: Tue, 28 Feb 2017 02:43:22 -0800
Subject: [PATCH 204/338] DEPR: rename consolidate to _consolidate and create
 deprecation warning (#15501)
---
 doc/source/whatsnew/v0.20.0.txt              |  1 +
 pandas/core/generic.py                       | 14 +++++++++++---
 pandas/core/groupby.py                       |  4 ++--
 pandas/io/pytables.py                        |  6 +++---
 pandas/tests/frame/test_block_internals.py   | 11 ++++++++---
 pandas/tests/frame/test_nonunique_indexes.py |  2 +-
 pandas/tests/io/test_pytables.py             | 14 +++++++-------
 pandas/tests/test_generic.py                 |  2 +-
 pandas/tests/test_panel4d.py                 |  2 +-
 pandas/tools/concat.py                       |  2 +-
 pandas/tseries/resample.py                   |  2 +-
 11 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9b4e6fbe3be10..f91ffcdb81f9b 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -493,6 +493,7 @@ Deprecations
 - ``DataFrame.astype()`` has deprecated the ``raise_on_error`` parameter in favor of ``errors`` (:issue:`14878`)
 - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`)
 - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explicit imports (:issue:`15358`)
+- ``Series/DataFrame/Panel.consolidate()`` has been deprecated as a public method. (:issue:`15483`)

 .. _whatsnew_0200.prior_deprecations:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index cdc37e00f70e0..127aac970fbc1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2875,11 +2875,10 @@ def f():

         self._protect_consolidate(f)

-    def consolidate(self, inplace=False):
+    def _consolidate(self, inplace=False):
         """
         Compute NDFrame with "consolidated" internals (data of each dtype
-        grouped together in a single ndarray). Mainly an internal API function,
-        but available here to the savvy user
+        grouped together in a single ndarray).

         Parameters
         ----------
@@ -2898,6 +2897,15 @@ def f():
         cons_data = self._protect_consolidate(f)
         return self._constructor(cons_data).__finalize__(self)

+    def consolidate(self, inplace=False):
+        """
+        DEPRECATED: consolidate will be an internal implementation only.
+ """ + # 15483 + warnings.warn("consolidate is deprecated and will be removed in a " + "future release.", FutureWarning, stacklevel=2) + return self._consolidate(inplace) + @property def _is_mixed_type(self): f = lambda: self._data.is_mixed_type diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3828e5dac5729..381a8edcb5192 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3904,7 +3904,7 @@ def _wrap_aggregated_output(self, output, names=None): if not self.as_index: result = DataFrame(output, columns=output_keys) self._insert_inaxis_grouper_inplace(result) - result = result.consolidate() + result = result._consolidate() else: index = self.grouper.result_index result = DataFrame(output, index=index, columns=output_keys) @@ -3924,7 +3924,7 @@ def _wrap_agged_blocks(self, items, blocks): result = DataFrame(mgr) self._insert_inaxis_grouper_inplace(result) - result = result.consolidate() + result = result._consolidate() else: index = self.grouper.result_index mgr = BlockManager(blocks, [items, index]) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e962cff9913eb..f83dde5048912 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -838,7 +838,7 @@ def func(_start, _stop, _where): # concat and return return concat(objs, axis=axis, - verify_integrity=False).consolidate() + verify_integrity=False)._consolidate() # create the iterator it = TableIterator(self, s, func, where=where, nrows=nrows, @@ -3445,7 +3445,7 @@ def get_blk_items(mgr, blocks): return [mgr.items.take(blk.mgr_locs) for blk in blocks] # figure out data_columns and get out blocks - block_obj = self.get_object(obj).consolidate() + block_obj = self.get_object(obj)._consolidate() blocks = block_obj._data.blocks blk_items = get_blk_items(block_obj._data, blocks) if len(self.non_index_axes): @@ -3812,7 +3812,7 @@ def read(self, where=None, columns=None, **kwargs): if len(objs) == 1: wp = objs[0] else: - wp = concat(objs, axis=0, verify_integrity=False).consolidate() + wp = concat(objs, axis=0, verify_integrity=False)._consolidate() # apply the selection filters & axis orderings wp = self.process_axes(wp, columns=columns) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 7b64dea8c102d..accd3ddeb03d7 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -40,19 +40,24 @@ def test_cast_internals(self): def test_consolidate(self): self.frame['E'] = 7. - consolidated = self.frame.consolidate() + consolidated = self.frame._consolidate() self.assertEqual(len(consolidated._data.blocks), 1) # Ensure copy, do I want this? - recons = consolidated.consolidate() + recons = consolidated._consolidate() self.assertIsNot(recons, consolidated) assert_frame_equal(recons, consolidated) self.frame['F'] = 8. 
self.assertEqual(len(self.frame._data.blocks), 3) - self.frame.consolidate(inplace=True) + self.frame._consolidate(inplace=True) self.assertEqual(len(self.frame._data.blocks), 1) + def test_consolidate_deprecation(self): + self.frame['E'] = 7 + with tm.assert_produces_warning(FutureWarning): + self.frame.consolidate() + def test_consolidate_inplace(self): frame = self.frame.copy() # noqa diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 4ad88a12a2625..d6bcb85e01910 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -87,7 +87,7 @@ def check(result, expected=None): check(df, expected) # consolidate - df = df.consolidate() + df = df._consolidate() expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index a840ff46aa845..d5a8b380d01f9 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -418,7 +418,7 @@ def test_repr(self): df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[3:6, ['obj1']] = np.nan - df = df.consolidate()._convert(datetime=True) + df = df._consolidate()._convert(datetime=True) warnings.filterwarnings('ignore', category=PerformanceWarning) store['df'] = df @@ -762,7 +762,7 @@ def test_put_mixed_type(self): df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[3:6, ['obj1']] = np.nan - df = df.consolidate()._convert(datetime=True) + df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df') @@ -2077,7 +2077,7 @@ def test_table_mixed_dtypes(self): df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[3:6, ['obj1']] = np.nan - df = df.consolidate()._convert(datetime=True) + df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: store.append('df1_mixed', df) @@ -2091,7 +2091,7 @@ def test_table_mixed_dtypes(self): wp['bool2'] = wp['ItemB'] > 0 wp['int1'] = 1 wp['int2'] = 2 - wp = wp.consolidate() + wp = wp._consolidate() with ensure_clean_store(self.path) as store: store.append('p1_mixed', wp) @@ -2106,7 +2106,7 @@ def test_table_mixed_dtypes(self): wp['bool2'] = wp['l2'] > 0 wp['int1'] = 1 wp['int2'] = 2 - wp = wp.consolidate() + wp = wp._consolidate() with ensure_clean_store(self.path) as store: store.append('p4d_mixed', wp) @@ -2134,7 +2134,7 @@ def test_unimplemented_dtypes_table_columns(self): df['obj1'] = 'foo' df['obj2'] = 'bar' df['datetime1'] = datetime.date(2001, 1, 2) - df = df.consolidate()._convert(datetime=True) + df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: # this fails because we have a date in the object block...... 
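# --- illustrative aside, not part of the patch -----------------------------
# "Consolidation" merges the per-dtype blocks that back a DataFrame into
# single ndarrays, which is what the block counts in the tests above and
# below observe. A minimal sketch, assuming a pandas of this era
# (``_data`` and ``_consolidate`` are internal and subject to change):
#
#     import pandas as pd
#     df = pd.DataFrame({'A': [1, 2, 3]})   # one int64 block
#     df['B'] = 1.5                         # adds a float64 block
#     df['C'] = 2.5                         # adds another float64 block
#     len(df._data.blocks)                  # -> 3, not yet consolidated
#     df = df._consolidate()
#     len(df._data.blocks)                  # -> 2, float blocks merged
# ----------------------------------------------------------------------------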
@@ -2949,7 +2949,7 @@ def _make_one(): df['bool2'] = df['B'] > 0 df['int1'] = 1 df['int2'] = 2 - return df.consolidate() + return df._consolidate() df1 = _make_one() df2 = _make_one() diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 40cdbe083acd7..a2329e2d1768e 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -658,7 +658,7 @@ def test_validate_bool_args(self): super(DataFrame, df).sort_index(inplace=value) with self.assertRaises(ValueError): - super(DataFrame, df).consolidate(inplace=value) + super(DataFrame, df)._consolidate(inplace=value) with self.assertRaises(ValueError): super(DataFrame, df).fillna(value=0, inplace=value) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 902b42e7d77d7..2491bac2a7f19 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -677,7 +677,7 @@ def test_consolidate(self): self.panel4d['foo'] = 1. self.assertFalse(self.panel4d._data.is_consolidated()) - panel4d = self.panel4d.consolidate() + panel4d = self.panel4d._consolidate() self.assertTrue(panel4d._data.is_consolidated()) def test_ctor_dict(self): diff --git a/pandas/tools/concat.py b/pandas/tools/concat.py index 31d7a9eb9a01a..6405106118472 100644 --- a/pandas/tools/concat.py +++ b/pandas/tools/concat.py @@ -263,7 +263,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, raise TypeError("cannot concatenate a non-NDFrame object") # consolidate - obj.consolidate(inplace=True) + obj._consolidate(inplace=True) ndims.add(obj.ndim) # get the sample diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index a6a10c08966d6..75e550a065fd2 100755 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -221,7 +221,7 @@ def _convert_obj(self, obj): ------- obj : converted object """ - obj = obj.consolidate() + obj = obj._consolidate() return obj def _get_binner_for_time(self): From 4116d9ebd987775de19e5fafe8e9e3d26c27931b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 Feb 2017 09:17:08 -0500 Subject: [PATCH 205/338] DEPR: remove pd.TimeSeries & Series.is_time_series xref #10890 Author: Jeff Reback Closes #15098 from jreback/time_series and squashes the following commits: d9101bc [Jeff Reback] fix back-compat for < 0.13 ed57bd5 [Jeff Reback] DEPR: remove legacy pd.TimeSeries class in favor of pd.Series --- doc/source/whatsnew/v0.20.0.txt | 42 ++++++++++++++++++++- pandas/compat/pickle_compat.py | 5 ++- pandas/core/api.py | 2 +- pandas/core/series.py | 17 --------- pandas/io/pytables.py | 4 -- pandas/tests/api/test_api.py | 2 +- pandas/tests/indexes/data/s1-0.12.0.pickle | Bin 862 -> 0 bytes pandas/tests/indexes/data/s2-0.12.0.pickle | Bin 814 -> 0 bytes pandas/tests/indexes/test_base.py | 11 ------ pandas/tests/io/data/legacy_hdf/legacy.h5 | Bin 14928 -> 0 bytes pandas/tests/io/test_pytables.py | 9 ----- pandas/tests/series/test_alter_axes.py | 3 -- pandas/tests/series/test_constructors.py | 15 -------- pandas/tests/series/test_timeseries.py | 2 - 14 files changed, 47 insertions(+), 65 deletions(-) delete mode 100644 pandas/tests/indexes/data/s1-0.12.0.pickle delete mode 100644 pandas/tests/indexes/data/s2-0.12.0.pickle delete mode 100644 pandas/tests/io/data/legacy_hdf/legacy.h5 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f91ffcdb81f9b..671df5760fb84 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -246,6 +246,46 @@ Using ``.iloc``. 
Here we will get the location of the 'A' column, then use *posi
 df.iloc[[0, 2], df.columns.get_loc('A')]

+.. _whatsnew.api_breaking.io_compat:
+
+Possible incompat for HDF5 formats for pandas < 0.13.0
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``pd.TimeSeries`` was deprecated officially in 0.17.0, though it has only been an alias since 0.13.0. It has
+been dropped in favor of ``pd.Series``. (:issue:`15098`).
+
+This *may* cause HDF5 files that were created in prior versions to become unreadable if ``pd.TimeSeries``
+was used. This is most likely to be for pandas < 0.13.0. If you find yourself in this situation,
+you can use a recent prior version of pandas to read in your HDF5 files,
+then write them out again after applying the procedure below.
+
+.. code-block:: ipython
+
+   In [2]: s = pd.TimeSeries([1,2,3], index=pd.date_range('20130101', periods=3))
+
+   In [3]: s
+   Out[3]:
+   2013-01-01 1
+   2013-01-02 2
+   2013-01-03 3
+   Freq: D, dtype: int64
+
+   In [4]: type(s)
+   Out[4]: pandas.core.series.TimeSeries
+
+   In [5]: s = pd.Series(s)
+
+   In [6]: s
+   Out[6]:
+   2013-01-01 1
+   2013-01-02 2
+   2013-01-03 3
+   Freq: D, dtype: int64
+
+   In [7]: type(s)
+   Out[7]: pandas.core.series.Series
+
+
 .. _whatsnew_0200.api_breaking.index_map:

 Map on Index types now return other Index types
@@ -507,7 +547,7 @@ Removal of prior version deprecations/changes
   Similar functionality can be found in the `Google2Pandas `__ package.
 - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`)
 - ``pandas.stats.fama_macbeth``, ``pandas.stats.ols``, ``pandas.stats.plm`` and ``pandas.stats.var``, as well as the top-level ``pandas.fama_macbeth`` and ``pandas.ols`` routines are removed. Similar functionality can be found in the `statsmodels `__ package. (:issue:`11898`)
-
+- ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:``)

 .. _whatsnew_0200.performance:

diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
index 240baa848adbc..b8ccd13c153d4 100644
--- a/pandas/compat/pickle_compat.py
+++ b/pandas/compat/pickle_compat.py
@@ -58,7 +58,10 @@ def load_reduce(self):

     # 15477
     ('pandas.core.base', 'FrozenNDArray'): ('pandas.indexes.frozen', 'FrozenNDArray'),
-    ('pandas.core.base', 'FrozenList'): ('pandas.indexes.frozen', 'FrozenList')
+    ('pandas.core.base', 'FrozenList'): ('pandas.indexes.frozen', 'FrozenList'),
+
+    # 10890
+    ('pandas.core.series', 'TimeSeries'): ('pandas.core.series', 'Series')
 }

diff --git a/pandas/core/api.py b/pandas/core/api.py
index 177e7b31cbd4f..eaebf45a038a0 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -13,7 +13,7 @@
                            UInt64Index, RangeIndex, Float64Index,
                            MultiIndex)

-from pandas.core.series import Series, TimeSeries
+from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.panel import Panel, WidePanel
 from pandas.core.panel4d import Panel4D

diff --git a/pandas/core/series.py b/pandas/core/series.py
index da47ab5dfb003..ffe1be26fda54 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -277,13 +277,6 @@ def _constructor_expanddim(self):
     def _can_hold_na(self):
         return self._data._can_hold_na

-    @property
-    def is_time_series(self):
-        warnings.warn("is_time_series is deprecated.
Please use " - "Series.index.is_all_dates", FutureWarning, stacklevel=2) - # return self._subtyp in ['time_series', 'sparse_time_series'] - return self.index.is_all_dates - _index = None def _set_axis(self, axis, labels, fastpath=False): @@ -2985,16 +2978,6 @@ def create_from_value(value, index, dtype): return subarr -# backwards compatiblity -class TimeSeries(Series): - - def __init__(self, *args, **kwargs): - # deprecation TimeSeries, #10890 - warnings.warn("TimeSeries is deprecated. Please use Series", - FutureWarning, stacklevel=2) - - super(TimeSeries, self).__init__(*args, **kwargs) - # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f83dde5048912..a8fdd5fcfe204 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -23,8 +23,6 @@ from pandas.types.missing import array_equivalent import numpy as np - -import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, MultiIndex, Int64Index, isnull, concat, SparseSeries, SparseDataFrame, PeriodIndex, @@ -166,7 +164,6 @@ class DuplicateWarning(Warning): Series: u('series'), SparseSeries: u('sparse_series'), - pd.TimeSeries: u('series'), DataFrame: u('frame'), SparseDataFrame: u('sparse_frame'), Panel: u('wide'), @@ -175,7 +172,6 @@ class DuplicateWarning(Warning): # storer class map _STORER_MAP = { - u('TimeSeries'): 'LegacySeriesFixed', u('Series'): 'LegacySeriesFixed', u('DataFrame'): 'LegacyFrameFixed', u('DataMatrix'): 'LegacyFrameFixed', diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 90a0c1d5c9347..8ca369f8df83a 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -57,7 +57,7 @@ class TestPDApi(Base, tm.TestCase): 'TimedeltaIndex', 'Timestamp'] # these are already deprecated; awaiting removal - deprecated_classes = ['TimeSeries', 'WidePanel', + deprecated_classes = ['WidePanel', 'SparseTimeSeries', 'Panel4D', 'SparseList'] diff --git a/pandas/tests/indexes/data/s1-0.12.0.pickle b/pandas/tests/indexes/data/s1-0.12.0.pickle deleted file mode 100644 index 0ce9cfdf3aa94fdfd9f8ad6ea00e72fa7eda6552..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 862 zcmZo*O3o|IEvVE>&M!*U%Pq|*$xJLNO049HFG@|$&nqq|DorloDr8J9NX$z~EQTm6 zPA$qzE#?Zz%uNl3FbkQy8CpXbliGs{nKir_z2#aV#&V^UR2HOi6|#gfrCE40cryYO zuxfZShcepu`T2SM2LdqR%}|om8T!FSUEoT>tcUxr)s_8Ijs3LWI_;WmV#&?@vK#N0 zoL>KQf8a;{X^E~M_s?K#x%;8zsr`i_ZuK`-fA@cJS*24K`*Oe8y*l?PA3p8(`ei62 zbnS%w5$TfaS1jJ`*NZZ%)|&BS|JvJ~j)$H<-hVEx)^DEoqx~v>jx4H;Il6z+xzj(_ zHvYH2f1A@KgW>9a)gQa(NFF=8|Ec5GEr(Bh*?<3;k-CNYJNvscUe7tHch>&PvcCaG zj~}sj+&X>D?sxa?tt`$aH9h;b|8i~p|IC=r_VS!ZE_W{fxSv5`anj?~XZ!izWbc?U z_u77j@IL~N8t(6RE7W>y+IiN#Bm*9>C6GYX%gjqjt>AJ=EJ-Z^2CpZWSI7nrU3N(5 zW&i`!7Z#u#8s03J0a~jO%9K|Oj0+$vX#)o@1H;3TA1{@885j)MdM#9dbp3S2b`X6| zW7|~_ExXH0MHNVM<#7n80qMy9bK*esiV22mLG*XUW4}SP+vX@Qbs%k=E@7qtq?>-M zX#vp}I~^~BXa(tOikd(=e@#{ah>l6nh|vPlt3U2p1)_g71^xulo+8iOwSn}ErB%~F zbZewVlMax+^6Jb55UpI3D6b2oi+I1~g6Mh<-JF)+2r15n#=YH_g(Fr^oAYBm;f)fRGVgmUKP z7v(1AWLBjX@^pr>K~xv=CZ|B7g9{WVDXBRniCl$zz~l^szy!@7%9354Qd}rd8_JmH R%~+b`*WQ-o*VdM#2LML=S0Vra diff --git a/pandas/tests/indexes/data/s2-0.12.0.pickle b/pandas/tests/indexes/data/s2-0.12.0.pickle deleted file mode 100644 index 2318be2d9978bd2edefc4fe1afb2244e0f4c4601..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 814 zcmZY7Z%7ki9KiA0G;?Q5kzN!cDrru$iA6FbI2KcLS0k+>lx5wvGnekR`!h;wro&Cv 
zs|e9i=8K{d(<1sqr0`HpGi+k`r@(CBaFL0)GE-C5b1r?;J#e2p`0>2>@~ao?X{*Un zBiEZ*N^Y`N^G1?o$r|(wOX9fc?+Ad{*{T=rTjKyZwHh~7?*j!)rvISJi}9740r_w|xsf(d z7f}_-Q#{OAwEo9LZC2bGu2>1f9oq;OEyE-K4`-7RVw!`^;o+U@84+Yi_IxZ=iXe$E z)v0Mvl#Y(u11$};l?!%U%jp*UoznkDu;59VIvsl8+|FDX)V|c!DEPsp91%=T7*EN7 zn!Cn+Dzz!U~i}3ioJMQuxLl1a4*z-x3)#lQL3nVSi_Bq=86iA;yLuNl{3;5 z$Eo1vGLno22DLFUGD1R|Srb`ptfiQ3E+Q&C%}Dizf7wx?y@9IcNtD?R*ApJNps_?` Y)dd5`#MuZDjf<>0O_NinaXNMKziLWDZU6uP diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2f5b98d145e57..79d10cbda565e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,8 +9,6 @@ from pandas.compat import (range, lrange, lzip, u, text_type, zip, PY3, PY36) import operator -import os - import numpy as np from pandas import (period_range, date_range, Series, @@ -381,15 +379,6 @@ def test_view_with_args(self): # with arguments ind.view('i8') - def test_legacy_pickle_identity(self): - - # GH 8431 - pth = tm.get_data_path() - s1 = pd.read_pickle(os.path.join(pth, 's1-0.12.0.pickle')) - s2 = pd.read_pickle(os.path.join(pth, 's2-0.12.0.pickle')) - self.assertFalse(s1.index.identical(s2.index)) - self.assertFalse(s1.index.equals(s2.index)) - def test_astype(self): casted = self.intIndex.astype('i8') diff --git a/pandas/tests/io/data/legacy_hdf/legacy.h5 b/pandas/tests/io/data/legacy_hdf/legacy.h5 deleted file mode 100644 index 38b822dd169945b5f5022a219cc784a4c9951320..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 14928 zcmeHN2|Sfq8^0(aNlls-)tzdjME0fVxU%M2vRxdOIp-4O%o+G)mJFVB)2b> zR!SU1C>2wKk%ViP>|4lp-|f3TrP24zeA85~pZ9;3=RNN^&w0-Cod0=msJ^!D6rnjn zB;*nhAWa~NqDkU5j9;cJu@WLr)H~n_9vt`K$l}EkJS2V+Qa1}P7scBr^I-)Ic)5j{ zjt+?xV&MLp&<5Sx#dQlO6X@v4 zboJ-jvnC<%foN`QqRqpJUbwlE1SY({0m44XV(*tD`rjx1emOE;okBGHe({gCmWa=V zGq5h2;yva1qmRSi@o+pO>H~2{Nc3A2alo7lu`T4{<5&bDf)pd0oQZgzmzNiPQh?(x zkpv%)=wuFXuH|@(SZ+FQQAnYL8L#nZJ&g3@h=~&qQ_RdLHk>?H{%t>wm?$AHFQdT8 zjg%jsp3#RmH_ZNd|84&%#6lnhMj>IQ`*3}Ys{U-VOdXnvA;WUrCc2Wbs;$4VqShJ} zIjX5?fRhr{z{Fpkrf2DE%kXtIwAHtuu3=dC2T-?K(3q;$dQMs@6cq;3NZU8qc8i|5 zk-s5b(NLKYpcD|`tK{WG*L77=Hc{T>>`Hf)w=&Z6u-37qF^z1Tf&y*b6cv2dIT^?) zY+7UKq2;ckX1>nD*2%!%YK@Y=v5m67vBD->McqJy&1tzUG$3 zI*yK;186Q@pNf@cND*qJh?G2|6krD7KXgv^Qm=)}oqlUGwV+&J~O5cICj* zZ3za2T@7%CnqcS7NhPrKzVO7_;w)%hQd0SAPAL#ns~O-CZ-)!geN}8D?f{aDII|pC z85vqzQ52`|L~%>R9u%M62t~0$G7?2?lNc0ZnLndA7|lYFmUR(DPo*>z^VVmfxOjIi zin~|^D3(7eLa~ifg(B6x4n@|%W)vsfXhYHASvQIS#uJ|-{VNO*LUDQYR227R&OouI zem07Iwo6g8_)Z4Jqyz;Ng&(P+=-#D)Am6H*nxpxH@cG52z&*pcu*kXnY4D6bSjueJ zb|fepyf|qhq?%F&9d15}kIHxfLlQ#r{F|Zy%cxH*n%w{rCy*q>O5ecU*L-{u=)VDx z74z!y5*lE)Mc>BRU)O-C8fUdv);)s9&UA0DkS>FEGn4E#oahG^=1(r`yD$Xp?b3gU zF{^?_#mfd(@TCF%FvI%P^-(}Tl=n>X+-D#(M_R_ZI~59xSlOOl(+LNbP{rBOU2x@G z+bP~bselrebh>&OoVZ&L%t5fqn zyZNPPt|b3$z!~kUW{Ol4H5i_JP$EU5=nCbhFrcP zjJCsY^mcLd-xv_ga76`>u!K+vB0g$61~}^aIeKBWNcp?@3Rwngf4qF9bOk0)?DMeP zm;2|U0b~t~94imb_jz!`n~zt1sNHkrksYT1X)}lLAWb25J#wGuAK*v#c0=r05Vdtx z{JmrKgAd=YP9phZ(;jV#XpOmW^R zOp--MjPM3zKHlYTM$U`5s~;WXrSJALm0x2D9Iu}_F&mR7_T9Mpzrbgrfw9g-_ThF! 
zsNs)37a@3YtT^@t8vY{gy86(#kI9eZy(8oPe|vt(Lz;}mdffjJ{jQvYb(qWNMNv8A zT$1pAh{V`_Z}}9gg};~I8_nnU{mur3@GG>r*l54pF%66rN4fStzPQ4be>=G5fA2AhT7S)Q2Cm-{EDfzffDjE~R(wvRLg{NYV{=NJk7T1x{Vwdb9JTV4fdmIwU zfjou)UQGC(6N}$3_o>wWb#Yw?wNn}%SVs2)r=9=t-3bvLdw)8J+cEJZHbb8pFX#HD zxA#vYc{lew`c^5s8AQHx5|XJbfk|X(E&H4% zaQ^ZJT`QLs&e?tU9owU?p|l+TuR8(4zu83%;mnN(w*>z zaonK#&H->cQAuqmGXs{{x5*7|dkJ@_UcILGT_sd^lrpEfJq2+hZ#vKK?Sm}WBX{eL zS3>&4V;&tBs=!sZZjJ2kSztwE((`2YSs+81)*r!}1`gD0$l4{526@W;4d!_~ftUFQ zXY2F6fT;^3ykgqw;NI=5r$JAj1Njv$p356`EsK<~ot!xElTJ1qTrc8xes44MovI^g*j@^L zig-kQl2QQV*H;&a=rzFRy^^p_fQwL(Q+hY~rA_posTSZ9zgWq+Jd@pTpuLmc6>8NE#b;eRs2JS{dG#3a*_R4HB14?^t9~UoK>eO&|N1eI1pSuP@!0*L2D|tF*LH-l$TM=GQ2^FkY8tclco|j2e>lmA(87q>#g^u6I^Lnad?YafwghNv2`3pFSH*77_Tt zG`tdiZr=_G-d2fkbXd}uEBjF1hJ5!ia^xJ66B|XYNBoldqJhsw17q>bJbYadiukj5 zCT$t^1{(e%?tbp$=9y$Utiy!+_;8--z7%WW@69t$;CjHP?=2vBW*+|i{4D{z0NaIg zGT6`6#uEEv%184%f_HH3e|+bdT=}>8&%4i$|7f0xetPE23&EJMDf)l&9IhTd)NlOn zdVV+0#J;<0{+rKZ7XHWc%tOkU#{XKL`Dg3@lUev*2J)9^pky^Kxc}@KX#Tm8-d*E6 zUGwW;!?CP=Cpmnt9+t-J>}ml&9&epOmP-J!?hcGxk8+s4%P_<(z7x)EbW$*w*9aay zc~u>@{yw<+u(sc30vpEMuav1=R0XNM$>Jt^TEY3;_Wc*<6oPuWc7fyO9bms{_T7rs zSFlt3)PtNDrw9I9t3jdI$><+ezJ`19^|XfVs-V=JoCVvGdO?=N#YHyWzW{qpSiF2e zEszYk{b2A|Jx~ysrt3{@0z6UIQqGd^fmTJv-lMv$;2)i~X@{;S0K343@8m3dfpht` zqmifYf(=!V8`xXeFvF&g{5_=#lz7s7ww-SW`aAk}2(Ntw*{-HLX!onYy!Gl^4((_F zvZu~IIrQ=gbU&T_;^d)PxKvtVi|*lWu(@jcw~>!{GCDTNOpZVI3W$6iQhM064SYp) z6MQ;ltuv5t^$X0lVP*>1@ebc!d_0x?i9g773V>!UxKr z+ghXP*Y|Wo+5@RW^$%{t#i?E4TCg4%T=jc(di^y(qu5^d=^cQASsFlS#ubp@F!P>y zOB?JPvba}t^(;K2bP6O!6v81dp`FrlQK0X5wm_(CCS2^Uus7g!4M^O*Dk!XY7`UYQ z(+xY*;R*5liEW_|U_|hip)A!dC{etOA?QrXXg+Ls>e!2VsC{f1Fwd_52f{br-TpKa z9L;fy6mzWrb`Q7aR;yP7i+Hn+21Pd1F7mLf%pCyYB7&6zce0_Lg44AMhblOHxZ;5Q z<3iY4U(@HY_9f&yAtjX_(+FO=#b!&LZUzZ`hWl?l=mJ-7?Dd;5aR8V+HVqNqT?bko x8?i3QR6}yy+Rzlh2C-)qD{|JB!30;^z>@kW@LtB&8lx@sKrE{CS2Ol6@LyVSHP`?E diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index d5a8b380d01f9..821d9956a2dfa 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4454,15 +4454,6 @@ def test_pytables_native2_read(self): d1 = store['detector'] self.assertIsInstance(d1, DataFrame) - def test_legacy_read(self): - with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy.h5'), - mode='r') as store: - store['a'] - store['b'] - store['c'] - store['d'] - def test_legacy_table_read(self): # legacy table types with ensure_clean_store( diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 6473dbeeaa1bc..5997b91097cbc 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -107,9 +107,6 @@ def test_set_index_makes_timeseries(self): s = Series(lrange(10)) s.index = idx - - with tm.assert_produces_warning(FutureWarning): - self.assertTrue(s.is_time_series) self.assertTrue(s.index.is_all_dates) def test_reset_index(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index aef4c9269bc62..c15171f331df3 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -37,22 +37,11 @@ def test_scalar_conversion(self): self.assertEqual(int(Series([1.])), 1) self.assertEqual(long(Series([1.])), 1) - def test_TimeSeries_deprecation(self): - - # deprecation TimeSeries, #10890 - with tm.assert_produces_warning(FutureWarning): - pd.TimeSeries(1, index=date_range('20130101', periods=3)) - def test_constructor(self): - # Recognize TimeSeries - with 
tm.assert_produces_warning(FutureWarning): - self.assertTrue(self.ts.is_time_series) self.assertTrue(self.ts.index.is_all_dates) # Pass in Series derived = Series(self.ts) - with tm.assert_produces_warning(FutureWarning): - self.assertTrue(derived.is_time_series) self.assertTrue(derived.index.is_all_dates) self.assertTrue(tm.equalContents(derived.index, self.ts.index)) @@ -64,11 +53,7 @@ def test_constructor(self): self.assertEqual(mixed.dtype, np.object_) self.assertIs(mixed[1], np.NaN) - with tm.assert_produces_warning(FutureWarning): - self.assertFalse(self.empty.is_time_series) self.assertFalse(self.empty.index.is_all_dates) - with tm.assert_produces_warning(FutureWarning): - self.assertFalse(Series({}).is_time_series) self.assertFalse(Series({}).index.is_all_dates) self.assertRaises(Exception, Series, np.random.randn(3, 3), index=np.arange(3)) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index e0db813e60c14..8c22b3f047210 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -383,8 +383,6 @@ def test_mpl_compat_hack(self): def test_timeseries_coercion(self): idx = tm.makeDateIndex(10000) ser = Series(np.random.randn(len(idx)), idx.astype(object)) - with tm.assert_produces_warning(FutureWarning): - self.assertTrue(ser.is_time_series) self.assertTrue(ser.index.is_all_dates) self.assertIsInstance(ser.index, DatetimeIndex) From 4a3035b8c0b1090ff159c67aa6c9adb8b8eaac88 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 28 Feb 2017 09:26:10 -0500 Subject: [PATCH 206/338] BUG: DataFrame index & column returned by corr & cov are the same (#14617) closes #14617 Author: Matt Roeschke Closes #15528 from mroeschke/fix_14617 and squashes the following commits: 5a46f0a [Matt Roeschke] Bug:DataFrame index & column returned by corr & cov are the same (#14617) --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/frame.py | 6 ++++-- pandas/tests/frame/test_analytics.py | 9 +++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 671df5760fb84..54df7514a882d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -631,7 +631,7 @@ Bug Fixes - Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`) - +- Bug in ``.corr()`` and ``.cov()`` where the column and index were the same object (:issue:`14617`) - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c47490bfbede4..021ce59e3402b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4725,6 +4725,7 @@ def corr(self, method='pearson', min_periods=1): """ numeric_df = self._get_numeric_data() cols = numeric_df.columns + idx = cols.copy() mat = numeric_df.values if method == 'pearson': @@ -4757,7 +4758,7 @@ def corr(self, method='pearson', min_periods=1): correl[i, j] = c correl[j, i] = c - return self._constructor(correl, index=cols, columns=cols) + return self._constructor(correl, index=idx, columns=cols) def cov(self, min_periods=None): """ @@ -4780,6 +4781,7 @@ def cov(self, min_periods=None): """ numeric_df = self._get_numeric_data() cols = numeric_df.columns + idx = cols.copy() mat = numeric_df.values if notnull(mat).all(): @@ -4793,7 +4795,7 @@ def cov(self, min_periods=None): baseCov = _algos.nancorr(_ensure_float64(mat), cov=True, minp=min_periods) - return self._constructor(baseCov, 
index=cols, columns=cols) + return self._constructor(baseCov, index=idx, columns=cols) def corrwith(self, other, axis=0, drop=False): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 1f0d16e959cd7..111195363beb2 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -118,6 +118,15 @@ def test_corr_int_and_boolean(self): for meth in ['pearson', 'kendall', 'spearman']: tm.assert_frame_equal(df.corr(meth), expected) + def test_corr_cov_independent_index_column(self): + # GH 14617 + df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), + columns=list("abcd")) + for method in ['cov', 'corr']: + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) + def test_cov(self): # min_periods no NAs (corner case) expected = self.frame.cov() From 95f545d44e4b73fd30560c68caa5c3b5f2481225 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Mar 2017 14:16:50 -0500 Subject: [PATCH 207/338] BLD: fix 2.7_LOCALE build simplify install_travis.sh, always using a clean miniconda install bump matplotlib to 1.4.0 for _LOCALE, numpy to 1.8.2 So this removes our testing of mpl 1.2.1 with numpy 1.7.1. We *still* test elsewhere 1.7.1, and mpl 1.3.1 (with 1.8.2) Author: Jeff Reback Closes #15540 from jreback/build and squashes the following commits: 58b6f2f [Jeff Reback] BLD: fix 2.7_LOCALE build --- ci/install_travis.sh | 88 +++++++++++++++----------------- ci/requirements-2.7_LOCALE.build | 2 +- ci/requirements-2.7_LOCALE.run | 8 ++- 3 files changed, 44 insertions(+), 54 deletions(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 802d8c9f6b776..b337f6e443be2 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -32,58 +32,50 @@ edit_init home_dir=$(pwd) echo "[home_dir: $home_dir]" -MINICONDA_DIR="$HOME/miniconda3" +# install miniconda +echo "[Using clean Miniconda install]" -if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE" ]; then - echo "[Miniconda install already present from cache: $MINICONDA_DIR]" - - conda config --set always_yes yes --set changeps1 no || exit 1 - echo "[update conda]" - conda update -q conda || exit 1 - - # Useful for debugging any issues with conda - conda info -a || exit 1 - - # set the compiler cache to work - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then - echo "[Using ccache]" - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - gcc=$(which gcc) - echo "[gcc: $gcc]" - ccache=$(which ccache) - echo "[ccache: $ccache]" - export CC='ccache gcc' - fi +MINICONDA_DIR="$HOME/miniconda3" +if [ -d "$MINICONDA_DIR" ]; then + rm -rf "$MINICONDA_DIR" +fi +# install miniconda +if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 +else + wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 +fi +bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 + +echo "[update conda]" +conda config --set ssl_verify false || exit 1 +conda config --set always_yes true --set changeps1 false || exit 1 +conda update -q conda + +# add the pandas channel to take priority +# to add extra packages +echo "[add channels]" +conda config --add channels pandas || exit 1 +conda config --remove channels defaults || exit 1 +conda config --add channels defaults || exit 1 + +conda install anaconda-client + +# Useful for debugging any issues with conda +conda info -a || exit 1 + +# 
set the compiler cache to work
+if [ "$USE_CACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then
+    echo "[Using ccache]"
+    export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH
+    gcc=$(which gcc)
+    echo "[gcc: $gcc]"
+    ccache=$(which ccache)
+    echo "[ccache: $ccache]"
+    export CC='ccache gcc'
 else
-    echo "[Using clean Miniconda install]"
     echo "[Not using ccache]"
-    rm -rf "$MINICONDA_DIR"
-    # install miniconda
-    if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-        wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1
-    else
-        wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1
-    fi
-    bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
-
-    echo "[update conda]"
-    conda config --set ssl_verify false || exit 1
-    conda config --set always_yes true --set changeps1 false || exit 1
-    conda update -q conda
-
-    # add the pandas channel to take priority
-    # to add extra packages
-    echo "[add channels]"
-    conda config --add channels pandas || exit 1
-    conda config --remove channels defaults || exit 1
-    conda config --add channels defaults || exit 1
-
-    conda install anaconda-client
-
-    # Useful for debugging any issues with conda
-    conda info -a || exit 1
-
 fi

 # may have installation instructions for this build

diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build
index c17730b912651..28e2b96851eff 100644
--- a/ci/requirements-2.7_LOCALE.build
+++ b/ci/requirements-2.7_LOCALE.build
@@ -1,4 +1,4 @@
 python-dateutil
 pytz=2013b
-numpy=1.7.1
+numpy=1.8.2
 cython=0.23

diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run
index 1a9b42d832b0b..5d7cc31b7d55e 100644
--- a/ci/requirements-2.7_LOCALE.run
+++ b/ci/requirements-2.7_LOCALE.run
@@ -1,16 +1,14 @@
 python-dateutil
 pytz=2013b
-numpy=1.7.1
+numpy=1.8.2
 xlwt=0.7.5
 openpyxl=1.6.2
 xlsxwriter=0.4.6
 xlrd=0.9.2
 bottleneck=0.8.0
-matplotlib=1.2.1
-patsy=0.1.0
+matplotlib=1.3.1
 sqlalchemy=0.8.1
 html5lib=1.0b2
 lxml=3.2.1
-scipy=0.11.0
+scipy
 beautiful-soup=4.2.1
-bigquery=2.0.17

From 9732cd651c4e393325edbbb71cd2963b5c144df7 Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash
Date: Wed, 1 Mar 2017 16:16:12 -0500
Subject: [PATCH 208/338] PERF: Rank categorical perf

closes #15498

Author: Prasanjit Prakash

Closes #15518 from ikilledthecat/rank_categorical_perf and squashes the following commits:

30b49b9 [Prasanjit Prakash] PERF: GH15498 - pep8 changes
ad38544 [Prasanjit Prakash] PERF: GH15498 - asv tests and whatsnew
1ebdb56 [Prasanjit Prakash] PERF: categorical rank GH#15498
a67cd85 [Prasanjit Prakash] PERF: categorical rank GH#15498
81df7df [Prasanjit Prakash] PERF: categorical rank GH#15498
45dd125 [Prasanjit Prakash] PERF: categorical rank GH#15498
33249b3 [Prasanjit Prakash] PERF: categorical rank GH#15498
---
 asv_bench/benchmarks/categoricals.py  | 34 +++++++++++++++++++++++++++
 doc/source/whatsnew/v0.20.0.txt       |  1 +
 pandas/core/algorithms.py             |  1 +
 pandas/core/categorical.py            |  9 ++++++-
 pandas/tests/series/test_analytics.py | 33 +++++++++++++++++++-------
 5 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index cca652c68cf15..153107911ca2c 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -63,3 +63,37 @@ def time_value_counts_dropna(self):

     def time_rendering(self):
         str(self.sel)
+
+
+class Categoricals3(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        ncats = 100
+
+        self.s1 =
Series(np.array(tm.makeCategoricalIndex(N, ncats))) + self.s1_cat = self.s1.astype('category') + self.s1_cat_ordered = self.s1.astype('category', ordered=True) + + self.s2 = Series(np.random.randint(0, ncats, size=N)) + self.s2_cat = self.s2.astype('category') + self.s2_cat_ordered = self.s2.astype('category', ordered=True) + + def time_rank_string(self): + self.s1.rank() + + def time_rank_string_cat(self): + self.s1_cat.rank() + + def time_rank_string_cat_ordered(self): + self.s1_cat_ordered.rank() + + def time_rank_int(self): + self.s2.rank() + + def time_rank_int_cat(self): + self.s2_cat.rank() + + def time_rank_int_cat_ordered(self): + self.s2_cat_ordered.rank() diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 54df7514a882d..6e9dfb92dfd90 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -562,6 +562,7 @@ Performance Improvements - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`) +- Improved performance of `rank()` for categorical data (:issue:`15498`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b11927a80fb2e..55d404f05dd1d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -992,6 +992,7 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) + else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b88a6b171b316..d5dce250275d9 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1416,14 +1416,21 @@ def _values_for_rank(self): numpy array """ + from pandas import Series if self.ordered: values = self.codes mask = values == -1 if mask.any(): values = values.astype('float64') values[mask] = np.nan - else: + elif self.categories.is_numeric(): values = np.array(self) + else: + # reorder the categories (so rank can use the float codes) + # instead of passing an object array to rank + values = np.array( + self.rename_categories(Series(self.categories).rank()) + ) return values def order(self, inplace=False, ascending=True, na_position='last'): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b092e4f084767..b6985abb64e40 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1065,8 +1065,10 @@ def test_rank_categorical(self): exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) ordered = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ).astype('category', ).cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype( + 'category', + categories=['first', 'second', 'third', + 'fourth', 'fifth', 'sixth'], ordered=True ) assert_series_equal(ordered.rank(), exp) @@ -1075,19 +1077,33 @@ def test_rank_categorical(self): # Unordered categoricals should be ranked as objects unordered = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], - ).astype('category').cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype( + 'category', + categories=['first', 'second', 'third', + 'fourth', 'fifth', 'sixth'], ordered=False ) exp_unordered = 
pd.Series([2., 4., 6., 3., 1., 5.]) res = unordered.rank() assert_series_equal(res, exp_unordered) + unordered1 = pd.Series( + [1, 2, 3, 4, 5, 6], + ).astype( + 'category', + categories=[1, 2, 3, 4, 5, 6], + ordered=False + ) + exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.]) + res1 = unordered1.rank() + assert_series_equal(res1, exp_unordered1) + # Test na_option for rank data na_ser = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] - ).astype('category', ).cat.set_categories( - [ + ).astype( + 'category', + categories=[ 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh' ], @@ -1123,8 +1139,9 @@ def test_rank_categorical(self): # Test with pct=True na_ser = pd.Series( ['first', 'second', 'third', 'fourth', np.NaN], - ).astype('category').cat.set_categories( - ['first', 'second', 'third', 'fourth'], + ).astype( + 'category', + categories=['first', 'second', 'third', 'fourth'], ordered=True ) exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2]) From 1931c1e43b268deb63a1145d4e10c0cc01d4c761 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Mar 2017 19:39:14 -0500 Subject: [PATCH 209/338] COMPAT: if docstring_wrapper is activated on a class, don't fail --- pandas/util/decorators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py index ee7e2f4302b10..62ff6ef14418a 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -261,6 +261,11 @@ def __init__(self, func, creator, default=None): if hasattr(func, attr)]) def __get__(self, instance, cls=None): + + # we are called with a class + if instance is None: + return self + # we want to return the actual passed instance return types.MethodType(self, instance) From 7c6bee5d1a17a30ebb74e7d026b0127ca5760305 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Mar 2017 19:54:04 -0500 Subject: [PATCH 210/338] DOC: add pandas-gbq to doc-build --- ci/requirements-3.5_DOC_BUILD.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/requirements-3.5_DOC_BUILD.sh b/ci/requirements-3.5_DOC_BUILD.sh index 25bc63acc96d1..1a5d4643edcf2 100644 --- a/ci/requirements-3.5_DOC_BUILD.sh +++ b/ci/requirements-3.5_DOC_BUILD.sh @@ -4,6 +4,8 @@ source activate pandas echo "[install DOC_BUILD deps]" +pip install pandas-gbq + conda install -n pandas -c conda-forge feather-format conda install -n pandas -c r r rpy2 --yes From 852815714a604805595a32a929d0771c835a072d Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 1 Mar 2017 21:19:55 -0500 Subject: [PATCH 211/338] DOC: Styler.set_table_attributes docstring Author: Chris Closes #15545 from chris-b1/styler-docstring and squashes the following commits: d77a9f1 [Chris] DOC: Style.set_table_attributes docstring --- pandas/formats/style.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/formats/style.py b/pandas/formats/style.py index 89712910a22e1..e712010a8b4f2 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -631,11 +631,17 @@ def set_table_attributes(self, attributes): Parameters ---------- - precision: int + attributes : string Returns ------- self : Styler + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(10, 4)) + >>> df.style.set_table_attributes('class="pure-table"') + # ...
<table class="pure-table"> ...

[README.md badge-table diff garbled in extraction: rows for "latest release"
and "Package Status" survive, along with added lines ("+") for a new
"circleci build status" badge.]
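The to_gbq fixes in patches 202, 203 and 209 above all turn on the descriptor
protocol: a plain callable object stored on a class is not converted into a
bound method the way a function is, so a docstring-wrapping callable must
implement ``__get__`` itself, and must also tolerate being looked up on the
class rather than on an instance. A minimal sketch of the pattern, with
hypothetical names rather than the pandas implementation::

    import types

    class lazy_doc_wrapper(object):
        # callable whose __doc__ is computed on first access, e.g. so an
        # optional dependency is only imported when help() is requested
        def __init__(self, func, doc_creator):
            self.func = func
            self.doc_creator = doc_creator

        def __get__(self, instance, cls=None):
            if instance is None:
                # class-level access (e.g. help(SomeClass.meth)): no binding
                return self
            # instance access: bind like a normal method
            return types.MethodType(self, instance)

        def __call__(self, *args, **kwargs):
            return self.func(*args, **kwargs)

        @property
        def __doc__(self):
            return self.doc_creator()

Patch 202's ``types.MethodType(wrapper, DataFrame)`` bound the wrapper to the
class object itself rather than to each instance, which broke the calling
convention; patch 203 assigns the bare wrapper and lets ``__get__`` do the
per-instance binding, and patch 209 adds the ``instance is None`` guard so
class-level access keeps working.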
From 3819bc27bb5dac2679bc43b2ee5924e730d3e79f Mon Sep 17 00:00:00 2001
From: James Goppert
Date: Wed, 22 Feb 2017 10:58:41 -0500
Subject: [PATCH 176/338] ENH: Adds custom plot formatting for TimedeltaIndex.

Author: James Goppert
Author: James Goppert

Closes #8711
Closes #15067 from jgoppert/tdi_plot_fix and squashes the following commits:

945ec14 [James Goppert] Merge branch 'master' into tdi_plot_fix
7db61ec [James Goppert] Create TimeSeries_TimedeltaFormatter.
232efe6 [James Goppert] Fix comment format and exception type for tdi plotting.
4eff697 [James Goppert] Add more time delta series plotting tests.
f5f32bc [James Goppert] Link time delta index docs to better matplotlib docs.
d588c2c [James Goppert] Fixes test for tdi w/o autofmt_xdate.
b6e6a81 [James Goppert] Disables autofmt_xdate testing.
c7851e3 [James Goppert] Adjusts tdi test draw calls to try to fix CI issue.
7d28842 [James Goppert] Switch to draw_idle to try to fix bug on xticks update.
3abc310 [James Goppert] Try plt.draw() instead of canvas.draw() to fix issue on osx 3.5.
91954bd [James Goppert] Finished unit test for timedelta plotting.
41ebc85 [James Goppert] Fixes for review comments from #15067.
f021cbd [James Goppert] Support nano-second level precision x-axis labels.
5ec65fa [James Goppert] Plot fix for tdi and added more comments.
b967d24 [James Goppert] flake8 fixes for tdi plotting.
efe5636 [James Goppert] Adds custom plot formatting for TimedeltaIndex.
---
 doc/source/visualization.rst               | 12 ++++
 doc/source/whatsnew/v0.20.0.txt            |  2 +-
 pandas/tests/plotting/test_datetimelike.py | 58 +++++++++++++++++
 pandas/tools/plotting.py                   |  2 +-
 pandas/tseries/converter.py                | 30 +++++++++
 pandas/tseries/plotting.py                 | 73 +++++++++++++++-------
 6 files changed, 154 insertions(+), 23 deletions(-)

diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index 2b2012dbf0b8a..e8998bf6f6f5c 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -1245,6 +1245,18 @@ in ``pandas.plot_params`` can be used in a `with statement`:

     plt.close('all')

+Automatic Date Tick Adjustment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+``TimedeltaIndex`` now uses the native matplotlib
+tick locator methods; it is useful to call the automatic
+date tick adjustment from matplotlib for figures whose ticklabels overlap.
+
+See the :meth:`autofmt_xdate ` method and the
+`matplotlib documentation `__ for more.
+
 Subplots
 ~~~~~~~~

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 86f916bc0acfb..9124929ee5665 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -155,7 +155,7 @@ Other enhancements
 - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)
 - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
 - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`)
-
+- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

 ..
_whatsnew_0200.api_breaking: diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 25568f7eb61dc..cdacded4d7f35 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -9,6 +9,7 @@ from pandas import Index, Series, DataFrame from pandas.tseries.index import date_range, bdate_range +from pandas.tseries.tdi import timedelta_range from pandas.tseries.offsets import DateOffset from pandas.tseries.period import period_range, Period, PeriodIndex from pandas.tseries.resample import DatetimeIndex @@ -1270,6 +1271,63 @@ def test_plot_outofbounds_datetime(self): values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] self.plt.plot(values) + def test_format_timedelta_ticks_narrow(self): + + expected_labels = [ + '00:00:00.00000000{:d}'.format(i) + for i in range(10)] + + rng = timedelta_range('0', periods=10, freq='ns') + df = DataFrame(np.random.randn(len(rng), 3), rng) + ax = df.plot(fontsize=2) + fig = ax.get_figure() + fig.canvas.draw() + labels = ax.get_xticklabels() + self.assertEqual(len(labels), len(expected_labels)) + for l, l_expected in zip(labels, expected_labels): + self.assertEqual(l.get_text(), l_expected) + + def test_format_timedelta_ticks_wide(self): + + expected_labels = [ + '00:00:00', + '1 days 03:46:40', + '2 days 07:33:20', + '3 days 11:20:00', + '4 days 15:06:40', + '5 days 18:53:20', + '6 days 22:40:00', + '8 days 02:26:40', + '' + ] + + rng = timedelta_range('0', periods=10, freq='1 d') + df = DataFrame(np.random.randn(len(rng), 3), rng) + ax = df.plot(fontsize=2) + fig = ax.get_figure() + fig.canvas.draw() + labels = ax.get_xticklabels() + self.assertEqual(len(labels), len(expected_labels)) + for l, l_expected in zip(labels, expected_labels): + self.assertEqual(l.get_text(), l_expected) + + def test_timedelta_plot(self): + # test issue #8711 + s = Series(range(5), timedelta_range('1day', periods=5)) + _check_plot_works(s.plot) + + # test long period + index = timedelta_range('1 day 2 hr 30 min 10 s', + periods=10, freq='1 d') + s = Series(np.random.randn(len(index)), index) + _check_plot_works(s.plot) + + # test short period + index = timedelta_range('1 day 2 hr 30 min 10 s', + periods=10, freq='1 ns') + s = Series(np.random.randn(len(index)), index) + _check_plot_works(s.plot) + def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index b2050d7d8d81e..d46c38c117445 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1781,7 +1781,7 @@ def _ts_plot(cls, ax, x, data, style=None, **kwds): lines = cls._plot(ax, data.index, data.values, style=style, **kwds) # set date formatter, locators and rescale limits - format_dateaxis(ax, ax.freq) + format_dateaxis(ax, ax.freq, data.index) return lines def _get_stacking_id(self): diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 95ff9578fa3ee..db7049ebc89b3 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -1000,3 +1000,33 @@ def __call__(self, x, pos=0): else: fmt = self.formatdict.pop(x, '') return Period(ordinal=int(x), freq=self.freq).strftime(fmt) + + +class TimeSeries_TimedeltaFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. 
+ """ + + @staticmethod + def format_timedelta_ticks(x, pos, n_decimals): + """ + Convert seconds to 'D days HH:MM:SS.F' + """ + s, ns = divmod(x, 1e9) + m, s = divmod(s, 60) + h, m = divmod(m, 60) + d, h = divmod(h, 24) + decimals = int(ns * 10**(n_decimals - 9)) + s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)) + if n_decimals > 0: + s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals) + if d != 0: + s = '{:d} days '.format(int(d)) + s + return s + + def __call__(self, x, pos=0): + (vmin, vmax) = tuple(self.axis.get_view_interval()) + n_decimals = int(np.ceil(np.log10(100 * 1e9 / (vmax - vmin)))) + if n_decimals > 9: + n_decimals = 9 + return self.format_timedelta_ticks(x, pos, n_decimals) diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 89aecf2acc07e..4eddf54701889 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -12,11 +12,14 @@ from pandas.tseries.offsets import DateOffset import pandas.tseries.frequencies as frequencies from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex +from pandas.tseries.tdi import TimedeltaIndex from pandas.formats.printing import pprint_thing import pandas.compat as compat from pandas.tseries.converter import (TimeSeries_DateLocator, - TimeSeries_DateFormatter) + TimeSeries_DateFormatter, + TimeSeries_TimedeltaFormatter) # --------------------------------------------------------------------- # Plotting functions and monkey patches @@ -49,7 +52,7 @@ def tsplot(series, plotf, ax=None, **kwargs): lines = plotf(ax, series.index._mpl_repr(), series.values, **kwargs) # set date formatter, locators and rescale limits - format_dateaxis(ax, ax.freq) + format_dateaxis(ax, ax.freq, series.index) return lines @@ -278,8 +281,24 @@ def _maybe_convert_index(ax, data): # Patch methods for subplot. Only format_dateaxis is currently used. # Do we need the rest for convenience? - -def format_dateaxis(subplot, freq): +def format_timedelta_ticks(x, pos, n_decimals): + """ + Convert seconds to 'D days HH:MM:SS.F' + """ + s, ns = divmod(x, 1e9) + m, s = divmod(s, 60) + h, m = divmod(m, 60) + d, h = divmod(h, 24) + decimals = int(ns * 10**(n_decimals - 9)) + s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)) + if n_decimals > 0: + s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals) + if d != 0: + s = '{:d} days '.format(int(d)) + s + return s + + +def format_dateaxis(subplot, freq, index): """ Pretty-formats the date axis (x-axis). @@ -288,26 +307,38 @@ def format_dateaxis(subplot, freq): default, changing the limits of the x axis will intelligently change the positions of the ticks. """ - majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, - minor_locator=False, - plot_obj=subplot) - minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, - minor_locator=True, - plot_obj=subplot) - subplot.xaxis.set_major_locator(majlocator) - subplot.xaxis.set_minor_locator(minlocator) - - majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + + # handle index specific formatting + # Note: DatetimeIndex does not use this + # interface. 
DatetimeIndex uses matplotlib.date directly + if isinstance(index, PeriodIndex): + + majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot) - minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot) - subplot.xaxis.set_major_formatter(majformatter) - subplot.xaxis.set_minor_formatter(minformatter) - - # x and y coord info - subplot.format_coord = lambda t, y: ( - "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y)) + subplot.xaxis.set_major_locator(majlocator) + subplot.xaxis.set_minor_locator(minlocator) + + majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minor_locator=False, + plot_obj=subplot) + minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minor_locator=True, + plot_obj=subplot) + subplot.xaxis.set_major_formatter(majformatter) + subplot.xaxis.set_minor_formatter(minformatter) + + # x and y coord info + subplot.format_coord = lambda t, y: ( + "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y)) + + elif isinstance(index, TimedeltaIndex): + subplot.xaxis.set_major_formatter( + TimeSeries_TimedeltaFormatter()) + else: + raise TypeError('index type not supported') pylab.draw_if_interactive() From 52a54816229e9b9c85976efe9d6b349cead8e595 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 22 Feb 2017 11:04:25 -0500 Subject: [PATCH 177/338] TST: skip some timedelta plotting tests on mac (on travis) for precision display issues xref #15067 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/plotting/test_datetimelike.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9124929ee5665..355756c6e605c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -156,6 +156,7 @@ Other enhancements - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) + .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations .. 
_whatsnew_0200.api_breaking:

diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index cdacded4d7f35..673c34903b259 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -7,7 +7,7 @@
 import numpy as np

 from pandas import Index, Series, DataFrame
-
+from pandas.compat import is_platform_mac
 from pandas.tseries.index import date_range, bdate_range
 from pandas.tseries.tdi import timedelta_range
 from pandas.tseries.offsets import DateOffset
@@ -1272,6 +1272,8 @@ def test_plot_outofbounds_datetime(self):
         self.plt.plot(values)

     def test_format_timedelta_ticks_narrow(self):
+        if is_platform_mac():
+            pytest.skip("skip on mac for precision display issue on older mpl")

         expected_labels = [
             '00:00:00.00000000{:d}'.format(i)
@@ -1288,6 +1290,8 @@ def test_format_timedelta_ticks_narrow(self):
             self.assertEqual(l.get_text(), l_expected)

     def test_format_timedelta_ticks_wide(self):
+        if is_platform_mac():
+            pytest.skip("skip on mac for precision display issue on older mpl")

         expected_labels = [
             '00:00:00',

From 1a0599dc552ec01410698e0231e171c74e02fa95 Mon Sep 17 00:00:00 2001
From: Justin Solinsky
Date: Wed, 22 Feb 2017 11:21:27 -0500
Subject: [PATCH 178/338] ENH union_categoricals supports ignore_order GH13410

xref #13410 (ignore_order portion)

Author: Justin Solinsky

Closes #15219 from js3711/GH13410-ENHunion_categoricals and squashes the following commits:

e9d00de [Justin Solinsky] GH15219 Documentation fixes based on feedback
d278d62 [Justin Solinsky] ENH union_categoricals supports ignore_order GH13410
9b827ef [Justin Solinsky] ENH union_categoricals supports ignore_order GH13410
---
 doc/source/categorical.rst        | 11 +++++++
 doc/source/whatsnew/v0.20.0.txt   |  2 ++
 pandas/tests/tools/test_concat.py | 54 +++++++++++++++++++++++++++++++
 pandas/types/concat.py            | 16 ++++++---
 4 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 18e429cfc92fa..db974922e1d76 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -693,6 +693,17 @@ The below raises ``TypeError`` because the categories are ordered and not identi
   Out[3]:
   TypeError: to union ordered Categoricals, all categories must be the same

+.. versionadded:: 0.20.0
+
+Ordered categoricals with different categories or orderings can be combined by
+using the ``ignore_order=True`` argument.
+
+.. ipython:: python
+
+   a = pd.Categorical(["a", "b", "c"], ordered=True)
+   b = pd.Categorical(["c", "b", "a"], ordered=True)
+   union_categoricals([a, b], ignore_order=True)
+
 ``union_categoricals`` also works with a
 ``CategoricalIndex``, or ``Series`` containing categorical data, but note that
 the resulting array will always be a plain ``Categorical``

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 355756c6e605c..bb5f19b301dc8 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -156,9 +156,11 @@ Other enhancements
 - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
 - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1.
(:issue:`15403`) - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) +- ``pd.types.concat.union_categoricals`` gained the ``ignore_order`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information. .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations + .. _whatsnew_0200.api_breaking: diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 2a28fccdc9b94..6d40de465bff8 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1662,6 +1662,60 @@ def test_union_categoricals_ordered(self): with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) + def test_union_categoricals_ignore_order(self): + # GH 15219 + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2], ignore_order=False) + + res = union_categoricals([c1, c1], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c1, c1], ignore_order=False) + exp = Categorical([1, 2, 3, 1, 2, 3], + categories=[1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, np.nan, 3, 2]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c2, c1], ignore_order=True, + sort_categories=True) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([4, 5, 6], ordered=True) + result = union_categoricals([c1, c2], ignore_order=True) + expected = Categorical([1, 2, 3, 4, 5, 6]) + tm.assert_categorical_equal(result, expected) + + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2], ignore_order=False) + + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) c2 = Categorical(['a', 'b', 'c']) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 827eb160c452d..9e47a97dd621a 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -208,7 +208,7 @@ def _concat_asobject(to_concat): return _concat_asobject(to_concat) -def union_categoricals(to_union, sort_categories=False): +def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. 
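A quick orientation for the hunks above and below: the following sketch is an editorial illustration, not part of the patch. The import path is the 0.20-era location this diff touches, and the toy data is invented.

    import pandas as pd
    from pandas.types.concat import union_categoricals  # 0.20-era location

    c1 = pd.Categorical(["a", "b"], ordered=True)
    c2 = pd.Categorical(["b", "c"], ordered=False)

    # Mixing ordered and unordered inputs raises TypeError by default;
    # ignore_order=True drops the ordered attribute and unions anyway.
    res = union_categoricals([c1, c2], ignore_order=True)
    # res -> ['a', 'b', 'b', 'c'], categories ['a', 'b', 'c'], ordered=False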
@@ -222,6 +222,11 @@ def union_categoricals(to_union, sort_categories=False): sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. + ignore_order: boolean, default False + If true, the ordered attribute of the Categoricals will be ignored. + Results in an unordered categorical. + + .. versionadded:: 0.20.0 Returns ------- @@ -235,7 +240,7 @@ def union_categoricals(to_union, sort_categories=False): - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError - Emmpty list of categoricals passed + Empty list of categoricals passed """ from pandas import Index, Categorical, CategoricalIndex, Series @@ -264,7 +269,7 @@ def _maybe_unwrap(x): ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) - if sort_categories and ordered: + if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") @@ -272,7 +277,7 @@ def _maybe_unwrap(x): categories = categories.sort_values() indexer = categories.get_indexer(first.categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) - elif all(not c.ordered for c in to_union): + elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) @@ -297,6 +302,9 @@ def _maybe_unwrap(x): else: raise TypeError('Categorical.ordered must be the same') + if ignore_order: + ordered = False + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) From 90a0cf74659fabe2ac04f8c4fe68dcd178c9d2c0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 22 Feb 2017 11:29:28 -0500 Subject: [PATCH 179/338] TST: break out union_categoricals to separate test file --- pandas/tests/tools/test_concat.py | 332 ----------------- pandas/tests/tools/test_union_categoricals.py | 339 ++++++++++++++++++ 2 files changed, 339 insertions(+), 332 deletions(-) create mode 100644 pandas/tests/tools/test_union_categoricals.py diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 6d40de465bff8..f292aeda8cbe0 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -8,7 +8,6 @@ read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, DatetimeIndex, Categorical, CategoricalIndex) -from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, @@ -1511,337 +1510,6 @@ def test_concat_keys_with_none(self): keys=['b', 'c', 'd', 'e']) tm.assert_frame_equal(result, expected) - def test_union_categorical(self): - # GH 13361 - data = [ - (list('abc'), list('abd'), list('abcabd')), - ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), - ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), - - (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], - ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), - - (pd.date_range('2014-01-01', '2014-01-05'), - pd.date_range('2014-01-06', '2014-01-07'), - pd.date_range('2014-01-01', '2014-01-07')), - - (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), - pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), - pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), - - (pd.period_range('2014-01-01', '2014-01-05'), - pd.period_range('2014-01-06', '2014-01-07'), - 
pd.period_range('2014-01-01', '2014-01-07')), - ] - - for a, b, combined in data: - for box in [Categorical, CategoricalIndex, Series]: - result = union_categoricals([box(Categorical(a)), - box(Categorical(b))]) - expected = Categorical(combined) - tm.assert_categorical_equal(result, expected, - check_category_order=True) - - # new categories ordered by appearance - s = Categorical(['x', 'y', 'z']) - s2 = Categorical(['a', 'b', 'c']) - result = union_categoricals([s, s2]) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['x', 'y', 'z', 'a', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - - s = Categorical([0, 1.2, 2], ordered=True) - s2 = Categorical([0, 1.2, 2], ordered=True) - result = union_categoricals([s, s2]) - expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) - tm.assert_categorical_equal(result, expected) - - # must exactly match types - s = Categorical([0, 1.2, 2]) - s2 = Categorical([2, 3, 4]) - msg = 'dtype of categories must be the same' - with tm.assertRaisesRegexp(TypeError, msg): - union_categoricals([s, s2]) - - msg = 'No Categoricals to union' - with tm.assertRaisesRegexp(ValueError, msg): - union_categoricals([]) - - def test_union_categoricals_nan(self): - # GH 13759 - res = union_categoricals([pd.Categorical([1, 2, np.nan]), - pd.Categorical([3, 2, np.nan])]) - exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) - tm.assert_categorical_equal(res, exp) - - res = union_categoricals([pd.Categorical(['A', 'B']), - pd.Categorical(['B', 'B', np.nan])]) - exp = Categorical(['A', 'B', 'B', 'B', np.nan]) - tm.assert_categorical_equal(res, exp) - - val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), - pd.NaT] - val2 = [pd.NaT, pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-02-01')] - - res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) - exp = Categorical(val1 + val2, - categories=[pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-03-01'), - pd.Timestamp('2011-02-01')]) - tm.assert_categorical_equal(res, exp) - - # all NaN - res = union_categoricals([pd.Categorical([np.nan, np.nan]), - pd.Categorical(['X'])]) - exp = Categorical([np.nan, np.nan, 'X']) - tm.assert_categorical_equal(res, exp) - - res = union_categoricals([pd.Categorical([np.nan, np.nan]), - pd.Categorical([np.nan, np.nan])]) - exp = Categorical([np.nan, np.nan, np.nan, np.nan]) - tm.assert_categorical_equal(res, exp) - - def test_union_categoricals_empty(self): - # GH 13759 - res = union_categoricals([pd.Categorical([]), - pd.Categorical([])]) - exp = Categorical([]) - tm.assert_categorical_equal(res, exp) - - res = union_categoricals([pd.Categorical([]), - pd.Categorical([1.0])]) - exp = Categorical([1.0]) - tm.assert_categorical_equal(res, exp) - - # to make dtype equal - nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) - res = union_categoricals([nanc, - pd.Categorical([])]) - tm.assert_categorical_equal(res, nanc) - - def test_union_categorical_same_category(self): - # check fastpath - c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) - c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) - res = union_categoricals([c1, c2]) - exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], - categories=[1, 2, 3, 4]) - tm.assert_categorical_equal(res, exp) - - c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) - c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) - res = union_categoricals([c1, c2]) - exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], - categories=['x', 'y', 'z']) - 
tm.assert_categorical_equal(res, exp) - - def test_union_categoricals_ordered(self): - c1 = Categorical([1, 2, 3], ordered=True) - c2 = Categorical([1, 2, 3], ordered=False) - - msg = 'Categorical.ordered must be the same' - with tm.assertRaisesRegexp(TypeError, msg): - union_categoricals([c1, c2]) - - res = union_categoricals([c1, c1]) - exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) - tm.assert_categorical_equal(res, exp) - - c1 = Categorical([1, 2, 3, np.nan], ordered=True) - c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) - - res = union_categoricals([c1, c2]) - exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) - tm.assert_categorical_equal(res, exp) - - c1 = Categorical([1, 2, 3], ordered=True) - c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) - - msg = "to union ordered Categoricals, all categories must be the same" - with tm.assertRaisesRegexp(TypeError, msg): - union_categoricals([c1, c2]) - - def test_union_categoricals_ignore_order(self): - # GH 15219 - c1 = Categorical([1, 2, 3], ordered=True) - c2 = Categorical([1, 2, 3], ordered=False) - - res = union_categoricals([c1, c2], ignore_order=True) - exp = Categorical([1, 2, 3, 1, 2, 3]) - tm.assert_categorical_equal(res, exp) - - msg = 'Categorical.ordered must be the same' - with tm.assertRaisesRegexp(TypeError, msg): - union_categoricals([c1, c2], ignore_order=False) - - res = union_categoricals([c1, c1], ignore_order=True) - exp = Categorical([1, 2, 3, 1, 2, 3]) - tm.assert_categorical_equal(res, exp) - - res = union_categoricals([c1, c1], ignore_order=False) - exp = Categorical([1, 2, 3, 1, 2, 3], - categories=[1, 2, 3], ordered=True) - tm.assert_categorical_equal(res, exp) - - c1 = Categorical([1, 2, 3, np.nan], ordered=True) - c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) - - res = union_categoricals([c1, c2], ignore_order=True) - exp = Categorical([1, 2, 3, np.nan, 3, 2]) - tm.assert_categorical_equal(res, exp) - - c1 = Categorical([1, 2, 3], ordered=True) - c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) - - res = union_categoricals([c1, c2], ignore_order=True) - exp = Categorical([1, 2, 3, 1, 2, 3]) - tm.assert_categorical_equal(res, exp) - - res = union_categoricals([c2, c1], ignore_order=True, - sort_categories=True) - exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) - tm.assert_categorical_equal(res, exp) - - c1 = Categorical([1, 2, 3], ordered=True) - c2 = Categorical([4, 5, 6], ordered=True) - result = union_categoricals([c1, c2], ignore_order=True) - expected = Categorical([1, 2, 3, 4, 5, 6]) - tm.assert_categorical_equal(result, expected) - - msg = "to union ordered Categoricals, all categories must be the same" - with tm.assertRaisesRegexp(TypeError, msg): - union_categoricals([c1, c2], ignore_order=False) - - with tm.assertRaisesRegexp(TypeError, msg): - union_categoricals([c1, c2]) - - def test_union_categoricals_sort(self): - # GH 13846 - c1 = Categorical(['x', 'y', 'z']) - c2 = Categorical(['a', 'b', 'c']) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'x', 'y', 'z']) - tm.assert_categorical_equal(result, expected) - - # fastpath - c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) - c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - 
- c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) - c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - - # fastpath - skip resort - c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical(['x', np.nan]) - c2 = Categorical([np.nan, 'b']) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['x', np.nan, np.nan, 'b'], - categories=['b', 'x']) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical([np.nan]) - c2 = Categorical([np.nan]) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical([np.nan, np.nan], categories=[]) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical([]) - c2 = Categorical([]) - result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical([]) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) - c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) - with tm.assertRaises(TypeError): - union_categoricals([c1, c2], sort_categories=True) - - def test_union_categoricals_sort_false(self): - # GH 13846 - c1 = Categorical(['x', 'y', 'z']) - c2 = Categorical(['a', 'b', 'c']) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['x', 'y', 'z', 'a', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - - # fastpath - c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) - c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['b', 'a', 'c']) - tm.assert_categorical_equal(result, expected) - - # fastpath - skip resort - c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical(['x', np.nan]) - c2 = Categorical([np.nan, 'b']) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['x', np.nan, np.nan, 'b'], - categories=['x', 'b']) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical([np.nan]) - c2 = Categorical([np.nan]) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical([np.nan, np.nan], categories=[]) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical([]) - c2 = Categorical([]) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical([]) - tm.assert_categorical_equal(result, expected) - - c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) - c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) - result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['b', 'a', 'a', 'c'], - categories=['b', 'a', 'c'], ordered=True) - 
tm.assert_categorical_equal(result, expected) - - def test_union_categorical_unwrap(self): - # GH 14173 - c1 = Categorical(['a', 'b']) - c2 = pd.Series(['b', 'c'], dtype='category') - result = union_categoricals([c1, c2]) - expected = Categorical(['a', 'b', 'b', 'c']) - tm.assert_categorical_equal(result, expected) - - c2 = CategoricalIndex(c2) - result = union_categoricals([c1, c2]) - tm.assert_categorical_equal(result, expected) - - c1 = Series(c1) - result = union_categoricals([c1, c2]) - tm.assert_categorical_equal(result, expected) - - with tm.assertRaises(TypeError): - union_categoricals([c1, ['a', 'b', 'c']]) - def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/tests/tools/test_union_categoricals.py b/pandas/tests/tools/test_union_categoricals.py new file mode 100644 index 0000000000000..299b60f2a00b0 --- /dev/null +++ b/pandas/tests/tools/test_union_categoricals.py @@ -0,0 +1,339 @@ +import numpy as np +import pandas as pd +from pandas import Categorical, Series, CategoricalIndex +from pandas.types.concat import union_categoricals +from pandas.util import testing as tm + + +class TestUnionCategoricals(tm.TestCase): + + def test_union_categorical(self): + # GH 13361 + data = [ + (list('abc'), list('abd'), list('abcabd')), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + + (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], + ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), + + (pd.date_range('2014-01-01', '2014-01-05'), + pd.date_range('2014-01-06', '2014-01-07'), + pd.date_range('2014-01-01', '2014-01-07')), + + (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), + pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), + pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), + + (pd.period_range('2014-01-01', '2014-01-05'), + pd.period_range('2014-01-06', '2014-01-07'), + pd.period_range('2014-01-01', '2014-01-07')), + ] + + for a, b, combined in data: + for box in [Categorical, CategoricalIndex, Series]: + result = union_categoricals([box(Categorical(a)), + box(Categorical(b))]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) + + # new categories ordered by appearance + s = Categorical(['x', 'y', 'z']) + s2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([s, s2]) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) + tm.assert_categorical_equal(result, expected) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 4]) + msg = 'dtype of categories must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([s, s2]) + + msg = 'No Categoricals to union' + with tm.assertRaisesRegexp(ValueError, msg): + union_categoricals([]) + + def test_union_categoricals_nan(self): + # GH 13759 + res = union_categoricals([pd.Categorical([1, 2, np.nan]), + pd.Categorical([3, 2, np.nan])]) + exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical(['A', 'B']), + pd.Categorical(['B', 'B', np.nan])]) + exp = Categorical(['A', 'B', 'B', 'B', np.nan]) + tm.assert_categorical_equal(res, exp) + 
+ val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), + pd.NaT] + val2 = [pd.NaT, pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-02-01')] + + res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) + exp = Categorical(val1 + val2, + categories=[pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-03-01'), + pd.Timestamp('2011-02-01')]) + tm.assert_categorical_equal(res, exp) + + # all NaN + res = union_categoricals([pd.Categorical([np.nan, np.nan]), + pd.Categorical(['X'])]) + exp = Categorical([np.nan, np.nan, 'X']) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical([np.nan, np.nan]), + pd.Categorical([np.nan, np.nan])]) + exp = Categorical([np.nan, np.nan, np.nan, np.nan]) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_empty(self): + # GH 13759 + res = union_categoricals([pd.Categorical([]), + pd.Categorical([])]) + exp = Categorical([]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical([]), + pd.Categorical([1.0])]) + exp = Categorical([1.0]) + tm.assert_categorical_equal(res, exp) + + # to make dtype equal + nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) + res = union_categoricals([nanc, + pd.Categorical([])]) + tm.assert_categorical_equal(res, nanc) + + def test_union_categorical_same_category(self): + # check fastpath + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], + categories=[1, 2, 3, 4]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) + c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) + res = union_categoricals([c1, c2]) + exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], + categories=['x', 'y', 'z']) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_ordered(self): + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + res = union_categoricals([c1, c1]) + exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + def test_union_categoricals_ignore_order(self): + # GH 15219 + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2], ignore_order=False) + + res = union_categoricals([c1, c1], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c1, c1], ignore_order=False) + exp = Categorical([1, 2, 3, 1, 2, 3], + categories=[1, 2, 3], 
ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, np.nan, 3, 2]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c2, c1], ignore_order=True, + sort_categories=True) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([4, 5, 6], ordered=True) + result = union_categoricals([c1, c2], ignore_order=True) + expected = Categorical([1, 2, 3, 4, 5, 6]) + tm.assert_categorical_equal(result, expected) + + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2], ignore_order=False) + + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + def test_union_categoricals_sort(self): + # GH 13846 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'x', 'y', 'z']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) + c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['b', 'x']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([c1, c2], sort_categories=True) + + def test_union_categoricals_sort_false(self): + # GH 13846 + c1 = Categorical(['x', 'y', 'z']) + 
c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['b', 'a', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['x', 'b']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['b', 'a', 'a', 'c'], + categories=['b', 'a', 'c'], ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_union_categorical_unwrap(self): + # GH 14173 + c1 = Categorical(['a', 'b']) + c2 = pd.Series(['b', 'c'], dtype='category') + result = union_categoricals([c1, c2]) + expected = Categorical(['a', 'b', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c2 = CategoricalIndex(c2) + result = union_categoricals([c1, c2]) + tm.assert_categorical_equal(result, expected) + + c1 = Series(c1) + result = union_categoricals([c1, c2]) + tm.assert_categorical_equal(result, expected) + + with tm.assertRaises(TypeError): + union_categoricals([c1, ['a', 'b', 'c']]) From fa3c2e4fcca339a33d995202f2823ec1e1799901 Mon Sep 17 00:00:00 2001 From: Kernc Date: Wed, 22 Feb 2017 13:41:16 -0500 Subject: [PATCH 180/338] BUG: Categorical.unique() preserves categories closes #13179 Author: Kernc Closes #15439 from kernc/Categorical.unique-nostrip-unused and squashes the following commits: 55733b8 [Kernc] fixup! BUG: Fix .groupby(categorical, sort=False) failing 2aec326 [Kernc] fixup! 
BUG: Fix .groupby(categorical, sort=False) failing c813146 [Kernc] PERF: add asv for categorical grouping 0c550e6 [Kernc] BUG: Fix .groupby(categorical, sort=False) failing --- asv_bench/benchmarks/groupby.py | 37 +++++++++++++++++++++ doc/source/whatsnew/v0.20.0.txt | 34 ++++++++++++++++++- pandas/core/categorical.py | 42 ++++++++++++++++++++++++ pandas/core/groupby.py | 18 +--------- pandas/indexes/category.py | 4 +++ pandas/tests/groupby/test_categorical.py | 24 ++++++++++++++ 6 files changed, 141 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 03ff62568b405..59f55914ea4d3 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -492,6 +492,43 @@ def time_groupby_sum(self): self.df.groupby(['a'])['b'].sum() +class groupby_categorical(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + arr = np.random.random(N) + + self.df = DataFrame(dict( + a=Categorical(np.random.randint(10000, size=N)), + b=arr)) + self.df_ordered = DataFrame(dict( + a=Categorical(np.random.randint(10000, size=N), ordered=True), + b=arr)) + self.df_extra_cat = DataFrame(dict( + a=Categorical(np.random.randint(100, size=N), + categories=np.arange(10000)), + b=arr)) + + def time_groupby_sort(self): + self.df.groupby('a')['b'].count() + + def time_groupby_nosort(self): + self.df.groupby('a', sort=False)['b'].count() + + def time_groupby_ordered_sort(self): + self.df_ordered.groupby('a')['b'].count() + + def time_groupby_ordered_nosort(self): + self.df_ordered.groupby('a', sort=False)['b'].count() + + def time_groupby_extra_cat_sort(self): + self.df_extra_cat.groupby('a')['b'].count() + + def time_groupby_extra_cat_nosort(self): + self.df_extra_cat.groupby('a', sort=False)['b'].count() + + class groupby_period(object): # GH 14338 goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index bb5f19b301dc8..e65276fe51fe8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -120,6 +120,39 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`) +.. _whatsnew_0200.enhancements.groupby_categorical: + +GroupBy on Categoricals +^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`) + +.. ipython:: python + + chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']] + df = pd.DataFrame({ + 'A': np.random.randint(100), + 'B': np.random.randint(100), + 'C': np.random.randint(100), + 'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100), + categories=chromosomes, + ordered=True)}) + df + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + --------------------------------------------------------------------------- + ValueError: items in new_categories are not the same as in old categories + +New Behavior: + +.. ipython:: python + + df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + .. _whatsnew_0200.enhancements.other: Other enhancements @@ -163,7 +196,6 @@ Other enhancements .. 
_whatsnew_0200.api_breaking: - Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 491db2e080953..b6898f11ffa74 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -602,6 +602,46 @@ def _get_categories(self): categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) + def _codes_for_groupby(self, sort): + """ + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). + + Parameters + ---------- + sort : boolean + The value of the sort parameter groupby was called with. + + Returns + ------- + Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. + """ + + # Already sorted according to self.categories; all is fine + if sort: + return self + + # sort=False should order groups in as-encountered order (GH-8868) + cat = self.unique() + + # But for groupby to work, all categories should be present, + # including those missing from the data (GH-13179), which .unique() + # above dropped + cat.add_categories( + self.categories[~self.categories.isin(cat.categories)], + inplace=True) + + return self.reorder_categories(cat.categories) + _ordered = None def set_ordered(self, value, inplace=False): @@ -1853,8 +1893,10 @@ def unique(self): # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) cat = self.copy() + # keep nan in codes cat._codes = unique_codes + # exclude nan from indexer for categories take_codes = unique_codes[unique_codes != -1] if self.ordered: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ba2de295fa0a9..0b3fcba1c1ba5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2300,23 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - # must have an ordered categorical - if self.sort: - if not self.grouper.ordered: - - # technically we cannot group on an unordered - # Categorical - # but this a user convenience to do so; the ordering - # is preserved and if it's a reduction it doesn't make - # any difference - pass - - # fix bug #GH8868 sort=False being ignored in categorical - # groupby - else: - cat = self.grouper.unique() - self.grouper = self.grouper.reorder_categories( - cat.categories) + self.grouper = self.grouper._codes_for_groupby(self.sort) # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index acb2758641a62..5299a094156cd 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name): result.name = name return result + def _codes_for_groupby(self, sort): + """ Return a Categorical adjusted for groupby """ + return self.values._codes_for_groupby(sort) + @classmethod def _add_comparison_methods(cls): """ add in comparison methods """ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 
eebd0e0f490c1..cfcb531bedab8 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self): tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_preserve_categories(self): + # GH-13179 + categories = list('abc') + + # ordered=True + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = pd.CategoricalIndex(categories, categories, ordered=True) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + + # ordered=False + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = pd.CategoricalIndex(categories, categories, ordered=False) + nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), + ordered=False) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, + sort_index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, + nosort_index) + def test_groupby_preserve_categorical_dtype(self): # GH13743, GH13854 df = DataFrame({'A': [1, 2, 1, 1, 2], From 3f8677bfdec2ec2bb122f13c20bc9a99b1522b43 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 22 Feb 2017 13:52:46 -0500 Subject: [PATCH 181/338] PEP: pep issue in pandas/tests/tools/test_concat.py --- pandas/tests/tools/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index f292aeda8cbe0..a2b5773f551c9 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -7,7 +7,7 @@ from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex, Categorical, CategoricalIndex) + DatetimeIndex) from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, From 431132e6984c0e6ed6be45f9d6b776ed94320d42 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 23 Feb 2017 08:19:24 -0500 Subject: [PATCH 182/338] CI: use correct circleci badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 195b76f64b37f..7bc350d1c6675 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ - circleci build status + circleci build status
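Before moving on, a behaviour sketch for PATCH 180 above; this is an illustrative aside with invented data (compare the test_groupby_preserve_categories additions in that patch):

    import pandas as pd

    df = pd.DataFrame({"A": pd.Categorical(list("ba"), categories=list("abc")),
                       "B": [1, 2]})

    # Previously, groupby(..., sort=False) could raise ValueError when some
    # categories ('c' here) never occur in the data (GH 13179).  Now sort=False
    # yields groups in order of appearance, with unused categories appended:
    df.groupby("A", sort=False).first().index.tolist()  # ['b', 'a', 'c']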
... """ self.table_attributes = attributes return self From 81905dfe891cd8b460f5123e5771ffa5c2184fc5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Mar 2017 03:39:34 -0500 Subject: [PATCH 212/338] TST: split tests/format/test_format.py (#15546) closes #15531 --- pandas/tests/formats/test_eng_formatting.py | 195 ++ pandas/tests/formats/test_format.py | 2610 +------------------ pandas/tests/formats/test_to_csv.py | 216 ++ pandas/tests/formats/test_to_html.py | 1861 +++++++++++++ pandas/tests/formats/test_to_latex.py | 351 +++ 5 files changed, 2653 insertions(+), 2580 deletions(-) create mode 100644 pandas/tests/formats/test_eng_formatting.py create mode 100644 pandas/tests/formats/test_to_csv.py create mode 100644 pandas/tests/formats/test_to_html.py create mode 100644 pandas/tests/formats/test_to_latex.py diff --git a/pandas/tests/formats/test_eng_formatting.py b/pandas/tests/formats/test_eng_formatting.py new file mode 100644 index 0000000000000..d2badd4fc160a --- /dev/null +++ b/pandas/tests/formats/test_eng_formatting.py @@ -0,0 +1,195 @@ +import numpy as np +import pandas as pd +from pandas import DataFrame +from pandas.compat import u +import pandas.formats.format as fmt +from pandas.util import testing as tm + + +class TestEngFormatter(tm.TestCase): + + def test_eng_float_formatter(self): + df = DataFrame({'A': [1.41, 141., 14100, 1410000.]}) + + fmt.set_eng_float_format() + result = df.to_string() + expected = (' A\n' + '0 1.410E+00\n' + '1 141.000E+00\n' + '2 14.100E+03\n' + '3 1.410E+06') + self.assertEqual(result, expected) + + fmt.set_eng_float_format(use_eng_prefix=True) + result = df.to_string() + expected = (' A\n' + '0 1.410\n' + '1 141.000\n' + '2 14.100k\n' + '3 1.410M') + self.assertEqual(result, expected) + + fmt.set_eng_float_format(accuracy=0) + result = df.to_string() + expected = (' A\n' + '0 1E+00\n' + '1 141E+00\n' + '2 14E+03\n' + '3 1E+06') + self.assertEqual(result, expected) + + self.reset_display_options() + + def compare(self, formatter, input, output): + formatted_input = formatter(input) + msg = ("formatting of %s results in '%s', expected '%s'" % + (str(input), formatted_input, output)) + self.assertEqual(formatted_input, output, msg) + + def compare_all(self, formatter, in_out): + """ + Parameters: + ----------- + formatter: EngFormatter under test + in_out: list of tuples. Each tuple = (number, expected_formatting) + + It is tested if 'formatter(number) == expected_formatting'. + *number* should be >= 0 because formatter(-number) == fmt is also + tested. 
*fmt* is derived from *expected_formatting* + """ + for input, output in in_out: + self.compare(formatter, input, output) + self.compare(formatter, -input, "-" + output[1:]) + + def test_exponents_with_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + f = np.sqrt(2) + in_out = [ + (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"), + (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"), + (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"), + (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"), + (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"), + (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"), + (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"), + (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"), + (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"), + (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"), + (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"), + (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"), + (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"), + (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"), + (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"), + (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"), + (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"), + (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"), + (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"), + (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"), + (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"), + (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"), + (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"), + (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"), + (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"), + (f * 10 ** 26, " 141.421Y")] + self.compare_all(formatter, in_out) + + def test_exponents_without_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) + f = np.pi + in_out = [ + (f * 10 ** -24, " 3.1416E-24"), + (f * 10 ** -23, " 31.4159E-24"), + (f * 10 ** -22, " 314.1593E-24"), + (f * 10 ** -21, " 3.1416E-21"), + (f * 10 ** -20, " 31.4159E-21"), + (f * 10 ** -19, " 314.1593E-21"), + (f * 10 ** -18, " 3.1416E-18"), + (f * 10 ** -17, " 31.4159E-18"), + (f * 10 ** -16, " 314.1593E-18"), + (f * 10 ** -15, " 3.1416E-15"), + (f * 10 ** -14, " 31.4159E-15"), + (f * 10 ** -13, " 314.1593E-15"), + (f * 10 ** -12, " 3.1416E-12"), + (f * 10 ** -11, " 31.4159E-12"), + (f * 10 ** -10, " 314.1593E-12"), + (f * 10 ** -9, " 3.1416E-09"), + (f * 10 ** -8, " 31.4159E-09"), + (f * 10 ** -7, " 314.1593E-09"), + (f * 10 ** -6, " 3.1416E-06"), + (f * 10 ** -5, " 31.4159E-06"), + (f * 10 ** -4, " 314.1593E-06"), + (f * 10 ** -3, " 3.1416E-03"), + (f * 10 ** -2, " 31.4159E-03"), + (f * 10 ** -1, " 314.1593E-03"), + (f * 10 ** 0, " 3.1416E+00"), + (f * 10 ** 1, " 31.4159E+00"), + (f * 10 ** 2, " 314.1593E+00"), + (f * 10 ** 3, " 3.1416E+03"), + (f * 10 ** 4, " 31.4159E+03"), + (f * 10 ** 5, " 314.1593E+03"), + (f * 10 ** 6, " 3.1416E+06"), + (f * 10 ** 7, " 31.4159E+06"), + (f * 10 ** 8, " 314.1593E+06"), + (f * 10 ** 9, " 3.1416E+09"), + (f * 10 ** 10, " 31.4159E+09"), + (f * 10 ** 11, " 314.1593E+09"), + (f * 10 ** 12, " 3.1416E+12"), + (f * 10 ** 13, " 31.4159E+12"), + (f * 10 ** 14, " 314.1593E+12"), + (f * 10 ** 15, " 3.1416E+15"), + (f * 10 ** 16, " 31.4159E+15"), + (f * 10 ** 17, " 314.1593E+15"), + (f * 10 ** 18, " 3.1416E+18"), + (f * 10 ** 19, " 31.4159E+18"), + (f * 10 ** 20, " 314.1593E+18"), + (f * 10 ** 21, " 3.1416E+21"), + (f * 10 ** 
22, " 31.4159E+21"), + (f * 10 ** 23, " 314.1593E+21"), + (f * 10 ** 24, " 3.1416E+24"), + (f * 10 ** 25, " 31.4159E+24"), + (f * 10 ** 26, " 314.1593E+24")] + self.compare_all(formatter, in_out) + + def test_rounding(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'), + (555.555, ' 555.555'), (5555.55, ' 5.556k'), + (55555.5, ' 55.556k'), (555555, ' 555.555k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'), + (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'), + (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + result = formatter(0) + self.assertEqual(result, u(' 0.000')) + + def test_nan(self): + # Issue #11981 + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + result = formatter(np.nan) + self.assertEqual(result, u('NaN')) + + df = pd.DataFrame({'a': [1.5, 10.3, 20.5], + 'b': [50.3, 60.67, 70.12], + 'c': [100.2, 101.33, 120.33]}) + pt = df.pivot_table(values='a', index='b', columns='c') + fmt.set_eng_float_format(accuracy=1) + result = pt.to_string() + self.assertTrue('NaN' in result) + self.reset_display_options() + + def test_inf(self): + # Issue #11981 + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + result = formatter(np.inf) + self.assertEqual(result, u('inf')) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 476c6a636ae5a..ddf9d35841ce7 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -1,50 +1,41 @@ # -*- coding: utf-8 -*- +""" +test output formatting for Series/DataFrame +including to_string & reprs +""" + # TODO(wesm): lots of issues making flake8 hard # flake8: noqa from __future__ import print_function -from distutils.version import LooseVersion import re -from pandas.compat import (range, zip, lrange, StringIO, PY3, - u, lzip, is_platform_windows, - is_platform_32bit) -import pandas.compat as compat import itertools from operator import methodcaller import os import sys -from textwrap import dedent import warnings +from datetime import datetime -from numpy import nan -from numpy.random import randn -import numpy as np - -import codecs - -div_style = '' -try: - import IPython - if IPython.__version__ < LooseVersion('3.0.0'): - div_style = ' style="max-width:1500px;overflow:auto;"' -except (ImportError, AttributeError): - pass +import pytest -from pandas import DataFrame, Series, Index, Timestamp, MultiIndex, date_range, NaT +import numpy as np +import pandas as pd +from pandas import (DataFrame, Series, Index, Timestamp, MultiIndex, + date_range, NaT, read_table) +from pandas.compat import (range, zip, lrange, StringIO, PY3, + u, lzip, is_platform_windows, + is_platform_32bit) +import pandas.compat as compat import pandas.formats.format as fmt -import pandas.util.testing as tm -import pandas.core.common as com import pandas.formats.printing as printing + +import pandas.util.testing as tm from pandas.util.terminal import get_terminal_size -import pandas as pd from pandas.core.config import (set_option, get_option, option_context, reset_option) -from datetime import 
datetime - -import pytest use_32bit_repr = is_platform_windows() or is_platform_32bit() @@ -288,7 +279,7 @@ def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: pytest.skip("terminal size too small, " - "{0} x {1}".format(term_width, term_height)) + "{0} x {1}".format(term_width, term_height)) def mkframe(n): index = ['%05d' % i for i in range(n)] @@ -829,1393 +820,6 @@ def test_datetimelike_frame(self): '[10 rows x 2 columns]') self.assertEqual(repr(df), expected) - def test_to_html_with_col_space(self): - def check_with_width(df, col_space): - import re - # check that col_space affects HTML generation - # and be very brittle about it. - html = df.to_html(col_space=col_space) - hdrs = [x for x in html.split(r"\n") if re.search(r"\s]", x)] - self.assertTrue(len(hdrs) > 0) - for h in hdrs: - self.assertTrue("min-width" in h) - self.assertTrue(str(col_space) in h) - - df = DataFrame(np.random.random(size=(1, 3))) - - check_with_width(df, 30) - check_with_width(df, 50) - - def test_to_html_with_empty_string_label(self): - # GH3547, to_html regards empty string labels as repeated labels - data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]} - df = DataFrame(data).set_index(['c1', 'c2']) - res = df.to_html() - self.assertTrue("rowspan" not in res) - - def test_to_html_unicode(self): - df = DataFrame({u('\u03c3'): np.arange(10.)}) - expected = u'
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
\u03c3
00.0
11.0
22.0
33.0
44.0
55.0
66.0
77.0
88.0
99.0
' - self.assertEqual(df.to_html(), expected) - df = DataFrame({'A': [u('\u03c3')]}) - expected = u'\n \n \n \n \n \n \n \n \n \n \n \n \n
A
0\u03c3
' - self.assertEqual(df.to_html(), expected) - - def test_to_html_decimal(self): - # GH 12031 - df = DataFrame({'A': [6.0, 3.1, 2.2]}) - result = df.to_html(decimal=',') - expected = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
A
06,0
13,1
22,2
') - self.assertEqual(result, expected) - - def test_to_html_escaped(self): - a = 'str", - b: ""}, - 'co>l2': {a: "", - b: ""}} - rs = DataFrame(test_dict).to_html() - xp = """ - - - - - - - - - - - - - - - - - - - -
co<l1co>l2
str<ing1 &amp;<type 'str'><type 'str'>
stri>ng2 &amp;<type 'str'><type 'str'>
""" - - self.assertEqual(xp, rs) - - def test_to_html_escape_disabled(self): - a = 'strbold", - b: "bold"}, - 'co>l2': {a: "bold", - b: "bold"}} - rs = DataFrame(test_dict).to_html(escape=False) - xp = """ - - - - - - - - - - - - - - - - - -
co - co>l2
str - boldbold
stri>ng2 &boldbold
""" - - self.assertEqual(xp, rs) - - def test_to_html_multiindex_index_false(self): - # issue 8452 - df = DataFrame({ - 'a': range(2), - 'b': range(3, 5), - 'c': range(5, 7), - 'd': range(3, 5) - }) - df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) - result = df.to_html(index=False) - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ab
cdcd
0353
1464
""" - - self.assertEqual(result, expected) - - df.index = Index(df.index.values, name='idx') - result = df.to_html(index=False) - self.assertEqual(result, expected) - - def test_to_html_multiindex_sparsify_false_multi_sparse(self): - with option_context('display.multi_sparse', False): - index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], - names=['foo', None]) - - df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) - - result = df.to_html() - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
01
foo
0001
0123
1045
1167
""" - - self.assertEqual(result, expected) - - df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], - columns=index[::2], index=index) - - result = df.to_html() - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
foo01
00
foo
0001
0123
1045
1167
""" - - self.assertEqual(result, expected) - - def test_to_html_multiindex_sparsify(self): - index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], - names=['foo', None]) - - df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) - - result = df.to_html() - expected = """ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
01
foo
0001
123
1045
167
""" - - self.assertEqual(result, expected) - - df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], columns=index[::2], - index=index) - - result = df.to_html() - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
foo01
00
foo
0001
123
1045
167
""" - - self.assertEqual(result, expected) - - def test_to_html_multiindex_odd_even_truncate(self): - # GH 14882 - Issue on truncation with odd length DataFrame - mi = MultiIndex.from_product([[100, 200, 300], - [10, 20, 30], - [1, 2, 3, 4, 5, 6, 7]], - names=['a', 'b', 'c']) - df = DataFrame({'n': range(len(mi))}, index=mi) - result = df.to_html(max_rows=60) - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
n
abc
1001010
21
32
43
54
65
76
2017
28
39
410
511
612
713
30114
215
316
417
518
619
720
20010121
222
323
424
525
626
727
20128
229
......
633
734
30135
236
337
438
539
640
741
30010142
243
344
445
546
647
748
20149
250
351
452
553
654
755
30156
257
358
459
560
661
762
""" - self.assertEqual(result, expected) - - # Test that ... appears in a middle level - result = df.to_html(max_rows=56) - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
n
abc
1001010
21
32
43
54
65
76
2017
28
39
410
511
612
713
30114
215
316
417
518
619
720
20010121
222
323
424
525
626
727
.........
30135
236
337
438
539
640
741
30010142
243
344
445
546
647
748
20149
250
351
452
553
654
755
30156
257
358
459
560
661
762
""" - self.assertEqual(result, expected) - - def test_to_html_index_formatter(self): - df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], columns=['foo', None], - index=lrange(4)) - - f = lambda x: 'abcd' [x] - result = df.to_html(formatters={'__index__': f}) - expected = """\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    <tr style="text-align: right;">
-      <th></th>
-      <th>foo</th>
-      <th>None</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <th>a</th>
-      <td>0</td>
-      <td>1</td>
-    </tr>
-    <tr>
-      <th>b</th>
-      <td>2</td>
-      <td>3</td>
-    </tr>
-    <tr>
-      <th>c</th>
-      <td>4</td>
-      <td>5</td>
-    </tr>
-    <tr>
-      <th>d</th>
-      <td>6</td>
-      <td>7</td>
-    </tr>
-  </tbody>
-</table>
""" - - self.assertEqual(result, expected) - - def test_to_html_datetime64_monthformatter(self): - months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] - x = DataFrame({'months': months}) - - def format_func(x): - return x.strftime('%Y-%m') - result = x.to_html(formatters={'months': format_func}) - expected = """\ - - - - - - - - - - - - - - - - - -
-    <tr style="text-align: right;">
-      <th></th>
-      <th>months</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <th>0</th>
-      <td>2016-01</td>
-    </tr>
-    <tr>
-      <th>1</th>
-      <td>2016-02</td>
-    </tr>
-  </tbody>
-</table>
""" - self.assertEqual(result, expected) - - def test_to_html_datetime64_hourformatter(self): - - x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')}) - - def format_func(x): - return x.strftime('%H:%M') - result = x.to_html(formatters={'hod': format_func}) - expected = """\ - - - - - - - - - - - - - - - - - -
-    <tr style="text-align: right;">
-      <th></th>
-      <th>hod</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <th>0</th>
-      <td>10:10</td>
-    </tr>
-    <tr>
-      <th>1</th>
-      <td>12:12</td>
-    </tr>
-  </tbody>
-</table>
""" - self.assertEqual(result, expected) - - def test_to_html_regression_GH6098(self): - df = DataFrame({u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')], - u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), - u('1er')], - 'données1': np.random.randn(5), - 'données2': np.random.randn(5)}) - # it works - df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_() - - def test_to_html_truncate(self): - pytest.skip("unreliable on travis") - index = pd.DatetimeIndex(start='20010101', freq='D', periods=20) - df = DataFrame(index=index, columns=range(20)) - fmt.set_option('display.max_rows', 8) - fmt.set_option('display.max_columns', 4) - result = df._repr_html_() - expected = '''\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
01...1819
2001-01-01NaNNaN...NaNNaN
2001-01-02NaNNaN...NaNNaN
2001-01-03NaNNaN...NaNNaN
2001-01-04NaNNaN...NaNNaN
..................
2001-01-17NaNNaN...NaNNaN
2001-01-18NaNNaN...NaNNaN
2001-01-19NaNNaN...NaNNaN
2001-01-20NaNNaN...NaNNaN
-</table>
-<p>20 rows × 20 columns</p>
-</div>
-'''.format(div_style) - if compat.PY2: - expected = expected.decode('utf-8') - self.assertEqual(result, expected) - - def test_to_html_truncate_multi_index(self): - pytest.skip("unreliable on travis") - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - df = DataFrame(index=arrays, columns=arrays) - fmt.set_option('display.max_rows', 7) - fmt.set_option('display.max_columns', 7) - result = df._repr_html_() - expected = '''\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
barbaz...fooqux
onetwoone...twoonetwo
baroneNaNNaNNaN...NaNNaNNaN
twoNaNNaNNaN...NaNNaNNaN
bazoneNaNNaNNaN...NaNNaNNaN
...........................
footwoNaNNaNNaN...NaNNaNNaN
quxoneNaNNaNNaN...NaNNaNNaN
twoNaNNaNNaN...NaNNaNNaN
-</table>
-<p>8 rows × 8 columns</p>
-</div>
-'''.format(div_style) - if compat.PY2: - expected = expected.decode('utf-8') - self.assertEqual(result, expected) - - def test_to_html_truncate_multi_index_sparse_off(self): - pytest.skip("unreliable on travis") - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - df = DataFrame(index=arrays, columns=arrays) - fmt.set_option('display.max_rows', 7) - fmt.set_option('display.max_columns', 7) - fmt.set_option('display.multi_sparse', False) - result = df._repr_html_() - expected = '''\ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
barbarbaz...fooquxqux
onetwoone...twoonetwo
baroneNaNNaNNaN...NaNNaNNaN
bartwoNaNNaNNaN...NaNNaNNaN
bazoneNaNNaNNaN...NaNNaNNaN
footwoNaNNaNNaN...NaNNaNNaN
quxoneNaNNaNNaN...NaNNaNNaN
quxtwoNaNNaNNaN...NaNNaNNaN
-</table>
-<p>8 rows × 8 columns</p>
-</div>
-'''.format(div_style) - if compat.PY2: - expected = expected.decode('utf-8') - self.assertEqual(result, expected) - - def test_to_html_border(self): - df = DataFrame({'A': [1, 2]}) - result = df.to_html() - assert 'border="1"' in result - - def test_to_html_border_option(self): - df = DataFrame({'A': [1, 2]}) - with pd.option_context('html.border', 0): - result = df.to_html() - self.assertTrue('border="0"' in result) - self.assertTrue('border="0"' in df._repr_html_()) - - def test_to_html_border_zero(self): - df = DataFrame({'A': [1, 2]}) - result = df.to_html(border=0) - self.assertTrue('border="0"' in result) - def test_nonunicode_nonascii_alignment(self): df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) rep_str = df.to_string() @@ -2223,7 +827,7 @@ def test_nonunicode_nonascii_alignment(self): self.assertEqual(len(lines[1]), len(lines[2])) def test_unicode_problem_decoding_as_ascii(self): - dm = DataFrame({u('c/\u03c3'): Series({'test': np.NaN})}) + dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})}) compat.text_type(dm.to_string()) def test_string_repr_encoding(self): @@ -2271,7 +875,7 @@ def test_pprint_thing(self): # escape embedded tabs in string # GH #2038 - self.assertTrue(not "\t" in pp_t("a\tb", escape_chars=("\t", ))) + assert "\t" not in pp_t("a\tb", escape_chars=("\t", )) def test_wide_repr(self): with option_context('mode.sim_interactive', True, @@ -2294,7 +898,8 @@ def test_wide_repr(self): def test_wide_repr_wide_columns(self): with option_context('mode.sim_interactive', True): - df = DataFrame(randn(5, 3), columns=['a' * 90, 'b' * 90, 'c' * 90]) + df = DataFrame(np.random.randn(5, 3), + columns=['a' * 90, 'b' * 90, 'c' * 90]) rep_str = repr(df) self.assertEqual(len(rep_str.splitlines()), 20) @@ -2346,8 +951,8 @@ def test_wide_repr_multiindex_cols(self): with option_context('mode.sim_interactive', True): max_cols = get_option('display.max_columns') midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) - mcols = MultiIndex.from_arrays(tm.rands_array(3, size=(2, max_cols - - 1))) + mcols = MultiIndex.from_arrays( + tm.rands_array(3, size=(2, max_cols - 1))) df = DataFrame(tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols) df.index.names = ['Level 0', 'Level 1'] @@ -2465,16 +1070,14 @@ def test_index_with_nan(self): self.assertEqual(result, expected) def test_to_string(self): - from pandas import read_table - import re # big mixed - biggie = DataFrame({'A': randn(200), + biggie = DataFrame({'A': np.random.randn(200), 'B': tm.makeStringIndex(200)}, index=lrange(200)) - biggie.loc[:20, 'A'] = nan - biggie.loc[:20, 'B'] = nan + biggie.loc[:20, 'A'] = np.nan + biggie.loc[:20, 'B'] = np.nan s = biggie.to_string() buf = StringIO() @@ -2713,414 +1316,6 @@ def test_show_dimensions(self): self.assertFalse('5 rows' in str(df)) self.assertFalse('5 rows' in df._repr_html_()) - def test_to_html(self): - # big mixed - biggie = DataFrame({'A': randn(200), - 'B': tm.makeStringIndex(200)}, - index=lrange(200)) - - biggie.loc[:20, 'A'] = nan - biggie.loc[:20, 'B'] = nan - s = biggie.to_html() - - buf = StringIO() - retval = biggie.to_html(buf=buf) - self.assertIsNone(retval) - self.assertEqual(buf.getvalue(), s) - - tm.assertIsInstance(s, compat.string_types) - - biggie.to_html(columns=['B', 'A'], col_space=17) - biggie.to_html(columns=['B', 'A'], - formatters={'A': lambda x: '%.1f' % x}) - - biggie.to_html(columns=['B', 'A'], float_format=str) - biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str) - - frame = 
DataFrame(index=np.arange(200)) - frame.to_html() - - def test_to_html_filename(self): - biggie = DataFrame({'A': randn(200), - 'B': tm.makeStringIndex(200)}, - index=lrange(200)) - - biggie.loc[:20, 'A'] = nan - biggie.loc[:20, 'B'] = nan - with tm.ensure_clean('test.html') as path: - biggie.to_html(path) - with open(path, 'r') as f: - s = biggie.to_html() - s2 = f.read() - self.assertEqual(s, s2) - - frame = DataFrame(index=np.arange(200)) - with tm.ensure_clean('test.html') as path: - frame.to_html(path) - with open(path, 'r') as f: - self.assertEqual(frame.to_html(), f.read()) - - def test_to_html_with_no_bold(self): - x = DataFrame({'x': randn(5)}) - ashtml = x.to_html(bold_rows=False) - self.assertFalse('")]) - - def test_to_html_columns_arg(self): - result = self.frame.to_html(columns=['A']) - self.assertNotIn('B', result) - - def test_to_html_multiindex(self): - columns = MultiIndex.from_tuples(list(zip(np.arange(2).repeat(2), - np.mod(lrange(4), 2))), - names=['CL0', 'CL1']) - df = DataFrame([list('abcd'), list('efgh')], columns=columns) - result = df.to_html(justify='left') - expected = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
CL001
CL10101
0abcd
1efgh
') - - self.assertEqual(result, expected) - - columns = MultiIndex.from_tuples(list(zip( - range(4), np.mod( - lrange(4), 2)))) - df = DataFrame([list('abcd'), list('efgh')], columns=columns) - - result = df.to_html(justify='right') - expected = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
0123
0101
0abcd
1efgh
') - - self.assertEqual(result, expected) - - def test_to_html_justify(self): - df = DataFrame({'A': [6, 30000, 2], - 'B': [1, 2, 70000], - 'C': [223442, 0, 1]}, - columns=['A', 'B', 'C']) - result = df.to_html(justify='left') - expected = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
061223442
13000020
22700001
') - self.assertEqual(result, expected) - - result = df.to_html(justify='right') - expected = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
061223442
13000020
22700001
') - self.assertEqual(result, expected) - - def test_to_html_index(self): - index = ['foo', 'bar', 'baz'] - df = DataFrame({'A': [1, 2, 3], - 'B': [1.2, 3.4, 5.6], - 'C': ['one', 'two', np.NaN]}, - columns=['A', 'B', 'C'], - index=index) - expected_with_index = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
foo11.2one
bar23.4two
baz35.6NaN
') - self.assertEqual(df.to_html(), expected_with_index) - - expected_without_index = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
11.2one
23.4two
35.6NaN
') - result = df.to_html(index=False) - for i in index: - self.assertNotIn(i, result) - self.assertEqual(result, expected_without_index) - df.index = Index(['foo', 'bar', 'baz'], name='idx') - expected_with_index = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
idx
foo11.2one
bar23.4two
baz35.6NaN
') - self.assertEqual(df.to_html(), expected_with_index) - self.assertEqual(df.to_html(index=False), expected_without_index) - - tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] - df.index = MultiIndex.from_tuples(tuples) - - expected_with_index = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
foocar11.2one
bike23.4two
barcar35.6NaN
') - self.assertEqual(df.to_html(), expected_with_index) - - result = df.to_html(index=False) - for i in ['foo', 'bar', 'car', 'bike']: - self.assertNotIn(i, result) - # must be the same result as normal index - self.assertEqual(result, expected_without_index) - - df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2']) - expected_with_index = ('\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
ABC
idx1idx2
foocar11.2one
bike23.4two
barcar35.6NaN
') - self.assertEqual(df.to_html(), expected_with_index) - self.assertEqual(df.to_html(index=False), expected_without_index) - def test_repr_html(self): self.frame._repr_html_() @@ -3254,7 +1449,7 @@ def test_info_repr(self): def test_info_repr_max_cols(self): # GH #6939 - df = DataFrame(randn(10, 5)) + df = DataFrame(np.random.randn(10, 5)) with option_context('display.large_repr', 'info', 'display.max_columns', 1, 'display.max_info_columns', 4): @@ -3299,46 +1494,6 @@ def get_ipython(): self.reset_display_options() - def test_to_html_with_classes(self): - df = DataFrame() - result = df.to_html(classes="sortable draggable") - expected = dedent(""" - - - - - - - - - -
- - """).strip() - self.assertEqual(result, expected) - - result = df.to_html(classes=["sortable", "draggable"]) - self.assertEqual(result, expected) - - def test_to_html_no_index_max_rows(self): - # GH https://github.com/pandas-dev/pandas/issues/14998 - df = DataFrame({"A": [1, 2, 3, 4]}) - result = df.to_html(index=False, max_rows=1) - expected = dedent("""\ - - - - - - - - - - - -
-            <tr style="text-align: right;">
-              <th>A</th>
-            </tr>
-          </thead>
-          <tbody>
-            <tr>
-              <td>1</td>
-            </tr>
-          </tbody>
-        </table>
""") - self.assertEqual(result, expected) - def test_pprint_pathological_object(self): """ if the test fails, the stack will overflow and nose crash, @@ -3373,541 +1528,6 @@ def test_dict_entries(self): self.assertTrue("'a': 1" in val) self.assertTrue("'b': 2" in val) - def test_to_latex_filename(self): - with tm.ensure_clean('test.tex') as path: - self.frame.to_latex(path) - - with open(path, 'r') as f: - self.assertEqual(self.frame.to_latex(), f.read()) - - # test with utf-8 and encoding option (GH 7061) - df = DataFrame([[u'au\xdfgangen']]) - with tm.ensure_clean('test.tex') as path: - df.to_latex(path, encoding='utf-8') - with codecs.open(path, 'r', encoding='utf-8') as f: - self.assertEqual(df.to_latex(), f.read()) - - # test with utf-8 without encoding option - if compat.PY3: # python3: pandas default encoding is utf-8 - with tm.ensure_clean('test.tex') as path: - df.to_latex(path) - with codecs.open(path, 'r', encoding='utf-8') as f: - self.assertEqual(df.to_latex(), f.read()) - else: - # python2 default encoding is ascii, so an error should be raised - with tm.ensure_clean('test.tex') as path: - self.assertRaises(UnicodeEncodeError, df.to_latex, path) - - def test_to_latex(self): - # it works! - self.frame.to_latex() - - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex() - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(withindex_result, withindex_expected) - - withoutindex_result = df.to_latex(index=False) - withoutindex_expected = r"""\begin{tabular}{rl} -\toprule - a & b \\ -\midrule - 1 & b1 \\ - 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(withoutindex_result, withoutindex_expected) - - def test_to_latex_format(self): - # GH Bug #9402 - self.frame.to_latex(column_format='ccc') - - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(column_format='ccc') - withindex_expected = r"""\begin{tabular}{ccc} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(withindex_result, withindex_expected) - - def test_to_latex_with_formatters(self): - df = DataFrame({'int': [1, 2, 3], - 'float': [1.0, 2.0, 3.0], - 'object': [(1, 2), True, False], - 'datetime64': [datetime(2016, 1, 1), - datetime(2016, 2, 5), - datetime(2016, 3, 3)]}) - - formatters = {'int': lambda x: '0x%x' % x, - 'float': lambda x: '[% 4.1f]' % x, - 'object': lambda x: '-%s-' % str(x), - 'datetime64': lambda x: x.strftime('%Y-%m'), - '__index__': lambda x: 'index: %s' % x} - result = df.to_latex(formatters=dict(formatters)) - - expected = r"""\begin{tabular}{llrrl} -\toprule -{} & datetime64 & float & int & object \\ -\midrule -index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ -index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ -index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ -\bottomrule -\end{tabular} -""" - self.assertEqual(result, expected) - - def test_to_latex_multiindex(self): - df = DataFrame({('x', 'y'): ['a']}) - result = df.to_latex() - expected = r"""\begin{tabular}{ll} -\toprule -{} & x \\ -{} & y \\ -\midrule -0 & a \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(result, expected) - - result = df.T.to_latex() - expected = r"""\begin{tabular}{lll} -\toprule - & & 0 \\ -\midrule -x & y & a \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(result, expected) - - df = DataFrame.from_dict({ - ('c1', 0): pd.Series(dict((x, x) for x in range(4))), - 
('c1', 1): pd.Series(dict((x, x + 4) for x in range(4))), - ('c2', 0): pd.Series(dict((x, x) for x in range(4))), - ('c2', 1): pd.Series(dict((x, x + 4) for x in range(4))), - ('c3', 0): pd.Series(dict((x, x) for x in range(4))), - }).T - result = df.to_latex() - expected = r"""\begin{tabular}{llrrrr} -\toprule - & & 0 & 1 & 2 & 3 \\ -\midrule -c1 & 0 & 0 & 1 & 2 & 3 \\ - & 1 & 4 & 5 & 6 & 7 \\ -c2 & 0 & 0 & 1 & 2 & 3 \\ - & 1 & 4 & 5 & 6 & 7 \\ -c3 & 0 & 0 & 1 & 2 & 3 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(result, expected) - - # GH 10660 - df = pd.DataFrame({'a': [0, 0, 1, 1], - 'b': list('abab'), - 'c': [1, 2, 3, 4]}) - result = df.set_index(['a', 'b']).to_latex() - expected = r"""\begin{tabular}{llr} -\toprule - & & c \\ -a & b & \\ -\midrule -0 & a & 1 \\ - & b & 2 \\ -1 & a & 3 \\ - & b & 4 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(result, expected) - - result = df.groupby('a').describe().to_latex() - expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & ' - ' & & & & & & ' - '\\\\\n{} & count & mean & std & min & 25\\% & ' - '50\\% & 75\\% & max \\\\\na & & & ' - ' & & & & & \\\\\n\\midrule\n0 ' - '& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 ' - '& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 ' - '& 3.5 & 3.75 & 4.0 ' - '\\\\\n\\bottomrule\n\\end{tabular}\n') - - self.assertEqual(result, expected) - - def test_to_latex_escape(self): - a = 'a' - b = 'b' - - test_dict = {u('co^l1'): {a: "a", - b: "b"}, - u('co$e^x$'): {a: "a", - b: "b"}} - - unescaped_result = DataFrame(test_dict).to_latex(escape=False) - escaped_result = DataFrame(test_dict).to_latex( - ) # default: escape=True - - unescaped_expected = r'''\begin{tabular}{lll} -\toprule -{} & co$e^x$ & co^l1 \\ -\midrule -a & a & a \\ -b & b & b \\ -\bottomrule -\end{tabular} -''' - - escaped_expected = r'''\begin{tabular}{lll} -\toprule -{} & co\$e\textasciicircumx\$ & co\textasciicircuml1 \\ -\midrule -a & a & a \\ -b & b & b \\ -\bottomrule -\end{tabular} -''' - - self.assertEqual(unescaped_result, unescaped_expected) - self.assertEqual(escaped_result, escaped_expected) - - def test_to_latex_longtable(self): - self.frame.to_latex(longtable=True) - - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(longtable=True) - withindex_expected = r"""\begin{longtable}{lrl} -\toprule -{} & a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\end{longtable} -""" - - self.assertEqual(withindex_result, withindex_expected) - - withoutindex_result = df.to_latex(index=False, longtable=True) - withoutindex_expected = r"""\begin{longtable}{rl} -\toprule - a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot - 1 & b1 \\ - 2 & b2 \\ -\end{longtable} -""" - - self.assertEqual(withoutindex_result, withoutindex_expected) - - def test_to_latex_escape_special_chars(self): - special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^', - '\\'] - df = DataFrame(data=special_characters) - observed = df.to_latex() - expected = r"""\begin{tabular}{ll} -\toprule -{} & 0 \\ -\midrule -0 & \& \\ -1 & \% \\ -2 & \$ \\ -3 & \# \\ -4 & \_ \\ -5 & \{ \\ -6 & \} \\ -7 & \textasciitilde \\ -8 & \textasciicircum \\ -9 & \textbackslash \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(observed, expected) - - def test_to_latex_no_header(self): - # GH 7124 - df = DataFrame({'a': [1, 2], 
'b': ['b1', 'b2']}) - withindex_result = df.to_latex(header=False) - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(withindex_result, withindex_expected) - - withoutindex_result = df.to_latex(index=False, header=False) - withoutindex_expected = r"""\begin{tabular}{rl} -\toprule - 1 & b1 \\ - 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(withoutindex_result, withoutindex_expected) - - def test_to_latex_decimal(self): - # GH 12031 - self.frame.to_latex() - df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(decimal=',') - print("WHAT THE") - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1,0 & b1 \\ -1 & 2,1 & b2 \\ -\bottomrule -\end{tabular} -""" - - self.assertEqual(withindex_result, withindex_expected) - - def test_to_csv_quotechar(self): - df = DataFrame({'col': [1, 2]}) - expected = """\ -"","col" -"0","1" -"1","2" -""" - - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1) # 1=QUOTE_ALL - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) - - expected = """\ -$$,$col$ -$0$,$1$ -$1$,$2$ -""" - - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, quotechar="$") - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) - - with tm.ensure_clean('test.csv') as path: - with tm.assertRaisesRegexp(TypeError, 'quotechar'): - df.to_csv(path, quoting=1, quotechar=None) - - def test_to_csv_doublequote(self): - df = DataFrame({'col': ['a"a', '"bb"']}) - expected = '''\ -"","col" -"0","a""a" -"1","""bb""" -''' - - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) - - from _csv import Error - with tm.ensure_clean('test.csv') as path: - with tm.assertRaisesRegexp(Error, 'escapechar'): - df.to_csv(path, doublequote=False) # no escapechar set - - def test_to_csv_escapechar(self): - df = DataFrame({'col': ['a"a', '"bb"']}) - expected = '''\ -"","col" -"0","a\\"a" -"1","\\"bb\\"" -''' - - with tm.ensure_clean('test.csv') as path: # QUOTE_ALL - df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) - - df = DataFrame({'col': ['a,a', ',bb,']}) - expected = """\ -,col -0,a\\,a -1,\\,bb\\, -""" - - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) - - def test_csv_to_string(self): - df = DataFrame({'col': [1, 2]}) - expected = ',col\n0,1\n1,2\n' - self.assertEqual(df.to_csv(), expected) - - def test_to_csv_decimal(self): - # GH 781 - df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]}) - - expected_default = ',col1,col2,col3\n0,1,a,10.1\n' - self.assertEqual(df.to_csv(), expected_default) - - expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n' - self.assertEqual( - df.to_csv(decimal=',', sep=';'), expected_european_excel) - - expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n' - self.assertEqual( - df.to_csv(float_format='%.2f'), expected_float_format_default) - - expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n' - self.assertEqual( - df.to_csv(decimal=',', sep=';', - float_format='%.2f'), expected_float_format) - - # GH 11553: testing if decimal is taken into account for '0.0' - df = pd.DataFrame({'a': [0, 1.1], 'b': 
[2.2, 3.3], 'c': 1}) - expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n' - self.assertEqual(df.to_csv(index=False, decimal='^'), expected) - - # same but for an index - self.assertEqual(df.set_index('a').to_csv(decimal='^'), expected) - - # same for a multi-index - self.assertEqual( - df.set_index(['a', 'b']).to_csv(decimal="^"), expected) - - def test_to_csv_float_format(self): - # testing if float_format is taken into account for the index - # GH 11553 - df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1}) - expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n' - self.assertEqual( - df.set_index('a').to_csv(float_format='%.2f'), expected) - - # same for a multi-index - self.assertEqual( - df.set_index(['a', 'b']).to_csv(float_format='%.2f'), expected) - - def test_to_csv_na_rep(self): - # testing if NaN values are correctly represented in the index - # GH 11553 - df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]}) - expected = "a,b,c\n0.0,0,2\n_,1,3\n" - self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) - self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) - - # now with an index containing only NaNs - df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]}) - expected = "a,b,c\n_,0,2\n_,1,3\n" - self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) - self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) - - # check if na_rep parameter does not break anything when no NaN - df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]}) - expected = "a,b,c\n0,0,2\n0,1,3\n" - self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) - self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) - - def test_to_csv_date_format(self): - # GH 10209 - df_sec = DataFrame({'A': pd.date_range('20130101', periods=5, freq='s') - }) - df_day = DataFrame({'A': pd.date_range('20130101', periods=5, freq='d') - }) - - expected_default_sec = ',A\n0,2013-01-01 00:00:00\n1,2013-01-01 00:00:01\n2,2013-01-01 00:00:02' + \ - '\n3,2013-01-01 00:00:03\n4,2013-01-01 00:00:04\n' - self.assertEqual(df_sec.to_csv(), expected_default_sec) - - expected_ymdhms_day = ',A\n0,2013-01-01 00:00:00\n1,2013-01-02 00:00:00\n2,2013-01-03 00:00:00' + \ - '\n3,2013-01-04 00:00:00\n4,2013-01-05 00:00:00\n' - self.assertEqual( - df_day.to_csv( - date_format='%Y-%m-%d %H:%M:%S'), expected_ymdhms_day) - - expected_ymd_sec = ',A\n0,2013-01-01\n1,2013-01-01\n2,2013-01-01\n3,2013-01-01\n4,2013-01-01\n' - self.assertEqual( - df_sec.to_csv(date_format='%Y-%m-%d'), expected_ymd_sec) - - expected_default_day = ',A\n0,2013-01-01\n1,2013-01-02\n2,2013-01-03\n3,2013-01-04\n4,2013-01-05\n' - self.assertEqual(df_day.to_csv(), expected_default_day) - self.assertEqual( - df_day.to_csv(date_format='%Y-%m-%d'), expected_default_day) - - # testing if date_format parameter is taken into account for - # multi-indexed dataframes (GH 7791) - df_sec['B'] = 0 - df_sec['C'] = 1 - expected_ymd_sec = 'A,B,C\n2013-01-01,0,1\n' - df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B']) - self.assertEqual(df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d'), - expected_ymd_sec) - - def test_to_csv_multi_index(self): - # see gh-6618 - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) - - exp = ",1\n,2\n0,1\n" - self.assertEqual(df.to_csv(), exp) - - exp = "1\n2\n1\n" - self.assertEqual(df.to_csv(index=False), exp) - - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]), - index=pd.MultiIndex.from_arrays([[1], [2]])) - - exp = ",,1\n,,2\n1,2,1\n" - 
self.assertEqual(df.to_csv(), exp) - - exp = "1\n2\n1\n" - self.assertEqual(df.to_csv(index=False), exp) - - df = DataFrame( - [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']])) - - exp = ",foo\n,bar\n0,1\n" - self.assertEqual(df.to_csv(), exp) - - exp = "foo\nbar\n1\n" - self.assertEqual(df.to_csv(index=False), exp) - def test_period(self): # GH 12615 df = pd.DataFrame({'A': pd.period_range('2013-01', @@ -4291,7 +1911,7 @@ def test_max_multi_index_display(self): ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = list(zip(*arrays)) index = MultiIndex.from_tuples(tuples, names=['first', 'second']) - s = Series(randn(8), index=index) + s = Series(np.random.randn(8), index=index) with option_context("display.max_rows", 10): self.assertEqual(len(str(s).split('\n')), 10) @@ -4305,7 +1925,7 @@ def test_max_multi_index_display(self): self.assertEqual(len(str(s).split('\n')), 10) # index - s = Series(randn(8), None) + s = Series(np.random.randn(8), None) with option_context("display.max_rows", 10): self.assertEqual(len(str(s).split('\n')), 9) @@ -4436,176 +2056,6 @@ def test_to_string_header(self): self.assertEqual(res, exp) -class TestEngFormatter(tm.TestCase): - - def test_eng_float_formatter(self): - df = DataFrame({'A': [1.41, 141., 14100, 1410000.]}) - - fmt.set_eng_float_format() - result = df.to_string() - expected = (' A\n' - '0 1.410E+00\n' - '1 141.000E+00\n' - '2 14.100E+03\n' - '3 1.410E+06') - self.assertEqual(result, expected) - - fmt.set_eng_float_format(use_eng_prefix=True) - result = df.to_string() - expected = (' A\n' - '0 1.410\n' - '1 141.000\n' - '2 14.100k\n' - '3 1.410M') - self.assertEqual(result, expected) - - fmt.set_eng_float_format(accuracy=0) - result = df.to_string() - expected = (' A\n' - '0 1E+00\n' - '1 141E+00\n' - '2 14E+03\n' - '3 1E+06') - self.assertEqual(result, expected) - - self.reset_display_options() - - def compare(self, formatter, input, output): - formatted_input = formatter(input) - msg = ("formatting of %s results in '%s', expected '%s'" % - (str(input), formatted_input, output)) - self.assertEqual(formatted_input, output, msg) - - def compare_all(self, formatter, in_out): - """ - Parameters: - ----------- - formatter: EngFormatter under test - in_out: list of tuples. Each tuple = (number, expected_formatting) - - It is tested if 'formatter(number) == expected_formatting'. - *number* should be >= 0 because formatter(-number) == fmt is also - tested. 
*fmt* is derived from *expected_formatting* - """ - for input, output in in_out: - self.compare(formatter, input, output) - self.compare(formatter, -input, "-" + output[1:]) - - def test_exponents_with_eng_prefix(self): - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) - f = np.sqrt(2) - in_out = [(f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"), - (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"), - (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"), - (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"), - (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"), - (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"), - (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"), - (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"), - (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"), - (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"), - (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"), - (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"), - (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"), - (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"), - (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"), - (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"), - (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"), ( - f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"), - (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"), ( - f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"), ( - f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"), ( - f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"), - (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"), ( - f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"), ( - f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"), ( - f * 10 ** 26, " 141.421Y")] - self.compare_all(formatter, in_out) - - def test_exponents_without_eng_prefix(self): - formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) - f = np.pi - in_out = [(f * 10 ** -24, " 3.1416E-24"), - (f * 10 ** -23, " 31.4159E-24"), - (f * 10 ** -22, " 314.1593E-24"), - (f * 10 ** -21, " 3.1416E-21"), - (f * 10 ** -20, " 31.4159E-21"), - (f * 10 ** -19, " 314.1593E-21"), - (f * 10 ** -18, " 3.1416E-18"), - (f * 10 ** -17, " 31.4159E-18"), - (f * 10 ** -16, " 314.1593E-18"), - (f * 10 ** -15, " 3.1416E-15"), - (f * 10 ** -14, " 31.4159E-15"), - (f * 10 ** -13, " 314.1593E-15"), - (f * 10 ** -12, " 3.1416E-12"), - (f * 10 ** -11, " 31.4159E-12"), - (f * 10 ** -10, " 314.1593E-12"), - (f * 10 ** -9, " 3.1416E-09"), (f * 10 ** -8, " 31.4159E-09"), - (f * 10 ** -7, " 314.1593E-09"), (f * 10 ** -6, " 3.1416E-06"), - (f * 10 ** -5, " 31.4159E-06"), (f * 10 ** -4, - " 314.1593E-06"), - (f * 10 ** -3, " 3.1416E-03"), (f * 10 ** -2, " 31.4159E-03"), - (f * 10 ** -1, " 314.1593E-03"), (f * 10 ** 0, " 3.1416E+00"), ( - f * 10 ** 1, " 31.4159E+00"), (f * 10 ** 2, " 314.1593E+00"), - (f * 10 ** 3, " 3.1416E+03"), (f * 10 ** 4, " 31.4159E+03"), ( - f * 10 ** 5, " 314.1593E+03"), (f * 10 ** 6, " 3.1416E+06"), - (f * 10 ** 7, " 31.4159E+06"), (f * 10 ** 8, " 314.1593E+06"), ( - f * 10 ** 9, " 3.1416E+09"), (f * 10 ** 10, " 31.4159E+09"), - (f * 10 ** 11, " 314.1593E+09"), (f * 10 ** 12, " 3.1416E+12"), - (f * 10 ** 13, " 31.4159E+12"), (f * 10 ** 14, " 314.1593E+12"), - (f * 10 ** 15, " 3.1416E+15"), (f * 10 ** 16, " 31.4159E+15"), - (f * 10 ** 17, " 314.1593E+15"), (f * 10 ** 18, " 3.1416E+18"), - (f * 10 ** 19, " 31.4159E+18"), (f * 10 ** 20, " 314.1593E+18"), - (f * 10 ** 21, " 3.1416E+21"), (f * 10 ** 22, " 31.4159E+21"), - (f 
* 10 ** 23, " 314.1593E+21"), (f * 10 ** 24, " 3.1416E+24"), - (f * 10 ** 25, " 31.4159E+24"), (f * 10 ** 26, " 314.1593E+24")] - self.compare_all(formatter, in_out) - - def test_rounding(self): - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) - in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'), - (555.555, ' 555.555'), (5555.55, ' 5.556k'), - (55555.5, ' 55.556k'), (555555, ' 555.555k')] - self.compare_all(formatter, in_out) - - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) - in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'), - (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')] - self.compare_all(formatter, in_out) - - formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) - in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'), - (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')] - self.compare_all(formatter, in_out) - - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) - result = formatter(0) - self.assertEqual(result, u(' 0.000')) - - def test_nan(self): - # Issue #11981 - - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) - result = formatter(np.nan) - self.assertEqual(result, u('NaN')) - - df = pd.DataFrame({'a': [1.5, 10.3, 20.5], - 'b': [50.3, 60.67, 70.12], - 'c': [100.2, 101.33, 120.33]}) - pt = df.pivot_table(values='a', index='b', columns='c') - fmt.set_eng_float_format(accuracy=1) - result = pt.to_string() - self.assertTrue('NaN' in result) - self.reset_display_options() - - def test_inf(self): - # Issue #11981 - - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) - result = formatter(np.inf) - self.assertEqual(result, u('inf')) - - def _three_digit_exp(): return '%.4g' % 1.7e8 == '1.7e+008' diff --git a/pandas/tests/formats/test_to_csv.py b/pandas/tests/formats/test_to_csv.py new file mode 100644 index 0000000000000..51295fd750602 --- /dev/null +++ b/pandas/tests/formats/test_to_csv.py @@ -0,0 +1,216 @@ +from pandas import DataFrame +import numpy as np +import pandas as pd +from pandas.util import testing as tm + + +class TestToCSV(tm.TestCase): + + def test_to_csv_quotechar(self): + df = DataFrame({'col': [1, 2]}) + expected = """\ +"","col" +"0","1" +"1","2" +""" + + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1) # 1=QUOTE_ALL + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + expected = """\ +$$,$col$ +$0$,$1$ +$1$,$2$ +""" + + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, quotechar="$") + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(TypeError, 'quotechar'): + df.to_csv(path, quoting=1, quotechar=None) + + def test_to_csv_doublequote(self): + df = DataFrame({'col': ['a"a', '"bb"']}) + expected = '''\ +"","col" +"0","a""a" +"1","""bb""" +''' + + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + from _csv import Error + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(Error, 'escapechar'): + df.to_csv(path, doublequote=False) # no escapechar set + + def test_to_csv_escapechar(self): + df = DataFrame({'col': ['a"a', '"bb"']}) + expected = '''\ +"","col" +"0","a\\"a" +"1","\\"bb\\"" +''' + + with tm.ensure_clean('test.csv') as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') + with open(path, 'r') as f: + 
self.assertEqual(f.read(), expected) + + df = DataFrame({'col': ['a,a', ',bb,']}) + expected = """\ +,col +0,a\\,a +1,\\,bb\\, +""" + + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + def test_csv_to_string(self): + df = DataFrame({'col': [1, 2]}) + expected = ',col\n0,1\n1,2\n' + self.assertEqual(df.to_csv(), expected) + + def test_to_csv_decimal(self): + # GH 781 + df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]}) + + expected_default = ',col1,col2,col3\n0,1,a,10.1\n' + self.assertEqual(df.to_csv(), expected_default) + + expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n' + self.assertEqual( + df.to_csv(decimal=',', sep=';'), expected_european_excel) + + expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n' + self.assertEqual( + df.to_csv(float_format='%.2f'), expected_float_format_default) + + expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n' + self.assertEqual( + df.to_csv(decimal=',', sep=';', + float_format='%.2f'), expected_float_format) + + # GH 11553: testing if decimal is taken into account for '0.0' + df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1}) + expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n' + self.assertEqual(df.to_csv(index=False, decimal='^'), expected) + + # same but for an index + self.assertEqual(df.set_index('a').to_csv(decimal='^'), expected) + + # same for a multi-index + self.assertEqual( + df.set_index(['a', 'b']).to_csv(decimal="^"), expected) + + def test_to_csv_float_format(self): + # testing if float_format is taken into account for the index + # GH 11553 + df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1}) + expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n' + self.assertEqual( + df.set_index('a').to_csv(float_format='%.2f'), expected) + + # same for a multi-index + self.assertEqual( + df.set_index(['a', 'b']).to_csv(float_format='%.2f'), expected) + + def test_to_csv_na_rep(self): + # testing if NaN values are correctly represented in the index + # GH 11553 + df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]}) + expected = "a,b,c\n0.0,0,2\n_,1,3\n" + self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) + self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) + + # now with an index containing only NaNs + df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]}) + expected = "a,b,c\n_,0,2\n_,1,3\n" + self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) + self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) + + # check if na_rep parameter does not break anything when no NaN + df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]}) + expected = "a,b,c\n0,0,2\n0,1,3\n" + self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) + self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) + + def test_to_csv_date_format(self): + # GH 10209 + df_sec = DataFrame({'A': pd.date_range('20130101', periods=5, freq='s') + }) + df_day = DataFrame({'A': pd.date_range('20130101', periods=5, freq='d') + }) + + expected_default_sec = (',A\n0,2013-01-01 00:00:00\n1,' + '2013-01-01 00:00:01\n2,2013-01-01 00:00:02' + '\n3,2013-01-01 00:00:03\n4,' + '2013-01-01 00:00:04\n') + self.assertEqual(df_sec.to_csv(), expected_default_sec) + + expected_ymdhms_day = (',A\n0,2013-01-01 00:00:00\n1,' + '2013-01-02 00:00:00\n2,2013-01-03 00:00:00' + '\n3,2013-01-04 00:00:00\n4,' + '2013-01-05 00:00:00\n') + self.assertEqual( + df_day.to_csv( + 
date_format='%Y-%m-%d %H:%M:%S'), expected_ymdhms_day) + + expected_ymd_sec = (',A\n0,2013-01-01\n1,2013-01-01\n2,' + '2013-01-01\n3,2013-01-01\n4,2013-01-01\n') + self.assertEqual( + df_sec.to_csv(date_format='%Y-%m-%d'), expected_ymd_sec) + + expected_default_day = (',A\n0,2013-01-01\n1,2013-01-02\n2,' + '2013-01-03\n3,2013-01-04\n4,2013-01-05\n') + self.assertEqual(df_day.to_csv(), expected_default_day) + self.assertEqual( + df_day.to_csv(date_format='%Y-%m-%d'), expected_default_day) + + # testing if date_format parameter is taken into account for + # multi-indexed dataframes (GH 7791) + df_sec['B'] = 0 + df_sec['C'] = 1 + expected_ymd_sec = 'A,B,C\n2013-01-01,0,1\n' + df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B']) + self.assertEqual(df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d'), + expected_ymd_sec) + + def test_to_csv_multi_index(self): + # see gh-6618 + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) + + exp = ",1\n,2\n0,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "1\n2\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]), + index=pd.MultiIndex.from_arrays([[1], [2]])) + + exp = ",,1\n,,2\n1,2,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "1\n2\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + + df = DataFrame( + [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']])) + + exp = ",foo\n,bar\n0,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "foo\nbar\n1\n" + self.assertEqual(df.to_csv(index=False), exp) diff --git a/pandas/tests/formats/test_to_html.py b/pandas/tests/formats/test_to_html.py new file mode 100644 index 0000000000000..771c66e84037c --- /dev/null +++ b/pandas/tests/formats/test_to_html.py @@ -0,0 +1,1861 @@ +# -*- coding: utf-8 -*- + +import re +from textwrap import dedent +from datetime import datetime +from distutils.version import LooseVersion + +import pytest +import numpy as np +import pandas as pd +from pandas import compat, DataFrame, MultiIndex, option_context, Index +from pandas.compat import u, lrange, StringIO +from pandas.util import testing as tm +import pandas.formats.format as fmt + +div_style = '' +try: + import IPython + if IPython.__version__ < LooseVersion('3.0.0'): + div_style = ' style="max-width:1500px;overflow:auto;"' +except (ImportError, AttributeError): + pass + + +class TestToHTML(tm.TestCase): + + def test_to_html_with_col_space(self): + def check_with_width(df, col_space): + # check that col_space affects HTML generation + # and be very brittle about it. + html = df.to_html(col_space=col_space) + hdrs = [x for x in html.split(r"\n") if re.search(r"\s]", x)] + self.assertTrue(len(hdrs) > 0) + for h in hdrs: + self.assertTrue("min-width" in h) + self.assertTrue(str(col_space) in h) + + df = DataFrame(np.random.random(size=(1, 3))) + + check_with_width(df, 30) + check_with_width(df, 50) + + def test_to_html_with_empty_string_label(self): + # GH3547, to_html regards empty string labels as repeated labels + data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]} + df = DataFrame(data).set_index(['c1', 'c2']) + res = df.to_html() + self.assertTrue("rowspan" not in res) + + def test_to_html_unicode(self): + df = DataFrame({u('\u03c3'): np.arange(10.)}) + expected = u'\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
\u03c3
00.0
11.0
22.0
33.0
44.0
55.0
66.0
77.0
88.0
99.0
' # noqa + self.assertEqual(df.to_html(), expected) + df = DataFrame({'A': [u('\u03c3')]}) + expected = u'\n \n \n \n \n \n \n \n \n \n \n \n \n
A
0\u03c3
' # noqa + self.assertEqual(df.to_html(), expected) + + def test_to_html_decimal(self): + # GH 12031 + df = DataFrame({'A': [6.0, 3.1, 2.2]}) + result = df.to_html(decimal=',') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
+                    '      <th></th>\n'
+                    '      <th>A</th>\n'
+                    '    </tr>\n'
+                    '  </thead>\n'
+                    '  <tbody>\n'
+                    '    <tr>\n'
+                    '      <th>0</th>\n'
+                    '      <td>6,0</td>\n'
+                    '    </tr>\n'
+                    '    <tr>\n'
+                    '      <th>1</th>\n'
+                    '      <td>3,1</td>\n'
+                    '    </tr>\n'
+                    '    <tr>\n'
+                    '      <th>2</th>\n'
+                    '      <td>2,2</td>\n'
+                    '    </tr>\n'
+                    '  </tbody>\n'
') + self.assertEqual(result, expected) + + def test_to_html_escaped(self): + a = 'str", + b: ""}, + 'co>l2': {a: "", + b: ""}} + rs = DataFrame(test_dict).to_html() + xp = """ + + + + + + + + + + + + + + + + + + + +
co<l1co>l2
str<ing1 &amp;<type 'str'><type 'str'>
stri>ng2 &amp;<type 'str'><type 'str'>
""" + + self.assertEqual(xp, rs) + + def test_to_html_escape_disabled(self): + a = 'strbold", + b: "bold"}, + 'co>l2': {a: "bold", + b: "bold"}} + rs = DataFrame(test_dict).to_html(escape=False) + xp = """ + + + + + + + + + + + + + + + + + +
co + co>l2
str + boldbold
stri>ng2 &boldbold
""" + + self.assertEqual(xp, rs) + + def test_to_html_multiindex_index_false(self): + # issue 8452 + df = DataFrame({ + 'a': range(2), + 'b': range(3, 5), + 'c': range(5, 7), + 'd': range(3, 5) + }) + df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) + result = df.to_html(index=False) + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <tr>
+      <th colspan="2" halign="left">a</th>
+      <th colspan="2" halign="left">b</th>
+    </tr>
+    <tr>
+      <th>c</th>
+      <th>d</th>
+      <th>c</th>
+      <th>d</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>3</td>
+      <td>5</td>
+      <td>3</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>4</td>
+      <td>6</td>
+      <td>4</td>
+    </tr>
+  </tbody>
+</table>
""" + + self.assertEqual(result, expected) + + df.index = Index(df.index.values, name='idx') + result = df.to_html(index=False) + self.assertEqual(result, expected) + + def test_to_html_multiindex_sparsify_false_multi_sparse(self): + with option_context('display.multi_sparse', False): + index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], + names=['foo', None]) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) + + result = df.to_html() + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
foo
0001
0123
1045
1167
""" + + self.assertEqual(result, expected) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], + columns=index[::2], index=index) + + result = df.to_html() + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
foo01
00
foo
0001
0123
1045
1167
""" + + self.assertEqual(result, expected) + + def test_to_html_multiindex_sparsify(self): + index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], + names=['foo', None]) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) + + result = df.to_html() + expected = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
foo
0001
123
1045
167
""" + + self.assertEqual(result, expected) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], columns=index[::2], + index=index) + + result = df.to_html() + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
foo01
00
foo
0001
123
1045
167
""" + + self.assertEqual(result, expected) + + def test_to_html_multiindex_odd_even_truncate(self): + # GH 14882 - Issue on truncation with odd length DataFrame + mi = MultiIndex.from_product([[100, 200, 300], + [10, 20, 30], + [1, 2, 3, 4, 5, 6, 7]], + names=['a', 'b', 'c']) + df = DataFrame({'n': range(len(mi))}, index=mi) + result = df.to_html(max_rows=60) + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
n
abc
1001010
21
32
43
54
65
76
2017
28
39
410
511
612
713
30114
215
316
417
518
619
720
20010121
222
323
424
525
626
727
20128
229
......
633
734
30135
236
337
438
539
640
741
30010142
243
344
445
546
647
748
20149
250
351
452
553
654
755
30156
257
358
459
560
661
762
""" + self.assertEqual(result, expected) + + # Test that ... appears in a middle level + result = df.to_html(max_rows=56) + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
n
abc
1001010
21
32
43
54
65
76
2017
28
39
410
511
612
713
30114
215
316
417
518
619
720
20010121
222
323
424
525
626
727
.........
30135
236
337
438
539
640
741
30010142
243
344
445
546
647
748
20149
250
351
452
553
654
755
30156
257
358
459
560
661
762
""" + self.assertEqual(result, expected) + + def test_to_html_index_formatter(self): + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], columns=['foo', None], + index=lrange(4)) + + f = lambda x: 'abcd' [x] + result = df.to_html(formatters={'__index__': f}) + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <tr style="text-align: right;">
+      <th></th>
+      <th>foo</th>
+      <th>None</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>a</th>
+      <td>0</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <th>b</th>
+      <td>2</td>
+      <td>3</td>
+    </tr>
+    <tr>
+      <th>c</th>
+      <td>4</td>
+      <td>5</td>
+    </tr>
+    <tr>
+      <th>d</th>
+      <td>6</td>
+      <td>7</td>
+    </tr>
+  </tbody>
+</table>
""" + + self.assertEqual(result, expected) + + def test_to_html_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({'months': months}) + + def format_func(x): + return x.strftime('%Y-%m') + result = x.to_html(formatters={'months': format_func}) + expected = """\ + + + + + + + + + + + + + + + + + +
+    <tr style="text-align: right;">
+      <th></th>
+      <th>months</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>2016-01</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>2016-02</td>
+    </tr>
+  </tbody>
+</table>
""" + self.assertEqual(result, expected) + + def test_to_html_datetime64_hourformatter(self): + + x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], + format='%H:%M:%S.%f')}) + + def format_func(x): + return x.strftime('%H:%M') + result = x.to_html(formatters={'hod': format_func}) + expected = """\ + + + + + + + + + + + + + + + + + +
+    <tr style="text-align: right;">
+      <th></th>
+      <th>hod</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>10:10</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>12:12</td>
+    </tr>
+  </tbody>
+</table>
""" + self.assertEqual(result, expected) + + def test_to_html_regression_GH6098(self): + df = DataFrame({ + u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')], + u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')], + 'données1': np.random.randn(5), + 'données2': np.random.randn(5)}) + + # it works + df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_() + + def test_to_html_truncate(self): + pytest.skip("unreliable on travis") + index = pd.DatetimeIndex(start='20010101', freq='D', periods=20) + df = DataFrame(index=index, columns=range(20)) + fmt.set_option('display.max_rows', 8) + fmt.set_option('display.max_columns', 4) + result = df._repr_html_() + expected = '''\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01...1819
2001-01-01NaNNaN...NaNNaN
2001-01-02NaNNaN...NaNNaN
2001-01-03NaNNaN...NaNNaN
2001-01-04NaNNaN...NaNNaN
..................
2001-01-17NaNNaN...NaNNaN
2001-01-18NaNNaN...NaNNaN
2001-01-19NaNNaN...NaNNaN
2001-01-20NaNNaN...NaNNaN
+</table>
+<p>20 rows × 20 columns</p>
+</div>
+'''.format(div_style) + if compat.PY2: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_truncate_multi_index(self): + pytest.skip("unreliable on travis") + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = DataFrame(index=arrays, columns=arrays) + fmt.set_option('display.max_rows', 7) + fmt.set_option('display.max_columns', 7) + result = df._repr_html_() + expected = '''\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
barbaz...fooqux
onetwoone...twoonetwo
baroneNaNNaNNaN...NaNNaNNaN
twoNaNNaNNaN...NaNNaNNaN
bazoneNaNNaNNaN...NaNNaNNaN
...........................
footwoNaNNaNNaN...NaNNaNNaN
quxoneNaNNaNNaN...NaNNaNNaN
twoNaNNaNNaN...NaNNaNNaN
+</table>
+<p>8 rows × 8 columns</p>
+</div>
+'''.format(div_style) + if compat.PY2: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_truncate_multi_index_sparse_off(self): + pytest.skip("unreliable on travis") + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = DataFrame(index=arrays, columns=arrays) + fmt.set_option('display.max_rows', 7) + fmt.set_option('display.max_columns', 7) + fmt.set_option('display.multi_sparse', False) + result = df._repr_html_() + expected = '''\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
barbarbaz...fooquxqux
onetwoone...twoonetwo
baroneNaNNaNNaN...NaNNaNNaN
bartwoNaNNaNNaN...NaNNaNNaN
bazoneNaNNaNNaN...NaNNaNNaN
footwoNaNNaNNaN...NaNNaNNaN
quxoneNaNNaNNaN...NaNNaNNaN
quxtwoNaNNaNNaN...NaNNaNNaN
+</table>
+<p>8 rows × 8 columns</p>
+</div>
+'''.format(div_style) + if compat.PY2: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_border(self): + df = DataFrame({'A': [1, 2]}) + result = df.to_html() + assert 'border="1"' in result + + def test_to_html_border_option(self): + df = DataFrame({'A': [1, 2]}) + with pd.option_context('html.border', 0): + result = df.to_html() + self.assertTrue('border="0"' in result) + self.assertTrue('border="0"' in df._repr_html_()) + + def test_to_html_border_zero(self): + df = DataFrame({'A': [1, 2]}) + result = df.to_html(border=0) + self.assertTrue('border="0"' in result) + + def test_to_html(self): + # big mixed + biggie = DataFrame({'A': np.random.randn(200), + 'B': tm.makeStringIndex(200)}, + index=lrange(200)) + + biggie.loc[:20, 'A'] = np.nan + biggie.loc[:20, 'B'] = np.nan + s = biggie.to_html() + + buf = StringIO() + retval = biggie.to_html(buf=buf) + self.assertIsNone(retval) + self.assertEqual(buf.getvalue(), s) + + tm.assertIsInstance(s, compat.string_types) + + biggie.to_html(columns=['B', 'A'], col_space=17) + biggie.to_html(columns=['B', 'A'], + formatters={'A': lambda x: '%.1f' % x}) + + biggie.to_html(columns=['B', 'A'], float_format=str) + biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_html() + + def test_to_html_filename(self): + biggie = DataFrame({'A': np.random.randn(200), + 'B': tm.makeStringIndex(200)}, + index=lrange(200)) + + biggie.loc[:20, 'A'] = np.nan + biggie.loc[:20, 'B'] = np.nan + with tm.ensure_clean('test.html') as path: + biggie.to_html(path) + with open(path, 'r') as f: + s = biggie.to_html() + s2 = f.read() + self.assertEqual(s, s2) + + frame = DataFrame(index=np.arange(200)) + with tm.ensure_clean('test.html') as path: + frame.to_html(path) + with open(path, 'r') as f: + self.assertEqual(frame.to_html(), f.read()) + + def test_to_html_with_no_bold(self): + x = DataFrame({'x': np.random.randn(5)}) + ashtml = x.to_html(bold_rows=False) + self.assertFalse('")]) + + def test_to_html_columns_arg(self): + frame = DataFrame(tm.getSeriesData()) + result = frame.to_html(columns=['A']) + self.assertNotIn('B', result) + + def test_to_html_multiindex(self): + columns = MultiIndex.from_tuples(list(zip(np.arange(2).repeat(2), + np.mod(lrange(4), 2))), + names=['CL0', 'CL1']) + df = DataFrame([list('abcd'), list('efgh')], columns=columns) + result = df.to_html(justify='left') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr><th>CL0</th><th colspan="2">0</th><th colspan="2">1</th></tr>
      <tr><th>CL1</th><th>0</th><th>1</th><th>0</th><th>1</th></tr>
      <tr><th>0</th><td>a</td><td>b</td><td>c</td><td>d</td></tr>
      <tr><th>1</th><td>e</td><td>f</td><td>g</td><td>h</td></tr>
') + + self.assertEqual(result, expected) + + columns = MultiIndex.from_tuples(list(zip( + range(4), np.mod( + lrange(4), 2)))) + df = DataFrame([list('abcd'), list('efgh')], columns=columns) + + result = df.to_html(justify='right') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr><th></th><th>0</th><th>1</th><th>2</th><th>3</th></tr>
      <tr><th></th><th>0</th><th>1</th><th>0</th><th>1</th></tr>
      <tr><th>0</th><td>a</td><td>b</td><td>c</td><td>d</td></tr>
      <tr><th>1</th><td>e</td><td>f</td><td>g</td><td>h</td></tr>
') + + self.assertEqual(result, expected) + + def test_to_html_justify(self): + df = DataFrame({'A': [6, 30000, 2], + 'B': [1, 2, 70000], + 'C': [223442, 0, 1]}, + columns=['A', 'B', 'C']) + result = df.to_html(justify='left') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: left;"><th></th><th>A</th><th>B</th><th>C</th></tr>
      <tr><th>0</th><td>6</td><td>1</td><td>223442</td></tr>
      <tr><th>1</th><td>30000</td><td>2</td><td>0</td></tr>
      <tr><th>2</th><td>2</td><td>70000</td><td>1</td></tr>
') + self.assertEqual(result, expected) + + result = df.to_html(justify='right') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: right;"><th></th><th>A</th><th>B</th><th>C</th></tr>
      <tr><th>0</th><td>6</td><td>1</td><td>223442</td></tr>
      <tr><th>1</th><td>30000</td><td>2</td><td>0</td></tr>
      <tr><th>2</th><td>2</td><td>70000</td><td>1</td></tr>
') + self.assertEqual(result, expected) + + def test_to_html_index(self): + index = ['foo', 'bar', 'baz'] + df = DataFrame({'A': [1, 2, 3], + 'B': [1.2, 3.4, 5.6], + 'C': ['one', 'two', np.nan]}, + columns=['A', 'B', 'C'], + index=index) + expected_with_index = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: right;"><th></th><th>A</th><th>B</th><th>C</th></tr>
      <tr><th>foo</th><td>1</td><td>1.2</td><td>one</td></tr>
      <tr><th>bar</th><td>2</td><td>3.4</td><td>two</td></tr>
      <tr><th>baz</th><td>3</td><td>5.6</td><td>NaN</td></tr>
') + self.assertEqual(df.to_html(), expected_with_index) + + expected_without_index = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: right;"><th>A</th><th>B</th><th>C</th></tr>
      <tr><td>1</td><td>1.2</td><td>one</td></tr>
      <tr><td>2</td><td>3.4</td><td>two</td></tr>
      <tr><td>3</td><td>5.6</td><td>NaN</td></tr>
') + result = df.to_html(index=False) + for i in index: + self.assertNotIn(i, result) + self.assertEqual(result, expected_without_index) + df.index = Index(['foo', 'bar', 'baz'], name='idx') + expected_with_index = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: right;"><th></th><th>A</th><th>B</th><th>C</th></tr>
      <tr><th>idx</th><th></th><th></th><th></th></tr>
      <tr><th>foo</th><td>1</td><td>1.2</td><td>one</td></tr>
      <tr><th>bar</th><td>2</td><td>3.4</td><td>two</td></tr>
      <tr><th>baz</th><td>3</td><td>5.6</td><td>NaN</td></tr>
') + self.assertEqual(df.to_html(), expected_with_index) + self.assertEqual(df.to_html(index=False), expected_without_index) + + tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] + df.index = MultiIndex.from_tuples(tuples) + + expected_with_index = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: right;"><th></th><th></th><th>A</th><th>B</th><th>C</th></tr>
      <tr><th>foo</th><th>car</th><td>1</td><td>1.2</td><td>one</td></tr>
      <tr><th></th><th>bike</th><td>2</td><td>3.4</td><td>two</td></tr>
      <tr><th>bar</th><th>car</th><td>3</td><td>5.6</td><td>NaN</td></tr>
') + self.assertEqual(df.to_html(), expected_with_index) + + result = df.to_html(index=False) + for i in ['foo', 'bar', 'car', 'bike']: + self.assertNotIn(i, result) + # must be the same result as normal index + self.assertEqual(result, expected_without_index) + + df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2']) + expected_with_index = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
      <tr style="text-align: right;"><th></th><th></th><th>A</th><th>B</th><th>C</th></tr>
      <tr><th>idx1</th><th>idx2</th><th></th><th></th><th></th></tr>
      <tr><th>foo</th><th>car</th><td>1</td><td>1.2</td><td>one</td></tr>
      <tr><th></th><th>bike</th><td>2</td><td>3.4</td><td>two</td></tr>
      <tr><th>bar</th><th>car</th><td>3</td><td>5.6</td><td>NaN</td></tr>
') + self.assertEqual(df.to_html(), expected_with_index) + self.assertEqual(df.to_html(index=False), expected_without_index) + + def test_to_html_with_classes(self): + df = DataFrame() + result = df.to_html(classes="sortable draggable") + expected = dedent(""" + + + + + + + + + +
+ + """).strip() + self.assertEqual(result, expected) + + result = df.to_html(classes=["sortable", "draggable"]) + self.assertEqual(result, expected) + + def test_to_html_no_index_max_rows(self): + # GH https://github.com/pandas-dev/pandas/issues/14998 + df = DataFrame({"A": [1, 2, 3, 4]}) + result = df.to_html(index=False, max_rows=1) + expected = dedent("""\ + + + + + + + + + + + +
      <tr style="text-align: right;"><th>A</th></tr>
      <tr><td>1</td></tr>
""") + self.assertEqual(result, expected) diff --git a/pandas/tests/formats/test_to_latex.py b/pandas/tests/formats/test_to_latex.py new file mode 100644 index 0000000000000..89e18e1cec06e --- /dev/null +++ b/pandas/tests/formats/test_to_latex.py @@ -0,0 +1,351 @@ +from datetime import datetime + +import pytest + +import pandas as pd +from pandas import DataFrame, compat +from pandas.util import testing as tm +from pandas.compat import u +import codecs + + +@pytest.fixture +def frame(): + return DataFrame(tm.getSeriesData()) + + +class TestToLatex(object): + + def test_to_latex_filename(self, frame): + with tm.ensure_clean('test.tex') as path: + frame.to_latex(path) + + with open(path, 'r') as f: + assert frame.to_latex() == f.read() + + # test with utf-8 and encoding option (GH 7061) + df = DataFrame([[u'au\xdfgangen']]) + with tm.ensure_clean('test.tex') as path: + df.to_latex(path, encoding='utf-8') + with codecs.open(path, 'r', encoding='utf-8') as f: + assert df.to_latex() == f.read() + + # test with utf-8 without encoding option + if compat.PY3: # python3: pandas default encoding is utf-8 + with tm.ensure_clean('test.tex') as path: + df.to_latex(path) + with codecs.open(path, 'r', encoding='utf-8') as f: + assert df.to_latex() == f.read() + else: + # python2 default encoding is ascii, so an error should be raised + with tm.ensure_clean('test.tex') as path: + with pytest.raises(UnicodeEncodeError): + df.to_latex(path) + + def test_to_latex(self, frame): + # it works! + frame.to_latex() + + df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + withindex_result = df.to_latex() + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(index=False) + withoutindex_expected = r"""\begin{tabular}{rl} +\toprule + a & b \\ +\midrule + 1 & b1 \\ + 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withoutindex_result == withoutindex_expected + + def test_to_latex_format(self, frame): + # GH Bug #9402 + frame.to_latex(column_format='ccc') + + df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + withindex_result = df.to_latex(column_format='ccc') + withindex_expected = r"""\begin{tabular}{ccc} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + def test_to_latex_with_formatters(self): + df = DataFrame({'int': [1, 2, 3], + 'float': [1.0, 2.0, 3.0], + 'object': [(1, 2), True, False], + 'datetime64': [datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3)]}) + + formatters = {'int': lambda x: '0x%x' % x, + 'float': lambda x: '[% 4.1f]' % x, + 'object': lambda x: '-%s-' % str(x), + 'datetime64': lambda x: x.strftime('%Y-%m'), + '__index__': lambda x: 'index: %s' % x} + result = df.to_latex(formatters=dict(formatters)) + + expected = r"""\begin{tabular}{llrrl} +\toprule +{} & datetime64 & float & int & object \\ +\midrule +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + def test_to_latex_multiindex(self): + df = DataFrame({('x', 'y'): ['a']}) + result = df.to_latex() + expected = r"""\begin{tabular}{ll} +\toprule +{} & x \\ +{} & y \\ +\midrule +0 & a \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + result = df.T.to_latex() + expected = 
r"""\begin{tabular}{lll} +\toprule + & & 0 \\ +\midrule +x & y & a \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + df = DataFrame.from_dict({ + ('c1', 0): pd.Series(dict((x, x) for x in range(4))), + ('c1', 1): pd.Series(dict((x, x + 4) for x in range(4))), + ('c2', 0): pd.Series(dict((x, x) for x in range(4))), + ('c2', 1): pd.Series(dict((x, x + 4) for x in range(4))), + ('c3', 0): pd.Series(dict((x, x) for x in range(4))), + }).T + result = df.to_latex() + expected = r"""\begin{tabular}{llrrrr} +\toprule + & & 0 & 1 & 2 & 3 \\ +\midrule +c1 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ +c2 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ +c3 & 0 & 0 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + # GH 10660 + df = pd.DataFrame({'a': [0, 0, 1, 1], + 'b': list('abab'), + 'c': [1, 2, 3, 4]}) + result = df.set_index(['a', 'b']).to_latex() + expected = r"""\begin{tabular}{llr} +\toprule + & & c \\ +a & b & \\ +\midrule +0 & a & 1 \\ + & b & 2 \\ +1 & a & 3 \\ + & b & 4 \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + result = df.groupby('a').describe().to_latex() + expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & ' + ' & & & & & & ' + '\\\\\n{} & count & mean & std & min & 25\\% & ' + '50\\% & 75\\% & max \\\\\na & & & ' + ' & & & & & \\\\\n\\midrule\n0 ' + '& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 ' + '& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 ' + '& 3.5 & 3.75 & 4.0 ' + '\\\\\n\\bottomrule\n\\end{tabular}\n') + + assert result == expected + + def test_to_latex_escape(self): + a = 'a' + b = 'b' + + test_dict = {u('co^l1'): {a: "a", + b: "b"}, + u('co$e^x$'): {a: "a", + b: "b"}} + + unescaped_result = DataFrame(test_dict).to_latex(escape=False) + escaped_result = DataFrame(test_dict).to_latex( + ) # default: escape=True + + unescaped_expected = r'''\begin{tabular}{lll} +\toprule +{} & co$e^x$ & co^l1 \\ +\midrule +a & a & a \\ +b & b & b \\ +\bottomrule +\end{tabular} +''' + + escaped_expected = r'''\begin{tabular}{lll} +\toprule +{} & co\$e\textasciicircumx\$ & co\textasciicircuml1 \\ +\midrule +a & a & a \\ +b & b & b \\ +\bottomrule +\end{tabular} +''' + + assert unescaped_result == unescaped_expected + assert escaped_result == escaped_expected + + def test_to_latex_longtable(self, frame): + frame.to_latex(longtable=True) + + df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + withindex_result = df.to_latex(longtable=True) + withindex_expected = r"""\begin{longtable}{lrl} +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(index=False, longtable=True) + withoutindex_expected = r"""\begin{longtable}{rl} +\toprule + a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot + 1 & b1 \\ + 2 & b2 \\ +\end{longtable} +""" + + assert withoutindex_result == withoutindex_expected + + def test_to_latex_escape_special_chars(self): + special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^', + '\\'] + df = DataFrame(data=special_characters) + observed = df.to_latex() + expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & \& \\ +1 & \% \\ +2 & \$ \\ +3 & \# \\ +4 & \_ \\ +5 & \{ \\ +6 & \} \\ +7 & \textasciitilde \\ +8 & \textasciicircum \\ +9 & 
\textbackslash \\ +\bottomrule +\end{tabular} +""" + + assert observed == expected + + def test_to_latex_no_header(self): + # GH 7124 + df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + withindex_result = df.to_latex(header=False) + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(index=False, header=False) + withoutindex_expected = r"""\begin{tabular}{rl} +\toprule + 1 & b1 \\ + 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withoutindex_result == withoutindex_expected + + def test_to_latex_decimal(self, frame): + # GH 12031 + frame.to_latex() + + df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']}) + withindex_result = df.to_latex(decimal=',') + + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1,0 & b1 \\ +1 & 2,1 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected From 7cef7677cb37702a49100a0615087f7f7bba0e68 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Mar 2017 07:50:49 -0500 Subject: [PATCH 213/338] CLN: remove deprecated irow, icol, iget, iget_value (GH10711) xref https://github.com/pandas-dev/pandas/issues/6581 Author: Joris Van den Bossche Closes #15547 from jorisvandenbossche/remove-irow-icol and squashes the following commits: 06ea1bb [Joris Van den Bossche] CLN: remove deprecated irow, icol, iget, iget_value (GH10711) --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/core/frame.py | 25 -------------------- pandas/core/groupby.py | 10 -------- pandas/core/series.py | 25 -------------------- pandas/tests/frame/test_indexing.py | 23 ++++-------------- pandas/tests/frame/test_nonunique_indexes.py | 2 +- pandas/tests/groupby/test_groupby.py | 16 +------------ pandas/tests/series/test_indexing.py | 16 ++----------- pandas/tests/sparse/test_frame.py | 3 +-- 9 files changed, 12 insertions(+), 110 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6e9dfb92dfd90..dc8420080b50d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -548,6 +548,8 @@ Removal of prior version deprecations/changes - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`) - ``pandas.stats.fama_macbeth``, ``pandas.stats.ols``, ``pandas.stats.plm`` and ``pandas.stats.var``, as well as the top-level ``pandas.fama_macbeth`` and ``pandas.ols`` routines are removed. Similar functionaility can be found in the `statsmodels `__ package. (:issue:`11898`) - ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:``) +- The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed + in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). .. _whatsnew_0200.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 021ce59e3402b..0d14f00bee508 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1916,23 +1916,6 @@ def set_value(self, index, col, value, takeable=False): return self - def irow(self, i, copy=False): - """ - DEPRECATED. Use ``.iloc[i]`` instead - """ - - warnings.warn("irow(i) is deprecated. Please use .iloc[i]", - FutureWarning, stacklevel=2) - return self._ixs(i, axis=0) - - def icol(self, i): - """ - DEPRECATED. Use ``.iloc[:, i]`` instead - """ - warnings.warn("icol(i) is deprecated. 
Please use .iloc[:,i]", - FutureWarning, stacklevel=2) - return self._ixs(i, axis=1) - def _ixs(self, i, axis=0): """ i : int, slice, or sequence of integers @@ -2007,14 +1990,6 @@ def _ixs(self, i, axis=0): return result - def iget_value(self, i, j): - """ - DEPRECATED. Use ``.iat[i, j]`` instead - """ - warnings.warn("iget_value(i, j) is deprecated. Please use .iat[i, j]", - FutureWarning, stacklevel=2) - return self.iat[i, j] - def __getitem__(self, key): key = com._apply_if_callable(key, self) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 381a8edcb5192..578c334781d15 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1004,16 +1004,6 @@ class GroupBy(_GroupBy): """ _apply_whitelist = _common_apply_whitelist - def irow(self, i): - """ - DEPRECATED. Use ``.nth(i)`` instead - """ - - # 10177 - warnings.warn("irow(i) is deprecated. Please use .nth(i)", - FutureWarning, stacklevel=2) - return self.nth(i) - @Substitution(name='groupby') @Appender(_doc_template) def count(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index ffe1be26fda54..1114590421fd8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -875,31 +875,6 @@ def reshape(self, *args, **kwargs): return self._values.reshape(shape, **kwargs) - def iget_value(self, i, axis=0): - """ - DEPRECATED. Use ``.iloc[i]`` or ``.iat[i]`` instead - """ - warnings.warn("iget_value(i) is deprecated. Please use .iloc[i] or " - ".iat[i]", FutureWarning, stacklevel=2) - return self._ixs(i) - - def iget(self, i, axis=0): - """ - DEPRECATED. Use ``.iloc[i]`` or ``.iat[i]`` instead - """ - - warnings.warn("iget(i) is deprecated. Please use .iloc[i] or .iat[i]", - FutureWarning, stacklevel=2) - return self._ixs(i) - - def irow(self, i, axis=0): - """ - DEPRECATED. Use ``.iloc[i]`` or ``.iat[i]`` instead - """ - warnings.warn("irow(i) is deprecated. 
Please use .iloc[i] or .iat[i]", - FutureWarning, stacklevel=2) - return self._ixs(i) - def get_value(self, label, takeable=False): """ Quickly retrieve single value at passed index label diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 18fb17b98570a..36c39ffba70b3 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -1761,13 +1761,9 @@ def test_single_element_ix_dont_upcast(self): result = df.loc[[0], "b"] assert_series_equal(result, expected) - def test_irow(self): + def test_iloc_row(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2)) - # 10711, deprecated - with tm.assert_produces_warning(FutureWarning): - df.irow(1) - result = df.iloc[1] exp = df.loc[2] assert_series_equal(result, exp) @@ -1795,14 +1791,10 @@ def f(): expected = df.reindex(df.index[[1, 2, 4, 6]]) assert_frame_equal(result, expected) - def test_icol(self): + def test_iloc_col(self): df = DataFrame(np.random.randn(4, 10), columns=lrange(0, 20, 2)) - # 10711, deprecated - with tm.assert_produces_warning(FutureWarning): - df.icol(1) - result = df.iloc[:, 1] exp = df.loc[:, 2] assert_series_equal(result, exp) @@ -1828,8 +1820,7 @@ def f(): expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) assert_frame_equal(result, expected) - def test_irow_icol_duplicates(self): - # 10711, deprecated + def test_iloc_duplicates(self): df = DataFrame(np.random.rand(3, 3), columns=list('ABC'), index=list('aab')) @@ -1874,16 +1865,12 @@ def test_irow_icol_duplicates(self): expected = df.take([0], axis=1) assert_frame_equal(result, expected) - def test_icol_sparse_propegate_fill_value(self): + def test_iloc_sparse_propegate_fill_value(self): from pandas.sparse.api import SparseDataFrame df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999) self.assertTrue(len(df['A'].sp_values) == len(df.iloc[:, 0].sp_values)) - def test_iget_value(self): - # 10711 deprecated - - with tm.assert_produces_warning(FutureWarning): - self.frame.iget_value(0, 0) + def test_iat(self): for i, row in enumerate(self.frame.index): for j, col in enumerate(self.frame.columns): diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index d6bcb85e01910..bb7c7c2bd012d 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -429,7 +429,7 @@ def test_columns_with_dups(self): self.assertEqual(len(df._data._blknos), len(df.columns)) self.assertEqual(len(df._data._blklocs), len(df.columns)) - # testing iget + # testing iloc for i in range(len(df.columns)): df.iloc[:, i] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 59cbcab23b9e7..74e8c6c45946f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3828,20 +3828,6 @@ def test_groupby_whitelist(self): 'mad', 'std', 'var', 'sem'] AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] - def test_groupby_whitelist_deprecations(self): - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 10 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - - # 10711 deprecated - with tm.assert_produces_warning(FutureWarning): - df.groupby('letters').irow(0) - with tm.assert_produces_warning(FutureWarning): - df.groupby('letters').floats.irow(0) - def test_regression_whitelist_methods(self): # GH6944 @@ 
-3917,7 +3903,7 @@ def test_tab_completion(self): 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile', + 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 8a2cc53b42938..bb77550e01f11 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -164,22 +164,10 @@ def test_getitem_get(self): result = s.get(None) self.assertIsNone(result) - def test_iget(self): + def test_iloc(self): s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - # 10711, deprecated - with tm.assert_produces_warning(FutureWarning): - s.iget(1) - - # 10711, deprecated - with tm.assert_produces_warning(FutureWarning): - s.irow(1) - - # 10711, deprecated - with tm.assert_produces_warning(FutureWarning): - s.iget_value(1) - for i in range(len(s)): result = s.iloc[i] exp = s[s.index[i]] @@ -199,7 +187,7 @@ def test_iget(self): expected = s.reindex(s.index[[0, 2, 3, 4, 5]]) assert_series_equal(result, expected) - def test_iget_nonunique(self): + def test_iloc_nonunique(self): s = Series([0, 1, 2], index=[0, 1, 0]) self.assertEqual(s.iloc[2], 2) diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index e3b865492c043..b2283364a1631 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -389,8 +389,7 @@ def test_getitem(self): self.assertRaises(Exception, sdf.__getitem__, ['a', 'd']) - def test_icol(self): - # 10711 deprecated + def test_iloc(self): # 2227 result = self.frame.iloc[:, 0] From 5c55b7ce36c771f3be7c6ecce11aa6fa08345fac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Mar 2017 08:00:14 -0500 Subject: [PATCH 214/338] DOC: revert gbq doc-strings to be in-line rather than wrapped --- pandas/core/frame.py | 35 ++++++++++++-------- pandas/io/gbq.py | 76 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0d14f00bee508..ff5dcb3f544ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -78,7 +78,7 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.util.decorators import (deprecate_kwarg, Appender, - Substitution, docstring_wrapper) + Substitution) from pandas.util.validators import validate_bool_kwarg from pandas.tseries.period import PeriodIndex @@ -908,7 +908,26 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, verbose=True, reauth=False, if_exists='fail', private_key=None): """Write a DataFrame to a Google BigQuery table. - THIS IS AN EXPERIMENTAL LIBRARY + The main method a user calls to export pandas DataFrame contents to + Google BigQuery table. + + Google BigQuery API Client Library v2 for Python is used. + Documentation is available `here + `__ + + Authentication to the Google BigQuery service is via OAuth 2.0. + + - If "private_key" is not provided: + + By default "application default credentials" are used. + + If default application credentials are not found or are restrictive, + user account credentials are used. 
In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + + - If "private_key" is provided: + + Service account credentials will be used to authenticate. Parameters ---------- @@ -933,8 +952,6 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, Service account private key in JSON format. Can be file path or string contents. This is useful for remote server authentication (eg. jupyter iPython notebook on remote host) - - .. versionadded:: 0.17.0 """ from pandas.io import gbq @@ -5402,16 +5419,6 @@ def combineMult(self, other): _EMPTY_SERIES = Series([]) -# patch in the doc-string for to_gbq -# and bind this method -def _f(): - from pandas.io.gbq import _try_import - return _try_import().to_gbq.__doc__ - - -DataFrame.to_gbq = docstring_wrapper(DataFrame.to_gbq, _f) - - def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): """ Segregate Series based on type and coerce into matrices. diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 3407f51af5e83..9cfb27a92bfef 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1,7 +1,5 @@ """ Google BigQuery support """ -from pandas.util.decorators import docstring_wrapper - def _try_import(): # since pandas is a dependency of pandas-gbq @@ -25,6 +23,72 @@ def _try_import(): def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, dialect='legacy', **kwargs): + r"""Load data from Google BigQuery. + + The main method a user calls to execute a Query in Google BigQuery + and read results into a pandas DataFrame. + + Google BigQuery API Client Library v2 for Python is used. + Documentation is available `here + `__ + + Authentication to the Google BigQuery service is via OAuth 2.0. + + - If "private_key" is not provided: + + By default "application default credentials" are used. + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + + - If "private_key" is provided: + + Service account credentials will be used to authenticate. + + Parameters + ---------- + query : str + SQL-Like Query to return data values + project_id : str + Google BigQuery Account project ID. + index_col : str (optional) + Name of result column to use for index in results DataFrame + col_order : list(str) (optional) + List of BigQuery column names in the desired order for results + DataFrame + reauth : boolean (default False) + Force Google BigQuery to reauthenticate the user. This is useful + if multiple accounts are used. + verbose : boolean (default True) + Verbose output + private_key : str (optional) + Service account private key in JSON format. Can be file path + or string contents. This is useful for remote server + authentication (eg. jupyter iPython notebook on remote host) + + dialect : {'legacy', 'standard'}, default 'legacy' + 'legacy' : Use BigQuery's legacy SQL dialect. + 'standard' : Use BigQuery's standard SQL (beta), which is + compliant with the SQL 2011 standard. For more information + see `BigQuery SQL Reference + `__ + + **kwargs : Arbitrary keyword arguments + configuration (dict): query config parameters for job processing. 
+        For example:
+
+            configuration = {'query': {'useQueryCache': False}}
+
+        For more information see `BigQuery SQL Reference
+        `__
+
+    Returns
+    -------
+    df: DataFrame
+        DataFrame representing results of query
+
+    """
     pandas_gbq = _try_import()
     return pandas_gbq.read_gbq(
         query, project_id=project_id,
@@ -35,10 +99,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
         **kwargs)


-read_gbq = docstring_wrapper(read_gbq,
-                             lambda: _try_import().read_gbq.__doc__)
-
-
 def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
            verbose=True, reauth=False, if_exists='fail', private_key=None):
     pandas_gbq = _try_import()
@@ -46,7 +106,3 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
                          chunksize=chunksize,
                          verbose=verbose, reauth=reauth,
                          if_exists=if_exists, private_key=private_key)
-
-
-to_gbq = docstring_wrapper(to_gbq,
-                           lambda: _try_import().to_gbq.__doc__)

From 8ffed0e750f735f5ca7993d953ed96b05469274b Mon Sep 17 00:00:00 2001
From: Ben Thayer
Date: Thu, 2 Mar 2017 08:16:48 -0500
Subject: [PATCH 215/338] ENH: Added FrozenList difference setop

closes #15475

Author: Ben Thayer
Author: bthayer2365

Closes #15506 from bthayer2365/frozen-index and squashes the following commits:

428a1b3 [Ben Thayer] Added __iadd__ test, fixed whatsnew
84ba405 [Ben Thayer] Merge branch 'master' of github.com:pandas-dev/pandas into frozen-index
8dbde1e [Ben Thayer] Rebased to upstream/master
6f6c140 [Ben Thayer] Added docstrings, depricated __iadd__, changed __add__ to use self.union()
66b3b91 [Ben Thayer] Fixed issue number
3d6cee5 [Ben Thayer] Depricated __add__ in favor of union
ccd75c7 [Ben Thayer] Changed __sub__ to difference
cd7de26 [Ben Thayer] Added versionadded tag in docs and renamed test_inplace to test_inplace_add for consistency
0ea8d21 [Ben Thayer] Added __isub__ and groupby example to docs
79dd958 [Ben Thayer] Updated whatsnew to reflect changes
0fc7e19 [Ben Thayer] Removed whitespace
73564ab [Ben Thayer] Added FrozenList subtraction
fee7a7d [bthayer2365] Merge branch 'master' into frozen-index
6a2b48d [Ben Thayer] Added docstrings, depricated __iadd__, changed __add__ to use self.union()
2ab85cb [Ben Thayer] Fixed issue number
cb95089 [Ben Thayer] Depricated __add__ in favor of union
2e43849 [Ben Thayer] Changed __sub__ to difference
fdcfbbb [Ben Thayer] Added versionadded tag in docs and renamed test_inplace to test_inplace_add for consistency
2fad2f7 [Ben Thayer] Added __isub__ and groupby example to docs
cd73faa [Ben Thayer] Updated whatsnew to reflect changes
f6381a8 [Ben Thayer] Removed whitespace
ada7cda [Ben Thayer] Added FrozenList subtraction
---
 doc/source/groupby.rst              | 10 +++++++++
 doc/source/whatsnew/v0.20.0.txt     |  2 ++
 pandas/indexes/frozen.py            | 24 ++++++++++++++++++---
 pandas/tests/indexes/test_frozen.py | 33 +++++++++++++++++++++--------
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 8484ccd69a983..2d406de7c0c9b 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -126,6 +126,16 @@ We could naturally group by either the ``A`` or ``B`` columns or both:
    grouped = df.groupby('A')
    grouped = df.groupby(['A', 'B'])

+.. versionadded:: 0.20
+
+If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all
+but the specified columns.
+
+.. ipython:: python
+
+   df2 = df.set_index(['A', 'B'])
+   grouped = df2.groupby(level=df2.index.names.difference(['B']))
+
 These will split the DataFrame on its index (rows).
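A quick, self-contained sketch of the pattern documented just above (the frame
here is a made-up stand-in; ``index.names`` is a ``FrozenList``, whose new
``difference()`` method is introduced by this patch):

.. code-block:: python

   import pandas as pd

   df2 = pd.DataFrame({'A': ['foo', 'foo', 'bar'],
                       'B': ['one', 'two', 'one'],
                       'C': [1, 2, 3]}).set_index(['A', 'B'])
   # difference(['B']) leaves FrozenList(['A']), so this groups on the
   # 'A' level only
   df2.groupby(level=df2.index.names.difference(['B'])).sum()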
We could also split by the columns: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index dc8420080b50d..cc33a4a7ce6c6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -28,6 +28,7 @@ New features - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. - ``.str.replace`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``FrozenList`` has gained the ``.difference()`` setop method (:issue:`15475`) @@ -534,6 +535,7 @@ Deprecations - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`) - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explict imports (:issue:`15358`) - ``Series/DataFrame/Panel.consolidate()`` been deprecated as a public method. (:issue:`15483`) +- ``FrozenList`` addition (new object and inplace) have been deprecated in favor of the ``.union()`` method. (:issue: `15475`) .. _whatsnew_0200.prior_deprecations: diff --git a/pandas/indexes/frozen.py b/pandas/indexes/frozen.py index e043ba64bbad7..47e2557333ec7 100644 --- a/pandas/indexes/frozen.py +++ b/pandas/indexes/frozen.py @@ -13,6 +13,8 @@ from pandas.types.cast import _coerce_indexer_dtype from pandas.formats.printing import pprint_thing +import warnings + class FrozenList(PandasObject, list): @@ -25,11 +27,14 @@ class FrozenList(PandasObject, list): # typechecks def __add__(self, other): + warnings.warn("__add__ is deprecated, use union(...)", FutureWarning) + return self.union(other) + + def __iadd__(self, other): + warnings.warn("__iadd__ is deprecated, use union(...)", FutureWarning) if isinstance(other, tuple): other = list(other) - return self.__class__(super(FrozenList, self).__add__(other)) - - __iadd__ = __add__ + return super(FrozenList, self).__iadd__(other) # Python 2 compat def __getslice__(self, i, j): @@ -80,6 +85,19 @@ def __repr__(self): __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled + def union(self, other): + """Returns a FrozenList with other concatenated to the end of self""" + if isinstance(other, tuple): + other = list(other) + return self.__class__(super(FrozenList, self).__add__(other)) + + def difference(self, other): + """Returns a FrozenList with the same elements as self, but with elements + that are also in other removed.""" + other = set(other) + temp = [x for x in self if x not in other] + return self.__class__(temp) + class FrozenNDArray(PandasObject, np.ndarray): diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index a82409fbf9513..a5fbf066adc83 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -15,20 +15,35 @@ def setUp(self): self.klass = FrozenList def test_add(self): - result = self.container + (1, 2, 3) + q = FrozenList([1]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + q = q + [2, 3] + expected = FrozenList([1, 2, 3]) + self.check_result(q, expected) + + def test_iadd(self): + q = FrozenList([1]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + q += [2, 3] + expected = FrozenList([1, 2, 3]) + self.check_result(q, expected) + + def test_union(self): + result = 
self.container.union((1, 2, 3)) expected = FrozenList(self.lst + [1, 2, 3]) self.check_result(result, expected) - result = (1, 2, 3) + self.container - expected = FrozenList([1, 2, 3] + self.lst) + def test_difference(self): + result = self.container.difference([2]) + expected = FrozenList([1, 3, 4, 5]) self.check_result(result, expected) - def test_inplace(self): - q = r = self.container - q += [5] - self.check_result(q, self.lst + [5]) - # other shouldn't be mutated - self.check_result(r, self.lst) + def test_difference_dupe(self): + result = FrozenList([1, 2, 3, 2]).difference([2]) + expected = FrozenList([1, 3]) + self.check_result(result, expected) class TestFrozenNDArray(CheckImmutable, CheckStringMixin, tm.TestCase): From ab8ad9f2f87c03614f32257578222f6402b5a108 Mon Sep 17 00:00:00 2001 From: Amol Kahat Date: Thu, 2 Mar 2017 08:33:42 -0500 Subject: [PATCH 216/338] BUG: Fix index for datetime64 conversion. Fixes #13937 closes #13937 Author: Amol Kahat Closes #14446 from amolkahat/bug_fixes and squashes the following commits: 3806983 [Amol Kahat] Modify test cases. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 4 ++-- pandas/tests/frame/test_convert_to.py | 24 ++++++++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cc33a4a7ce6c6..dca4f890e496b 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -612,6 +612,7 @@ Bug Fixes - Bug in ``GroupBy.get_group()`` failing with a categorical grouper (:issue:`15155`) - Bug in ``pandas.tools.utils.cartesian_product()`` with large input can cause overflow on windows (:issue:`15265`) +- Bug in ``DataFrame.to_records()`` with converting a ``DatetimeIndex`` with a timezone (:issue:`13937`) - Bug in ``.groupby(...).rolling(...)`` when ``on`` is specified and using a ``DatetimeIndex`` (:issue:`15130`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ff5dcb3f544ec..26a0a91094e7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,7 +36,7 @@ is_object_dtype, is_extension_type, is_datetimetz, - is_datetime64_dtype, + is_datetime64_any_dtype, is_datetime64tz_dtype, is_bool_dtype, is_integer_dtype, @@ -1103,7 +1103,7 @@ def to_records(self, index=True, convert_datetime64=True): y : recarray """ if index: - if is_datetime64_dtype(self.index) and convert_datetime64: + if is_datetime64_any_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: if isinstance(self.index, MultiIndex): diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 0dde113dd5147..8323d5ed9069f 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -from __future__ import print_function - -from numpy import nan +import pytest import numpy as np from pandas import compat @@ -10,7 +8,6 @@ date_range) import pandas.util.testing as tm - from pandas.tests.frame.common import TestData @@ -41,13 +38,13 @@ def test_to_dict(self): recons_data = DataFrame(test_data).to_dict("sp") expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'], - 'data': [[1.0, '1'], [2.0, '2'], [nan, '3']]} + 'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]} tm.assert_dict_equal(recons_data, expected_split) recons_data = DataFrame(test_data).to_dict("r") expected_records = [{'A': 1.0, 'B': '1'}, {'A': 2.0, 'B': '2'}, - {'A': nan, 'B': '3'}] + {'A': np.nan, 'B': '3'}] 
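# A hedged sketch of the to_records() change in this patch (GH 13937): with
# is_datetime64_any_dtype, a tz-aware DatetimeIndex now takes the same
# to_pydatetime() conversion path as a naive one. The frame below is
# illustrative only.
import pandas as pd

df = pd.DataFrame({'A': [1, 2]},
                  index=pd.date_range('2016-01-01', periods=2,
                                      freq='S', tz='US/Eastern'))
rec = df.to_records()
rec['index']  # index exported via to_pydatetime(), preserving the tz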
tm.assertIsInstance(recons_data, list) self.assertEqual(len(recons_data), 3) for l, r in zip(recons_data, expected_records): @@ -192,3 +189,18 @@ def test_to_records_with_unicode_column_names(self): "formats": [' Date: Thu, 2 Mar 2017 09:23:58 -0500 Subject: [PATCH 217/338] TST: remove deprecated usages of FrozenList.__add__ from test code xref #15506 --- pandas/core/panel.py | 6 +++--- pandas/core/reshape.py | 6 +++--- pandas/core/strings.py | 2 +- pandas/tests/groupby/test_value_counts.py | 2 +- pandas/tools/concat.py | 2 +- test_fast.sh | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 4a6c6cf291316..c5ea513223dce 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -940,9 +940,9 @@ def construct_index_parts(idx, major=True): minor_labels, minor_levels, minor_names = construct_index_parts( self.minor_axis, major=False) - levels = major_levels + minor_levels - labels = major_labels + minor_labels - names = major_names + minor_names + levels = list(major_levels) + list(minor_levels) + labels = list(major_labels) + list(minor_labels) + names = list(major_names) + list(minor_names) index = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 87cb088c2e91e..faad6c500a21f 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -216,8 +216,8 @@ def get_new_columns(self): width = len(self.value_columns) propagator = np.repeat(np.arange(width), stride) if isinstance(self.value_columns, MultiIndex): - new_levels = self.value_columns.levels + (self.removed_level,) - new_names = self.value_columns.names + (self.removed_name,) + new_levels = self.value_columns.levels.union((self.removed_level,)) + new_names = self.value_columns.names.union((self.removed_name,)) new_labels = [lab.take(propagator) for lab in self.value_columns.labels] @@ -806,7 +806,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, for col in id_vars: mdata[col] = np.tile(frame.pop(col).values, K) - mcolumns = id_vars + var_name + [value_name] + mcolumns = list(id_vars) + list(var_name) + list([value_name]) mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ac8d1db6a0bf3..51016926d6909 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -787,7 +787,7 @@ def str_extractall(arr, pat, flags=0): if 0 < len(index_list): from pandas import MultiIndex index = MultiIndex.from_tuples( - index_list, names=arr.index.names + ["match"]) + index_list, names=arr.index.names.union(["match"])) else: index = None result = arr._constructor_expanddim(match_list, index=index, diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 801d0da070112..ff01df2693c7c 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -28,7 +28,7 @@ def check_value_counts(df, keys, bins): gr = df.groupby(keys, sort=isort) right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] + right.index.names = right.index.names[:-1].union(['3rd']) # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 diff --git a/pandas/tools/concat.py b/pandas/tools/concat.py index 6405106118472..ae9d7af9d98ff 100644 --- a/pandas/tools/concat.py +++ b/pandas/tools/concat.py @@ 
-574,7 +574,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): " not have the same number of levels") # also copies - names = names + _get_consensus_names(indexes) + names = list(names) + list(_get_consensus_names(indexes)) return MultiIndex(levels=levels, labels=label_list, names=names, verify_integrity=False) diff --git a/test_fast.sh b/test_fast.sh index 30ac7f84cbe8b..f22ab73277e8b 100755 --- a/test_fast.sh +++ b/test_fast.sh @@ -5,4 +5,4 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -pytest pandas --skip-slow --skip-network -m "not single" -n 4 +pytest pandas --skip-slow --skip-network -m "not single" -n 4 $@ From 64172463d7c9fd12a0448d29e7df776be7bf89a9 Mon Sep 17 00:00:00 2001 From: manuels Date: Thu, 2 Mar 2017 18:20:40 -0500 Subject: [PATCH 218/338] Make Series.map() documentation a bit more verbose Author: manuels Closes #15235 from manuels/patch-1 and squashes the following commits: c5113f2 [manuels] Make Series.map() documentation a bit more verbose --- pandas/core/series.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1114590421fd8..626a4a81193cc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2089,13 +2089,15 @@ def map(self, arg, na_action=None): Examples -------- - Map inputs to outputs + Map inputs to outputs (both of type `Series`) + >>> x = pd.Series([1,2,3], index=['one', 'two', 'three']) >>> x one 1 two 2 three 3 + >>> y = pd.Series(['foo', 'bar', 'baz'], index=[1,2,3]) >>> y 1 foo 2 bar @@ -2106,6 +2108,16 @@ def map(self, arg, na_action=None): two bar three baz + Mapping a dictionary keys on the index labels works similar as + with a `Series`: + + >>> z = {1: 'A', 2: 'B', 3: 'C'} + + >>> x.map(z) + one A + two B + three C + Use na_action to control whether NA values are affected by the mapping function. @@ -2127,6 +2139,11 @@ def map(self, arg, na_action=None): 3 NaN dtype: object + See Also + -------- + Series.apply: For applying more complex functions on a Series + DataFrame.apply: Apply a function row-/column-wise + DataFrame.applymap: Apply a function elementwise on a whole DataFrame """ if is_extension_type(self.dtype): From 9204e1640cbe27cd3d48c53a7bd02ae44df666ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Gs=C3=A4nger?= Date: Fri, 3 Mar 2017 10:16:45 +0100 Subject: [PATCH 219/338] ENH: Added multicolumn/multirow support for latex (#14184) closes #13508 Print names of MultiIndex columns. Added "multicolumn" and "multirow" flags to to_latex which trigger the corresponding feature. "multicolumn_format" is used to select alignment. Multirow adds clines to visually separate sections. --- doc/source/options.rst | 295 +++++++++++++------------- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/config_init.py | 31 ++- pandas/core/frame.py | 46 +++- pandas/formats/format.py | 115 +++++++++- pandas/tests/formats/test_to_latex.py | 115 +++++++++- 6 files changed, 433 insertions(+), 170 deletions(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index 77cac6d495d13..10a13ed36df8d 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -273,151 +273,156 @@ Options are 'right', and 'left'. 
Available Options ----------------- -========================== ============ ================================== -Option Default Function -========================== ============ ================================== -display.chop_threshold None If set to a float value, all float - values smaller then the given - threshold will be displayed as - exactly 0 by repr and friends. -display.colheader_justify right Controls the justification of - column headers. used by DataFrameFormatter. -display.column_space 12 No description available. -display.date_dayfirst False When True, prints and parses dates - with the day first, eg 20/01/2005 -display.date_yearfirst False When True, prints and parses dates - with the year first, eg 2005/01/20 -display.encoding UTF-8 Defaults to the detected encoding - of the console. Specifies the encoding - to be used for strings returned by - to_string, these are generally strings - meant to be displayed on the console. -display.expand_frame_repr True Whether to print out the full DataFrame - repr for wide DataFrames across - multiple lines, `max_columns` is - still respected, but the output will - wrap-around across multiple "pages" - if its width exceeds `display.width`. -display.float_format None The callable should accept a floating - point number and return a string with - the desired format of the number. - This is used in some places like - SeriesFormatter. - See core.format.EngFormatter for an example. -display.height 60 Deprecated. Use `display.max_rows` instead. -display.large_repr truncate For DataFrames exceeding max_rows/max_cols, - the repr (and HTML repr) can show - a truncated table (the default from 0.13), - or switch to the view from df.info() - (the behaviour in earlier versions of pandas). - allowable settings, ['truncate', 'info'] -display.latex.repr False Whether to produce a latex DataFrame - representation for jupyter frontends - that support it. -display.latex.escape True Escapes special caracters in Dataframes, when - using the to_latex method. -display.latex.longtable False Specifies if the to_latex method of a Dataframe - uses the longtable format. -display.line_width 80 Deprecated. Use `display.width` instead. -display.max_columns 20 max_rows and max_columns are used - in __repr__() methods to decide if - to_string() or info() is used to - render an object to a string. In - case python/IPython is running in - a terminal this can be set to 0 and - pandas will correctly auto-detect - the width the terminal and swap to - a smaller format in case all columns - would not fit vertically. The IPython - notebook, IPython qtconsole, or IDLE - do not run in a terminal and hence - it is not possible to do correct - auto-detection. 'None' value means - unlimited. -display.max_colwidth 50 The maximum width in characters of - a column in the repr of a pandas - data structure. When the column overflows, - a "..." placeholder is embedded in - the output. -display.max_info_columns 100 max_info_columns is used in DataFrame.info - method to decide if per column information - will be printed. -display.max_info_rows 1690785 df.info() will usually show null-counts - for each column. For large frames - this can be quite slow. max_info_rows - and max_info_cols limit this null - check only to frames with smaller - dimensions then specified. -display.max_rows 60 This sets the maximum number of rows - pandas should output when printing - out various output. 
For example, - this value determines whether the - repr() for a dataframe prints out - fully or just a summary repr. - 'None' value means unlimited. -display.max_seq_items 100 when pretty-printing a long sequence, - no more then `max_seq_items` will - be printed. If items are omitted, - they will be denoted by the addition - of "..." to the resulting string. - If set to None, the number of items - to be printed is unlimited. -display.memory_usage True This specifies if the memory usage of - a DataFrame should be displayed when the - df.info() method is invoked. -display.multi_sparse True "Sparsify" MultiIndex display (don't - display repeated elements in outer - levels within groups) -display.notebook_repr_html True When True, IPython notebook will - use html representation for - pandas objects (if it is available). -display.pprint_nest_depth 3 Controls the number of nested levels - to process when pretty-printing -display.precision 6 Floating point output precision in - terms of number of places after the - decimal, for regular formatting as well - as scientific notation. Similar to - numpy's ``precision`` print option -display.show_dimensions truncate Whether to print out dimensions - at the end of DataFrame repr. - If 'truncate' is specified, only - print out the dimensions if the - frame is truncated (e.g. not display - all rows and/or columns) -display.width 80 Width of the display in characters. - In case python/IPython is running in - a terminal this can be set to None - and pandas will correctly auto-detect - the width. Note that the IPython notebook, - IPython qtconsole, or IDLE do not run in a - terminal and hence it is not possible - to correctly detect the width. -html.border 1 A ``border=value`` attribute is - inserted in the ```` tag - for the DataFrame HTML repr. -io.excel.xls.writer xlwt The default Excel writer engine for - 'xls' files. -io.excel.xlsm.writer openpyxl The default Excel writer engine for - 'xlsm' files. Available options: - 'openpyxl' (the default). -io.excel.xlsx.writer openpyxl The default Excel writer engine for - 'xlsx' files. -io.hdf.default_format None default format writing format, if - None, then put will default to - 'fixed' and append will default to - 'table' -io.hdf.dropna_table True drop ALL nan rows when appending - to a table -mode.chained_assignment warn Raise an exception, warn, or no - action if trying to use chained - assignment, The default is warn -mode.sim_interactive False Whether to simulate interactive mode - for purposes of testing -mode.use_inf_as_null False True means treat None, NaN, -INF, - INF as null (old way), False means - None and NaN are null, but INF, -INF - are not null (new way). -========================== ============ ================================== +=================================== ============ ================================== +Option Default Function +=================================== ============ ================================== +display.chop_threshold None If set to a float value, all float + values smaller then the given + threshold will be displayed as + exactly 0 by repr and friends. +display.colheader_justify right Controls the justification of + column headers. used by DataFrameFormatter. +display.column_space 12 No description available. 
+display.date_dayfirst False When True, prints and parses dates + with the day first, eg 20/01/2005 +display.date_yearfirst False When True, prints and parses dates + with the year first, eg 2005/01/20 +display.encoding UTF-8 Defaults to the detected encoding + of the console. Specifies the encoding + to be used for strings returned by + to_string, these are generally strings + meant to be displayed on the console. +display.expand_frame_repr True Whether to print out the full DataFrame + repr for wide DataFrames across + multiple lines, `max_columns` is + still respected, but the output will + wrap-around across multiple "pages" + if its width exceeds `display.width`. +display.float_format None The callable should accept a floating + point number and return a string with + the desired format of the number. + This is used in some places like + SeriesFormatter. + See core.format.EngFormatter for an example. +display.height 60 Deprecated. Use `display.max_rows` instead. +display.large_repr truncate For DataFrames exceeding max_rows/max_cols, + the repr (and HTML repr) can show + a truncated table (the default from 0.13), + or switch to the view from df.info() + (the behaviour in earlier versions of pandas). + allowable settings, ['truncate', 'info'] +display.latex.repr False Whether to produce a latex DataFrame + representation for jupyter frontends + that support it. +display.latex.escape True Escapes special caracters in Dataframes, when + using the to_latex method. +display.latex.longtable False Specifies if the to_latex method of a Dataframe + uses the longtable format. +display.latex.multicolumn True Combines columns when using a MultiIndex +display.latex.multicolumn_format 'l' Alignment of multicolumn labels +display.latex.multirow False Combines rows when using a MultiIndex. + Centered instead of top-aligned, + separated by clines. +display.line_width 80 Deprecated. Use `display.width` instead. +display.max_columns 20 max_rows and max_columns are used + in __repr__() methods to decide if + to_string() or info() is used to + render an object to a string. In + case python/IPython is running in + a terminal this can be set to 0 and + pandas will correctly auto-detect + the width the terminal and swap to + a smaller format in case all columns + would not fit vertically. The IPython + notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence + it is not possible to do correct + auto-detection. 'None' value means + unlimited. +display.max_colwidth 50 The maximum width in characters of + a column in the repr of a pandas + data structure. When the column overflows, + a "..." placeholder is embedded in + the output. +display.max_info_columns 100 max_info_columns is used in DataFrame.info + method to decide if per column information + will be printed. +display.max_info_rows 1690785 df.info() will usually show null-counts + for each column. For large frames + this can be quite slow. max_info_rows + and max_info_cols limit this null + check only to frames with smaller + dimensions then specified. +display.max_rows 60 This sets the maximum number of rows + pandas should output when printing + out various output. For example, + this value determines whether the + repr() for a dataframe prints out + fully or just a summary repr. + 'None' value means unlimited. +display.max_seq_items 100 when pretty-printing a long sequence, + no more then `max_seq_items` will + be printed. If items are omitted, + they will be denoted by the addition + of "..." to the resulting string. 
+ If set to None, the number of items + to be printed is unlimited. +display.memory_usage True This specifies if the memory usage of + a DataFrame should be displayed when the + df.info() method is invoked. +display.multi_sparse True "Sparsify" MultiIndex display (don't + display repeated elements in outer + levels within groups) +display.notebook_repr_html True When True, IPython notebook will + use html representation for + pandas objects (if it is available). +display.pprint_nest_depth 3 Controls the number of nested levels + to process when pretty-printing +display.precision 6 Floating point output precision in + terms of number of places after the + decimal, for regular formatting as well + as scientific notation. Similar to + numpy's ``precision`` print option +display.show_dimensions truncate Whether to print out dimensions + at the end of DataFrame repr. + If 'truncate' is specified, only + print out the dimensions if the + frame is truncated (e.g. not display + all rows and/or columns) +display.width 80 Width of the display in characters. + In case python/IPython is running in + a terminal this can be set to None + and pandas will correctly auto-detect + the width. Note that the IPython notebook, + IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible + to correctly detect the width. +html.border 1 A ``border=value`` attribute is + inserted in the ``
`` tag + for the DataFrame HTML repr. +io.excel.xls.writer xlwt The default Excel writer engine for + 'xls' files. +io.excel.xlsm.writer openpyxl The default Excel writer engine for + 'xlsm' files. Available options: + 'openpyxl' (the default). +io.excel.xlsx.writer openpyxl The default Excel writer engine for + 'xlsx' files. +io.hdf.default_format None default format writing format, if + None, then put will default to + 'fixed' and append will default to + 'table' +io.hdf.dropna_table True drop ALL nan rows when appending + to a table +mode.chained_assignment warn Raise an exception, warn, or no + action if trying to use chained + assignment, The default is warn +mode.sim_interactive False Whether to simulate interactive mode + for purposes of testing +mode.use_inf_as_null False True means treat None, NaN, -INF, + INF as null (old way), False means + None and NaN are null, but INF, -INF + are not null (new way). +=================================== ============ ================================== .. _basics.console_output: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index dca4f890e496b..0991f3873b06f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -182,6 +182,7 @@ Other enhancements - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs ` (:issue:`15136`) - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) +- The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index d3db633f3aa04..89616890e1de1 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -239,14 +239,35 @@ : bool This specifies if the to_latex method of a Dataframe uses escapes special characters. - method. Valid values: False,True + Valid values: False,True """ pc_latex_longtable = """ :bool This specifies if the to_latex method of a Dataframe uses the longtable format. - method. Valid values: False,True + Valid values: False,True +""" + +pc_latex_multicolumn = """ +: bool + This specifies if the to_latex method of a Dataframe uses multicolumns + to pretty-print MultiIndex columns. + Valid values: False,True +""" + +pc_latex_multicolumn_format = """ +: string + This specifies the format for multicolumn headers. + Can be surrounded with '|'. + Valid values: 'l', 'c', 'r', 'p{}' +""" + +pc_latex_multirow = """ +: bool + This specifies if the to_latex method of a Dataframe uses multirows + to pretty-print MultiIndex rows. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 26a0a91094e7d..b3e43edc3eb55 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1614,10 +1614,11 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, bold_rows=True,
                  column_format=None, longtable=None, escape=None,
-                 encoding=None, decimal='.'):
-        """
+                 encoding=None, decimal='.', multicolumn=None,
+                 multicolumn_format=None, multirow=None):
+        r"""
         Render a DataFrame to a tabular environment table. You can splice
-        this into a LaTeX document. Requires \\usepackage{booktabs}.
+        this into a LaTeX document. Requires \usepackage{booktabs}.

         `to_latex`-specific options:

@@ -1628,27 +1629,54 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
             `__ e.g. 'rcl' for 3 columns
         longtable : boolean, default will be read from the pandas config module
-            default: False
+            Default: False.
             Use a longtable environment instead of tabular. Requires adding
-            a \\usepackage{longtable} to your LaTeX preamble.
+            a \usepackage{longtable} to your LaTeX preamble.
         escape : boolean, default will be read from the pandas config module
-            default: True
+            Default: True.
             When set to False, prevents escaping of latex special characters
             in column names.
         encoding : str, default None
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
         decimal : string, default '.'
-            Character recognized as decimal separator, e.g. ',' in Europe
+            Character recognized as decimal separator, e.g. ',' in Europe.

             .. versionadded:: 0.18.0

+        multicolumn : boolean, default True
+            Use \multicolumn to enhance MultiIndex columns.
+            The default will be read from the config module.
+
+            .. versionadded:: 0.20.0
+
+        multicolumn_format : str, default 'l'
+            The alignment for multicolumns, similar to `column_format`.
+            The default will be read from the config module.
+
+            .. versionadded:: 0.20.0
+
+        multirow : boolean, default False
+            Use \multirow to enhance MultiIndex rows.
+            Requires adding a \usepackage{multirow} to your LaTeX preamble.
+            Will print centered labels (instead of top-aligned)
+            across the contained rows, separating groups via clines.
+            The default will be read from the pandas config module.
+
+            .. 
versionadded:: 0.20.0 + """ # Get defaults from the pandas config if longtable is None: longtable = get_option("display.latex.longtable") if escape is None: escape = get_option("display.latex.escape") + if multicolumn is None: + multicolumn = get_option("display.latex.multicolumn") + if multicolumn_format is None: + multicolumn_format = get_option("display.latex.multicolumn_format") + if multirow is None: + multirow = get_option("display.latex.multirow") formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, col_space=col_space, na_rep=na_rep, @@ -1660,7 +1688,9 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, index_names=index_names, escape=escape, decimal=decimal) formatter.to_latex(column_format=column_format, longtable=longtable, - encoding=encoding) + encoding=encoding, multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow) if buf is None: return formatter.buf.getvalue() diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 4c081770e0125..9dde3b0001c31 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -650,13 +650,17 @@ def _join_multiline(self, *strcols): st = ed return '\n\n'.join(str_lst) - def to_latex(self, column_format=None, longtable=False, encoding=None): + def to_latex(self, column_format=None, longtable=False, encoding=None, + multicolumn=False, multicolumn_format=None, multirow=False): """ Render a DataFrame to a LaTeX tabular/longtable environment output. """ latex_renderer = LatexFormatter(self, column_format=column_format, - longtable=longtable) + longtable=longtable, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow) if encoding is None: encoding = 'ascii' if compat.PY2 else 'utf-8' @@ -824,11 +828,15 @@ class LatexFormatter(TableFormatter): HTMLFormatter """ - def __init__(self, formatter, column_format=None, longtable=False): + def __init__(self, formatter, column_format=None, longtable=False, + multicolumn=False, multicolumn_format=None, multirow=False): self.fmt = formatter self.frame = self.fmt.frame self.column_format = column_format self.longtable = longtable + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow def write_result(self, buf): """ @@ -850,14 +858,21 @@ def get_col_type(dtype): else: return 'l' + # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, MultiIndex): clevels = self.frame.columns.nlevels strcols.pop(0) name = any(self.frame.index.names) + cname = any(self.frame.columns.names) + lastcol = self.frame.index.nlevels - 1 for i, lev in enumerate(self.frame.index.levels): lev2 = lev.format() blank = ' ' * len(lev2[0]) - lev3 = [blank] * clevels + # display column names in last index-column + if cname and i == lastcol: + lev3 = [x if x else '{}' for x in self.frame.columns.names] + else: + lev3 = [blank] * clevels if name: lev3.append(lev.name) for level_idx, group in itertools.groupby( @@ -885,10 +900,15 @@ def get_col_type(dtype): buf.write('\\begin{longtable}{%s}\n' % column_format) buf.write('\\toprule\n') - nlevels = self.frame.columns.nlevels + ilevels = self.frame.index.nlevels + clevels = self.frame.columns.nlevels + nlevels = clevels if any(self.frame.index.names): nlevels += 1 - for i, row in enumerate(zip(*strcols)): + strrows = list(zip(*strcols)) + self.clinebuf = [] + + for i, row in enumerate(strrows): if i == nlevels and self.fmt.header: buf.write('\\midrule\n') # End of 
header
                 if self.longtable:
@@ -910,8 +930,17 @@ def get_col_type(dtype):
                         if x else '{}') for x in row]
             else:
                 crow = [x if x else '{}' for x in row]
+            if i < clevels and self.fmt.header and self.multicolumn:
+                # sum up columns to multicolumns
+                crow = self._format_multicolumn(crow, ilevels)
+            if (i >= nlevels and self.fmt.index and self.multirow and
+                    ilevels > 1):
+                # sum up rows to multirows
+                crow = self._format_multirow(crow, ilevels, i, strrows)
             buf.write(' & '.join(crow))
             buf.write(' \\\\\n')
+            if self.multirow and i < len(strrows) - 1:
+                self._print_cline(buf, i, len(strcols))

         if not self.longtable:
             buf.write('\\bottomrule\n')
@@ -919,6 +948,80 @@
         else:
             buf.write('\\end{longtable}\n')

+    def _format_multicolumn(self, row, ilevels):
+        r"""
+        Combine columns belonging to a group to a single multicolumn entry
+        according to self.multicolumn_format
+
+        e.g.:
+        a &  &  & b & c &
+        will become
+        \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
+        """
+        row2 = list(row[:ilevels])
+        ncol = 1
+        coltext = ''
+
+        def append_col():
+            # write multicolumn if needed
+            if ncol > 1:
+                row2.append('\\multicolumn{{{0:d}}}{{{1:s}}}{{{2:s}}}'
+                            .format(ncol, self.multicolumn_format,
+                                    coltext.strip()))
+            # don't modify where not needed
+            else:
+                row2.append(coltext)
+        for c in row[ilevels:]:
+            # if next col has text, write the previous
+            if c.strip():
+                if coltext:
+                    append_col()
+                coltext = c
+                ncol = 1
+            # if not, add it to the previous multicolumn
+            else:
+                ncol += 1
+        # write last column name
+        if coltext:
+            append_col()
+        return row2
+
+    def _format_multirow(self, row, ilevels, i, rows):
+        r"""
+        Check following rows, whether row should be a multirow
+
+        e.g.:       becomes:
+        a & 0 &     \multirow{2}{*}{a} & 0 &
+          & 1 &       & 1 &
+        b & 0 &     \cline{1-2}
+                    b & 0 &
+        """
+        for j in range(ilevels):
+            if row[j].strip():
+                nrow = 1
+                for r in rows[i + 1:]:
+                    if not r[j].strip():
+                        nrow += 1
+                    else:
+                        break
+                if nrow > 1:
+                    # overwrite non-multirow entry
+                    row[j] = '\\multirow{{{0:d}}}{{*}}{{{1:s}}}'.format(
+                        nrow, row[j].strip())
+                    # save when to end the current block with \cline
+                    self.clinebuf.append([i + nrow - 1, j + 1])
+        return row
+
+    def _print_cline(self, buf, i, icol):
+        """
+        Print clines after multirow-blocks are finished
+        """
+        for cl in self.clinebuf:
+            if cl[0] == i:
+                buf.write('\\cline{{{0:d}-{1:d}}}\n'.format(cl[1], icol))
+        # remove entries that have been written to buffer
+        self.clinebuf = [x for x in self.clinebuf if x[0] != i]
+

 class HTMLFormatter(TableFormatter):

diff --git a/pandas/tests/formats/test_to_latex.py b/pandas/tests/formats/test_to_latex.py
index 89e18e1cec06e..17e1e18f03dd6 100644
--- a/pandas/tests/formats/test_to_latex.py
+++ b/pandas/tests/formats/test_to_latex.py
@@ -168,6 +168,24 @@ def test_to_latex_multiindex(self):

         assert result == expected

+        # GH 14184
+        df = df.T
+        df.columns.names = ['a', 'b']
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
+b & 0 & 1 & 0 & 1 & 0 \\
+\midrule
+0 & 0 & 4 & 0 & 4 & 0 \\
+1 & 1 & 5 & 1 & 5 & 1 \\
+2 & 2 & 6 & 2 & 6 & 2 \\
+3 & 3 & 7 & 3 & 7 & 3 \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
         # GH 10660
         df = pd.DataFrame({'a': [0, 0, 1, 1],
                            'b': list('abab'),
@@ -189,16 +207,95 @@ def test_to_latex_multiindex(self):
         assert result == expected

         result = df.groupby('a').describe().to_latex()
-        expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & '
-                    ' & & & & & & '
-                    '\\\\\n{} & count & mean & std & min & 25\\% & 
' - '50\\% & 75\\% & max \\\\\na & & & ' - ' & & & & & \\\\\n\\midrule\n0 ' - '& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 ' - '& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 ' - '& 3.5 & 3.75 & 4.0 ' - '\\\\\n\\bottomrule\n\\end{tabular}\n') + expected = r"""\begin{tabular}{lrrrrrrrr} +\toprule +{} & \multicolumn{8}{l}{c} \\ +{} & count & mean & std & min & 25\% & 50\% & 75\% & max \\ +a & & & & & & & & \\ +\midrule +0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ +1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + def test_to_latex_multicolumnrow(self): + df = pd.DataFrame({ + ('c1', 0): dict((x, x) for x in range(5)), + ('c1', 1): dict((x, x + 5) for x in range(5)), + ('c2', 0): dict((x, x) for x in range(5)), + ('c2', 1): dict((x, x + 5) for x in range(5)), + ('c3', 0): dict((x, x) for x in range(5)) + }) + result = df.to_latex() + expected = r"""\begin{tabular}{lrrrrr} +\toprule +{} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ +{} & 0 & 1 & 0 & 1 & 0 \\ +\midrule +0 & 0 & 5 & 0 & 5 & 0 \\ +1 & 1 & 6 & 1 & 6 & 1 \\ +2 & 2 & 7 & 2 & 7 & 2 \\ +3 & 3 & 8 & 3 & 8 & 3 \\ +4 & 4 & 9 & 4 & 9 & 4 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + result = df.to_latex(multicolumn=False) + expected = r"""\begin{tabular}{lrrrrr} +\toprule +{} & c1 & & c2 & & c3 \\ +{} & 0 & 1 & 0 & 1 & 0 \\ +\midrule +0 & 0 & 5 & 0 & 5 & 0 \\ +1 & 1 & 6 & 1 & 6 & 1 \\ +2 & 2 & 7 & 2 & 7 & 2 \\ +3 & 3 & 8 & 3 & 8 & 3 \\ +4 & 4 & 9 & 4 & 9 & 4 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + result = df.T.to_latex(multirow=True) + expected = r"""\begin{tabular}{llrrrrr} +\toprule + & & 0 & 1 & 2 & 3 & 4 \\ +\midrule +\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + df.index = df.T.index + result = df.T.to_latex(multirow=True, multicolumn=True, + multicolumn_format='c') + expected = r"""\begin{tabular}{llrrrrr} +\toprule + & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ + & & 0 & 1 & 0 & 1 & 0 \\ +\midrule +\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ +\bottomrule +\end{tabular} +""" assert result == expected def test_to_latex_escape(self): From e3ba132c2d9574a2bea63f839f22751263c45e7b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 3 Mar 2017 05:12:21 -0500 Subject: [PATCH 220/338] DEPR: deprecate some top-level non-used functions (#15538) closes #13790 pd.pnow pd.groupby pd.match pd.Term pd.Expr remove info.py --- doc/source/comparison_with_r.rst | 8 ----- doc/source/whatsnew/v0.20.0.txt | 6 ++++ pandas/__init__.py | 43 +++++++++++++++++++++++++- pandas/computation/api.py | 12 +++++++- pandas/core/api.py | 24 +++++++++++++-- pandas/info.py | 20 ------------ pandas/io/api.py | 14 ++++++++- pandas/tests/api/test_api.py | 49 ++++++++++++++++++++++-------- pandas/tests/scalar/test_period.py | 14 +++------ pandas/tseries/period.py | 8 ++++- 10 files changed, 141 insertions(+), 57 deletions(-) delete mode 100644 pandas/info.py diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index aa0cbab4df10b..194e022e34c7c 100644 --- 
a/doc/source/comparison_with_r.rst
+++ b/doc/source/comparison_with_r.rst
@@ -206,14 +206,6 @@ of its first argument in its second:
    s <- 0:4
    match(s, c(2,4))

-The :meth:`~pandas.core.groupby.GroupBy.apply` method can be used to replicate
-this:
-
-.. ipython:: python
-
-   s = pd.Series(np.arange(5),dtype=np.float32)
-   pd.Series(pd.match(s,[2,4],np.nan))
-
 For more details and examples see :ref:`the reshaping documentation
 <reshaping>`.

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 0991f3873b06f..fa5974ee84d34 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -537,6 +537,12 @@ Deprecations
 - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explicit imports (:issue:`15358`)
 - ``Series/DataFrame/Panel.consolidate()`` has been deprecated as a public method. (:issue:`15483`)
 - ``FrozenList`` addition (new object and inplace) has been deprecated in favor of the ``.union()`` method. (:issue:`15475`)
+- The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`)
+  * ``pd.pnow()``, replaced by ``Period.now()``
+  * ``pd.Term`` is removed, as it is not applicable to user code. Instead use in-line string expressions in the where clause when searching in HDFStore
+  * ``pd.Expr`` is removed, as it is not applicable to user code.
+  * ``pd.match()`` is removed.
+  * ``pd.groupby()``, replaced by using the ``.groupby()`` method directly on a ``Series/DataFrame``

 .. _whatsnew_0200.prior_deprecations:
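The migration path for callers is mechanical; a hedged sketch (illustrative, not part of the patch)::

    import pandas as pd

    s = pd.Series([1, 2, 3])

    # deprecated: pd.groupby(s, [1, 1, 1]).sum()
    s.groupby([1, 1, 1]).sum()     # preferred method form

    # deprecated: pd.pnow('D')
    pd.Period.now(freq='D')        # preferred replacement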
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 70c547010f623..3bded89e6644a 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -33,7 +33,6 @@
                          "the C extensions first.".format(module))

 from datetime import datetime
-from pandas.info import __doc__

 # let init-time option registration happen
 import pandas.core.config_init
@@ -63,3 +62,45 @@
 v = get_versions()
 __version__ = v.get('closest-tag', v['version'])
 del get_versions, v
+
+# module level doc-string
+__doc__ = """
+pandas - a powerful data analysis and manipulation library for Python
+=====================================================================
+
+**pandas** is a Python package providing fast, flexible, and expressive data
+structures designed to make working with "relational" or "labeled" data both
+easy and intuitive. It aims to be the fundamental high-level building block for
+doing practical, **real world** data analysis in Python. Additionally, it has
+the broader goal of becoming **the most powerful and flexible open source data
+analysis / manipulation tool available in any language**. It is already well on
+its way toward this goal.
+
+Main Features
+-------------
+Here are just a few of the things that pandas does well:
+
+  - Easy handling of missing data in floating point as well as non-floating
+    point data
+  - Size mutability: columns can be inserted and deleted from DataFrame and
+    higher dimensional objects
+  - Automatic and explicit data alignment: objects can be explicitly aligned
+    to a set of labels, or the user can simply ignore the labels and let
+    `Series`, `DataFrame`, etc. automatically align the data for you in
+    computations
+  - Powerful, flexible group by functionality to perform split-apply-combine
+    operations on data sets, for both aggregating and transforming data
+  - Make it easy to convert ragged, differently-indexed data in other Python
+    and NumPy data structures into DataFrame objects
+  - Intelligent label-based slicing, fancy indexing, and subsetting of large
+    data sets
+  - Intuitive merging and joining data sets
+  - Flexible reshaping and pivoting of data sets
+  - Hierarchical labeling of axes (possible to have multiple labels per tick)
+  - Robust IO tools for loading data from flat files (CSV and delimited),
+    Excel files, databases, and saving/loading data from the ultrafast HDF5
+    format
+  - Time series-specific functionality: date range generation and frequency
+    conversion, moving window statistics, moving window linear regressions,
+    date shifting and lagging, etc.
+"""
diff --git a/pandas/computation/api.py b/pandas/computation/api.py
index e5814e08c4bbe..fe3dad015048e 100644
--- a/pandas/computation/api.py
+++ b/pandas/computation/api.py
@@ -1,4 +1,14 @@
 # flake8: noqa

 from pandas.computation.eval import eval
-from pandas.computation.expr import Expr
+
+
+# deprecation, xref #13790
+def Expr(*args, **kwargs):
+    import warnings
+
+    warnings.warn("pd.Expr is deprecated as it is not "
+                  "applicable to user code",
+                  FutureWarning, stacklevel=2)
+    from pandas.computation.expr import Expr
+    return Expr(*args, **kwargs)
diff --git a/pandas/core/api.py b/pandas/core/api.py
index eaebf45a038a0..65253dedb8b53 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -4,7 +4,7 @@

 import numpy as np

-from pandas.core.algorithms import factorize, match, unique, value_counts
+from pandas.core.algorithms import factorize, unique, value_counts
 from pandas.types.missing import isnull, notnull
 from pandas.core.categorical import Categorical
 from pandas.core.groupby import Grouper
@@ -17,7 +17,6 @@
 from pandas.core.frame import DataFrame
 from pandas.core.panel import Panel, WidePanel
 from pandas.core.panel4d import Panel4D
-from pandas.core.groupby import groupby
 from pandas.core.reshape import (pivot_simple as pivot, get_dummies,
                                  lreshape, wide_to_long)
@@ -42,3 +41,24 @@
 from pandas.core.config import (get_option, set_option, reset_option,
                                 describe_option, option_context, options)
+
+
+# deprecation, xref #13790
+def match(*args, **kwargs):
+    import warnings
+
+    warnings.warn("pd.match() is deprecated and will be removed "
+                  "in a future version",
+                  FutureWarning, stacklevel=2)
+    from pandas.core.algorithms import match
+    return match(*args, **kwargs)
+
+
+def groupby(*args, **kwargs):
+    import warnings
+
+    warnings.warn("pd.groupby() is deprecated and will be removed; "
+                  "please use the Series.groupby() or "
+                  "DataFrame.groupby() methods",
+                  FutureWarning, stacklevel=2)
+    return args[0].groupby(*args[1:], **kwargs)
diff --git a/pandas/info.py b/pandas/info.py
deleted file mode 100644
index 57ecd91739eab..0000000000000
--- a/pandas/info.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-pandas - a powerful data analysis and manipulation library for Python
-=====================================================================
-
-See http://pandas.pydata.org/ for full documentation. 
Otherwise, see the -docstrings of the various objects in the pandas namespace: - -Series -DataFrame -Panel -Index -DatetimeIndex -HDFStore -bdate_range -date_range -read_csv -read_fwf -read_table -ols -""" diff --git a/pandas/io/api.py b/pandas/io/api.py index 0bd86c85b4b8b..1284b3cb222d6 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -7,7 +7,7 @@ from pandas.io.parsers import read_csv, read_table, read_fwf from pandas.io.clipboard import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel -from pandas.io.pytables import HDFStore, Term, get_store, read_hdf +from pandas.io.pytables import HDFStore, get_store, read_hdf from pandas.io.json import read_json from pandas.io.html import read_html from pandas.io.sql import read_sql, read_sql_table, read_sql_query @@ -17,3 +17,15 @@ from pandas.io.pickle import read_pickle, to_pickle from pandas.io.packers import read_msgpack, to_msgpack from pandas.io.gbq import read_gbq + +# deprecation, xref #13790 +def Term(*args, **kwargs): + import warnings + + warnings.warn("pd.Term is deprecated as it is not " + "applicable to user code. Instead use in-line " + "string expressions in the where clause when " + "searching in HDFStore", + FutureWarning, stacklevel=2) + from pandas.io.pytables import Term + return Term(*args, **kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 8ca369f8df83a..f2f7a9c778e66 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -59,13 +59,10 @@ class TestPDApi(Base, tm.TestCase): # these are already deprecated; awaiting removal deprecated_classes = ['WidePanel', 'SparseTimeSeries', 'Panel4D', - 'SparseList'] + 'SparseList', 'Expr', 'Term'] # these should be deprecated in the future - deprecated_classes_in_future = ['Term', 'Panel'] - - # these should be removed from top-level namespace - remove_classes_from_top_level_namespace = ['Expr'] + deprecated_classes_in_future = ['Panel'] # external modules exposed in pandas namespace modules = ['np', 'datetime'] @@ -75,7 +72,7 @@ class TestPDApi(Base, tm.TestCase): 'date_range', 'eval', 'factorize', 'get_dummies', 'get_store', 'infer_freq', 'isnull', 'lreshape', - 'match', 'melt', 'notnull', 'offsets', + 'melt', 'notnull', 'offsets', 'merge', 'merge_ordered', 'merge_asof', 'period_range', 'pivot', 'pivot_table', 'plot_params', 'qcut', @@ -99,9 +96,6 @@ class TestPDApi(Base, tm.TestCase): funcs_to = ['to_datetime', 'to_msgpack', 'to_numeric', 'to_pickle', 'to_timedelta'] - # these should be deprecated in the future - deprecated_funcs_in_future = ['pnow', 'groupby', 'info'] - # these are already deprecated; awaiting removal deprecated_funcs = ['ewma', 'ewmcorr', 'ewmcov', 'ewmstd', 'ewmvar', 'ewmvol', 'expanding_apply', 'expanding_corr', @@ -114,7 +108,8 @@ class TestPDApi(Base, tm.TestCase): 'rolling_kurt', 'rolling_max', 'rolling_mean', 'rolling_median', 'rolling_min', 'rolling_quantile', 'rolling_skew', 'rolling_std', 'rolling_sum', - 'rolling_var', 'rolling_window', 'ordered_merge'] + 'rolling_var', 'rolling_window', 'ordered_merge', + 'pnow', 'match', 'groupby'] def test_api(self): @@ -123,11 +118,9 @@ def test_api(self): self.modules + self.deprecated_modules + self.classes + self.deprecated_classes + self.deprecated_classes_in_future + - self.remove_classes_from_top_level_namespace + self.funcs + self.funcs_option + self.funcs_read + self.funcs_to + - self.deprecated_funcs + - self.deprecated_funcs_in_future, + self.deprecated_funcs, self.ignored) @@ -225,3 +218,33 @@ def 
test_deprecation_access_obj(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             pd.datetools.monthEnd
+
+
+class TestTopLevelDeprecations(tm.TestCase):
+    # top-level API deprecations
+    # GH 13790
+
+    def test_pnow(self):
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            pd.pnow(freq='M')
+
+    def test_term(self):
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            pd.Term('index>=date')
+
+    def test_expr(self):
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            pd.Expr('2>1')
+
+    def test_match(self):
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            pd.match([1, 2, 3], [1])
+
+    def test_groupby(self):
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            pd.groupby(pd.Series([1, 2, 3]), [1, 1, 1])
diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py
index ffe00a4a62a0a..49aa44492fe81 100644
--- a/pandas/tests/scalar/test_period.py
+++ b/pandas/tests/scalar/test_period.py
@@ -864,17 +864,11 @@ def test_properties_nat(self):
             self.assertTrue(np.isnan(getattr(t_nat, f)))

     def test_pnow(self):
-        dt = datetime.now()
-
-        val = period.pnow('D')
-        exp = Period(dt, freq='D')
-        self.assertEqual(val, exp)
-
-        val2 = period.pnow('2D')
-        exp2 = Period(dt, freq='2D')
-        self.assertEqual(val2, exp2)
-        self.assertEqual(val.ordinal, val2.ordinal)
-        self.assertEqual(val.ordinal, exp2.ordinal)
+        # deprecation, xref #13790
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False):
+            period.pnow('D')

     def test_constructor_corner(self):
         expected = Period('2007-01', freq='2M')
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
index 8a6b0c153bb50..6e499924730b3 100644
--- a/pandas/tseries/period.py
+++ b/pandas/tseries/period.py
@@ -1144,7 +1144,13 @@ def _make_field_arrays(*fields):


 def pnow(freq=None):
-    return Period(datetime.now(), freq=freq)
+    # deprecation, xref #13790
+    import warnings
+
+    warnings.warn("pd.pnow() and pandas.tseries.period.pnow() "
+                  "are deprecated. Please use Period.now()",
+                  FutureWarning, stacklevel=2)
+    return Period.now(freq=freq)


 def period_range(start=None, end=None, periods=None, freq='D', name=None):

From a94ed1ef4b3262f2f21b9a7de5a160dbec30b9b6 Mon Sep 17 00:00:00 2001
From: Sahil Dua
Date: Fri, 3 Mar 2017 08:04:43 -0500
Subject: [PATCH 221/338] BUG: Set frequency for empty Series

closes #14320

Author: Sahil Dua

Closes #14458 from sahildua2305/frequency-series-fix and squashes the following commits:

384e666 [Sahil Dua] BUG: Set frequency for empty Series
---
 doc/source/whatsnew/v0.20.0.txt        |  3 +++
 pandas/tests/series/test_timeseries.py |  8 ++++++++
 pandas/tseries/resample.py             | 18 ++++++++++--------
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index fa5974ee84d34..df259f4a42b86 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -605,6 +605,9 @@ Bug Fixes

 - Bug in the display of ``.info()`` where a qualifier (+) would always
   be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
+
+- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
+
 - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
 - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. 
(:issue:`14956`)
diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py
index 8c22b3f047210..d384460c3d030 100644
--- a/pandas/tests/series/test_timeseries.py
+++ b/pandas/tests/series/test_timeseries.py
@@ -260,6 +260,14 @@ def test_asfreq(self):
                               index=[-1.0, 2.0, 1.0, 0.0]).sort_index()
         assert_series_equal(result, expected)

+    def test_asfreq_datetimeindex_empty_series(self):
+        # GH 14320
+        expected = Series(index=pd.DatetimeIndex(
+            ["2016-09-29 11:00"])).asfreq('H')
+        result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]),
+                        data=[3]).asfreq('H')
+        self.assert_index_equal(expected.index, result.index)
+
     def test_diff(self):
         # Just run the function
         self.ts.diff()
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
index 75e550a065fd2..21d7dc0c177b6 100755
--- a/pandas/tseries/resample.py
+++ b/pandas/tseries/resample.py
@@ -1382,16 +1382,18 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):
         if how is None:
             how = 'E'

-        new_index = obj.index.asfreq(freq, how=how)
         new_obj = obj.copy()
-        new_obj.index = new_index
-        return new_obj
+        new_obj.index = obj.index.asfreq(freq, how=how)
+
+    elif len(obj.index) == 0:
+        new_obj = obj.copy()
+        new_obj.index = obj.index._shallow_copy(freq=to_offset(freq))
+
     else:
-        if len(obj.index) == 0:
-            return obj.copy()
         dti = date_range(obj.index[0], obj.index[-1], freq=freq)
         dti.name = obj.index.name
-        rs = obj.reindex(dti, method=method, fill_value=fill_value)
+        new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
         if normalize:
-            rs.index = rs.index.normalize()
-            return rs
+            new_obj.index = new_obj.index.normalize()
+
+    return new_obj

From c4d206121f1b8d63d1f305d7a84d68f737ac9f54 Mon Sep 17 00:00:00 2001
From: Chris
Date: Fri, 3 Mar 2017 10:04:26 -0500
Subject: [PATCH 222/338] BUG: syntax error in hdf query with ts

closes #15492

Author: Chris

Closes #15544 from chris-b1/hdf-dt-error and squashes the following commits:

8288dca [Chris] lint
7c7100d [Chris] expand test cases
946a48e [Chris] ERR: more strict HDFStore string comparison
213585f [Chris] CLN: remove timetuple type check
cc977f0 [Chris] BUG: syntax error in hdf query with ts
---
 doc/source/whatsnew/v0.20.0.txt  | 31 +++++++++++++++++++++-
 pandas/computation/pytables.py   | 21 ++++++---------
 pandas/tests/io/test_pytables.py | 44 ++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index df259f4a42b86..782ae6082c1cf 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -501,6 +501,35 @@ New Behavior:

     df.groupby('A').agg([np.mean, np.std, np.min, np.max])

+.. _whatsnew_0200.api_breaking.hdfstore_where:
+
+HDFStore where string comparison
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions most types could be compared to a string column in an
+``HDFStore``, usually resulting in an invalid comparison. These comparisons
+will now raise a ``TypeError`` (:issue:`15492`)
+
+New Behavior:
+
+.. code-block:: ipython
+
+   In [15]: df = pd.DataFrame({'unparsed_date': ['2014-01-01', '2014-01-01']})
+
+   In [16]: df.dtypes
+   Out[16]:
+   unparsed_date    object
+   dtype: object
+
+   In [17]: df.to_hdf('store.h5', 'key', format='table', data_columns=True)
+
+   In [18]: ts = pd.Timestamp('2014-01-01')
+
+   In [19]: pd.read_hdf('store.h5', 'key', where='unparsed_date > ts')
+   TypeError: Cannot compare 2014-01-01 00:00:00 of
+              type <class 'pandas.tslib.Timestamp'> to string column
+
+
 .. 
_whatsnew_0200.api: Other API Changes @@ -671,7 +700,7 @@ Bug Fixes - Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`) - +- Bug in ``pd.read_hdf()`` passing a ``Timestamp`` to the ``where`` parameter with a non date column (:issue:`15492`) - Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 9dc18284ec22c..7c09ca8d38773 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -1,7 +1,6 @@ """ manage PyTables query interface via Expressions """ import ast -import time import warnings from functools import partial from datetime import datetime, timedelta @@ -188,10 +187,6 @@ def stringify(value): if v.tz is not None: v = v.tz_convert('UTC') return TermValue(v, v.value, kind) - elif (isinstance(v, datetime) or hasattr(v, 'timetuple') or - kind == u('date')): - v = time.mktime(v.timetuple()) - return TermValue(v, pd.Timestamp(v), kind) elif kind == u('timedelta64') or kind == u('timedelta'): v = _coerce_scalar_to_timedelta_type(v, unit='s').value return TermValue(int(v), v, kind) @@ -218,12 +213,13 @@ def stringify(value): else: v = bool(v) return TermValue(v, v, kind) - elif not isinstance(v, string_types): - v = stringify(v) + elif isinstance(v, string_types): + # string quoting return TermValue(v, stringify(v), u('string')) - - # string quoting - return TermValue(v, stringify(v), u('string')) + else: + raise TypeError(("Cannot compare {v} of type {typ}" + " to {kind} column").format(v=v, typ=type(v), + kind=kind)) def convert_values(self): pass @@ -558,9 +554,8 @@ def parse_back_compat(self, w, op=None, value=None): # stringify with quotes these values def convert(v): - if (isinstance(v, (datetime, np.datetime64, - timedelta, np.timedelta64)) or - hasattr(v, 'timetuple')): + if isinstance(v, (datetime, np.datetime64, + timedelta, np.timedelta64)): return "'{0}'".format(v) return v diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 821d9956a2dfa..9f1dea2094bc6 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -5071,6 +5071,50 @@ def test_query_long_float_literal(self): expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) + def test_query_compare_column_type(self): + # GH 15492 + df = pd.DataFrame({'date': ['2014-01-01', '2014-01-02'], + 'real_date': date_range('2014-01-01', periods=2), + 'float': [1.1, 1.2], + 'int': [1, 2]}, + columns=['date', 'real_date', 'float', 'int']) + + with ensure_clean_store(self.path) as store: + store.append('test', df, format='table', data_columns=True) + + ts = pd.Timestamp('2014-01-01') # noqa + result = store.select('test', where='real_date > ts') + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + for op in ['<', '>', '==']: + # non strings to string column always fail + for v in [2.1, True, pd.Timestamp('2014-01-01'), + pd.Timedelta(1, 's')]: + query = 'date {op} v'.format(op=op) + with tm.assertRaises(TypeError): + result = store.select('test', where=query) + + # strings to other columns must be convertible to type + v = 'a' + for col in ['int', 'float', 'real_date']: + query = '{col} {op} v'.format(op=op, col=col) + with tm.assertRaises(ValueError): + result = store.select('test', where=query) + + for v, col in zip(['1', '1.1', '2014-01-01'], + ['int', 'float', 'real_date']): + query = '{col} 
{op} v'.format(op=op, col=col)
+                result = store.select('test', where=query)
+
+                if op == '==':
+                    expected = df.loc[[0], :]
+                elif op == '>':
+                    expected = df.loc[[1], :]
+                else:
+                    expected = df.loc[[], :]
+                tm.assert_frame_equal(expected, result)
+

 class TestHDFComplexValues(Base):
     # GH10447

From 06c5c6e726bc7991f512e5436eec4c1f6e17850c Mon Sep 17 00:00:00 2001
From: Kyle Kelley
Date: Sat, 4 Mar 2017 03:09:45 -0800
Subject: [PATCH 223/338] BUG: handle empty lists in json_normalize (#15535)

closes #15534
---
 doc/source/whatsnew/v0.20.0.txt        | 1 +
 pandas/io/json/normalize.py            | 3 +++
 pandas/tests/io/json/test_normalize.py | 5 +++++
 3 files changed, 9 insertions(+)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 782ae6082c1cf..8b6c53a159ad8 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -192,6 +192,7 @@ Other enhancements
 - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`)
 - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
 - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
+- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index f29472155da17..0e7d025e81851 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -157,6 +157,9 @@ def _pull_field(js, spec):

         return result

+    if isinstance(data, list) and len(data) == 0:
+        return DataFrame()
+
     # A bit of a hackjob
     if isinstance(data, dict):
         data = [data]
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index c60b81ffe504d..f881f4dafe0f3 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -62,6 +62,11 @@ def test_simple_normalize(self):

         tm.assert_frame_equal(result, expected)

+    def test_empty_array(self):
+        result = json_normalize([])
+        expected = DataFrame()
+        tm.assert_frame_equal(result, expected)
+
     def test_more_deeply_nested(self):
         data = [{'country': 'USA',
                  'states': [{'name': 'California',
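A quick check of the new behavior (illustrative only, against a build that includes this fix)::

    from pandas.io.json import json_normalize

    # an empty list now normalizes to an empty DataFrame
    # instead of erroring out
    json_normalize([]).shape   # (0, 0)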
From 7235dde63ce39f789e393ffd78a80217be18c463 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 4 Mar 2017 05:50:04 -0600
Subject: [PATCH 224/338] ENH: Added to_json_schema (#14904)

Lays the groundwork for https://github.com/pandas-dev/pandas/issues/14386

This handles the schema part of the request there. We'll still need to
do the work to publish the data to the frontend, but that can be done
as a followup.

Added publish to dataframe repr
---
 ci/requirements-2.7.pip                        |   2 +
 ci/requirements-3.5.run                        |   1 +
 ci/requirements-3.6.run                        |   1 +
 doc/source/api.rst                             |   1 +
 doc/source/io.rst                              | 120 +++++
 doc/source/options.rst                         |  21 +
 doc/source/whatsnew/v0.20.0.txt                |  35 ++
 pandas/core/config_init.py                     |  10 +
 pandas/core/generic.py                         |  86 +++-
 pandas/io/json/__init__.py                     |   3 +-
 pandas/io/json/json.py                         |  89 +++-
 pandas/io/json/table_schema.py                 | 177 +++++++
 pandas/tests/formats/test_printing.py          |  61 +++
 pandas/tests/io/json/test_json_table_schema.py | 462 ++++++++++++++++++
 pandas/util/testing.py                         |  19 +
 15 files changed, 1072 insertions(+), 16 deletions(-)
 create mode 100644 pandas/io/json/table_schema.py
 create mode 100644 pandas/tests/io/json/test_json_table_schema.py

diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip
index 08240184f2934..eb796368e7820 100644
--- a/ci/requirements-2.7.pip
+++ b/ci/requirements-2.7.pip
@@ -4,3 +4,5 @@ pathlib
 backports.lzma
 py
 PyCrypto
+mock
+ipython
diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run
index b07ce611c79a2..43e6814ed6c8e 100644
--- a/ci/requirements-3.5.run
+++ b/ci/requirements-3.5.run
@@ -18,3 +18,4 @@ pymysql
 psycopg2
 s3fs
 beautifulsoup4
+ipython
diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run
index 5d9cb05a7b402..9a6c1c7edbc5e 100644
--- a/ci/requirements-3.6.run
+++ b/ci/requirements-3.6.run
@@ -18,3 +18,4 @@ pymysql
 beautifulsoup4
 s3fs
 xarray
+ipython
diff --git a/doc/source/api.rst b/doc/source/api.rst
index 6c4a3cff5b4cf..33ac5fde651d4 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -60,6 +60,7 @@ JSON
    :toctree: generated/

    json_normalize
+   build_table_schema

 .. currentmodule:: pandas

diff --git a/doc/source/io.rst b/doc/source/io.rst
index b36ae8c2ed450..c34cc1ec17512 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2033,6 +2033,126 @@ using Hadoop or Spark.
     df
     df.to_json(orient='records', lines=True)

+
+.. _io.table_schema:
+
+Table Schema
+''''''''''''
+
+.. versionadded:: 0.20.0
+
+`Table Schema`_ is a spec for describing tabular datasets as a JSON
+object. The JSON includes information on the field names, types, and
+other attributes. You can use the orient ``table`` to build
+a JSON string with two fields, ``schema`` and ``data``.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+   df.to_json(orient='table', date_format="iso")
+
+The ``schema`` field contains the ``fields`` key, which itself contains
+a list of column name to type pairs, including the ``Index`` or ``MultiIndex``
+(see below for a list of types).
+The ``schema`` field also contains a ``primaryKey`` field if the (Multi)index
+is unique.
+
+The second field, ``data``, contains the serialized data with the ``records``
+orient.
+The index is included, and any datetimes are ISO 8601 formatted, as required
+by the Table Schema spec.
+
+The full list of types supported is described in the Table Schema
+spec. This table shows the mapping from pandas types:
+
+===============  =================
+Pandas type      Table Schema type
+===============  =================
+int64            integer
+float64          number
+bool             boolean
+datetime64[ns]   datetime
+timedelta64[ns]  duration
+categorical      any
+object           string
+===============  =================
+
+A few notes on the generated table schema:
+
+- The ``schema`` object contains a ``pandas_version`` field. This contains
+  the version of pandas' dialect of the schema, and will be incremented
+  with each revision.
+- All dates are converted to UTC when serializing, even timezone-naïve
+  values, which are treated as UTC with an offset of 0.
+
+  .. ipython:: python
+
+     from pandas.io.json import build_table_schema
+     s = pd.Series(pd.date_range('2016', periods=4))
+     build_table_schema(s)
+
+- datetimes with a timezone (before serializing) include an additional field
+  ``tz`` with the time zone name (e.g. ``'US/Central'``).
+
+  .. ipython:: python
+
+     s_tz = pd.Series(pd.date_range('2016', periods=12,
+                                    tz='US/Central'))
+     build_table_schema(s_tz)
+
+- Periods are converted to timestamps before serialization, and so have the
+  same behavior of being converted to UTC. In addition, periods will contain
+  an additional field ``freq`` with the period's frequency, e.g. ``'A-DEC'``:
+
+  .. ipython:: python
+
+     s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC',
+                                                periods=4))
+     build_table_schema(s_per)
+
+- Categoricals use the ``any`` type and an ``enum`` constraint listing
+  the set of possible values. Additionally, an ``ordered`` field is included:
+
+  .. ipython:: python
+
+     s_cat = pd.Series(pd.Categorical(['a', 'b', 'a']))
+     build_table_schema(s_cat)
+
+- A ``primaryKey`` field, containing an array of labels, is included
+  *if the index is unique*:
+
+  .. ipython:: python
+
+     s_dupe = pd.Series([1, 2], index=[1, 1])
+     build_table_schema(s_dupe)
+
+- The ``primaryKey`` behavior is the same with MultiIndexes, but in this
+  case the ``primaryKey`` is an array:
+
+  .. ipython:: python
+
+     s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'),
+                                                              (0, 1)]))
+     build_table_schema(s_multi)
+
+- The default naming roughly follows these rules:
+
+  + For series, the ``object.name`` is used. If that's None, then the
+    name is ``values``
+  + For DataFrames, the stringified version of the column name is used
+  + For ``Index`` (not ``MultiIndex``), ``index.name`` is used, with a
+    fallback to ``index`` if that is None.
+  + For ``MultiIndex``, ``mi.names`` is used. If any level has no name,
+    then ``level_<i>`` is used.
+
+.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+
 HTML
 ----
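To make the shape of that payload concrete, a small sketch (illustrative only, not part of the patch) that parses the two top-level fields back out::

    import json
    import pandas as pd

    df = pd.DataFrame({'A': [1, 2]}, index=pd.Index([0, 1], name='idx'))
    payload = json.loads(df.to_json(orient='table'))

    payload['schema']['primaryKey']                   # ['idx']
    [f['name'] for f in payload['schema']['fields']]  # ['idx', 'A']
    payload['data'][0]                                # {'idx': 0, 'A': 1}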
diff --git a/doc/source/options.rst b/doc/source/options.rst
index 10a13ed36df8d..1a0e5cf6b7235 100644
--- a/doc/source/options.rst
+++ b/doc/source/options.rst
@@ -397,6 +397,9 @@ display.width 80 Width of the display in charact
                                        IPython qtconsole, or IDLE do not run in a
                                        terminal and hence it is not possible
                                        to correctly detect the width.
+display.html.table_schema  False       Whether to publish a Table Schema
+                                       representation for frontends that
+                                       support it.
 html.border                1           A ``border=value`` attribute is
                                        inserted in the ``<table>`` tag
                                        for the DataFrame HTML repr.
@@ -424,6 +427,7 @@ mode.use_inf_as_null False True means treat None, NaN, -IN
                                        are not null (new way).
 =================================== ============ ==================================

+
 .. _basics.console_output:

 Number Formatting
@@ -512,3 +516,20 @@ Enabling ``display.unicode.ambiguous_as_wide`` lets pandas to figure these chara

     pd.set_option('display.unicode.east_asian_width', False)
     pd.set_option('display.unicode.ambiguous_as_wide', False)
+
+.. _options.table_schema:
+
+Table Schema Display
+--------------------
+
+.. versionadded:: 0.20.0
+
+``DataFrame`` and ``Series`` can publish a Table Schema representation.
+Disabled by default, this can be enabled globally with the
+``display.html.table_schema`` option:
+
+.. ipython:: python
+
+    pd.set_option('display.html.table_schema', True)
+
+Only up to ``'display.max_rows'`` rows are serialized and published.
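In a Jupyter or nteract session the visible effect is an extra MIME key in what gets displayed; a minimal sketch (illustrative only, assumes IPython is installed)::

    import pandas as pd

    pd.set_option('display.html.table_schema', True)

    # displays of a Series/DataFrame now also publish an
    # 'application/vnd.dataresource+json' payload alongside text/plain
    pd.DataFrame({'A': [1, 2]})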
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8b6c53a159ad8..7b4538bd181d2 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -12,6 +12,7 @@ Highlights include:
 - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
 - The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
 - Switched the test framework to `pytest`_ (:issue:`13097`)
+- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here <whatsnew_0200.enhancements.table_schema>`

 .. _pytest: http://doc.pytest.org/en/latest/

@@ -154,6 +155,40 @@ New Behavior:

     df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()

+.. _whatsnew_0200.enhancements.table_schema:
+
+Table Schema Output
+^^^^^^^^^^^^^^^^^^^
+
+The new orient ``'table'`` for :meth:`DataFrame.to_json`
+will generate a `Table Schema`_ compatible string representation of
+the data.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+   df.to_json(orient='table')
+
+
+See :ref:`IO: Table Schema <io.table_schema>` for more.
+
+Additionally, the repr for ``DataFrame`` and ``Series`` can now publish
+this JSON Table schema representation of the Series or DataFrame if you are
+using IPython (or another frontend like `nteract`_ using the Jupyter messaging
+protocol).
+This gives frontends like the Jupyter notebook and `nteract`_
+more flexibility in how they display pandas objects, since they have
+more information about the data.
+You must enable this by setting the ``display.html.table_schema`` option to True.
+
+.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+.. _nteract: http://nteract.io/
+
 .. _whatsnew_0200.enhancements.other:

 Other enhancements
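One interaction worth spelling out: ``orient='table'`` requires ISO dates, so the table-schema writer rejects an explicit ``date_format='epoch'``. A sketch (illustrative only)::

    import pandas as pd

    df = pd.DataFrame({'A': pd.date_range('2016-01-01', periods=2)})

    df.to_json(orient='table')    # fine; date_format defaults to 'iso'
    # df.to_json(orient='table', date_format='epoch')  # raises ValueError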
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 89616890e1de1..931fe0661818d 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -164,6 +164,13 @@
     (default: False)
 """

+pc_table_schema_doc = """
+: boolean
+    Whether to publish a Table Schema representation for frontends
+    that support it.
+    (default: False)
+"""
+
 pc_line_width_deprecation_warning = """\
 line_width has been deprecated, use display.width instead (currently both are
 identical)
@@ -366,6 +373,9 @@ def mpl_style_cb(key):
                        validator=is_text)
     cf.register_option('latex.multirow', False, pc_latex_multirow,
                        validator=is_bool)
+    cf.register_option('html.table_schema', False, pc_table_schema_doc,
+                       validator=is_bool)
+

 cf.deprecate_option('display.line_width',
                     msg=pc_line_width_deprecation_warning,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 127aac970fbc1..298fa75779420 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4,6 +4,7 @@
 import operator
 import weakref
 import gc
+import json

 import numpy as np
 import pandas.lib as lib
@@ -129,6 +130,37 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
         object.__setattr__(self, '_data', data)
         object.__setattr__(self, '_item_cache', {})

+    def _ipython_display_(self):
+        try:
+            from IPython.display import display
+        except ImportError:
+            return None
+
+        # Series doesn't define _repr_html_ or _repr_latex_
+        latex = self._repr_latex_() if hasattr(self, '_repr_latex_') else None
+        html = self._repr_html_() if hasattr(self, '_repr_html_') else None
+        table_schema = self._repr_table_schema_()
+        # We need the initial newline since we aren't going through the
+        # usual __repr__. See
+        # https://github.com/pandas-dev/pandas/pull/14904#issuecomment-277829277
+        text = "\n" + repr(self)
+
+        reprs = {"text/plain": text, "text/html": html, "text/latex": latex,
+                 "application/vnd.dataresource+json": table_schema}
+        reprs = {k: v for k, v in reprs.items() if v}
+        display(reprs, raw=True)
+
+    def _repr_table_schema_(self):
+        """
+        Not a real Jupyter special repr method, but we use the same
+        naming convention.
+        """
+        if config.get_option("display.html.table_schema"):
+            data = self.head(config.get_option('display.max_rows'))
+            payload = json.loads(data.to_json(orient='table'),
+                                 object_pairs_hook=collections.OrderedDict)
+            return payload
+
     def _validate_dtype(self, dtype):
         """ validate the passed dtype """

@@ -1094,7 +1126,7 @@ def __setstate__(self, state):
     strings before writing.
     """

-    def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
+    def to_json(self, path_or_buf=None, orient=None, date_format=None,
                 double_precision=10, force_ascii=True, date_unit='ms',
                 default_handler=None, lines=False):
         """
@@ -1129,10 +1161,17 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
               - index : dict like {index -> {column -> value}}
               - columns : dict like {column -> {index -> value}}
               - values : just the values array
+              - table : dict like {'schema': {schema}, 'data': {data}}
+                describing the data, and the data component is
+                like ``orient='records'``.

-        date_format : {'epoch', 'iso'}
+                .. versionchanged:: 0.20.0
+
+        date_format : {None, 'epoch', 'iso'}
             Type of date conversion. `epoch` = epoch milliseconds,
-            `iso`` = ISO8601, default is epoch.
+            `iso` = ISO8601. The default depends on the `orient`. For
+            `orient='table'`, the default is `'iso'`. For all other orients,
+            the default is `'epoch'`.
         double_precision : The number of decimal places to use when encoding
             floating point values, default 10.
         force_ascii : force encoded string to be ASCII, default True.
@@ -1151,14 +1190,53 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',

             .. 
versionadded:: 0.19.0 - Returns ------- same type as input object with filtered info axis + See Also + -------- + pd.read_json + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + + Encoding with Table Schema + + >>> df.to_json(orient='table') + '{"schema": {"fields": [{"name": "index", "type": "string"}, + {"name": "col 1", "type": "string"}, + {"name": "col 2", "type": "string"}], + "primaryKey": "index", + "pandas_version": "0.20.0"}, + "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, + {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ from pandas.io import json + if date_format is None and orient == 'table': + date_format = 'iso' + elif date_format is None: + date_format = 'epoch' return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, date_format=date_format, double_precision=double_precision, diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index a9390a04cc2cd..32d110b3404a9 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -1,4 +1,5 @@ from .json import to_json, read_json, loads, dumps # noqa from .normalize import json_normalize # noqa +from .table_schema import build_table_schema # noqa -del json, normalize # noqa +del json, normalize, table_schema # noqa diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 6fc766081eefe..a00d3492e8a37 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -1,5 +1,4 @@ # pylint: disable-msg=E1101,W0613,W0603 - import os import numpy as np @@ -12,10 +11,14 @@ from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits +from .table_schema import build_table_schema +from pandas.types.common import is_period_dtype loads = _json.loads dumps = _json.dumps +TABLE_SCHEMA_VERSION = '0.20.0' + # interface to/from def to_json(path_or_buf, obj, orient=None, date_format='epoch', @@ -26,19 +29,22 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', raise ValueError( "'lines' keyword only valid when 'orient' is records") - if isinstance(obj, Series): - s = SeriesWriter( - obj, orient=orient, date_format=date_format, - double_precision=double_precision, ensure_ascii=force_ascii, - date_unit=date_unit, default_handler=default_handler).write() + if orient == 'table' and isinstance(obj, Series): + obj = obj.to_frame(name=obj.name or 'values') + if orient == 'table' and isinstance(obj, DataFrame): + writer = JSONTableWriter + elif isinstance(obj, Series): + writer = SeriesWriter elif isinstance(obj, DataFrame): - s = FrameWriter( - obj, orient=orient, date_format=date_format, - double_precision=double_precision, ensure_ascii=force_ascii, - date_unit=date_unit, default_handler=default_handler).write() + writer = FrameWriter else: raise NotImplementedError("'obj' should be a Series or a DataFrame") + s = writer( + obj, orient=orient, date_format=date_format, 
+        double_precision=double_precision, ensure_ascii=force_ascii,
+        date_unit=date_unit, default_handler=default_handler).write()
+
     if lines:
         s = _convert_to_line_delimits(s)

@@ -81,7 +87,8 @@ def write(self):
                              ensure_ascii=self.ensure_ascii,
                              date_unit=self.date_unit,
                              iso_dates=self.date_format == 'iso',
-                             default_handler=self.default_handler)
+                             default_handler=self.default_handler
+                             )


 class SeriesWriter(Writer):
@@ -108,6 +115,55 @@ def _format_axes(self):
                              "'%s'." % self.orient)


+class JSONTableWriter(FrameWriter):
+    _default_orient = 'records'
+
+    def __init__(self, obj, orient, date_format, double_precision,
+                 ensure_ascii, date_unit, default_handler=None):
+        """
+        Adds a `schema` attribute with the Table Schema; resets
+        the index (this can't be done in the caller, because the schema
+        inference needs to know what the index is); forces orient to
+        'records'; and forces date_format to 'iso'.
+        """
+        super(JSONTableWriter, self).__init__(
+            obj, orient, date_format, double_precision, ensure_ascii,
+            date_unit, default_handler=default_handler)
+
+        if date_format != 'iso':
+            msg = ("Trying to write with `orient='table'` and "
+                   "`date_format='%s'`. Table Schema requires dates "
+                   "to be formatted with `date_format='iso'`" % date_format)
+            raise ValueError(msg)
+
+        self.schema = build_table_schema(obj)
+
+        # TODO: Do this timedelta properly in objToJSON.c. See GH #15137
+        if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
+                len(obj.columns & obj.index.names)):
+            msg = "Overlapping names between the index and columns"
+            raise ValueError(msg)
+
+        obj = obj.copy()
+        timedeltas = obj.select_dtypes(include=['timedelta']).columns
+        if len(timedeltas):
+            obj[timedeltas] = obj[timedeltas].applymap(
+                lambda x: x.isoformat())
+        # Convert PeriodIndex to datetimes before serializing
+        if is_period_dtype(obj.index):
+            obj.index = obj.index.to_timestamp()
+
+        self.obj = obj.reset_index()
+        self.date_format = 'iso'
+        self.orient = 'records'
+
+    def write(self):
+        data = super(JSONTableWriter, self).write()
+        serialized = '{{"schema": {}, "data": {}}}'.format(
+            dumps(self.schema), data)
+        return serialized
+
+
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
@@ -244,6 +300,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     col 1 col 2
     0     a     b
     1     c     d
+
+    Encoding with Table Schema
+
+    >>> df.to_json(orient='table')
+    '{"schema": {"fields": [{"name": "index", "type": "string"},
+                            {"name": "col 1", "type": "string"},
+                            {"name": "col 2", "type": "string"}],
+                 "primaryKey": "index",
+                 "pandas_version": "0.20.0"},
+      "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
+               {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
     """

     filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
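A small consequence of the Series handling above (illustrative only): an unnamed ``Series`` is serialized under the column name ``values``::

    import json
    import pandas as pd

    s = pd.Series([1, 2, 3])    # no name
    payload = json.loads(s.to_json(orient='table'))
    [f['name'] for f in payload['schema']['fields']]   # ['index', 'values']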
diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
new file mode 100644
index 0000000000000..48f92d28baf61
--- /dev/null
+++ b/pandas/io/json/table_schema.py
@@ -0,0 +1,177 @@
+"""
+Table Schema builders
+
+http://specs.frictionlessdata.io/json-table-schema/
+"""
+from pandas.types.common import (
+    is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
+    is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
+    is_categorical_dtype, is_period_dtype, is_string_dtype
+)
+
+
+def as_json_table_type(x):
+    """
+    Convert a NumPy / pandas type to its corresponding json_table type.
+
+    Parameters
+    ----------
+    x : array or dtype
+
+    Returns
+    -------
+    t : str
+        the Table Schema data type
+
+    Notes
+    -----
+    This table shows the relationship between NumPy / pandas dtypes,
+    and Table Schema dtypes.
+
+    ===============  =================
+    Pandas type      Table Schema type
+    ===============  =================
+    int64            integer
+    float64          number
+    bool             boolean
+    datetime64[ns]   datetime
+    timedelta64[ns]  duration
+    object           string
+    categorical      any
+    ===============  =================
+    """
+    if is_integer_dtype(x):
+        return 'integer'
+    elif is_bool_dtype(x):
+        return 'boolean'
+    elif is_numeric_dtype(x):
+        return 'number'
+    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
+          is_period_dtype(x)):
+        return 'datetime'
+    elif is_timedelta64_dtype(x):
+        return 'duration'
+    elif is_categorical_dtype(x):
+        return 'any'
+    elif is_string_dtype(x):
+        return 'string'
+    else:
+        return 'any'
+
+
+def set_default_names(data):
+    """Sets index names to 'index' for regular, or 'level_x' for Multi"""
+    if all(name is not None for name in data.index.names):
+        return data
+
+    data = data.copy()
+    if data.index.nlevels > 1:
+        names = [name if name is not None else 'level_{}'.format(i)
+                 for i, name in enumerate(data.index.names)]
+        data.index.names = names
+    else:
+        data.index.name = data.index.name or 'index'
+    return data
+
+
+def make_field(arr, dtype=None):
+    dtype = dtype or arr.dtype
+    field = {'name': arr.name or 'values',
+             'type': as_json_table_type(dtype)}
+
+    if is_categorical_dtype(arr):
+        if hasattr(arr, 'categories'):
+            cats = arr.categories
+            ordered = arr.ordered
+        else:
+            cats = arr.cat.categories
+            ordered = arr.cat.ordered
+        field['constraints'] = {"enum": list(cats)}
+        field['ordered'] = ordered
+    elif is_period_dtype(arr):
+        field['freq'] = arr.freqstr
+    elif is_datetime64tz_dtype(arr):
+        if hasattr(arr, 'dt'):
+            field['tz'] = arr.dt.tz.zone
+        else:
+            field['tz'] = arr.tz.zone
+    return field
+
+
+def build_table_schema(data, index=True, primary_key=None, version=True):
+    """
+    Create a Table schema from ``data``.
+
+    Parameters
+    ----------
+    data : Series, DataFrame
+    index : bool, default True
+        Whether to include ``data.index`` in the schema.
+    primary_key : str or list of str, optional, default None
+        Column names to designate as the primary key.
+        The default `None` will set `'primaryKey'` to the index
+        level or levels if the index is unique.
+    version : bool, default True
+        Whether to include a field `pandas_version` with the version
+        of pandas that generated the schema.
+
+    Returns
+    -------
+    schema : dict
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {'A': [1, 2, 3],
+    ...      'B': ['a', 'b', 'c'],
+    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
+    ...     }, index=pd.Index(range(3), name='idx'))
+    >>> build_table_schema(df)
+    {'fields': [{'name': 'idx', 'type': 'integer'},
+                {'name': 'A', 'type': 'integer'},
+                {'name': 'B', 'type': 'string'},
+                {'name': 'C', 'type': 'datetime'}],
+     'pandas_version': '0.20.0',
+     'primaryKey': ['idx']}
+
+    Notes
+    -----
+    See `as_json_table_type` for conversion types.
+    Timedeltas are converted to ISO8601 duration format, with
+    9 decimal places after the seconds field for nanosecond precision.
+
+    Categoricals are converted to the `any` dtype, and use the `enum` field
+    constraint to list the allowed values. The `ordered` attribute is included
+    in an `ordered` field. 
+ """ + if index is True: + data = set_default_names(data) + + schema = {} + fields = [] + + if index: + if data.index.nlevels > 1: + for level in data.index.levels: + fields.append(make_field(level)) + else: + fields.append(make_field(data.index)) + + if data.ndim > 1: + for column, s in data.iteritems(): + fields.append(make_field(s)) + else: + fields.append(make_field(data)) + + schema['fields'] = fields + if index and data.index.is_unique and primary_key is None: + if data.index.nlevels == 1: + schema['primaryKey'] = [data.index.name] + else: + schema['primaryKey'] = data.index.names + elif primary_key is not None: + schema['primaryKey'] = primary_key + + if version: + schema['pandas_version'] = '0.20.0' + return schema diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index 52f3e06c6cbd0..cacba2ad3f3ba 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +import pytest from pandas import compat +import pandas as pd import pandas.formats.printing as printing import pandas.formats.format as fmt import pandas.util.testing as tm @@ -118,6 +120,65 @@ def test_ambiguous_width(self): self.assertEqual(adjoined, expected) +class TestTableSchemaRepr(tm.TestCase): + + @classmethod + def setUpClass(cls): + pytest.importorskip('IPython') + try: + import mock + except ImportError: + try: + from unittest import mock + except ImportError: + pytest.skip("Mock is not installed") + cls.mock = mock + + def test_publishes(self): + df = pd.DataFrame({"A": [1, 2]}) + objects = [df['A'], df, df] # dataframe / series + expected_keys = [ + {'text/plain', 'application/vnd.dataresource+json'}, + {'text/plain', 'text/html', 'application/vnd.dataresource+json'}, + ] + + make_patch = self.mock.patch('IPython.display.display') + opt = pd.option_context('display.html.table_schema', True) + for obj, expected in zip(objects, expected_keys): + with opt, make_patch as mock_display: + handle = obj._ipython_display_() + self.assertEqual(mock_display.call_count, 1) + self.assertIsNone(handle) + args, kwargs = mock_display.call_args + arg, = args # just one argument + + self.assertEqual(kwargs, {"raw": True}) + self.assertEqual(set(arg.keys()), expected) + + with_latex = pd.option_context('display.latex.repr', True) + + with opt, with_latex, make_patch as mock_display: + handle = obj._ipython_display_() + args, kwargs = mock_display.call_args + arg, = args + + expected = {'text/plain', 'text/html', 'text/latex', + 'application/vnd.dataresource+json'} + self.assertEqual(set(arg.keys()), expected) + + def test_config_on(self): + df = pd.DataFrame({"A": [1, 2]}) + with pd.option_context("display.html.table_schema", True): + result = df._repr_table_schema_() + self.assertIsNotNone(result) + + def test_config_default_off(self): + df = pd.DataFrame({"A": [1, 2]}) + with pd.option_context("display.html.table_schema", False): + result = df._repr_table_schema_() + self.assertIsNone(result) + + # TODO: fix this broken test # def test_console_encode(): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py new file mode 100644 index 0000000000000..d1795f2816817 --- /dev/null +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -0,0 +1,462 @@ +"""Tests for Table Schema integration.""" +import json +from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +from pandas import DataFrame +from pandas.types.dtypes import 
PeriodDtype, CategoricalDtype, DatetimeTZDtype
+import pandas.util.testing as tm
+from pandas.io.json.table_schema import (
+    as_json_table_type, build_table_schema, make_field, set_default_names
+)
+
+
+class TestBuildSchema(tm.TestCase):
+
+    def setUp(self):
+        self.df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+    def test_build_table_schema(self):
+        result = build_table_schema(self.df, version=False)
+        expected = {
+            'fields': [{'name': 'idx', 'type': 'integer'},
+                       {'name': 'A', 'type': 'integer'},
+                       {'name': 'B', 'type': 'string'},
+                       {'name': 'C', 'type': 'datetime'},
+                       {'name': 'D', 'type': 'duration'},
+                       ],
+            'primaryKey': ['idx']
+        }
+        self.assertEqual(result, expected)
+        result = build_table_schema(self.df)
+        self.assertTrue("pandas_version" in result)
+
+    def test_series(self):
+        s = pd.Series([1, 2, 3], name='foo')
+        result = build_table_schema(s, version=False)
+        expected = {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'foo', 'type': 'integer'}],
+                    'primaryKey': ['index']}
+        self.assertEqual(result, expected)
+        result = build_table_schema(s)
+        self.assertTrue('pandas_version' in result)
+
+    def test_series_unnamed(self):
+        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
+        expected = {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'values', 'type': 'integer'}],
+                    'primaryKey': ['index']}
+        self.assertEqual(result, expected)
+
+    def test_multiindex(self):
+        df = self.df.copy()
+        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
+        df.index = idx
+
+        result = build_table_schema(df, version=False)
+        expected = {
+            'fields': [{'name': 'level_0', 'type': 'string'},
+                       {'name': 'level_1', 'type': 'integer'},
+                       {'name': 'A', 'type': 'integer'},
+                       {'name': 'B', 'type': 'string'},
+                       {'name': 'C', 'type': 'datetime'},
+                       {'name': 'D', 'type': 'duration'},
+                       ],
+            'primaryKey': ['level_0', 'level_1']
+        }
+        self.assertEqual(result, expected)
+
+        df.index.names = ['idx0', None]
+        expected['fields'][0]['name'] = 'idx0'
+        expected['primaryKey'] = ['idx0', 'level_1']
+        result = build_table_schema(df, version=False)
+        self.assertEqual(result, expected)
+
+
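[Illustration -- not part of the patch: how the default index naming tested
above plays out in practice. The frame is a made-up example.]

    import pandas as pd
    from pandas.io.json.table_schema import build_table_schema

    df = pd.DataFrame({'A': [1, 2]},
                      index=pd.MultiIndex.from_product([('a',), (1, 2)]))
    build_table_schema(df, version=False)
    # unnamed levels become 'level_0' / 'level_1' and form the primaryKey

+class TestTableSchemaType(tm.TestCase):
+
+    def test_as_json_table_type_int_data(self):
+        int_data = [1, 2, 3]
+        int_types = [np.int, np.int16, np.int32, np.int64]
+        for t in int_types:
+            self.assertEqual(as_json_table_type(np.array(int_data, dtype=t)),
+                             'integer')
+
+    def test_as_json_table_type_float_data(self):
+        float_data = [1., 2., 3.]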
+        float_types = [np.float, np.float16, np.float32, np.float64]
+        for t in float_types:
+            self.assertEqual(as_json_table_type(np.array(float_data,
+                                                         dtype=t)),
+                             'number')
+
+    def test_as_json_table_type_bool_data(self):
+        bool_data = [True, False]
+        bool_types = [bool, np.bool]
+        for t in bool_types:
+            self.assertEqual(as_json_table_type(np.array(bool_data, dtype=t)),
+                             'boolean')
+
+    def test_as_json_table_type_date_data(self):
+        date_data = [pd.to_datetime(['2016']),
+                     pd.to_datetime(['2016'], utc=True),
+                     pd.Series(pd.to_datetime(['2016'])),
+                     pd.Series(pd.to_datetime(['2016'], utc=True)),
+                     pd.period_range('2016', freq='A', periods=3)]
+        for arr in date_data:
+            self.assertEqual(as_json_table_type(arr), 'datetime')
+
+    def test_as_json_table_type_string_data(self):
+        strings = [pd.Series(['a', 'b']), pd.Index(['a', 'b'])]
+        for t in strings:
+            self.assertEqual(as_json_table_type(t), 'string')
+
+    def test_as_json_table_type_categorical_data(self):
+        self.assertEqual(as_json_table_type(pd.Categorical(['a'])), 'any')
+        self.assertEqual(as_json_table_type(pd.Categorical([1])), 'any')
+        self.assertEqual(as_json_table_type(
+            pd.Series(pd.Categorical([1]))), 'any')
+        self.assertEqual(as_json_table_type(pd.CategoricalIndex([1])), 'any')
+        self.assertEqual(as_json_table_type(pd.Categorical([1])), 'any')
+
+    # ------
+    # dtypes
+    # ------
+    def test_as_json_table_type_int_dtypes(self):
+        integers = [np.int, np.int16, np.int32, np.int64]
+        for t in integers:
+            self.assertEqual(as_json_table_type(t), 'integer')
+
+    def test_as_json_table_type_float_dtypes(self):
+        floats = [np.float, np.float16, np.float32, np.float64]
+        for t in floats:
+            self.assertEqual(as_json_table_type(t), 'number')
+
+    def test_as_json_table_type_bool_dtypes(self):
+        bools = [bool, np.bool]
+        for t in bools:
+            self.assertEqual(as_json_table_type(t), 'boolean')
+
+    def test_as_json_table_type_date_dtypes(self):
+        # TODO: datetime.date? datetime.time?
+        dates = [np.datetime64, np.dtype("
Date: Sat, 4 Mar 2017 15:14:36 +0100
Subject: [PATCH 225/338] DEPR/CLN: remove SparseTimeSeries class (follow-up
 GH15098) (#15567)

---
 doc/source/whatsnew/v0.20.0.txt    |  2 ++
 pandas/compat/pickle_compat.py     |  3 ++-
 pandas/sparse/api.py               |  2 +-
 pandas/sparse/series.py            | 11 -----------
 pandas/tests/api/test_api.py       |  3 +--
 pandas/tests/sparse/test_series.py |  6 ------
 6 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 7b4538bd181d2..eac187b52f65d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -621,6 +621,8 @@ Removal of prior version deprecations/changes
   Similar functionality can be found in the `Google2Pandas `__ package.
 - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`)
 - ``pandas.stats.fama_macbeth``, ``pandas.stats.ols``, ``pandas.stats.plm`` and ``pandas.stats.var``, as well as the top-level ``pandas.fama_macbeth`` and ``pandas.ols`` routines are removed. Similar functionaility can be found in the `statsmodels `__ package. (:issue:`11898`)
+- The ``TimeSeries`` and ``SparseTimeSeries`` classes, aliases of ``Series``
+  and ``SparseSeries``, are removed (:issue:`10890`, :issue:`15098`).
 - ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:``)
 - The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`).
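[Illustration -- not part of the patch: migrating off the removed alias. The
variable names are made up.]

    import pandas as pd

    # before (removed in 0.20.0):
    #   s = pd.SparseTimeSeries(data, index=idx)
    # after: use SparseSeries directly with a datetime index
    s = pd.SparseSeries([1.0, 2.0],
                        index=pd.date_range('2013-01-01', periods=2))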
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index b8ccd13c153d4..25a170c3eb121 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -61,7 +61,8 @@ def load_reduce(self): ('pandas.core.base', 'FrozenList'): ('pandas.indexes.frozen', 'FrozenList'), # 10890 - ('pandas.core.series', 'TimeSeries'): ('pandas.core.series', 'Series') + ('pandas.core.series', 'TimeSeries'): ('pandas.core.series', 'Series'), + ('pandas.sparse.series', 'SparseTimeSeries'): ('pandas.sparse.series', 'SparseSeries') } diff --git a/pandas/sparse/api.py b/pandas/sparse/api.py index 55841fbeffa2d..90be0a216535f 100644 --- a/pandas/sparse/api.py +++ b/pandas/sparse/api.py @@ -2,5 +2,5 @@ # flake8: noqa from pandas.sparse.array import SparseArray from pandas.sparse.list import SparseList -from pandas.sparse.series import SparseSeries, SparseTimeSeries +from pandas.sparse.series import SparseSeries from pandas.sparse.frame import SparseDataFrame diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index dfdbb3c89814a..a3b701169ce91 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -844,14 +844,3 @@ def from_coo(cls, A, dense_index=False): comp_method=_arith_method, bool_method=None, use_numexpr=False, force=True) - - -# backwards compatiblity -class SparseTimeSeries(SparseSeries): - - def __init__(self, *args, **kwargs): - # deprecation TimeSeries, #10890 - warnings.warn("SparseTimeSeries is deprecated. Please use " - "SparseSeries", FutureWarning, stacklevel=2) - - super(SparseTimeSeries, self).__init__(*args, **kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index f2f7a9c778e66..2f8ebc4cc1df4 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -57,8 +57,7 @@ class TestPDApi(Base, tm.TestCase): 'TimedeltaIndex', 'Timestamp'] # these are already deprecated; awaiting removal - deprecated_classes = ['WidePanel', - 'SparseTimeSeries', 'Panel4D', + deprecated_classes = ['WidePanel', 'Panel4D', 'SparseList', 'Expr', 'Term'] # these should be deprecated in the future diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index d4543b97af4dd..de6636162ff05 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -112,12 +112,6 @@ def test_iteration_and_str(self): [x for x in self.bseries] str(self.bseries) - def test_TimeSeries_deprecation(self): - - # deprecation TimeSeries, #10890 - with tm.assert_produces_warning(FutureWarning): - pd.SparseTimeSeries(1, index=pd.date_range('20130101', periods=3)) - def test_construct_DataFrame_with_sp_series(self): # it works! df = DataFrame({'col': self.bseries}) From dd5d3b141cce8fbdd784caadf02444dbc00f572d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 4 Mar 2017 10:37:58 -0500 Subject: [PATCH 226/338] Revert FrozenList changes (doc build slowdown, #15559) See #15559. This temporarily reverts #15506, to see if this fixes the doc build slowdown. 
Author: Joris Van den Bossche Closes #15566 from jorisvandenbossche/revert and squashes the following commits: befd858 [Joris Van den Bossche] Revert "ENH: Added FrozenList difference setop" 527ded9 [Joris Van den Bossche] Revert "TST: remove deprecated usages of FrozenList.__add__ from test code" --- doc/source/groupby.rst | 10 ------- doc/source/whatsnew/v0.20.0.txt | 2 -- pandas/core/panel.py | 6 ++--- pandas/core/reshape.py | 6 ++--- pandas/core/strings.py | 2 +- pandas/indexes/frozen.py | 24 +++-------------- pandas/tests/groupby/test_value_counts.py | 2 +- pandas/tests/indexes/test_frozen.py | 33 +++++++---------------- pandas/tools/concat.py | 2 +- test_fast.sh | 2 +- 10 files changed, 22 insertions(+), 67 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 2d406de7c0c9b..8484ccd69a983 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -126,16 +126,6 @@ We could naturally group by either the ``A`` or ``B`` columns or both: grouped = df.groupby('A') grouped = df.groupby(['A', 'B']) -.. versionadded:: 0.20 - -If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all -but the specified columns. - -.. ipython:: python - - df2 = df.set_index(['A', 'B']) - grouped = df2.groupby(level=df2.index.names.difference(['B']) - These will split the DataFrame on its index (rows). We could also split by the columns: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index eac187b52f65d..1ba327a4ea50c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -29,7 +29,6 @@ New features - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. - ``.str.replace`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) -- ``FrozenList`` has gained the ``.difference()`` setop method (:issue:`15475`) @@ -601,7 +600,6 @@ Deprecations - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`) - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explict imports (:issue:`15358`) - ``Series/DataFrame/Panel.consolidate()`` been deprecated as a public method. (:issue:`15483`) -- ``FrozenList`` addition (new object and inplace) have been deprecated in favor of the ``.union()`` method. (:issue: `15475`) - The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`) * ``pd.pnow()``, replaced by ``Period.now()`` * ``pd.Term``, is removed, as it is not applicable to user code. 
Instead use in-line string expressions in the where clause when searching in HDFStore diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c5ea513223dce..4a6c6cf291316 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -940,9 +940,9 @@ def construct_index_parts(idx, major=True): minor_labels, minor_levels, minor_names = construct_index_parts( self.minor_axis, major=False) - levels = list(major_levels) + list(minor_levels) - labels = list(major_labels) + list(minor_labels) - names = list(major_names) + list(minor_names) + levels = major_levels + minor_levels + labels = major_labels + minor_labels + names = major_names + minor_names index = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index faad6c500a21f..87cb088c2e91e 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -216,8 +216,8 @@ def get_new_columns(self): width = len(self.value_columns) propagator = np.repeat(np.arange(width), stride) if isinstance(self.value_columns, MultiIndex): - new_levels = self.value_columns.levels.union((self.removed_level,)) - new_names = self.value_columns.names.union((self.removed_name,)) + new_levels = self.value_columns.levels + (self.removed_level,) + new_names = self.value_columns.names + (self.removed_name,) new_labels = [lab.take(propagator) for lab in self.value_columns.labels] @@ -806,7 +806,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, for col in id_vars: mdata[col] = np.tile(frame.pop(col).values, K) - mcolumns = list(id_vars) + list(var_name) + list([value_name]) + mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 51016926d6909..ac8d1db6a0bf3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -787,7 +787,7 @@ def str_extractall(arr, pat, flags=0): if 0 < len(index_list): from pandas import MultiIndex index = MultiIndex.from_tuples( - index_list, names=arr.index.names.union(["match"])) + index_list, names=arr.index.names + ["match"]) else: index = None result = arr._constructor_expanddim(match_list, index=index, diff --git a/pandas/indexes/frozen.py b/pandas/indexes/frozen.py index 47e2557333ec7..e043ba64bbad7 100644 --- a/pandas/indexes/frozen.py +++ b/pandas/indexes/frozen.py @@ -13,8 +13,6 @@ from pandas.types.cast import _coerce_indexer_dtype from pandas.formats.printing import pprint_thing -import warnings - class FrozenList(PandasObject, list): @@ -27,14 +25,11 @@ class FrozenList(PandasObject, list): # typechecks def __add__(self, other): - warnings.warn("__add__ is deprecated, use union(...)", FutureWarning) - return self.union(other) - - def __iadd__(self, other): - warnings.warn("__iadd__ is deprecated, use union(...)", FutureWarning) if isinstance(other, tuple): other = list(other) - return super(FrozenList, self).__iadd__(other) + return self.__class__(super(FrozenList, self).__add__(other)) + + __iadd__ = __add__ # Python 2 compat def __getslice__(self, i, j): @@ -85,19 +80,6 @@ def __repr__(self): __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled - def union(self, other): - """Returns a FrozenList with other concatenated to the end of self""" - if isinstance(other, tuple): - other = list(other) - return self.__class__(super(FrozenList, self).__add__(other)) - - def difference(self, other): - """Returns 
a FrozenList with the same elements as self, but with elements - that are also in other removed.""" - other = set(other) - temp = [x for x in self if x not in other] - return self.__class__(temp) - class FrozenNDArray(PandasObject, np.ndarray): diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index ff01df2693c7c..801d0da070112 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -28,7 +28,7 @@ def check_value_counts(df, keys, bins): gr = df.groupby(keys, sort=isort) right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1].union(['3rd']) + right.index.names = right.index.names[:-1] + ['3rd'] # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index a5fbf066adc83..a82409fbf9513 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -15,35 +15,20 @@ def setUp(self): self.klass = FrozenList def test_add(self): - q = FrozenList([1]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - q = q + [2, 3] - expected = FrozenList([1, 2, 3]) - self.check_result(q, expected) - - def test_iadd(self): - q = FrozenList([1]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - q += [2, 3] - expected = FrozenList([1, 2, 3]) - self.check_result(q, expected) - - def test_union(self): - result = self.container.union((1, 2, 3)) + result = self.container + (1, 2, 3) expected = FrozenList(self.lst + [1, 2, 3]) self.check_result(result, expected) - def test_difference(self): - result = self.container.difference([2]) - expected = FrozenList([1, 3, 4, 5]) + result = (1, 2, 3) + self.container + expected = FrozenList([1, 2, 3] + self.lst) self.check_result(result, expected) - def test_difference_dupe(self): - result = FrozenList([1, 2, 3, 2]).difference([2]) - expected = FrozenList([1, 3]) - self.check_result(result, expected) + def test_inplace(self): + q = r = self.container + q += [5] + self.check_result(q, self.lst + [5]) + # other shouldn't be mutated + self.check_result(r, self.lst) class TestFrozenNDArray(CheckImmutable, CheckStringMixin, tm.TestCase): diff --git a/pandas/tools/concat.py b/pandas/tools/concat.py index ae9d7af9d98ff..6405106118472 100644 --- a/pandas/tools/concat.py +++ b/pandas/tools/concat.py @@ -574,7 +574,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): " not have the same number of levels") # also copies - names = list(names) + list(_get_consensus_names(indexes)) + names = names + _get_consensus_names(indexes) return MultiIndex(levels=levels, labels=label_list, names=names, verify_integrity=False) diff --git a/test_fast.sh b/test_fast.sh index f22ab73277e8b..30ac7f84cbe8b 100755 --- a/test_fast.sh +++ b/test_fast.sh @@ -5,4 +5,4 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -pytest pandas --skip-slow --skip-network -m "not single" -n 4 $@ +pytest pandas --skip-slow --skip-network -m "not single" -n 4 From db51afb400da40269299b6322e10cfae3ca2b079 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Mar 2017 13:10:27 -0500 Subject: [PATCH 227/338] DEPR: silence some deprecation warnings --- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_panel.py | 4 ++-- 2 files 
changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 0f36af2c8c4e7..c809b39bb566e 100755
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -1646,7 +1646,7 @@ def test_multilevel_consolidate(self):
             'bar', 'one'), ('bar', 'two')])
         df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
         df['Totals', ''] = df.sum(1)
-        df = df.consolidate()
+        df = df._consolidate()

     def test_ix_preserve_names(self):
         result = self.ymd.loc[2000]
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 2f329f241a5b8..373f590cbf9eb 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -688,7 +688,7 @@ def test_ix_setitem_slice_dataframe(self):
     def test_ix_align(self):
         from pandas import Series
         b = Series(np.random.randn(10), name=0)
-        b.sort()
+        b.sort_values()

         df_orig = Panel(np.random.randn(3, 10, 2))
         df = df_orig.copy()
@@ -1001,7 +1001,7 @@ def test_consolidate(self):
         self.panel['foo'] = 1.
         self.assertFalse(self.panel._data.is_consolidated())

-        panel = self.panel.consolidate()
+        panel = self.panel._consolidate()
         self.assertTrue(panel._data.is_consolidated())

     def test_ctor_dict(self):

From f82ffa07106b8cad8463a50b235e4385a71f3f17 Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Sat, 4 Mar 2017 16:04:15 -0500
Subject: [PATCH 228/338] CLN: clean up PeriodIndex constructor

closes #13232

Material clean up of the PeriodIndex constructor, which was doing a few
weird things
(https://github.com/pydata/pandas/issues/13232#issuecomment-220788816)
and generally getting messy.

Author: Maximilian Roos

Closes #13277 from MaximilianR/period-float and squashes the following
commits:

5cae7aa [Maximilian Roos] @jreback changes
75ff54d [Maximilian Roos] _new_PeriodIndex for unpickling
240172f [Maximilian Roos] coerce freq object earlier for perf
ba5133b [Maximilian Roos] documentation
b0fc0a7 [Maximilian Roos] final changes
fa0fa9d [Maximilian Roos] clean up PeriodIndex constructor
---
 doc/source/whatsnew/v0.20.0.txt               |   1 +
 pandas/core/algorithms.py                     |   4 +-
 pandas/indexes/base.py                        |   5 +
 pandas/io/packers.py                          |   2 +-
 .../tests/indexes/period/test_construction.py |   9 +-
 pandas/tests/indexes/period/test_period.py    |   6 +
 pandas/tseries/period.py                      | 156 +++++++++---------
 setup.cfg                                     |   1 +
 8 files changed, 98 insertions(+), 86 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 1ba327a4ea50c..ca093eca30511 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -657,6 +657,7 @@ Bug Fixes
 - Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
 - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
 - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`)
+- Cleaned up ``PeriodIndex`` constructor, including raising on floats more consistently (:issue:`13277`)
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
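[Illustration -- not part of the patch: the stricter float handling described
above, taken directly from the new test_constructor_floats.]

    import pandas as pd

    try:
        pd.PeriodIndex([1.1, 2.1], freq='M')
    except TypeError as err:
        # "PeriodIndex does not allow floating point in construction"
        print(err)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index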
55d404f05dd1d..d37c98c9b9b90 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -471,8 +471,8 @@ def _value_counts_arraylike(values, dropna=True): # dtype handling if is_datetimetz_type: keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz) - if is_period_type: - keys = PeriodIndex._simple_new(keys, freq=freq) + elif is_period_type: + keys = PeriodIndex._from_ordinals(keys, freq=freq) elif is_signed_integer_dtype(dtype): values = _ensure_int64(values) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 5d43d2d32af67..e441d9a88690d 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -88,6 +88,11 @@ def _new_Index(cls, d): """ This is called upon unpickling, rather than the default which doesn't have arguments and breaks __new__ """ + # required for backward compat, because PI can't be instantiated with + # ordinals through __new__ GH #13277 + if issubclass(cls, ABCPeriodIndex): + from pandas.tseries.period import _new_PeriodIndex + return _new_PeriodIndex(cls, **d) return cls.__new__(cls, **d) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 7afe8a06b6af1..39bc1a4ecf225 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -573,7 +573,7 @@ def decode(obj): elif typ == u'period_index': data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) d = dict(name=obj[u'name'], freq=obj[u'freq']) - return globals()[obj[u'klass']](data, **d) + return globals()[obj[u'klass']]._from_ordinals(data, **d) elif typ == u'datetime_index': data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 228615829b5b8..f13a84f4f0e92 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -120,7 +120,7 @@ def test_constructor_fromarraylike(self): self.assertRaises(ValueError, PeriodIndex, idx._values) self.assertRaises(ValueError, PeriodIndex, list(idx._values)) - self.assertRaises(ValueError, PeriodIndex, + self.assertRaises(TypeError, PeriodIndex, data=Period('2007', freq='A')) result = PeriodIndex(iter(idx)) @@ -285,12 +285,15 @@ def test_constructor_simple_new_empty(self): result = idx._simple_new(idx, name='p', freq='M') tm.assert_index_equal(result, idx) - def test_constructor_simple_new_floats(self): + def test_constructor_floats(self): # GH13079 - for floats in [[1.1], np.array([1.1])]: + for floats in [[1.1, 2.1], np.array([1.1, 2.1])]: with self.assertRaises(TypeError): pd.PeriodIndex._simple_new(floats, freq='M') + with self.assertRaises(TypeError): + pd.PeriodIndex(floats, freq='M') + def test_constructor_nat(self): self.assertRaises(ValueError, period_range, start='NaT', end='2011-01-01', freq='M') diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index b80ab6feeeb23..1739211982b10 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -53,6 +53,12 @@ def test_astype_raises(self): def test_pickle_compat_construction(self): pass + def test_pickle_round_trip(self): + for freq in ['D', 'M', 'Y']: + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + result = self.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) + def test_get_loc(self): idx = pd.period_range('2000-01-01', periods=3) diff --git a/pandas/tseries/period.py 
b/pandas/tseries/period.py index 6e499924730b3..bfe7724a1cfaa 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -17,7 +17,6 @@ is_period_dtype, is_bool_dtype, pandas_dtype, - _ensure_int64, _ensure_object) from pandas.types.dtypes import PeriodDtype from pandas.types.generic import ABCSeries @@ -114,6 +113,13 @@ def wrapper(self, other): return wrapper +def _new_PeriodIndex(cls, **d): + # GH13277 for unpickling + if d['data'].dtype == 'int64': + values = d.pop('data') + return cls._from_ordinals(values=values, **d) + + class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in @@ -209,17 +215,57 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, msg = 'specified freq and dtype are different' raise IncompatibleFrequency(msg) + # coerce freq to freq object, otherwise it can be coerced elementwise + # which is slow + if freq: + freq = Period._maybe_convert_freq(freq) + if data is None: if ordinal is not None: data = np.asarray(ordinal, dtype=np.int64) else: data, freq = cls._generate_range(start, end, periods, freq, kwargs) - else: - ordinal, freq = cls._from_arraylike(data, freq, tz) - data = np.array(ordinal, dtype=np.int64, copy=copy) + return cls._from_ordinals(data, name=name, freq=freq) - return cls._simple_new(data, name=name, freq=freq) + if isinstance(data, PeriodIndex): + if freq is None or freq == data.freq: # no freq change + freq = data.freq + data = data._values + else: + base1, _ = _gfc(data.freq) + base2, _ = _gfc(freq) + data = period.period_asfreq_arr(data._values, + base1, base2, 1) + return cls._simple_new(data, name=name, freq=freq) + + # not array / index + if not isinstance(data, (np.ndarray, PeriodIndex, + DatetimeIndex, Int64Index)): + if is_scalar(data) or isinstance(data, Period): + cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + + data = np.asarray(data) + + # datetime other than period + if is_datetime64_dtype(data.dtype): + data = dt64arr_to_periodarr(data, freq, tz) + return cls._from_ordinals(data, name=name, freq=freq) + + # check not floats + if infer_dtype(data) == 'floating' and len(data) > 0: + raise TypeError("PeriodIndex does not allow " + "floating point in construction") + + # anything else, likely an array of strings or periods + data = _ensure_object(data) + freq = freq or period.extract_freq(data) + data = period.extract_ordinals(data, freq) + return cls._from_ordinals(data, name=name, freq=freq) @classmethod def _generate_range(cls, start, end, periods, freq, fields): @@ -240,77 +286,26 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq - @classmethod - def _from_arraylike(cls, data, freq, tz): - if freq is not None: - freq = Period._maybe_convert_freq(freq) - - if not isinstance(data, (np.ndarray, PeriodIndex, - DatetimeIndex, Int64Index)): - if is_scalar(data) or isinstance(data, Period): - raise ValueError('PeriodIndex() must be called with a ' - 'collection of some kind, %s was passed' - % repr(data)) - - # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - - try: - data = _ensure_int64(data) - if freq is None: - raise ValueError('freq not specified') - data = np.array([Period(x, freq=freq) for x in data], - dtype=np.int64) - except (TypeError, ValueError): - data = _ensure_object(data) - - if freq is None: - freq = period.extract_freq(data) - data = 
period.extract_ordinals(data, freq)
-        else:
-            if isinstance(data, PeriodIndex):
-                if freq is None or freq == data.freq:
-                    freq = data.freq
-                    data = data._values
-                else:
-                    base1, _ = _gfc(data.freq)
-                    base2, _ = _gfc(freq)
-                    data = period.period_asfreq_arr(data._values,
-                                                    base1, base2, 1)
-            else:
-                if is_object_dtype(data):
-                    inferred = infer_dtype(data)
-                    if inferred == 'integer':
-                        data = data.astype(np.int64)
-
-                if freq is None and is_object_dtype(data):
-                    # must contain Period instance and thus extract ordinals
-                    freq = period.extract_freq(data)
-                    data = period.extract_ordinals(data, freq)
-
-                if freq is None:
-                    msg = 'freq not specified and cannot be inferred'
-                    raise ValueError(msg)
-
-                if data.dtype != np.int64:
-                    if np.issubdtype(data.dtype, np.datetime64):
-                        data = dt64arr_to_periodarr(data, freq, tz)
-                    else:
-                        data = _ensure_object(data)
-                        data = period.extract_ordinals(data, freq)
-
-        return data, freq
-
     @classmethod
     def _simple_new(cls, values, name=None, freq=None, **kwargs):
-
+        """
+        Values can be any type that can be coerced to Periods.
+        Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
+        """
         if not is_integer_dtype(values):
             values = np.array(values, copy=False)
-            if (len(values) > 0 and is_float_dtype(values)):
+            if len(values) > 0 and is_float_dtype(values):
                 raise TypeError("PeriodIndex can't take floats")
-            else:
-                return cls(values, name=name, freq=freq, **kwargs)
+            return cls(values, name=name, freq=freq, **kwargs)
+
+        return cls._from_ordinals(values, name, freq, **kwargs)
+
+    @classmethod
+    def _from_ordinals(cls, values, name=None, freq=None, **kwargs):
+        """
+        Values should be int ordinals.
+        `__new__` & `_simple_new` coerce to ordinals and call this method
+        """

         values = np.array(values, dtype='int64', copy=False)

@@ -318,7 +313,7 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
         result._data = values
         result.name = name
         if freq is None:
-            raise ValueError('freq is not specified')
+            raise ValueError('freq is not specified and cannot be inferred')
         result.freq = Period._maybe_convert_freq(freq)
         result._reset_identity()
         return result
@@ -327,13 +322,13 @@ def _shallow_copy_with_infer(self, values=None, **kwargs):
         """ we always want to return a PeriodIndex """
         return self._shallow_copy(values=values, **kwargs)

-    def _shallow_copy(self, values=None, **kwargs):
-        if kwargs.get('freq') is None:
-            # freq must be provided
-            kwargs['freq'] = self.freq
+    def _shallow_copy(self, values=None, freq=None, **kwargs):
+        if freq is None:
+            freq = self.freq
         if values is None:
             values = self._values
-        return super(PeriodIndex, self)._shallow_copy(values=values, **kwargs)
+        return super(PeriodIndex, self)._shallow_copy(values=values,
+                                                      freq=freq, **kwargs)

     def _coerce_scalar_to_index(self, item):
         """
@@ -413,7 +408,7 @@ def __array_wrap__(self, result, context=None):
             return result
         # the result is object dtype array of Period
         # cannot pass _simple_new as it is
-        return PeriodIndex(result, freq=self.freq, name=self.name)
+        return self._shallow_copy(result, freq=self.freq, name=self.name)

     @property
     def _box_func(self):
@@ -708,7 +703,7 @@ def shift(self, n):
         values = self._values + n * self.freq.n
         if self.hasnans:
             values[self._isnan] = tslib.iNaT
-        return PeriodIndex(data=values, name=self.name, freq=self.freq)
+        return self._shallow_copy(values=values)

     @cache_readonly
     def dtype(self):
@@ -945,7 +940,8 @@ def _wrap_union_result(self, other, result):

     def _apply_meta(self, rawarr):
         if not isinstance(rawarr, PeriodIndex):
-            rawarr = PeriodIndex(rawarr, freq=self.freq)
+ rawarr = PeriodIndex._from_ordinals(rawarr, freq=self.freq, + name=self.name) return rawarr def _format_native_types(self, na_rep=u('NaT'), date_format=None, diff --git a/setup.cfg b/setup.cfg index b9de7a3532209..8de4fc955bd50 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,6 +13,7 @@ parentdir_prefix = pandas- [flake8] ignore = E731,E402 +max-line-length = 79 [yapf] based_on_style = pep8 From ef9a93a802bb97d30b5affef563cdeb127426211 Mon Sep 17 00:00:00 2001 From: Nicholas Ver Halen Date: Sat, 4 Mar 2017 16:38:35 -0500 Subject: [PATCH 229/338] BUG: pivot_table over Categorical Columns closes #15193 Author: Nicholas Ver Halen Closes #15511 from verhalenn/issue15193 and squashes the following commits: bf0fdeb [Nicholas Ver Halen] Added description to code change. adf8616 [Nicholas Ver Halen] Added whatsnew for issue 15193 a643267 [Nicholas Ver Halen] Added test for issue 15193 d605251 [Nicholas Ver Halen] Made sure pivot_table propped na columns --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/tools/test_pivot.py | 33 ++++++++++++++++++++++++++++++++ pandas/tools/pivot.py | 4 ++++ 3 files changed, 38 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ca093eca30511..f51ff4cd0c908 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -735,6 +735,7 @@ Bug Fixes - Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`) +- Bug in ``DataFrame.pivot_table()`` where ``dropna=True`` would not drop all-NaN columns when the columns was a ``category`` dtype (:issue:`15193`) - Bug in ``pd.read_hdf()`` passing a ``Timestamp`` to the ``where`` parameter with a non date column (:issue:`15492`) diff --git a/pandas/tests/tools/test_pivot.py b/pandas/tests/tools/test_pivot.py index f5d91d0088306..62863372dbd02 100644 --- a/pandas/tests/tools/test_pivot.py +++ b/pandas/tests/tools/test_pivot.py @@ -86,6 +86,39 @@ def test_pivot_table_dropna(self): tm.assert_index_equal(pv_col.columns, m) tm.assert_index_equal(pv_ind.index, m) + def test_pivot_table_dropna_categoricals(self): + # GH 15193 + categories = ['a', 'b', 'c', 'd'] + + df = DataFrame({'A': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], + 'B': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'C': range(0, 9)}) + + df['A'] = df['A'].astype('category', ordered=False, + categories=categories) + result_true = df.pivot_table(index='B', columns='A', values='C', + dropna=True) + expected_columns = Series(['a', 'b', 'c'], name='A') + expected_columns = expected_columns.astype('category', ordered=False, + categories=categories) + expected_index = Series([1, 2, 3], name='B') + expected_true = DataFrame([[0.0, 3.0, 6.0], + [1.0, 4.0, 7.0], + [2.0, 5.0, 8.0]], + index=expected_index, + columns=expected_columns,) + tm.assert_frame_equal(expected_true, result_true) + + result_false = df.pivot_table(index='B', columns='A', values='C', + dropna=False) + expected_columns = Series(['a', 'b', 'c', 'd'], name='A') + expected_false = DataFrame([[0.0, 3.0, 6.0, np.NaN], + [1.0, 4.0, 7.0, np.NaN], + [2.0, 5.0, 8.0, np.NaN]], + index=expected_index, + columns=expected_columns,) + tm.assert_frame_equal(expected_false, result_false) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 41fc705691a96..e23beb8332fd4 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -175,6 +175,10 @@ def 
pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
     if len(index) == 0 and len(columns) > 0:
         table = table.T

+    # GH 15193: make sure empty columns are removed if dropna=True
+    if isinstance(table, DataFrame) and dropna:
+        table = table.dropna(how='all', axis=1)
+
     return table
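[Illustration -- not part of the patch: with ``dropna=True`` the unused
category 'c' below no longer produces an all-NaN column. The frame is a
made-up example.]

    import pandas as pd

    df = pd.DataFrame({'A': pd.Categorical(['a', 'a', 'b'],
                                           categories=['a', 'b', 'c']),
                       'B': [1, 2, 1],
                       'C': [0.0, 1.0, 2.0]})
    df.pivot_table(index='B', columns='A', values='C', dropna=True)
    # columns are ['a', 'b']; with dropna=False, 'c' is kept as all-NaN

From 161a9afd593e9ea3939f15b0d6e6a0bb1c7f6a82 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 4 Mar 2017 22:58:32 +0100
Subject: [PATCH 230/338] DOC: fix build_table_schema docs (#15571)

---
 doc/source/io.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index c34cc1ec17512..c7a68a0fe9fbb 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2090,7 +2090,7 @@ A few notes on the generated table schema:
 - All dates are converted to UTC when serializing. Even timezone naïve values,
   which are treated as UTC with an offset of 0.

-  .. ipython:: python:
+  .. ipython:: python

      from pandas.io.json import build_table_schema
      s = pd.Series(pd.date_range('2016', periods=4))

From 78c08ac066514b42f450026ef826edd0746c8814 Mon Sep 17 00:00:00 2001
From: chris-b1
Date: Sat, 4 Mar 2017 19:45:20 -0600
Subject: [PATCH 231/338] BUG: DataFrame.isin empty datetimelike (#15570)

---
 doc/source/whatsnew/v0.20.0.txt      |  2 +-
 pandas/core/ops.py                   |  2 +-
 pandas/tests/frame/test_analytics.py | 21 +++++++++++++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index f51ff4cd0c908..c29dfaba2604a 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -677,7 +677,8 @@ Bug Fixes
 - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
 - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes.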
(:issue:`14956`) - +- Bug in ``DataFrame.isin`` comparing datetimelike to empty frame (:issue:`15473`) - Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) - Bug in ``Series`` construction with a datetimetz (:issue:`14928`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 697a99f63f62f..6cc43cd9228f6 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1249,7 +1249,7 @@ def na_op(x, y): result = op(x, y) except TypeError: xrav = x.ravel() - result = np.empty(x.size, dtype=x.dtype) + result = np.empty(x.size, dtype=bool) if isinstance(y, (np.ndarray, ABCSeries)): yrav = y.ravel() mask = notnull(xrav) & notnull(yrav) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 111195363beb2..4758ee1323ca0 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1502,6 +1502,27 @@ def test_isin_multiIndex(self): result = df1.isin(df2) tm.assert_frame_equal(result, expected) + def test_isin_empty_datetimelike(self): + # GH 15473 + df1_ts = DataFrame({'date': + pd.to_datetime(['2014-01-01', '2014-01-02'])}) + df1_td = DataFrame({'date': + [pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]}) + df2 = DataFrame({'date': []}) + df3 = DataFrame() + + expected = DataFrame({'date': [False, False]}) + + result = df1_ts.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_ts.isin(df3) + tm.assert_frame_equal(result, expected) + + result = df1_td.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_td.isin(df3) + tm.assert_frame_equal(result, expected) + # ---------------------------------------------------------------------- # Row deduplication From ad24d7ff471663a84c76485e290ee5ac7fd0e8de Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 4 Mar 2017 18:02:41 -0800 Subject: [PATCH 232/338] BUG: Groupby.cummin/max DataError on datetimes (#15561) (#15569) --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/groupby.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 10 +++++++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c29dfaba2604a..4e528daa6e876 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -635,7 +635,7 @@ Performance Improvements - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). -- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) +- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`) - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. 
(:issue:`14947`) - Improved performance of `rank()` for categorical data (:issue:`15498`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 578c334781d15..43c57a88b4d19 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1442,7 +1442,7 @@ def cummin(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform('cummin', **kwargs) + return self._cython_transform('cummin', numeric_only=False) @Substitution(name='groupby') @Appender(_doc_template) @@ -1451,7 +1451,7 @@ def cummax(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform('cummax', **kwargs) + return self._cython_transform('cummax', numeric_only=False) @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 74e8c6c45946f..e846963732883 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1954,7 +1954,8 @@ def test_arg_passthru(self): for attr in ['cummin', 'cummax']: f = getattr(df.groupby('group'), attr) result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) + # GH 15561: numeric_only=False set by default like min/max + tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) @@ -4295,6 +4296,13 @@ def test_cummin_cummax(self): result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(expected, result) + # GH 15561 + df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) + expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') + for method in ['cummax', 'cummin']: + result = getattr(df.groupby('a')['b'], method)() + tm.assert_series_equal(expected, result) + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From dd178848027015753746d9e4d20f5762fbbf40ec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 5 Mar 2017 11:28:57 +0100 Subject: [PATCH 233/338] DOC: reset table_schema option after example (#15572) --- doc/source/options.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/options.rst b/doc/source/options.rst index 1a0e5cf6b7235..1b219f640cc87 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -533,3 +533,9 @@ by default. False by default, this can be enabled globally with the pd.set_option('display.html.table_schema', True) Only ``'display.max_rows'`` are serialized and published. + + +.. ipython:: python + :suppress: + + pd.reset_option('display.html.table_schema') \ No newline at end of file From 5b8ded103a7d2fb8808ea8bd5b034bcf310d7abb Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Sun, 5 Mar 2017 03:23:57 -0800 Subject: [PATCH 234/338] ENH: str.replace accepts a compiled expression (#15456) - Series.str.replace now accepts a compiled regular expression for `pat`. - Signature for .str.replace changed, but remains backwards compatible. 
See #15446 --- doc/source/text.rst | 21 +++++++++++ doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/core/strings.py | 65 +++++++++++++++++++++++++-------- pandas/tests/test_strings.py | 59 ++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 16 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 52e05c5d511bc..2b2520cb6100f 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -164,6 +164,27 @@ positional argument (a regex object) and return a string. repl = lambda m: m.group('two').swapcase() pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) +The ``replace`` method also accepts a compiled regular expression object +from :func:`re.compile` as a pattern. All flags should be included in the +compiled regular expression object. + +.. versionadded:: 0.20.0 + +.. ipython:: python + + import re + regex_pat = re.compile(r'^.a|dog', flags=re.IGNORECASE) + s3.str.replace(regex_pat, 'XX-XX ') + +Including a ``flags`` argument when calling ``replace`` with a compiled +regular expression object will raise a ``ValueError``. + +.. ipython:: + + @verbatim + In [1]: s3.str.replace(regex_pat, 'XX-XX ', flags=re.IGNORECASE) + --------------------------------------------------------------------------- + ValueError: case and flags cannot be set when pat is a compiled regex Indexing with ``.str`` ---------------------- diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4e528daa6e876..fe9035106e4af 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -28,7 +28,8 @@ New features ~~~~~~~~~~~~ - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. -- ``.str.replace`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ac8d1db6a0bf3..46ba48b4cd846 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -9,7 +9,8 @@ is_string_like, is_list_like, is_scalar, - is_integer) + is_integer, + is_re) from pandas.core.common import _values_from_object from pandas.core.algorithms import take_1d @@ -303,7 +304,7 @@ def str_endswith(arr, pat, na=np.nan): return _na_map(f, arr, na, dtype=bool) -def str_replace(arr, pat, repl, n=-1, case=True, flags=0): +def str_replace(arr, pat, repl, n=-1, case=None, flags=0): """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or @@ -311,8 +312,12 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Parameters ---------- - pat : string - Character sequence or regular expression + pat : string or compiled regex + String can be a character sequence or regular expression. + + .. versionadded:: 0.20.0 + `pat` also accepts a compiled regex. + repl : string or callable Replacement string or a callable. The callable is passed the regex match object and must return a replacement string to be used. 
@@ -323,15 +328,24 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): n : int, default -1 (all) Number of replacements to make from start - case : boolean, default True - If True, case sensitive + case : boolean, default None + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE + - re module flags, e.g. re.IGNORECASE + - Cannot be set if `pat` is a compiled regex Returns ------- replaced : Series/Index of objects + Notes + ----- + When `pat` is a compiled regex, all flags should be included in the + compiled regex. Use of `case` or `flags` with a compiled regex will + raise an error. + Examples -------- When `repl` is a string, every `pat` is replaced as with @@ -372,21 +386,42 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): 0 tWO 1 bAR dtype: object + + Using a compiled regex with flags + + >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') + 0 foo + 1 bar + 2 NaN + dtype: object """ # Check whether repl is valid (GH 13438, GH 15055) if not (is_string_like(repl) or callable(repl)): raise TypeError("repl must be a string or callable") - use_re = not case or len(pat) > 1 or flags or callable(repl) - if use_re: - if not case: + is_compiled_re = is_re(pat) + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError("case and flags cannot be set" + " when pat is a compiled regex") + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: flags |= re.IGNORECASE - regex = re.compile(pat, flags=flags) - n = n if n >= 0 else 0 - def f(x): - return regex.sub(repl, x, count=n) + use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl) + + if use_re: + n = n if n >= 0 else 0 + regex = re.compile(pat, flags=flags) + f = lambda x: regex.sub(repl=repl, string=x, count=n) else: f = lambda x: x.replace(pat, repl, n) @@ -1558,7 +1593,7 @@ def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): return self._wrap_result(result) @copy(str_replace) - def replace(self, pat, repl, n=-1, case=True, flags=0): + def replace(self, pat, repl, n=-1, case=None, flags=0): result = str_replace(self._data, pat, repl, n=n, case=case, flags=flags) return self._wrap_result(result) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ce97b09b7e3ca..f98cabbb70477 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -469,6 +469,65 @@ def test_replace_callable(self): exp = Series(['bAR', NA]) tm.assert_series_equal(result, exp) + def test_replace_compiled_regex(self): + # GH 15446 + values = Series(['fooBAD__barBAD', NA]) + + # test with compiled regex + pat = re.compile(r'BAD[_]*') + result = values.str.replace(pat, '') + exp = Series(['foobar', NA]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', + None, 1, 2.]) + + rs = Series(mixed).str.replace(pat, '') + xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + tm.assertIsInstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('fooBAD__barBAD'), NA]) + + result = values.str.replace(pat, '') + exp = Series([u('foobar'), NA]) + tm.assert_series_equal(result, exp) + + result = values.str.replace(pat, '', n=1) + exp = 
Series([u('foobarBAD'), NA])
+        tm.assert_series_equal(result, exp)
+
+        # flags + unicode
+        values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
+        exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
+        pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
+        result = values.str.replace(pat, ", ")
+        tm.assert_series_equal(result, exp)
+
+        # case and flags provided to str.replace with a compiled regex
+        # will raise a ValueError
+        values = Series(['fooBAD__barBAD__bad', NA])
+        pat = re.compile(r'BAD[_]*')
+
+        with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
+            result = values.str.replace(pat, '', flags=re.IGNORECASE)
+
+        with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
+            result = values.str.replace(pat, '', case=False)
+
+        with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
+            result = values.str.replace(pat, '', case=True)
+
+        # test with callable
+        values = Series(['fooBAD__barBAD', NA])
+        repl = lambda m: m.group(0).swapcase()
+        pat = re.compile('[a-z][A-Z]{2}')
+        result = values.str.replace(pat, repl, n=2)
+        exp = Series(['foObaD__baRbaD', NA])
+        tm.assert_series_equal(result, exp)
+
     def test_repeat(self):
         values = Series(['a', 'b', NA, 'c', NA, 'd'])

From 45a9bf62fd767a8405aaa2ad50b16ad6becf396a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 5 Mar 2017 13:01:53 +0100
Subject: [PATCH 235/338] TST: fix test str_replace error message (#15456)

---
 pandas/tests/test_strings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f98cabbb70477..f8ce0070b2c78 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -511,13 +511,13 @@ def test_replace_compiled_regex(self):
         values = Series(['fooBAD__barBAD__bad', NA])
         pat = re.compile(r'BAD[_]*')

-        with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
+        with tm.assertRaisesRegexp(ValueError, "case and flags cannot be"):
             result = values.str.replace(pat, '', flags=re.IGNORECASE)

-        with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
+        with tm.assertRaisesRegexp(ValueError, "case and flags cannot be"):
             result = values.str.replace(pat, '', case=False)

-        with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
+        with tm.assertRaisesRegexp(ValueError, "case and flags cannot be"):
             result = values.str.replace(pat, '', case=True)
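[Illustration -- not part of the patch: flags travel with the compiled
pattern, and combining it with ``case``/``flags`` raises. Values are made
up.]

    import re
    import pandas as pd

    s = pd.Series(['fooBAD', 'bar'])
    pat = re.compile(r'bad', flags=re.IGNORECASE)
    s.str.replace(pat, '')                 # -> ['foo', 'bar']
    # s.str.replace(pat, '', case=False)   # would raise ValueError

From 388aab9ab91f3b1bcf33cb2fbf14912ec50cb0ed Mon Sep 17 00:00:00 2001
From: Petio Petrov
Date: Sun, 5 Mar 2017 11:21:14 -0500
Subject: [PATCH 236/338] Update dtypes.py (#15577)

---
 pandas/types/dtypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py
index 5b6d7905d4095..43135ba94ab46 100644
--- a/pandas/types/dtypes.py
+++ b/pandas/types/dtypes.py
@@ -73,7 +73,7 @@ def __ne__(self, other):

     @classmethod
     def is_dtype(cls, dtype):
-        """ Return a boolean if we if the passed type is an actual dtype that
+        """ Return a boolean if the passed type is an actual dtype that
         we can match (via string or type)
         """
         if hasattr(dtype, 'dtype'):

From 47968e695340bc3e50a7c040ed97048b58e6db3a Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Sun, 5 Mar 2017 11:25:46 -0500
Subject: [PATCH 237/338] BUG: Floating point accuracy with DatetimeIndex.round
 (#14440)

closes #14440

Employs @eoincondron's fix for floating-point inaccuracies when
rounding by milliseconds for `DatetimeIndex.round` and
`Timestamp.round`

Author: Matt Roeschke

Closes #15568 from mroeschke/fix_14440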
and squashes the following commits: c5a7cbc [Matt Roeschke] BUG:Floating point accuracy with DatetimeIndex.round (#14440) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/indexes/datetimes/test_ops.py | 11 +++++++++++ pandas/tests/scalar/test_timestamp.py | 9 +++++++++ pandas/tseries/base.py | 2 +- pandas/tslib.pyx | 3 ++- 5 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index fe9035106e4af..db803e6e7856b 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -652,6 +652,7 @@ Bug Fixes - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`) +- Bug in ``DatetimeIndex.round()`` and ``Timestamp.round()`` floating point accuracy when rounding by milliseconds (:issue: `14440`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 8eb9128d8d1c8..3a6402ae83ae2 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -175,6 +175,17 @@ def test_round(self): tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') + # GH 14440 + index = pd.DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz) + result = index.round('ms') + expected = pd.DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz) + tm.assert_index_equal(result, expected) + + index = pd.DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz) + result = index.round('ms') + expected = pd.DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz) + tm.assert_index_equal(result, expected) + def test_repeat_range(self): rng = date_range('1/1/2000', '1/1/2001') diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 2abc83ca6109c..ae278ebfa2533 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -732,6 +732,15 @@ def test_round(self): for freq in ['Y', 'M', 'foobar']: self.assertRaises(ValueError, lambda: dti.round(freq)) + # GH 14440 + result = pd.Timestamp('2016-10-17 12:00:00.0015').round('ms') + expected = pd.Timestamp('2016-10-17 12:00:00.002000') + self.assertEqual(result, expected) + + result = pd.Timestamp('2016-10-17 12:00:00.00149').round('ms') + expected = pd.Timestamp('2016-10-17 12:00:00.001000') + self.assertEqual(result, expected) + def test_class_ops_pytz(self): tm._skip_if_no_pytz() from pytz import timezone diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index ee9234d6c8237..5891481677ed2 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -83,7 +83,7 @@ def _round(self, freq, rounder): # round the local times values = _ensure_datetimelike_to_i8(self) - result = (unit * rounder(values / float(unit))).astype('i8') + result = (unit * rounder(values / float(unit)).astype('i8')) result = self._maybe_mask_results(result, 
fill_value=tslib.NaT) attribs = self._get_attributes_dict() if 'freq' in attribs: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index fc6e689a35d81..b96e9434e617a 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -421,7 +421,8 @@ class Timestamp(_Timestamp): value = self.tz_localize(None).value else: value = self.value - result = Timestamp(unit * rounder(value / float(unit)), unit='ns') + result = (unit * rounder(value / float(unit)).astype('i8')) + result = Timestamp(result, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) return result From 90eb1409a40b0180d9dc51b5afcca3417c5e5bcd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 5 Mar 2017 17:09:17 -0500 Subject: [PATCH 238/338] PERF: faster unstacking closes #15503 Author: Jeff Reback Closes #15510 from jreback/reshape3 and squashes the following commits: ec29226 [Jeff Reback] PERF: faster unstacking --- asv_bench/benchmarks/reshape.py | 21 ++++++++ doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/reshape.py | 56 +++++++++++++++++---- pandas/src/reshape.pyx | 35 +++++++++++++ pandas/src/reshape_helper.pxi.in | 81 ++++++++++++++++++++++++++++++ pandas/tests/frame/test_reshape.py | 13 +++-- setup.py | 3 ++ 7 files changed, 196 insertions(+), 15 deletions(-) create mode 100644 pandas/src/reshape.pyx create mode 100644 pandas/src/reshape_helper.pxi.in diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index a3ecfff52c794..b9346c497b9ef 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -59,6 +59,27 @@ def time_reshape_unstack_simple(self): self.df.unstack(1) +class reshape_unstack_large_single_dtype(object): + goal_time = 0.2 + + def setup(self): + m = 100 + n = 1000 + + levels = np.arange(m) + index = pd.MultiIndex.from_product([levels]*2) + columns = np.arange(n) + values = np.arange(m*m*n).reshape(m*m, n) + self.df = pd.DataFrame(values, index, columns) + self.df2 = self.df.iloc[:-1] + + def time_unstack_full_product(self): + self.df.unstack() + + def time_unstack_with_mask(self): + self.df2.unstack() + + class unstack_sparse_keyspace(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index db803e6e7856b..725dc7fc52ed0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -640,7 +640,7 @@ Performance Improvements - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`) - Improved performance of `rank()` for categorical data (:issue:`15498`) - +- Improved performance when using ``.unstack()`` (:issue:`15503`) .. 
_whatsnew_0200.bug_fixes: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 87cb088c2e91e..7bcd9f2d30b79 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -7,7 +7,9 @@ import numpy as np -from pandas.types.common import _ensure_platform_int, is_list_like +from pandas.types.common import (_ensure_platform_int, + is_list_like, is_bool_dtype, + needs_i8_conversion) from pandas.types.cast import _maybe_promote from pandas.types.missing import notnull import pandas.types.concat as _concat @@ -25,6 +27,7 @@ import pandas.core.algorithms as algos import pandas.algos as _algos +import pandas._reshape as _reshape from pandas.core.index import MultiIndex, _get_na_value @@ -182,9 +185,21 @@ def get_new_values(self): stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) + mask = self.mask + mask_all = mask.all() + + # we can simply reshape if we don't have a mask + if mask_all and len(values): + new_values = (self.sorted_values + .reshape(length, width, stride) + .swapaxes(1, 2) + .reshape(result_shape) + ) + new_mask = np.ones(result_shape, dtype=bool) + return new_values, new_mask # if our mask is all True, then we can use our existing dtype - if self.mask.all(): + if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: @@ -194,13 +209,36 @@ def get_new_values(self): new_mask = np.zeros(result_shape, dtype=bool) - # is there a simpler / faster way of doing this? - for i in range(values.shape[1]): - chunk = new_values[:, i * width:(i + 1) * width] - mask_chunk = new_mask[:, i * width:(i + 1) * width] - - chunk.flat[self.mask] = self.sorted_values[:, i] - mask_chunk.flat[self.mask] = True + name = np.dtype(dtype).name + sorted_values = self.sorted_values + + # we need to convert to a basic dtype + # and possibly coerce an input to our output dtype + # e.g. 
ints -> floats + if needs_i8_conversion(values): + sorted_values = sorted_values.view('i8') + new_values = new_values.view('i8') + name = 'int64' + elif is_bool_dtype(values): + sorted_values = sorted_values.astype('object') + new_values = new_values.astype('object') + name = 'object' + else: + sorted_values = sorted_values.astype(name, copy=False) + + # fill in our values & mask + f = getattr(_reshape, "unstack_{}".format(name)) + f(sorted_values, + mask.view('u1'), + stride, + length, + width, + new_values, + new_mask.view('u1')) + + # reconstruct dtype if needed + if needs_i8_conversion(values): + new_values = new_values.view(values.dtype) return new_values, new_mask diff --git a/pandas/src/reshape.pyx b/pandas/src/reshape.pyx new file mode 100644 index 0000000000000..82851b7e80994 --- /dev/null +++ b/pandas/src/reshape.pyx @@ -0,0 +1,35 @@ +# cython: profile=False + +from numpy cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cimport util + +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef double NaN = np.NaN +cdef double nan = NaN + +include "reshape_helper.pxi" diff --git a/pandas/src/reshape_helper.pxi.in b/pandas/src/reshape_helper.pxi.in new file mode 100644 index 0000000000000..bb9a5977f8b45 --- /dev/null +++ b/pandas/src/reshape_helper.pxi.in @@ -0,0 +1,81 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +# ---------------------------------------------------------------------- +# reshape +# ---------------------------------------------------------------------- + +{{py: + +# name, c_type +dtypes = [('uint8', 'uint8_t'), + ('uint16', 'uint16_t'), + ('uint32', 'uint32_t'), + ('uint64', 'uint64_t'), + ('int8', 'int8_t'), + ('int16', 'int16_t'), + ('int32', 'int32_t'), + ('int64', 'int64_t'), + ('float32', 'float32_t'), + ('float64', 'float64_t'), + ('object', 'object')] +}} + +{{for dtype, c_type in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values, + ndarray[uint8_t, ndim=1] mask, + Py_ssize_t stride, + Py_ssize_t length, + Py_ssize_t width, + ndarray[{{c_type}}, ndim=2] new_values, + ndarray[uint8_t, ndim=2] new_mask): + """ + transform long sorted_values to wide new_values + + Parameters + ---------- + values : typed ndarray + mask : boolean ndarray + stride : int + length : int + width : int + new_values : typed ndarray + result array + new_mask : boolean ndarray + result mask + + """ + + cdef: + Py_ssize_t i, j, w, nulls, s, offset + + {{if dtype == 'object'}} + if True: + {{else}} + with nogil: + {{endif}} + + for i in range(stride): + + nulls = 0 + for j in range(length): + + for w in range(width): + + offset = j * width + w + + if mask[offset]: + s = i * width + w + new_values[j, s] = values[offset - nulls, i] + new_mask[j, s] = 1 + else: + nulls += 1 + +{{endfor}} diff 
--git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 1890b33e3dbaa..c8c7313ddd071 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -121,19 +121,22 @@ def test_pivot_index_none(self): assert_frame_equal(result, expected) def test_stack_unstack(self): - stacked = self.frame.stack() + f = self.frame.copy() + f[:] = np.arange(np.prod(f.shape)).reshape(f.shape) + + stacked = f.stack() stacked_df = DataFrame({'foo': stacked, 'bar': stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() - assert_frame_equal(unstacked, self.frame) - assert_frame_equal(unstacked_df['bar'], self.frame) + assert_frame_equal(unstacked, f) + assert_frame_equal(unstacked_df['bar'], f) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) - assert_frame_equal(unstacked_cols.T, self.frame) - assert_frame_equal(unstacked_cols_df['bar'].T, self.frame) + assert_frame_equal(unstacked_cols.T, f) + assert_frame_equal(unstacked_cols_df['bar'].T, f) def test_unstack_fill(self): diff --git a/setup.py b/setup.py index cbcadce459c67..525cbdf600c78 100755 --- a/setup.py +++ b/setup.py @@ -113,6 +113,7 @@ def is_platform_mac(): _pxi_dep_template = { 'algos': ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', 'algos_take_helper.pxi.in', 'algos_rank_helper.pxi.in'], + '_reshape': ['reshape_helper.pxi.in'], '_join': ['join_helper.pxi.in', 'joins_func_helper.pxi.in'], 'hashtable': ['hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'], @@ -496,6 +497,8 @@ def pxd(name): algos={'pyxfile': 'algos', 'pxdfiles': ['src/util', 'hashtable'], 'depends': _pxi_dep['algos']}, + _reshape={'pyxfile': 'src/reshape', + 'depends': _pxi_dep['_reshape']}, _join={'pyxfile': 'src/join', 'pxdfiles': ['src/util', 'hashtable'], 'depends': _pxi_dep['_join']}, From b4b9075f158cbe5d6822b4c4d725c10bc369e930 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Mar 2017 10:32:57 +0100 Subject: [PATCH 239/338] API: return Index instead of array from datetime field accessors (GH15022) --- pandas/tests/indexes/datetimes/test_misc.py | 4 ++-- pandas/tests/indexes/period/test_construction.py | 4 ++-- pandas/tests/indexes/period/test_period.py | 8 ++++---- pandas/tseries/index.py | 15 +++++++++------ pandas/tseries/period.py | 5 +++-- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 6b0191edbda5a..8fcb26ab517bf 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -307,5 +307,5 @@ def test_datetimeindex_accessors(self): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - self.assert_numpy_array_equal(dti.nanosecond, - np.arange(10, dtype=np.int32)) + self.assert_index_equal(dti.nanosecond, + pd.Index(np.arange(10, dtype=np.int64))) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index f13a84f4f0e92..ab70ad59846e8 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -91,8 +91,8 @@ def test_constructor_arrays_negative_year(self): pindex = PeriodIndex(year=years, quarter=quarters) - self.assert_numpy_array_equal(pindex.year, years) - self.assert_numpy_array_equal(pindex.quarter, quarters) + self.assert_index_equal(pindex.year, pd.Index(years)) + self.assert_index_equal(pindex.quarter, pd.Index(quarters)) def 
test_constructor_invalid_quarters(self): self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 1739211982b10..16b8ce6569802 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -653,10 +653,10 @@ def test_pindex_fieldaccessor_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2012-03', '2012-04'], freq='D') - exp = np.array([2011, 2011, -1, 2012, 2012], dtype=np.int64) - self.assert_numpy_array_equal(idx.year, exp) - exp = np.array([1, 2, -1, 3, 4], dtype=np.int64) - self.assert_numpy_array_equal(idx.month, exp) + exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64) + self.assert_index_equal(idx.year, exp) + exp = Index([1, 2, -1, 3, 4], dtype=np.int64) + self.assert_index_equal(idx.month, exp) def test_pindex_qaccess(self): pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 5f00e8b648689..f9821c4b799e6 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -77,16 +77,19 @@ def f(self): result = tslib.get_start_end_field(values, field, self.freqstr, month_kw) + result = self._maybe_mask_results(result, convert='float64') + elif field in ['weekday_name']: result = tslib.get_date_name_field(values, field) - return self._maybe_mask_results(result) + result = self._maybe_mask_results(result) elif field in ['is_leap_year']: # no need to mask NaT - return tslib.get_date_field(values, field) + result = tslib.get_date_field(values, field) else: result = tslib.get_date_field(values, field) + result = self._maybe_mask_results(result, convert='float64') - return self._maybe_mask_results(result, convert='float64') + return Index(result) f.__name__ = name f.__doc__ = docstring @@ -1913,9 +1916,9 @@ def to_julian_date(self): """ # http://mysite.verizon.net/aesir_research/date/jdalg2.htm - year = self.year - month = self.month - day = self.day + year = np.asarray(self.year) + month = np.asarray(self.month) + day = np.asarray(self.day) testarr = month < 3 year[testarr] -= 1 month[testarr] += 12 diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index bfe7724a1cfaa..56f88b7ed800c 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -52,7 +52,8 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - return get_period_field_arr(alias, self._values, base) + result = get_period_field_arr(alias, self._values, base) + return Index(result) f.__name__ = name f.__doc__ = docstring return property(f) @@ -585,7 +586,7 @@ def to_datetime(self, dayfirst=False): @property def is_leap_year(self): """ Logical indicating if the date belongs to a leap year """ - return tslib._isleapyear_arr(self.year) + return tslib._isleapyear_arr(np.asarray(self.year)) @property def start_time(self): From 6b3d8db6d8e62390f3381f4f9fdb8ae0b82cba57 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Mar 2017 10:38:22 +0100 Subject: [PATCH 240/338] Revert "API: return Index instead of array from datetime field accessors (GH15022)" This reverts commit dc323507672ec0ceb4b2b0366445a794f3e92ee7. 
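For reference, the revert restores the previous behaviour in which the
datetime field accessors return plain numpy arrays rather than an Index.
A sketch mirroring the reverted nanosecond test (illustrative, not output
captured from this build):

    >>> dti = DatetimeIndex(np.arange(10))
    >>> dti.nanosecond
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)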
--- pandas/tests/indexes/datetimes/test_misc.py | 4 ++-- pandas/tests/indexes/period/test_construction.py | 4 ++-- pandas/tests/indexes/period/test_period.py | 8 ++++---- pandas/tseries/index.py | 15 ++++++--------- pandas/tseries/period.py | 5 ++--- 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 8fcb26ab517bf..6b0191edbda5a 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -307,5 +307,5 @@ def test_datetimeindex_accessors(self): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - self.assert_index_equal(dti.nanosecond, - pd.Index(np.arange(10, dtype=np.int64))) + self.assert_numpy_array_equal(dti.nanosecond, + np.arange(10, dtype=np.int32)) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index ab70ad59846e8..f13a84f4f0e92 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -91,8 +91,8 @@ def test_constructor_arrays_negative_year(self): pindex = PeriodIndex(year=years, quarter=quarters) - self.assert_index_equal(pindex.year, pd.Index(years)) - self.assert_index_equal(pindex.quarter, pd.Index(quarters)) + self.assert_numpy_array_equal(pindex.year, years) + self.assert_numpy_array_equal(pindex.quarter, quarters) def test_constructor_invalid_quarters(self): self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 16b8ce6569802..1739211982b10 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -653,10 +653,10 @@ def test_pindex_fieldaccessor_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2012-03', '2012-04'], freq='D') - exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64) - self.assert_index_equal(idx.year, exp) - exp = Index([1, 2, -1, 3, 4], dtype=np.int64) - self.assert_index_equal(idx.month, exp) + exp = np.array([2011, 2011, -1, 2012, 2012], dtype=np.int64) + self.assert_numpy_array_equal(idx.year, exp) + exp = np.array([1, 2, -1, 3, 4], dtype=np.int64) + self.assert_numpy_array_equal(idx.month, exp) def test_pindex_qaccess(self): pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f9821c4b799e6..5f00e8b648689 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -77,19 +77,16 @@ def f(self): result = tslib.get_start_end_field(values, field, self.freqstr, month_kw) - result = self._maybe_mask_results(result, convert='float64') - elif field in ['weekday_name']: result = tslib.get_date_name_field(values, field) - result = self._maybe_mask_results(result) + return self._maybe_mask_results(result) elif field in ['is_leap_year']: # no need to mask NaT - result = tslib.get_date_field(values, field) + return tslib.get_date_field(values, field) else: result = tslib.get_date_field(values, field) - result = self._maybe_mask_results(result, convert='float64') - return Index(result) + return self._maybe_mask_results(result, convert='float64') f.__name__ = name f.__doc__ = docstring @@ -1916,9 +1913,9 @@ def to_julian_date(self): """ # http://mysite.verizon.net/aesir_research/date/jdalg2.htm - year = np.asarray(self.year) - month = np.asarray(self.month) - day = np.asarray(self.day) + year = 
self.year + month = self.month + day = self.day testarr = month < 3 year[testarr] -= 1 month[testarr] += 12 diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 56f88b7ed800c..bfe7724a1cfaa 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -52,8 +52,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - result = get_period_field_arr(alias, self._values, base) - return Index(result) + return get_period_field_arr(alias, self._values, base) f.__name__ = name f.__doc__ = docstring return property(f) @@ -586,7 +585,7 @@ def to_datetime(self, dayfirst=False): @property def is_leap_year(self): """ Logical indicating if the date belongs to a leap year """ - return tslib._isleapyear_arr(np.asarray(self.year)) + return tslib._isleapyear_arr(self.year) @property def start_time(self): From f3b57a1ac456e096bdc0f60bee5054325ba11b4e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 6 Mar 2017 17:12:44 -0500 Subject: [PATCH 241/338] DOC: updated badges --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7bc350d1c6675..8595043cf68c3 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,15 @@ + + + + From 21cc0675a1d692ca62b48672b69b12e4ab5d659f Mon Sep 17 00:00:00 2001 From: Mark Mandel Date: Mon, 6 Mar 2017 16:15:10 -0600 Subject: [PATCH 242/338] DOC: remove wakari.io section (#15596) --- doc/source/install.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 8b0fec6a3dac3..fe2a9fa4ba509 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -23,18 +23,6 @@ Officially Python 2.7, 3.4, 3.5, and 3.6 Installing pandas ----------------- -Trying out pandas, no installation required! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The easiest way to start experimenting with pandas doesn't involve installing -pandas at all. - -`Wakari `__ is a free service that provides a hosted -`IPython Notebook `__ service in the cloud. - -Simply create an account, and have access to pandas from within your brower via -an `IPython Notebook `__ in a few minutes. - .. _install.anaconda: Installing pandas with Anaconda From 01894d8a9f596a3374ec9c668e5b208594927d62 Mon Sep 17 00:00:00 2001 From: Leon Yin Date: Mon, 6 Mar 2017 23:41:30 -0800 Subject: [PATCH 243/338] DOC: remove Panel4D from the API docs #15579 (#15598) --- doc/source/api.rst | 53 +------------------------------------ scripts/api_rst_coverage.py | 4 +-- 2 files changed, 3 insertions(+), 54 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 33ac5fde651d4..fbce64df84859 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1237,58 +1237,7 @@ Serialization / IO / Conversion Panel.to_frame Panel.to_xarray Panel.to_clipboard - -.. _api.panel4d: - -Panel4D -------- - -Constructor -~~~~~~~~~~~ -.. autosummary:: - :toctree: generated/ - - Panel4D - -Serialization / IO / Conversion -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autosummary:: - :toctree: generated/ - - Panel4D.to_xarray - -Attributes and underlying data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -**Axes** - - * **labels**: axis 1; each label corresponds to a Panel contained inside - * **items**: axis 2; each item corresponds to a DataFrame contained inside - * **major_axis**: axis 3; the index (rows) of each of the DataFrames - * **minor_axis**: axis 4; the columns of each of the DataFrames - -.. 
autosummary:: - :toctree: generated/ - - Panel4D.values - Panel4D.axes - Panel4D.ndim - Panel4D.size - Panel4D.shape - Panel4D.dtypes - Panel4D.ftypes - Panel4D.get_dtype_counts - Panel4D.get_ftype_counts - -Conversion -~~~~~~~~~~ -.. autosummary:: - :toctree: generated/ - - Panel4D.astype - Panel4D.copy - Panel4D.isnull - Panel4D.notnull - + .. _api.index: Index diff --git a/scripts/api_rst_coverage.py b/scripts/api_rst_coverage.py index cc456f03c02ec..6bb5383509be6 100644 --- a/scripts/api_rst_coverage.py +++ b/scripts/api_rst_coverage.py @@ -4,11 +4,11 @@ def main(): # classes whose members to check - classes = [pd.Series, pd.DataFrame, pd.Panel, pd.Panel4D] + classes = [pd.Series, pd.DataFrame, pd.Panel] def class_name_sort_key(x): if x.startswith('Series'): - # make sure Series precedes DataFrame, Panel, and Panel4D + # make sure Series precedes DataFrame, and Panel. return ' ' + x else: return x From 69902bd589514dbeea49ffdb0db928eb5a3c1947 Mon Sep 17 00:00:00 2001 From: Jeff Carey Date: Tue, 7 Mar 2017 08:21:32 -0500 Subject: [PATCH 244/338] CLN: Moved freeze_panes validation to io/excel.py (#15160) follow up to #15160 Author: Jeff Carey Closes #15592 from jeffcarey/enh-15160-touchup2 and squashes the following commits: 81cb86f [Jeff Carey] Cleaned up freeze_panes validation code a802fc7 [Jeff Carey] Moved freeze_panes validation to io/excel.py --- pandas/core/frame.py | 12 ------------ pandas/io/excel.py | 22 +++++++++++++++++++--- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b3e43edc3eb55..15179ac321076 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1431,24 +1431,12 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', inf_rep=inf_rep) formatted_cells = formatter.get_formatted_cells() - freeze_panes = self._validate_freeze_panes(freeze_panes) excel_writer.write_cells(formatted_cells, sheet_name, startrow=startrow, startcol=startcol, freeze_panes=freeze_panes) if need_save: excel_writer.save() - def _validate_freeze_panes(self, freeze_panes): - if freeze_panes is not None: - if ( - len(freeze_panes) == 2 and - all(isinstance(item, int) for item in freeze_panes) - ): - return freeze_panes - - raise ValueError("freeze_panes must be of form (row, column)" - " where row and column are integers") - def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 37a61b7dc9ab5..00ec8bcf060ef 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -543,6 +543,22 @@ def __exit__(self, exc_type, exc_value, traceback): self.close() +def _validate_freeze_panes(freeze_panes): + if freeze_panes is not None: + if ( + len(freeze_panes) == 2 and + all(isinstance(item, int) for item in freeze_panes) + ): + return True + + raise ValueError("freeze_panes must be of form (row, column)" + " where row and column are integers") + + # freeze_panes wasn't specified, return False so it won't be applied + # to output sheet + return False + + def _trim_excel_header(row): # trim header row so auto-index inference works # xlrd uses '' , openpyxl None @@ -1330,7 +1346,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, wks.title = sheet_name self.sheets[sheet_name] = wks - if freeze_panes is not None: + if _validate_freeze_panes(freeze_panes): wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1, column=freeze_panes[1] + 1) @@ -1418,7 
+1434,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, wks = self.book.add_sheet(sheet_name) self.sheets[sheet_name] = wks - if freeze_panes is not None: + if _validate_freeze_panes(freeze_panes): wks.set_panes_frozen(True) wks.set_horz_split_pos(freeze_panes[0]) wks.set_vert_split_pos(freeze_panes[1]) @@ -1550,7 +1566,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, style_dict = {} - if freeze_panes is not None: + if _validate_freeze_panes(freeze_panes): wks.freeze_panes(*(freeze_panes)) for cell in cells: From c4f6960b3a07c82a8d3624326c10416df3583e87 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 7 Mar 2017 08:23:18 -0500 Subject: [PATCH 245/338] BUG: Timestamp.round precision error for ns (#15578) closes #15578 Author: Matt Roeschke Closes #15588 from mroeschke/fix_15578 and squashes the following commits: af95baa [Matt Roeschke] BUG: Timestamp.round precision error for ns (#15578) --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/indexes/datetimes/test_ops.py | 14 +++++++++++++- pandas/tests/scalar/test_timestamp.py | 13 ++++++++++++- pandas/tseries/base.py | 16 +++++++++++++--- pandas/tslib.pyx | 13 ++++++++++++- 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 725dc7fc52ed0..f1df8f456159a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -652,7 +652,7 @@ Bug Fixes - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`) -- Bug in ``DatetimeIndex.round()`` and ``Timestamp.round()`` floating point accuracy when rounding by milliseconds (:issue: `14440`) +- Bug in ``DatetimeIndex.round()`` and ``Timestamp.round()`` floating point accuracy when rounding by milliseconds or less (:issue: `14440`, :issue:`15578`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. 
(:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 3a6402ae83ae2..312017eef3446 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -175,17 +175,29 @@ def test_round(self): tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') - # GH 14440 + # GH 14440 & 15578 index = pd.DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz) result = index.round('ms') expected = pd.DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz) tm.assert_index_equal(result, expected) + for freq in ['us', 'ns']: + tm.assert_index_equal(index, index.round(freq)) + index = pd.DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz) result = index.round('ms') expected = pd.DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz) tm.assert_index_equal(result, expected) + index = pd.DatetimeIndex(['2016-10-17 12:00:00.001501031']) + result = index.round('10ns') + expected = pd.DatetimeIndex(['2016-10-17 12:00:00.001501030']) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(): + ts = '2016-10-17 12:00:00.001501031' + pd.DatetimeIndex([ts]).round('1010ns') + def test_repeat_range(self): rng = date_range('1/1/2000', '1/1/2001') diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index ae278ebfa2533..bbcdce922f58a 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -732,7 +732,7 @@ def test_round(self): for freq in ['Y', 'M', 'foobar']: self.assertRaises(ValueError, lambda: dti.round(freq)) - # GH 14440 + # GH 14440 & 15578 result = pd.Timestamp('2016-10-17 12:00:00.0015').round('ms') expected = pd.Timestamp('2016-10-17 12:00:00.002000') self.assertEqual(result, expected) @@ -741,6 +741,17 @@ def test_round(self): expected = pd.Timestamp('2016-10-17 12:00:00.001000') self.assertEqual(result, expected) + ts = pd.Timestamp('2016-10-17 12:00:00.0015') + for freq in ['us', 'ns']: + self.assertEqual(ts, ts.round(freq)) + + result = pd.Timestamp('2016-10-17 12:00:00.001501031').round('10ns') + expected = pd.Timestamp('2016-10-17 12:00:00.001501030') + self.assertEqual(result, expected) + + with tm.assert_produces_warning(): + pd.Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') + def test_class_ops_pytz(self): tm._skip_if_no_pytz() from pytz import timezone diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 5891481677ed2..2e22c35868cb3 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -1,6 +1,7 @@ """ Base and utility classes for tseries type pandas objects. 
""" +import warnings from datetime import datetime, timedelta @@ -79,11 +80,20 @@ def _round(self, freq, rounder): from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos - # round the local times values = _ensure_datetimelike_to_i8(self) - - result = (unit * rounder(values / float(unit)).astype('i8')) + if unit < 1000 and unit % 1000 != 0: + # for nano rounding, work with the last 6 digits separately + # due to float precision + buff = 1000000 + result = (buff * (values // buff) + unit * + (rounder((values % buff) / float(unit))).astype('i8')) + elif unit >= 1000 and unit % 1000 != 0: + msg = 'Precision will be lost using frequency: {}' + warnings.warn(msg.format(freq)) + result = (unit * rounder(values / float(unit)).astype('i8')) + else: + result = (unit * rounder(values / float(unit)).astype('i8')) result = self._maybe_mask_results(result, fill_value=tslib.NaT) attribs = self._get_attributes_dict() if 'freq' in attribs: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index b96e9434e617a..8ee92e9fb900d 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -421,7 +421,18 @@ class Timestamp(_Timestamp): value = self.tz_localize(None).value else: value = self.value - result = (unit * rounder(value / float(unit)).astype('i8')) + if unit < 1000 and unit % 1000 != 0: + # for nano rounding, work with the last 6 digits separately + # due to float precision + buff = 1000000 + result = (buff * (value // buff) + unit * + (rounder((value % buff) / float(unit))).astype('i8')) + elif unit >= 1000 and unit % 1000 != 0: + msg = 'Precision will be lost using frequency: {}' + warnings.warn(msg.format(freq)) + result = (unit * rounder(value / float(unit)).astype('i8')) + else: + result = (unit * rounder(value / float(unit)).astype('i8')) result = Timestamp(result, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) From ea5d7ad48e28f5b4a53dc98024ac2cc72c5b1bc2 Mon Sep 17 00:00:00 2001 From: Kernc Date: Tue, 7 Mar 2017 08:27:38 -0500 Subject: [PATCH 246/338] BUG: repr SparseDataFrame after setting a value closes #15488 Author: Kernc Closes #15489 from kernc/sdf-repr and squashes the following commits: 2dc145c [Kernc] BUG: repr SparseDataFrame after setting a value --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/formats/format.py | 3 --- pandas/tests/sparse/test_format.py | 12 ++++++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f1df8f456159a..e459c854dfab9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -698,7 +698,7 @@ Bug Fixes - Bug in ``to_sql`` when writing a DataFrame with numeric index names (:issue:`15404`). - Bug in ``Series.iloc`` where a ``Categorical`` object for list-like indexes input was returned, where a ``Series`` was expected. 
(:issue:`14580`) - +- Bug in repr-formatting a ``SparseDataFrame`` after a value was set on (a copy of) one of its series (:issue:`15488`) - Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 9dde3b0001c31..622c4cd3bbcc7 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -716,9 +716,6 @@ def to_html(self, classes=None, notebook=False, border=None): def _get_formatted_column_labels(self, frame): from pandas.core.index import _sparsify - def is_numeric_dtype(dtype): - return issubclass(dtype.type, np.number) - columns = frame.columns if isinstance(columns, MultiIndex): diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index 0c0e773d19bb9..ba870a2c33801 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -116,3 +116,15 @@ def test_sparse_frame(self): with option_context("display.max_rows", 3): self.assertEqual(repr(sparse), repr(df)) + + def test_sparse_repr_after_set(self): + # GH 15488 + sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) + res = sdf.copy() + + # Ignore the warning + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 # This line triggers the bug + + repr(sdf) + tm.assert_sp_frame_equal(sdf, res) From f5fc387a1ecca434b49092cc0dedebc7d310010d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 7 Mar 2017 09:30:06 -0500 Subject: [PATCH 247/338] DOC/BUILD: Parallelize doc build closes #15591 a couple of minutes faster with -j 2. fixes some deprecated use of pd.Term Author: Jeff Reback Closes #15600 from jreback/docs and squashes the following commits: c19303d [Jeff Reback] DOC: parallel build for docs --- doc/make.py | 6 +++--- doc/source/io.rst | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/make.py b/doc/make.py index 8a6d4e5df24f0..a2f5be5594e44 100755 --- a/doc/make.py +++ b/doc/make.py @@ -197,7 +197,7 @@ def html(): print(e) print("Failed to convert %s" % nb) - if os.system('sphinx-build -P -b html -d build/doctrees ' + if os.system('sphinx-build -j 2 -P -b html -d build/doctrees ' 'source build/html'): raise SystemExit("Building HTML failed.") try: @@ -222,7 +222,7 @@ def latex(): check_build() if sys.platform != 'win32': # LaTeX format. - if os.system('sphinx-build -b latex -d build/doctrees ' + if os.system('sphinx-build -j 2 -b latex -d build/doctrees ' 'source build/latex'): raise SystemExit("Building LaTeX failed.") # Produce pdf. @@ -245,7 +245,7 @@ def latex_forced(): check_build() if sys.platform != 'win32': # LaTeX format. - if os.system('sphinx-build -b latex -d build/doctrees ' + if os.system('sphinx-build -j 2 -b latex -d build/doctrees ' 'source build/latex'): raise SystemExit("Building LaTeX failed.") # Produce pdf. diff --git a/doc/source/io.rst b/doc/source/io.rst index c7a68a0fe9fbb..fa57d6d692152 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3758,7 +3758,7 @@ be data_columns # on-disk operations store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) - store.select('df_dc', [ pd.Term('B>0') ]) + store.select('df_dc', where='B>0') # getting creative store.select('df_dc', 'B > 0 & C > 0 & string == foo') @@ -4352,6 +4352,9 @@ HDFStore supports ``Panel4D`` storage. .. 
ipython:: python :okwarning: + wp = pd.Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) p4d = pd.Panel4D({ 'l1' : wp }) p4d store.append('p4d', p4d) @@ -4368,8 +4371,7 @@ object). This cannot be changed after table creation. :okwarning: store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis']) - store - store.select('p4d2', [ pd.Term('labels=l1'), pd.Term('items=Item1'), pd.Term('minor_axis=A_big_strings') ]) + store.select('p4d2', where='labels=l1 and items=Item1 and minor_axis=A') .. ipython:: python :suppress: From 9ee698021bfbf67441ccf3440d36952b5aaca00f Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 7 Mar 2017 16:14:53 -0500 Subject: [PATCH 248/338] BUG: fix SparseSeries reindex by using Series implementation closes #15447 Author: Pietro Battiston Closes #15461 from toobaz/drop_sparse_reindex and squashes the following commits: 9084246 [Pietro Battiston] Test SparseSeries.reindex with fill_value and nearest d6a46da [Pietro Battiston] Use _shared_docs for documentation 922c7b0 [Pietro Battiston] Test "copy" argument af99190 [Pietro Battiston] Whatsnew 7945cb4 [Pietro Battiston] Tests for .loc() and .reindex() on sparse series with MultiIndex 55b99f8 [Pietro Battiston] BUG: Drop faulty and redundant reindex() for SparseSeries --- doc/source/whatsnew/v0.20.0.txt | 4 +++ pandas/sparse/series.py | 24 +++---------- pandas/tests/sparse/test_indexing.py | 53 +++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e459c854dfab9..ece9ff4a1adff 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -672,6 +672,10 @@ Bug Fixes - Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) +- Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) + + + - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series` (:issue:`14320`) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index a3b701169ce91..c3dd089e8409a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -32,7 +32,7 @@ _coo_to_sparse_series) -_shared_doc_kwargs = dict(klass='SparseSeries', +_shared_doc_kwargs = dict(axes='index', klass='SparseSeries', axes_single_arg="{0, 'index'}") # ----------------------------------------------------------------------------- @@ -570,27 +570,13 @@ def copy(self, deep=True): return self._constructor(new_data, sparse_index=self.sp_index, fill_value=self.fill_value).__finalize__(self) + @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs) def reindex(self, index=None, method=None, copy=True, limit=None, **kwargs): - """ - Conform SparseSeries to new Index - - See Series.reindex docstring for general behavior - Returns - ------- - reindexed : SparseSeries - """ - new_index = _ensure_index(index) - - if self.index.equals(new_index): - if copy: - return self.copy() - else: - return self - return self._constructor(self._data.reindex(new_index, method=method, - limit=limit, copy=copy), - index=new_index).__finalize__(self) + return super(SparseSeries, self).reindex(index=index, method=method, + copy=copy, limit=limit, + **kwargs) def 
sparse_reindex(self, new_index): """ diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index 357a7103f4027..1a0782c0a3db9 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -366,7 +366,7 @@ def test_reindex(self): exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse() tm.assert_sp_series_equal(res, exp) - def test_reindex_fill_value(self): + def test_fill_value_reindex(self): orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) sparse = orig.to_sparse(fill_value=0) @@ -397,6 +397,23 @@ def test_reindex_fill_value(self): exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) + def test_reindex_fill_value(self): + floats = pd.Series([1., 2., 3.]).to_sparse() + result = floats.reindex([1, 2, 3], fill_value=0) + expected = pd.Series([2., 3., 0], index=[1, 2, 3]).to_sparse() + tm.assert_sp_series_equal(result, expected) + + def test_reindex_nearest(self): + s = pd.Series(np.arange(10, dtype='float64')).to_sparse() + target = [0.1, 0.9, 1.5, 2.0] + actual = s.reindex(target, method='nearest') + expected = pd.Series(np.around(target), target).to_sparse() + tm.assert_sp_series_equal(expected, actual) + + actual = s.reindex(target, method='nearest', tolerance=0.2) + expected = pd.Series([0, 1, np.nan, 2], target).to_sparse() + tm.assert_sp_series_equal(expected, actual) + def tests_indexing_with_sparse(self): # GH 13985 @@ -504,6 +521,11 @@ def test_loc(self): exp = orig.loc[[1, 3, 4, 5]].to_sparse() tm.assert_sp_series_equal(result, exp) + # single element list (GH 15447) + result = sparse.loc[['A']] + exp = orig.loc[['A']].to_sparse() + tm.assert_sp_series_equal(result, exp) + # dense array result = sparse.loc[orig % 2 == 1] exp = orig.loc[orig % 2 == 1].to_sparse() @@ -537,6 +559,35 @@ def test_loc_slice(self): orig.loc['A':'B'].to_sparse()) tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + def test_reindex(self): + # GH 15447 + orig = self.orig + sparse = self.sparse + + res = sparse.reindex([('A', 0), ('C', 1)]) + exp = orig.reindex([('A', 0), ('C', 1)]).to_sparse() + tm.assert_sp_series_equal(res, exp) + + # On specific level: + res = sparse.reindex(['A', 'C', 'B'], level=0) + exp = orig.reindex(['A', 'C', 'B'], level=0).to_sparse() + tm.assert_sp_series_equal(res, exp) + + # single element list (GH 15447) + res = sparse.reindex(['A'], level=0) + exp = orig.reindex(['A'], level=0).to_sparse() + tm.assert_sp_series_equal(res, exp) + + with tm.assertRaises(TypeError): + # Incomplete keys are not accepted for reindexing: + sparse.reindex(['A', 'C']) + + # "copy" argument: + res = sparse.reindex(sparse.index, copy=True) + exp = orig.reindex(orig.index, copy=True).to_sparse() + tm.assert_sp_series_equal(res, exp) + self.assertIsNot(sparse, res) + class TestSparseDataFrameIndexing(tm.TestCase): From 520eb1a324c9dad08d5f4465105af86f5e3810a0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 7 Mar 2017 18:21:18 -0500 Subject: [PATCH 249/338] BLD: consolidate remaining extensions moves extensions to pandas/_libs, which holds the extension code and also the generated builds (as its importable). pandas/_libs/src is now almost an includes dir, holding low-frequency changing code. This consolidates the import process making it more uniform and consistent throughout the codebase. Finally this cleans up the remaining top-level namespace (with some deprecations in place for example pandas.lib, pandas.tslib, pandas.json, pandas.parser. 
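The deprecated top-level names keep working through thin shim modules; a
minimal sketch of the pattern (the exact warning text and re-exports in
the committed files may differ):

    # pandas/lib.py -- sketch of a deprecation shim
    import warnings
    warnings.warn("The pandas.lib module is deprecated and will be "
                  "removed in a future version. Please import from "
                  "pandas._libs.lib instead.",
                  FutureWarning, stacklevel=2)
    from pandas._libs.lib import *
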
I listed all of the changes in the whatsnew, but I don't think worthwhile deprecating anything else. Author: Jeff Reback Closes #15537 from jreback/extensions3 and squashes the following commits: a6d6cfa [Jeff Reback] BLD: rename / move some extensions --- Makefile | 2 +- asv_bench/benchmarks/binary_ops.py | 2 +- asv_bench/benchmarks/pandas_vb_common.py | 14 +- asv_bench/benchmarks/panel_methods.py | 2 +- doc/source/whatsnew/v0.20.0.txt | 29 +++ pandas/__init__.py | 14 +- pandas/_libs/__init__.py | 8 + pandas/{ => _libs}/algos.pyx | 4 +- .../{src => _libs}/algos_common_helper.pxi.in | 2 +- .../algos_groupby_helper.pxi.in | 0 .../{src => _libs}/algos_rank_helper.pxi.in | 0 .../{src => _libs}/algos_take_helper.pxi.in | 0 pandas/{ => _libs}/hashtable.pxd | 0 pandas/{ => _libs}/hashtable.pyx | 2 +- .../hashtable_class_helper.pxi.in | 0 .../hashtable_func_helper.pxi.in | 0 pandas/{ => _libs}/index.pyx | 4 +- .../{src => _libs}/index_class_helper.pxi.in | 0 pandas/{src => _libs}/join.pyx | 4 +- .../join_func_helper.pxi.in} | 0 pandas/{src => _libs}/join_helper.pxi.in | 0 pandas/{ => _libs}/lib.pxd | 0 pandas/{ => _libs}/lib.pyx | 0 pandas/{src => _libs}/period.pyx | 44 ++-- pandas/{src => _libs}/reshape.pyx | 0 pandas/{src => _libs}/reshape_helper.pxi.in | 0 pandas/{ => _libs}/src/datetime.pxd | 0 pandas/{ => _libs}/src/datetime/np_datetime.c | 0 pandas/{ => _libs}/src/datetime/np_datetime.h | 0 .../src/datetime/np_datetime_strings.c | 0 .../src/datetime/np_datetime_strings.h | 0 pandas/{ => _libs}/src/datetime_helper.h | 0 pandas/{ => _libs}/src/headers/math.h | 0 pandas/{ => _libs}/src/headers/ms_inttypes.h | 0 pandas/{ => _libs}/src/headers/ms_stdint.h | 0 pandas/{ => _libs}/src/headers/portable.h | 0 pandas/{ => _libs}/src/headers/stdint.h | 0 pandas/{ => _libs}/src/helper.h | 0 pandas/{ => _libs}/src/inference.pyx | 0 pandas/{ => _libs}/src/khash.pxd | 0 pandas/{ => _libs}/src/klib/khash.h | 0 pandas/{ => _libs}/src/klib/khash_python.h | 0 pandas/{ => _libs}/src/klib/ktypes.h | 0 pandas/{ => _libs}/src/klib/kvec.h | 0 pandas/{ => _libs}/src/msgpack/pack.h | 0 .../{ => _libs}/src/msgpack/pack_template.h | 0 pandas/{ => _libs}/src/msgpack/sysdep.h | 0 pandas/{ => _libs}/src/msgpack/unpack.h | 0 .../{ => _libs}/src/msgpack/unpack_define.h | 0 .../{ => _libs}/src/msgpack/unpack_template.h | 0 pandas/{ => _libs}/src/numpy.pxd | 0 pandas/{ => _libs}/src/numpy_helper.h | 0 pandas/{ => _libs}/src/offsets.pyx | 0 pandas/{ => _libs}/src/parse_helper.h | 0 pandas/{ => _libs}/src/parser/.gitignore | 0 pandas/{ => _libs}/src/parser/Makefile | 0 pandas/{ => _libs}/src/parser/io.c | 0 pandas/{ => _libs}/src/parser/io.h | 0 pandas/{ => _libs}/src/parser/tokenizer.c | 0 pandas/{ => _libs}/src/parser/tokenizer.h | 0 pandas/{ => _libs}/src/period_helper.c | 0 pandas/{ => _libs}/src/period_helper.h | 0 pandas/{ => _libs}/src/properties.pyx | 0 pandas/{ => _libs}/src/reduce.pyx | 0 pandas/{ => _libs}/src/skiplist.h | 0 pandas/{ => _libs}/src/skiplist.pxd | 0 pandas/{ => _libs}/src/skiplist.pyx | 0 pandas/{ => _libs}/src/ujson/lib/ultrajson.h | 0 .../{ => _libs}/src/ujson/lib/ultrajsondec.c | 0 .../{ => _libs}/src/ujson/lib/ultrajsonenc.c | 0 .../{ => _libs}/src/ujson/python/JSONtoObj.c | 0 .../{ => _libs}/src/ujson/python/objToJSON.c | 2 +- .../{ => _libs}/src/ujson/python/py_defines.h | 0 pandas/{ => _libs}/src/ujson/python/ujson.c | 8 +- pandas/{ => _libs}/src/ujson/python/version.h | 0 pandas/{ => _libs}/src/util.pxd | 0 pandas/{ => _libs}/tslib.pxd | 0 pandas/{ => _libs}/tslib.pyx | 0 
pandas/compat/pickle_compat.py | 8 +- pandas/computation/scope.py | 8 +- pandas/core/algorithms.py | 9 +- pandas/core/base.py | 2 +- pandas/core/categorical.py | 10 +- pandas/core/common.py | 5 +- pandas/core/frame.py | 32 ++- pandas/core/generic.py | 6 +- pandas/core/groupby.py | 48 ++-- pandas/core/internals.py | 9 +- pandas/core/missing.py | 4 +- pandas/core/nanops.py | 3 +- pandas/core/ops.py | 39 +-- pandas/core/reshape.py | 5 +- pandas/core/series.py | 31 ++- pandas/core/sorting.py | 25 +- pandas/core/strings.py | 2 +- pandas/core/window.py | 9 +- pandas/{ => core}/window.pyx | 2 +- pandas/formats/format.py | 5 +- pandas/indexes/api.py | 2 +- pandas/indexes/base.py | 38 ++- pandas/indexes/category.py | 6 +- pandas/indexes/multi.py | 10 +- pandas/indexes/numeric.py | 49 ++-- pandas/indexes/range.py | 4 +- pandas/io/api.py | 2 +- pandas/io/date_converters.py | 2 +- pandas/io/excel.py | 6 +- pandas/io/json/json.py | 8 +- pandas/io/json/normalize.py | 2 +- pandas/{ => io}/msgpack/__init__.py | 8 +- pandas/{ => io}/msgpack/_packer.pyx | 6 +- pandas/{ => io}/msgpack/_unpacker.pyx | 8 +- pandas/{ => io}/msgpack/_version.py | 0 pandas/{ => io}/msgpack/exceptions.py | 0 pandas/io/packers.py | 4 +- pandas/io/parsers.py | 8 +- pandas/{parser.pyx => io/parsers.pyx} | 7 +- pandas/io/pytables.py | 4 +- pandas/io/sas/__init__.py | 1 + pandas/io/sas/{saslib.pyx => sas.pyx} | 0 pandas/io/sas/sas7bdat.py | 2 +- pandas/io/sql.py | 2 +- pandas/io/stata.py | 4 +- pandas/json.py | 7 + pandas/lib.py | 7 + pandas/parser.py | 8 + pandas/sparse/array.py | 8 +- pandas/sparse/list.py | 2 +- pandas/sparse/series.py | 6 +- pandas/{src => sparse}/sparse.pyx | 0 .../{src => sparse}/sparse_op_helper.pxi.in | 0 pandas/tests/api/test_api.py | 48 +++- pandas/tests/computation/test_eval.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_indexing.py | 4 +- pandas/tests/frame/test_to_csv.py | 2 +- pandas/tests/groupby/test_bin_groupby.py | 4 +- pandas/tests/groupby/test_transform.py | 13 +- pandas/tests/indexes/common.py | 9 +- .../indexes/datetimes/test_construction.py | 5 +- .../indexes/datetimes/test_date_range.py | 2 +- .../tests/indexes/datetimes/test_datetime.py | 2 +- pandas/tests/indexes/datetimes/test_ops.py | 2 +- pandas/tests/indexes/datetimes/test_setops.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 4 +- pandas/tests/indexes/period/test_indexing.py | 3 +- pandas/tests/indexes/period/test_ops.py | 2 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_multi.py | 9 +- pandas/tests/indexes/test_numeric.py | 2 +- .../indexes/timedeltas/test_construction.py | 4 +- pandas/tests/indexes/timedeltas/test_ops.py | 6 +- pandas/tests/indexes/timedeltas/test_tools.py | 5 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/io/json/test_pandas.py | 36 +-- pandas/tests/io/json/test_ujson.py | 4 +- pandas/tests/{ => io}/msgpack/__init__.py | 0 pandas/tests/{ => io}/msgpack/test_buffer.py | 2 +- pandas/tests/{ => io}/msgpack/test_case.py | 2 +- pandas/tests/{ => io}/msgpack/test_except.py | 2 +- .../tests/{ => io}/msgpack/test_extension.py | 4 +- pandas/tests/{ => io}/msgpack/test_format.py | 2 +- pandas/tests/{ => io}/msgpack/test_limits.py | 2 +- pandas/tests/{ => io}/msgpack/test_newspec.py | 2 +- pandas/tests/{ => io}/msgpack/test_obj.py | 2 +- pandas/tests/{ => io}/msgpack/test_pack.py | 2 +- .../tests/{ => io}/msgpack/test_read_size.py | 2 +- pandas/tests/{ => io}/msgpack/test_seq.py | 2 +- .../tests/{ => 
io}/msgpack/test_sequnpack.py | 4 +- pandas/tests/{ => io}/msgpack/test_subtype.py | 2 +- pandas/tests/{ => io}/msgpack/test_unpack.py | 2 +- .../tests/{ => io}/msgpack/test_unpack_raw.py | 2 +- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/parser/converters.py | 2 +- pandas/tests/io/parser/parse_dates.py | 4 +- pandas/tests/io/parser/test_textreader.py | 4 +- pandas/tests/io/parser/usecols.py | 2 +- pandas/tests/io/test_html.py | 2 +- pandas/tests/io/test_packers.py | 5 +- pandas/tests/io/test_pytables.py | 2 +- pandas/tests/io/test_stata.py | 2 +- pandas/tests/scalar/test_period.py | 14 +- pandas/tests/scalar/test_timedelta.py | 13 +- pandas/tests/scalar/test_timestamp.py | 10 +- pandas/tests/series/test_constructors.py | 23 +- pandas/tests/series/test_dtypes.py | 2 +- pandas/tests/series/test_indexing.py | 6 +- pandas/tests/series/test_internals.py | 2 +- pandas/tests/series/test_missing.py | 27 +-- pandas/tests/series/test_replace.py | 2 +- pandas/tests/series/test_timeseries.py | 2 +- pandas/tests/sparse/test_array.py | 2 +- pandas/tests/sparse/test_frame.py | 2 +- pandas/tests/sparse/test_libsparse.py | 2 +- pandas/tests/sparse/test_series.py | 2 +- pandas/tests/test_algos.py | 41 ++-- pandas/tests/test_base.py | 7 +- pandas/tests/test_internals.py | 2 +- pandas/tests/test_join.py | 2 +- pandas/tests/test_lib.py | 2 +- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_take.py | 2 +- pandas/tests/tools/test_join.py | 8 +- pandas/tests/tseries/test_offsets.py | 4 +- pandas/tests/tseries/test_resample.py | 2 +- pandas/tests/tseries/test_timezones.py | 5 +- pandas/tests/types/test_inference.py | 14 +- pandas/tests/types/test_io.py | 4 +- pandas/tests/types/test_missing.py | 2 +- pandas/tools/hashing.py | 5 +- pandas/{src/hash.pyx => tools/hashing.pyx} | 0 pandas/tools/merge.py | 28 +-- pandas/tools/tile.py | 2 +- pandas/tools/util.py | 2 +- pandas/tseries/api.py | 2 +- pandas/tseries/base.py | 44 ++-- pandas/tseries/common.py | 3 +- pandas/tseries/converter.py | 2 +- pandas/tseries/frequencies.py | 6 +- pandas/tseries/index.py | 106 ++++----- pandas/tseries/offsets.py | 3 +- pandas/tseries/period.py | 12 +- pandas/tseries/resample.py | 7 +- pandas/tseries/tdi.py | 41 ++-- pandas/tseries/timedeltas.py | 2 +- pandas/tseries/tools.py | 3 +- pandas/tslib.py | 8 + pandas/types/cast.py | 10 +- pandas/types/common.py | 2 +- pandas/types/concat.py | 2 +- pandas/types/inference.py | 2 +- pandas/types/missing.py | 4 +- pandas/util/decorators.py | 2 +- pandas/util/depr_module.py | 30 ++- pandas/util/testing.py | 28 ++- pandas/{src => util}/testing.pyx | 0 scripts/bench_join.py | 2 +- scripts/bench_join_multi.py | 2 +- scripts/groupby_test.py | 2 +- scripts/roll_median_leak.py | 2 +- setup.py | 223 +++++++++--------- vb_suite/pandas_vb_common.py | 2 +- 243 files changed, 885 insertions(+), 771 deletions(-) create mode 100644 pandas/_libs/__init__.py rename pandas/{ => _libs}/algos.pyx (99%) rename pandas/{src => _libs}/algos_common_helper.pxi.in (99%) rename pandas/{src => _libs}/algos_groupby_helper.pxi.in (100%) rename pandas/{src => _libs}/algos_rank_helper.pxi.in (100%) rename pandas/{src => _libs}/algos_take_helper.pxi.in (100%) rename pandas/{ => _libs}/hashtable.pxd (100%) rename pandas/{ => _libs}/hashtable.pyx (99%) rename pandas/{src => _libs}/hashtable_class_helper.pxi.in (100%) rename pandas/{src => _libs}/hashtable_func_helper.pxi.in (100%) rename pandas/{ => _libs}/index.pyx (99%) rename pandas/{src => _libs}/index_class_helper.pxi.in (100%) rename pandas/{src => 
_libs}/join.pyx (98%) rename pandas/{src/joins_func_helper.pxi.in => _libs/join_func_helper.pxi.in} (100%) rename pandas/{src => _libs}/join_helper.pxi.in (100%) rename pandas/{ => _libs}/lib.pxd (100%) rename pandas/{ => _libs}/lib.pyx (100%) rename pandas/{src => _libs}/period.pyx (98%) rename pandas/{src => _libs}/reshape.pyx (100%) rename pandas/{src => _libs}/reshape_helper.pxi.in (100%) rename pandas/{ => _libs}/src/datetime.pxd (100%) rename pandas/{ => _libs}/src/datetime/np_datetime.c (100%) rename pandas/{ => _libs}/src/datetime/np_datetime.h (100%) rename pandas/{ => _libs}/src/datetime/np_datetime_strings.c (100%) rename pandas/{ => _libs}/src/datetime/np_datetime_strings.h (100%) rename pandas/{ => _libs}/src/datetime_helper.h (100%) rename pandas/{ => _libs}/src/headers/math.h (100%) rename pandas/{ => _libs}/src/headers/ms_inttypes.h (100%) rename pandas/{ => _libs}/src/headers/ms_stdint.h (100%) rename pandas/{ => _libs}/src/headers/portable.h (100%) rename pandas/{ => _libs}/src/headers/stdint.h (100%) rename pandas/{ => _libs}/src/helper.h (100%) rename pandas/{ => _libs}/src/inference.pyx (100%) rename pandas/{ => _libs}/src/khash.pxd (100%) rename pandas/{ => _libs}/src/klib/khash.h (100%) rename pandas/{ => _libs}/src/klib/khash_python.h (100%) rename pandas/{ => _libs}/src/klib/ktypes.h (100%) rename pandas/{ => _libs}/src/klib/kvec.h (100%) rename pandas/{ => _libs}/src/msgpack/pack.h (100%) rename pandas/{ => _libs}/src/msgpack/pack_template.h (100%) rename pandas/{ => _libs}/src/msgpack/sysdep.h (100%) rename pandas/{ => _libs}/src/msgpack/unpack.h (100%) rename pandas/{ => _libs}/src/msgpack/unpack_define.h (100%) rename pandas/{ => _libs}/src/msgpack/unpack_template.h (100%) rename pandas/{ => _libs}/src/numpy.pxd (100%) rename pandas/{ => _libs}/src/numpy_helper.h (100%) rename pandas/{ => _libs}/src/offsets.pyx (100%) rename pandas/{ => _libs}/src/parse_helper.h (100%) rename pandas/{ => _libs}/src/parser/.gitignore (100%) rename pandas/{ => _libs}/src/parser/Makefile (100%) rename pandas/{ => _libs}/src/parser/io.c (100%) rename pandas/{ => _libs}/src/parser/io.h (100%) rename pandas/{ => _libs}/src/parser/tokenizer.c (100%) rename pandas/{ => _libs}/src/parser/tokenizer.h (100%) rename pandas/{ => _libs}/src/period_helper.c (100%) rename pandas/{ => _libs}/src/period_helper.h (100%) rename pandas/{ => _libs}/src/properties.pyx (100%) rename pandas/{ => _libs}/src/reduce.pyx (100%) rename pandas/{ => _libs}/src/skiplist.h (100%) rename pandas/{ => _libs}/src/skiplist.pxd (100%) rename pandas/{ => _libs}/src/skiplist.pyx (100%) rename pandas/{ => _libs}/src/ujson/lib/ultrajson.h (100%) rename pandas/{ => _libs}/src/ujson/lib/ultrajsondec.c (100%) rename pandas/{ => _libs}/src/ujson/lib/ultrajsonenc.c (100%) rename pandas/{ => _libs}/src/ujson/python/JSONtoObj.c (100%) rename pandas/{ => _libs}/src/ujson/python/objToJSON.c (99%) rename pandas/{ => _libs}/src/ujson/python/py_defines.h (100%) rename pandas/{ => _libs}/src/ujson/python/ujson.c (95%) rename pandas/{ => _libs}/src/ujson/python/version.h (100%) rename pandas/{ => _libs}/src/util.pxd (100%) rename pandas/{ => _libs}/tslib.pxd (100%) rename pandas/{ => _libs}/tslib.pyx (100%) rename pandas/{ => core}/window.pyx (99%) rename pandas/{ => io}/msgpack/__init__.py (81%) rename pandas/{ => io}/msgpack/_packer.pyx (98%) rename pandas/{ => io}/msgpack/_unpacker.pyx (98%) rename pandas/{ => io}/msgpack/_version.py (100%) rename pandas/{ => io}/msgpack/exceptions.py (100%) rename pandas/{parser.pyx => 
io/parsers.pyx} (99%) rename pandas/io/sas/{saslib.pyx => sas.pyx} (100%) create mode 100644 pandas/json.py create mode 100644 pandas/lib.py create mode 100644 pandas/parser.py rename pandas/{src => sparse}/sparse.pyx (100%) rename pandas/{src => sparse}/sparse_op_helper.pxi.in (100%) rename pandas/tests/{ => io}/msgpack/__init__.py (100%) rename pandas/tests/{ => io}/msgpack/test_buffer.py (90%) rename pandas/tests/{ => io}/msgpack/test_case.py (98%) rename pandas/tests/{ => io}/msgpack/test_except.py (96%) rename pandas/tests/{ => io}/msgpack/test_extension.py (96%) rename pandas/tests/{ => io}/msgpack/test_format.py (98%) rename pandas/tests/{ => io}/msgpack/test_limits.py (97%) rename pandas/tests/{ => io}/msgpack/test_newspec.py (97%) rename pandas/tests/{ => io}/msgpack/test_obj.py (98%) rename pandas/tests/{ => io}/msgpack/test_pack.py (98%) rename pandas/tests/{ => io}/msgpack/test_read_size.py (96%) rename pandas/tests/{ => io}/msgpack/test_seq.py (96%) rename pandas/tests/{ => io}/msgpack/test_sequnpack.py (97%) rename pandas/tests/{ => io}/msgpack/test_subtype.py (90%) rename pandas/tests/{ => io}/msgpack/test_unpack.py (96%) rename pandas/tests/{ => io}/msgpack/test_unpack_raw.py (94%) rename pandas/{src/hash.pyx => tools/hashing.pyx} (100%) create mode 100644 pandas/tslib.py rename pandas/{src => util}/testing.pyx (100%) diff --git a/Makefile b/Makefile index 9a768932b8bea..90dcd16d955d6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -tseries: pandas/lib.pyx pandas/tslib.pyx pandas/hashtable.pyx +tseries: pandas/_libs/lib.pyx pandas/_libs/tslib.pyx pandas/_libs/hashtable.pyx python setup.py build_ext --inplace .PHONY : develop build clean clean_pyc tseries doc diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 53cb1cf465698..72700c3de282e 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -107,4 +107,4 @@ def setup(self): self.s = Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) self.ts = self.s[self.halfway] - self.s2 = Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) \ No newline at end of file + self.s2 = Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 25b0b5dd4d1b0..56ccc94c414fb 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -8,16 +8,22 @@ import random import numpy as np import threading +from importlib import import_module + try: from pandas.compat import range except ImportError: pass np.random.seed(1234) -try: - import pandas._tseries as lib -except: - import pandas.lib as lib + +# try em until it works! 
+for imp in ['pandas_tseries', 'pandas.lib', 'pandas._libs.lib']: + try: + lib = import_module(imp) + break + except: + pass try: Panel = Panel diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index ebe278f6e68b5..6609305502011 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -21,4 +21,4 @@ def time_shift(self): self.panel.shift(1) def time_shift_minor(self): - self.panel.shift(1, axis='minor') \ No newline at end of file + self.panel.shift(1, axis='minor') diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ece9ff4a1adff..8f2033de6c77f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -484,6 +484,35 @@ New Behavior: In [11]: index.memory_usage(deep=True) Out[11]: 260 +.. _whatsnew_0200.api_breaking.extensions: + +Extension Modules Moved +^^^^^^^^^^^^^^^^^^^^^^^ + +Some formerly public c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API. +If indicated, a deprecation warning will be issued if you reference that module. (:issue:`12588`) + +.. csv-table:: + :header: "Previous Location", "New Location", "Deprecated" + :widths: 30, 30, 4 + + "pandas.lib", "pandas._libs.lib", "X" + "pandas.tslib", "pandas._libs.tslib", "X" + "pandas._join", "pandas._libs.join", "" + "pandas._period", "pandas._libs.period", "" + "pandas.msgpack", "pandas.io.msgpack", "" + "pandas.index", "pandas._libs.index", "" + "pandas.algos", "pandas._libs.algos", "" + "pandas.hashtable", "pandas._libs.hashtable", "" + "pandas.json", "pandas.io.json.libjson", "X" + "pandas.parser", "pandas.io.libparsers", "X" + "pandas.io.sas.saslib", "pandas.io.sas.libsas", "" + "pandas._testing", "pandas.util.libtesting", "" + "pandas._sparse", "pandas.sparse.libsparse", "" + "pandas._hash", "pandas.tools.libhash", "" + "pandas._window", "pandas.core.libwindow", "" + + .. 
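For code being migrated, the practical effect of the table is easiest to see in a short example (a sketch that assumes a build containing this change; infer_dtype is just an arbitrary function from the moved module):

    import warnings
    import pandas

    # canonical locations after the move
    from pandas._libs.lib import infer_dtype
    from pandas._libs.tslib import Timestamp

    # the deprecated top-level names still resolve, but attribute access
    # goes through a shim and warns (a FutureWarning at the time of writing)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        f = pandas.lib.infer_dtype     # same object as pandas._libs.lib.infer_dtype
        ts = pandas.tslib.Timestamp    # same object as pandas._libs.tslib.Timestamp

    print([str(w.message) for w in caught])

Per the note above, only the entries marked with an "X" in the table get a warning shim; the others are simply relocated.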
_whatsnew_0200.api_breaking.groupby_describe: Groupby Describe Formatting diff --git a/pandas/__init__.py b/pandas/__init__.py index 3bded89e6644a..5c7c9d44c5d10 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -23,7 +23,9 @@ from pandas.compat.numpy import * try: - from pandas import hashtable, tslib, lib + from pandas._libs import (hashtable as _hashtable, + lib as _lib, + tslib as _tslib) except ImportError as e: # pragma: no cover # hack but overkill to use re module = str(e).lstrip('cannot import name ') @@ -52,11 +54,17 @@ from pandas.tools.util import to_numeric from pandas.core.reshape import melt from pandas.util.print_versions import show_versions - from pandas.io.api import * - from pandas.util._tester import test +# extension module deprecations +from pandas.util.depr_module import _DeprecatedModule + +json = _DeprecatedModule(deprmod='pandas.json', deprmodto='pandas.io.json.libjson') +parser = _DeprecatedModule(deprmod='pandas.parser', deprmodto='pandas.io.libparsers') +lib = _DeprecatedModule(deprmod='pandas.lib', deprmodto='pandas._libs.lib') +tslib = _DeprecatedModule(deprmod='pandas.tslib', deprmodto='pandas._libs.tslib') + # use the closest tagged version if possible from ._version import get_versions v = get_versions() diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py new file mode 100644 index 0000000000000..ab3832d0292ba --- /dev/null +++ b/pandas/_libs/__init__.py @@ -0,0 +1,8 @@ +# flake8: noqa + +from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime + +# TODO +# period is directly dependent on tslib and imports python +# modules, so exposing Period as an alias is currently not possible +# from period import Period diff --git a/pandas/algos.pyx b/pandas/_libs/algos.pyx similarity index 99% rename from pandas/algos.pyx rename to pandas/_libs/algos.pyx index 32955fd0f465b..7d3ce3280ec1e 100644 --- a/pandas/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -37,7 +37,7 @@ float64 = np.dtype(np.float64) cdef double NaN = np.NaN cdef double nan = NaN -cdef extern from "src/headers/math.h": +cdef extern from "../src/headers/math.h": double sqrt(double x) nogil double fabs(double) nogil @@ -46,7 +46,7 @@ from util cimport numeric, get_nat cimport lib from lib cimport is_null_datetimelike -from pandas import lib +from pandas._libs import lib cdef int64_t iNaT = get_nat() diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in similarity index 99% rename from pandas/src/algos_common_helper.pxi.in rename to pandas/_libs/algos_common_helper.pxi.in index b83dec1d26242..336dd77ea9a89 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -433,7 +433,7 @@ def arrmap_{{name}}(ndarray[{{c_type}}] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) - from pandas.lib import maybe_convert_objects + from pandas._libs.lib import maybe_convert_objects for i in range(length): result[i] = func(index[i]) diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/_libs/algos_groupby_helper.pxi.in similarity index 100% rename from pandas/src/algos_groupby_helper.pxi.in rename to pandas/_libs/algos_groupby_helper.pxi.in diff --git a/pandas/src/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in similarity index 100% rename from pandas/src/algos_rank_helper.pxi.in rename to pandas/_libs/algos_rank_helper.pxi.in diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in similarity index 100% rename from 
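A shim like the _DeprecatedModule instances registered in pandas/__init__.py above can be written in a few lines. The following is a simplified sketch of the idea, not the actual implementation in pandas/util/depr_module.py:

    import importlib
    import warnings

    class DeprecatedModuleShim(object):
        """Forward attribute access to `deprmodto`, warning on each use."""

        def __init__(self, deprmod, deprmodto):
            self._deprmod = deprmod
            self._deprmodto = deprmodto

        def __getattr__(self, name):
            # warn once per access, pointing at the caller's frame
            warnings.warn("%s is deprecated, use %s instead"
                          % (self._deprmod, self._deprmodto),
                          FutureWarning, stacklevel=2)
            return getattr(importlib.import_module(self._deprmodto), name)

    # e.g.: lib = DeprecatedModuleShim('pandas.lib', 'pandas._libs.lib')

The real version layers more bookkeeping on top, but the mechanism is the same.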
pandas/src/algos_take_helper.pxi.in rename to pandas/_libs/algos_take_helper.pxi.in diff --git a/pandas/hashtable.pxd b/pandas/_libs/hashtable.pxd similarity index 100% rename from pandas/hashtable.pxd rename to pandas/_libs/hashtable.pxd diff --git a/pandas/hashtable.pyx b/pandas/_libs/hashtable.pyx similarity index 99% rename from pandas/hashtable.pyx rename to pandas/_libs/hashtable.pyx index 276b0679070dc..eee287b2c157b 100644 --- a/pandas/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -22,7 +22,7 @@ cdef extern from "numpy/npy_math.h": cimport cython cimport numpy as cnp -from pandas.lib import checknull +from pandas._libs.lib import checknull cnp.import_array() cnp.import_ufunc() diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in similarity index 100% rename from pandas/src/hashtable_class_helper.pxi.in rename to pandas/_libs/hashtable_class_helper.pxi.in diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in similarity index 100% rename from pandas/src/hashtable_func_helper.pxi.in rename to pandas/_libs/hashtable_func_helper.pxi.in diff --git a/pandas/index.pyx b/pandas/_libs/index.pyx similarity index 99% rename from pandas/index.pyx rename to pandas/_libs/index.pyx index 37fe7d90bebe0..c7a537acf5d6f 100644 --- a/pandas/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,8 +17,8 @@ import numpy as np cimport tslib from hashtable cimport * -from pandas import algos, tslib, hashtable as _hash -from pandas.tslib import Timestamp, Timedelta +from pandas._libs import tslib, algos, hashtable as _hash +from pandas._libs.tslib import Timestamp, Timedelta from datetime cimport (get_datetime64_value, _pydatetime_to_dts, pandas_datetimestruct) diff --git a/pandas/src/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in similarity index 100% rename from pandas/src/index_class_helper.pxi.in rename to pandas/_libs/index_class_helper.pxi.in diff --git a/pandas/src/join.pyx b/pandas/_libs/join.pyx similarity index 98% rename from pandas/src/join.pyx rename to pandas/_libs/join.pyx index 65c790beb5dbf..385a9762ed90d 100644 --- a/pandas/src/join.pyx +++ b/pandas/_libs/join.pyx @@ -32,10 +32,10 @@ float64 = np.dtype(np.float64) cdef double NaN = np.NaN cdef double nan = NaN -from pandas.algos import groupsort_indexer, ensure_platform_int +from pandas._libs.algos import groupsort_indexer, ensure_platform_int from pandas.core.algorithms import take_nd -include "joins_func_helper.pxi" +include "join_func_helper.pxi" def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/_libs/join_func_helper.pxi.in similarity index 100% rename from pandas/src/joins_func_helper.pxi.in rename to pandas/_libs/join_func_helper.pxi.in diff --git a/pandas/src/join_helper.pxi.in b/pandas/_libs/join_helper.pxi.in similarity index 100% rename from pandas/src/join_helper.pxi.in rename to pandas/_libs/join_helper.pxi.in diff --git a/pandas/lib.pxd b/pandas/_libs/lib.pxd similarity index 100% rename from pandas/lib.pxd rename to pandas/_libs/lib.pxd diff --git a/pandas/lib.pyx b/pandas/_libs/lib.pyx similarity index 100% rename from pandas/lib.pyx rename to pandas/_libs/lib.pyx diff --git a/pandas/src/period.pyx b/pandas/_libs/period.pyx similarity index 98% rename from pandas/src/period.pyx rename to pandas/_libs/period.pyx index 2d92b9f192328..f30035910a62f 100644 --- a/pandas/src/period.pyx +++ b/pandas/_libs/period.pyx @@ -16,19 +16,15 @@ cdef extern from 
"datetime_helper.h": from libc.stdlib cimport free from pandas import compat - -from pandas.tseries import offsets -from pandas.tseries.tools import parse_time_string +from pandas.compat import PY2 cimport cython from datetime cimport * -cimport util -cimport lib +cimport util, lib from lib cimport is_null_datetimelike, is_period -import lib -from pandas import tslib -from tslib import Timedelta, Timestamp, iNaT, NaT -from tslib import have_pytz, _get_utcoffset +from pandas._libs import tslib, lib +from pandas._libs.tslib import (Timedelta, Timestamp, iNaT, + NaT, have_pytz, _get_utcoffset) from tslib cimport ( maybe_get_tz, _is_utc, @@ -37,12 +33,10 @@ from tslib cimport ( _nat_scalar_rules, ) +from pandas.tseries import offsets +from pandas.tseries.tools import parse_time_string from pandas.tseries import frequencies -from sys import version_info - -cdef bint PY2 = version_info[0] == 2 - cdef int64_t NPY_NAT = util.get_nat() cdef int RESO_US = frequencies.RESO_US @@ -474,7 +468,7 @@ def extract_ordinals(ndarray[object] values, freq): p = values[i] if is_null_datetimelike(p): - ordinals[i] = tslib.iNaT + ordinals[i] = iNaT else: try: ordinals[i] = p.ordinal @@ -485,9 +479,9 @@ def extract_ordinals(ndarray[object] values, freq): except AttributeError: p = Period(p, freq=freq) - if p is tslib.NaT: + if p is NaT: # input may contain NaT-like string - ordinals[i] = tslib.iNaT + ordinals[i] = iNaT else: ordinals[i] = p.ordinal @@ -716,8 +710,8 @@ cdef class _Period(object): """ Fast creation from an ordinal and freq that are already validated! """ - if ordinal == tslib.iNaT: - return tslib.NaT + if ordinal == iNaT: + return NaT else: self = _Period.__new__(cls) self.ordinal = ordinal @@ -730,7 +724,7 @@ cdef class _Period(object): msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) - elif other is tslib.NaT: + elif other is NaT: return _nat_scalar_rules[op] # index/series like elif hasattr(other, '_typ'): @@ -776,8 +770,8 @@ cdef class _Period(object): offsets.Tick, offsets.DateOffset, Timedelta)): return self._add_delta(other) - elif other is tslib.NaT: - return tslib.NaT + elif other is NaT: + return NaT elif lib.is_integer(other): ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) @@ -808,8 +802,8 @@ cdef class _Period(object): else: # pragma: no cover return NotImplemented elif isinstance(other, Period): - if self is tslib.NaT: - return tslib.NaT + if self is NaT: + return NaT return NotImplemented else: return NotImplemented @@ -1164,7 +1158,7 @@ class Period(_Period): if (year is None and month is None and quarter is None and day is None and hour is None and minute is None and second is None): - ordinal = tslib.iNaT + ordinal = iNaT else: if freq is None: raise ValueError("If value is None, freq cannot be None") @@ -1190,7 +1184,7 @@ class Period(_Period): ordinal = converted.ordinal elif is_null_datetimelike(value) or value in tslib._nat_strings: - ordinal = tslib.iNaT + ordinal = iNaT elif isinstance(value, compat.string_types) or lib.is_integer(value): if lib.is_integer(value): diff --git a/pandas/src/reshape.pyx b/pandas/_libs/reshape.pyx similarity index 100% rename from pandas/src/reshape.pyx rename to pandas/_libs/reshape.pyx diff --git a/pandas/src/reshape_helper.pxi.in b/pandas/_libs/reshape_helper.pxi.in similarity index 100% rename from pandas/src/reshape_helper.pxi.in rename to pandas/_libs/reshape_helper.pxi.in diff --git 
a/pandas/src/datetime.pxd b/pandas/_libs/src/datetime.pxd similarity index 100% rename from pandas/src/datetime.pxd rename to pandas/_libs/src/datetime.pxd diff --git a/pandas/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c similarity index 100% rename from pandas/src/datetime/np_datetime.c rename to pandas/_libs/src/datetime/np_datetime.c diff --git a/pandas/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h similarity index 100% rename from pandas/src/datetime/np_datetime.h rename to pandas/_libs/src/datetime/np_datetime.h diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/_libs/src/datetime/np_datetime_strings.c similarity index 100% rename from pandas/src/datetime/np_datetime_strings.c rename to pandas/_libs/src/datetime/np_datetime_strings.c diff --git a/pandas/src/datetime/np_datetime_strings.h b/pandas/_libs/src/datetime/np_datetime_strings.h similarity index 100% rename from pandas/src/datetime/np_datetime_strings.h rename to pandas/_libs/src/datetime/np_datetime_strings.h diff --git a/pandas/src/datetime_helper.h b/pandas/_libs/src/datetime_helper.h similarity index 100% rename from pandas/src/datetime_helper.h rename to pandas/_libs/src/datetime_helper.h diff --git a/pandas/src/headers/math.h b/pandas/_libs/src/headers/math.h similarity index 100% rename from pandas/src/headers/math.h rename to pandas/_libs/src/headers/math.h diff --git a/pandas/src/headers/ms_inttypes.h b/pandas/_libs/src/headers/ms_inttypes.h similarity index 100% rename from pandas/src/headers/ms_inttypes.h rename to pandas/_libs/src/headers/ms_inttypes.h diff --git a/pandas/src/headers/ms_stdint.h b/pandas/_libs/src/headers/ms_stdint.h similarity index 100% rename from pandas/src/headers/ms_stdint.h rename to pandas/_libs/src/headers/ms_stdint.h diff --git a/pandas/src/headers/portable.h b/pandas/_libs/src/headers/portable.h similarity index 100% rename from pandas/src/headers/portable.h rename to pandas/_libs/src/headers/portable.h diff --git a/pandas/src/headers/stdint.h b/pandas/_libs/src/headers/stdint.h similarity index 100% rename from pandas/src/headers/stdint.h rename to pandas/_libs/src/headers/stdint.h diff --git a/pandas/src/helper.h b/pandas/_libs/src/helper.h similarity index 100% rename from pandas/src/helper.h rename to pandas/_libs/src/helper.h diff --git a/pandas/src/inference.pyx b/pandas/_libs/src/inference.pyx similarity index 100% rename from pandas/src/inference.pyx rename to pandas/_libs/src/inference.pyx diff --git a/pandas/src/khash.pxd b/pandas/_libs/src/khash.pxd similarity index 100% rename from pandas/src/khash.pxd rename to pandas/_libs/src/khash.pxd diff --git a/pandas/src/klib/khash.h b/pandas/_libs/src/klib/khash.h similarity index 100% rename from pandas/src/klib/khash.h rename to pandas/_libs/src/klib/khash.h diff --git a/pandas/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h similarity index 100% rename from pandas/src/klib/khash_python.h rename to pandas/_libs/src/klib/khash_python.h diff --git a/pandas/src/klib/ktypes.h b/pandas/_libs/src/klib/ktypes.h similarity index 100% rename from pandas/src/klib/ktypes.h rename to pandas/_libs/src/klib/ktypes.h diff --git a/pandas/src/klib/kvec.h b/pandas/_libs/src/klib/kvec.h similarity index 100% rename from pandas/src/klib/kvec.h rename to pandas/_libs/src/klib/kvec.h diff --git a/pandas/src/msgpack/pack.h b/pandas/_libs/src/msgpack/pack.h similarity index 100% rename from pandas/src/msgpack/pack.h rename to pandas/_libs/src/msgpack/pack.h diff --git 
a/pandas/src/msgpack/pack_template.h b/pandas/_libs/src/msgpack/pack_template.h similarity index 100% rename from pandas/src/msgpack/pack_template.h rename to pandas/_libs/src/msgpack/pack_template.h diff --git a/pandas/src/msgpack/sysdep.h b/pandas/_libs/src/msgpack/sysdep.h similarity index 100% rename from pandas/src/msgpack/sysdep.h rename to pandas/_libs/src/msgpack/sysdep.h diff --git a/pandas/src/msgpack/unpack.h b/pandas/_libs/src/msgpack/unpack.h similarity index 100% rename from pandas/src/msgpack/unpack.h rename to pandas/_libs/src/msgpack/unpack.h diff --git a/pandas/src/msgpack/unpack_define.h b/pandas/_libs/src/msgpack/unpack_define.h similarity index 100% rename from pandas/src/msgpack/unpack_define.h rename to pandas/_libs/src/msgpack/unpack_define.h diff --git a/pandas/src/msgpack/unpack_template.h b/pandas/_libs/src/msgpack/unpack_template.h similarity index 100% rename from pandas/src/msgpack/unpack_template.h rename to pandas/_libs/src/msgpack/unpack_template.h diff --git a/pandas/src/numpy.pxd b/pandas/_libs/src/numpy.pxd similarity index 100% rename from pandas/src/numpy.pxd rename to pandas/_libs/src/numpy.pxd diff --git a/pandas/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h similarity index 100% rename from pandas/src/numpy_helper.h rename to pandas/_libs/src/numpy_helper.h diff --git a/pandas/src/offsets.pyx b/pandas/_libs/src/offsets.pyx similarity index 100% rename from pandas/src/offsets.pyx rename to pandas/_libs/src/offsets.pyx diff --git a/pandas/src/parse_helper.h b/pandas/_libs/src/parse_helper.h similarity index 100% rename from pandas/src/parse_helper.h rename to pandas/_libs/src/parse_helper.h diff --git a/pandas/src/parser/.gitignore b/pandas/_libs/src/parser/.gitignore similarity index 100% rename from pandas/src/parser/.gitignore rename to pandas/_libs/src/parser/.gitignore diff --git a/pandas/src/parser/Makefile b/pandas/_libs/src/parser/Makefile similarity index 100% rename from pandas/src/parser/Makefile rename to pandas/_libs/src/parser/Makefile diff --git a/pandas/src/parser/io.c b/pandas/_libs/src/parser/io.c similarity index 100% rename from pandas/src/parser/io.c rename to pandas/_libs/src/parser/io.c diff --git a/pandas/src/parser/io.h b/pandas/_libs/src/parser/io.h similarity index 100% rename from pandas/src/parser/io.h rename to pandas/_libs/src/parser/io.h diff --git a/pandas/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c similarity index 100% rename from pandas/src/parser/tokenizer.c rename to pandas/_libs/src/parser/tokenizer.c diff --git a/pandas/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h similarity index 100% rename from pandas/src/parser/tokenizer.h rename to pandas/_libs/src/parser/tokenizer.h diff --git a/pandas/src/period_helper.c b/pandas/_libs/src/period_helper.c similarity index 100% rename from pandas/src/period_helper.c rename to pandas/_libs/src/period_helper.c diff --git a/pandas/src/period_helper.h b/pandas/_libs/src/period_helper.h similarity index 100% rename from pandas/src/period_helper.h rename to pandas/_libs/src/period_helper.h diff --git a/pandas/src/properties.pyx b/pandas/_libs/src/properties.pyx similarity index 100% rename from pandas/src/properties.pyx rename to pandas/_libs/src/properties.pyx diff --git a/pandas/src/reduce.pyx b/pandas/_libs/src/reduce.pyx similarity index 100% rename from pandas/src/reduce.pyx rename to pandas/_libs/src/reduce.pyx diff --git a/pandas/src/skiplist.h b/pandas/_libs/src/skiplist.h similarity index 100% rename from pandas/src/skiplist.h 
rename to pandas/_libs/src/skiplist.h diff --git a/pandas/src/skiplist.pxd b/pandas/_libs/src/skiplist.pxd similarity index 100% rename from pandas/src/skiplist.pxd rename to pandas/_libs/src/skiplist.pxd diff --git a/pandas/src/skiplist.pyx b/pandas/_libs/src/skiplist.pyx similarity index 100% rename from pandas/src/skiplist.pyx rename to pandas/_libs/src/skiplist.pyx diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h similarity index 100% rename from pandas/src/ujson/lib/ultrajson.h rename to pandas/_libs/src/ujson/lib/ultrajson.h diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c similarity index 100% rename from pandas/src/ujson/lib/ultrajsondec.c rename to pandas/_libs/src/ujson/lib/ultrajsondec.c diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c similarity index 100% rename from pandas/src/ujson/lib/ultrajsonenc.c rename to pandas/_libs/src/ujson/lib/ultrajsonenc.c diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c similarity index 100% rename from pandas/src/ujson/python/JSONtoObj.c rename to pandas/_libs/src/ujson/python/JSONtoObj.c diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c similarity index 99% rename from pandas/src/ujson/python/objToJSON.c rename to pandas/_libs/src/ujson/python/objToJSON.c index e3c75d3b6e081..26a68b8a9ae3a 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -180,7 +180,7 @@ void initObjToJSON(void) Py_DECREF(mod_pandas); } - mod_tslib = PyImport_ImportModule("pandas.tslib"); + mod_tslib = PyImport_ImportModule("pandas._libs.tslib"); if (mod_tslib) { cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_tslib, "NaTType"); Py_DECREF(mod_tslib); diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/_libs/src/ujson/python/py_defines.h similarity index 100% rename from pandas/src/ujson/python/py_defines.h rename to pandas/_libs/src/ujson/python/py_defines.h diff --git a/pandas/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c similarity index 95% rename from pandas/src/ujson/python/ujson.c rename to pandas/_libs/src/ujson/python/ujson.c index 8c25975f12409..ec6720f16bc77 100644 --- a/pandas/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -80,7 +80,7 @@ static PyMethodDef ujsonMethods[] = { static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, - "_pandasujson", + "_libjson", 0, /* m_doc */ -1, /* m_size */ ujsonMethods, /* m_methods */ @@ -90,14 +90,14 @@ static struct PyModuleDef moduledef = { NULL /* m_free */ }; -#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) +#define PYMODINITFUNC PyMODINIT_FUNC PyInit_libjson(void) #define PYMODULE_CREATE() PyModule_Create(&moduledef) #define MODINITERROR return NULL #else -#define PYMODINITFUNC PyMODINIT_FUNC initjson(void) -#define PYMODULE_CREATE() Py_InitModule("json", ujsonMethods) +#define PYMODINITFUNC PyMODINIT_FUNC initlibjson(void) +#define PYMODULE_CREATE() Py_InitModule("libjson", ujsonMethods) #define MODINITERROR return #endif diff --git a/pandas/src/ujson/python/version.h b/pandas/_libs/src/ujson/python/version.h similarity index 100% rename from pandas/src/ujson/python/version.h rename to pandas/_libs/src/ujson/python/version.h diff --git a/pandas/src/util.pxd b/pandas/_libs/src/util.pxd similarity index 100% rename from pandas/src/util.pxd rename to pandas/_libs/src/util.pxd diff --git 
a/pandas/tslib.pxd b/pandas/_libs/tslib.pxd similarity index 100% rename from pandas/tslib.pxd rename to pandas/_libs/tslib.pxd diff --git a/pandas/tslib.pyx b/pandas/_libs/tslib.pyx similarity index 100% rename from pandas/tslib.pyx rename to pandas/_libs/tslib.pyx diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 25a170c3eb121..279a82fea1cc2 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -62,7 +62,13 @@ def load_reduce(self): # 10890 ('pandas.core.series', 'TimeSeries'): ('pandas.core.series', 'Series'), - ('pandas.sparse.series', 'SparseTimeSeries'): ('pandas.sparse.series', 'SparseSeries') + ('pandas.sparse.series', 'SparseTimeSeries'): ('pandas.sparse.series', 'SparseSeries'), + + # 12588, extensions moving + ('pandas._sparse', 'BlockIndex'): ('pandas.sparse.libsparse', 'BlockIndex'), + ('pandas.tslib', 'Timestamp'): ('pandas._libs.tslib', 'Timestamp'), + ('pandas.tslib', '__nat_unpickle'): ('pandas._libs.tslib', '__nat_unpickle'), + ('pandas._period', 'Period'): ('pandas._libs.period', 'Period') } diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py index 875aaa959b264..9ade755e0ff12 100644 --- a/pandas/computation/scope.py +++ b/pandas/computation/scope.py @@ -1,4 +1,5 @@ -"""Module for scope operations +""" +Module for scope operations """ import sys @@ -10,7 +11,8 @@ import numpy as np -import pandas as pd +import pandas +import pandas as pd # noqa from pandas.compat import DeepChainMap, map, StringIO from pandas.core.base import StringMixin import pandas.computation as compu @@ -46,7 +48,7 @@ def _raw_hex_id(obj): _DEFAULT_GLOBALS = { - 'Timestamp': pd.lib.Timestamp, + 'Timestamp': pandas._libs.lib.Timestamp, 'datetime': datetime.datetime, 'True': True, 'False': False, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d37c98c9b9b90..6937675603c10 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,7 +6,7 @@ from warnings import warn import numpy as np -from pandas import compat, lib, tslib, _np_version_under1p8 +from pandas import compat, _np_version_under1p8 from pandas.types.cast import _maybe_promote from pandas.types.generic import ABCSeries, ABCIndex from pandas.types.common import (is_unsigned_integer_dtype, @@ -34,10 +34,9 @@ from pandas.types.missing import isnull import pandas.core.common as com -import pandas.algos as algos -import pandas.hashtable as htable from pandas.compat import string_types -from pandas.tslib import iNaT +from pandas._libs import algos, lib, hashtable as htable +from pandas._libs.tslib import iNaT # --------------- # @@ -1412,7 +1411,7 @@ def diff(arr, n, axis=0): if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view('i8') - na = tslib.iNaT + na = iNaT is_timedelta = True elif issubclass(dtype.type, np.integer): dtype = np.float64 diff --git a/pandas/core/base.py b/pandas/core/base.py index 55149198b0dbf..d7c9e35ab6a51 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -12,7 +12,7 @@ from pandas.core import common as com import pandas.core.nanops as nanops -import pandas.lib as lib +import pandas._libs.lib as lib from pandas.compat.numpy import function as nv from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d5dce250275d9..47db86ce1e73e 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -4,9 +4,9 @@ from warnings import warn import types -from pandas 
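The new pickle_compat entries keep pickles written before the reorganization loadable: pandas patches load_reduce on its own unpickler to consult this table. The same idea can be shown with nothing but the standard library (a sketch; the names _MOVED, _CompatUnpickler and loads_compat are invented for the example):

    import io
    import pickle

    # old (module, name) -> new (module, name), as in the mapping above
    _MOVED = {
        ('pandas.tslib', 'Timestamp'): ('pandas._libs.tslib', 'Timestamp'),
        ('pandas._period', 'Period'): ('pandas._libs.period', 'Period'),
    }

    class _CompatUnpickler(pickle.Unpickler):
        def find_class(self, module, name):
            # redirect lookups for anything that moved
            module, name = _MOVED.get((module, name), (module, name))
            return pickle.Unpickler.find_class(self, module, name)

    def loads_compat(data):
        return _CompatUnpickler(io.BytesIO(data)).load()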
import compat, lib +from pandas import compat from pandas.compat import u, lzip -import pandas.algos as _algos +from pandas._libs import lib, algos as libalgos from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex from pandas.types.missing import isnull, notnull @@ -1817,8 +1817,8 @@ def _reverse_indexer(self): """ categories = self.categories - r, counts = _algos.groupsort_indexer(self.codes.astype('int64'), - categories.size) + r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'), + categories.size) counts = counts.cumsum() result = [r[counts[indexer]:counts[indexer + 1]] for indexer in range(len(counts) - 1)] @@ -1897,7 +1897,7 @@ def mode(self): modes : `Categorical` (sorted) """ - import pandas.hashtable as htable + import pandas._libs.hashtable as htable good = self._codes != -1 values = sorted(htable.mode_int64(_ensure_int64(self._codes[good]))) result = self._constructor(values=values, categories=self.categories, diff --git a/pandas/core/common.py b/pandas/core/common.py index fddac1f29d454..93e24dce8b623 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -8,8 +8,8 @@ from functools import partial import numpy as np -import pandas.lib as lib -import pandas.tslib as tslib +from pandas._libs import lib, tslib + from pandas import compat from pandas.compat import long, zip, iteritems from pandas.core.config import get_option @@ -476,7 +476,6 @@ def _where_compat(mask, arr1, arr2): new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8')) return new_vals.view(_NS_DTYPE) - import pandas.tslib as tslib if arr1.dtype == _NS_DTYPE: arr1 = tslib.ints_to_pydatetime(arr1.view('i8')) if arr2.dtype == _NS_DTYPE: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 15179ac321076..4e7a5ebdf6f67 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -71,7 +71,7 @@ from pandas.core.series import Series from pandas.core.categorical import Categorical import pandas.computation.expressions as expressions -import pandas.core.algorithms as algos +import pandas.core.algorithms as algorithms from pandas.computation.eval import eval as _eval from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -93,8 +93,7 @@ from pandas.formats.printing import pprint_thing import pandas.tools.plotting as gfx -import pandas.lib as lib -import pandas.algos as _algos +from pandas._libs import lib, algos as libalgos from pandas.core.config import get_option @@ -2794,8 +2793,8 @@ def _reindex_multi(self, axes, copy, fill_value): if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer - new_values = algos.take_2d_multi(self.values, indexer, - fill_value=fill_value) + new_values = algorithms.take_2d_multi(self.values, indexer, + fill_value=fill_value) return self._constructor(new_values, index=new_index, columns=new_columns) else: @@ -3180,12 +3179,11 @@ def duplicated(self, subset=None, keep='first'): duplicated : Series """ from pandas.core.sorting import get_group_index - from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT def f(vals): - labels, shape = algos.factorize(vals, - size_hint=min(len(self), - _SIZE_HINT_LIMIT)) + labels, shape = algorithms.factorize( + vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) return labels.astype('i8', copy=False), len(shape) if subset is None: @@ -3437,7 +3435,7 @@ def nlargest(self, n, columns, keep='first'): 1 10 b 2 2 8 d NaN """ - 
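DataFrame.duplicated, shown above, factorizes each column to integer labels and fuses them into a single group id per row before asking the hashtable which ids repeat. Roughly, in pure pandas/NumPy (a sketch that ignores the int64-overflow handling get_group_index layers on top):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})

    labels, shape = [], []
    for col in df.columns:
        codes, uniques = pd.factorize(df[col])
        labels.append(codes.astype('i8'))
        shape.append(len(uniques))

    # fuse the per-column labels into one int64 id per row
    ids = np.zeros(len(df), dtype='i8')
    for lab, size in zip(labels, shape):
        ids = ids * size + lab

    # keep='first': everything after the first sighting of an id is a dup
    _, first = np.unique(ids, return_index=True)
    dup = np.ones(len(df), dtype=bool)
    dup[first] = False
    print(dup)    # [False  True False], matching df.duplicated().values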
return algos.select_n_frame(self, columns, n, 'nlargest', keep) + return algorithms.select_n_frame(self, columns, n, 'nlargest', keep) def nsmallest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the `n` smallest @@ -3471,7 +3469,7 @@ def nsmallest(self, n, columns, keep='first'): 0 1 a 1 2 8 d NaN """ - return algos.select_n_frame(self, columns, n, 'nsmallest', keep) + return algorithms.select_n_frame(self, columns, n, 'nsmallest', keep) def swaplevel(self, i=-2, j=-1, axis=0): """ @@ -4739,10 +4737,10 @@ def corr(self, method='pearson', min_periods=1): mat = numeric_df.values if method == 'pearson': - correl = _algos.nancorr(_ensure_float64(mat), minp=min_periods) + correl = libalgos.nancorr(_ensure_float64(mat), minp=min_periods) elif method == 'spearman': - correl = _algos.nancorr_spearman(_ensure_float64(mat), - minp=min_periods) + correl = libalgos.nancorr_spearman(_ensure_float64(mat), + minp=min_periods) else: if min_periods is None: min_periods = 1 @@ -4802,8 +4800,8 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = _algos.nancorr(_ensure_float64(mat), cov=True, - minp=min_periods) + baseCov = libalgos.nancorr(_ensure_float64(mat), cov=True, + minp=min_periods) return self._constructor(baseCov, index=idx, columns=cols) @@ -5669,7 +5667,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): indexer = indexer_cache[id(index)] = index.get_indexer(columns) values = _values_from_object(s) - aligned_values.append(algos.take_1d(values, indexer)) + aligned_values.append(algorithms.take_1d(values, indexer)) values = np.vstack(aligned_values) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 298fa75779420..ff58a2aa77447 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7,11 +7,9 @@ import json import numpy as np -import pandas.lib as lib - import pandas as pd - +from pandas._libs import tslib, lib from pandas.types.common import (_coerce_to_dtype, _ensure_int64, needs_i8_conversion, @@ -6115,7 +6113,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): issubclass(y.dtype.type, (np.datetime64, np.timedelta64))): result = accum_func(y, axis) mask = isnull(self) - np.putmask(result, mask, pd.tslib.iNaT) + np.putmask(result, mask, tslib.iNaT) elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): mask = isnull(self) np.putmask(y, mask, mask_a) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 43c57a88b4d19..a10be078a8f96 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -55,13 +55,12 @@ from pandas.formats.printing import pprint_thing from pandas.util.validators import validate_kwargs -import pandas.core.algorithms as algos +import pandas.core.algorithms as algorithms import pandas.core.common as com from pandas.core.config import option_context -import pandas.lib as lib -from pandas.lib import Timestamp -import pandas.tslib as tslib -import pandas.algos as _algos + +from pandas._libs import lib, algos as libalgos, Timestamp, NaT, iNaT +from pandas._libs.lib import count_level_2d _doc_template = """ @@ -1474,11 +1473,11 @@ def shift(self, periods=1, freq=None, axis=0): # filled in by Cython indexer = np.zeros_like(labels) - _algos.group_shift_indexer(indexer, labels, ngroups, periods) + libalgos.group_shift_indexer(indexer, labels, ngroups, periods) output = {} for name, obj in self._iterate_slices(): - output[name] = algos.take_nd(obj.values, indexer) + output[name] 
= algorithms.take_nd(obj.values, indexer) return self._wrap_transformed_output(output) @@ -1815,13 +1814,13 @@ def _get_cython_function(self, kind, how, values, is_numeric): def get_func(fname): # see if there is a fused-type version of function # only valid for numeric - f = getattr(_algos, fname, None) + f = getattr(libalgos, fname, None) if f is not None and is_numeric: return f # otherwise find dtype-specific version, falling back to object for dt in [dtype_str, 'object']: - f = getattr(_algos, "%s_%s" % (fname, dtype_str), None) + f = getattr(libalgos, "%s_%s" % (fname, dtype_str), None) if f is not None: return f @@ -1901,7 +1900,7 @@ def _cython_operation(self, kind, values, how, axis): elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition - if (values == tslib.iNaT).any(): + if (values == iNaT).any(): values = _ensure_float64(values) else: values = values.astype('int64', copy=False) @@ -1943,7 +1942,7 @@ def _cython_operation(self, kind, values, how, axis): result, values, labels, func, is_numeric, is_datetimelike) if is_integer_dtype(result): - mask = result == tslib.iNaT + mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan @@ -2034,7 +2033,8 @@ def _aggregate_series_fast(self, obj, func): dummy = obj._get_values(slice(None, 0)).to_dense() indexer = get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer, convert=False) - group_index = algos.take_nd(group_index, indexer, allow_fill=False) + group_index = algorithms.take_nd( + group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() @@ -2132,7 +2132,7 @@ def groups(self): # GH 3881 result = {} for key, value in zip(self.binlabels, self.bins): - if key is not tslib.NaT: + if key is not NaT: result[key] = value return result @@ -2159,7 +2159,7 @@ def get_iterator(self, data, axis=0): start = 0 for edge, label in zip(self.bins, self.binlabels): - if label is not tslib.NaT: + if label is not NaT: yield label, slicer(start, edge) start = edge @@ -2173,7 +2173,7 @@ def indices(self): i = 0 for label, bin in zip(self.binlabels, self.bins): if i < bin: - if label is not tslib.NaT: + if label is not NaT: indices[label] = list(range(i, bin)) i = bin return indices @@ -2383,7 +2383,8 @@ def group_index(self): def _make_labels(self): if self._labels is None or self._group_index is None: - labels, uniques = algos.factorize(self.grouper, sort=self.sort) + labels, uniques = algorithms.factorize( + self.grouper, sort=self.sort) uniques = Index(uniques, name=self.name) self._labels = labels self._group_index = uniques @@ -2928,7 +2929,7 @@ def _transform_fast(self, func): ids, _, ngroup = self.grouper.group_info cast = (self.size().fillna(0) > 0).any() - out = algos.take_1d(func().values, ids) + out = algorithms.take_1d(func().values, ids) if cast: out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) @@ -2985,7 +2986,7 @@ def nunique(self, dropna=True): except TypeError: # catches object dtypes assert val.dtype == object, \ 'val.dtype must be object, got %s' % val.dtype - val, _ = algos.factorize(val, sort=False) + val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) _isnull = lambda a: a == -1 else: @@ -3069,7 +3070,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, ids, val = ids[mask], val[mask] if bins is None: - lab, lev = algos.factorize(val, 
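The iNaT guard in _cython_operation above exists because int64 has no NaN: missing datetimelike values travel as the sentinel iNaT, the smallest int64. A tiny stand-alone illustration of the pre-conversion:

    import numpy as np
    from pandas._libs.tslib import iNaT   # == np.iinfo(np.int64).min

    values = np.array([1, iNaT, 3], dtype='int64')

    # an int64 kernel would treat the sentinel as a number, so upcast
    mask = values == iNaT
    if mask.any():
        values = values.astype('float64')
        values[mask] = np.nan
    print(values)    # [  1.  nan   3.]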
sort=True) + lab, lev = algorithms.factorize(val, sort=True) else: cat, bins = cut(val, bins, retbins=True) # bins[:-1] for backward compat; @@ -3108,7 +3109,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if dropna: m = ids[lab == -1] if _np_version_under1p8: - mi, ml = algos.factorize(m) + mi, ml = algorithms.factorize(m) d[ml] = d[ml] - np.bincount(mi) else: np.add.at(d, m, -1) @@ -3130,7 +3131,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out = _ensure_int64(out) return Series(out, index=mi, name=self.name) - # for compat. with algos.value_counts need to ensure every + # for compat. with libalgos.value_counts need to ensure every # bin is present at every index level, null filled with zeros diff = np.zeros(len(out), dtype='bool') for lab in labels[:-1]: @@ -3701,7 +3702,7 @@ def _transform_fast(self, result, obj): ids, _, ngroup = self.grouper.group_info output = [] for i, _ in enumerate(result.columns): - res = algos.take_1d(result.iloc[:, i].values, ids) + res = algorithms.take_1d(result.iloc[:, i].values, ids) if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) @@ -3995,7 +3996,6 @@ def _apply_to_column_groupbys(self, func): def count(self): """ Compute count of group, excluding missing values """ from functools import partial - from pandas.lib import count_level_2d from pandas.types.missing import _isnull_ndarraylike as isnull data, _ = self._get_data_to_aggregate() @@ -4190,7 +4190,7 @@ def __init__(self, data, labels, ngroups, axis=0): @cache_readonly def slabels(self): # Sorted labels - return algos.take_nd(self.labels, self.sort_idx, allow_fill=False) + return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False) @cache_readonly def sort_idx(self): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6cd5eceed5f2a..4b43574f49820 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -53,18 +53,17 @@ import pandas.core.missing as missing from pandas.sparse.array import _maybe_to_sparse, SparseArray -import pandas.lib as lib -import pandas.tslib as tslib +from pandas._libs import lib, tslib +from pandas._libs.tslib import Timedelta +from pandas._libs.lib import BlockPlacement + import pandas.computation.expressions as expressions from pandas.util.decorators import cache_readonly from pandas.util.validators import validate_bool_kwarg -from pandas.tslib import Timedelta from pandas import compat, _np_version_under1p9 from pandas.compat import range, map, zip, u -from pandas.lib import BlockPlacement - class Block(PandasObject): """ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index ffd0423572f5e..3b9bfe1de48e7 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -5,8 +5,8 @@ import numpy as np from distutils.version import LooseVersion -import pandas.algos as algos -import pandas.lib as lib +from pandas._libs import algos, lib + from pandas.compat import range, string_types from pandas.types.common import (is_numeric_v_string_like, is_float_dtype, is_datetime64_dtype, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0cc3a2d039b5e..bb6c9b4546d0f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -9,7 +9,8 @@ except ImportError: # pragma: no cover _USE_BOTTLENECK = False -from pandas import compat, lib, algos, tslib +from pandas import compat +from pandas._libs import tslib, algos, lib from pandas.types.common import (_get_dtype, is_float, is_scalar, is_integer, is_complex, is_float_dtype, diff --git 
a/pandas/core/ops.py b/pandas/core/ops.py index 6cc43cd9228f6..fe83f8a352851 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -10,15 +10,17 @@ import numpy as np import pandas as pd import datetime -from pandas import compat, lib, tslib -import pandas.index as _index + +from pandas._libs import (lib, index as libindex, + tslib as libts, algos as libalgos, iNaT) + +from pandas import compat from pandas.util.decorators import Appender import pandas.computation.expressions as expressions -from pandas.lib import isscalar -from pandas.tslib import iNaT + from pandas.compat import bind_method import pandas.core.missing as missing -import pandas.algos as _algos + from pandas.core.common import (_values_from_object, _maybe_match_name, PerformanceWarning) from pandas.types.missing import notnull, isnull @@ -29,6 +31,7 @@ is_datetime64_dtype, is_datetime64tz_dtype, is_bool_dtype, is_datetimetz, is_list_like, + is_scalar, _ensure_object) from pandas.types.cast import _maybe_upcast_putmask, _find_common_type from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex @@ -476,7 +479,7 @@ def _convert_to_array(self, values, name=None, other=None): values = values._values elif not (isinstance(values, (np.ndarray, ABCSeries)) and is_datetime64_dtype(values)): - values = tslib.array_to_datetime(values) + values = libts.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = to_timedelta(values, errors='coerce', box=False) @@ -680,12 +683,12 @@ def safe_na_op(lvalues, rvalues): if isinstance(rvalues, ABCSeries): if is_object_dtype(rvalues): # if dtype is object, try elementwise op - return _algos.arrmap_object(rvalues, - lambda x: op(lvalues, x)) + return libalgos.arrmap_object(rvalues, + lambda x: op(lvalues, x)) else: if is_object_dtype(lvalues): - return _algos.arrmap_object(lvalues, - lambda x: op(x, rvalues)) + return libalgos.arrmap_object(lvalues, + lambda x: op(x, rvalues)) raise def wrapper(left, right, name=name, na_op=na_op): @@ -754,7 +757,7 @@ def na_op(x, y): # in either operand if is_categorical_dtype(x): return op(x, y) - elif is_categorical_dtype(y) and not isscalar(y): + elif is_categorical_dtype(y) and not is_scalar(y): return op(y, x) if is_object_dtype(x.dtype): @@ -770,7 +773,7 @@ def na_op(x, y): raise TypeError("invalid type comparison") # numpy does not like comparisons vs None - if isscalar(y) and isnull(y): + if is_scalar(y) and isnull(y): if name == '__ne__': return np.ones(len(x), dtype=bool) else: @@ -779,11 +782,11 @@ def na_op(x, y): # we have a datetime/timedelta and may need to convert mask = None if (needs_i8_conversion(x) or - (not isscalar(y) and needs_i8_conversion(y))): + (not is_scalar(y) and needs_i8_conversion(y))): - if isscalar(y): + if is_scalar(y): mask = isnull(x) - y = _index.convert_scalar(x, _values_from_object(y)) + y = libindex.convert_scalar(x, _values_from_object(y)) else: mask = isnull(x) | isnull(y) y = y.view('i8') @@ -819,7 +822,7 @@ def wrapper(self, other, axis=None): elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast - if (not lib.isscalar(lib.item_from_zerodim(other)) and + if (not is_scalar(lib.item_from_zerodim(other)) and len(self) != len(other)): raise ValueError('Lengths must match to compare') @@ -855,7 +858,7 @@ def wrapper(self, other, axis=None): with np.errstate(all='ignore'): res = na_op(values, other) - if isscalar(res): + if is_scalar(res): raise TypeError('Could not compare %s type 
with Series' % type(other)) @@ -1333,7 +1336,7 @@ def na_op(x, y): # work only for scalars def f(self, other): - if not isscalar(other): + if not is_scalar(other): raise ValueError('Simple arithmetic with %s can only be ' 'done with scalar values' % self._constructor.__name__) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7bcd9f2d30b79..3279a8f2be39d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -19,15 +19,14 @@ from pandas.core.sparse import SparseDataFrame, SparseSeries from pandas.sparse.array import SparseArray -from pandas._sparse import IntIndex +from pandas.sparse.libsparse import IntIndex from pandas.core.categorical import Categorical, _factorize_from_iterable from pandas.core.sorting import (get_group_index, compress_group_index, decons_obs_group_ids) import pandas.core.algorithms as algos -import pandas.algos as _algos -import pandas._reshape as _reshape +from pandas._libs import algos as _algos, reshape as _reshape from pandas.core.index import MultiIndex, _get_na_value diff --git a/pandas/core/series.py b/pandas/core/series.py index 626a4a81193cc..83036ffef0bed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -60,7 +60,7 @@ from pandas.compat.numpy import function as nv import pandas.core.ops as ops -import pandas.core.algorithms as algos +import pandas.core.algorithms as algorithms import pandas.core.common as com import pandas.core.nanops as nanops @@ -68,10 +68,7 @@ from pandas.util.decorators import Appender, deprecate_kwarg, Substitution from pandas.util.validators import validate_bool_kwarg -import pandas.lib as lib -import pandas.tslib as tslib -import pandas.index as _index - +from pandas._libs import index as libindex, tslib as libts, lib, iNaT from pandas.core.config import get_option __all__ = ['Series'] @@ -294,7 +291,7 @@ def _set_axis(self, axis, labels, fastpath=False): # need to set here becuase we changed the index if fastpath: self._data.set_axis(axis, labels) - except (tslib.OutOfBoundsDatetime, ValueError): + except (libts.OutOfBoundsDatetime, ValueError): # labels may exceeds datetime bounds, # or not be a DatetimeIndex pass @@ -568,7 +565,7 @@ def _ixs(self, i, axis=0): # dispatch to the values if we need values = self._values if isinstance(values, np.ndarray): - return _index.get_value_at(values, i) + return libindex.get_value_at(values, i) else: return values[i] except IndexError: @@ -582,7 +579,7 @@ def _ixs(self, i, axis=0): if isinstance(label, Index): return self.take(i, axis=axis, convert=True) else: - return _index.get_value_at(self, i) + return libindex.get_value_at(self, i) @property def _is_mixed_type(self): @@ -733,7 +730,7 @@ def setitem(key, value): elif is_timedelta64_dtype(self.dtype): # reassign a null value to iNaT if isnull(value): - value = tslib.iNaT + value = iNaT try: self.index._engine.set_value(self._values, key, @@ -1202,7 +1199,7 @@ def mode(self): modes : Series (sorted) """ # TODO: Add option for bins like value_counts() - return algos.mode(self) + return algorithms.mode(self) @Appender(base._shared_docs['unique'] % _shared_doc_kwargs) def unique(self): @@ -1424,7 +1421,7 @@ def diff(self, periods=1): ------- diffed : Series """ - result = algos.diff(_values_from_object(self), periods) + result = algorithms.diff(_values_from_object(self), periods) return self._constructor(result, index=self.index).__finalize__(self) def autocorr(self, lag=1): @@ -1915,7 +1912,8 @@ def nlargest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nlargest(10) # only 
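The setitem branch above, where a null assigned into a timedelta64 Series becomes iNaT, is observable from user code (a small illustration, assuming a build with these changes):

    import numpy as np
    import pandas as pd
    from pandas._libs.tslib import iNaT

    s = pd.Series(pd.to_timedelta([1, 2], unit='d'))
    s[0] = np.nan                            # null -> stored as iNaT

    print(s[0])                              # NaT
    print(s.values.view('i8')[0] == iNaT)    # True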
sorts up to the N requested """ - return algos.select_n_series(self, n=n, keep=keep, method='nlargest') + return algorithms.select_n_series(self, n=n, keep=keep, + method='nlargest') @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @@ -1953,7 +1951,8 @@ def nsmallest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nsmallest(10) # only sorts up to the N requested """ - return algos.select_n_series(self, n=n, keep=keep, method='nsmallest') + return algorithms.select_n_series(self, n=n, keep=keep, + method='nsmallest') def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ @@ -2166,7 +2165,7 @@ def map_f(values, f): arg = self._constructor(arg, index=arg.keys()) indexer = arg.index.get_indexer(values) - new_values = algos.take_1d(arg._values, indexer) + new_values = algorithms.take_1d(arg._values, indexer) else: new_values = map_f(values, arg) @@ -2324,7 +2323,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self # be subclass-friendly - new_values = algos.take_1d(self.get_values(), indexer) + new_values = algorithms.take_1d(self.get_values(), indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -2484,7 +2483,7 @@ def isin(self, values): dtype: bool """ - result = algos.isin(_values_from_object(self), values) + result = algorithms.isin(_values_from_object(self), values) return self._constructor(result, index=self.index).__finalize__(self) def between(self, left, right, inclusive=True): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 71314da7745c0..205d0d94d2ec3 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -7,10 +7,9 @@ _ensure_int64, is_categorical_dtype) from pandas.types.missing import isnull -import pandas.core.algorithms as algos -import pandas.algos as _algos -import pandas.hashtable as _hash -from pandas import lib +import pandas.core.algorithms as algorithms +from pandas._libs import lib, algos, hashtable +from pandas._libs.hashtable import unique_label_indices _INT64_MAX = np.iinfo(np.int64).max @@ -131,7 +130,6 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): xnull: boolean, if nulls are excluded; i.e. 
-1 labels are passed through """ - from pandas.hashtable import unique_label_indices if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8') @@ -250,7 +248,8 @@ def __init__(self, comp_ids, ngroups, levels, labels): self.comp_ids = comp_ids.astype(np.int64) self.k = len(labels) - self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)] + self.tables = [hashtable.Int64HashTable(ngroups) + for _ in range(self.k)] self._populate_tables() @@ -291,7 +290,7 @@ def get_indexer_dict(label_list, keys): def get_group_index_sorter(group_index, ngroups): """ - _algos.groupsort_indexer implements `counting sort` and it is at least + algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where ngroups = prod(shape) shape = map(len, keys) @@ -309,8 +308,8 @@ def get_group_index_sorter(group_index, ngroups): do_groupsort = (count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))) if do_groupsort: - sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), - ngroups) + sorter, _ = algos.groupsort_indexer(_ensure_int64(group_index), + ngroups) return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort') @@ -323,8 +322,8 @@ def compress_group_index(group_index, sort=True): (comp_ids) into the list of unique labels (obs_group_ids). """ - size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) - table = _hash.Int64HashTable(size_hint) + size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) + table = hashtable.Int64HashTable(size_hint) group_index = _ensure_int64(group_index) @@ -348,10 +347,10 @@ def _reorder_by_uniques(uniques, labels): mask = labels < 0 # move labels to right locations (ie, unsort ascending labels) - labels = algos.take_nd(reverse_indexer, labels, allow_fill=False) + labels = algorithms.take_nd(reverse_indexer, labels, allow_fill=False) np.putmask(labels, mask, -1) # sort observed ids - uniques = algos.take_nd(uniques, sorter, allow_fill=False) + uniques = algorithms.take_nd(uniques, sorter, allow_fill=False) return uniques, labels diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 46ba48b4cd846..b5b5d58235eaa 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -18,7 +18,7 @@ from pandas.core.base import AccessorProperty, NoNewAttributesMixin from pandas.util.decorators import Appender import re -import pandas.lib as lib +import pandas._libs.lib as lib import warnings import textwrap import codecs diff --git a/pandas/core/window.py b/pandas/core/window.py index 3f9aa2b0ff392..6fda60c449f42 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -24,13 +24,14 @@ needs_i8_conversion, is_timedelta64_dtype, is_list_like, - _ensure_float64) + _ensure_float64, + is_scalar) import pandas as pd -from pandas.lib import isscalar + from pandas.core.base import (PandasObject, SelectionMixin, GroupByMixin) import pandas.core.common as com -import pandas._window as _window +import pandas.core.libwindow as _window from pandas.tseries.offsets import DateOffset from pandas import compat from pandas.compat.numpy import function as nv @@ -154,7 +155,7 @@ def _gotitem(self, key, ndim, subset=None): self = self._shallow_copy(subset) self._reset_cache() if subset.ndim == 2: - if isscalar(key) and key in subset or is_list_like(key): + if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self diff --git a/pandas/window.pyx b/pandas/core/window.pyx similarity index 99% rename from pandas/window.pyx rename to 
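For reference, the counting sort that the get_group_index_sorter docstring describes can be written out in plain NumPy (a readable stand-in for the cython groupsort_indexer; -1 labels sort to the front, as noted above):

    import numpy as np

    def groupsort_indexer_py(labels, ngroups):
        # bucket 0 collects the -1 (missing) labels, bucket i+1 group i
        counts = np.zeros(ngroups + 1, dtype=np.int64)
        for lab in labels:
            counts[lab + 1] += 1

        # starting offset of each bucket in the sorted output
        where = np.zeros(ngroups + 1, dtype=np.int64)
        where[1:] = np.cumsum(counts)[:-1]

        # stable: equal labels keep their original relative order
        indexer = np.empty(len(labels), dtype=np.int64)
        for i, lab in enumerate(labels):
            indexer[where[lab + 1]] = i
            where[lab + 1] += 1
        return indexer, counts

    labels = np.array([1, 0, 1, -1, 0], dtype=np.int64)
    indexer, counts = groupsort_indexer_py(labels, ngroups=2)
    print(indexer)   # [3 1 4 0 2]: the -1 first, then group 0, then group 1
    print(counts)    # [1 2 2]

Both this and argsort(kind='mergesort') are stable, which is why the do_groupsort heuristic is free to pick whichever of O(count + ngroups) and O(count * log(count)) is smaller.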
pandas/core/window.pyx index 005d42c9f68be..a06e616002ee2 100644 --- a/pandas/window.pyx +++ b/pandas/core/window.pyx @@ -58,7 +58,7 @@ from util cimport numeric from skiplist cimport * -cdef extern from "src/headers/math.h": +cdef extern from "../src/headers/math.h": double sqrt(double x) nogil int signbit(double) nogil diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 622c4cd3bbcc7..d354911a825bc 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -33,8 +33,9 @@ from pandas.io.common import _get_handle, UnicodeWriter, _expand_user from pandas.formats.printing import adjoin, justify, pprint_thing import pandas.core.common as com -import pandas.lib as lib -from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime +import pandas._libs.lib as lib +from pandas._libs.tslib import (iNaT, Timestamp, Timedelta, + format_array_from_datetime) from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex import pandas as pd diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py index 64992e46613e5..a38453e0d2ccc 100644 --- a/pandas/indexes/api.py +++ b/pandas/indexes/api.py @@ -8,7 +8,7 @@ from pandas.indexes.range import RangeIndex # noqa import pandas.core.common as com -import pandas.lib as lib +import pandas._libs.lib as lib # TODO: there are many places that rely on these private methods existing in # pandas.core.index diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index e441d9a88690d..607a463083fdd 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -3,12 +3,10 @@ import operator import numpy as np -import pandas.tslib as tslib -import pandas.lib as lib -import pandas._join as _join -import pandas.algos as _algos -import pandas.index as _index -from pandas.lib import Timestamp, Timedelta, is_datetime_array +from pandas._libs import (lib, index as libindex, tslib as libts, + algos as libalgos, join as libjoin, + Timestamp, Timedelta, ) +from pandas._libs.lib import is_datetime_array from pandas.compat import range, u from pandas.compat.numpy import function as nv @@ -120,11 +118,11 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): _join_precedence = 1 # Cython methods - _arrmap = _algos.arrmap_object - _left_indexer_unique = _join.left_join_indexer_unique_object - _left_indexer = _join.left_join_indexer_object - _inner_indexer = _join.inner_join_indexer_object - _outer_indexer = _join.outer_join_indexer_object + _arrmap = libalgos.arrmap_object + _left_indexer_unique = libjoin.left_join_indexer_unique_object + _left_indexer = libjoin.left_join_indexer_object + _inner_indexer = libjoin.inner_join_indexer_object + _outer_indexer = libjoin.outer_join_indexer_object _box_scalars = False _typ = 'index' @@ -144,7 +142,7 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): # used to infer integers as datetime-likes _infer_as_myclass = False - _engine_type = _index.ObjectEngine + _engine_type = libindex.ObjectEngine def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): @@ -285,7 +283,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except tslib.OutOfBoundsDatetime: + except libts.OutOfBoundsDatetime: pass elif inferred.startswith('timedelta'): @@ -2314,7 +2312,7 @@ def get_value(self, series, key): raise try: - return tslib.get_value_box(s, key) + return libts.get_value_box(s, key) except IndexError: raise 
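For orientation: the hunks above replace the old flat C-extension imports (pandas.lib, pandas.tslib, pandas.index, pandas.algos, pandas._join) with the consolidated pandas._libs package. A minimal sketch of the before/after import style, using only names that appear in the hunks above:

    # old layout -- separate top-level C extensions
    # import pandas.lib as lib
    # import pandas.tslib as tslib
    # import pandas.index as _index

    # new layout, as used throughout this patch
    from pandas._libs import (lib, tslib as libts, index as libindex,
                              algos as libalgos, join as libjoin,
                              Timestamp, Timedelta)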
except TypeError: @@ -2972,7 +2970,6 @@ def _join_level(self, other, level, how='left', return_indexers=False, order of the data indexed by the MultiIndex will not be changed; otherwise, it will tie out with `other`. """ - from pandas.algos import groupsort_indexer from .multi import MultiIndex def _get_leaf_sorter(labels): @@ -2985,7 +2982,7 @@ def _get_leaf_sorter(labels): if len(labels) == 1: lab = _ensure_int64(labels[0]) - sorter, _ = groupsort_indexer(lab, 1 + lab.max()) + sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) return sorter # find indexers of begining of each set of @@ -3051,8 +3048,9 @@ def _get_leaf_sorter(labels): else: # tie out the order with other if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_labels.max() - left_indexer, counts = groupsort_indexer(new_lev_labels, - ngroups) + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_labels, ngroups) + # missing values are placed first; drop them! left_indexer = left_indexer[counts[0]:] new_labels = [lab[left_indexer] for lab in new_labels] @@ -3846,8 +3844,8 @@ def _ensure_index(index_like, copy=False): def _get_na_value(dtype): - return {np.datetime64: tslib.NaT, - np.timedelta64: tslib.NaT}.get(dtype, np.nan) + return {np.datetime64: libts.NaT, + np.timedelta64: libts.NaT}.get(dtype, np.nan) def _ensure_has_len(seq): diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 5299a094156cd..3d8f76fc56b01 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -1,5 +1,5 @@ import numpy as np -import pandas.index as _index +from pandas._libs import index as libindex from pandas import compat from pandas.compat.numpy import function as nv @@ -45,7 +45,7 @@ class CategoricalIndex(Index, base.PandasDelegate): """ _typ = 'categoricalindex' - _engine_type = _index.Int64Engine + _engine_type = libindex.Int64Engine _attributes = ['name'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, @@ -303,7 +303,7 @@ def unique(self): False: 'first'}) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): - from pandas.hashtable import duplicated_int64 + from pandas._libs.hashtable import duplicated_int64 codes = self.codes.astype('i8') return duplicated_int64(codes, keep) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 23a42265a149b..bca1db83b6645 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -6,9 +6,7 @@ from sys import getsizeof import numpy as np -import pandas.lib as lib -import pandas.index as _index -from pandas.lib import Timestamp +from pandas._libs import index as libindex, lib, Timestamp from pandas.compat import range, zip, lrange, lzip, map from pandas.compat.numpy import function as nv @@ -76,7 +74,7 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() _comparables = ['names'] - _engine_type = _index.MultiIndexEngine + _engine_type = libindex.MultiIndexEngine rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, @@ -762,7 +760,7 @@ def f(k, stringify): @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.core.sorting import get_group_index - from pandas.hashtable import duplicated_int64 + from pandas._libs.hashtable import duplicated_int64 shape = map(len, self.levels) ids = get_group_index(self.labels, shape, sort=False, xnull=False) @@ -813,7 +811,7 @@ def _try_mi(k): pass try: - return _index.get_value_at(s, k) + 
return libindex.get_value_at(s, k) except IndexError: raise except TypeError: diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 00ddf5b0c918d..9bb70feb2501f 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -1,9 +1,6 @@ import numpy as np -import pandas.lib as lib -import pandas._join as _join -import pandas.algos as _algos -import pandas.index as _index - +from pandas._libs import (lib, index as libindex, + algos as libalgos, join as libjoin) from pandas.types.common import (is_dtype_equal, pandas_dtype, is_float_dtype, is_object_dtype, is_integer_dtype, is_scalar) @@ -114,16 +111,13 @@ class Int64Index(NumericIndex): __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args _typ = 'int64index' - _arrmap = _algos.arrmap_int64 - _left_indexer_unique = _join.left_join_indexer_unique_int64 - _left_indexer = _join.left_join_indexer_int64 - _inner_indexer = _join.inner_join_indexer_int64 - _outer_indexer = _join.outer_join_indexer_int64 - + _arrmap = libalgos.arrmap_int64 + _left_indexer_unique = libjoin.left_join_indexer_unique_int64 + _left_indexer = libjoin.left_join_indexer_int64 + _inner_indexer = libjoin.inner_join_indexer_int64 + _outer_indexer = libjoin.outer_join_indexer_int64 _can_hold_na = False - - _engine_type = _index.Int64Engine - + _engine_type = libindex.Int64Engine _default_dtype = np.int64 @property @@ -175,17 +169,14 @@ class UInt64Index(NumericIndex): __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args _typ = 'uint64index' - _arrmap = _algos.arrmap_uint64 - _left_indexer_unique = _join.left_join_indexer_unique_uint64 - _left_indexer = _join.left_join_indexer_uint64 - _inner_indexer = _join.inner_join_indexer_uint64 - _outer_indexer = _join.outer_join_indexer_uint64 - + _arrmap = libalgos.arrmap_uint64 + _left_indexer_unique = libjoin.left_join_indexer_unique_uint64 + _left_indexer = libjoin.left_join_indexer_uint64 + _inner_indexer = libjoin.inner_join_indexer_uint64 + _outer_indexer = libjoin.outer_join_indexer_uint64 _can_hold_na = False _na_value = 0 - - _engine_type = _index.UInt64Engine - + _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 @property @@ -255,12 +246,12 @@ class Float64Index(NumericIndex): __doc__ = _num_index_shared_docs['class_descr'] % _float64_descr_args _typ = 'float64index' - _engine_type = _index.Float64Engine - _arrmap = _algos.arrmap_float64 - _left_indexer_unique = _join.left_join_indexer_unique_float64 - _left_indexer = _join.left_join_indexer_float64 - _inner_indexer = _join.inner_join_indexer_float64 - _outer_indexer = _join.outer_join_indexer_float64 + _engine_type = libindex.Float64Engine + _arrmap = libalgos.arrmap_float64 + _left_indexer_unique = libjoin.left_join_indexer_unique_float64 + _left_indexer = libjoin.left_join_indexer_float64 + _inner_indexer = libjoin.inner_join_indexer_float64 + _outer_indexer = libjoin.outer_join_indexer_float64 _default_dtype = np.float64 diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index cc78361f843bf..103a3ac2fd5f4 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -2,7 +2,7 @@ import operator import numpy as np -import pandas.index as _index +from pandas._libs import index as libindex from pandas.types.common import (is_integer, is_scalar, @@ -39,7 +39,7 @@ class RangeIndex(Int64Index): """ _typ = 'rangeindex' - _engine_type = _index.Int64Engine + _engine_type = libindex.Int64Engine def __new__(cls, start=None, stop=None, step=None, name=None, dtype=None, fastpath=False, 
copy=False, **kwargs): diff --git a/pandas/io/api.py b/pandas/io/api.py index 1284b3cb222d6..e312e7bc2f300 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -11,7 +11,7 @@ from pandas.io.json import read_json from pandas.io.html import read_html from pandas.io.sql import read_sql, read_sql_table, read_sql_query -from pandas.io.sas.sasreader import read_sas +from pandas.io.sas import read_sas from pandas.io.feather_format import read_feather from pandas.io.stata import read_stata from pandas.io.pickle import read_pickle, to_pickle diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 3ffcef4b21552..080d6c3e273a3 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,7 +1,7 @@ """This module is designed for community supported date conversion functions""" from pandas.compat import range, map import numpy as np -import pandas.lib as lib +import pandas._libs.lib as lib def parse_date_time(date_col, time_col): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 00ec8bcf060ef..82ea2e8a46592 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -19,7 +19,7 @@ EmptyDataError, get_filepath_or_buffer, _NA_VALUES) from pandas.tseries.period import Period -from pandas import json +from pandas.io.json import libjson from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, string_types, OrderedDict) from pandas.core import config @@ -1450,7 +1450,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, elif isinstance(cell.val, date): num_format_str = self.date_format - stylekey = json.dumps(cell.style) + stylekey = libjson.dumps(cell.style) if num_format_str: stylekey += num_format_str @@ -1578,7 +1578,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, elif isinstance(cell.val, date): num_format_str = self.date_format - stylekey = json.dumps(cell.style) + stylekey = libjson.dumps(cell.style) if num_format_str: stylekey += num_format_str diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index a00d3492e8a37..114ec4bb2723e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -2,8 +2,8 @@ import os import numpy as np -import pandas.json as _json -from pandas.tslib import iNaT +from pandas.io.json import libjson +from pandas._libs.tslib import iNaT from pandas.compat import StringIO, long, u from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime @@ -14,8 +14,8 @@ from .table_schema import build_table_schema from pandas.types.common import is_period_dtype -loads = _json.loads -dumps = _json.dumps +loads = libjson.loads +dumps = libjson.dumps TABLE_SCHEMA_VERSION = '0.20.0' diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 0e7d025e81851..4da4a6ad57850 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -5,7 +5,7 @@ from collections import defaultdict import numpy as np -from pandas.lib import convert_json_to_lines +from pandas._libs.lib import convert_json_to_lines from pandas import compat, DataFrame diff --git a/pandas/msgpack/__init__.py b/pandas/io/msgpack/__init__.py similarity index 81% rename from pandas/msgpack/__init__.py rename to pandas/io/msgpack/__init__.py index 4d6e241171281..984e90ee03e69 100644 --- a/pandas/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -2,8 +2,8 @@ from collections import namedtuple -from pandas.msgpack.exceptions import * # noqa -from pandas.msgpack._version import version # noqa +from pandas.io.msgpack.exceptions import * 
# noqa +from pandas.io.msgpack._version import version # noqa class ExtType(namedtuple('ExtType', 'code data')): @@ -19,8 +19,8 @@ def __new__(cls, code, data): import os # noqa -from pandas.msgpack._packer import Packer # noqa -from pandas.msgpack._unpacker import unpack, unpackb, Unpacker # noqa +from pandas.io.msgpack._packer import Packer # noqa +from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa def pack(o, stream, **kwargs): diff --git a/pandas/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx similarity index 98% rename from pandas/msgpack/_packer.pyx rename to pandas/io/msgpack/_packer.pyx index 008dbe5541d50..ad7ce1fb2531a 100644 --- a/pandas/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -6,11 +6,11 @@ from libc.stdlib cimport * from libc.string cimport * from libc.limits cimport * -from pandas.msgpack.exceptions import PackValueError -from pandas.msgpack import ExtType +from pandas.io.msgpack.exceptions import PackValueError +from pandas.io.msgpack import ExtType -cdef extern from "../src/msgpack/pack.h": +cdef extern from "../../src/msgpack/pack.h": struct msgpack_packer: char* buf size_t length diff --git a/pandas/msgpack/_unpacker.pyx b/pandas/io/msgpack/_unpacker.pyx similarity index 98% rename from pandas/msgpack/_unpacker.pyx rename to pandas/io/msgpack/_unpacker.pyx index 6f23a24adde6c..504bfed48df3c 100644 --- a/pandas/msgpack/_unpacker.pyx +++ b/pandas/io/msgpack/_unpacker.pyx @@ -11,12 +11,12 @@ from libc.stdlib cimport * from libc.string cimport * from libc.limits cimport * -from pandas.msgpack.exceptions import (BufferFull, OutOfData, - UnpackValueError, ExtraData) -from pandas.msgpack import ExtType +from pandas.io.msgpack.exceptions import (BufferFull, OutOfData, + UnpackValueError, ExtraData) +from pandas.io.msgpack import ExtType -cdef extern from "../src/msgpack/unpack.h": +cdef extern from "../../src/msgpack/unpack.h": ctypedef struct msgpack_user: bint use_list PyObject* object_hook diff --git a/pandas/msgpack/_version.py b/pandas/io/msgpack/_version.py similarity index 100% rename from pandas/msgpack/_version.py rename to pandas/io/msgpack/_version.py diff --git a/pandas/msgpack/exceptions.py b/pandas/io/msgpack/exceptions.py similarity index 100% rename from pandas/msgpack/exceptions.py rename to pandas/io/msgpack/exceptions.py diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 39bc1a4ecf225..404be758a7fbe 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -55,7 +55,7 @@ Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, Categorical, CategoricalIndex) -from pandas.tslib import NaTType +from pandas._libs.tslib import NaTType from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame @@ -64,7 +64,7 @@ from pandas.core.internals import BlockManager, make_block, _safe_reshape import pandas.core.internals as internals -from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType +from pandas.io.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType from pandas.util._move import ( BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer, diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 811844ec35deb..9aedddc811830 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -36,8 +36,8 @@ from pandas.util.decorators import Appender -import pandas.lib as lib -import pandas.parser as _parser +import 
pandas._libs.lib as lib +import pandas.io.libparsers as libparsers # BOM character (byte order mark) @@ -1415,7 +1415,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( - cvals, _parser.na_values, + cvals, libparsers.na_values, self.use_unsigned) result[c] = cvals @@ -1533,7 +1533,7 @@ def __init__(self, src, **kwds): # #2442 kwds['allow_leading_cols'] = self.index_col is not False - self._reader = _parser.TextReader(src, **kwds) + self._reader = libparsers.TextReader(src, **kwds) # XXX self.usecols, self.usecols_dtype = _validate_usecols_arg( diff --git a/pandas/parser.pyx b/pandas/io/parsers.pyx similarity index 99% rename from pandas/parser.pyx rename to pandas/io/parsers.pyx index 23aee860b3108..a5858accbb6f5 100644 --- a/pandas/parser.pyx +++ b/pandas/io/parsers.pyx @@ -13,11 +13,12 @@ from cpython cimport (PyObject, PyBytes_FromString, PyUnicode_Check, PyUnicode_AsUTF8String, PyErr_Occurred, PyErr_Fetch) from cpython.ref cimport PyObject, Py_XDECREF -from io.common import ParserError, DtypeWarning, EmptyDataError, ParserWarning +from pandas.io.common import (ParserError, DtypeWarning, + EmptyDataError, ParserWarning) # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. -from io.common import CParserError +from pandas.io.common import CParserError cdef extern from "Python.h": object PyUnicode_FromString(char *v) @@ -36,7 +37,7 @@ from numpy cimport ndarray, uint8_t, uint64_t import numpy as np cimport util -import pandas.lib as lib +import pandas._libs.lib as lib import pandas.compat as compat from pandas.types.common import (is_categorical_dtype, CategoricalDtype, is_integer_dtype, is_float_dtype, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a8fdd5fcfe204..e4f72f901a417 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -44,9 +44,7 @@ from pandas.core.config import get_option from pandas.computation.pytables import Expr, maybe_expression -import pandas.lib as lib -import pandas.algos as algos -import pandas.tslib as tslib +from pandas._libs import tslib, algos, lib from distutils.version import LooseVersion diff --git a/pandas/io/sas/__init__.py b/pandas/io/sas/__init__.py index e69de29bb2d1d..fa6b29a1a3fcc 100644 --- a/pandas/io/sas/__init__.py +++ b/pandas/io/sas/__init__.py @@ -0,0 +1 @@ +from .sasreader import read_sas # noqa diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/sas.pyx similarity index 100% rename from pandas/io/sas/saslib.pyx rename to pandas/io/sas/sas.pyx diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 91f417abc0502..d33cee2c5a1bc 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -20,7 +20,7 @@ import numpy as np import struct import pandas.io.sas.sas_constants as const -from pandas.io.sas.saslib import Parser +from pandas.io.sas.libsas import Parser class _subheader_pointer(object): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 2ab642b3af0c7..b210baedaaf6d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -11,7 +11,7 @@ import re import numpy as np -import pandas.lib as lib +import pandas._libs.lib as lib from pandas.types.missing import isnull from pandas.types.dtypes import DatetimeTZDtype from pandas.types.common import (is_list_like, is_dict_like, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1698ade4c0102..af4bc6a6b7ddb 100644 --- 
a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -30,8 +30,8 @@ import pandas as pd from pandas.io.common import get_filepath_or_buffer, BaseIterator -from pandas.lib import max_len_string_array, infer_dtype -from pandas.tslib import NaT, Timestamp +from pandas._libs.lib import max_len_string_array, infer_dtype +from pandas._libs.tslib import NaT, Timestamp _version_error = ("Version of given Stata file is not 104, 105, 108, " "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " diff --git a/pandas/json.py b/pandas/json.py new file mode 100644 index 0000000000000..5b1e395fa4b74 --- /dev/null +++ b/pandas/json.py @@ -0,0 +1,7 @@ +# flake8: noqa + +import warnings +warnings.warn("The pandas.json module is deprecated and will be " + "removed in a future version. Please import from " + "pandas.io.json instead", FutureWarning, stacklevel=2) +from pandas.io.json.libjson import dumps, loads diff --git a/pandas/lib.py b/pandas/lib.py new file mode 100644 index 0000000000000..6c26627a97de3 --- /dev/null +++ b/pandas/lib.py @@ -0,0 +1,7 @@ +# flake8: noqa + +import warnings +warnings.warn("The pandas.lib module is deprecated and will be " + "removed in a future version. Please import from " + "pandas._libs.lib instead", FutureWarning, stacklevel=2) +from pandas._libs.lib import * diff --git a/pandas/parser.py b/pandas/parser.py new file mode 100644 index 0000000000000..af203c3df8cc9 --- /dev/null +++ b/pandas/parser.py @@ -0,0 +1,8 @@ +# flake8: noqa + +import warnings +warnings.warn("The pandas.parser module is deprecated and will be " + "removed in a future version. Please import from " + "pandas.io.parsers instead", FutureWarning, stacklevel=2) +from pandas.io.libparsers import na_values +from pandas.io.common import CParserError diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index c65e0dd5c9f7b..762b6d869eae0 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -25,9 +25,9 @@ _astype_nansafe, _find_common_type) from pandas.types.missing import isnull, notnull, na_value_for_dtype -from pandas._sparse import SparseIndex, BlockIndex, IntIndex -import pandas._sparse as splib -import pandas.index as _index +from pandas.sparse import libsparse as splib +from pandas.sparse.libsparse import SparseIndex, BlockIndex, IntIndex +from pandas._libs import index as libindex import pandas.core.algorithms as algos import pandas.core.ops as ops import pandas.formats.printing as printing @@ -447,7 +447,7 @@ def _get_val_at(self, loc): if sp_loc == -1: return self.fill_value else: - return _index.get_value_at(self, sp_loc) + return libindex.get_value_at(self, sp_loc) @Appender(_index_shared_docs['take'] % _sparray_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py index d294e65bbf10c..54ebf5e51045d 100644 --- a/pandas/sparse/list.py +++ b/pandas/sparse/list.py @@ -6,7 +6,7 @@ from pandas.types.common import is_scalar from pandas.sparse.array import SparseArray from pandas.util.validators import validate_bool_kwarg -import pandas._sparse as splib +import pandas.sparse.libsparse as splib class SparseList(PandasObject): diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index c3dd089e8409a..7ec42f02c3998 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -20,13 +20,13 @@ from pandas.core import generic import pandas.core.common as com import pandas.core.ops as ops -import pandas.index as _index +import pandas._libs.index as _index from pandas.util.decorators import Appender
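The new top-level pandas/json.py, pandas/lib.py and pandas/parser.py files above all follow the same compatibility-shim pattern: emit a FutureWarning once at import time, then re-export the public names from the relocated module. A minimal standalone sketch of that pattern (old_module and new_package.new_module are placeholder names, not part of this patch):

    # old_module.py -- compatibility shim for a relocated module
    # (placeholder names; the real shims above target pandas.io.json.libjson,
    # pandas._libs.lib and pandas.io.libparsers respectively)
    # flake8: noqa
    import warnings
    warnings.warn("The old_module module is deprecated and will be "
                  "removed in a future version. Please import from "
                  "new_package.new_module instead", FutureWarning,
                  stacklevel=2)
    # re-export the public names from the new location
    from new_package.new_module import *

stacklevel=2 attributes the warning to the code performing the import rather than to the shim module itself.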
from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray, _make_index) -from pandas._sparse import BlockIndex, IntIndex -import pandas._sparse as splib +from pandas.sparse.libsparse import BlockIndex, IntIndex +import pandas.sparse.libsparse as splib from pandas.sparse.scipy_sparse import (_sparse_series_to_coo, _coo_to_sparse_series) diff --git a/pandas/src/sparse.pyx b/pandas/sparse/sparse.pyx similarity index 100% rename from pandas/src/sparse.pyx rename to pandas/sparse/sparse.pyx diff --git a/pandas/src/sparse_op_helper.pxi.in b/pandas/sparse/sparse_op_helper.pxi.in similarity index 100% rename from pandas/src/sparse_op_helper.pxi.in rename to pandas/sparse/sparse_op_helper.pxi.in diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 2f8ebc4cc1df4..db92210478182 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from warnings import catch_warnings import numpy as np import pandas as pd @@ -33,16 +34,12 @@ class TestPDApi(Base, tm.TestCase): # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', 'indexes', 'formats', 'pandas', - 'test', 'tools', 'tseries', + 'test', 'tools', 'tseries', 'sparse', 'types', 'util', 'options', 'io'] - # top-level packages that are c-imports, should rename to _* - # to avoid naming conflicts - lib_to_rename = ['algos', 'hashtable', 'tslib', 'msgpack', 'sparse', - 'json', 'lib', 'index', 'parser'] - # these are already deprecated; awaiting removal - deprecated_modules = ['stats', 'datetools'] + deprecated_modules = ['stats', 'datetools', 'parser', + 'json', 'lib', 'tslib'] # misc misc = ['IndexSlice', 'NaT'] @@ -113,7 +110,7 @@ class TestPDApi(Base, tm.TestCase): def test_api(self): self.check(pd, - self.lib + self.lib_to_rename + self.misc + + self.lib + self.misc + self.modules + self.deprecated_modules + self.classes + self.deprecated_classes + self.deprecated_classes_in_future + @@ -206,7 +203,7 @@ def test_removed_from_core_common(self): self.assertRaises(AttributeError, lambda: getattr(com, t)) -class TestDatetools(tm.TestCase): +class TestDatetoolsDeprecation(tm.TestCase): def test_deprecation_access_func(self): with tm.assert_produces_warning(FutureWarning, @@ -247,3 +244,36 @@ def test_groupby(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): pd.groupby(pd.Series([1, 2, 3]), [1, 1, 1]) + + +class TestJson(tm.TestCase): + + def test_deprecation_access_func(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.json.dumps([]) + + +class TestParser(tm.TestCase): + + def test_deprecation_access_func(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.parser.na_values + + +class TestLib(tm.TestCase): + + def test_deprecation_access_func(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.lib.infer_dtype + + +class TestTSLib(tm.TestCase): + + def test_deprecation_access_func(self): + # some libraries may be imported before we + # test and could show the warning + with catch_warnings(record=True): + pd.tslib.Timestamp diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index b42f79fe5009b..ed6006440441e 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -28,7 +28,7 @@ import pandas.computation.expr as expr import pandas.util.testing as tm -import pandas.lib as lib +import pandas._libs.lib as lib from 
pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_numpy_array_equal, assert_produces_warning, assert_series_equal, diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 76eb61bd81110..ba7e45d7e66fb 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -23,7 +23,7 @@ from pandas.core.common import PandasError import pandas as pd import pandas.core.common as com -import pandas.lib as lib +import pandas._libs.lib as lib import pandas.util.testing as tm from pandas.tests.frame.common import TestData diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 36c39ffba70b3..f0dfc4553886b 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -18,6 +18,7 @@ date_range) import pandas as pd +from pandas._libs.tslib import iNaT from pandas.tseries.offsets import BDay from pandas.types.common import (is_float_dtype, is_integer, @@ -1491,8 +1492,7 @@ def test_setitem_single_column_mixed_datetime(self): assert_series_equal(result, expected) # set an allowable datetime64 type - from pandas import tslib - df.loc['b', 'timestamp'] = tslib.iNaT + df.loc['b', 'timestamp'] = iNaT self.assertTrue(isnull(df.loc['b', 'timestamp'])) # allow this syntax diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 471fc536a90f6..e49dfffc48803 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -8,7 +8,7 @@ import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) -from pandas.parser import ParserError +from pandas.io.common import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, date_range, read_csv, compat, to_datetime) import pandas as pd diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 51a10f4141ab5..77c5bde332cff 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -7,8 +7,8 @@ from pandas import Index, isnull from pandas.util.testing import assert_almost_equal import pandas.util.testing as tm -import pandas.lib as lib -import pandas.algos as algos +import pandas._libs.lib as lib +import pandas._libs.algos as algos def test_series_grouper(): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 2d21eab5822fe..4acf9dd4755f4 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -6,6 +6,7 @@ from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range from pandas.types.common import _ensure_platform_int, is_timedelta64_dtype from pandas.compat import StringIO +from pandas._libs import algos from .common import MixIn, assert_fp_equal from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -417,8 +418,8 @@ def test_cython_group_transform_algos(self): dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, np.uint64, np.float32, np.float64] - ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]), - (pd.algos.group_cumsum, np.cumsum, dtypes)] + ops = [(algos.group_cumprod_float64, np.cumproduct, [np.float64]), + (algos.group_cumsum, np.cumsum, dtypes)] is_datetimelike = False for pd_op, np_op, dtypes in ops: @@ -436,13 +437,13 @@ def test_cython_group_transform_algos(self): data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') 
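For intuition about what the relocated algos.group_cumsum computes in this test: a per-group running sum driven by an int64 labels array, where a -1 label (or a missing value) yields NaN without advancing that group's total. A pure-NumPy reference sketch (group_cumsum_ref is a hypothetical helper for illustration, not the pandas API; the Cython version additionally handles the datetime-like path and writes into a preallocated output):

    import numpy as np

    def group_cumsum_ref(values, labels, ngroups):
        # one pass over the data with a running accumulator per group
        out = np.empty(len(values), dtype='float64')
        acc = np.zeros(ngroups, dtype='float64')
        for i, (val, lab) in enumerate(zip(values, labels)):
            if lab < 0 or np.isnan(val):
                out[i] = np.nan      # missing rows don't advance the total
                continue
            acc[lab] += val
            out[i] = acc[lab]
        return out

    data = np.array([1., 2., 3., np.nan, 4.])
    labels = np.zeros(5, dtype=np.int64)
    group_cumsum_ref(data, labels, ngroups=1)
    # -> [1., 3., 6., nan, 10.], the expected values asserted below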
actual = np.zeros_like(data) actual.fill(np.nan) - pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike) + algos.group_cumprod_float64(actual, data, labels, is_datetimelike) expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') self.assert_numpy_array_equal(actual[:, 0], expected) actual = np.zeros_like(data) actual.fill(np.nan) - pd.algos.group_cumsum(actual, data, labels, is_datetimelike) + algos.group_cumsum(actual, data, labels, is_datetimelike) expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') self.assert_numpy_array_equal(actual[:, 0], expected) @@ -450,8 +451,8 @@ def test_cython_group_transform_algos(self): is_datetimelike = True data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] actual = np.zeros_like(data, dtype='int64') - pd.algos.group_cumsum(actual, data.view('int64'), labels, - is_datetimelike) + algos.group_cumsum(actual, data.view('int64'), labels, + is_datetimelike) expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), np.timedelta64(5, 'ns')]) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 7b39a33266ffa..3581f894e53a3 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -10,6 +10,7 @@ TimedeltaIndex, PeriodIndex, notnull, isnull) from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp +from pandas._libs.tslib import iNaT import pandas.util.testing as tm @@ -322,7 +323,7 @@ def test_get_unique_index(self): if needs_i8_conversion(ind): vals = ind.asi8[[0] * 5] - vals[0] = pd.tslib.iNaT + vals[0] = iNaT else: vals = ind.values[[0] * 5] vals[0] = np.nan @@ -407,7 +408,7 @@ def test_numpy_argsort(self): # pandas compatibility input validation - the # rest already perform separate (or no) such # validation via their 'values' attribute as - # defined in pandas/indexes/base.py - they + # defined in pandas.indexes/base.py - they # cannot be changed at the moment due to # backwards compatibility concerns if isinstance(type(ind), (CategoricalIndex, RangeIndex)): @@ -836,7 +837,7 @@ def test_hasnans_isnans(self): if len(index) == 0: continue elif isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin): - values[1] = pd.tslib.iNaT + values[1] = iNaT elif isinstance(index, (Int64Index, UInt64Index)): continue else: @@ -876,7 +877,7 @@ def test_fillna(self): values = idx.values if isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin): - values[1] = pd.tslib.iNaT + values[1] = iNaT elif isinstance(index, (Int64Index, UInt64Index)): continue else: diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 772d76305cff2..16881de6e8c39 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -2,9 +2,10 @@ from datetime import timedelta import pandas as pd -from pandas import tslib, offsets, lib +from pandas import offsets import pandas.util.testing as tm -from pandas.tslib import OutOfBoundsDatetime +from pandas._libs import tslib, lib +from pandas._libs.tslib import OutOfBoundsDatetime from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range, to_datetime) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 80664ce246bf8..67e82e5c71d75 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ 
b/pandas/tests/indexes/datetimes/test_date_range.py @@ -350,7 +350,7 @@ def test_range_tz_dateutil(self): # GH 2906 tm._skip_if_no_dateutil() # Use maybe_get_tz to fix filename in tz under dateutil. - from pandas.tslib import maybe_get_tz + from pandas._libs.tslib import maybe_get_tz tz = lambda x: maybe_get_tz('dateutil/' + x) start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 2c87c48bcda11..78c37f773547a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -117,7 +117,7 @@ def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): def test_time_loc(self): # GH8667 from datetime import time - from pandas.index import _SIZE_CUTOFF + from pandas._libs.index import _SIZE_CUTOFF ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) key = time(15, 11, 30) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 312017eef3446..4abc282252559 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -5,7 +5,7 @@ from itertools import product import pandas as pd -import pandas.tslib as tslib +import pandas._libs.tslib as tslib import pandas.util.testing as tm from pandas.core.common import PerformanceWarning from pandas.tseries.index import cdate_range diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 8d05a4016ba45..a1ad147f84aff 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -326,7 +326,7 @@ def test_month_range_union_tz_pytz(self): def test_month_range_union_tz_dateutil(self): tm._skip_if_windows_python_3() tm._skip_if_no_dateutil() - from pandas.tslib import _dateutil_gettz as timezone + from pandas._libs.tslib import _dateutil_gettz as timezone tz = timezone('US/Eastern') early_start = datetime(2011, 1, 1) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 1b67ffce63b10..512a3e1c38629 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -9,7 +9,7 @@ from distutils.version import LooseVersion import pandas as pd -from pandas import tslib +from pandas._libs import tslib, lib from pandas.tseries import tools from pandas.tseries.tools import normalize_date from pandas.compat import lmap @@ -19,7 +19,7 @@ from pandas.util.testing import assert_series_equal, _skip_if_has_locale from pandas import (isnull, to_datetime, Timestamp, Series, DataFrame, Index, DatetimeIndex, NaT, date_range, bdate_range, - compat, lib) + compat) class TimeConversionFormats(tm.TestCase): diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 8d9e26406defc..ff83b50a2a7b2 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -4,8 +4,9 @@ import pandas as pd from pandas.util import testing as tm from pandas.compat import lrange +from pandas._libs import tslib from pandas import (PeriodIndex, Series, DatetimeIndex, - period_range, Period, tslib, _np_version_under1p9) + period_range, Period, _np_version_under1p9) class TestGetItem(tm.TestCase): diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 82a881d7c65bc..4533428cf1514 
100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -2,7 +2,7 @@ from datetime import timedelta import pandas as pd -import pandas.tslib as tslib +import pandas._libs.tslib as tslib import pandas.util.testing as tm import pandas.tseries.period as period from pandas import (DatetimeIndex, PeriodIndex, period_range, Series, Period, diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index e09d405afd375..f9a1df3d824f1 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -6,7 +6,7 @@ import pandas.tseries.period as period from pandas.compat import lrange from pandas.tseries.frequencies import get_freq, MONTHS -from pandas._period import period_ordinal, period_asfreq +from pandas._libs.period import period_ordinal, period_asfreq from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, Series, date_range, to_datetime, period_range) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 79d10cbda565e..8c0a399cb58b3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -24,7 +24,7 @@ from pandas.tseries.index import _to_m8 import pandas as pd -from pandas.lib import Timestamp +from pandas._libs.lib import Timestamp class TestIndex(Base, tm.TestCase): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 80ff67ab3d043..f67231e78983c 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -17,7 +17,8 @@ from pandas.compat import PY3, long, lrange, lzip, range, u from pandas.core.common import PerformanceWarning, UnsortedIndexError from pandas.indexes.base import InvalidIndexError -from pandas.lib import Timestamp +from pandas._libs import lib +from pandas._libs.lib import Timestamp import pandas.util.testing as tm @@ -851,7 +852,7 @@ def test_from_product_invalid_input(self): def test_from_product_datetimeindex(self): dt_index = date_range('2000-01-01', periods=2) mi = pd.MultiIndex.from_product([[1, 2], dt_index]) - etalon = pd.lib.list_to_object_array([(1, pd.Timestamp( + etalon = lib.list_to_object_array([(1, pd.Timestamp( '2000-01-01')), (1, pd.Timestamp('2000-01-02')), (2, pd.Timestamp( '2000-01-01')), (2, pd.Timestamp('2000-01-02'))]) tm.assert_numpy_array_equal(mi.values, etalon) @@ -878,7 +879,7 @@ def test_values_boxed(self): (3, pd.Timestamp('2000-01-03'))] mi = pd.MultiIndex.from_tuples(tuples) tm.assert_numpy_array_equal(mi.values, - pd.lib.list_to_object_array(tuples)) + lib.list_to_object_array(tuples)) # Check that code branches for boxed values produce identical results tm.assert_numpy_array_equal(mi.values[:4], mi[:4].values) @@ -2181,7 +2182,7 @@ def check(nlevels, with_nulls): for keep in ['first', 'last', False]: left = mi.duplicated(keep=keep) - right = pd.hashtable.duplicated_object(mi.values, keep=keep) + right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 1bf9a10628542..e23e7c19ed799 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -11,7 +11,7 @@ import pandas.util.testing as tm import pandas as pd -from pandas.lib import Timestamp +from pandas._libs.lib import Timestamp from pandas.tests.indexes.common import Base diff --git 
a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 0810b13eb0f53..9a3dd1c6bca71 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -3,9 +3,7 @@ import pandas as pd import pandas.util.testing as tm -from pandas import TimedeltaIndex, timedelta_range, tslib, to_timedelta - -iNaT = tslib.iNaT +from pandas import TimedeltaIndex, timedelta_range, to_timedelta class TestTimedeltaIndex(tm.TestCase): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 406a5bdbf3bcd..8c7b88a9cf2ca 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -8,8 +8,8 @@ from pandas.util.testing import assert_series_equal, assert_frame_equal from pandas import (Series, Timedelta, DataFrame, Timestamp, TimedeltaIndex, timedelta_range, date_range, DatetimeIndex, Int64Index, - _np_version_under1p10, Float64Index, Index, tslib) - + _np_version_under1p10, Float64Index, Index) +from pandas._libs.tslib import iNaT from pandas.tests.test_base import Ops @@ -772,7 +772,7 @@ def test_nat_new(self): tm.assert_index_equal(result, exp) result = idx._nat_new(box=False) - exp = np.array([tslib.iNaT] * 5, dtype=np.int64) + exp = np.array([iNaT] * 5, dtype=np.int64) tm.assert_numpy_array_equal(result, exp) def test_shift(self): diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 2442051547312..ade9366c7e994 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -4,8 +4,9 @@ import pandas as pd import pandas.util.testing as tm from pandas.util.testing import assert_series_equal -from pandas import (Series, Timedelta, to_timedelta, tslib, isnull, +from pandas import (Series, Timedelta, to_timedelta, isnull, TimedeltaIndex) +from pandas._libs.tslib import iNaT class TestTimedeltas(tm.TestCase): @@ -26,7 +27,7 @@ def conv(v): # empty string result = to_timedelta('', box=False) - self.assertEqual(result.astype('int64'), tslib.iNaT) + self.assertEqual(result.astype('int64'), iNaT) result = to_timedelta(['', '']) self.assertTrue(isnull(result).all()) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f7a4af711bbb8..4502e0171dfbe 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -9,7 +9,7 @@ is_float_dtype, is_scalar) from pandas.compat import range, lrange, lzip, StringIO, lmap -from pandas.tslib import NaT +from pandas._libs.tslib import NaT from numpy import nan from numpy.random import randn import numpy as np diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c298b3841096c..7dbcf25c60b45 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -637,13 +637,14 @@ def test_convert_dates(self): def test_convert_dates_infer(self): # GH10747 + from pandas.io.json import dumps infer_words = ['trade_time', 'date', 'datetime', 'sold_at', 'modified', 'timestamp', 'timestamps'] for infer_word in infer_words: data = [{'id': 1, infer_word: 1036713600000}, {'id': 2}] expected = DataFrame([[1, Timestamp('2002-11-08')], [2, pd.NaT]], columns=['id', infer_word]) - result = read_json(pd.json.dumps(data))[['id', infer_word]] + result = read_json(dumps(data))[['id', infer_word]] assert_frame_equal(result, expected) def 
test_date_format_frame(self): @@ -910,50 +911,53 @@ def test_sparse(self): self.assertEqual(expected, ss.to_json()) def test_tz_is_utc(self): + from pandas.io.json import dumps exp = '"2013-01-10T05:00:00.000Z"' ts = Timestamp('2013-01-10 05:00:00Z') - self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) + self.assertEqual(exp, dumps(ts, iso_dates=True)) dt = ts.to_pydatetime() - self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) + self.assertEqual(exp, dumps(dt, iso_dates=True)) ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern') - self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) + self.assertEqual(exp, dumps(ts, iso_dates=True)) dt = ts.to_pydatetime() - self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) + self.assertEqual(exp, dumps(dt, iso_dates=True)) ts = Timestamp('2013-01-10 00:00:00-0500') - self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) + self.assertEqual(exp, dumps(ts, iso_dates=True)) dt = ts.to_pydatetime() - self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) + self.assertEqual(exp, dumps(dt, iso_dates=True)) def test_tz_range_is_utc(self): + from pandas.io.json import dumps + exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' dfexp = ('{"DT":{' '"0":"2013-01-01T05:00:00.000Z",' '"1":"2013-01-02T05:00:00.000Z"}}') tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2) - self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True)) + self.assertEqual(exp, dumps(tz_range, iso_dates=True)) dti = pd.DatetimeIndex(tz_range) - self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True)) + self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + self.assertEqual(dfexp, dumps(df, iso_dates=True)) tz_range = pd.date_range('2013-01-01 00:00:00', periods=2, tz='US/Eastern') - self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True)) + self.assertEqual(exp, dumps(tz_range, iso_dates=True)) dti = pd.DatetimeIndex(tz_range) - self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True)) + self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + self.assertEqual(dfexp, dumps(df, iso_dates=True)) tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2) - self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True)) + self.assertEqual(exp, dumps(tz_range, iso_dates=True)) dti = pd.DatetimeIndex(tz_range) - self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True)) + self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + self.assertEqual(dfexp, dumps(df, iso_dates=True)) def test_read_jsonl(self): # GH9180 diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 6a986710ae444..e66721beed288 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -17,7 +17,7 @@ import decimal from functools import partial from pandas.compat import range, zip, StringIO, u -import pandas.json as ujson +import pandas.io.json.libjson as ujson import pandas.compat as compat import numpy as np @@ -400,7 +400,7 @@ def test_npy_nat(self): assert ujson.encode(input) == 'null', "Expected null" def test_datetime_units(self): - from pandas.lib import Timestamp + from pandas._libs.lib import Timestamp val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) stamp = Timestamp(val) diff --git a/pandas/tests/msgpack/__init__.py 
b/pandas/tests/io/msgpack/__init__.py similarity index 100% rename from pandas/tests/msgpack/__init__.py rename to pandas/tests/io/msgpack/__init__.py diff --git a/pandas/tests/msgpack/test_buffer.py b/pandas/tests/io/msgpack/test_buffer.py similarity index 90% rename from pandas/tests/msgpack/test_buffer.py rename to pandas/tests/io/msgpack/test_buffer.py index caaa22bfd08fc..5a2dc3dba5dfa 100644 --- a/pandas/tests/msgpack/test_buffer.py +++ b/pandas/tests/io/msgpack/test_buffer.py @@ -1,6 +1,6 @@ # coding: utf-8 -from pandas.msgpack import packb, unpackb +from pandas.io.msgpack import packb, unpackb def test_unpack_buffer(): diff --git a/pandas/tests/msgpack/test_case.py b/pandas/tests/io/msgpack/test_case.py similarity index 98% rename from pandas/tests/msgpack/test_case.py rename to pandas/tests/io/msgpack/test_case.py index a8a45b5b37eb0..3927693a94dd8 100644 --- a/pandas/tests/msgpack/test_case.py +++ b/pandas/tests/io/msgpack/test_case.py @@ -1,6 +1,6 @@ # coding: utf-8 -from pandas.msgpack import packb, unpackb +from pandas.io.msgpack import packb, unpackb def check(length, obj): diff --git a/pandas/tests/msgpack/test_except.py b/pandas/tests/io/msgpack/test_except.py similarity index 96% rename from pandas/tests/msgpack/test_except.py rename to pandas/tests/io/msgpack/test_except.py index 76b91bb375bbc..4bcef3607bfa4 100644 --- a/pandas/tests/msgpack/test_except.py +++ b/pandas/tests/io/msgpack/test_except.py @@ -1,7 +1,7 @@ # coding: utf-8 import unittest -from pandas.msgpack import packb, unpackb +from pandas.io.msgpack import packb, unpackb class DummyException(Exception): diff --git a/pandas/tests/msgpack/test_extension.py b/pandas/tests/io/msgpack/test_extension.py similarity index 96% rename from pandas/tests/msgpack/test_extension.py rename to pandas/tests/io/msgpack/test_extension.py index 97f0962a753d9..a5a111efbb835 100644 --- a/pandas/tests/msgpack/test_extension.py +++ b/pandas/tests/io/msgpack/test_extension.py @@ -1,7 +1,7 @@ from __future__ import print_function import array -import pandas.msgpack as msgpack -from pandas.msgpack import ExtType +import pandas.io.msgpack as msgpack +from pandas.io.msgpack import ExtType def test_pack_ext_type(): diff --git a/pandas/tests/msgpack/test_format.py b/pandas/tests/io/msgpack/test_format.py similarity index 98% rename from pandas/tests/msgpack/test_format.py rename to pandas/tests/io/msgpack/test_format.py index a4b309ebb657d..3659602e1381f 100644 --- a/pandas/tests/msgpack/test_format.py +++ b/pandas/tests/io/msgpack/test_format.py @@ -1,6 +1,6 @@ # coding: utf-8 -from pandas.msgpack import unpackb +from pandas.io.msgpack import unpackb def check(src, should, use_list=0): diff --git a/pandas/tests/msgpack/test_limits.py b/pandas/tests/io/msgpack/test_limits.py similarity index 97% rename from pandas/tests/msgpack/test_limits.py rename to pandas/tests/io/msgpack/test_limits.py index 9c08f328b90dd..a908ee3547634 100644 --- a/pandas/tests/msgpack/test_limits.py +++ b/pandas/tests/io/msgpack/test_limits.py @@ -3,7 +3,7 @@ unicode_literals) import pandas.util.testing as tm -from pandas.msgpack import packb, unpackb, Packer, Unpacker, ExtType +from pandas.io.msgpack import packb, unpackb, Packer, Unpacker, ExtType class TestLimits(tm.TestCase): diff --git a/pandas/tests/msgpack/test_newspec.py b/pandas/tests/io/msgpack/test_newspec.py similarity index 97% rename from pandas/tests/msgpack/test_newspec.py rename to pandas/tests/io/msgpack/test_newspec.py index 4eb9a0425c57b..783bfc1b364f8 100644 --- 
a/pandas/tests/msgpack/test_newspec.py +++ b/pandas/tests/io/msgpack/test_newspec.py @@ -1,6 +1,6 @@ # coding: utf-8 -from pandas.msgpack import packb, unpackb, ExtType +from pandas.io.msgpack import packb, unpackb, ExtType def test_str8(): diff --git a/pandas/tests/msgpack/test_obj.py b/pandas/tests/io/msgpack/test_obj.py similarity index 98% rename from pandas/tests/msgpack/test_obj.py rename to pandas/tests/io/msgpack/test_obj.py index bcc76929fe8f8..b067dacb84494 100644 --- a/pandas/tests/msgpack/test_obj.py +++ b/pandas/tests/io/msgpack/test_obj.py @@ -1,7 +1,7 @@ # coding: utf-8 import unittest -from pandas.msgpack import packb, unpackb +from pandas.io.msgpack import packb, unpackb class DecodeError(Exception): diff --git a/pandas/tests/msgpack/test_pack.py b/pandas/tests/io/msgpack/test_pack.py similarity index 98% rename from pandas/tests/msgpack/test_pack.py rename to pandas/tests/io/msgpack/test_pack.py index 005352691d908..6f9a271cbd326 100644 --- a/pandas/tests/msgpack/test_pack.py +++ b/pandas/tests/io/msgpack/test_pack.py @@ -5,7 +5,7 @@ import struct from pandas import compat from pandas.compat import u, OrderedDict -from pandas.msgpack import packb, unpackb, Unpacker, Packer +from pandas.io.msgpack import packb, unpackb, Unpacker, Packer class TestPack(unittest.TestCase): diff --git a/pandas/tests/msgpack/test_read_size.py b/pandas/tests/io/msgpack/test_read_size.py similarity index 96% rename from pandas/tests/msgpack/test_read_size.py rename to pandas/tests/io/msgpack/test_read_size.py index 965e97a7007de..ef521fa345637 100644 --- a/pandas/tests/msgpack/test_read_size.py +++ b/pandas/tests/io/msgpack/test_read_size.py @@ -1,5 +1,5 @@ """Test Unpacker's read_array_header and read_map_header methods""" -from pandas.msgpack import packb, Unpacker, OutOfData +from pandas.io.msgpack import packb, Unpacker, OutOfData UnexpectedTypeException = ValueError diff --git a/pandas/tests/msgpack/test_seq.py b/pandas/tests/io/msgpack/test_seq.py similarity index 96% rename from pandas/tests/msgpack/test_seq.py rename to pandas/tests/io/msgpack/test_seq.py index 927c2622419a6..5f203e8997ccb 100644 --- a/pandas/tests/msgpack/test_seq.py +++ b/pandas/tests/io/msgpack/test_seq.py @@ -1,7 +1,7 @@ # coding: utf-8 import io -import pandas.msgpack as msgpack +import pandas.io.msgpack as msgpack binarydata = bytes(bytearray(range(256))) diff --git a/pandas/tests/msgpack/test_sequnpack.py b/pandas/tests/io/msgpack/test_sequnpack.py similarity index 97% rename from pandas/tests/msgpack/test_sequnpack.py rename to pandas/tests/io/msgpack/test_sequnpack.py index fe089ccda1c7f..c9c979c4e0e44 100644 --- a/pandas/tests/msgpack/test_sequnpack.py +++ b/pandas/tests/io/msgpack/test_sequnpack.py @@ -3,8 +3,8 @@ import unittest from pandas import compat -from pandas.msgpack import Unpacker, BufferFull -from pandas.msgpack import OutOfData +from pandas.io.msgpack import Unpacker, BufferFull +from pandas.io.msgpack import OutOfData class TestPack(unittest.TestCase): diff --git a/pandas/tests/msgpack/test_subtype.py b/pandas/tests/io/msgpack/test_subtype.py similarity index 90% rename from pandas/tests/msgpack/test_subtype.py rename to pandas/tests/io/msgpack/test_subtype.py index d6dd72c4d9850..e27ec66c63e1f 100644 --- a/pandas/tests/msgpack/test_subtype.py +++ b/pandas/tests/io/msgpack/test_subtype.py @@ -1,6 +1,6 @@ # coding: utf-8 -from pandas.msgpack import packb +from pandas.io.msgpack import packb from collections import namedtuple diff --git a/pandas/tests/msgpack/test_unpack.py 
b/pandas/tests/io/msgpack/test_unpack.py similarity index 96% rename from pandas/tests/msgpack/test_unpack.py rename to pandas/tests/io/msgpack/test_unpack.py index ae8227ab276fb..24a8e885d19d6 100644 --- a/pandas/tests/msgpack/test_unpack.py +++ b/pandas/tests/io/msgpack/test_unpack.py @@ -1,6 +1,6 @@ from io import BytesIO import sys -from pandas.msgpack import Unpacker, packb, OutOfData, ExtType +from pandas.io.msgpack import Unpacker, packb, OutOfData, ExtType import pandas.util.testing as tm import pytest diff --git a/pandas/tests/msgpack/test_unpack_raw.py b/pandas/tests/io/msgpack/test_unpack_raw.py similarity index 94% rename from pandas/tests/msgpack/test_unpack_raw.py rename to pandas/tests/io/msgpack/test_unpack_raw.py index c6bf747c8d992..a261bf4cbbcd7 100644 --- a/pandas/tests/msgpack/test_unpack_raw.py +++ b/pandas/tests/io/msgpack/test_unpack_raw.py @@ -1,7 +1,7 @@ """Tests for cases where the user seeks to obtain packed msgpack objects""" import io -from pandas.msgpack import Unpacker, packb +from pandas.io.msgpack import Unpacker, packb def test_write_bytes(): diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index b667eed346355..df75d14e9702d 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -11,7 +11,7 @@ import pytest import numpy as np -from pandas.lib import Timestamp +from pandas._libs.lib import Timestamp import pandas as pd import pandas.util.testing as tm diff --git a/pandas/tests/io/parser/converters.py b/pandas/tests/io/parser/converters.py index 859d2e19bd56a..2659d977ea747 100644 --- a/pandas/tests/io/parser/converters.py +++ b/pandas/tests/io/parser/converters.py @@ -13,7 +13,7 @@ import pandas as pd import pandas.util.testing as tm -from pandas.lib import Timestamp +from pandas._libs.lib import Timestamp from pandas import DataFrame, Index from pandas.compat import parse_date, StringIO, lmap diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index b1960159bb41d..4cba9276a9d1e 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -10,8 +10,8 @@ import pytest import numpy as np -import pandas.lib as lib -from pandas.lib import Timestamp +import pandas._libs.lib as lib +from pandas._libs.lib import Timestamp import pandas as pd import pandas.io.parsers as parsers diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 0e91ca806e8fe..b6a9900b0b087 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -20,8 +20,8 @@ import pandas.util.testing as tm -from pandas.parser import TextReader -import pandas.parser as parser +from pandas.io.libparsers import TextReader +import pandas.io.libparsers as parser class TestTextReader(tm.TestCase): diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 95df077dae997..0cf642983e8d3 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -11,7 +11,7 @@ import pandas.util.testing as tm from pandas import DataFrame, Index -from pandas.lib import Timestamp +from pandas._libs.lib import Timestamp from pandas.compat import StringIO diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 232e68a87f16e..c1a2a4545a6f9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -23,7 +23,7 @@ is_platform_windows) from pandas.io.common import URLError, urlopen, file_path_to_url 
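The ParserError import changes around this point track the parser relocation: per the pandas/io/parsers.pyx hunk earlier in this patch, the exception is defined in pandas.io.common and imported into the compiled parser module, so both new paths seen in these tests (pandas.io.common and pandas.io.libparsers) should resolve to the same class. A small usage sketch against the new location (the malformed CSV payload is illustrative):

    import pandas as pd
    from pandas.compat import StringIO
    from pandas.io.common import ParserError

    bad = StringIO("a,b\n1,2\n3,4,5\n")  # row 3 has 3 fields, header declares 2
    try:
        pd.read_csv(bad)
    except ParserError as err:
        print('parse failed: %s' % err)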
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 232e68a87f16e..c1a2a4545a6f9 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -23,7 +23,7 @@ is_platform_windows)
 from pandas.io.common import URLError, urlopen, file_path_to_url
 from pandas.io.html import read_html
-from pandas.parser import ParserError
+from pandas.io.libparsers import ParserError

 import pandas.util.testing as tm
 from pandas.util.testing import makeCustomDataframe as mkdf, network
diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py
index 251c6ae8b4dec..efa8587d64657 100644
--- a/pandas/tests/io/test_packers.py
+++ b/pandas/tests/io/test_packers.py
@@ -22,7 +22,8 @@
 from pandas.tests.test_panel import assert_panel_equal

 import pandas
-from pandas import Timestamp, NaT, tslib
+from pandas import Timestamp, NaT
+from pandas._libs.tslib import iNaT

 nan = np.nan
@@ -373,7 +374,7 @@ def setUp(self):
         s.name = 'object'
         self.d['object'] = s

-        s = Series(tslib.iNaT, dtype='M8[ns]', index=range(5))
+        s = Series(iNaT, dtype='M8[ns]', index=range(5))
         self.d['date'] = s

         data = {
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index 9f1dea2094bc6..5592c564e51df 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -5282,7 +5282,7 @@ def test_append_with_timezones_dateutil(self):

         # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows
         # filename issues.
-        from pandas.tslib import maybe_get_tz
+        from pandas._libs.tslib import maybe_get_tz
         gettz = lambda x: maybe_get_tz('dateutil/' + x)

         # as columns
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index ae09e671dbca3..5188adf54b887 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -19,7 +19,7 @@
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
                              PossiblePrecisionLoss, StataMissingValue)
-from pandas.tslib import NaT
+from pandas._libs.tslib import NaT
 from pandas.types.common import is_categorical_dtype
diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py
index 49aa44492fe81..3128e90695324 100644
--- a/pandas/tests/scalar/test_period.py
+++ b/pandas/tests/scalar/test_period.py
@@ -6,7 +6,9 @@
 import pandas.tseries.period as period
 from pandas.compat import text_type, iteritems
 from pandas.compat.numpy import np_datetime64_compat
-from pandas import Period, Timestamp, tslib, offsets, _period
+
+from pandas._libs import tslib, period as libperiod
+from pandas import Period, Timestamp, offsets
 from pandas.tseries.frequencies import DAYS, MONTHS
@@ -256,8 +258,8 @@ def test_timestamp_tz_arg(self):
             self.assertEqual(p.tz, exp.tz)

     def test_timestamp_tz_arg_dateutil(self):
-        from pandas.tslib import _dateutil_gettz as gettz
-        from pandas.tslib import maybe_get_tz
+        from pandas._libs.tslib import _dateutil_gettz as gettz
+        from pandas._libs.tslib import maybe_get_tz
         for case in ['dateutil/Europe/Brussels', 'dateutil/Asia/Tokyo',
                      'dateutil/US/Pacific']:
             p = Period('1/1/2005', freq='M').to_timestamp(
@@ -275,7 +277,7 @@ def test_timestamp_tz_arg_dateutil(self):
             self.assertEqual(p.tz, exp.tz)

     def test_timestamp_tz_arg_dateutil_from_string(self):
-        from pandas.tslib import _dateutil_gettz as gettz
+        from pandas._libs.tslib import _dateutil_gettz as gettz
         p = Period('1/1/2005', freq='M').to_timestamp(tz='dateutil/Europe/Brussels')
         self.assertEqual(p.tz, gettz('Europe/Brussels'))
@@ -939,10 +941,10 @@ def test_round_trip(self):

 class TestPeriodField(tm.TestCase):

     def test_get_period_field_raises_on_out_of_range(self):
-        self.assertRaises(ValueError, _period.get_period_field, -1, 0, 0)
+        self.assertRaises(ValueError, libperiod.get_period_field, -1, 0, 0)

     def test_get_period_field_array_raises_on_out_of_range(self):
-        self.assertRaises(ValueError, _period.get_period_field_arr, -1,
+        self.assertRaises(ValueError, libperiod.get_period_field_arr, -1,
                           np.empty(1), 0)
diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py
index c5a828bf2e912..7c5caa9506ca2 100644
--- a/pandas/tests/scalar/test_timedelta.py
+++ b/pandas/tests/scalar/test_timedelta.py
@@ -6,9 +6,8 @@
 import pandas.util.testing as tm
 from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct
 from pandas import (Timedelta, TimedeltaIndex, timedelta_range, Series,
-                    to_timedelta, tslib, compat, isnull)
-
-iNaT = tslib.iNaT
+                    to_timedelta, compat, isnull)
+from pandas._libs.tslib import iNaT, NaTType


 class TestTimedeltas(tm.TestCase):
@@ -301,9 +300,9 @@ def check(value):

     def test_nat_converters(self):
         self.assertEqual(to_timedelta(
-            'nat', box=False).astype('int64'), tslib.iNaT)
+            'nat', box=False).astype('int64'), iNaT)
         self.assertEqual(to_timedelta(
-            'nan', box=False).astype('int64'), tslib.iNaT)
+            'nan', box=False).astype('int64'), iNaT)

         def testit(unit, transform):
@@ -589,7 +588,7 @@ def test_implementation_limits(self):

         # Beyond lower limit, a NAT before the Overflow
         self.assertIsInstance(min_td - Timedelta(1, 'ns'),
-                              pd.tslib.NaTType)
+                              NaTType)

         with tm.assertRaises(OverflowError):
             min_td - Timedelta(2, 'ns')
@@ -599,7 +598,7 @@ def test_implementation_limits(self):

         # Same tests using the internal nanosecond values
         td = Timedelta(min_td.value - 1, 'ns')
-        self.assertIsInstance(td, pd.tslib.NaTType)
+        self.assertIsInstance(td, NaTType)

         with tm.assertRaises(OverflowError):
             Timedelta(min_td.value - 2, 'ns')
diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py
index bbcdce922f58a..d5d92dcf96eab 100644
--- a/pandas/tests/scalar/test_timestamp.py
+++ b/pandas/tests/scalar/test_timestamp.py
@@ -9,13 +9,15 @@
 import pandas as pd
 import pandas.util.testing as tm
-import pandas._period as period
+
 from pandas.tseries import offsets, frequencies
-from pandas.tslib import get_timezone, iNaT
+from pandas._libs import tslib, period
+from pandas._libs.tslib import get_timezone, iNaT
+
 from pandas.compat import lrange, long
 from pandas.util.testing import assert_series_equal
 from pandas.compat.numpy import np_datetime64_compat
-from pandas import (Timestamp, date_range, Period, Timedelta, tslib, compat,
+from pandas import (Timestamp, date_range, Period, Timedelta, compat,
                     Series, NaT, isnull, DataFrame, DatetimeIndex)
 from pandas.tseries.frequencies import (RESO_DAY, RESO_HR,
                                         RESO_MIN, RESO_US, RESO_MS, RESO_SEC)
@@ -1482,7 +1484,7 @@ def test_timestamp_to_datetime_explicit_pytz(self):
     def test_timestamp_to_datetime_explicit_dateutil(self):
         tm._skip_if_windows_python_3()
         tm._skip_if_no_dateutil()
-        from pandas.tslib import _dateutil_gettz as gettz
+        from pandas._libs.tslib import _dateutil_gettz as gettz

         rng = date_range('20090415', '20090519', tz=gettz('US/Eastern'))
         stamp = rng[0]
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index c15171f331df3..24e4355fa9f9a 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -14,7 +14,8 @@
 from pandas.core.index import MultiIndex
 from pandas.tseries.index import Timestamp, DatetimeIndex

-from pandas import lib, tslib
+from pandas._libs import lib
+from pandas._libs.tslib import iNaT

 from pandas.compat import lrange, range, zip, OrderedDict, long
 from pandas import compat
@@ -200,14 +201,14 @@ def test_constructor_maskedarray(self):
         data = ma.masked_all((3, ), dtype='M8[ns]')
         result = Series(data)
-        expected = Series([tslib.iNaT, tslib.iNaT, tslib.iNaT], dtype='M8[ns]')
+        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
         assert_series_equal(result, expected)

         data[0] = datetime(2001, 1, 1)
         data[2] = datetime(2001, 1, 3)
         index = ['a', 'b', 'c']
         result = Series(data, index=index)
-        expected = Series([datetime(2001, 1, 1), tslib.iNaT,
+        expected = Series([datetime(2001, 1, 1), iNaT,
                            datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
         assert_series_equal(result, expected)
@@ -327,20 +328,19 @@ def test_constructor_datelike_coercion(self):
         self.assertTrue(result.dtype == object)

     def test_constructor_dtype_datetime64(self):
-        import pandas.tslib as tslib

-        s = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5))
+        s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
         self.assertTrue(isnull(s).all())

         # in theory this should be all nulls, but since
         # we are not specifying a dtype is ambiguous
-        s = Series(tslib.iNaT, index=lrange(5))
+        s = Series(iNaT, index=lrange(5))
         self.assertFalse(isnull(s).all())

         s = Series(nan, dtype='M8[ns]', index=lrange(5))
         self.assertTrue(isnull(s).all())

-        s = Series([datetime(2001, 1, 2, 0, 0), tslib.iNaT], dtype='M8[ns]')
+        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
         self.assertTrue(isnull(s[1]))
         self.assertEqual(s.dtype, 'M8[ns]')
@@ -732,8 +732,7 @@ def test_constructor_dtype_timedelta64(self):
         self.assertEqual(td.dtype, 'timedelta64[ns]')

         # mixed with NaT
-        from pandas import tslib
-        td = Series([timedelta(days=1), tslib.NaT], dtype='m8[ns]')
+        td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
         self.assertEqual(td.dtype, 'timedelta64[ns]')

         td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
@@ -744,11 +743,11 @@ def test_constructor_dtype_timedelta64(self):

         # improved inference
         # GH5689
-        td = Series([np.timedelta64(300000000), pd.NaT])
+        td = Series([np.timedelta64(300000000), NaT])
         self.assertEqual(td.dtype, 'timedelta64[ns]')

         # because iNaT is int, not coerced to timedelta
-        td = Series([np.timedelta64(300000000), tslib.iNaT])
+        td = Series([np.timedelta64(300000000), iNaT])
         self.assertEqual(td.dtype, 'object')

         td = Series([np.timedelta64(300000000), np.nan])
@@ -791,7 +790,7 @@ def f():
         self.assertEqual(s.dtype, 'timedelta64[ns]')

     def test_NaT_scalar(self):
-        series = Series([0, 1000, 2000, tslib.iNaT], dtype='M8[ns]')
+        series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

         val = series[3]
         self.assertTrue(isnull(val))
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index 13375ab886d8d..a2aaff25516ae 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -62,7 +62,7 @@ def test_astype_cast_object_int(self):
         self.assert_series_equal(result, Series(np.arange(1, 5)))

     def test_astype_datetimes(self):
-        import pandas.tslib as tslib
+        import pandas._libs.tslib as tslib

         s = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5))
         s = s.astype('O')
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
index bb77550e01f11..9d93d9f01b161 100644
--- a/pandas/tests/series/test_indexing.py
+++ b/pandas/tests/series/test_indexing.py
@@ -7,14 +7,14 @@
 import numpy as np
 import pandas as pd

-import pandas.index as _index
+import pandas._libs.index as _index
 from pandas.types.common import is_integer, is_scalar
 from pandas import (Index, Series, DataFrame, isnull,
                     date_range, NaT, MultiIndex, Timestamp,
                     DatetimeIndex, Timedelta)
 from pandas.core.indexing import IndexingError
 from pandas.tseries.offsets import BDay
-from pandas import lib, tslib
+from pandas._libs import tslib, lib

 from pandas.compat import lrange, range
 from pandas import compat
@@ -375,7 +375,7 @@ def test_getitem_setitem_datetime_tz_pytz(self):
     def test_getitem_setitem_datetime_tz_dateutil(self):
         tm._skip_if_no_dateutil()
         from dateutil.tz import tzutc
-        from pandas.tslib import _dateutil_gettz as gettz
+        from pandas._libs.tslib import _dateutil_gettz as gettz

         tz = lambda x: tzutc() if x == 'UTC' else gettz(
             x)  # handle special case for utc in dateutil
diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py
index a3b13ba9b993a..4b1c303200739 100644
--- a/pandas/tests/series/test_internals.py
+++ b/pandas/tests/series/test_internals.py
@@ -8,7 +8,7 @@
 from pandas import Series
 from pandas.tseries.index import Timestamp

-import pandas.lib as lib
+import pandas._libs.lib as lib

 from pandas.util.testing import assert_series_equal
 import pandas.util.testing as tm
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index 23eb6a40f5f1d..87cfcf32229b4 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -9,9 +9,9 @@
 import pandas as pd

 from pandas import (Series, DataFrame, isnull, date_range,
-                    MultiIndex, Index, Timestamp)
+                    MultiIndex, Index, Timestamp, NaT)
 from pandas.compat import range
-from pandas import tslib
+from pandas._libs.tslib import iNaT
 from pandas.util.testing import assert_series_equal, assert_frame_equal
 import pandas.util.testing as tm
@@ -69,9 +69,8 @@ def test_timedelta_fillna(self):
                            timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
         assert_series_equal(result, expected)

-        from pandas import tslib
-        result = td.fillna(tslib.NaT)
-        expected = Series([tslib.NaT, timedelta(0), timedelta(1),
+        result = td.fillna(NaT)
+        expected = Series([NaT, timedelta(0), timedelta(1),
                            timedelta(days=1, seconds=9 * 3600 + 60 + 1)],
                           dtype='m8[ns]')
         assert_series_equal(result, expected)
@@ -102,8 +101,7 @@ def test_datetime64_fillna(self):
             '20130101'), Timestamp('20130104'), Timestamp('20130103 9:01:01')])
         assert_series_equal(result, expected)

-        from pandas import tslib
-        result = s.fillna(tslib.NaT)
+        result = s.fillna(NaT)
         expected = s
         assert_series_equal(result, expected)
@@ -303,7 +301,7 @@ def test_fillna_raise(self):
                 s.fillna(1, limit=limit, method=method)

     def test_fillna_nat(self):
-        series = Series([0, 1, 2, tslib.iNaT], dtype='M8[ns]')
+        series = Series([0, 1, 2, iNaT], dtype='M8[ns]')

         filled = series.fillna(method='pad')
         filled2 = series.fillna(value=series.values[2])
@@ -321,7 +319,7 @@ def test_fillna_nat(self):
         assert_frame_equal(filled, expected)
         assert_frame_equal(filled2, expected)

-        series = Series([tslib.iNaT, 0, 1, 2], dtype='M8[ns]')
+        series = Series([iNaT, 0, 1, 2], dtype='M8[ns]')

         filled = series.fillna(method='bfill')
         filled2 = series.fillna(value=series[1])
@@ -460,26 +458,25 @@ def test_bfill(self):

     def test_timedelta64_nan(self):

-        from pandas import tslib
         td = Series([timedelta(days=i) for i in range(10)])

         # nan ops on timedeltas
         td1 = td.copy()
         td1[0] = np.nan
         self.assertTrue(isnull(td1[0]))
-        self.assertEqual(td1[0].value, tslib.iNaT)
+        self.assertEqual(td1[0].value, iNaT)
         td1[0] = td[0]
         self.assertFalse(isnull(td1[0]))

-        td1[1] = tslib.iNaT
+        td1[1] = iNaT
         self.assertTrue(isnull(td1[1]))
-        self.assertEqual(td1[1].value, tslib.iNaT)
+        self.assertEqual(td1[1].value, iNaT)
         td1[1] = td[1]
         self.assertFalse(isnull(td1[1]))

-        td1[2] = tslib.NaT
+        td1[2] = NaT
         self.assertTrue(isnull(td1[2]))
-        self.assertEqual(td1[2].value, tslib.iNaT)
+        self.assertEqual(td1[2].value, iNaT)
         td1[2] = td[2]
         self.assertFalse(isnull(td1[2]))
diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py
index 7fe31bab87537..0acd03316339e 100644
--- a/pandas/tests/series/test_replace.py
+++ b/pandas/tests/series/test_replace.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd

-import pandas.lib as lib
+import pandas._libs.lib as lib
 import pandas.util.testing as tm

 from .common import TestData
diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py
index d384460c3d030..ce7d5a573bfab 100644
--- a/pandas/tests/series/test_timeseries.py
+++ b/pandas/tests/series/test_timeseries.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import pandas.util.testing as tm

-from pandas.tslib import iNaT
+from pandas._libs.tslib import iNaT
 from pandas.compat import lrange, StringIO, product
 from pandas.tseries.tdi import TimedeltaIndex
 from pandas.tseries.index import DatetimeIndex
diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py
index 70aaea5b5b1f0..15531cecfe79b 100644
--- a/pandas/tests/sparse/test_array.py
+++ b/pandas/tests/sparse/test_array.py
@@ -8,7 +8,7 @@
 from pandas import _np_version_under1p8
 from pandas.sparse.api import SparseArray, SparseSeries
-from pandas._sparse import IntIndex
+from pandas.sparse.libsparse import IntIndex
 from pandas.util.testing import assert_almost_equal, assertRaisesRegexp
 import pandas.util.testing as tm
diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
index b2283364a1631..a7dd7f2e81033 100644
--- a/pandas/tests/sparse/test_frame.py
+++ b/pandas/tests/sparse/test_frame.py
@@ -14,7 +14,7 @@
 from pandas import compat
 import pandas.sparse.frame as spf

-from pandas._sparse import BlockIndex, IntIndex
+from pandas.sparse.libsparse import BlockIndex, IntIndex
 from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray
 from pandas.tests.frame.test_misc_api import SharedWithSparse
diff --git a/pandas/tests/sparse/test_libsparse.py b/pandas/tests/sparse/test_libsparse.py
index 0435b732911da..b6ab99dc66cda 100644
--- a/pandas/tests/sparse/test_libsparse.py
+++ b/pandas/tests/sparse/test_libsparse.py
@@ -8,7 +8,7 @@
 from pandas import compat

 from pandas.sparse.array import IntIndex, BlockIndex, _make_index
-import pandas._sparse as splib
+import pandas.sparse.libsparse as splib

 TEST_LENGTH = 20
diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py
index de6636162ff05..8aa85a5b7f396 100644
--- a/pandas/tests/sparse/test_series.py
+++ b/pandas/tests/sparse/test_series.py
@@ -16,7 +16,7 @@
 import pandas.sparse.frame as spf

-from pandas._sparse import BlockIndex, IntIndex
+from pandas.sparse.libsparse import BlockIndex, IntIndex
 from pandas.sparse.api import SparseSeries
 from pandas.tests.series.test_misc_api import SharedWithSparse
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index fab04f7fa4bf2..7a3cc3e2c3cd7 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -10,11 +10,11 @@
 import pandas as pd
 from pandas import compat

-import pandas.algos as _algos
+from pandas._libs import algos as libalgos, hashtable
+from pandas._libs.hashtable import unique_label_indices
 from pandas.compat import lrange
 import pandas.core.algorithms as algos
 import pandas.util.testing as tm
-import pandas.hashtable as hashtable
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.util.testing import assert_almost_equal
@@ -972,7 +972,6 @@ def test_quantile():


 def test_unique_label_indices():
-    from pandas.hashtable import unique_label_indices

     a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
@@ -998,7 +997,7 @@ def test_scipy_compat(self):
         def _check(arr):
             mask = ~np.isfinite(arr)
             arr = arr.copy()
-            result = _algos.rank_1d_float64(arr)
+            result = libalgos.rank_1d_float64(arr)
             arr[mask] = np.inf
             exp = rankdata(arr)
             exp[mask] = nan
@@ -1034,26 +1033,26 @@ def test_pad_backfill_object_segfault():
     old = np.array([], dtype='O')
     new = np.array([datetime(2010, 12, 31)], dtype='O')

-    result = _algos.pad_object(old, new)
+    result = libalgos.pad_object(old, new)
     expected = np.array([-1], dtype=np.int64)
     assert (np.array_equal(result, expected))

-    result = _algos.pad_object(new, old)
+    result = libalgos.pad_object(new, old)
     expected = np.array([], dtype=np.int64)
     assert (np.array_equal(result, expected))

-    result = _algos.backfill_object(old, new)
+    result = libalgos.backfill_object(old, new)
     expected = np.array([-1], dtype=np.int64)
     assert (np.array_equal(result, expected))

-    result = _algos.backfill_object(new, old)
+    result = libalgos.backfill_object(new, old)
     expected = np.array([], dtype=np.int64)
     assert (np.array_equal(result, expected))


 def test_arrmap():
     values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O')
-    result = _algos.arrmap_object(values, lambda x: x in ['foo', 'bar'])
+    result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar'])
     assert (result.dtype == np.bool_)
@@ -1078,7 +1077,7 @@ def test_backfill(self):
         old = Index([1, 5, 10])
         new = Index(lrange(12))

-        filler = _algos.backfill_int64(old.values, new.values)
+        filler = libalgos.backfill_int64(old.values, new.values)

         expect_filler = np.array([0, 0, 1, 1, 1, 1,
                                   2, 2, 2, 2, 2, -1], dtype=np.int64)
@@ -1087,7 +1086,7 @@ def test_backfill(self):
         # corner case
         old = Index([1, 4])
         new = Index(lrange(5, 10))
-        filler = _algos.backfill_int64(old.values, new.values)
+        filler = libalgos.backfill_int64(old.values, new.values)

         expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
         self.assert_numpy_array_equal(filler, expect_filler)
@@ -1096,7 +1095,7 @@ def test_pad(self):
         old = Index([1, 5, 10])
         new = Index(lrange(12))

-        filler = _algos.pad_int64(old.values, new.values)
+        filler = libalgos.pad_int64(old.values, new.values)

         expect_filler = np.array([-1, 0, 0, 0, 0,
                                   1, 1, 1, 1, 1, 2, 2], dtype=np.int64)
@@ -1105,7 +1104,7 @@ def test_pad(self):
         # corner case
         old = Index([5, 10])
         new = Index(lrange(5))
-        filler = _algos.pad_int64(old.values, new.values)
+        filler = libalgos.pad_int64(old.values, new.values)
         expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
         self.assert_numpy_array_equal(filler, expect_filler)
@@ -1137,7 +1136,7 @@ def test_is_lexsorted():
                          6, 5, 4, 3, 2, 1, 0])]

-    assert (not _algos.is_lexsorted(failure))
+    assert (not libalgos.is_lexsorted(failure))

 # def test_get_group_index():
 #     a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64)
@@ -1153,7 +1152,7 @@ def test_groupsort_indexer():
     a = np.random.randint(0, 1000, 100).astype(np.int64)
     b = np.random.randint(0, 1000, 100).astype(np.int64)

-    result = _algos.groupsort_indexer(a, 1000)[0]
+    result = libalgos.groupsort_indexer(a, 1000)[0]

     # need to use a stable sort
     expected = np.argsort(a, kind='mergesort')
@@ -1161,7 +1160,7 @@ def test_groupsort_indexer():

     # compare with lexsort
     key = a * 1000 + b
-    result = _algos.groupsort_indexer(key, 1000000)[0]
+    result = libalgos.groupsort_indexer(key, 1000000)[0]
     expected = np.lexsort((b, a))
     assert (np.array_equal(result, expected))
@@ -1172,8 +1171,8 @@ def test_infinity_sort():
     # itself.  Instead, let's give our infinities a self-consistent
     # ordering, but outside the float extended real line.

-    Inf = _algos.Infinity()
-    NegInf = _algos.NegInfinity()
+    Inf = libalgos.Infinity()
+    NegInf = libalgos.NegInfinity()

     ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]
@@ -1191,14 +1190,14 @@ def test_infinity_sort():
         assert sorted(perm) == ref_nums

     # smoke tests
-    np.array([_algos.Infinity()] * 32).argsort()
-    np.array([_algos.NegInfinity()] * 32).argsort()
+    np.array([libalgos.Infinity()] * 32).argsort()
+    np.array([libalgos.NegInfinity()] * 32).argsort()


 def test_ensure_platform_int():
     arr = np.arange(100, dtype=np.intp)

-    result = _algos.ensure_platform_int(arr)
+    result = libalgos.ensure_platform_int(arr)
     assert (result is arr)
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 8264ad33950f9..1d4dddf6477df 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -18,6 +18,7 @@
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.core.base import PandasDelegate, NoNewAttributesMixin
 from pandas.tseries.base import DatetimeIndexOpsMixin
+from pandas._libs.tslib import iNaT


 class CheckStringMixin(object):
@@ -451,15 +452,15 @@ def test_value_counts_unique_nunique_null(self):
             if is_datetimetz(o):
                 if isinstance(o, DatetimeIndex):
                     v = o.asi8
-                    v[0:2] = pd.tslib.iNaT
+                    v[0:2] = iNaT
                     values = o._shallow_copy(v)
                 else:
                     o = o.copy()
-                    o[0:2] = pd.tslib.iNaT
+                    o[0:2] = iNaT
                     values = o._values

             elif needs_i8_conversion(o):
-                values[0:2] = pd.tslib.iNaT
+                values[0:2] = iNaT
                 values = o._shallow_copy(values)
             else:
                 values[0:2] = null_obj
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index f086935df6dc8..5ab2bbc4ac6ba 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -17,7 +17,7 @@
 import pandas.core.algorithms as algos
 import pandas.util.testing as tm
 import pandas as pd
-from pandas import lib
+from pandas._libs import lib
 from pandas.util.testing import (assert_almost_equal,
                                  assert_frame_equal, randn,
                                  assert_series_equal)
 from pandas.compat import zip, u
diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py
index 2a16d7663b0cf..6723494d1529b 100644
--- a/pandas/tests/test_join.py
+++ b/pandas/tests/test_join.py
@@ -3,7 +3,7 @@
 import numpy as np
 from pandas import Index

-import pandas._join as _join
+from pandas._libs import join as _join

 import pandas.util.testing as tm
 from pandas.util.testing import assert_almost_equal
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index 2381c52ef14b6..a925cf13900e9 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -2,7 +2,7 @@

 import numpy as np
 import pandas as pd
-import pandas.lib as lib
+import pandas._libs.lib as lib
 import pandas.util.testing as tm
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index c809b39bb566e..d1b7fdadce6ae 100755
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -20,7 +20,7 @@
                     cart_product, zip)
 import pandas as pd

-import pandas.index as _index
+import pandas._libs.index as _index


 class TestMultiLevel(tm.TestCase):
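test_take.py below switches its iNaT import to pandas._libs.tslib, the same sentinel convention the series and scalar tests above keep exercising: in the int64 view of datetime64/timedelta64 data, missing values are stored as iNaT, so the missing-value mask is a plain integer comparison. A minimal sketch of that convention (numpy plus the relocated import; nothing else assumed):

    # Missing datetimes are the iNaT sentinel in the underlying int64 data.
    import numpy as np
    from pandas._libs.tslib import iNaT  # new location per this series

    i8 = np.array([0, iNaT, 2], dtype=np.int64)
    print(i8 == iNaT)  # [False  True False] -- the NaT mask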
diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py
index 3aed22c140ffe..0bc1d0dcd0532 100644
--- a/pandas/tests/test_take.py
+++ b/pandas/tests/test_take.py
@@ -6,7 +6,7 @@
 from pandas.compat import long
 import pandas.core.algorithms as algos
 import pandas.util.testing as tm
-from pandas.tslib import iNaT
+from pandas._libs.tslib import iNaT


 class TestTake(tm.TestCase):
diff --git a/pandas/tests/tools/test_join.py b/pandas/tests/tools/test_join.py
index ee6b3d57b852d..b65f800802bca 100644
--- a/pandas/tests/tools/test_join.py
+++ b/pandas/tests/tools/test_join.py
@@ -9,7 +9,7 @@
 from pandas.util.testing import assert_frame_equal
 from pandas import DataFrame, MultiIndex, Series, Index, merge, concat

-import pandas._join as _join
+from pandas._libs import join as libjoin
 import pandas.util.testing as tm
 from pandas.tests.tools.test_merge import get_test_data, N, NGROUPS
@@ -46,7 +46,7 @@ def test_cython_left_outer_join(self):
         right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
         max_group = 5

-        ls, rs = _join.left_outer_join(left, right, max_group)
+        ls, rs = libjoin.left_outer_join(left, right, max_group)

         exp_ls = left.argsort(kind='mergesort')
         exp_rs = right.argsort(kind='mergesort')
@@ -70,7 +70,7 @@ def test_cython_right_outer_join(self):
         right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
         max_group = 5

-        rs, ls = _join.left_outer_join(right, left, max_group)
+        rs, ls = libjoin.left_outer_join(right, left, max_group)

         exp_ls = left.argsort(kind='mergesort')
         exp_rs = right.argsort(kind='mergesort')
@@ -96,7 +96,7 @@ def test_cython_inner_join(self):
         right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
         max_group = 5

-        ls, rs = _join.inner_join(left, right, max_group)
+        ls, rs = libjoin.inner_join(left, right, max_group)

         exp_ls = left.argsort(kind='mergesort')
         exp_rs = right.argsort(kind='mergesort')
diff --git a/pandas/tests/tseries/test_offsets.py b/pandas/tests/tseries/test_offsets.py
index dfa1e94e4dc11..f644c353982f6 100644
--- a/pandas/tests/tseries/test_offsets.py
+++ b/pandas/tests/tseries/test_offsets.py
@@ -31,8 +31,8 @@
                     to_datetime, DateParseError)
 import pandas.tseries.offsets as offsets
 from pandas.io.pickle import read_pickle
-from pandas.tslib import normalize_date, NaT, Timestamp, Timedelta
-import pandas.tslib as tslib
+from pandas._libs.tslib import normalize_date, NaT, Timestamp, Timedelta
+import pandas._libs.tslib as tslib
 from pandas.util.testing import assertRaisesRegexp
 import pandas.util.testing as tm
 from pandas.tseries.holiday import USFederalHolidayCalendar
diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py
index 1535bd665fe8b..57a655b0b7610 100755
--- a/pandas/tests/tseries/test_resample.py
+++ b/pandas/tests/tseries/test_resample.py
@@ -26,7 +26,7 @@
 from pandas.tseries.tdi import timedelta_range, TimedeltaIndex
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, assert_index_equal)
-from pandas._period import IncompatibleFrequency
+from pandas._libs.period import IncompatibleFrequency

 bday = BDay()
diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py
index 771fb2f50c410..1ccc1652d2719 100644
--- a/pandas/tests/tseries/test_timezones.py
+++ b/pandas/tests/tseries/test_timezones.py
@@ -11,7 +11,8 @@
 from pandas.compat import lrange, zip
 from pandas.tseries.index import bdate_range, date_range
 from pandas.types.dtypes import DatetimeTZDtype
-from pandas import (Index, Series, DataFrame, isnull, Timestamp, tslib, NaT,
+from pandas._libs import tslib
+from pandas import (Index, Series, DataFrame, isnull, Timestamp, NaT,
                     DatetimeIndex, to_datetime)
 from pandas.util.testing import (assert_frame_equal, assert_series_equal,
                                  set_timezone)
@@ -924,7 +925,7 @@ def test_utc_with_system_utc(self):
         # Skipped on win32 due to dateutil bug
         tm._skip_if_windows()

-        from pandas.tslib import maybe_get_tz
+        from pandas._libs.tslib import maybe_get_tz

         # from system utc to real utc
         ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC'))
diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py
index 629aa63f4a0ae..a36a77a70f9ad 100644
--- a/pandas/tests/types/test_inference.py
+++ b/pandas/tests/types/test_inference.py
@@ -13,7 +13,7 @@
 import pytz
 import pandas as pd

-from pandas import lib, tslib
+from pandas._libs import tslib, lib
 from pandas import (Series, Index, DataFrame, Timedelta,
                     DatetimeIndex, TimedeltaIndex, Timestamp,
                     Panel, Period, Categorical)
@@ -517,28 +517,28 @@ def test_infer_dtype_period(self):
         # GH 13664
         arr = np.array([pd.Period('2011-01', freq='D'),
                         pd.Period('2011-02', freq='D')])
-        self.assertEqual(pd.lib.infer_dtype(arr), 'period')
+        self.assertEqual(lib.infer_dtype(arr), 'period')

         arr = np.array([pd.Period('2011-01', freq='D'),
                         pd.Period('2011-02', freq='M')])
-        self.assertEqual(pd.lib.infer_dtype(arr), 'period')
+        self.assertEqual(lib.infer_dtype(arr), 'period')

         # starts with nan
         for n in [pd.NaT, np.nan]:
             arr = np.array([n, pd.Period('2011-01', freq='D')])
-            self.assertEqual(pd.lib.infer_dtype(arr), 'period')
+            self.assertEqual(lib.infer_dtype(arr), 'period')

             arr = np.array([n, pd.Period('2011-01', freq='D'), n])
-            self.assertEqual(pd.lib.infer_dtype(arr), 'period')
+            self.assertEqual(lib.infer_dtype(arr), 'period')

         # different type of nat
         arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
                        dtype=object)
-        self.assertEqual(pd.lib.infer_dtype(arr), 'mixed')
+        self.assertEqual(lib.infer_dtype(arr), 'mixed')

         arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')],
                        dtype=object)
-        self.assertEqual(pd.lib.infer_dtype(arr), 'mixed')
+        self.assertEqual(lib.infer_dtype(arr), 'mixed')

     def test_infer_dtype_all_nan_nat_like(self):
         arr = np.array([np.nan, np.nan])
diff --git a/pandas/tests/types/test_io.py b/pandas/tests/types/test_io.py
index ce8e23342bf5a..b6c10394dd232 100644
--- a/pandas/tests/types/test_io.py
+++ b/pandas/tests/types/test_io.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 import numpy as np
-import pandas.lib as lib
+import pandas._libs.lib as lib
 import pandas.util.testing as tm

 from pandas.compat import long, u
@@ -73,7 +73,7 @@ def test_convert_sql_column_decimals(self):
         self.assert_numpy_array_equal(result, expected)

     def test_convert_downcast_int64(self):
-        from pandas.parser import na_values
+        from pandas.io.libparsers import na_values

         arr = np.array([1, 2, 7, 8, 10], dtype=np.int64)
         expected = np.array([1, 2, 7, 8, 10], dtype=np.int8)
diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py
index cab44f1122ae1..2e35f5c1badbb 100644
--- a/pandas/tests/types/test_missing.py
+++ b/pandas/tests/types/test_missing.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from pandas.core import config as cf
 from pandas.compat import u
-from pandas.tslib import iNaT
+from pandas._libs.tslib import iNaT
 from pandas import (NaT, Float64Index, Series,
                     DatetimeIndex, TimedeltaIndex, date_range)
 from pandas.types.dtypes import DatetimeTZDtype
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index ef863510cdd87..85ceb439435ee 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -4,8 +4,9 @@
 import itertools

 import numpy as np
-from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
-from pandas.lib import is_bool_array
+from pandas import Series, factorize, Categorical, Index, MultiIndex
+from pandas.tools import libhashing as _hash
+from pandas._libs.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
                                  is_datetime64_dtype, is_timedelta64_dtype,
diff --git a/pandas/src/hash.pyx b/pandas/tools/hashing.pyx
similarity index 100%
rename from pandas/src/hash.pyx
rename to pandas/tools/hashing.pyx
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index ba53d42fccec7..3f1e7640ba538 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -37,9 +37,7 @@
 from pandas.core.sorting import is_int64_overflow_possible
 import pandas.core.algorithms as algos
 import pandas.core.common as com
-
-import pandas._join as _join
-import pandas.hashtable as _hash
+from pandas._libs import hashtable as libhashtable, join as libjoin

 # back-compat of pseudo-public API
@@ -1005,8 +1003,8 @@ def get_result(self):
                                            rdata.items, rsuf)

         if self.fill_method == 'ffill':
-            left_join_indexer = _join.ffill_indexer(left_indexer)
-            right_join_indexer = _join.ffill_indexer(right_indexer)
+            left_join_indexer = libjoin.ffill_indexer(left_indexer)
+            right_join_indexer = libjoin.ffill_indexer(right_indexer)
         else:
             left_join_indexer = left_indexer
             right_join_indexer = right_indexer
@@ -1030,11 +1028,11 @@ def get_result(self):


 def _asof_function(direction, on_type):
-    return getattr(_join, 'asof_join_%s_%s' % (direction, on_type), None)
+    return getattr(libjoin, 'asof_join_%s_%s' % (direction, on_type), None)


 def _asof_by_function(direction, on_type, by_type):
-    return getattr(_join, 'asof_join_%s_%s_by_%s' %
+    return getattr(libjoin, 'asof_join_%s_%s_by_%s' %
                    (direction, on_type, by_type), None)
@@ -1294,13 +1292,13 @@ def _get_multiindex_indexer(join_keys, index, sort):

     # factorize keys to a dense i8 space
     lkey, rkey, count = fkeys(lkey, rkey)

-    return _join.left_outer_join(lkey, rkey, count, sort=sort)
+    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)


 def _get_single_indexer(join_key, index, sort=False):
     left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

-    left_indexer, right_indexer = _join.left_outer_join(
+    left_indexer, right_indexer = libjoin.left_outer_join(
         _ensure_int64(left_key),
         _ensure_int64(right_key),
         count, sort=sort)
@@ -1335,15 +1333,15 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):


 def _right_outer_join(x, y, max_groups):
-    right_indexer, left_indexer = _join.left_outer_join(y, x, max_groups)
+    right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups)
     return left_indexer, right_indexer


 _join_functions = {
-    'inner': _join.inner_join,
-    'left': _join.left_outer_join,
+    'inner': libjoin.inner_join,
+    'left': libjoin.left_outer_join,
     'right': _right_outer_join,
-    'outer': _join.full_outer_join,
+    'outer': libjoin.full_outer_join,
 }
@@ -1352,11 +1350,11 @@ def _factorize_keys(lk, rk, sort=True):
         lk = lk.values
         rk = rk.values

     if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
-        klass = _hash.Int64Factorizer
+        klass = libhashtable.Int64Factorizer
         lk = _ensure_int64(com._values_from_object(lk))
         rk = _ensure_int64(com._values_from_object(rk))
     else:
-        klass = _hash.Factorizer
+        klass = libhashtable.Factorizer
         lk = _ensure_object(lk)
         rk = _ensure_object(rk)
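pandas/tools/merge.py now reaches its join kernels through pandas._libs.join. The contract those kernels honor, as the cython join tests above exercise it, is: given factorized integer keys, return a pair of positional indexers, with -1 marking an unmatched left row. A pure-numpy toy version of that contract (illustrative only; the real kernel is Cython and works from group counts):

    import numpy as np

    def toy_left_outer_join(lkey, rkey):
        # For each left row, emit matching right positions; -1 = no match.
        left_idx, right_idx = [], []
        for i, k in enumerate(lkey):
            matches = np.flatnonzero(rkey == k)
            if matches.size == 0:
                left_idx.append(i)
                right_idx.append(-1)
            else:
                left_idx.extend([i] * matches.size)
                right_idx.extend(matches.tolist())
        return (np.asarray(left_idx, dtype=np.intp),
                np.asarray(right_idx, dtype=np.intp))

    ls, rs = toy_left_outer_join(np.array([0, 1, 2]), np.array([1, 1, 3]))
    print(ls, rs)  # [0 1 1 2] [-1  0  1 -1]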
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
index feb4d4bfd5044..9b21e542f153c 100644
--- a/pandas/tools/tile.py
+++ b/pandas/tools/tile.py
@@ -13,7 +13,7 @@
 from pandas.compat import zip
 from pandas import to_timedelta, to_datetime
 from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype
-from pandas.lib import infer_dtype
+from pandas._libs.lib import infer_dtype

 import numpy as np
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index 8ec074fbf5950..bf78a9dfb65cc 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -1,5 +1,5 @@
 import numpy as np
-import pandas.lib as lib
+import pandas._libs.lib as lib

 from pandas.types.common import (is_number,
                                  is_numeric_dtype,
diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py
index 9a07983b4d951..a00ccf99e1b96 100644
--- a/pandas/tseries/api.py
+++ b/pandas/tseries/api.py
@@ -10,5 +10,5 @@
 from pandas.tseries.period import Period, PeriodIndex, period_range, pnow
 from pandas.tseries.resample import TimeGrouper
 from pandas.tseries.timedeltas import to_timedelta
-from pandas.lib import NaT
+from pandas._libs.lib import NaT
 import pandas.tseries.offsets as offsets
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 2e22c35868cb3..ae40c2f66a590 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -21,9 +21,10 @@
 from pandas.core.common import AbstractMethodError

 import pandas.formats.printing as printing
-import pandas.tslib as tslib
-import pandas._period as prlib
-import pandas.lib as lib
+from pandas._libs import (tslib as libts, lib,
+                          Timedelta, Timestamp, iNaT, NaT)
+from pandas._libs.period import Period
+
 from pandas.core.index import Index
 from pandas.indexes.base import _index_shared_docs
 from pandas.util.decorators import Appender, cache_readonly
@@ -94,7 +95,8 @@ def _round(self, freq, rounder):
             result = (unit * rounder(values / float(unit)).astype('i8'))
         else:
             result = (unit * rounder(values / float(unit)).astype('i8'))
-        result = self._maybe_mask_results(result, fill_value=tslib.NaT)
+        result = self._maybe_mask_results(result, fill_value=NaT)
+
         attribs = self._get_attributes_dict()
         if 'freq' in attribs:
             attribs['freq'] = None
@@ -196,7 +198,7 @@ def _evaluate_compare(self, other, op):
                     result[mask] = False
                 return result
             try:
-                result[mask] = tslib.iNaT
+                result[mask] = iNaT
                 return Index(result)
             except TypeError:
                 return result
@@ -327,7 +329,7 @@ def _nat_new(self, box=True):
           - If False returns ndarray of np.int64.
        """
        result = np.zeros(len(self), dtype=np.int64)
-       result.fill(tslib.iNaT)
+       result.fill(iNaT)
        if not box:
            return result
@@ -392,7 +394,7 @@ def take(self, indices, axis=0, allow_fill=True,
        taken = self._assert_take_fillable(self.asi8, indices,
                                           allow_fill=allow_fill,
                                           fill_value=fill_value,
-                                          na_value=tslib.iNaT)
+                                          na_value=iNaT)

        # keep freq in PeriodIndex, reset otherwise
        freq = self.freq if isinstance(self, ABCPeriodIndex) else None
@@ -404,13 +406,13 @@ def get_duplicates(self):

    _can_hold_na = True

-   _na_value = tslib.NaT
+   _na_value = NaT
    """The expected NA value to use with this index."""

    @cache_readonly
    def _isnan(self):
        """ return if each value is nan"""
-       return (self.asi8 == tslib.iNaT)
+       return (self.asi8 == iNaT)

    @property
    def asobject(self):
@@ -424,7 +426,7 @@ def asobject(self):

    def _convert_tolerance(self, tolerance):
        try:
-           return tslib.Timedelta(tolerance).to_timedelta64()
+           return Timedelta(tolerance).to_timedelta64()
        except ValueError:
            raise ValueError('tolerance argument for %s must be convertible '
                             'to Timedelta: %r'
@@ -477,7 +479,7 @@ def min(self, axis=None, *args, **kwargs):

            # quick check
            if len(i8) and self.is_monotonic:
-               if i8[0] != tslib.iNaT:
+               if i8[0] != iNaT:
                    return self._box_func(i8[0])

            if self.hasnans:
@@ -525,7 +527,7 @@ def max(self, axis=None, *args, **kwargs):

            # quick check
            if len(i8) and self.is_monotonic:
-               if i8[-1] != tslib.iNaT:
+               if i8[-1] != iNaT:
                    return self._box_func(i8[-1])

            if self.hasnans:
@@ -643,11 +645,11 @@ def __add__(self, other):
                            .format(typ1=type(self).__name__,
                                    typ2=type(other).__name__))
        elif isinstance(other, (DateOffset, timedelta, np.timedelta64,
-                               tslib.Timedelta)):
+                               Timedelta)):
            return self._add_delta(other)
        elif is_integer(other):
            return self.shift(other)
-       elif isinstance(other, (tslib.Timestamp, datetime)):
+       elif isinstance(other, (Timestamp, datetime)):
            return self._add_datelike(other)
        else:  # pragma: no cover
            return NotImplemented
@@ -673,13 +675,13 @@ def __sub__(self, other):
                            .format(typ1=type(self).__name__,
                                    typ2=type(other).__name__))
        elif isinstance(other, (DateOffset, timedelta, np.timedelta64,
-                               tslib.Timedelta)):
+                               Timedelta)):
            return self._add_delta(-other)
        elif is_integer(other):
            return self.shift(-other)
-       elif isinstance(other, (tslib.Timestamp, datetime)):
+       elif isinstance(other, (Timestamp, datetime)):
            return self._sub_datelike(other)
-       elif isinstance(other, prlib.Period):
+       elif isinstance(other, Period):
            return self._sub_period(other)
        else:  # pragma: no cover
            return NotImplemented
@@ -699,11 +701,11 @@ def _add_delta_td(self, other):
        # add a delta of a timedeltalike
        # return the i8 result view

-       inc = tslib._delta_to_nanoseconds(other)
+       inc = libts._delta_to_nanoseconds(other)
        new_values = checked_add_with_arr(self.asi8, inc,
                                          arr_mask=self._isnan).view('i8')
        if self.hasnans:
-           new_values[self._isnan] = tslib.iNaT
+           new_values[self._isnan] = iNaT
        return new_values.view('i8')

    def _add_delta_tdi(self, other):
@@ -721,7 +723,7 @@ def _add_delta_tdi(self, other):
                                          b_mask=other._isnan)
        if self.hasnans or other.hasnans:
            mask = (self._isnan) | (other._isnan)
-           new_values[mask] = tslib.iNaT
+           new_values[mask] = iNaT
        return new_values.view(self.dtype)

    def isin(self, values):
@@ -849,7 +851,7 @@ def _append_same_dtype(self, to_concat, name):
 def _ensure_datetimelike_to_i8(other):
     """ helper for coercing an input scalar or array to i8 """
     if lib.isscalar(other) and isnull(other):
-        other = tslib.iNaT
+        other = iNaT
     elif isinstance(other, ABCIndexClass):
         # convert tz if needed
         if getattr(other, 'tz', None) is not None:
diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py
index 46e8bd43e8ff8..82fcdbcd0d367 100644
--- a/pandas/tseries/common.py
+++ b/pandas/tseries/common.py
@@ -13,10 +13,9 @@
 from pandas.core.base import PandasDelegate, NoNewAttributesMixin
 from pandas.tseries.index import DatetimeIndex
-from pandas._period import IncompatibleFrequency  # flake8: noqa
+from pandas._libs.period import IncompatibleFrequency  # flake8: noqa
 from pandas.tseries.period import PeriodIndex
 from pandas.tseries.tdi import TimedeltaIndex
-from pandas import tslib
 from pandas.core.algorithms import take_1d
diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py
index db7049ebc89b3..1f99e88ce86d6 100644
--- a/pandas/tseries/converter.py
+++ b/pandas/tseries/converter.py
@@ -20,7 +20,7 @@
 from pandas.compat import lrange
 import pandas.compat as compat

-import pandas.lib as lib
+import pandas._libs.lib as lib
 import pandas.core.common as com
 from pandas.core.index import Index
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
index 957a934d13f09..8013947babc5a 100644
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -17,9 +17,9 @@
 from pandas.tseries.offsets import DateOffset
 from pandas.util.decorators import cache_readonly, deprecate_kwarg
 import pandas.tseries.offsets as offsets
-import pandas.lib as lib
-import pandas.tslib as tslib
-from pandas.tslib import Timedelta
+
+from pandas._libs import lib, tslib
+from pandas._libs.tslib import Timedelta

 from pytz import AmbiguousTimeError
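The _convert_tolerance hunk in tseries/base.py above now takes Timedelta straight from pandas._libs. In public-API terms the coercion it performs boils down to the following sketch (the method merely wraps this in a ValueError re-raise):

    from pandas import Timedelta

    # Any Timedelta-convertible input becomes a fixed-width numpy scalar,
    # which is what tolerance-based lookups compare against.
    tol = Timedelta('1s').to_timedelta64()
    print(tol, tol.dtype)  # 1000000000 nanoseconds timedelta64[ns]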
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 5f00e8b648689..f80618ef34373 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -44,13 +44,9 @@
 import pandas.tseries.offsets as offsets
 import pandas.tseries.tools as tools

-from pandas.lib import Timestamp
-import pandas.lib as lib
-import pandas.tslib as tslib
-import pandas._period as period
-import pandas._join as _join
-import pandas.algos as _algos
-import pandas.index as _index
+from pandas._libs import (lib, index as libindex, tslib as libts,
+                          algos as libalgos, join as libjoin,
+                          Timestamp, period as libperiod)


 def _utc():
@@ -75,16 +71,16 @@ def f(self):
                         self.freq.kwds.get('month', 12))
                        if self.freq else 12)

-            result = tslib.get_start_end_field(values, field, self.freqstr,
+            result = libts.get_start_end_field(values, field, self.freqstr,
                                                month_kw)
         elif field in ['weekday_name']:
-            result = tslib.get_date_name_field(values, field)
+            result = libts.get_date_name_field(values, field)
             return self._maybe_mask_results(result)
         elif field in ['is_leap_year']:
             # no need to mask NaT
-            return tslib.get_date_field(values, field)
+            return libts.get_date_field(values, field)
         else:
-            result = tslib.get_date_field(values, field)
+            result = libts.get_date_field(values, field)

         return self._maybe_mask_results(result, convert='float64')
@@ -115,9 +111,9 @@ def wrapper(self, other):
             result = _values_from_object(result)

             if isinstance(other, Index):
-                o_mask = other.values.view('i8') == tslib.iNaT
+                o_mask = other.values.view('i8') == libts.iNaT
             else:
-                o_mask = other.view('i8') == tslib.iNaT
+                o_mask = other.view('i8') == libts.iNaT

             if o_mask.any():
                 result[o_mask] = nat_result
@@ -211,11 +207,11 @@ def _join_i8_wrapper(joinf, **kwargs):
         return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]',
                                                       **kwargs)

-    _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64)
-    _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64)
-    _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64)
+    _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64)
+    _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64)
+    _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64)
     _left_indexer_unique = _join_i8_wrapper(
-        _join.left_join_indexer_unique_int64, with_indexers=False)
+        libjoin.left_join_indexer_unique_int64, with_indexers=False)
     _arrmap = None

     __eq__ = _dt_index_cmp('__eq__')
@@ -225,7 +221,7 @@ def _join_i8_wrapper(joinf, **kwargs):
     __le__ = _dt_index_cmp('__le__')
     __ge__ = _dt_index_cmp('__ge__')

-    _engine_type = _index.DatetimeEngine
+    _engine_type = libindex.DatetimeEngine

     tz = None
     offset = None
@@ -340,7 +336,7 @@ def __new__(cls, data=None,
                 verify_integrity = False
             else:
                 if data.dtype != _NS_DTYPE:
-                    subarr = tslib.cast_to_nanoseconds(data)
+                    subarr = libts.cast_to_nanoseconds(data)
                 else:
                     subarr = data
         else:
@@ -356,13 +352,13 @@ def __new__(cls, data=None,
                 tz = subarr.tz
         else:
             if tz is not None:
-                tz = tslib.maybe_get_tz(tz)
+                tz = libts.maybe_get_tz(tz)

                 if (not isinstance(data, DatetimeIndex) or
                         getattr(data, 'tz', None) is None):
                     # Convert tz-naive to UTC
                     ints = subarr.view('i8')
-                    subarr = tslib.tz_localize_to_utc(ints, tz,
+                    subarr = libts.tz_localize_to_utc(ints, tz,
                                                       ambiguous=ambiguous)

                 subarr = subarr.view(_NS_DTYPE)
@@ -430,17 +426,17 @@ def _generate(cls, start, end, periods, name, offset,
                 raise TypeError('Start and end cannot both be tz-aware with '
                                 'different timezones')

-            inferred_tz = tslib.maybe_get_tz(inferred_tz)
+            inferred_tz = libts.maybe_get_tz(inferred_tz)

             # these may need to be localized
-            tz = tslib.maybe_get_tz(tz)
+            tz = libts.maybe_get_tz(tz)
             if tz is not None:
                 date = start or end
                 if date.tzinfo is not None and hasattr(tz, 'localize'):
                     tz = tz.localize(date.replace(tzinfo=None)).tzinfo

             if tz is not None and inferred_tz is not None:
-                if not tslib.get_timezone(inferred_tz) == tslib.get_timezone(tz):
+                if not libts.get_timezone(inferred_tz) == libts.get_timezone(tz):
                     raise AssertionError("Inferred time zone not equal to passed "
                                          "time zone")
@@ -507,7 +503,7 @@ def _generate(cls, start, end, periods, name, offset,
                 index = _generate_regular_range(start, end, periods, offset)

         if tz is not None and getattr(index, 'tz', None) is None:
-            index = tslib.tz_localize_to_utc(_ensure_int64(index), tz,
+            index = libts.tz_localize_to_utc(_ensure_int64(index), tz,
                                              ambiguous=ambiguous)
             index = index.view(_NS_DTYPE)
@@ -539,11 +535,11 @@ def _local_timestamps(self):
         utc = _utc()

         if self.is_monotonic:
-            return tslib.tz_convert(self.asi8, utc, self.tz)
+            return libts.tz_convert(self.asi8, utc, self.tz)
         else:
             values = self.asi8
             indexer = values.argsort()
-            result = tslib.tz_convert(values.take(indexer), utc, self.tz)
+            result = libts.tz_convert(values.take(indexer), utc, self.tz)

             n = len(indexer)
             reverse = np.empty(n, dtype=np.int_)
@@ -576,7 +572,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None,
         result._data = values
         result.name = name
         result.offset = freq
-        result.tz = tslib.maybe_get_tz(tz)
+        result.tz = libts.maybe_get_tz(tz)
         result._reset_identity()
         return result
@@ -590,7 +586,7 @@ def tzinfo(self):
     @cache_readonly
     def _timezone(self):
         """ Comparable timezone both for pytz / dateutil"""
-        return tslib.get_timezone(self.tzinfo)
+        return libts.get_timezone(self.tzinfo)

     def _has_same_tz(self, other):
         zzone = self._timezone
@@ -599,7 +595,7 @@ def _has_same_tz(self, other):
         if isinstance(other, np.datetime64):
             # convert to Timestamp as np.datetime64 doesn't have tz attr
             other = Timestamp(other)
-        vzone = tslib.get_timezone(getattr(other, 'tzinfo', '__no_tz__'))
+        vzone = libts.get_timezone(getattr(other, 'tzinfo', '__no_tz__'))
         return zzone == vzone

     @classmethod
@@ -671,7 +667,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,

     def _mpl_repr(self):
         # how to represent ourselves to matplotlib
-        return tslib.ints_to_pydatetime(self.asi8, self.tz)
+        return libts.ints_to_pydatetime(self.asi8, self.tz)

     @cache_readonly
     def _is_dates_only(self):
@@ -728,7 +724,7 @@ def __setstate__(self, state):

     def _add_datelike(self, other):
         # adding a timedeltaindex to a datetimelike
-        if other is tslib.NaT:
+        if other is libts.NaT:
             return self._nat_new(box=True)
         raise TypeError("cannot add a datelike to a DatetimeIndex")
@@ -741,9 +737,9 @@ def _sub_datelike(self, other):
                 raise TypeError("DatetimeIndex subtraction must have the same "
                                 "timezones or no timezones")
             result = self._sub_datelike_dti(other)
-        elif isinstance(other, (tslib.Timestamp, datetime)):
+        elif isinstance(other, (libts.Timestamp, datetime)):
             other = Timestamp(other)
-            if other is tslib.NaT:
+            if other is libts.NaT:
                 result = self._nat_new(box=False)
             # require tz compat
             elif not self._has_same_tz(other):
@@ -753,7 +749,7 @@ def _sub_datelike(self, other):
                 i8 = self.asi8
                 result = i8 - other.value
                 result = self._maybe_mask_results(result,
-                                                  fill_value=tslib.iNaT)
+                                                  fill_value=libts.iNaT)
         else:
             raise TypeError("cannot subtract DatetimeIndex and {typ}"
                             .format(typ=type(other).__name__))
@@ -769,7 +765,7 @@ def _sub_datelike_dti(self, other):
         new_values = self_i8 - other_i8
         if self.hasnans or other.hasnans:
             mask = (self._isnan) | (other._isnan)
-            new_values[mask] = tslib.iNaT
+            new_values[mask] = libts.iNaT
         return new_values.view('i8')

     def _maybe_update_attributes(self, attrs):
@@ -822,7 +818,7 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
         from pandas.formats.format import _get_format_datetime64_from_values
         format = _get_format_datetime64_from_values(self, date_format)

-        return tslib.format_array_from_datetime(self.asi8,
+        return libts.format_array_from_datetime(self.asi8,
                                                 tz=self.tz,
                                                 format=format,
                                                 na_rep=na_rep)
@@ -855,7 +851,7 @@ def _get_time_micros(self):
         values = self.asi8
         if self.tz is not None and self.tz is not utc:
             values = self._local_timestamps()
-        return tslib.get_time_micros(values)
+        return libts.get_time_micros(values)

     def to_series(self, keep_tz=False):
         """
@@ -908,7 +904,7 @@ def to_pydatetime(self):
         -------
         datetimes : ndarray
         """
-        return tslib.ints_to_pydatetime(self.asi8, tz=self.tz)
+        return libts.ints_to_pydatetime(self.asi8, tz=self.tz)

     def to_period(self, freq=None):
         """
@@ -1160,7 +1156,7 @@ def __iter__(self):
         for i in range(chunks):
             start_i = i * chunksize
             end_i = min((i + 1) * chunksize, l)
-            converted = tslib.ints_to_pydatetime(data[start_i:end_i],
+            converted = libts.ints_to_pydatetime(data[start_i:end_i],
                                                  tz=self.tz, freq=self.freq,
                                                  box=True)
             for v in converted:
@@ -1248,14 +1244,14 @@ def _parsed_string_to_bounds(self, reso, parsed):
                     Timestamp(datetime(parsed.year, 12, 31, 23,
                                        59, 59, 999999), tz=self.tz))
         elif reso == 'month':
-            d = tslib.monthrange(parsed.year, parsed.month)[1]
+            d = libts.monthrange(parsed.year, parsed.month)[1]
             return (Timestamp(datetime(parsed.year, parsed.month, 1),
                               tz=self.tz),
                     Timestamp(datetime(parsed.year, parsed.month, d, 23,
                                        59, 59, 999999), tz=self.tz))
         elif reso == 'quarter':
             qe = (((parsed.month - 1) + 2) % 12) + 1  # two months ahead
-            d = tslib.monthrange(parsed.year, qe)[1]   # at end of month
+            d = libts.monthrange(parsed.year, qe)[1]   # at end of month
             return (Timestamp(datetime(parsed.year, parsed.month, 1),
                               tz=self.tz),
                     Timestamp(datetime(parsed.year, qe, d, 23, 59,
@@ -1594,9 +1590,9 @@ def time(self):
         """
         Returns numpy array of datetime.time. The time part of the Timestamps.
         """
-        return self._maybe_mask_results(_algos.arrmap_object(
+        return self._maybe_mask_results(libalgos.arrmap_object(
             self.asobject.values,
-            lambda x: np.nan if x is tslib.NaT else x.time()))
+            lambda x: np.nan if x is libts.NaT else x.time()))

     @property
     def date(self):
@@ -1604,7 +1600,7 @@ def date(self):
         Returns numpy array of python datetime.date objects (namely, the date
         part of Timestamps without timezone information).
         """
-        return self._maybe_mask_results(_algos.arrmap_object(
+        return self._maybe_mask_results(libalgos.arrmap_object(
             self.asobject.values, lambda x: x.date()))

     def normalize(self):
@@ -1615,7 +1611,7 @@ def normalize(self):
         -------
         normalized : DatetimeIndex
         """
-        new_values = tslib.date_normalize(self.asi8, self.tz)
+        new_values = libts.date_normalize(self.asi8, self.tz)
         return DatetimeIndex(new_values, freq='infer', name=self.name,
                              tz=self.tz)
@@ -1654,11 +1650,11 @@ def is_normalized(self):
         """
         Returns True if all of the dates are at midnight ("no time")
         """
-        return tslib.dates_normalized(self.asi8, self.tz)
+        return libts.dates_normalized(self.asi8, self.tz)

     @cache_readonly
     def _resolution(self):
-        return period.resolution(self.asi8, self.tz)
+        return libperiod.resolution(self.asi8, self.tz)

     def insert(self, loc, item):
         """
@@ -1695,7 +1691,7 @@ def insert(self, loc, item):
                 new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
                                             self[loc:].asi8))
                 if self.tz is not None:
-                    new_dates = tslib.tz_convert(new_dates, 'UTC', self.tz)
+                    new_dates = libts.tz_convert(new_dates, 'UTC', self.tz)
                 return DatetimeIndex(new_dates, name=self.name, freq=freq,
                                      tz=self.tz)
@@ -1735,7 +1731,7 @@ def delete(self, loc):
             freq = self.freq

         if self.tz is not None:
-            new_dates = tslib.tz_convert(new_dates, 'UTC', self.tz)
+            new_dates = libts.tz_convert(new_dates, 'UTC', self.tz)
         return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz)

     def tz_convert(self, tz):
@@ -1759,7 +1755,7 @@ def tz_convert(self, tz):
         TypeError
             If DatetimeIndex is tz-naive.
         """
-        tz = tslib.maybe_get_tz(tz)
+        tz = libts.maybe_get_tz(tz)

         if self.tz is None:
             # tz naive, use tz_localize
@@ -1814,14 +1810,14 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'):
         """
         if self.tz is not None:
             if tz is None:
-                new_dates = tslib.tz_convert(self.asi8, 'UTC', self.tz)
+                new_dates = libts.tz_convert(self.asi8, 'UTC', self.tz)
             else:
                 raise TypeError("Already tz-aware, use tz_convert to convert.")
         else:
-            tz = tslib.maybe_get_tz(tz)
+            tz = libts.maybe_get_tz(tz)
             # Convert to UTC
-            new_dates = tslib.tz_localize_to_utc(self.asi8, tz,
+            new_dates = libts.tz_localize_to_utc(self.asi8, tz,
                                                  ambiguous=ambiguous,
                                                  errors=errors)
         new_dates = new_dates.view(_NS_DTYPE)
@@ -2134,7 +2130,7 @@ def _to_m8(key, tz=None):
         # this also converts strings
         key = Timestamp(key, tz=tz)

-    return np.int64(tslib.pydt_to_i8(key)).view(_NS_DTYPE)
+    return np.int64(libts.pydt_to_i8(key)).view(_NS_DTYPE)


 _CACHE_START = Timestamp(datetime(1950, 1, 1))
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index 79227f6de90a5..2b6a684fc39dd 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -10,8 +10,7 @@
 # import after tools, dateutil check
 from dateutil.relativedelta import relativedelta, weekday
 from dateutil.easter import easter
-import pandas.tslib as tslib
-from pandas.tslib import Timestamp, OutOfBoundsDatetime, Timedelta
+from pandas._libs import tslib, Timestamp, OutOfBoundsDatetime, Timedelta

 import functools
 import operator
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
index bfe7724a1cfaa..f7e9ba9eaa9b1 100644
--- a/pandas/tseries/period.py
+++ b/pandas/tseries/period.py
@@ -29,10 +29,11 @@
 from pandas.tseries.tools import parse_time_string
 import pandas.tseries.offsets as offsets

-import pandas._period as period
-from pandas._period import (Period, IncompatibleFrequency,
-                            get_period_field_arr, _validate_end_alias,
-                            _quarter_to_myear)
+from pandas._libs.lib import infer_dtype
+from pandas._libs import tslib, period
+from pandas._libs.period import (Period, IncompatibleFrequency,
+                                 get_period_field_arr, _validate_end_alias,
+                                 _quarter_to_myear)

 from pandas.core.base import _shared_docs
 from pandas.indexes.base import _index_shared_docs, _ensure_index
@@ -40,9 +41,8 @@
 from pandas import compat
 from pandas.util.decorators import (Appender, Substitution, cache_readonly,
                                     deprecate_kwarg)
-from pandas.lib import infer_dtype
-import pandas.tslib as tslib
 from pandas.compat import zip, u
+
 import pandas.indexes.base as ibase

 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
 _index_doc_kwargs.update(
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
index 21d7dc0c177b6..2856b54ad9a8c 100755
--- a/pandas/tseries/resample.py
+++ b/pandas/tseries/resample.py
@@ -20,10 +20,9 @@
 import pandas.compat as compat
 from pandas.compat.numpy import function as nv

-from pandas.lib import Timestamp
-from pandas._period import IncompatibleFrequency
-import pandas.lib as lib
-import pandas.tslib as tslib
+from pandas._libs import lib, tslib
+from pandas._libs.lib import Timestamp
+from pandas._libs.period import IncompatibleFrequency

 from pandas.util.decorators import Appender
 from pandas.core.generic import _shared_docs
diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py
index c62e3fc40d4af..f47d80a31b174 100644
--- a/pandas/tseries/tdi.py
+++ b/pandas/tseries/tdi.py
@@ -30,13 +30,8 @@
 from pandas.tseries.timedeltas import (to_timedelta,
                                        _coerce_scalar_to_timedelta_type)
 from pandas.tseries.offsets import Tick, DateOffset
-
-import pandas.lib as lib
-import pandas.tslib as tslib
-import pandas._join as _join
-import pandas.index as _index
-
-Timedelta = tslib.Timedelta
+from pandas._libs import (lib, index as libindex, tslib as libts,
+                          join as libjoin, Timedelta, NaT, iNaT)


 def _td_index_cmp(opname, nat_result=False):
@@ -47,7 +42,7 @@ def _td_index_cmp(opname, nat_result=False):
     def wrapper(self, other):
         msg = "cannot compare a TimedeltaIndex with type {0}"
         func = getattr(super(TimedeltaIndex, self), opname)
-        if _is_convertible_to_td(other) or other is tslib.NaT:
+        if _is_convertible_to_td(other) or other is NaT:
             try:
                 other = _to_m8(other)
             except ValueError:
@@ -65,9 +60,9 @@ def wrapper(self, other):
             result = _values_from_object(result)

             if isinstance(other, Index):
-                o_mask = other.values.view('i8') == tslib.iNaT
+                o_mask = other.values.view('i8') == iNaT
             else:
-                o_mask = other.view('i8') == tslib.iNaT
+                o_mask = other.view('i8') == iNaT

             if o_mask.any():
                 result[o_mask] = nat_result
@@ -126,11 +121,11 @@ def _join_i8_wrapper(joinf, **kwargs):
         return DatetimeIndexOpsMixin._join_i8_wrapper(
             joinf, dtype='m8[ns]', **kwargs)

-    _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64)
-    _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64)
-    _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64)
+    _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64)
+    _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64)
+    _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64)
     _left_indexer_unique = _join_i8_wrapper(
-        _join.left_join_indexer_unique_int64, with_indexers=False)
+        libjoin.left_join_indexer_unique_int64, with_indexers=False)
     _arrmap = None
     _datetimelike_ops = ['days', 'seconds', 'microseconds', 'nanoseconds',
                          'freq', 'components']
@@ -142,7 +137,7 @@ def _join_i8_wrapper(joinf, **kwargs):
     __le__ = _td_index_cmp('__le__')
     __ge__ = _td_index_cmp('__ge__')

-    _engine_type = _index.TimedeltaEngine
+    _engine_type = libindex.TimedeltaEngine

     _comparables = ['name', 'freq']
     _attributes = ['name', 'freq']
@@ -274,7 +269,7 @@ def _box_func(self):
     def _simple_new(cls, values, name=None, freq=None, **kwargs):
         values = np.array(values, copy=False)
         if values.dtype == np.object_:
-            values = tslib.array_to_timedelta64(values)
+            values = libts.array_to_timedelta64(values)
         if values.dtype != _TD_DTYPE:
             values = _ensure_int64(values).view(_TD_DTYPE)
@@ -341,18 +336,18 @@ def _evaluate_with_timedelta_like(self, other, op, opstr):

     def _add_datelike(self, other):
         # adding a timedeltaindex to a datetimelike
         from pandas import Timestamp, DatetimeIndex
-        if other is tslib.NaT:
+        if other is NaT:
             result = self._nat_new(box=False)
         else:
             other = Timestamp(other)
             i8 = self.asi8
             result = checked_add_with_arr(i8, other.value)
-            result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
+            result = self._maybe_mask_results(result, fill_value=iNaT)
         return DatetimeIndex(result, name=self.name, copy=False)

     def _sub_datelike(self, other):
         from pandas import DatetimeIndex
-        if other is tslib.NaT:
+        if other is NaT:
             result = self._nat_new(box=False)
         else:
             raise TypeError("cannot subtract a datelike from a TimedeltaIndex")
@@ -452,7 +447,7 @@ def to_pytimedelta(self):
         -------
         datetimes : ndarray
         """
-        return tslib.ints_to_pytimedelta(self.asi8)
+        return libts.ints_to_pytimedelta(self.asi8)

     @Appender(_index_shared_docs['astype'])
     def astype(self, dtype, copy=True):
@@ -677,7 +672,7 @@ def get_loc(self, key, method=None, tolerance=None):
             raise TypeError

         if isnull(key):
-            key = tslib.NaT
+            key = NaT

         if tolerance is not None:
             # try converting tolerance now, so errors don't get swallowed by
@@ -736,7 +731,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):

     def _get_string_slice(self, key, use_lhs=True, use_rhs=True):
         freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None))
-        if is_integer(key) or is_float(key) or key is tslib.NaT:
+        if is_integer(key) or is_float(key) or key is NaT:
             self._invalid_indexer('slice', key)
         loc = self._partial_td_slice(key, freq, use_lhs=use_lhs,
                                      use_rhs=use_rhs)
@@ -837,7 +832,7 @@ def insert(self, loc, item):
                 pass

         freq = None
-        if isinstance(item, (Timedelta, tslib.NaTType)):
+        if isinstance(item, (Timedelta, libts.NaTType)):

             # check freq can be preserved on edge cases
             if self.freq is not None:
diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py
index 5a5d1533bfa91..ead602ee80e32 100644
--- a/pandas/tseries/timedeltas.py
+++ b/pandas/tseries/timedeltas.py
@@ -4,7 +4,7 @@

 import numpy as np
 import pandas as pd
-import pandas.tslib as tslib
+import pandas._libs.tslib as tslib

 from pandas.types.common import (_ensure_object,
                                  is_integer_dtype,
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index f746409aadfc9..093331e861fa7 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -2,8 +2,7 @@
 import numpy as np
 from collections import MutableMapping

-import pandas.lib as lib
-import pandas.tslib as tslib
+from pandas._libs import lib, tslib

 from pandas.types.common import (_ensure_object,
                                  is_datetime64_ns_dtype,
diff --git a/pandas/tslib.py b/pandas/tslib.py
new file mode 100644
index 0000000000000..3ecbffa20700d
--- /dev/null
+++ b/pandas/tslib.py
@@ -0,0 +1,8 @@
+# flake8: noqa
+
+import warnings
+warnings.warn("The pandas.tslib module is deprecated and will be "
+              "removed in a future version. Please import from "
+              "the pandas._libs.tslib instead", FutureWarning, stacklevel=2)
+from pandas._libs.tslib import (Timestamp, Timedelta,
+                                NaT, OutOfBoundsDatetime)
diff --git a/pandas/types/cast.py b/pandas/types/cast.py
index 8cc3fe41f73c8..1cd55274b9b49 100644
--- a/pandas/types/cast.py
+++ b/pandas/types/cast.py
@@ -2,8 +2,8 @@

 from datetime import datetime, timedelta
 import numpy as np
-from pandas import lib, tslib
-from pandas.tslib import iNaT
+from pandas._libs import tslib, lib
+from pandas._libs.tslib import iNaT
 from pandas.compat import string_types, text_type, PY3
 from .common import (_ensure_object, is_bool,
                      is_integer, is_float,
                      is_complex, is_datetimetz, is_categorical_dtype,
@@ -807,14 +807,14 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):
                                 "dtype [%s]" % dtype)

             if is_scalar(value):
-                if value == tslib.iNaT or isnull(value):
-                    value = tslib.iNaT
+                if value == iNaT or isnull(value):
+                    value = iNaT
             else:
                 value = np.array(value, copy=False)

                 # have a scalar array-like (e.g.
NaT)
             if value.ndim == 0:
-                value = tslib.iNaT
+                value = iNaT
 
             # we have an array of datetime or timedeltas & nulls
             elif np.prod(value.shape) or not is_dtype_equal(value.dtype,
diff --git a/pandas/types/common.py b/pandas/types/common.py
index e58e0826ea49a..1be5b5f6f1368 100644
--- a/pandas/types/common.py
+++ b/pandas/types/common.py
@@ -3,7 +3,7 @@
 import numpy as np
 from pandas.compat import (string_types, text_type, binary_type, PY3, PY36)
 
-from pandas import lib, algos
+from pandas._libs import algos, lib
 from .dtypes import (CategoricalDtype, CategoricalDtypeType,
                      DatetimeTZDtype, DatetimeTZDtypeType,
                      PeriodDtype, PeriodDtypeType,
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index 9e47a97dd621a..b098bbb75d984 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -3,7 +3,7 @@
 """
 
 import numpy as np
-import pandas.tslib as tslib
+import pandas._libs.tslib as tslib
 from pandas import compat
 from pandas.core.algorithms import take_1d
 from .common import (is_categorical_dtype,
diff --git a/pandas/types/inference.py b/pandas/types/inference.py
index d2a2924b27659..d8e3b3ee7329b 100644
--- a/pandas/types/inference.py
+++ b/pandas/types/inference.py
@@ -6,7 +6,7 @@
 from numbers import Number
 from pandas.compat import (string_types, text_type,
                            string_and_binary_types)
-from pandas import lib
+from pandas._libs import lib
 
 is_bool = lib.is_bool
 
diff --git a/pandas/types/missing.py b/pandas/types/missing.py
index e6791b79bf3bd..cc8b5edc27542 100644
--- a/pandas/types/missing.py
+++ b/pandas/types/missing.py
@@ -2,8 +2,8 @@
 missing types & inference
 """
 import numpy as np
-from pandas import lib
-from pandas.tslib import NaT, iNaT
+from pandas._libs import lib
+from pandas._libs.tslib import NaT, iNaT
 from .generic import (ABCMultiIndex, ABCSeries,
                       ABCIndexClass, ABCGeneric)
 from .common import (is_string_dtype, is_datetimelike,
diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py
index 62ff6ef14418a..4e1719958e8b7 100644
--- a/pandas/util/decorators.py
+++ b/pandas/util/decorators.py
@@ -1,5 +1,5 @@
 from pandas.compat import StringIO, callable, signature
-from pandas.lib import cache_readonly  # noqa
+from pandas._libs.lib import cache_readonly  # noqa
 import types
 import sys
 import warnings
diff --git a/pandas/util/depr_module.py b/pandas/util/depr_module.py
index cf8b0f7960f17..b181c4627b1e1 100644
--- a/pandas/util/depr_module.py
+++ b/pandas/util/depr_module.py
@@ -13,12 +13,15 @@ class _DeprecatedModule(object):
 
     Parameters
     ----------
     deprmod : name of module to be deprecated.
+    deprmodto : name of module as a replacement, optional;
+        if not given, the ``__module__`` of the accessed object is used
     removals : objects or methods in module that will no longer be
                accessible once module is removed.
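The new ``deprmodto`` argument lets the shim point users at a module's new location instead of deprecating it outright. A hypothetical wiring, mirroring how ``pandas/__init__.py`` is expected to use the class (these call sites are not part of this hunk):

    from pandas.util.depr_module import _DeprecatedModule

    # proxy the old name onto its new home; attribute access emits a
    # FutureWarning naming pandas._libs.tslib as the replacement
    tslib = _DeprecatedModule(deprmod='pandas.tslib',
                              deprmodto='pandas._libs.tslib')

    ts = tslib.Timestamp('2017-03-08')  # warns, then returns the Timestamp
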
""" - def __init__(self, deprmod, removals=None): + def __init__(self, deprmod, deprmodto=None, removals=None): self.deprmod = deprmod + self.deprmodto = deprmodto self.removals = removals if self.removals is not None: self.removals = frozenset(self.removals) @@ -40,7 +43,15 @@ def __getattr__(self, name): if name in self.self_dir: return object.__getattribute__(self, name) - deprmodule = self._import_deprmod() + try: + deprmodule = self._import_deprmod(self.deprmod) + except ImportError: + if self.deprmodto is None: + raise + + # a rename + deprmodule = self._import_deprmod(self.deprmodto) + obj = getattr(deprmodule, name) if self.removals is not None and name in self.removals: @@ -49,17 +60,24 @@ def __getattr__(self, name): "a future version.".format(deprmod=self.deprmod, name=name), FutureWarning, stacklevel=2) else: + deprmodto = self.deprmodto + if deprmodto is None: + deprmodto = "{modname}.{name}".format( + modname=obj.__module__, name=name) # The object is actually located in another module. warnings.warn( "{deprmod}.{name} is deprecated. Please use " - "{modname}.{name} instead.".format( - deprmod=self.deprmod, modname=obj.__module__, name=name), + "{deprmodto}.{name} instead.".format( + deprmod=self.deprmod, name=name, deprmodto=deprmodto), FutureWarning, stacklevel=2) return obj - def _import_deprmod(self): + def _import_deprmod(self, mod=None): + if mod is None: + mod = self.deprmod + with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=FutureWarning) - deprmodule = importlib.import_module(self.deprmod) + deprmodule = importlib.import_module(mod) return deprmodule diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c5e5df9037daa..b68bf55a347b2 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -47,7 +47,7 @@ TimedeltaIndex, PeriodIndex, RangeIndex, Index, MultiIndex, Series, DataFrame, Panel, Panel4D) from pandas.util.decorators import deprecate -from pandas import _testing +from pandas.util import libtesting from pandas.io.common import urlopen slow = pytest.mark.slow @@ -173,7 +173,7 @@ def assert_almost_equal(left, right, check_exact=False, else: obj = 'Input' assert_class_equal(left, right, obj=obj) - return _testing.assert_almost_equal( + return libtesting.assert_almost_equal( left, right, check_dtype=check_dtype, check_less_precise=check_less_precise, @@ -185,7 +185,7 @@ def assert_dict_equal(left, right, compare_keys=True): assertIsInstance(left, dict, '[dict] ') assertIsInstance(right, dict, '[dict] ') - return _testing.assert_dict_equal(left, right, compare_keys=compare_keys) + return libtesting.assert_dict_equal(left, right, compare_keys=compare_keys) def randbool(size=(), p=0.5): @@ -833,10 +833,10 @@ def _get_ilevel_values(index, level): .format(obj, np.round(diff, 5)) raise_assert_detail(obj, msg, left, right) else: - _testing.assert_almost_equal(left.values, right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, lobj=left, robj=right) + libtesting.assert_almost_equal(left.values, right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, lobj=left, robj=right) # metadata comparison if check_names: @@ -1213,10 +1213,10 @@ def assert_series_equal(left, right, check_dtype=True, assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) else: - _testing.assert_almost_equal(left.get_values(), right.get_values(), - check_less_precise=check_less_precise, - check_dtype=check_dtype, - obj='{0}'.format(obj)) + 
libtesting.assert_almost_equal(left.get_values(), right.get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj='{0}'.format(obj)) # metadata comparison if check_names: @@ -1432,8 +1432,10 @@ def assert_sp_array_equal(left, right, check_dtype=True): check_dtype=check_dtype) # SparseIndex comparison - assertIsInstance(left.sp_index, pd._sparse.SparseIndex, '[SparseIndex]') - assertIsInstance(right.sp_index, pd._sparse.SparseIndex, '[SparseIndex]') + assertIsInstance(left.sp_index, + pd.sparse.libsparse.SparseIndex, '[SparseIndex]') + assertIsInstance(right.sp_index, + pd.sparse.libsparse.SparseIndex, '[SparseIndex]') if not left.sp_index.equals(right.sp_index): raise_assert_detail('SparseArray.index', 'index are not equal', diff --git a/pandas/src/testing.pyx b/pandas/util/testing.pyx similarity index 100% rename from pandas/src/testing.pyx rename to pandas/util/testing.pyx diff --git a/scripts/bench_join.py b/scripts/bench_join.py index 1ce5c94130e85..f9d43772766d8 100644 --- a/scripts/bench_join.py +++ b/scripts/bench_join.py @@ -1,6 +1,6 @@ from pandas.compat import range, lrange import numpy as np -import pandas.lib as lib +import pandas._libs.lib as lib from pandas import * from copy import deepcopy import time diff --git a/scripts/bench_join_multi.py b/scripts/bench_join_multi.py index 7b93112b7f869..b19da6a2c47d8 100644 --- a/scripts/bench_join_multi.py +++ b/scripts/bench_join_multi.py @@ -3,7 +3,7 @@ import numpy as np from pandas.compat import zip, range, lzip from pandas.util.testing import rands -import pandas.lib as lib +import pandas._libs.lib as lib N = 100000 diff --git a/scripts/groupby_test.py b/scripts/groupby_test.py index 5acf7da7534a3..f640a6ed79503 100644 --- a/scripts/groupby_test.py +++ b/scripts/groupby_test.py @@ -5,7 +5,7 @@ from pandas import * -import pandas.lib as tseries +import pandas._libs.lib as tseries import pandas.core.groupby as gp import pandas.util.testing as tm from pandas.compat import range diff --git a/scripts/roll_median_leak.py b/scripts/roll_median_leak.py index 07161cc6499bf..03f39e2b18372 100644 --- a/scripts/roll_median_leak.py +++ b/scripts/roll_median_leak.py @@ -7,7 +7,7 @@ from vbench.api import Benchmark from pandas.util.testing import rands from pandas.compat import range -import pandas.lib as lib +import pandas._libs.lib as lib import pandas._sandbox as sbx import time diff --git a/setup.py b/setup.py index 525cbdf600c78..e257b2376060b 100755 --- a/setup.py +++ b/setup.py @@ -109,21 +109,21 @@ def is_platform_mac(): from os.path import join as pjoin -_pxipath = pjoin('pandas', 'src') _pxi_dep_template = { - 'algos': ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_take_helper.pxi.in', 'algos_rank_helper.pxi.in'], - '_reshape': ['reshape_helper.pxi.in'], - '_join': ['join_helper.pxi.in', 'joins_func_helper.pxi.in'], - 'hashtable': ['hashtable_class_helper.pxi.in', - 'hashtable_func_helper.pxi.in'], - 'index': ['index_class_helper.pxi.in'], - '_sparse': ['sparse_op_helper.pxi.in'] + 'algos': ['_libs/algos_common_helper.pxi.in', '_libs/algos_groupby_helper.pxi.in', + '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'], + 'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'], + 'reshape': ['_libs/reshape_helper.pxi.in'], + 'hashtable': ['_libs/hashtable_class_helper.pxi.in', + '_libs/hashtable_func_helper.pxi.in'], + 'index': ['_libs/index_class_helper.pxi.in'], + 'sparse': ['sparse/sparse_op_helper.pxi.in'], } + _pxifiles = [] _pxi_dep = {} for module, 
files in _pxi_dep_template.items(): - pxi_files = [pjoin(_pxipath, x) for x in files] + pxi_files = [pjoin('pandas', x) for x in files] _pxifiles.extend(pxi_files) _pxi_dep[module] = pxi_files @@ -261,7 +261,7 @@ def initialize_options(self): self._clean_me = [] self._clean_trees = [] - base = pjoin('pandas','src') + base = pjoin('pandas','_libs', 'src') dt = pjoin(base,'datetime') src = base util = pjoin('pandas','util') @@ -327,19 +327,19 @@ def run(self): class CheckSDist(sdist_class): """Custom sdist that ensures Cython has compiled all pyx files to c.""" - _pyxfiles = ['pandas/lib.pyx', - 'pandas/hashtable.pyx', - 'pandas/tslib.pyx', - 'pandas/index.pyx', - 'pandas/algos.pyx', - 'pandas/join.pyx', - 'pandas/window.pyx', - 'pandas/parser.pyx', - 'pandas/src/period.pyx', - 'pandas/src/sparse.pyx', - 'pandas/src/testing.pyx', - 'pandas/src/hash.pyx', - 'pandas/io/sas/saslib.pyx'] + _pyxfiles = ['pandas/_libs/lib.pyx', + 'pandas/_libs/hashtable.pyx', + 'pandas/_libs/tslib.pyx', + 'pandas/_libs/period.pyx', + 'pandas/_libs/index.pyx', + 'pandas/_libs/algos.pyx', + 'pandas/_libs/join.pyx', + 'pandas/core/window.pyx', + 'pandas/sparse/sparse.pyx', + 'pandas/util/testing.pyx', + 'pandas/tools/hash.pyx', + 'pandas/io/parsers.pyx', + 'pandas/io/sas/sas.pyx'] def initialize_options(self): sdist_class.initialize_options(self) @@ -374,6 +374,7 @@ def check_cython_extensions(self, extensions): for ext in extensions: for src in ext.sources: if not os.path.exists(src): + print("{}: -> [{}]".format(ext.name, ext.sources)) raise Exception("""Cython-generated file '%s' not found. Cython is required to compile pandas from a development branch. Please install Cython or download a release package of pandas. @@ -440,12 +441,12 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): if suffix == '.pyx': lib_depends = [srcpath(f, suffix='.pyx') for f in lib_depends] - lib_depends.append('pandas/src/util.pxd') + lib_depends.append('pandas/_libs/src/util.pxd') else: lib_depends = [] plib_depends = [] -common_include = ['pandas/src/klib', 'pandas/src'] +common_include = ['pandas/_libs/src/klib', 'pandas/_libs/src'] def pxd(name): @@ -457,71 +458,70 @@ def pxd(name): else: extra_compile_args=['-Wno-unused-function'] -lib_depends = lib_depends + ['pandas/src/numpy_helper.h', - 'pandas/src/parse_helper.h'] +lib_depends = lib_depends + ['pandas/_libs/src/numpy_helper.h', + 'pandas/_libs/src/parse_helper.h'] -tseries_depends = ['pandas/src/datetime/np_datetime.h', - 'pandas/src/datetime/np_datetime_strings.h', - 'pandas/src/datetime_helper.h', - 'pandas/src/period_helper.h', - 'pandas/src/datetime.pxd'] +tseries_depends = ['pandas/_libs/src/datetime/np_datetime.h', + 'pandas/_libs/src/datetime/np_datetime_strings.h', + 'pandas/_libs/src/datetime_helper.h', + 'pandas/_libs/src/period_helper.h', + 'pandas/_libs/src/datetime.pxd'] # some linux distros require it libraries = ['m'] if not is_platform_windows() else [] -ext_data = dict( - lib={'pyxfile': 'lib', - 'pxdfiles': [], - 'depends': lib_depends}, - hashtable={'pyxfile': 'hashtable', - 'pxdfiles': ['hashtable'], - 'depends': (['pandas/src/klib/khash_python.h'] - + _pxi_dep['hashtable'])}, - tslib={'pyxfile': 'tslib', - 'depends': tseries_depends, - 'sources': ['pandas/src/datetime/np_datetime.c', - 'pandas/src/datetime/np_datetime_strings.c', - 'pandas/src/period_helper.c']}, - _period={'pyxfile': 'src/period', - 'depends': tseries_depends, - 'sources': ['pandas/src/datetime/np_datetime.c', - 'pandas/src/datetime/np_datetime_strings.c', - 
'pandas/src/period_helper.c']}, - index={'pyxfile': 'index', - 'sources': ['pandas/src/datetime/np_datetime.c', - 'pandas/src/datetime/np_datetime_strings.c'], - 'pxdfiles': ['src/util', 'hashtable'], - 'depends': _pxi_dep['index']}, - algos={'pyxfile': 'algos', - 'pxdfiles': ['src/util', 'hashtable'], - 'depends': _pxi_dep['algos']}, - _reshape={'pyxfile': 'src/reshape', - 'depends': _pxi_dep['_reshape']}, - _join={'pyxfile': 'src/join', - 'pxdfiles': ['src/util', 'hashtable'], - 'depends': _pxi_dep['_join']}, - _window={'pyxfile': 'window', - 'pxdfiles': ['src/skiplist', 'src/util'], - 'depends': ['pandas/src/skiplist.pyx', - 'pandas/src/skiplist.h']}, - parser={'pyxfile': 'parser', - 'depends': ['pandas/src/parser/tokenizer.h', - 'pandas/src/parser/io.h', - 'pandas/src/numpy_helper.h'], - 'sources': ['pandas/src/parser/tokenizer.c', - 'pandas/src/parser/io.c']}, - _sparse={'pyxfile': 'src/sparse', - 'depends': ([srcpath('sparse', suffix='.pyx')] + - _pxi_dep['_sparse'])}, - _testing={'pyxfile': 'src/testing', - 'depends': [srcpath('testing', suffix='.pyx')]}, - _hash={'pyxfile': 'src/hash', - 'depends': [srcpath('hash', suffix='.pyx')]}, -) - -ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'} +ext_data = { + '_libs.lib': {'pyxfile': '_libs/lib', + 'pxdfiles': [], + 'depends': lib_depends}, + '_libs.hashtable': {'pyxfile': '_libs/hashtable', + 'pxdfiles': ['_libs/hashtable'], + 'depends': (['pandas/_libs/src/klib/khash_python.h'] + + _pxi_dep['hashtable'])}, + '_libs.tslib': {'pyxfile': '_libs/tslib', + 'depends': tseries_depends, + 'sources': ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c', + 'pandas/_libs/src/period_helper.c']}, + '_libs.period': {'pyxfile': '_libs/period', + 'depends': tseries_depends, + 'sources': ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c', + 'pandas/_libs/src/period_helper.c']}, + '_libs.index': {'pyxfile': '_libs/index', + 'sources': ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c'], + 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'depends': _pxi_dep['index']}, + '_libs.algos': {'pyxfile': '_libs/algos', + 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'depends': _pxi_dep['algos']}, + '_libs.join': {'pyxfile': '_libs/join', + 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'depends': _pxi_dep['join']}, + '_libs.reshape': {'pyxfile': '_libs/reshape', + 'depends': _pxi_dep['reshape']}, + 'core.libwindow': {'pyxfile': 'core/window', + 'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'], + 'depends': ['pandas/_libs/src/skiplist.pyx', + 'pandas/_libs/src/skiplist.h']}, + 'io.libparsers': {'pyxfile': 'io/parsers', + 'depends': ['pandas/_libs/src/parser/tokenizer.h', + 'pandas/_libs/src/parser/io.h', + 'pandas/_libs/src/numpy_helper.h'], + 'sources': ['pandas/_libs/src/parser/tokenizer.c', + 'pandas/_libs/src/parser/io.c']}, + 'sparse.libsparse': {'pyxfile': 'sparse/sparse', + 'depends': (['pandas/sparse/sparse.pyx'] + + _pxi_dep['sparse'])}, + 'util.libtesting': {'pyxfile': 'util/testing', + 'depends': ['pandas/util/testing.pyx']}, + 'tools.libhashing': {'pyxfile': 'tools/hashing', + 'depends': ['pandas/tools/hashing.pyx']}, + 'io.sas.libsas': {'pyxfile': 'io/sas/sas'}, + } extensions = [] @@ -552,25 +552,25 @@ def pxd(name): else: macros = [('__LITTLE_ENDIAN__', '1')] -packer_ext = Extension('pandas.msgpack._packer', - depends=['pandas/src/msgpack/pack.h', - 'pandas/src/msgpack/pack_template.h'], 
+packer_ext = Extension('pandas.io.msgpack._packer', + depends=['pandas/_libs/src/msgpack/pack.h', + 'pandas/_libs/src/msgpack/pack_template.h'], sources = [srcpath('_packer', suffix=suffix if suffix == '.pyx' else '.cpp', - subdir='msgpack')], + subdir='io/msgpack')], language='c++', - include_dirs=['pandas/src/msgpack'] + common_include, + include_dirs=['pandas/_libs/src/msgpack'] + common_include, define_macros=macros, extra_compile_args=extra_compile_args) -unpacker_ext = Extension('pandas.msgpack._unpacker', - depends=['pandas/src/msgpack/unpack.h', - 'pandas/src/msgpack/unpack_define.h', - 'pandas/src/msgpack/unpack_template.h'], +unpacker_ext = Extension('pandas.io.msgpack._unpacker', + depends=['pandas/_libs/src/msgpack/unpack.h', + 'pandas/_libs/src/msgpack/unpack_define.h', + 'pandas/_libs/src/msgpack/unpack_template.h'], sources = [srcpath('_unpacker', suffix=suffix if suffix == '.pyx' else '.cpp', - subdir='msgpack')], + subdir='io/msgpack')], language='c++', - include_dirs=['pandas/src/msgpack'] + common_include, + include_dirs=['pandas/_libs/src/msgpack'] + common_include, define_macros=macros, extra_compile_args=extra_compile_args) extensions.append(packer_ext) @@ -586,20 +586,20 @@ def pxd(name): root, _ = os.path.splitext(ext.sources[0]) ext.sources[0] = root + suffix -ujson_ext = Extension('pandas.json', - depends=['pandas/src/ujson/lib/ultrajson.h', - 'pandas/src/datetime_helper.h', - 'pandas/src/numpy_helper.h'], - sources=['pandas/src/ujson/python/ujson.c', - 'pandas/src/ujson/python/objToJSON.c', - 'pandas/src/ujson/python/JSONtoObj.c', - 'pandas/src/ujson/lib/ultrajsonenc.c', - 'pandas/src/ujson/lib/ultrajsondec.c', - 'pandas/src/datetime/np_datetime.c', - 'pandas/src/datetime/np_datetime_strings.c'], - include_dirs=['pandas/src/ujson/python', - 'pandas/src/ujson/lib', - 'pandas/src/datetime'] + common_include, +ujson_ext = Extension('pandas.io.json.libjson', + depends=['pandas/_libs/src/ujson/lib/ultrajson.h', + 'pandas/_libs/src/datetime_helper.h', + 'pandas/_libs/src/numpy_helper.h'], + sources=['pandas/_libs/src/ujson/python/ujson.c', + 'pandas/_libs/src/ujson/python/objToJSON.c', + 'pandas/_libs/src/ujson/python/JSONtoObj.c', + 'pandas/_libs/src/ujson/lib/ultrajsonenc.c', + 'pandas/_libs/src/ujson/lib/ultrajsondec.c', + 'pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c'], + include_dirs=['pandas/_libs/src/ujson/python', + 'pandas/_libs/src/ujson/lib', + 'pandas/_libs/src/datetime'] + common_include, extra_compile_args=['-D_GNU_SOURCE'] + extra_compile_args) @@ -634,6 +634,8 @@ def pxd(name): 'pandas.io', 'pandas.io.json', 'pandas.io.sas', + 'pandas.io.msgpack', + 'pandas._libs', 'pandas.formats', 'pandas.sparse', 'pandas.stats', @@ -650,10 +652,10 @@ def pxd(name): 'pandas.tests.io.json', 'pandas.tests.io.parser', 'pandas.tests.io.sas', + 'pandas.tests.io.msgpack', 'pandas.tests.groupby', 'pandas.tests.series', 'pandas.tests.formats', - 'pandas.tests.msgpack', 'pandas.tests.scalar', 'pandas.tests.sparse', 'pandas.tests.tseries', @@ -663,7 +665,6 @@ def pxd(name): 'pandas.tools', 'pandas.tseries', 'pandas.types', - 'pandas.msgpack', 'pandas.util.clipboard' ], package_data={'pandas.tests': ['data/*.csv'], diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py index a1326d63a112a..bd2e8a1c1d504 100644 --- a/vb_suite/pandas_vb_common.py +++ b/vb_suite/pandas_vb_common.py @@ -16,7 +16,7 @@ try: import pandas._tseries as lib except: - import pandas.lib as lib + import pandas._libs.lib as lib try: Panel = 
WidePanel
 
From ef562286a5a6ec51ce5ae3ed6f4d26d6be7d16d6 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 7 Mar 2017 19:31:22 -0500
Subject: [PATCH 250/338] CLN: clean up Makefile & fix lib.pyx deps

---
 Makefile | 3 ---
 setup.py | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 90dcd16d955d6..194a8861715b7 100644
--- a/Makefile
+++ b/Makefile
@@ -9,9 +9,6 @@ clean:
 clean_pyc:
 	-find . -name '*.py[co]' -exec rm {} \;
 
-sparse: pandas/src/sparse.pyx
-	python setup.py build_ext --inplace
-
 build: clean_pyc
 	python setup.py build_ext --inplace
 
diff --git a/setup.py b/setup.py
index e257b2376060b..3e0a6b41152dc 100755
--- a/setup.py
+++ b/setup.py
@@ -440,7 +440,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'):
     return pjoin('pandas', subdir, name + suffix)
 
 if suffix == '.pyx':
-    lib_depends = [srcpath(f, suffix='.pyx') for f in lib_depends]
+    lib_depends = [srcpath(f, suffix='.pyx', subdir='_libs/src') for f in lib_depends]
     lib_depends.append('pandas/_libs/src/util.pxd')
 else:
     lib_depends = []
@@ -474,13 +474,13 @@ def pxd(name):
 
 ext_data = {
     '_libs.lib': {'pyxfile': '_libs/lib',
-                  'pxdfiles': [],
-                  'depends': lib_depends},
+                  'depends': lib_depends + tseries_depends},
     '_libs.hashtable': {'pyxfile': '_libs/hashtable',
                         'pxdfiles': ['_libs/hashtable'],
                         'depends': (['pandas/_libs/src/klib/khash_python.h']
                                     + _pxi_dep['hashtable'])},
     '_libs.tslib': {'pyxfile': '_libs/tslib',
+                    'pxdfiles': ['_libs/src/util', '_libs/lib'],
                     'depends': tseries_depends,
                     'sources': ['pandas/_libs/src/datetime/np_datetime.c',
                                 'pandas/_libs/src/datetime/np_datetime_strings.c',

From 0c59dd58e53b37e82bed6cca04264a99bda7d3cc Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 8 Mar 2017 08:12:10 -0500
Subject: [PATCH 251/338] DOC: fix appveyor badge to point to pandas-dev account

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8595043cf68c3..e05f1405419fc 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,8 @@

From cbbce9a3f0e87e4a882a6b7b46cddcfdfbb4dcee Mon Sep 17 00:00:00 2001
From: manu
Date: Wed, 8 Mar 2017 08:24:38 -0500
Subject: [PATCH 252/338] BUG: make Series.sort_values(ascending=[False]) behave as ascending=False (#15604)

closes #15604

Author: manu

Closes #15607 from MLopez-Ibanez/series-ascending and squashes the following commits:

6678574 [manu] BUG: make Series.sort_values(ascending=[False]) behave as ascending=False (#15604)

---
 doc/source/whatsnew/v0.20.0.txt     |  2 +-
 pandas/core/series.py               | 10 ++++++++++
 pandas/tests/series/test_sorting.py | 19 +++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8f2033de6c77f..a7169640759e3 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -228,7 +228,7 @@ Other enhancements
 - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
 - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information.
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
-
+- ``Series.sort_values`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values`` (:issue:`15604`)
 
 .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/core/series.py b/pandas/core/series.py index 83036ffef0bed..f23e90effdabf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -14,6 +14,7 @@ import numpy.ma as ma from pandas.types.common import (_coerce_to_dtype, is_categorical_dtype, + is_bool, is_integer, is_integer_dtype, is_float_dtype, is_extension_type, is_datetimetz, @@ -1719,6 +1720,15 @@ def _try_kind_sort(arr): argsorted = _try_kind_sort(arr[good]) + if is_list_like(ascending): + if len(ascending) != 1: + raise ValueError('Length of ascending (%d) must be 1 ' + 'for Series' % (len(ascending))) + ascending = ascending[0] + + if not is_bool(ascending): + raise ValueError('ascending must be boolean') + if not ascending: argsorted = argsorted[::-1] diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index db506f12a2293..590a530a847bd 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -64,6 +64,25 @@ def test_sort_values(self): ordered = ts.sort_values(ascending=False, na_position='first') assert_almost_equal(expected, ordered.valid().values) + # ascending=[False] should behave the same as ascending=False + ordered = ts.sort_values(ascending=[False]) + expected = ts.sort_values(ascending=False) + assert_series_equal(expected, ordered) + ordered = ts.sort_values(ascending=[False], na_position='first') + expected = ts.sort_values(ascending=False, na_position='first') + assert_series_equal(expected, ordered) + + self.assertRaises(ValueError, + lambda: ts.sort_values(ascending=None)) + self.assertRaises(ValueError, + lambda: ts.sort_values(ascending=[])) + self.assertRaises(ValueError, + lambda: ts.sort_values(ascending=[1, 2, 3])) + self.assertRaises(ValueError, + lambda: ts.sort_values(ascending=[False, False])) + self.assertRaises(ValueError, + lambda: ts.sort_values(ascending='foobar')) + # inplace=True ts = self.ts.copy() ts.sort_values(ascending=False, inplace=True) From 32530542684b7a29498809a2fa160106f9eaf45d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 8 Mar 2017 08:30:59 -0500 Subject: [PATCH 253/338] DOC: remove gbq references / clean some whatsnew --- doc/source/whatsnew/v0.20.0.txt | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a7169640759e3..92daf29efe71f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -12,7 +12,7 @@ Highlights include: - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) - The ``.ix`` indexer has been deprecated, see :ref:`here ` - Switched the test framework to `pytest`_ (:issue:`13097`) -- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref: `here ` +- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here ` .. _pytest: http://doc.pytest.org/en/latest/ @@ -27,11 +27,6 @@ Check the :ref:`API Changes ` and :ref:`deprecations New features ~~~~~~~~~~~~ -- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. -- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) -- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) - - .. 
_whatsnew_0200.enhancements.dataio_dtype: @@ -193,6 +188,11 @@ You must enable this by setting the ``display.html.table_schema`` option to True Other enhancements ^^^^^^^^^^^^^^^^^^ +- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. +- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) + + - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). @@ -201,7 +201,6 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) - ``.isnull()`` and ``.notnull()`` have been added to ``Index`` object to make them more consistent with the ``Series`` API (:issue:`15300`) -- ``pd.read_gbq`` method now allows query configuration preferences (:issue:`14742`) - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack @@ -228,7 +227,7 @@ Other enhancements - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) -- ``Series.sort_values`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values`` (:issue:`15604`) + .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -444,7 +443,7 @@ Pandas Google BigQuery support has moved ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``pip install pandas-gbq`` to get it. -The functionality of ``pd.read_gbq()`` and ``.to_gbq()`` remains the same with the currently released version of ``pandas-gbq=0.1.2``. (:issue:`15347`) +The functionality of ``pd.read_gbq()`` and ``.to_gbq()`` remains the same with the currently released version of ``pandas-gbq=0.1.3``. (:issue:`15347`) Documentation is now hosted `here `__ .. _whatsnew_0200.api_breaking.memory_usage: @@ -611,9 +610,9 @@ Other API Changes - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) -- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. 
Furthermore ``FLOAT`` columns with values above 10**4 are no longer casted to ``int64`` which also caused precision loss (:issue:`14064`, :issue:`14305`). - Reorganization of timeseries development tests (:issue:`14854`) - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`) +- ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) .. _whatsnew_0200.deprecations: @@ -651,7 +650,7 @@ Removal of prior version deprecations/changes - ``pandas.stats.fama_macbeth``, ``pandas.stats.ols``, ``pandas.stats.plm`` and ``pandas.stats.var``, as well as the top-level ``pandas.fama_macbeth`` and ``pandas.ols`` routines are removed. Similar functionaility can be found in the `statsmodels `__ package. (:issue:`11898`) - The ``TimeSeries`` and ``SparseTimeSeries`` classes, aliases of ``Series`` and ``SparseSeries``, are removed (:issue:`10890`, :issue:`15098`). -- ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:``) +- ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:`15098`) - The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). @@ -681,7 +680,7 @@ Bug Fixes - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`) -- Bug in ``DatetimeIndex.round()`` and ``Timestamp.round()`` floating point accuracy when rounding by milliseconds or less (:issue: `14440`, :issue:`15578`) +- Bug in ``DatetimeIndex.round()`` and ``Timestamp.round()`` floating point accuracy when rounding by milliseconds or less (:issue:`14440`, :issue:`15578`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. 
(:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) From 65bdde58aa137f0e4bb8098d86c1cf71c3449ff2 Mon Sep 17 00:00:00 2001 From: Luca Scarabello Date: Wed, 8 Mar 2017 08:38:43 -0500 Subject: [PATCH 254/338] BUG: pd.cut with bins=1 and input all 0s The special case of running pd.cut() qith bins=1 an input containing all 0s raises a ValueError closes #15428 closes #15431 Author: Luca Scarabello Author: Luca Closes #15437 from luca-s/issue_15428 and squashes the following commits: 1248987 [Luca] rebased on master def84ba [Luca] Yet another implementation attempt 692503a [Luca Scarabello] Improved solution: using same approach as pd.cut b7d92dc [Luca] Added 'allow' duplicates option to _bins_to_cuts f56a27f [Luca Scarabello] Issue #15431 55806cf [Luca Scarabello] BUG: pd.cut with bins=1 and input all 0s --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/tests/tools/test_tile.py | 81 +++++++++++++++++++++++++++++++-- pandas/tools/tile.py | 6 +-- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 92daf29efe71f..bf778f6065010 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -698,8 +698,8 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`) - Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - - +- Bug in ``pd.cut()`` with a single bin on an all 0s array (:issue:`15428`) +- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index de44eadc15751..11b242bc06e15 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -3,7 +3,7 @@ import numpy as np from pandas.compat import zip -from pandas import Series, Index +from pandas import Series, Index, Categorical import pandas.util.testing as tm from pandas.util.testing import assertRaisesRegexp import pandas.core.common as com @@ -239,7 +239,6 @@ def test_qcut_binning_issues(self): self.assertTrue(ep <= sn) def test_cut_return_categorical(self): - from pandas import Categorical s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) res = cut(s, 3) exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -249,7 +248,6 @@ def test_cut_return_categorical(self): tm.assert_series_equal(res, exp) def test_qcut_return_categorical(self): - from pandas import Categorical s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) res = qcut(s, [0, 0.333, 0.666, 1]) exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -285,6 +283,60 @@ def test_qcut_duplicates_bin(self): # invalid self.assertRaises(ValueError, qcut, values, 3, duplicates='foo') + def test_single_quantile(self): + # issue 15431 + expected = Series([0, 0]) + + s = Series([9., 9.]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0, 0], ["[9, 9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) + + s = Series([-9., -9.]) + result = qcut(s, 1, labels=False) + 
tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0, 0], ["[-9, -9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) + + s = Series([0., 0.]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0, 0], ["[0, 0]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) + + expected = Series([0]) + + s = Series([9]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0], ["[9, 9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) + + s = Series([-9]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0], ["[-9, -9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) + + s = Series([0]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0], ["[0, 0]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) @@ -297,6 +349,29 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + expected = Series([0]) + + s = Series([9]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + s = Series([-9]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + # issue 15428 + expected = Series([0, 0]) + + s = Series([0., 0.]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + expected = Series([0]) + + s = Series([0]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 9b21e542f153c..ccd8c2478e8a5 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -104,8 +104,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, mn, mx = [mi + 0.0 for mi in rng] if mn == mx: # adjust end points before binning - mn -= .001 * abs(mn) - mx += .001 * abs(mx) + mn -= .001 * abs(mn) if mn != 0 else .001 + mx += .001 * abs(mx) if mx != 0 else .001 bins = np.linspace(mn, mx, bins + 1, endpoint=True) else: # adjust end points after binning bins = np.linspace(mn, mx, bins + 1, endpoint=True) @@ -206,7 +206,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, "valid options are: raise, drop") unique_bins = algos.unique(bins) - if len(unique_bins) < len(bins): + if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == 'raise': raise ValueError("Bin edges must be unique: {}.\nYou " "can drop duplicate edges by setting " From 53dc52caf1d1f1fed406a1cf474cc3d086ad5f73 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 8 Mar 2017 09:52:29 -0500 Subject: [PATCH 255/338] BLD: fix linting wrt to #15537, changes in location of pandas/src (#15614) --- ci/lint.sh | 8 ++++---- pandas/_libs/src/datetime/np_datetime.h | 6 +++--- pandas/_libs/src/datetime/np_datetime_strings.h | 6 +++--- pandas/_libs/src/datetime_helper.h | 6 +++--- pandas/_libs/src/helper.h | 6 +++--- pandas/_libs/src/numpy_helper.h | 6 +++--- pandas/_libs/src/parse_helper.h | 6 +++--- pandas/_libs/src/parser/io.h | 6 +++--- pandas/_libs/src/parser/tokenizer.h | 6 +++--- 
pandas/_libs/src/period_helper.h | 6 +++--- pandas/_libs/src/skiplist.h | 6 +++--- pandas/_libs/src/ujson/lib/ultrajson.h | 6 +++--- pandas/_libs/src/ujson/python/py_defines.h | 8 ++++---- pandas/_libs/src/ujson/python/version.h | 8 ++++---- test.bat | 2 +- test_fast.sh | 2 +- 16 files changed, 47 insertions(+), 47 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 2ffc68e5eb139..ed3af2568811c 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -8,9 +8,9 @@ RET=0 if [ "$LINT" ]; then - # pandas/src is C code, so no need to search there. + # pandas/_libs/src is C code, so no need to search there. echo "Linting *.py" - flake8 pandas --filename=*.py --exclude pandas/src + flake8 pandas --filename=*.py --exclude pandas/_libs/src if [ $? -ne "0" ]; then RET=1 fi @@ -46,8 +46,8 @@ if [ "$LINT" ]; then echo "Linting *.c and *.h" for path in '*.h' 'period_helper.c' 'datetime' 'parser' 'ujson' do - echo "linting -> pandas/src/$path" - cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/$path + echo "linting -> pandas/_libs/src/$path" + cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/_libs/src/$path if [ $? -ne "0" ]; then RET=1 fi diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index 3445fc3e48376..97ec5782b625b 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -14,8 +14,8 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt */ -#ifndef PANDAS_SRC_DATETIME_NP_DATETIME_H_ -#define PANDAS_SRC_DATETIME_NP_DATETIME_H_ +#ifndef PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_H_ +#define PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_H_ #include @@ -124,4 +124,4 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); -#endif // PANDAS_SRC_DATETIME_NP_DATETIME_H_ +#endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_H_ diff --git a/pandas/_libs/src/datetime/np_datetime_strings.h b/pandas/_libs/src/datetime/np_datetime_strings.h index 1114ec5eae064..833c1869c1664 100644 --- a/pandas/_libs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/src/datetime/np_datetime_strings.h @@ -19,8 +19,8 @@ This file implements string parsing and creation for NumPy datetime. */ -#ifndef PANDAS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ -#define PANDAS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ +#ifndef PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ +#define PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -103,4 +103,4 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, int local, PANDAS_DATETIMEUNIT base, int tzoffset, NPY_CASTING casting); -#endif // PANDAS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ +#endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ diff --git a/pandas/_libs/src/datetime_helper.h b/pandas/_libs/src/datetime_helper.h index bef4b4266c824..8023285f85b9b 100644 --- a/pandas/_libs/src/datetime_helper.h +++ b/pandas/_libs/src/datetime_helper.h @@ -7,8 +7,8 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. 
*/ -#ifndef PANDAS_SRC_DATETIME_HELPER_H_ -#define PANDAS_SRC_DATETIME_HELPER_H_ +#ifndef PANDAS__LIBS_SRC_DATETIME_HELPER_H_ +#define PANDAS__LIBS_SRC_DATETIME_HELPER_H_ #include #include "datetime.h" @@ -33,4 +33,4 @@ npy_float64 total_seconds(PyObject *td) { return (microseconds + (seconds + days_in_seconds) * 1000000.0) / 1000000.0; } -#endif // PANDAS_SRC_DATETIME_HELPER_H_ +#endif // PANDAS__LIBS_SRC_DATETIME_HELPER_H_ diff --git a/pandas/_libs/src/helper.h b/pandas/_libs/src/helper.h index 39bcf27e074df..26b4d033b963b 100644 --- a/pandas/_libs/src/helper.h +++ b/pandas/_libs/src/helper.h @@ -7,8 +7,8 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. */ -#ifndef PANDAS_SRC_HELPER_H_ -#define PANDAS_SRC_HELPER_H_ +#ifndef PANDAS__LIBS_SRC_HELPER_H_ +#define PANDAS__LIBS_SRC_HELPER_H_ #ifndef PANDAS_INLINE #if defined(__GNUC__) @@ -22,4 +22,4 @@ The full license is in the LICENSE file, distributed with this software. #endif #endif -#endif // PANDAS_SRC_HELPER_H_ +#endif // PANDAS__LIBS_SRC_HELPER_H_ diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index 809edb2e99fa2..5f4db5b2f55d3 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -7,8 +7,8 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. */ -#ifndef PANDAS_SRC_NUMPY_HELPER_H_ -#define PANDAS_SRC_NUMPY_HELPER_H_ +#ifndef PANDAS__LIBS_SRC_NUMPY_HELPER_H_ +#define PANDAS__LIBS_SRC_NUMPY_HELPER_H_ #include "Python.h" #include "helper.h" @@ -159,4 +159,4 @@ PANDAS_INLINE PyObject* unbox_if_zerodim(PyObject* arr) { } } -#endif // PANDAS_SRC_NUMPY_HELPER_H_ +#endif // PANDAS__LIBS_SRC_NUMPY_HELPER_H_ diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 5d2a0dad3da17..6dd8b66eab33d 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -7,8 +7,8 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. */ -#ifndef PANDAS_SRC_PARSE_HELPER_H_ -#define PANDAS_SRC_PARSE_HELPER_H_ +#ifndef PANDAS__LIBS_SRC_PARSE_HELPER_H_ +#define PANDAS__LIBS_SRC_PARSE_HELPER_H_ #include #include @@ -270,4 +270,4 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } -#endif // PANDAS_SRC_PARSE_HELPER_H_ +#endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_ diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index 5a0c2b2b5e4a4..77121e9a169c1 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -7,8 +7,8 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. 
*/ -#ifndef PANDAS_SRC_PARSER_IO_H_ -#define PANDAS_SRC_PARSER_IO_H_ +#ifndef PANDAS__LIBS_SRC_PARSER_IO_H_ +#define PANDAS__LIBS_SRC_PARSER_IO_H_ #include "Python.h" #include "tokenizer.h" @@ -83,4 +83,4 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status); -#endif // PANDAS_SRC_PARSER_IO_H_ +#endif // PANDAS__LIBS_SRC_PARSER_IO_H_ diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 6c1bc630ab547..9853b5149bee3 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -9,8 +9,8 @@ See LICENSE for the license */ -#ifndef PANDAS_SRC_PARSER_TOKENIZER_H_ -#define PANDAS_SRC_PARSER_TOKENIZER_H_ +#ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ +#define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ #include #include @@ -276,4 +276,4 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); int to_boolean(const char *item, uint8_t *val); -#endif // PANDAS_SRC_PARSER_TOKENIZER_H_ +#endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 601717692ff6d..45afc074cab72 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -11,8 +11,8 @@ Cython to pandas. This primarily concerns interval representation and frequency conversion routines. */ -#ifndef PANDAS_SRC_PERIOD_HELPER_H_ -#define PANDAS_SRC_PERIOD_HELPER_H_ +#ifndef PANDAS__LIBS_SRC_PERIOD_HELPER_H_ +#define PANDAS__LIBS_SRC_PERIOD_HELPER_H_ #include #include "headers/stdint.h" @@ -188,4 +188,4 @@ int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); void initialize_daytime_conversion_factor_matrix(void); -#endif // PANDAS_SRC_PERIOD_HELPER_H_ +#endif // PANDAS__LIBS_SRC_PERIOD_HELPER_H_ diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 013516a49fa2f..f9527e72f577e 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -13,8 +13,8 @@ Port of Wes McKinney's Cython version of Raymond Hettinger's original pure Python recipe (http://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) */ -#ifndef PANDAS_SRC_SKIPLIST_H_ -#define PANDAS_SRC_SKIPLIST_H_ +#ifndef PANDAS__LIBS_SRC_SKIPLIST_H_ +#define PANDAS__LIBS_SRC_SKIPLIST_H_ #include #include @@ -287,4 +287,4 @@ PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { return 1; } -#endif // PANDAS_SRC_SKIPLIST_H_ +#endif // PANDAS__LIBS_SRC_SKIPLIST_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 3bfb4b26c0095..d0588348baa44 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -49,8 +49,8 @@ tree doesn't have cyclic references. 
*/ -#ifndef PANDAS_SRC_UJSON_LIB_ULTRAJSON_H_ -#define PANDAS_SRC_UJSON_LIB_ULTRAJSON_H_ +#ifndef PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ +#define PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ #include #include @@ -307,4 +307,4 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#endif // PANDAS_SRC_UJSON_LIB_ULTRAJSON_H_ +#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/python/py_defines.h b/pandas/_libs/src/ujson/python/py_defines.h index b32285766c86a..82385fdd48a3b 100644 --- a/pandas/_libs/src/ujson/python/py_defines.h +++ b/pandas/_libs/src/ujson/python/py_defines.h @@ -16,7 +16,7 @@ modification, are permitted provided that the following conditions are met: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -35,8 +35,8 @@ Numeric decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ -#ifndef PANDAS_SRC_UJSON_PYTHON_PY_DEFINES_H_ -#define PANDAS_SRC_UJSON_PYTHON_PY_DEFINES_H_ +#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_PY_DEFINES_H_ +#define PANDAS__LIBS_SRC_UJSON_PYTHON_PY_DEFINES_H_ #include @@ -55,4 +55,4 @@ Numeric decoder derived from from TCL library #endif -#endif // PANDAS_SRC_UJSON_PYTHON_PY_DEFINES_H_ +#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_PY_DEFINES_H_ diff --git a/pandas/_libs/src/ujson/python/version.h b/pandas/_libs/src/ujson/python/version.h index c074ef572101d..ef6d28bf3a1f7 100644 --- a/pandas/_libs/src/ujson/python/version.h +++ b/pandas/_libs/src/ujson/python/version.h @@ -16,7 +16,7 @@ modification, are permitted provided that the following conditions are met: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -35,9 +35,9 @@ Numeric decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ -#ifndef PANDAS_SRC_UJSON_PYTHON_VERSION_H_ -#define PANDAS_SRC_UJSON_PYTHON_VERSION_H_ +#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_ +#define PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_ #define UJSON_VERSION "1.33" -#endif // PANDAS_SRC_UJSON_PYTHON_VERSION_H_ +#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_ diff --git a/test.bat b/test.bat index 2c5f25c24a637..080a1cc163a05 100644 --- a/test.bat +++ b/test.bat @@ -1,3 +1,3 @@ :: test on windows -pytest --skip-slow --skip-network pandas +pytest --skip-slow --skip-network pandas %* diff --git a/test_fast.sh b/test_fast.sh index 30ac7f84cbe8b..9b984156a796c 100755 --- a/test_fast.sh +++ b/test_fast.sh @@ -5,4 +5,4 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -pytest pandas --skip-slow --skip-network -m "not single" -n 4 +pytest pandas --skip-slow --skip-network -m "not single" -n 4 "$@" From cc82b542ce76b4bf2aed100724405e75433826cc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 8 Mar 2017 10:07:24 -0500 Subject: [PATCH 256/338] DOC: more whatsnew fixing --- doc/source/whatsnew/v0.20.0.txt | 84 ++++++++++++++++----------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index bf778f6065010..34358a193b360 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -289,7 +289,7 @@ Possible incompat for HDF5 formats for pandas < 0.13.0 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``pd.TimeSeries`` was deprecated officially in 0.17.0, though has only been an alias since 0.13.0. It has -been dropped in favor of ``pd.Series``. (:issue:``15098). +been dropped in favor of ``pd.Series``. (:issue:`15098`). This *may* cause HDF5 files that were created in prior versions to become unreadable if ``pd.TimeSeries`` was used. This is most likely to be for pandas < 0.13.0. If you find yourself in this situation. @@ -328,68 +328,66 @@ then write them out again after applying the procedure below. Map on Index types now return other Index types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- ``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`) +``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`) - .. ipython:: python - - idx = Index([1, 2]) - idx - mi = MultiIndex.from_tuples([(1, 2), (2, 4)]) - mi - - Previous Behavior: +.. ipython:: python - .. code-block:: ipython + idx = Index([1, 2]) + idx + mi = MultiIndex.from_tuples([(1, 2), (2, 4)]) + mi - In [5]: idx.map(lambda x: x * 2) - Out[5]: array([2, 4]) +Previous Behavior: - In [6]: idx.map(lambda x: (x, x * 2)) - Out[6]: array([(1, 2), (2, 4)], dtype=object) +.. code-block:: ipython - In [7]: mi.map(lambda x: x) - Out[7]: array([(1, 2), (2, 4)], dtype=object) + In [5]: idx.map(lambda x: x * 2) + Out[5]: array([2, 4]) - In [8]: mi.map(lambda x: x[0]) - Out[8]: array([1, 2]) + In [6]: idx.map(lambda x: (x, x * 2)) + Out[6]: array([(1, 2), (2, 4)], dtype=object) - New Behavior: + In [7]: mi.map(lambda x: x) + Out[7]: array([(1, 2), (2, 4)], dtype=object) - .. ipython:: python + In [8]: mi.map(lambda x: x[0]) + Out[8]: array([1, 2]) - idx.map(lambda x: x * 2) +New Behavior: - idx.map(lambda x: (x, x * 2)) +.. 
ipython:: python

    idx.map(lambda x: x * 2)

    idx.map(lambda x: (x, x * 2))

    mi.map(lambda x: x)

    mi.map(lambda x: x[0])


``map`` on a ``Series`` with ``datetime64`` values may return ``int64`` dtypes rather than ``int32``

.. ipython:: python

    s = Series(date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H').tz_localize('Asia/Tokyo'))
    s

Previous Behavior:

.. code-block:: ipython

    In [9]: s.map(lambda x: x.hour)
    Out[9]:
    0    0
    1    1
    2    2
    dtype: int32


New Behavior:

.. ipython:: python

    s.map(lambda x: x.hour)


.. _whatsnew_0200.api_breaking.s3:

@@ -443,8 +441,8 @@ Pandas Google BigQuery support has moved
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``pip install pandas-gbq`` to get it.
-The functionality of ``pd.read_gbq()`` and ``.to_gbq()`` remains the same with the currently released version of ``pandas-gbq=0.1.3``. (:issue:`15347`)
-Documentation is now hosted `here `__
+The functionality of :func:`read_gbq` and :meth:`DataFrame.to_gbq` remains the same with the currently released version of ``pandas-gbq=0.1.3``.
+Documentation is now hosted `here `__ (:issue:`15347`)

 .. _whatsnew_0200.api_breaking.memory_usage:

@@ -667,7 +665,7 @@ Performance Improvements
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`)
 - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
-- Improved performance of `rank()` for categorical data (:issue:`15498`)
+- Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
 - Improved performance when using ``.unstack()`` (:issue:`15503`)


From c958948eff44e9469f4ea404d1d774cc71588781 Mon Sep 17 00:00:00 2001
From: DaanVanHauwermeiren
Date: Wed, 8 Mar 2017 19:46:50 +0100
Subject: [PATCH 257/338] DOC: fix link to offset strings in resample method
 (#15619)

---
 pandas/core/generic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ff58a2aa77447..c45cf57152599 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4360,6 +4360,8 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,

         .. versionadded:: 0.19.0

+        Notes
+        -----
         To learn more about the offset strings, please see `this link
         `__.
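The "offset strings" the amended docstring points to are pandas frequency aliases such as 'T' (minute) and 'H' (hour), so '3T' means 3-minute bins. A minimal sketch of the behaviour being documented; the sample data below is illustrative only and not part of the patch:

    >>> import pandas as pd
    >>> s = pd.Series(range(9), index=pd.date_range('1/1/2000', periods=9, freq='T'))
    >>> s.resample('3T').sum()
    2000-01-01 00:00:00     3
    2000-01-01 00:03:00    12
    2000-01-01 00:06:00    21
    Freq: 3T, dtype: int64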
From e8369a1bcd0703fb280cbbe8536828f3e0d3dc1d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 9 Mar 2017 08:51:09 +0100
Subject: [PATCH 258/338] DOC: make it possible to run doctests (#15626)

---
 pandas/conftest.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 623feb99e9cdc..e0a15f740688b 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1,5 +1,8 @@
 import pytest

+import numpy
+import pandas
+

 def pytest_addoption(parser):
     parser.addoption("--skip-slow", action="store_true",
@@ -19,3 +22,11 @@ def pytest_runtest_setup(item):

     if 'network' in item.keywords and item.config.getoption("--skip-network"):
         pytest.skip("skipping due to --skip-network")
+
+
+# For running doctests: make np and pd names available

+@pytest.fixture(autouse=True)
+def add_imports(doctest_namespace):
+    doctest_namespace['np'] = numpy
+    doctest_namespace['pd'] = pandas

From 3ef6f46e0b5681f1d0053d1fc1d844f3939fa382 Mon Sep 17 00:00:00 2001
From: DaanVanHauwermeiren
Date: Thu, 9 Mar 2017 08:57:27 +0100
Subject: [PATCH 259/338] DOC: add example for DataFrame.resample: keywords on
 and level (#15627)

---
 pandas/core/generic.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index c45cf57152599..84a48c9be8fd9 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4462,6 +4462,30 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
         2000-01-01 00:06:00    26
         Freq: 3T, dtype: int64

+        For DataFrame objects, the keyword ``on`` can be used to specify the
+        column instead of the index for resampling.
+
+        >>> df = pd.DataFrame(data=9*[range(4)], columns=['a', 'b', 'c', 'd'])
+        >>> df['time'] = pd.date_range('1/1/2000', periods=9, freq='T')
+        >>> df.resample('3T', on='time').sum()
+                             a  b  c  d
+        time
+        2000-01-01 00:00:00  0  3  6  9
+        2000-01-01 00:03:00  0  3  6  9
+        2000-01-01 00:06:00  0  3  6  9
+
+        For a DataFrame with MultiIndex, the keyword ``level`` can be used to
+        specify on which level the resampling needs to take place.
+
+        >>> time = pd.date_range('1/1/2000', periods=5, freq='T')
+        >>> df2 = pd.DataFrame(data=10*[range(4)],
+        ...                    columns=['a', 'b', 'c', 'd'],
+        ...                    index=pd.MultiIndex.from_product([time, [1, 2]])
+        ...                    )
+        >>> df2.resample('3T', level=0).sum()
+                             a  b  c   d
+        2000-01-01 00:00:00  0  6  12  18
+        2000-01-01 00:03:00  0  4  8   12
         """
         from pandas.tseries.resample import (resample,
                                              _maybe_process_deprecations)

From d8df4d20ec32737c1a51aa84ceb0713466878672 Mon Sep 17 00:00:00 2001
From: Michiel Stock
Date: Thu, 9 Mar 2017 09:21:30 +0100
Subject: [PATCH 260/338] DOC: resolved mistakes in examples series (#15625)

---
 pandas/core/generic.py | 11 ++++---
 pandas/core/series.py  | 71 +++++++++++++++++++++++++++++-------------
 2 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 84a48c9be8fd9..606906bfcd7c4 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -668,6 +668,7 @@ def swaplevel(self, i=-2, j=-1, axis=0):
         dtype: int64
         >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
         >>> df.rename(2)
+        Traceback (most recent call last):
         ...
         TypeError: 'int' object is not callable
         >>> df.rename(index=str, columns={"A": "a", "B": "c"})
@@ -1115,7 +1116,7 @@ def __setstate__(self, state):
         to the existing workbook.
This can be used to save different DataFrames to one workbook: - >>> writer = ExcelWriter('output.xlsx') + >>> writer = pd.ExcelWriter('output.xlsx') >>> df1.to_excel(writer,'Sheet1') >>> df2.to_excel(writer,'Sheet2') >>> writer.save() @@ -2260,7 +2261,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, ... index=index) >>> df - http_status response_time + http_status response_time Firefox 200 0.04 Chrome 200 0.02 Safari 404 0.07 @@ -2275,11 +2276,11 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, ... 'Chrome'] >>> df.reindex(new_index) http_status response_time - Safari 404 0.07 + Safari 404.0 0.07 Iceweasel NaN NaN Comodo Dragon NaN NaN - IE10 404 0.08 - Chrome 200 0.02 + IE10 404.0 0.08 + Chrome 200.0 0.02 We can fill in the missing values by passing a value to the keyword ``fill_value``. Because the index is not monotonically diff --git a/pandas/core/series.py b/pandas/core/series.py index f23e90effdabf..cfa25ca1299eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -369,10 +369,10 @@ def values(self): Timezone aware datetime data is converted to UTC: >>> pd.Series(pd.date_range('20130101', periods=3, - tz='US/Eastern')).values - array(['2013-01-01T00:00:00.000000000-0500', - '2013-01-02T00:00:00.000000000-0500', - '2013-01-03T00:00:00.000000000-0500'], dtype='datetime64[ns]') + ... tz='US/Eastern')).values + array(['2013-01-01T05:00:00.000000000', + '2013-01-02T05:00:00.000000000', + '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') """ return self._data.external_values() @@ -1550,6 +1550,8 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): With `verify_integrity` set to True: >>> s1.append(s2, verify_integrity=True) + Traceback (most recent call last): + ... ValueError: Indexes have overlapping values: [0, 1, 2] @@ -1919,8 +1921,19 @@ def nlargest(self, n=5, keep='first'): -------- >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(np.random.randn(1e6)) + >>> s = pd.Series(np.random.randn(10**6)) >>> s.nlargest(10) # only sorts up to the N requested + 219921 4.644710 + 82124 4.608745 + 421689 4.564644 + 425277 4.447014 + 718691 4.414137 + 43154 4.403520 + 283187 4.313922 + 595519 4.273635 + 503969 4.250236 + 121637 4.240952 + dtype: float64 """ return algorithms.select_n_series(self, n=n, keep=keep, method='nlargest') @@ -1958,8 +1971,19 @@ def nsmallest(self, n=5, keep='first'): -------- >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(np.random.randn(1e6)) + >>> s = pd.Series(np.random.randn(10**6)) >>> s.nsmallest(10) # only sorts up to the N requested + 288532 -4.954580 + 732345 -4.835960 + 64803 -4.812550 + 446457 -4.609998 + 501225 -4.483945 + 669476 -4.472935 + 973615 -4.401699 + 621279 -4.355126 + 773916 -4.347355 + 359919 -4.331927 + dtype: float64 """ return algorithms.select_n_series(self, n=n, keep=keep, method='nsmallest') @@ -2052,21 +2076,24 @@ def unstack(self, level=-1, fill_value=None): Examples -------- + >>> s = pd.Series([1, 2, 3, 4], + ... index=pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']])) >>> s - one a 1. - one b 2. - two a 3. - two b 4. + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 >>> s.unstack(level=-1) - a b - one 1. 2. - two 3. 4. + a b + one 1 2 + two 3 4 >>> s.unstack(level=0) one two - a 1. 2. - b 3. 4. 
+    a    1    3
+    b    2    4

    Returns
    -------
@@ -2102,15 +2129,16 @@ def map(self, arg, na_action=None):

        >>> x = pd.Series([1,2,3], index=['one', 'two', 'three'])
        >>> x
-        one 1
-        two 2
-        three 3
+        one      1
+        two      2
+        three    3
+        dtype: int64

        >>> y = pd.Series(['foo', 'bar', 'baz'], index=[1,2,3])
        >>> y
-        1 foo
-        2 bar
-        3 baz
+        1    foo
+        2    bar
+        3    baz

        >>> x.map(y)
        one   foo
@@ -2215,6 +2243,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
        >>> import numpy as np
        >>> series = pd.Series([20, 21, 12], index=['London',
        ... 'New York','Helsinki'])
+        >>> series
        London      20
        New York    21
        Helsinki    12

From 250dc8de69e2ed242e8775b1cc73e78aea92f373 Mon Sep 17 00:00:00 2001
From: mcocdawc
Date: Thu, 9 Mar 2017 11:58:48 +0100
Subject: [PATCH 261/338] ENH: to_string/to_latex now accept list-like header
 arg for overwriting column names (#15548)

closes #15536
---
 doc/source/whatsnew/v0.20.0.txt       |  1 +
 pandas/core/frame.py                  |  5 +++
 pandas/formats/format.py              | 38 ++++++++++++----------
 pandas/tests/formats/test_format.py   | 11 +++++++
 pandas/tests/formats/test_to_latex.py | 45 +++++++++++++++++++++++++++
 5 files changed, 84 insertions(+), 16 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 34358a193b360..ad7571662b8f4 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -227,6 +227,7 @@ Other enhancements
 - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
 - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information.
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
+- ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4e7a5ebdf6f67..2062f301b9e0e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1516,6 +1516,8 @@ def to_feather(self, fname):
         from pandas.io.feather_format import to_feather
         to_feather(self, fname)

+    @Substitution(header='Write out column names. If a list of strings is given, \
+it is assumed to be aliases for the column names')
     @Appender(fmt.docstring_to_string, indents=1)
     def to_string(self, buf=None, columns=None, col_space=None, header=True,
                   index=True, na_rep='NaN', formatters=None, float_format=None,
@@ -1543,6 +1545,7 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True,
             result = formatter.buf.getvalue()
             return result

+    @Substitution(header='whether to print column labels, default True')
     @Appender(fmt.docstring_to_string, indents=1)
     def to_html(self, buf=None, columns=None, col_space=None, header=True,
                 index=True, na_rep='NaN', formatters=None, float_format=None,
@@ -1596,6 +1599,8 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True,
         if buf is None:
             return formatter.buf.getvalue()

+    @Substitution(header='Write out column names. 
If a list of strings is given, \
+it is assumed to be aliases for the column names.')
     @Appender(fmt.common_docstring + fmt.return_docstring, indents=1)
     def to_latex(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
index d354911a825bc..2665f5aea145d 100644
--- a/pandas/formats/format.py
+++ b/pandas/formats/format.py
@@ -20,9 +20,9 @@
     is_float,
     is_numeric_dtype,
     is_datetime64_dtype,
-    is_timedelta64_dtype)
+    is_timedelta64_dtype,
+    is_list_like)
 from pandas.types.generic import ABCSparseArray
-
 from pandas.core.base import PandasObject
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas import compat
@@ -54,7 +54,7 @@
     col_space : int, optional
         the minimum width of each column
     header : bool, optional
-        whether to print column labels, default True
+        %(header)s
     index : bool, optional
         whether to print index (row) labels, default True
    na_rep : string, optional
@@ -488,32 +488,38 @@ def _to_str_columns(self):
         # may include levels names also

         str_index = self._get_formatted_index(frame)
-        str_columns = self._get_formatted_column_labels(frame)

-        if self.header:
+        if not is_list_like(self.header) and not self.header:
             stringified = []
             for i, c in enumerate(frame):
-                cheader = str_columns[i]
-                max_colwidth = max(self.col_space or 0, *(self.adj.len(x)
-                                                          for x in cheader))
                 fmt_values = self._format_col(i)
                 fmt_values = _make_fixed_width(fmt_values, self.justify,
-                                               minimum=max_colwidth,
+                                               minimum=(self.col_space or 0),
                                                adj=self.adj)
-
-                max_len = max(np.max([self.adj.len(x) for x in fmt_values]),
-                              max_colwidth)
-                cheader = self.adj.justify(cheader, max_len, mode=self.justify)
-                stringified.append(cheader + fmt_values)
+                stringified.append(fmt_values)
         else:
+            if is_list_like(self.header):
+                if len(self.header) != len(self.columns):
+                    raise ValueError(('Writing %d cols but got %d aliases'
+                                      % (len(self.columns), len(self.header))))
+                str_columns = [[label] for label in self.header]
+            else:
+                str_columns = self._get_formatted_column_labels(frame)
+
             stringified = []
             for i, c in enumerate(frame):
+                cheader = str_columns[i]
+                header_colwidth = max(self.col_space or 0,
+                                      *(self.adj.len(x) for x in cheader))
                 fmt_values = self._format_col(i)
                 fmt_values = _make_fixed_width(fmt_values, self.justify,
-                                               minimum=(self.col_space or 0),
+                                               minimum=header_colwidth,
                                                adj=self.adj)

-                stringified.append(fmt_values)
+                max_len = max(np.max([self.adj.len(x) for x in fmt_values]),
+                              header_colwidth)
+                cheader = self.adj.justify(cheader, max_len, mode=self.justify)
+                stringified.append(cheader + fmt_values)

         strcols = stringified
         if self.index:
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index ddf9d35841ce7..b1f163ccf9429 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -1125,6 +1125,17 @@ def test_to_string_no_header(self):

         self.assertEqual(df_s, expected)

+    def test_to_string_specified_header(self):
+        df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
+
+        df_s = df.to_string(header=['X', 'Y'])
+        expected = '   X  Y\n0  1  4\n1  2  5\n2  3  6'
+
+        self.assertEqual(df_s, expected)
+
+        with tm.assertRaises(ValueError):
+            df.to_string(header=['X'])
+
     def test_to_string_no_index(self):
         df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})

diff --git a/pandas/tests/formats/test_to_latex.py b/pandas/tests/formats/test_to_latex.py
index 17e1e18f03dd6..29ead83f3bcd9 100644
--- 
a/pandas/tests/formats/test_to_latex.py
+++ b/pandas/tests/formats/test_to_latex.py
@@ -428,6 +428,51 @@ def test_to_latex_no_header(self):

         assert withoutindex_result == withoutindex_expected

+    def test_to_latex_specified_header(self):
+        # GH 7124
+        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex(header=['AA', 'BB'])
+        withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} &  AA &  BB \\
+\midrule
+0 &   1 &  b1 \\
+1 &   2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withindex_result == withindex_expected
+
+        withoutindex_result = df.to_latex(header=['AA', 'BB'], index=False)
+        withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+AA & BB \\
+\midrule
+ 1 & b1 \\
+ 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withoutindex_result == withoutindex_expected
+
+        withoutescape_result = df.to_latex(header=['$A$', '$B$'], escape=False)
+        withoutescape_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & $A$ & $B$ \\
+\midrule
+0 &   1 &  b1 \\
+1 &   2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withoutescape_result == withoutescape_expected
+
+        with tm.assertRaises(ValueError):
+            df.to_latex(header=['A'])
+
     def test_to_latex_decimal(self, frame):
         # GH 12031
         frame.to_latex()

From 160f19d062ee2c5c29068940ecbfbb880a7c0fa8 Mon Sep 17 00:00:00 2001
From: Jim
Date: Thu, 9 Mar 2017 13:05:47 +0100
Subject: [PATCH 262/338] DOC: add documentation to IndexSlice (#15623)

---
 doc/source/api.rst      |  1 +
 pandas/core/indexing.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index fbce64df84859..f126e478f424d 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -1405,6 +1405,7 @@ MultiIndex
    :toctree: generated/

    MultiIndex
+   IndexSlice

 MultiIndex Components
 ~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 6f490875742ca..546cbd8337e7e 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -43,6 +43,36 @@ def get_indexers_list():
 # the public IndexSlicerMaker
 class _IndexSlice(object):
+    """
+    Create an object to more easily perform multi-index slicing
+
+    Examples
+    --------
+
+    >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']])
+    >>> columns = ['foo', 'bar']
+    >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))),
+    ...                     index=midx, columns=columns)
+
+    Using the default slice command:
+
+    >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :]
+               foo  bar
+    A0 B0    0    1
+       B1    2    3
+    A1 B0    8    9
+       B1   10   11
+
+    Using the IndexSlice class for a more intuitive command:
+
+    >>> idx = pd.IndexSlice
+    >>> dfmi.loc[idx[:, 'B0':'B1'], :]
+               foo  bar
+    A0 B0    0    1
+       B1    2    3
+    A1 B0    8    9
+       B1   10   11
+    """
     def __getitem__(self, arg):
         return arg

From ae8ea018d880a531ae5e63e17907abbe4c5b8e5d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 9 Mar 2017 09:08:17 -0500
Subject: [PATCH 263/338] DOC: remove to_gbq from api.rst as not directly
 callable

(DataFrame.to_gbq) is the entry point
---
 doc/source/api.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index f126e478f424d..7e297a15055a0 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -118,7 +118,6 @@ Google BigQuery
    :toctree: generated/

    read_gbq
-   to_gbq

 .. currentmodule:: pandas

@@ -1237,7 +1236,7 @@ Serialization / IO / Conversion
    Panel.to_frame
    Panel.to_xarray
    Panel.to_clipboard
-   
+
.. 
_api.index: Index From 973cfc154c71975c52aedd66a322ba1d1b6ebb56 Mon Sep 17 00:00:00 2001 From: chaimdemulder Date: Thu, 9 Mar 2017 15:26:56 +0100 Subject: [PATCH 264/338] DOC: use mathjax on sphinx - #15469 Exponentially Weighed Windows pages now shows formulas (#15618) --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 1e82dfca87d17..6840f76866d2c 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -46,7 +46,7 @@ 'ipython_sphinxext.ipython_console_highlighting', 'sphinx.ext.intersphinx', 'sphinx.ext.coverage', - 'sphinx.ext.pngmath', + 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig', 'sphinx.ext.linkcode', ] From 3bfe1ca698e653d948ee1fe3c83e5a75398511f0 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Thu, 9 Mar 2017 09:43:14 -0500 Subject: [PATCH 265/338] ENH: add compression support for 'read_pickle' and 'to_pickle' closes #11666 Author: goldenbull Author: Chen Jinniu Closes #13317 from goldenbull/pickle_io_compression and squashes the following commits: e9c5fd2 [goldenbull] docs update d50e430 [goldenbull] update docs. re-write all tests to avoid round-trip read/write comparison. 86afd25 [goldenbull] change test to new pytest parameterized style 945e7bb [goldenbull] Merge remote-tracking branch 'origin/master' into pickle_io_compression ccbeaa9 [goldenbull] move pickle compression tests into a new class 9a07250 [goldenbull] Remove prepared compressed data. _get_handle will take care of compressed I/O 1cb810b [goldenbull] add zip decompression support. refactor using lambda. b8c4175 [goldenbull] add compressed pickle data file to io/tests 6df6611 [goldenbull] pickle compression code update 81d55a0 [Chen Jinniu] Merge branch 'master' into pickle_io_compression 025a0cd [goldenbull] add compression support for pickle --- doc/source/io.rst | 39 +++++++ doc/source/whatsnew/v0.20.0.txt | 34 ++++++ pandas/core/generic.py | 8 +- pandas/io/common.py | 14 ++- pandas/io/pickle.py | 52 +++++++-- pandas/tests/io/test_pickle.py | 196 +++++++++++++++++++++++++++++++- 6 files changed, 324 insertions(+), 19 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index fa57d6d692152..67491c8b30de7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3046,6 +3046,45 @@ any pickled pandas object (or any other pickled object) from file: These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated. +.. _io.pickle.compression: + +Read/Write compressed pickle files +'''''''''''''' + +.. versionadded:: 0.20.0 + +:func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` can read +and write compressed pickle files. Compression types of ``gzip``, ``bz2``, ``xz`` supports +both read and write. ``zip`` file supports read only and must contain only one data file +to be read in. +Compression type can be an explicitely parameter or be inferred from the file extension. +If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or +``'.xz'``, respectively. + +.. 
ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': np.random.randn(1000), + 'C': np.random.randn(1000)}) + df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type + df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension + df.to_pickle("data.pkl.gz") # default, using "infer" + df["A"].to_pickle("s1.pkl.bz2") + + df = pd.read_pickle("data.pkl.compress", compression="gzip") + df = pd.read_pickle("data.pkl.xz", compression="infer") + df = pd.read_pickle("data.pkl.gz") + s = pd.read_pickle("s1.pkl.bz2") + +.. ipython:: python + :suppress: + import os + os.remove("data.pkl.compress") + os.remove("data.pkl.xz") + os.remove("data.pkl.gz") + os.remove("s1.pkl.bz2") + .. _io.msgpack: msgpack diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ad7571662b8f4..4b320d21fe738 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -94,6 +94,40 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). df = pd.read_table(url, compression='bz2') # explicitly specify compression df.head(2) +.. _whatsnew_0200.enhancements.pickle_compression: + +Pickle file I/O now supports compression +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` +can now read from and write to compressed pickle files. Compression methods +can be an explicit parameter or be inferred from the file extension. +See :ref:`Read/Write compressed pickle files ` + +.. ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': np.random.randn(1000), + 'C': np.random.randn(1000)}) + df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type + df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension + df.to_pickle("data.pkl.gz") # default, using "infer" + df["A"].to_pickle("s1.pkl.bz2") + + df = pd.read_pickle("data.pkl.compress", compression="gzip") + df = pd.read_pickle("data.pkl.xz", compression="infer") + df = pd.read_pickle("data.pkl.gz") + s = pd.read_pickle("s1.pkl.bz2") + +.. ipython:: python + :suppress: + import os + os.remove("data.pkl.compress") + os.remove("data.pkl.xz") + os.remove("data.pkl.gz") + os.remove("s1.pkl.bz2") + .. _whatsnew_0200.enhancements.uint64_support: UInt64 Support Improved diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 606906bfcd7c4..a0111cb9ef7ec 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1355,7 +1355,7 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', if_exists=if_exists, index=index, index_label=index_label, chunksize=chunksize, dtype=dtype) - def to_pickle(self, path): + def to_pickle(self, path, compression='infer'): """ Pickle (serialize) object to input file path. @@ -1363,9 +1363,13 @@ def to_pickle(self, path): ---------- path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + a string representing the compression to use in the output file + + .. 
versionadded:: 0.20.0 """ from pandas.io.pickle import to_pickle - return to_pickle(self, path) + return to_pickle(self, path, compression=compression) def to_clipboard(self, excel=None, sep=None, **kwargs): """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 0630311620e20..9f4ebb449e102 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -316,7 +316,7 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False): + memory_map=False, is_text=True): """ Get file handle for given path/buffer and mode. @@ -331,7 +331,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, Supported compression protocols are gzip, bz2, zip, and xz memory_map : boolean, default False See parsers._parser_params for more information. - + is_text : boolean, default True + whether file/buffer is in text format (csv, json, etc.), or in binary + mode (pickle, etc.) Returns ------- f : file-like @@ -405,13 +407,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, elif encoding: # Python 3 and encoding f = open(path_or_buf, mode, encoding=encoding) - else: + elif is_text: # Python 3 and no explicit encoding f = open(path_or_buf, mode, errors='replace') + else: + # Python 3 and binary mode + f = open(path_or_buf, mode) handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and (compression or isinstance(f, need_text_wrapping)): + if compat.PY3 and is_text and\ + (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding) handles.append(f) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 2358c296f782e..969a2a51cb15d 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -4,9 +4,10 @@ from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE +from pandas.io.common import _get_handle, _infer_compression -def to_pickle(obj, path): +def to_pickle(obj, path, compression='infer'): """ Pickle (serialize) object to input file path @@ -15,12 +16,23 @@ def to_pickle(obj, path): obj : any object path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + a string representing the compression to use in the output file + + .. versionadded:: 0.20.0 """ - with open(path, 'wb') as f: + inferred_compression = _infer_compression(path, compression) + f, fh = _get_handle(path, 'wb', + compression=inferred_compression, + is_text=False) + try: pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) + finally: + for _f in fh: + _f.close() -def read_pickle(path): +def read_pickle(path, compression='infer'): """ Load pickled pandas object (or any other pickled object) from the specified file path @@ -32,12 +44,32 @@ def read_pickle(path): ---------- path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz', + or 'zip' respectively, and no decompression otherwise. + Set to None for no decompression. + + .. 
versionadded:: 0.20.0
     """
-    with open(path, 'wb') as f:
+    inferred_compression = _infer_compression(path, compression)
+    f, fh = _get_handle(path, 'wb',
+                        compression=inferred_compression,
+                        is_text=False)
+    try:
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
+    finally:
+        for _f in fh:
+            _f.close()


-def read_pickle(path):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the
     specified file path
@@ -32,12 +44,32 @@
     ----------
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', '.xz',
+        or '.zip' respectively, and no decompression otherwise.
+        Set to None for no decompression.
+
+        .. 
+ .format(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(src_path, "r") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + open(dest_path, "wb").write(f.read()) + f.close() + + +@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) +def test_write_explicit(compression): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".compressed" + path2 = base + ".raw" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to compressed file + df.to_pickle(p1, compression=compression) + + # decompress + decompress_file(p1, p2, compression=compression) + + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) +def test_write_explicit_bad(compression): + with tm.assertRaisesRegexp(ValueError, + "Unrecognized compression type"): + with tm.ensure_clean(get_random_path()) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) + + +@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) +def test_write_infer(ext): + if ext == '.xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ext + path2 = base + ".raw" + compression = None + for c in _compression_to_extension: + if _compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to compressed file by inferred compression method + df.to_pickle(p1) + + # decompress + decompress_file(p1, p2, compression=compression) + + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) +def test_read_explicit(compression): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".raw" + path2 = base + ".compressed" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress + compress_file(p1, p2, compression=compression) + + # read compressed file + df2 = pd.read_pickle(p2, compression=compression) + + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', + '.no_compress']) +def test_read_infer(ext): + if ext == '.xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".raw" + path2 = base + ext + compression = None + for c in _compression_to_extension: + if _compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress + compress_file(p1, p2, compression=compression) + + # read compressed file by inferred compression method + df2 = pd.read_pickle(p2) + + tm.assert_frame_equal(df, df2) From 55507caf87bd25d523e695c145fd8aa57b8f4fc4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 9 Mar 2017 09:50:04 -0500 Subject: [PATCH 266/338] TST: fix up compression tests / docs --- doc/source/io.rst | 55 +++--- doc/source/whatsnew/v0.20.0.txt | 40 +++-- pandas/tests/io/test_pickle.py | 289 
++++++++++----------
 3 files changed, 208 insertions(+), 176 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 67491c8b30de7..fdd33ab4625f3 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -3042,22 +3042,19 @@ any pickled pandas object (or any other pickled object) from file:
    See `this question `__
    for a detailed explanation.

-.. note::
-
-    These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated.
-
 .. _io.pickle.compression:

-Read/Write compressed pickle files
-''''''''''''''
+Compressed pickle files
+'''''''''''''''''''''''

 .. versionadded:: 0.20.0

 :func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` can read
-and write compressed pickle files. Compression types of ``gzip``, ``bz2``, ``xz`` supports
-both read and write. ``zip`` file supports read only and must contain only one data file
+and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing.
+``zip`` file supports read only and must contain only one data file
 to be read in.
-Compression type can be an explicitely parameter or be inferred from the file extension.
+
+The compression type can be an explicit parameter or be inferred from the file extension.
 If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
 ``'.xz'``, respectively.

@@ -3065,17 +3062,37 @@ If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in `

    df = pd.DataFrame({
        'A': np.random.randn(1000),
-       'B': np.random.randn(1000),
-       'C': np.random.randn(1000)})
-   df.to_pickle("data.pkl.compress", compression="gzip")  # explicit compression type
-   df.to_pickle("data.pkl.xz", compression="infer")  # infer compression type from extension
-   df.to_pickle("data.pkl.gz")  # default, using "infer"
-   df["A"].to_pickle("s1.pkl.bz2")
+       'B': 'foo',
+       'C': pd.date_range('20130101', periods=1000, freq='s')})
+   df
+
+Using an explicit compression type
+
+.. ipython:: python

-   df = pd.read_pickle("data.pkl.compress", compression="gzip")
-   df = pd.read_pickle("data.pkl.xz", compression="infer")
-   df = pd.read_pickle("data.pkl.gz")
-   s = pd.read_pickle("s1.pkl.bz2")
+   df.to_pickle("data.pkl.compress", compression="gzip")
+   rt = pd.read_pickle("data.pkl.compress", compression="gzip")
+   rt
+
+Inferring compression type from the extension
+
+.. ipython:: python
+
+   df.to_pickle("data.pkl.xz", compression="infer")
+   rt = pd.read_pickle("data.pkl.xz", compression="infer")
+   rt
+
+The default is to 'infer'
+
+.. ipython:: python
+
+   df.to_pickle("data.pkl.gz")
+   rt = pd.read_pickle("data.pkl.gz")
+   rt
+
+   df["A"].to_pickle("s1.pkl.bz2")
+   rt = pd.read_pickle("s1.pkl.bz2")
+   rt

 .. ipython:: python
    :suppress:
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 4b320d21fe738..8f671062464f0 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -102,23 +102,41 @@ Pickle file I/O now supports compression
 :func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle`
 can now read from and write to compressed pickle files. Compression methods
 can be an explicit parameter or be inferred from the file extension.
-See :ref:`Read/Write compressed pickle files `
+See :ref:`the docs here `

.. 
ipython:: python

    df = pd.DataFrame({
        'A': np.random.randn(1000),
-       'B': np.random.randn(1000),
-       'C': np.random.randn(1000)})
-   df.to_pickle("data.pkl.compress", compression="gzip")  # explicit compression type
-   df.to_pickle("data.pkl.xz", compression="infer")  # infer compression type from extension
-   df.to_pickle("data.pkl.gz")  # default, using "infer"
-   df["A"].to_pickle("s1.pkl.bz2")
+       'B': 'foo',
+       'C': pd.date_range('20130101', periods=1000, freq='s')})
+
+Using an explicit compression type
+
+.. ipython:: python

-   df = pd.read_pickle("data.pkl.compress", compression="gzip")
-   df = pd.read_pickle("data.pkl.xz", compression="infer")
-   df = pd.read_pickle("data.pkl.gz")
-   s = pd.read_pickle("s1.pkl.bz2")
+   df.to_pickle("data.pkl.compress", compression="gzip")
+   rt = pd.read_pickle("data.pkl.compress", compression="gzip")
+   rt
+
+Inferring compression type from the extension
+
+.. ipython:: python
+
+   df.to_pickle("data.pkl.xz", compression="infer")
+   rt = pd.read_pickle("data.pkl.xz", compression="infer")
+   rt
+
+The default is to 'infer'
+
+.. ipython:: python
+
+   df.to_pickle("data.pkl.gz")
+   rt = pd.read_pickle("data.pkl.gz")
+   rt
+   df["A"].to_pickle("s1.pkl.bz2")
+   rt = pd.read_pickle("s1.pkl.bz2")
+   rt

 .. ipython:: python
    :suppress:
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 2fffc3c39ec26..91e70e942089c 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -306,191 +306,188 @@ def test_pickle_v0_15_2():
 # ---------------------
 # test pickle compression
 # ---------------------
-_compression_to_extension = {
-    None: ".none",
-    'gzip': '.gz',
-    'bz2': '.bz2',
-    'zip': '.zip',
-    'xz': '.xz',
-}
-

+@pytest.fixture
 def get_random_path():
     return u'__%s__.pickle' % tm.rands(10)


-def compress_file(src_path, dest_path, compression):
-    if compression is None:
-        shutil.copyfile(src_path, dest_path)
-        return
+class TestCompression(object):

-    if compression == 'gzip':
-        import gzip
-        f = gzip.open(dest_path, "w")
-    elif compression == 'bz2':
-        import bz2
-        f = bz2.BZ2File(dest_path, "w")
-    elif compression == 'zip':
-        import zipfile
-        zip_file = zipfile.ZipFile(dest_path, "w",
-                                   compression=zipfile.ZIP_DEFLATED)
-        zip_file.write(src_path, os.path.basename(src_path))
-    elif compression == 'xz':
-        lzma = pandas.compat.import_lzma()
-        f = lzma.LZMAFile(dest_path, "w")
-    else:
-        msg = 'Unrecognized compression type: {}'.format(compression)
-        raise ValueError(msg)
+    _compression_to_extension = {
+        None: ".none",
+        'gzip': '.gz',
+        'bz2': '.bz2',
+        'zip': '.zip',
+        'xz': '.xz',
+    }

-    if compression != "zip":
-        f.write(open(src_path, "rb").read())
-        f.close()
+    def compress_file(self, src_path, dest_path, compression):
+        if compression is None:
+            shutil.copyfile(src_path, dest_path)
+            return

-def decompress_file(src_path, dest_path, compression):
-    if compression is None:
-        shutil.copyfile(src_path, dest_path)
-        return
+        if compression == 'gzip':
+            import gzip
+            f = gzip.open(dest_path, "w")
+        elif compression == 'bz2':
+            import bz2
+            f = bz2.BZ2File(dest_path, "w")
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(dest_path, "w",
+                                       compression=zipfile.ZIP_DEFLATED)
+            
zip_file.write(src_path, os.path.basename(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(dest_path, "w") else: - raise ValueError('ZIP file {} error. Only one file per ZIP.' - .format(src_path)) - elif compression == 'xz': - lzma = pandas.compat.import_lzma() - f = lzma.LZMAFile(src_path, "r") - else: - msg = 'Unrecognized compression type: {}'.format(compression) - raise ValueError(msg) - - open(dest_path, "wb").write(f.read()) - f.close() + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + if compression != "zip": + f.write(open(src_path, "rb").read()) + f.close() -@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) -def test_write_explicit(compression): - # issue 11666 - if compression == 'xz': - tm._skip_if_no_lzma() + def decompress_file(self, src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return - base = get_random_path() - path1 = base + ".compressed" - path2 = base + ".raw" + if compression == 'gzip': + import gzip + f = gzip.open(src_path, "r") + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(src_path, "r") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(src_path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError('ZIP file {} error. Only one file per ZIP.' + .format(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(src_path, "r") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + open(dest_path, "wb").write(f.read()) + f.close() - # write to compressed file - df.to_pickle(p1, compression=compression) + @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) + def test_write_explicit(self, compression, get_random_path): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() - # decompress - decompress_file(p1, p2, compression=compression) + base = get_random_path + path1 = base + ".compressed" + path2 = base + ".raw" - # read decompressed file - df2 = pd.read_pickle(p2, compression=None) + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - tm.assert_frame_equal(df, df2) + # write to compressed file + df.to_pickle(p1, compression=compression) + # decompress + self.decompress_file(p1, p2, compression=compression) -@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) -def test_write_explicit_bad(compression): - with tm.assertRaisesRegexp(ValueError, - "Unrecognized compression type"): - with tm.ensure_clean(get_random_path()) as path: - df = tm.makeDataFrame() - df.to_pickle(path, compression=compression) + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + tm.assert_frame_equal(df, df2) -@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) -def test_write_infer(ext): - if ext == '.xz': - tm._skip_if_no_lzma() + @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) + def test_write_explicit_bad(self, compression, get_random_path): + with tm.assertRaisesRegexp(ValueError, + "Unrecognized compression type"): + with tm.ensure_clean(get_random_path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) - base = get_random_path() - path1 = base + ext - path2 = base + ".raw" - 
compression = None - for c in _compression_to_extension: - if _compression_to_extension[c] == ext: - compression = c - break + @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) + def test_write_infer(self, ext, get_random_path): + if ext == '.xz': + tm._skip_if_no_lzma() - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + base = get_random_path + path1 = base + ext + path2 = base + ".raw" + compression = None + for c in self._compression_to_extension: + if self._compression_to_extension[c] == ext: + compression = c + break - # write to compressed file by inferred compression method - df.to_pickle(p1) - - # decompress - decompress_file(p1, p2, compression=compression) + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - # read decompressed file - df2 = pd.read_pickle(p2, compression=None) + # write to compressed file by inferred compression method + df.to_pickle(p1) - tm.assert_frame_equal(df, df2) + # decompress + self.decompress_file(p1, p2, compression=compression) + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) -@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) -def test_read_explicit(compression): - # issue 11666 - if compression == 'xz': - tm._skip_if_no_lzma() + tm.assert_frame_equal(df, df2) - base = get_random_path() - path1 = base + ".raw" - path2 = base + ".compressed" + @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) + def test_read_explicit(self, compression, get_random_path): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + base = get_random_path + path1 = base + ".raw" + path2 = base + ".compressed" - # write to uncompressed file - df.to_pickle(p1, compression=None) + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - # compress - compress_file(p1, p2, compression=compression) + # write to uncompressed file + df.to_pickle(p1, compression=None) - # read compressed file - df2 = pd.read_pickle(p2, compression=compression) + # compress + self.compress_file(p1, p2, compression=compression) - tm.assert_frame_equal(df, df2) + # read compressed file + df2 = pd.read_pickle(p2, compression=compression) + tm.assert_frame_equal(df, df2) -@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', - '.no_compress']) -def test_read_infer(ext): - if ext == '.xz': - tm._skip_if_no_lzma() + @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', + '.no_compress']) + def test_read_infer(self, ext, get_random_path): + if ext == '.xz': + tm._skip_if_no_lzma() - base = get_random_path() - path1 = base + ".raw" - path2 = base + ext - compression = None - for c in _compression_to_extension: - if _compression_to_extension[c] == ext: - compression = c - break + base = get_random_path + path1 = base + ".raw" + path2 = base + ext + compression = None + for c in self._compression_to_extension: + if self._compression_to_extension[c] == ext: + compression = c + break - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - # write to uncompressed file - df.to_pickle(p1, compression=None) + # write to uncompressed file + df.to_pickle(p1, compression=None) - # compress - compress_file(p1, p2, compression=compression) + # 
compress
+            self.compress_file(p1, p2, compression=compression)

-        # read compressed file by inferred compression method
-        df2 = pd.read_pickle(p2)
+            # read compressed file by inferred compression method
+            df2 = pd.read_pickle(p2)

-        tm.assert_frame_equal(df, df2)
+            tm.assert_frame_equal(df, df2)

From 35145b2466b50cea719b71050bbd50d201ba9c06 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 10 Mar 2017 00:15:03 +0100
Subject: [PATCH 267/338] DOC: remove latex and parallel building (#15637)

---
 ci/build_docs.sh                | 3 ---
 doc/make.py                     | 2 +-
 doc/source/io.rst               | 5 +++--
 doc/source/whatsnew/v0.20.0.txt | 1 +
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 5dc649a91c4f7..bfe7a1eed756b 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -23,9 +23,6 @@ if [ x"$DOC_BUILD" != x"" ]; then

     source activate pandas

-    # install sudo deps
-    time sudo apt-get $APT_ARGS install dvipng texlive-latex-base texlive-latex-extra
-
     mv "$TRAVIS_BUILD_DIR"/doc /tmp
     cd /tmp/doc

diff --git a/doc/make.py b/doc/make.py
index a2f5be5594e44..30cd2ad8b61c9 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -197,7 +197,7 @@ def html():
             print(e)
             print("Failed to convert %s" % nb)

-    if os.system('sphinx-build -j 2 -P -b html -d build/doctrees '
+    if os.system('sphinx-build -P -b html -d build/doctrees '
                  'source build/html'):
         raise SystemExit("Building HTML failed.")
     try:
diff --git a/doc/source/io.rst b/doc/source/io.rst
index fdd33ab4625f3..a702efdc6aaf9 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2070,9 +2070,9 @@ by the Table Schema spec. The full list of types supported are described in the
 Table Schema spec. This table shows the mapping from pandas types:

-============== =================
+=============== =================
 Pandas type     Table Schema type
-============== =================
+=============== =================
 int64           integer
 float64         number
 bool            boolean
@@ -3096,6 +3096,7 @@ The default is to 'infer'
 .. ipython:: python
    :suppress:

+   import os
    os.remove("data.pkl.compress")
    os.remove("data.pkl.xz")
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8f671062464f0..cf3dddc3a2933 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -140,6 +140,7 @@ The default is to 'infer'
 .. ipython:: python
    :suppress:

+   import os
    os.remove("data.pkl.compress")
    os.remove("data.pkl.xz")
    os.remove("data.pkl.gz")

From 0696bb19ea9ea04b6bbb799447c8f3514187500d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 10 Mar 2017 03:24:36 -0500
Subject: [PATCH 268/338] DOC: increase recursion limit on sphinx builds
 (#15641)

---
 doc/source/conf.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 6840f76866d2c..0b0de16411e9b 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -16,6 +16,14 @@
 import inspect
 from pandas.compat import u, PY3

+# https://github.com/sphinx-doc/sphinx/pull/2325/files
+# Workaround for sphinx-build recursion limit overflow:
+# pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL)
+#  RuntimeError: maximum recursion depth exceeded while pickling an object
+#
+# Python's default allowed recursion depth is 1000.
+sys.setrecursionlimit(5000)
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here. 
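Taken together, the pickle-compression changes in patches 265 through 267 enable round-trips like the following. A minimal sketch, with an illustrative file name, relying on the new default of compression='infer' so gzip is deduced from the '.gz' extension:

    >>> import numpy as np
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': np.arange(3)})
    >>> df.to_pickle('tmp.pkl.gz')    # gzip inferred from the extension
    >>> pd.read_pickle('tmp.pkl.gz').equals(df)
    True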
From 9eaac952d95564fc23f9a1dab13b9f344889d3e1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 10 Mar 2017 06:20:01 -0500 Subject: [PATCH 269/338] DEPR: remove more .ix warnings from tests Author: Jeff Reback Closes #15638 from jreback/indexing and squashes the following commits: 8b82bd6 [Jeff Reback] CLN: split test_indexing.py 23e82eb [Jeff Reback] DEPR: remove more .ix warnings from tests --- pandas/tests/indexing/common.py | 257 ++ .../indexing/test_chaining_and_caching.py | 96 +- pandas/tests/indexing/test_iloc.py | 590 ++++ pandas/tests/indexing/test_indexing.py | 2811 +---------------- pandas/tests/indexing/test_ix.py | 333 ++ pandas/tests/indexing/test_loc.py | 630 ++++ pandas/tests/indexing/test_multiindex.py | 225 +- pandas/tests/indexing/test_panel.py | 12 +- pandas/tests/indexing/test_partial.py | 587 ++++ pandas/tests/indexing/test_scalar.py | 156 + 10 files changed, 2906 insertions(+), 2791 deletions(-) create mode 100644 pandas/tests/indexing/test_iloc.py create mode 100644 pandas/tests/indexing/test_ix.py create mode 100644 pandas/tests/indexing/test_loc.py create mode 100644 pandas/tests/indexing/test_partial.py create mode 100644 pandas/tests/indexing/test_scalar.py diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 73167393cf35d..c7637a00910c6 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -1,5 +1,262 @@ """ common utilities """ +import itertools +from warnings import catch_warnings +import numpy as np + +from pandas.compat import lrange +from pandas.types.common import is_scalar +from pandas import Series, DataFrame, Panel, date_range, UInt64Index +from pandas.util import testing as tm +from pandas.formats.printing import pprint_thing + +_verbose = False + def _mklbl(prefix, n): return ["%s%s" % (prefix, i) for i in range(n)] + + +def _axify(obj, key, axis): + # create a tuple accessor + axes = [slice(None)] * obj.ndim + axes[axis] = key + return tuple(axes) + + +class Base(object): + """ indexing comprehensive base class """ + + _objs = set(['series', 'frame', 'panel']) + _typs = set(['ints', 'uints', 'labels', 'mixed', + 'ts', 'floats', 'empty', 'ts_rev']) + + def setUp(self): + + self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2)) + self.frame_ints = DataFrame(np.random.randn(4, 4), + index=lrange(0, 8, 2), + columns=lrange(0, 12, 3)) + self.panel_ints = Panel(np.random.rand(4, 4, 4), + items=lrange(0, 8, 2), + major_axis=lrange(0, 12, 3), + minor_axis=lrange(0, 16, 4)) + + self.series_uints = Series(np.random.rand(4), + index=UInt64Index(lrange(0, 8, 2))) + self.frame_uints = DataFrame(np.random.randn(4, 4), + index=UInt64Index(lrange(0, 8, 2)), + columns=UInt64Index(lrange(0, 12, 3))) + self.panel_uints = Panel(np.random.rand(4, 4, 4), + items=UInt64Index(lrange(0, 8, 2)), + major_axis=UInt64Index(lrange(0, 12, 3)), + minor_axis=UInt64Index(lrange(0, 16, 4))) + + self.series_labels = Series(np.random.randn(4), index=list('abcd')) + self.frame_labels = DataFrame(np.random.randn(4, 4), + index=list('abcd'), columns=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4, 4, 4), + items=list('abcd'), + major_axis=list('ABCD'), + minor_axis=list('ZYXW')) + + self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) + self.frame_mixed = DataFrame(np.random.randn(4, 4), + index=[2, 4, 'null', 8]) + self.panel_mixed = Panel(np.random.randn(4, 4, 4), + items=[2, 4, 'null', 8]) + + self.series_ts = Series(np.random.randn(4), + index=date_range('20130101', 
periods=4))
+        self.frame_ts = DataFrame(np.random.randn(4, 4),
+                                  index=date_range('20130101', periods=4))
+        self.panel_ts = Panel(np.random.randn(4, 4, 4),
+                              items=date_range('20130101', periods=4))
+
+        dates_rev = (date_range('20130101', periods=4)
+                     .sort_values(ascending=False))
+        self.series_ts_rev = Series(np.random.randn(4),
+                                    index=dates_rev)
+        self.frame_ts_rev = DataFrame(np.random.randn(4, 4),
+                                      index=dates_rev)
+        self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
+                                  items=dates_rev)
+
+        self.frame_empty = DataFrame({})
+        self.series_empty = Series({})
+        self.panel_empty = Panel({})
+
+        # form agglomerates
+        for o in self._objs:
+
+            d = dict()
+            for t in self._typs:
+                d[t] = getattr(self, '%s_%s' % (o, t), None)
+
+            setattr(self, o, d)
+
+    def generate_indices(self, f, values=False):
+        """ generate the indices
+        if values is True, use the axis values
+        if values is False, use the range
+        """
+
+        axes = f.axes
+        if values:
+            axes = [lrange(len(a)) for a in axes]
+
+        return itertools.product(*axes)
+
+    def get_result(self, obj, method, key, axis):
+        """ return the result for this obj with this key and this axis """
+
+        if isinstance(key, dict):
+            key = key[axis]
+
+        # use an artificial conversion to map the key as integers to the labels
+        # so ix can work for comparisons
+        if method == 'indexer':
+            method = 'ix'
+            key = obj._get_axis(axis)[key]
+
+        # in case we actually want 0 index slicing
+        try:
+            with catch_warnings(record=True):
+                xp = getattr(obj, method).__getitem__(_axify(obj, key, axis))
+        except:
+            xp = getattr(obj, method).__getitem__(key)
+
+        return xp
+
+    def get_value(self, f, i, values=False):
+        """ return the value for the location i """
+
+        # check against values
+        if values:
+            return f.values[i]
+
+        # this is equivalent to f[col][row].....
+ # v = f + # for a in reversed(i): + # v = v.__getitem__(a) + # return v + with catch_warnings(record=True): + return f.ix[i] + + def check_values(self, f, func, values=False): + + if f is None: + return + axes = f.axes + indicies = itertools.product(*axes) + + for i in indicies: + result = getattr(f, func)[i] + + # check agains values + if values: + expected = f.values[i] + else: + expected = f + for a in reversed(i): + expected = expected.__getitem__(a) + + tm.assert_almost_equal(result, expected) + + def check_result(self, name, method1, key1, method2, key2, typs=None, + objs=None, axes=None, fails=None): + def _eq(t, o, a, obj, k1, k2): + """ compare equal for these 2 keys """ + + if a is not None and a > obj.ndim - 1: + return + + def _print(result, error=None): + if error is not None: + error = str(error) + v = ("%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s," + "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % + (name, result, t, o, method1, method2, a, error or '')) + if _verbose: + pprint_thing(v) + + try: + rs = getattr(obj, method1).__getitem__(_axify(obj, k1, a)) + + try: + xp = self.get_result(obj, method2, k2, a) + except: + result = 'no comp' + _print(result) + return + + detail = None + + try: + if is_scalar(rs) and is_scalar(xp): + self.assertEqual(rs, xp) + elif xp.ndim == 1: + tm.assert_series_equal(rs, xp) + elif xp.ndim == 2: + tm.assert_frame_equal(rs, xp) + elif xp.ndim == 3: + tm.assert_panel_equal(rs, xp) + result = 'ok' + except AssertionError as e: + detail = str(e) + result = 'fail' + + # reverse the checks + if fails is True: + if result == 'fail': + result = 'ok (fail)' + + _print(result) + if not result.startswith('ok'): + raise AssertionError(detail) + + except AssertionError: + raise + except Exception as detail: + + # if we are in fails, the ok, otherwise raise it + if fails is not None: + if isinstance(detail, fails): + result = 'ok (%s)' % type(detail).__name__ + _print(result) + return + + result = type(detail).__name__ + raise AssertionError(_print(result, error=detail)) + + if typs is None: + typs = self._typs + + if objs is None: + objs = self._objs + + if axes is not None: + if not isinstance(axes, (tuple, list)): + axes = [axes] + else: + axes = list(axes) + else: + axes = [0, 1, 2] + + # check + for o in objs: + if o not in self._objs: + continue + + d = getattr(self, o) + for a in axes: + for t in typs: + if t not in self._typs: + continue + + obj = d[t] + if obj is not None: + obj = obj.copy() + + k2 = key2 + _eq(t, o, a, obj, key1, k2) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 0e921aaf826f9..72e704537ba3f 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -1,3 +1,5 @@ +from warnings import catch_warnings + import numpy as np import pandas as pd from pandas.core import common as com @@ -41,13 +43,13 @@ def test_setitem_cache_updating(self): # ref the cache if do_ref: - df.ix[0, "c"] + df.loc[0, "c"] # set it - df.ix[7, 'c'] = 1 + df.loc[7, 'c'] = 1 - self.assertEqual(df.ix[0, 'c'], 0.0) - self.assertEqual(df.ix[7, 'c'], 1.0) + self.assertEqual(df.loc[0, 'c'], 0.0) + self.assertEqual(df.loc[7, 'c'], 1.0) # GH 7084 # not updating cache on series setting with slices @@ -226,21 +228,21 @@ def random_text(nobs=100): # explicity copy indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.ix[indexer].copy() + df = df.loc[indexer].copy() self.assertIsNone(df.is_copy) df['letters'] = 
df['letters'].apply(str.lower) # implicity take df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.ix[indexer] + df = df.loc[indexer] self.assertIsNotNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) # implicity take 2 df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.ix[indexer] + df = df.loc[indexer] self.assertIsNotNone(df.is_copy) df.loc[:, 'letters'] = df['letters'].apply(str.lower) @@ -251,7 +253,8 @@ def random_text(nobs=100): df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) - df.ix[indexer, 'letters'] = df.ix[indexer, 'letters'].apply(str.lower) + df.loc[indexer, 'letters'] = ( + df.loc[indexer, 'letters'].apply(str.lower)) # an identical take, so no copy df = DataFrame({'a': [1]}).dropna() @@ -312,12 +315,12 @@ def f(): D=list('abcde'))) def f(): - df.ix[2]['D'] = 'foo' + df.loc[2]['D'] = 'foo' self.assertRaises(com.SettingWithCopyError, f) def f(): - df.ix[2]['C'] = 'foo' + df.loc[2]['C'] = 'foo' self.assertRaises(com.SettingWithCopyError, f) @@ -356,3 +359,76 @@ def test_detect_chained_assignment_warnings(self): with tm.assert_produces_warning( expected_warning=com.SettingWithCopyWarning): df.loc[0]['A'] = 111 + + def test_chained_getitem_with_lists(self): + + # GH6394 + # Regression in chained getitem indexing with embedded list-like from + # 0.12 + def check(result, expected): + tm.assert_numpy_array_equal(result, expected) + tm.assertIsInstance(result, np.ndarray) + + df = DataFrame({'A': 5 * [np.zeros(3)], 'B': 5 * [np.ones(3)]}) + expected = df['A'].iloc[2] + result = df.loc[2, 'A'] + check(result, expected) + result2 = df.iloc[2]['A'] + check(result2, expected) + result3 = df['A'].loc[2] + check(result3, expected) + result4 = df['A'].iloc[2] + check(result4, expected) + + def test_cache_updating(self): + # GH 4939, make sure to update the cache on setitem + + df = tm.makeDataFrame() + df['A'] # cache series + with catch_warnings(record=True): + df.ix["Hello Friend"] = df.ix[0] + self.assertIn("Hello Friend", df['A'].index) + self.assertIn("Hello Friend", df['B'].index) + + with catch_warnings(record=True): + panel = tm.makePanel() + panel.ix[0] # get first item into cache + panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1 + self.assertIn("A+1", panel.ix[0].columns) + self.assertIn("A+1", panel.ix[1].columns) + + # 5216 + # make sure that we don't try to set a dead cache + a = np.random.rand(10, 3) + df = DataFrame(a, columns=['x', 'y', 'z']) + tuples = [(i, j) for i in range(5) for j in range(2)] + index = MultiIndex.from_tuples(tuples) + df.index = index + + # setting via chained assignment + # but actually works, since everything is a view + df.loc[0]['z'].iloc[0] = 1. + result = df.loc[(0, 0), 'z'] + self.assertEqual(result, 1) + + # correct setting + df.loc[(0, 0), 'z'] = 2 + result = df.loc[(0, 0), 'z'] + self.assertEqual(result, 2) + + # 10264 + df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[ + 'a', 'b', 'c', 'd', 'e'], index=range(5)) + df['f'] = 0 + df.f.values[3] = 1 + + # TODO(wesm): unused? 
+ # y = df.iloc[np.arange(2, len(df))] + + df.f.values[3] = 2 + expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ + 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5)) + expected.at[3, 'f'] = 2 + tm.assert_frame_equal(df, expected) + expected = Series([0, 0, 0, 2, 0], name='f') + tm.assert_series_equal(df.f, expected) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py new file mode 100644 index 0000000000000..517194835ca73 --- /dev/null +++ b/pandas/tests/indexing/test_iloc.py @@ -0,0 +1,590 @@ +""" test positional based indexing with iloc """ + +from warnings import catch_warnings +import numpy as np + +import pandas as pd +from pandas.compat import lrange, lmap +from pandas import Series, DataFrame, date_range, concat, isnull +from pandas.util import testing as tm +from pandas.tests.indexing.common import Base + + +class TestiLoc(Base, tm.TestCase): + + def test_iloc_exceeds_bounds(self): + + # GH6296 + # iloc should allow indexers that exceed the bounds + df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE')) + expected = df + + # lists of positions should raise IndexErrror! + with tm.assertRaisesRegexp(IndexError, + 'positional indexers are out-of-bounds'): + df.iloc[:, [0, 1, 2, 3, 4, 5]] + self.assertRaises(IndexError, lambda: df.iloc[[1, 30]]) + self.assertRaises(IndexError, lambda: df.iloc[[1, -30]]) + self.assertRaises(IndexError, lambda: df.iloc[[100]]) + + s = df['A'] + self.assertRaises(IndexError, lambda: s.iloc[[100]]) + self.assertRaises(IndexError, lambda: s.iloc[[-100]]) + + # still raise on a single indexer + msg = 'single positional indexer is out-of-bounds' + with tm.assertRaisesRegexp(IndexError, msg): + df.iloc[30] + self.assertRaises(IndexError, lambda: df.iloc[-30]) + + # GH10779 + # single positive/negative indexer exceeding Series bounds should raise + # an IndexError + with tm.assertRaisesRegexp(IndexError, msg): + s.iloc[30] + self.assertRaises(IndexError, lambda: s.iloc[-30]) + + # slices are ok + result = df.iloc[:, 4:10] # 0 < start < len < stop + expected = df.iloc[:, 4:] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, -4:-10] # stop < 0 < start < len + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 10:4:-1] # 0 < stop < len < start (down) + expected = df.iloc[:, :4:-1] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 4:-10:-1] # stop < 0 < start < len (down) + expected = df.iloc[:, 4::-1] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, -10:4] # start < 0 < stop < len + expected = df.iloc[:, :4] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 10:4] # 0 < stop < len < start + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, -10:-11:-1] # stop < start < 0 < len (down) + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 10:11] # 0 < len < start < stop + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + # slice bounds exceeding is ok + result = s.iloc[18:30] + expected = s.iloc[18:] + tm.assert_series_equal(result, expected) + + result = s.iloc[30:] + expected = s.iloc[:0] + tm.assert_series_equal(result, expected) + + result = s.iloc[30::-1] + expected = s.iloc[::-1] + tm.assert_series_equal(result, expected) + + # doc example + def check(result, expected): + str(result) + result.dtypes + tm.assert_frame_equal(result, expected) + + dfl = DataFrame(np.random.randn(5, 2), 
columns=list('AB')) + check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) + check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) + check(dfl.iloc[4:6], dfl.iloc[[4]]) + + self.assertRaises(IndexError, lambda: dfl.iloc[[4, 5, 6]]) + self.assertRaises(IndexError, lambda: dfl.iloc[:, 4]) + + def test_iloc_getitem_int(self): + + # integer + self.check_result('integer', 'iloc', 2, 'ix', + {0: 4, 1: 6, 2: 8}, typs=['ints', 'uints']) + self.check_result('integer', 'iloc', 2, 'indexer', 2, + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) + + def test_iloc_getitem_neg_int(self): + + # neg integer + self.check_result('neg int', 'iloc', -1, 'ix', + {0: 6, 1: 9, 2: 12}, typs=['ints', 'uints']) + self.check_result('neg int', 'iloc', -1, 'indexer', -1, + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) + + def test_iloc_getitem_list_int(self): + + # list of ints + self.check_result('list int', 'iloc', [0, 1, 2], 'ix', + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, + typs=['ints', 'uints']) + self.check_result('list int', 'iloc', [2], 'ix', + {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) + self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2], + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) + + # array of ints (GH5006), make sure that a single indexer is returning + # the correct type + self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'ix', + {0: [0, 2, 4], + 1: [0, 3, 6], + 2: [0, 4, 8]}, typs=['ints', 'uints']) + self.check_result('array int', 'iloc', np.array([2]), 'ix', + {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) + self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', + [0, 1, 2], + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) + + def test_iloc_getitem_neg_int_can_reach_first_index(self): + # GH10547 and GH10779 + # negative integers should be able to reach index 0 + df = DataFrame({'A': [2, 3, 5], 'B': [7, 11, 13]}) + s = df['A'] + + expected = df.iloc[0] + result = df.iloc[-3] + tm.assert_series_equal(result, expected) + + expected = df.iloc[[0]] + result = df.iloc[[-3]] + tm.assert_frame_equal(result, expected) + + expected = s.iloc[0] + result = s.iloc[-3] + self.assertEqual(result, expected) + + expected = s.iloc[[0]] + result = s.iloc[[-3]] + tm.assert_series_equal(result, expected) + + # check the length 1 Series case highlighted in GH10547 + expected = pd.Series(['a'], index=['A']) + result = expected.iloc[[-1]] + tm.assert_series_equal(result, expected) + + def test_iloc_getitem_dups(self): + + # no dups in panel (bug?) 
+ self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', + {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, + objs=['series', 'frame'], typs=['ints', 'uints']) + + # GH 6766 + df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) + df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df = concat([df1, df2], axis=1) + + # cross-sectional indexing + result = df.iloc[0, 0] + self.assertTrue(isnull(result)) + + result = df.iloc[0, :] + expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'], + name=0) + tm.assert_series_equal(result, expected) + + def test_iloc_getitem_array(self): + + # array like + s = Series(index=lrange(1, 4)) + self.check_result('array like', 'iloc', s.index, 'ix', + {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, + typs=['ints', 'uints']) + + def test_iloc_getitem_bool(self): + + # boolean indexers + b = [True, False, True, False, ] + self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints', 'uints']) + self.check_result('bool', 'iloc', b, 'ix', b, + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) + + def test_iloc_getitem_slice(self): + + # slices + self.check_result('slice', 'iloc', slice(1, 3), 'ix', + {0: [2, 4], 1: [3, 6], 2: [4, 8]}, + typs=['ints', 'uints']) + self.check_result('slice', 'iloc', slice(1, 3), 'indexer', + slice(1, 3), + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) + + def test_iloc_getitem_slice_dups(self): + + df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) + df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), + columns=['A', 'C']) + + # axis=1 + df = concat([df1, df2], axis=1) + tm.assert_frame_equal(df.iloc[:, :4], df1) + tm.assert_frame_equal(df.iloc[:, 4:], df2) + + df = concat([df2, df1], axis=1) + tm.assert_frame_equal(df.iloc[:, :2], df2) + tm.assert_frame_equal(df.iloc[:, 2:], df1) + + exp = concat([df2, df1.iloc[:, [0]]], axis=1) + tm.assert_frame_equal(df.iloc[:, 0:3], exp) + + # axis=0 + df = concat([df, df], axis=0) + tm.assert_frame_equal(df.iloc[0:10, :2], df2) + tm.assert_frame_equal(df.iloc[0:10, 2:], df1) + tm.assert_frame_equal(df.iloc[10:, :2], df2) + tm.assert_frame_equal(df.iloc[10:, 2:], df1) + + def test_iloc_setitem(self): + df = self.frame_ints + + df.iloc[1, 1] = 1 + result = df.iloc[1, 1] + self.assertEqual(result, 1) + + df.iloc[:, 2:3] = 0 + expected = df.iloc[:, 2:3] + result = df.iloc[:, 2:3] + tm.assert_frame_equal(result, expected) + + # GH5771 + s = Series(0, index=[4, 5, 6]) + s.iloc[1:2] += 1 + expected = Series([0, 1, 0], index=[4, 5, 6]) + tm.assert_series_equal(s, expected) + + def test_iloc_setitem_list(self): + + # setitem with an iloc list + df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], + columns=["A", "B", "C"]) + df.iloc[[0, 1], [1, 2]] + df.iloc[[0, 1], [1, 2]] += 100 + + expected = DataFrame( + np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), + index=["A", "B", "C"], columns=["A", "B", "C"]) + tm.assert_frame_equal(df, expected) + + def test_iloc_setitem_dups(self): + + # GH 6766 + # iloc with a mask aligning from another iloc + df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) + df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df = concat([df1, df2], axis=1) + + expected = df.fillna(3) + expected['A'] = expected['A'].astype('float64') + inds = np.isnan(df.iloc[:, 0]) + mask = inds[inds].index + df.iloc[mask, 0] = df.iloc[mask, 2] + tm.assert_frame_equal(df, expected) + + # del a dup column across blocks + expected = DataFrame({0: [1, 2], 1: [3, 4]}) + 
expected.columns = ['B', 'B'] + del df['A'] + tm.assert_frame_equal(df, expected) + + # assign back to self + df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]] + tm.assert_frame_equal(df, expected) + + # reversed x 2 + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( + drop=True) + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( + drop=True) + tm.assert_frame_equal(df, expected) + + def test_iloc_getitem_frame(self): + df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), + columns=lrange(0, 8, 2)) + + result = df.iloc[2] + with catch_warnings(record=True): + exp = df.ix[4] + tm.assert_series_equal(result, exp) + + result = df.iloc[2, 2] + with catch_warnings(record=True): + exp = df.ix[4, 4] + self.assertEqual(result, exp) + + # slice + result = df.iloc[4:8] + with catch_warnings(record=True): + expected = df.ix[8:14] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 2:3] + with catch_warnings(record=True): + expected = df.ix[:, 4:5] + tm.assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[[0, 1, 3]] + with catch_warnings(record=True): + expected = df.ix[[0, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.iloc[[0, 1, 3], [0, 1]] + with catch_warnings(record=True): + expected = df.ix[[0, 2, 6], [0, 2]] + tm.assert_frame_equal(result, expected) + + # neg indicies + result = df.iloc[[-1, 1, 3], [-1, 1]] + with catch_warnings(record=True): + expected = df.ix[[18, 2, 6], [6, 2]] + tm.assert_frame_equal(result, expected) + + # dups indicies + result = df.iloc[[-1, -1, 1, 3], [-1, 1]] + with catch_warnings(record=True): + expected = df.ix[[18, 18, 2, 6], [6, 2]] + tm.assert_frame_equal(result, expected) + + # with index-like + s = Series(index=lrange(1, 5)) + result = df.iloc[s.index] + with catch_warnings(record=True): + expected = df.ix[[2, 4, 6, 8]] + tm.assert_frame_equal(result, expected) + + def test_iloc_getitem_labelled_frame(self): + # try with labelled frame + df = DataFrame(np.random.randn(10, 4), + index=list('abcdefghij'), columns=list('ABCD')) + + result = df.iloc[1, 1] + exp = df.loc['b', 'B'] + self.assertEqual(result, exp) + + result = df.iloc[:, 2:3] + expected = df.loc[:, ['C']] + tm.assert_frame_equal(result, expected) + + # negative indexing + result = df.iloc[-1, -1] + exp = df.loc['j', 'D'] + self.assertEqual(result, exp) + + # out-of-bounds exception + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10, 5])) + + # trying to use a label + self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j', 'D'])) + + def test_iloc_getitem_doc_issue(self): + + # multi axis slicing issue with single block + # surfaced in GH 6059 + + arr = np.random.randn(6, 4) + index = date_range('20130101', periods=6) + columns = list('ABCD') + df = DataFrame(arr, index=index, columns=columns) + + # defines ref_locs + df.describe() + + result = df.iloc[3:5, 0:2] + str(result) + result.dtypes + + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], + columns=columns[0:2]) + tm.assert_frame_equal(result, expected) + + # for dups + df.columns = list('aaaa') + result = df.iloc[3:5, 0:2] + str(result) + result.dtypes + + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], + columns=list('aa')) + tm.assert_frame_equal(result, expected) + + # related + arr = np.random.randn(6, 4) + index = list(range(0, 12, 2)) + columns = list(range(0, 8, 2)) + df = DataFrame(arr, index=index, columns=columns) + + df._data.blocks[0].mgr_locs + result = df.iloc[1:5, 2:4] + str(result) + result.dtypes + 
expected = DataFrame(arr[1:5, 2:4], index=index[1:5], + columns=columns[2:4]) + tm.assert_frame_equal(result, expected) + + def test_iloc_setitem_series(self): + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), + columns=list('ABCD')) + + df.iloc[1, 1] = 1 + result = df.iloc[1, 1] + self.assertEqual(result, 1) + + df.iloc[:, 2:3] = 0 + expected = df.iloc[:, 2:3] + result = df.iloc[:, 2:3] + tm.assert_frame_equal(result, expected) + + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + + s.iloc[1] = 1 + result = s.iloc[1] + self.assertEqual(result, 1) + + s.iloc[:4] = 0 + expected = s.iloc[:4] + result = s.iloc[:4] + tm.assert_series_equal(result, expected) + + s = Series([-1] * 6) + s.iloc[0::2] = [0, 2, 4] + s.iloc[1::2] = [1, 3, 5] + result = s + expected = Series([0, 1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + + def test_iloc_setitem_list_of_lists(self): + + # GH 7551 + # list-of-list is set incorrectly in mixed vs. single dtyped frames + df = DataFrame(dict(A=np.arange(5, dtype='int64'), + B=np.arange(5, 10, dtype='int64'))) + df.iloc[2:4] = [[10, 11], [12, 13]] + expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) + tm.assert_frame_equal(df, expected) + + df = DataFrame( + dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) + df.iloc[2:4] = [['x', 11], ['y', 13]] + expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], + B=[5, 6, 11, 13, 9])) + tm.assert_frame_equal(df, expected) + + def test_iloc_mask(self): + + # GH 3631, iloc with a mask (of a series) should raise + df = DataFrame(lrange(5), list('ABCDE'), columns=['a']) + mask = (df.a % 2 == 0) + self.assertRaises(ValueError, df.iloc.__getitem__, tuple([mask])) + mask.index = lrange(len(mask)) + self.assertRaises(NotImplementedError, df.iloc.__getitem__, + tuple([mask])) + + # ndarray ok + result = df.iloc[np.array([True] * len(mask), dtype=bool)] + tm.assert_frame_equal(result, df) + + # the possibilities + locs = np.arange(4) + nums = 2 ** locs + reps = lmap(bin, nums) + df = DataFrame({'locs': locs, 'nums': nums}, reps) + + expected = { + (None, ''): '0b1100', + (None, '.loc'): '0b1100', + (None, '.iloc'): '0b1100', + ('index', ''): '0b11', + ('index', '.loc'): '0b11', + ('index', '.iloc'): ('iLocation based boolean indexing ' + 'cannot use an indexable as a mask'), + ('locs', ''): 'Unalignable boolean Series provided as indexer ' + '(index of the boolean Series and of the indexed ' + 'object do not match', + ('locs', '.loc'): 'Unalignable boolean Series provided as indexer ' + '(index of the boolean Series and of the ' + 'indexed object do not match', + ('locs', '.iloc'): ('iLocation based boolean indexing on an ' + 'integer type is not available'), + } + + # UserWarnings from reindex of a boolean mask + with catch_warnings(record=True): + result = dict() + for idx in [None, 'index', 'locs']: + mask = (df.nums > 2).values + if idx: + mask = Series(mask, list(reversed(getattr(df, idx)))) + for method in ['', '.loc', '.iloc']: + try: + if method: + accessor = getattr(df, method[1:]) + else: + accessor = df + ans = str(bin(accessor[mask]['nums'].sum())) + except Exception as e: + ans = str(e) + + key = tuple([idx, method]) + r = expected.get(key) + if r != ans: + raise AssertionError( + "[%s] does not match [%s], received [%s]" + % (key, ans, r)) + + def test_iloc_non_unique_indexing(self): + + # GH 4017, non-unique indexing (on the axis) + df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000}) + idx = np.array(lrange(30)) * 99 + expected = df.iloc[idx] + + df3 = 
pd.concat([df, 2 * df, 3 * df]) + result = df3.iloc[idx] + + tm.assert_frame_equal(result, expected) + + df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) + df2 = pd.concat([df2, 2 * df2, 3 * df2]) + + sidx = df2.index.to_series() + expected = df2.iloc[idx[idx <= sidx.max()]] + + new_list = [] + for r, s in expected.iterrows(): + new_list.append(s) + new_list.append(s * 2) + new_list.append(s * 3) + + expected = DataFrame(new_list) + expected = pd.concat([expected, DataFrame(index=idx[idx > sidx.max()]) + ]) + result = df2.loc[idx] + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_iloc_empty_list_indexer_is_ok(self): + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + # vertical empty + tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) + # horizontal empty + tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) + # horizontal empty + tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 4502e0171dfbe..0d6ca383a1be1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1,1648 +1,64 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=W0612,E1101 -import itertools -import warnings + +""" test fancy indexing & misc """ + from warnings import catch_warnings from datetime import datetime from pandas.types.common import (is_integer_dtype, - is_float_dtype, - is_scalar) -from pandas.compat import range, lrange, lzip, StringIO, lmap -from pandas._libs.tslib import NaT -from numpy import nan -from numpy.random import randn + is_float_dtype) +from pandas.compat import range, lrange, lzip, StringIO import numpy as np import pandas as pd -from pandas import option_context from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice -from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, - MultiIndex, Timestamp, Timedelta, UInt64Index) -from pandas.formats.printing import pprint_thing -from pandas import concat -from pandas.core.common import PerformanceWarning -from pandas.tests.indexing.common import _mklbl +from pandas import NaT, DataFrame, Index, Series, MultiIndex import pandas.util.testing as tm -from pandas import date_range +from pandas.tests.indexing.common import Base, _mklbl -_verbose = False # ------------------------------------------------------------------------ # Indexing test cases -def _generate_indices(f, values=False): - """ generate the indicies - if values is True , use the axis values - is False, use the range - """ - - axes = f.axes - if values: - axes = [lrange(len(a)) for a in axes] - - return itertools.product(*axes) - - -def _get_value(f, i, values=False): - """ return the value for the location i """ - - # check agains values - if values: - return f.values[i] - - # this is equiv of f[col][row]..... 
- # v = f - # for a in reversed(i): - # v = v.__getitem__(a) - # return v - with catch_warnings(record=True): - return f.ix[i] - - -def _get_result(obj, method, key, axis): - """ return the result for this obj with this key and this axis """ - - if isinstance(key, dict): - key = key[axis] - - # use an artifical conversion to map the key as integers to the labels - # so ix can work for comparisions - if method == 'indexer': - method = 'ix' - key = obj._get_axis(axis)[key] - - # in case we actually want 0 index slicing - try: - xp = getattr(obj, method).__getitem__(_axify(obj, key, axis)) - except: - xp = getattr(obj, method).__getitem__(key) - - return xp - - -def _axify(obj, key, axis): - # create a tuple accessor - axes = [slice(None)] * obj.ndim - axes[axis] = key - return tuple(axes) - - -class TestIndexing(tm.TestCase): - - _objs = set(['series', 'frame', 'panel']) - _typs = set(['ints', 'uints', 'labels', 'mixed', - 'ts', 'floats', 'empty', 'ts_rev']) - - def setUp(self): - - self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2)) - self.frame_ints = DataFrame(np.random.randn(4, 4), - index=lrange(0, 8, 2), - columns=lrange(0, 12, 3)) - self.panel_ints = Panel(np.random.rand(4, 4, 4), - items=lrange(0, 8, 2), - major_axis=lrange(0, 12, 3), - minor_axis=lrange(0, 16, 4)) - - self.series_uints = Series(np.random.rand(4), - index=UInt64Index(lrange(0, 8, 2))) - self.frame_uints = DataFrame(np.random.randn(4, 4), - index=UInt64Index(lrange(0, 8, 2)), - columns=UInt64Index(lrange(0, 12, 3))) - self.panel_uints = Panel(np.random.rand(4, 4, 4), - items=UInt64Index(lrange(0, 8, 2)), - major_axis=UInt64Index(lrange(0, 12, 3)), - minor_axis=UInt64Index(lrange(0, 16, 4))) - - self.series_labels = Series(np.random.randn(4), index=list('abcd')) - self.frame_labels = DataFrame(np.random.randn(4, 4), - index=list('abcd'), columns=list('ABCD')) - self.panel_labels = Panel(np.random.randn(4, 4, 4), - items=list('abcd'), - major_axis=list('ABCD'), - minor_axis=list('ZYXW')) - - self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) - self.frame_mixed = DataFrame(np.random.randn(4, 4), - index=[2, 4, 'null', 8]) - self.panel_mixed = Panel(np.random.randn(4, 4, 4), - items=[2, 4, 'null', 8]) - - self.series_ts = Series(np.random.randn(4), - index=date_range('20130101', periods=4)) - self.frame_ts = DataFrame(np.random.randn(4, 4), - index=date_range('20130101', periods=4)) - self.panel_ts = Panel(np.random.randn(4, 4, 4), - items=date_range('20130101', periods=4)) - - dates_rev = (date_range('20130101', periods=4) - .sort_values(ascending=False)) - self.series_ts_rev = Series(np.random.randn(4), - index=dates_rev) - self.frame_ts_rev = DataFrame(np.random.randn(4, 4), - index=dates_rev) - self.panel_ts_rev = Panel(np.random.randn(4, 4, 4), - items=dates_rev) - - self.frame_empty = DataFrame({}) - self.series_empty = Series({}) - self.panel_empty = Panel({}) - - # form agglomerates - for o in self._objs: - - d = dict() - for t in self._typs: - d[t] = getattr(self, '%s_%s' % (o, t), None) - - setattr(self, o, d) - - def check_values(self, f, func, values=False): - - if f is None: - return - axes = f.axes - indicies = itertools.product(*axes) - - for i in indicies: - result = getattr(f, func)[i] - - # check agains values - if values: - expected = f.values[i] - else: - expected = f - for a in reversed(i): - expected = expected.__getitem__(a) - - tm.assert_almost_equal(result, expected) - - def check_result(self, name, method1, key1, method2, key2, typs=None, - objs=None, 
axes=None, fails=None): - def _eq(t, o, a, obj, k1, k2): - """ compare equal for these 2 keys """ - - if a is not None and a > obj.ndim - 1: - return - - def _print(result, error=None): - if error is not None: - error = str(error) - v = ("%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s," - "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % - (name, result, t, o, method1, method2, a, error or '')) - if _verbose: - pprint_thing(v) - - try: - rs = getattr(obj, method1).__getitem__(_axify(obj, k1, a)) - - try: - xp = _get_result(obj, method2, k2, a) - except: - result = 'no comp' - _print(result) - return - - detail = None - - try: - if is_scalar(rs) and is_scalar(xp): - self.assertEqual(rs, xp) - elif xp.ndim == 1: - tm.assert_series_equal(rs, xp) - elif xp.ndim == 2: - tm.assert_frame_equal(rs, xp) - elif xp.ndim == 3: - tm.assert_panel_equal(rs, xp) - result = 'ok' - except AssertionError as e: - detail = str(e) - result = 'fail' - - # reverse the checks - if fails is True: - if result == 'fail': - result = 'ok (fail)' - - _print(result) - if not result.startswith('ok'): - raise AssertionError(detail) - - except AssertionError: - raise - except Exception as detail: - - # if we are in fails, the ok, otherwise raise it - if fails is not None: - if isinstance(detail, fails): - result = 'ok (%s)' % type(detail).__name__ - _print(result) - return - - result = type(detail).__name__ - raise AssertionError(_print(result, error=detail)) - - if typs is None: - typs = self._typs - - if objs is None: - objs = self._objs - - if axes is not None: - if not isinstance(axes, (tuple, list)): - axes = [axes] - else: - axes = list(axes) - else: - axes = [0, 1, 2] - - # check - for o in objs: - if o not in self._objs: - continue - - d = getattr(self, o) - for a in axes: - for t in typs: - if t not in self._typs: - continue - - obj = d[t] - if obj is not None: - obj = obj.copy() - - k2 = key2 - _eq(t, o, a, obj, key1, k2) - - def test_ix_deprecation(self): - # GH 15114 - - df = DataFrame({'A': [1, 2, 3]}) - with tm.assert_produces_warning(DeprecationWarning, - check_stacklevel=False): - df.ix[1, 'A'] - - def test_indexer_caching(self): - # GH5727 - # make sure that indexers are in the _internal_names_set - n = 1000001 - arrays = [lrange(n), lrange(n)] - index = MultiIndex.from_tuples(lzip(*arrays)) - s = Series(np.zeros(n), index=index) - str(s) - - # setitem - expected = Series(np.ones(n), index=index) - s = Series(np.zeros(n), index=index) - s[s == 0] = 1 - tm.assert_series_equal(s, expected) - - def test_at_and_iat_get(self): - def _check(f, func, values=False): - - if f is not None: - indicies = _generate_indices(f, values) - for i in indicies: - result = getattr(f, func)[i] - expected = _get_value(f, i, values) - tm.assert_almost_equal(result, expected) - - for o in self._objs: - - d = getattr(self, o) - - # iat - for f in [d['ints'], d['uints']]: - _check(f, 'iat', values=True) - - for f in [d['labels'], d['ts'], d['floats']]: - if f is not None: - self.assertRaises(ValueError, self.check_values, f, 'iat') - - # at - for f in [d['ints'], d['uints'], d['labels'], - d['ts'], d['floats']]: - _check(f, 'at') - - def test_at_and_iat_set(self): - def _check(f, func, values=False): - - if f is not None: - indicies = _generate_indices(f, values) - for i in indicies: - getattr(f, func)[i] = 1 - expected = _get_value(f, i, values) - tm.assert_almost_equal(expected, 1) - - for t in self._objs: - - d = getattr(self, t) - - # iat - for f in [d['ints'], d['uints']]: - _check(f, 'iat', values=True) - - for f in [d['labels'], 
d['ts'], d['floats']]: - if f is not None: - self.assertRaises(ValueError, _check, f, 'iat') - - # at - for f in [d['ints'], d['uints'], d['labels'], - d['ts'], d['floats']]: - _check(f, 'at') - - def test_at_iat_coercion(self): - - # as timestamp is not a tuple! - dates = date_range('1/1/2000', periods=8) - df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) - s = df['A'] - - result = s.at[dates[5]] - xp = s.values[5] - self.assertEqual(result, xp) - - # GH 7729 - # make sure we are boxing the returns - s = Series(['2014-01-01', '2014-02-02'], dtype='datetime64[ns]') - expected = Timestamp('2014-02-02') - - for r in [lambda: s.iat[1], lambda: s.iloc[1]]: - result = r() - self.assertEqual(result, expected) - - s = Series(['1 days', '2 days'], dtype='timedelta64[ns]') - expected = Timedelta('2 days') - - for r in [lambda: s.iat[1], lambda: s.iloc[1]]: - result = r() - self.assertEqual(result, expected) - - def test_iat_invalid_args(self): - pass - - def test_imethods_with_dups(self): - - # GH6493 - # iat/iloc with dups - - s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64') - result = s.iloc[2] - self.assertEqual(result, 2) - result = s.iat[2] - self.assertEqual(result, 2) - - self.assertRaises(IndexError, lambda: s.iat[10]) - self.assertRaises(IndexError, lambda: s.iat[-10]) - - result = s.iloc[[2, 3]] - expected = Series([2, 3], [2, 2], dtype='int64') - tm.assert_series_equal(result, expected) - - df = s.to_frame() - result = df.iloc[2] - expected = Series(2, index=[0], name=2) - tm.assert_series_equal(result, expected) - - result = df.iat[2, 0] - expected = 2 - self.assertEqual(result, 2) - - def test_repeated_getitem_dups(self): - # GH 5678 - # repeated gettitems on a dup index returing a ndarray - df = DataFrame( - np.random.random_sample((20, 5)), - index=['ABCDE' [x % 5] for x in range(20)]) - expected = df.loc['A', 0] - result = df.loc[:, 0].loc['A'] - tm.assert_series_equal(result, expected) - - def test_iloc_exceeds_bounds(self): - - # GH6296 - # iloc should allow indexers that exceed the bounds - df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE')) - expected = df - - # lists of positions should raise IndexErrror! 
- with tm.assertRaisesRegexp(IndexError, - 'positional indexers are out-of-bounds'): - df.iloc[:, [0, 1, 2, 3, 4, 5]] - self.assertRaises(IndexError, lambda: df.iloc[[1, 30]]) - self.assertRaises(IndexError, lambda: df.iloc[[1, -30]]) - self.assertRaises(IndexError, lambda: df.iloc[[100]]) - - s = df['A'] - self.assertRaises(IndexError, lambda: s.iloc[[100]]) - self.assertRaises(IndexError, lambda: s.iloc[[-100]]) - - # still raise on a single indexer - msg = 'single positional indexer is out-of-bounds' - with tm.assertRaisesRegexp(IndexError, msg): - df.iloc[30] - self.assertRaises(IndexError, lambda: df.iloc[-30]) - - # GH10779 - # single positive/negative indexer exceeding Series bounds should raise - # an IndexError - with tm.assertRaisesRegexp(IndexError, msg): - s.iloc[30] - self.assertRaises(IndexError, lambda: s.iloc[-30]) - - # slices are ok - result = df.iloc[:, 4:10] # 0 < start < len < stop - expected = df.iloc[:, 4:] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, -4:-10] # stop < 0 < start < len - expected = df.iloc[:, :0] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, 10:4:-1] # 0 < stop < len < start (down) - expected = df.iloc[:, :4:-1] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, 4:-10:-1] # stop < 0 < start < len (down) - expected = df.iloc[:, 4::-1] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, -10:4] # start < 0 < stop < len - expected = df.iloc[:, :4] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, 10:4] # 0 < stop < len < start - expected = df.iloc[:, :0] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, -10:-11:-1] # stop < start < 0 < len (down) - expected = df.iloc[:, :0] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, 10:11] # 0 < len < start < stop - expected = df.iloc[:, :0] - tm.assert_frame_equal(result, expected) - - # slice bounds exceeding is ok - result = s.iloc[18:30] - expected = s.iloc[18:] - tm.assert_series_equal(result, expected) - - result = s.iloc[30:] - expected = s.iloc[:0] - tm.assert_series_equal(result, expected) - - result = s.iloc[30::-1] - expected = s.iloc[::-1] - tm.assert_series_equal(result, expected) - - # doc example - def check(result, expected): - str(result) - result.dtypes - tm.assert_frame_equal(result, expected) - - dfl = DataFrame(np.random.randn(5, 2), columns=list('AB')) - check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) - check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) - check(dfl.iloc[4:6], dfl.iloc[[4]]) - - self.assertRaises(IndexError, lambda: dfl.iloc[[4, 5, 6]]) - self.assertRaises(IndexError, lambda: dfl.iloc[:, 4]) - - def test_iloc_getitem_int(self): - - # integer - self.check_result('integer', 'iloc', 2, 'ix', - {0: 4, 1: 6, 2: 8}, typs=['ints', 'uints']) - self.check_result('integer', 'iloc', 2, 'indexer', 2, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - def test_iloc_getitem_neg_int(self): - - # neg integer - self.check_result('neg int', 'iloc', -1, 'ix', - {0: 6, 1: 9, 2: 12}, typs=['ints', 'uints']) - self.check_result('neg int', 'iloc', -1, 'indexer', -1, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - def test_iloc_getitem_list_int(self): - - # list of ints - self.check_result('list int', 'iloc', [0, 1, 2], 'ix', - {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, - typs=['ints', 'uints']) - self.check_result('list int', 'iloc', [2], 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) - self.check_result('list int', 'iloc', 
[0, 1, 2], 'indexer', [0, 1, 2], - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - # array of ints (GH5006), make sure that a single indexer is returning - # the correct type - self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'ix', - {0: [0, 2, 4], - 1: [0, 3, 6], - 2: [0, 4, 8]}, typs=['ints', 'uints']) - self.check_result('array int', 'iloc', np.array([2]), 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) - self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', - [0, 1, 2], - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - def test_iloc_getitem_neg_int_can_reach_first_index(self): - # GH10547 and GH10779 - # negative integers should be able to reach index 0 - df = DataFrame({'A': [2, 3, 5], 'B': [7, 11, 13]}) - s = df['A'] - - expected = df.iloc[0] - result = df.iloc[-3] - tm.assert_series_equal(result, expected) - - expected = df.iloc[[0]] - result = df.iloc[[-3]] - tm.assert_frame_equal(result, expected) - - expected = s.iloc[0] - result = s.iloc[-3] - self.assertEqual(result, expected) - - expected = s.iloc[[0]] - result = s.iloc[[-3]] - tm.assert_series_equal(result, expected) - - # check the length 1 Series case highlighted in GH10547 - expected = pd.Series(['a'], index=['A']) - result = expected.iloc[[-1]] - tm.assert_series_equal(result, expected) - - def test_iloc_getitem_dups(self): - - # no dups in panel (bug?) - self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', - {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, - objs=['series', 'frame'], typs=['ints', 'uints']) - - # GH 6766 - df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) - df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) - df = concat([df1, df2], axis=1) - - # cross-sectional indexing - result = df.iloc[0, 0] - self.assertTrue(isnull(result)) - - result = df.iloc[0, :] - expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'], - name=0) - tm.assert_series_equal(result, expected) - - def test_iloc_getitem_array(self): - - # array like - s = Series(index=lrange(1, 4)) - self.check_result('array like', 'iloc', s.index, 'ix', - {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, - typs=['ints', 'uints']) - - def test_iloc_getitem_bool(self): - - # boolean indexers - b = [True, False, True, False, ] - self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints', 'uints']) - self.check_result('bool', 'iloc', b, 'ix', b, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - def test_iloc_getitem_slice(self): - - # slices - self.check_result('slice', 'iloc', slice(1, 3), 'ix', - {0: [2, 4], 1: [3, 6], 2: [4, 8]}, - typs=['ints', 'uints']) - self.check_result('slice', 'iloc', slice(1, 3), 'indexer', - slice(1, 3), - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - def test_iloc_getitem_slice_dups(self): - - df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), - columns=['A', 'C']) - - # axis=1 - df = concat([df1, df2], axis=1) - tm.assert_frame_equal(df.iloc[:, :4], df1) - tm.assert_frame_equal(df.iloc[:, 4:], df2) - - df = concat([df2, df1], axis=1) - tm.assert_frame_equal(df.iloc[:, :2], df2) - tm.assert_frame_equal(df.iloc[:, 2:], df1) - - exp = concat([df2, df1.iloc[:, [0]]], axis=1) - tm.assert_frame_equal(df.iloc[:, 0:3], exp) - - # axis=0 - df = concat([df, df], axis=0) - tm.assert_frame_equal(df.iloc[0:10, :2], df2) - tm.assert_frame_equal(df.iloc[0:10, 2:], df1) - 
tm.assert_frame_equal(df.iloc[10:, :2], df2) - tm.assert_frame_equal(df.iloc[10:, 2:], df1) - - def test_iloc_setitem(self): - df = self.frame_ints - - df.iloc[1, 1] = 1 - result = df.iloc[1, 1] - self.assertEqual(result, 1) - - df.iloc[:, 2:3] = 0 - expected = df.iloc[:, 2:3] - result = df.iloc[:, 2:3] - tm.assert_frame_equal(result, expected) - - # GH5771 - s = Series(0, index=[4, 5, 6]) - s.iloc[1:2] += 1 - expected = Series([0, 1, 0], index=[4, 5, 6]) - tm.assert_series_equal(s, expected) - - def test_loc_setitem_slice(self): - # GH10503 - - # assigning the same type should not change the type - df1 = DataFrame({'a': [0, 1, 1], - 'b': Series([100, 200, 300], dtype='uint32')}) - ix = df1['a'] == 1 - newb1 = df1.loc[ix, 'b'] + 1 - df1.loc[ix, 'b'] = newb1 - expected = DataFrame({'a': [0, 1, 1], - 'b': Series([100, 201, 301], dtype='uint32')}) - tm.assert_frame_equal(df1, expected) - - # assigning a new type should get the inferred type - df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') - ix = df1['a'] == 1 - newb2 = df2.loc[ix, 'b'] - df1.loc[ix, 'b'] = newb2 - expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') - tm.assert_frame_equal(df2, expected) - - def test_ix_loc_setitem_consistency(self): - - # GH 5771 - # loc with slice and series - s = Series(0, index=[4, 5, 6]) - s.loc[4:5] += 1 - expected = Series([1, 1, 0], index=[4, 5, 6]) - tm.assert_series_equal(s, expected) - - # GH 5928 - # chained indexing assignment - df = DataFrame({'a': [0, 1, 2]}) - expected = df.copy() - with catch_warnings(record=True): - expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] - - with catch_warnings(record=True): - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] - tm.assert_frame_equal(df, expected) - - df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) - with catch_warnings(record=True): - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype( - 'float64') + 0.5 - expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) - tm.assert_frame_equal(df, expected) - - # GH 8607 - # ix setitem consistency - df = DataFrame({'timestamp': [1413840976, 1413842580, 1413760580], - 'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470]}) - expected = DataFrame({'timestamp': pd.to_datetime( - [1413840976, 1413842580, 1413760580], unit='s'), - 'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470]}) - - df2 = df.copy() - df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - tm.assert_frame_equal(df2, expected) - - df2 = df.copy() - df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - tm.assert_frame_equal(df2, expected) - - df2 = df.copy() - with catch_warnings(record=True): - df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') - tm.assert_frame_equal(df2, expected) - - def test_ix_loc_consistency(self): - - # GH 8613 - # some edge cases where ix/loc should return the same - # this is not an exhaustive case - - def compare(result, expected): - if is_scalar(expected): - self.assertEqual(result, expected) - else: - self.assertTrue(expected.equals(result)) - - # failure cases for .loc, but these work for .ix - df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD')) - for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]), - tuple([slice(0, 2), df.columns[0:2]])]: - - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeDateIndex, tm.makePeriodIndex, - tm.makeTimedeltaIndex]: - df.index = index(len(df.index)) - with catch_warnings(record=True): - df.ix[key] - - self.assertRaises(TypeError, lambda: 
df.loc[key]) - - df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), - index=pd.date_range('2012-01-01', periods=5)) - - for key in ['2012-01-03', - '2012-01-31', - slice('2012-01-03', '2012-01-03'), - slice('2012-01-03', '2012-01-04'), - slice('2012-01-03', '2012-01-06', 2), - slice('2012-01-03', '2012-01-31'), - tuple([[True, True, True, False, True]]), ]: - - # getitem - - # if the expected raises, then compare the exceptions - try: - with catch_warnings(record=True): - expected = df.ix[key] - except KeyError: - self.assertRaises(KeyError, lambda: df.loc[key]) - continue - - result = df.loc[key] - compare(result, expected) - - # setitem - df1 = df.copy() - df2 = df.copy() - - with catch_warnings(record=True): - df1.ix[key] = 10 - df2.loc[key] = 10 - compare(df2, df1) - - # edge cases - s = Series([1, 2, 3, 4], index=list('abde')) - - result1 = s['a':'c'] - with catch_warnings(record=True): - result2 = s.ix['a':'c'] - result3 = s.loc['a':'c'] - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) - - # now work rather than raising KeyError - s = Series(range(5), [-2, -1, 1, 2, 3]) - - with catch_warnings(record=True): - result1 = s.ix[-10:3] - result2 = s.loc[-10:3] - tm.assert_series_equal(result1, result2) - - with catch_warnings(record=True): - result1 = s.ix[0:3] - result2 = s.loc[0:3] - tm.assert_series_equal(result1, result2) - - def test_loc_setitem_dups(self): - - # GH 6541 - df_orig = DataFrame( - {'me': list('rttti'), - 'foo': list('aaade'), - 'bar': np.arange(5, dtype='float64') * 1.34 + 2, - 'bar2': np.arange(5, dtype='float64') * -.34 + 2}).set_index('me') - - indexer = tuple(['r', ['bar', 'bar2']]) - df = df_orig.copy() - df.loc[indexer] *= 2.0 - tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) - - indexer = tuple(['r', 'bar']) - df = df_orig.copy() - df.loc[indexer] *= 2.0 - self.assertEqual(df.loc[indexer], 2.0 * df_orig.loc[indexer]) - - indexer = tuple(['t', ['bar', 'bar2']]) - df = df_orig.copy() - df.loc[indexer] *= 2.0 - tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) - - def test_iloc_setitem_dups(self): - - # GH 6766 - # iloc with a mask aligning from another iloc - df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) - df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) - df = concat([df1, df2], axis=1) - - expected = df.fillna(3) - expected['A'] = expected['A'].astype('float64') - inds = np.isnan(df.iloc[:, 0]) - mask = inds[inds].index - df.iloc[mask, 0] = df.iloc[mask, 2] - tm.assert_frame_equal(df, expected) - - # del a dup column across blocks - expected = DataFrame({0: [1, 2], 1: [3, 4]}) - expected.columns = ['B', 'B'] - del df['A'] - tm.assert_frame_equal(df, expected) - - # assign back to self - df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]] - tm.assert_frame_equal(df, expected) - - # reversed x 2 - df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( - drop=True) - df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( - drop=True) - tm.assert_frame_equal(df, expected) - - def test_chained_getitem_with_lists(self): - - # GH6394 - # Regression in chained getitem indexing with embedded list-like from - # 0.12 - def check(result, expected): - tm.assert_numpy_array_equal(result, expected) - tm.assertIsInstance(result, np.ndarray) - - df = DataFrame({'A': 5 * [np.zeros(3)], 'B': 5 * [np.ones(3)]}) - expected = df['A'].iloc[2] - result = df.loc[2, 'A'] - check(result, expected) - result2 = df.iloc[2]['A'] - check(result2, expected) - result3 = 
df['A'].loc[2] - check(result3, expected) - result4 = df['A'].iloc[2] - check(result4, expected) - - def test_loc_getitem_int(self): - - # int label - self.check_result('int label', 'loc', 2, 'ix', 2, - typs=['ints', 'uints'], axes=0) - self.check_result('int label', 'loc', 3, 'ix', 3, - typs=['ints', 'uints'], axes=1) - self.check_result('int label', 'loc', 4, 'ix', 4, - typs=['ints', 'uints'], axes=2) - self.check_result('int label', 'loc', 2, 'ix', 2, - typs=['label'], fails=KeyError) - - def test_loc_getitem_label(self): - - # label - self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['labels'], - axes=0) - self.check_result('label', 'loc', 'null', 'ix', 'null', typs=['mixed'], - axes=0) - self.check_result('label', 'loc', 8, 'ix', 8, typs=['mixed'], axes=0) - self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, - typs=['ts'], axes=0) - self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['empty'], - fails=KeyError) - - def test_loc_getitem_label_out_of_range(self): - - # out of range label - self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['ints', 'uints', 'labels', 'mixed', 'ts'], - fails=KeyError) - self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['floats'], fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['ints', 'uints', 'mixed'], fails=KeyError) - self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['labels'], fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'], - axes=0, fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, typs=['floats'], - axes=0, fails=TypeError) - - def test_loc_getitem_label_list(self): - - # list of labels - self.check_result('list lbl', 'loc', [0, 2, 4], 'ix', [0, 2, 4], - typs=['ints', 'uints'], axes=0) - self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9], - typs=['ints', 'uints'], axes=1) - self.check_result('list lbl', 'loc', [4, 8, 12], 'ix', [4, 8, 12], - typs=['ints', 'uints'], axes=2) - self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix', - ['a', 'b', 'd'], typs=['labels'], axes=0) - self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix', - ['A', 'B', 'C'], typs=['labels'], axes=1) - self.check_result('list lbl', 'loc', ['Z', 'Y', 'W'], 'ix', - ['Z', 'Y', 'W'], typs=['labels'], axes=2) - self.check_result('list lbl', 'loc', [2, 8, 'null'], 'ix', - [2, 8, 'null'], typs=['mixed'], axes=0) - self.check_result('list lbl', 'loc', - [Timestamp('20130102'), Timestamp('20130103')], 'ix', - [Timestamp('20130102'), Timestamp('20130103')], - typs=['ts'], axes=0) - - self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2], - typs=['empty'], fails=KeyError) - self.check_result('list lbl', 'loc', [0, 2, 3], 'ix', [0, 2, 3], - typs=['ints', 'uints'], axes=0, fails=KeyError) - self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7], - typs=['ints', 'uints'], axes=1, fails=KeyError) - self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], - typs=['ints', 'uints'], axes=2, fails=KeyError) - - def test_loc_getitem_label_list_fails(self): - # fails - self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints', 'uints'], axes=1, fails=KeyError) - self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints', 'uints'], axes=2, fails=KeyError) - - def test_loc_getitem_label_array_like(self): - # array like - self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, - 'ix', [0, 2, 4], typs=['ints', 'uints'], 
axes=0) - self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index, - 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) - self.check_result('array like', 'loc', Series(index=[4, 8, 12]).index, - 'ix', [4, 8, 12], typs=['ints', 'uints'], axes=2) - - def test_loc_getitem_bool(self): - # boolean indexers - b = [True, False, True, False] - self.check_result('bool', 'loc', b, 'ix', b, - typs=['ints', 'uints', 'labels', - 'mixed', 'ts', 'floats']) - self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'], - fails=KeyError) - - def test_loc_getitem_int_slice(self): - - # ok - self.check_result('int slice2', 'loc', slice(2, 4), 'ix', [2, 4], - typs=['ints', 'uints'], axes=0) - self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6], - typs=['ints', 'uints'], axes=1) - self.check_result('int slice2', 'loc', slice(4, 8), 'ix', [4, 8], - typs=['ints', 'uints'], axes=2) - - # GH 3053 - # loc should treat integer slices like label slices - from itertools import product - - index = MultiIndex.from_tuples([t for t in product( - [6, 7, 8], ['a', 'b'])]) - df = DataFrame(np.random.randn(6, 6), index, index) - result = df.loc[6:8, :] - with catch_warnings(record=True): - expected = df.ix[6:8, :] - tm.assert_frame_equal(result, expected) - - index = MultiIndex.from_tuples([t - for t in product( - [10, 20, 30], ['a', 'b'])]) - df = DataFrame(np.random.randn(6, 6), index, index) - result = df.loc[20:30, :] - with catch_warnings(record=True): - expected = df.ix[20:30, :] - tm.assert_frame_equal(result, expected) - - # doc examples - result = df.loc[10, :] - with catch_warnings(record=True): - expected = df.ix[10, :] - tm.assert_frame_equal(result, expected) - - result = df.loc[:, 10] - # expected = df.ix[:,10] (this fails) - expected = df[10] - tm.assert_frame_equal(result, expected) - - def test_loc_to_fail(self): - - # GH3449 - df = DataFrame(np.random.random((3, 3)), - index=['a', 'b', 'c'], - columns=['e', 'f', 'g']) - - # raise a KeyError? 
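# --- editorial sketch (illustrative only, not part of the patch): the
# assertions below pin down that .loc is strictly label-based and raises
# KeyError when requested labels are missing, with no positional fallback
# (GH 3449 / GH 7496 / GH 7999). A minimal standalone reproduction;
# the name df_sketch is ours, not the test's:
import pandas as pd

df_sketch = pd.DataFrame([['a'], ['b']], index=[1, 2], columns=['value'])
df_sketch.loc[[1, 2]]        # fine: both labels exist
try:
    df_sketch.loc[[3]]       # label 3 is absent -> KeyError, no fallback
except KeyError:
    pass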
- self.assertRaises(KeyError, df.loc.__getitem__, - tuple([[1, 2], [1, 2]])) - - # GH 7496 - # loc should not fallback - - s = Series() - s.loc[1] = 1 - s.loc['a'] = 2 - - self.assertRaises(KeyError, lambda: s.loc[-1]) - self.assertRaises(KeyError, lambda: s.loc[[-1, -2]]) - - self.assertRaises(KeyError, lambda: s.loc[['4']]) - - s.loc[-1] = 3 - result = s.loc[[-1, -2]] - expected = Series([3, np.nan], index=[-1, -2]) - tm.assert_series_equal(result, expected) - - s['a'] = 2 - self.assertRaises(KeyError, lambda: s.loc[[-2]]) - - del s['a'] - - def f(): - s.loc[[-2]] = 0 - - self.assertRaises(KeyError, f) - - # inconsistency between .loc[values] and .loc[values,:] - # GH 7999 - df = DataFrame([['a'], ['b']], index=[1, 2], columns=['value']) - - def f(): - df.loc[[3], :] - - self.assertRaises(KeyError, f) - - def f(): - df.loc[[3]] - - self.assertRaises(KeyError, f) - - def test_at_to_fail(self): - # at should not fallback - # GH 7814 - s = Series([1, 2, 3], index=list('abc')) - result = s.at['a'] - self.assertEqual(result, 1) - self.assertRaises(ValueError, lambda: s.at[0]) - - df = DataFrame({'A': [1, 2, 3]}, index=list('abc')) - result = df.at['a', 'A'] - self.assertEqual(result, 1) - self.assertRaises(ValueError, lambda: df.at['a', 0]) - - s = Series([1, 2, 3], index=[3, 2, 1]) - result = s.at[1] - self.assertEqual(result, 3) - self.assertRaises(ValueError, lambda: s.at['a']) - - df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1]) - result = df.at[1, 0] - self.assertEqual(result, 3) - self.assertRaises(ValueError, lambda: df.at['a', 0]) - - # GH 13822, incorrect error string with non-unique columns when missing - # column is accessed - df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]}) - df.columns = ['x', 'x', 'z'] - - # Check that we get the correct value in the KeyError - self.assertRaisesRegexp(KeyError, r"\['y'\] not in index", - lambda: df[['x', 'y', 'z']]) - - def test_loc_getitem_label_slice(self): - - # label slices (with ints) - self.check_result('lab slice', 'loc', slice(1, 3), - 'ix', slice(1, 3), - typs=['labels', 'mixed', 'empty', 'ts', 'floats'], - fails=TypeError) - - # real label slices - self.check_result('lab slice', 'loc', slice('a', 'c'), - 'ix', slice('a', 'c'), typs=['labels'], axes=0) - self.check_result('lab slice', 'loc', slice('A', 'C'), - 'ix', slice('A', 'C'), typs=['labels'], axes=1) - self.check_result('lab slice', 'loc', slice('W', 'Z'), - 'ix', slice('W', 'Z'), typs=['labels'], axes=2) - - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=0) - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=1, fails=TypeError) - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=2, fails=TypeError) - - # GH 14316 - self.check_result('ts slice rev', 'loc', slice('20130104', '20130102'), - 'indexer', [0, 1, 2], typs=['ts_rev'], axes=0) - - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=0, fails=TypeError) - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=1, fails=KeyError) - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=2, fails=KeyError) - - self.check_result('mixed slice', 'loc', slice(2, 4, 2), 'ix', slice( - 2, 4, 2), typs=['mixed'], axes=0, fails=TypeError) - - def test_loc_general(self): - - df = 
DataFrame( - np.random.rand(4, 4), columns=['A', 'B', 'C', 'D'], - index=['A', 'B', 'C', 'D']) - - # want this to work - result = df.loc[:, "A":"B"].iloc[0:2, :] - self.assertTrue((result.columns == ['A', 'B']).all()) - self.assertTrue((result.index == ['A', 'B']).all()) - - # mixed type - result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0] - expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, object) - - def test_loc_setitem_consistency(self): - # GH 6149 - # coerce similary for setitem and loc when rows have a null-slice - expected = DataFrame({'date': Series(0, index=range(5), - dtype=np.int64), - 'val': Series(range(5), dtype=np.int64)}) - - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 0 - tm.assert_frame_equal(df, expected) - - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = np.array(0, dtype=np.int64) - tm.assert_frame_equal(df, expected) - - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64) - tm.assert_frame_equal(df, expected) - - expected = DataFrame({'date': Series('foo', index=range(5)), - 'val': Series(range(5), dtype=np.int64)}) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 'foo' - tm.assert_frame_equal(df, expected) - - expected = DataFrame({'date': Series(1.0, index=range(5)), - 'val': Series(range(5), dtype=np.int64)}) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 1.0 - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_consistency_empty(self): - # empty (essentially noops) - expected = DataFrame(columns=['x', 'y']) - expected['x'] = expected['x'].astype(np.int64) - df = DataFrame(columns=['x', 'y']) - df.loc[:, 'x'] = 1 - tm.assert_frame_equal(df, expected) - - df = DataFrame(columns=['x', 'y']) - df['x'] = 1 - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_consistency_slice_column_len(self): - # .loc[:,column] setting with slice == len of the column - # GH10408 - data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat -Level_1,,,Something,StartDate,EndDate,Yes/No,SomethingElse -Region,Site,RespondentID,,,,, -Region_1,Site_1,3987227376,A,5/25/2015 10:59,5/25/2015 11:22,Yes, -Region_1,Site_1,3980680971,A,5/21/2015 9:40,5/21/2015 9:52,Yes,Yes -Region_1,Site_2,3977723249,A,5/20/2015 8:27,5/20/2015 8:41,Yes, -Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No""" - - df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2]) - df.loc[:, ('Respondent', 'StartDate')] = pd.to_datetime(df.loc[:, ( - 'Respondent', 'StartDate')]) - df.loc[:, ('Respondent', 'EndDate')] = pd.to_datetime(df.loc[:, ( - 'Respondent', 'EndDate')]) - df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( - 'Respondent', 'EndDate')] - df.loc[:, ('Respondent', 'StartDate')] - - df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( - 'Respondent', 'Duration')].astype('timedelta64[s]') - expected = Series([1380, 720, 840, 2160.], index=df.index, - name=('Respondent', 'Duration')) - tm.assert_series_equal(df[('Respondent', 'Duration')], expected) - - def test_loc_setitem_frame(self): - df = 
self.frame_labels - - result = df.iloc[0, 0] - - df.loc['a', 'A'] = 1 - result = df.loc['a', 'A'] - self.assertEqual(result, 1) - - result = df.iloc[0, 0] - self.assertEqual(result, 1) - - df.loc[:, 'B':'D'] = 0 - expected = df.loc[:, 'B':'D'] - with catch_warnings(record=True): - result = df.ix[:, 1:] - tm.assert_frame_equal(result, expected) - - # GH 6254 - # setting issue - df = DataFrame(index=[3, 5, 4], columns=['A']) - df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64') - expected = DataFrame(dict(A=Series( - [1, 2, 3], index=[4, 3, 5]))).reindex(index=[3, 5, 4]) - tm.assert_frame_equal(df, expected) - - # GH 6252 - # setting with an empty frame - keys1 = ['@' + str(i) for i in range(5)] - val1 = np.arange(5, dtype='int64') - - keys2 = ['@' + str(i) for i in range(4)] - val2 = np.arange(4, dtype='int64') - - index = list(set(keys1).union(keys2)) - df = DataFrame(index=index) - df['A'] = nan - df.loc[keys1, 'A'] = val1 - - df['B'] = nan - df.loc[keys2, 'B'] = val2 - - expected = DataFrame(dict(A=Series(val1, index=keys1), B=Series( - val2, index=keys2))).reindex(index=index) - tm.assert_frame_equal(df, expected) - - # GH 8669 - # invalid coercion of nan -> int - df = DataFrame({'A': [1, 2, 3], 'B': np.nan}) - df.loc[df.B > df.A, 'B'] = df.A - expected = DataFrame({'A': [1, 2, 3], 'B': np.nan}) - tm.assert_frame_equal(df, expected) - - # GH 6546 - # setting with mixed labels - df = DataFrame({1: [1, 2], 2: [3, 4], 'a': ['a', 'b']}) - - result = df.loc[0, [1, 2]] - expected = Series([1, 3], index=[1, 2], dtype=object, name=0) - tm.assert_series_equal(result, expected) - - expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']}) - df.loc[0, [1, 2]] = [5, 6] - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_frame_multiples(self): - # multiple setting - df = DataFrame({'A': ['foo', 'bar', 'baz'], - 'B': Series( - range(3), dtype=np.int64)}) - rhs = df.loc[1:2] - rhs.index = df.index[0:2] - df.loc[0:1] = rhs - expected = DataFrame({'A': ['bar', 'baz', 'baz'], - 'B': Series( - [1, 2, 2], dtype=np.int64)}) - tm.assert_frame_equal(df, expected) - - # multiple setting with frame on rhs (with M8) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) - expected = DataFrame({'date': [Timestamp('20000101'), Timestamp( - '20000102'), Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')], - 'val': Series( - [0, 1, 0, 1, 2], dtype=np.int64)}) - rhs = df.loc[0:2] - rhs.index = df.index[2:5] - df.loc[2:4] = rhs - tm.assert_frame_equal(df, expected) - - def test_iloc_getitem_frame(self): - df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), - columns=lrange(0, 8, 2)) - - result = df.iloc[2] - with catch_warnings(record=True): - exp = df.ix[4] - tm.assert_series_equal(result, exp) - - result = df.iloc[2, 2] - with catch_warnings(record=True): - exp = df.ix[4, 4] - self.assertEqual(result, exp) - - # slice - result = df.iloc[4:8] - with catch_warnings(record=True): - expected = df.ix[8:14] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:, 2:3] - with catch_warnings(record=True): - expected = df.ix[:, 4:5] - tm.assert_frame_equal(result, expected) - - # list of integers - result = df.iloc[[0, 1, 3]] - with catch_warnings(record=True): - expected = df.ix[[0, 2, 6]] - tm.assert_frame_equal(result, expected) - - result = df.iloc[[0, 1, 3], [0, 1]] - with catch_warnings(record=True): - expected = df.ix[[0, 2, 6], [0, 2]] - tm.assert_frame_equal(result, expected) - - # neg indicies - 
result = df.iloc[[-1, 1, 3], [-1, 1]] - with catch_warnings(record=True): - expected = df.ix[[18, 2, 6], [6, 2]] - tm.assert_frame_equal(result, expected) - - # dups indicies - result = df.iloc[[-1, -1, 1, 3], [-1, 1]] - with catch_warnings(record=True): - expected = df.ix[[18, 18, 2, 6], [6, 2]] - tm.assert_frame_equal(result, expected) - - # with index-like - s = Series(index=lrange(1, 5)) - result = df.iloc[s.index] - with catch_warnings(record=True): - expected = df.ix[[2, 4, 6, 8]] - tm.assert_frame_equal(result, expected) - - def test_iloc_getitem_labelled_frame(self): - # try with labelled frame - df = DataFrame(np.random.randn(10, 4), - index=list('abcdefghij'), columns=list('ABCD')) - - result = df.iloc[1, 1] - exp = df.loc['b', 'B'] - self.assertEqual(result, exp) - - result = df.iloc[:, 2:3] - expected = df.loc[:, ['C']] - tm.assert_frame_equal(result, expected) - - # negative indexing - result = df.iloc[-1, -1] - exp = df.loc['j', 'D'] - self.assertEqual(result, exp) - - # out-of-bounds exception - self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10, 5])) - - # trying to use a label - self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j', 'D'])) - - def test_iloc_getitem_doc_issue(self): - - # multi axis slicing issue with single block - # surfaced in GH 6059 - - arr = np.random.randn(6, 4) - index = date_range('20130101', periods=6) - columns = list('ABCD') - df = DataFrame(arr, index=index, columns=columns) - - # defines ref_locs - df.describe() - - result = df.iloc[3:5, 0:2] - str(result) - result.dtypes - - expected = DataFrame(arr[3:5, 0:2], index=index[3:5], - columns=columns[0:2]) - tm.assert_frame_equal(result, expected) - - # for dups - df.columns = list('aaaa') - result = df.iloc[3:5, 0:2] - str(result) - result.dtypes - - expected = DataFrame(arr[3:5, 0:2], index=index[3:5], - columns=list('aa')) - tm.assert_frame_equal(result, expected) - - # related - arr = np.random.randn(6, 4) - index = list(range(0, 12, 2)) - columns = list(range(0, 8, 2)) - df = DataFrame(arr, index=index, columns=columns) - - df._data.blocks[0].mgr_locs - result = df.iloc[1:5, 2:4] - str(result) - result.dtypes - expected = DataFrame(arr[1:5, 2:4], index=index[1:5], - columns=columns[2:4]) - tm.assert_frame_equal(result, expected) +class TestFancy(Base, tm.TestCase): + """ pure get/set item & fancy indexing """ def test_setitem_ndarray_1d(self): # GH5508 - # len of indexer vs length of the 1d ndarray - df = DataFrame(index=Index(lrange(1, 11))) - df['foo'] = np.zeros(10, dtype=np.float64) - df['bar'] = np.zeros(10, dtype=np.complex) - - # invalid - def f(): - with catch_warnings(record=True): - df.ix[2:5, 'bar'] = np.array([2.33j, 1.23 + 0.1j, 2.2]) - - self.assertRaises(ValueError, f) - - def f(): - df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j, - 2.2, 1.0]) - - self.assertRaises(ValueError, f) - - # valid - df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j, - 2.2, 1.0]) - - result = df.loc[df.index[2:6], 'bar'] - expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], - name='bar') - tm.assert_series_equal(result, expected) - - # dtype getting changed? 
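# --- editorial sketch (illustrative only, not part of the patch): the
# GH 5508 cases above boil down to "the 1d ndarray being assigned must
# match the length of the selection". A standalone reproduction; the
# name df_sketch is ours, not the test's:
import numpy as np
import pandas as pd

df_sketch = pd.DataFrame({'bar': np.zeros(10, dtype=complex)},
                         index=range(1, 11))
try:
    # four rows selected, only three values supplied -> ValueError
    df_sketch.loc[df_sketch.index[2:6], 'bar'] = np.array([2.33j, 1.23, 2.2])
except ValueError:
    pass
# matching lengths succeed
df_sketch.loc[df_sketch.index[2:6], 'bar'] = np.array([2.33j, 1.23, 2.2, 1.0])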
- df = DataFrame(index=Index(lrange(1, 11))) - df['foo'] = np.zeros(10, dtype=np.float64) - df['bar'] = np.zeros(10, dtype=np.complex) - - def f(): - df[2:5] = np.arange(1, 4) * 1j - - self.assertRaises(ValueError, f) - - def test_iloc_setitem_series(self): - df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), - columns=list('ABCD')) - - df.iloc[1, 1] = 1 - result = df.iloc[1, 1] - self.assertEqual(result, 1) - - df.iloc[:, 2:3] = 0 - expected = df.iloc[:, 2:3] - result = df.iloc[:, 2:3] - tm.assert_frame_equal(result, expected) - - s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - - s.iloc[1] = 1 - result = s.iloc[1] - self.assertEqual(result, 1) - - s.iloc[:4] = 0 - expected = s.iloc[:4] - result = s.iloc[:4] - tm.assert_series_equal(result, expected) - - s = Series([-1] * 6) - s.iloc[0::2] = [0, 2, 4] - s.iloc[1::2] = [1, 3, 5] - result = s - expected = Series([0, 1, 2, 3, 4, 5]) - tm.assert_series_equal(result, expected) - - def test_iloc_setitem_list_of_lists(self): - - # GH 7551 - # list-of-list is set incorrectly in mixed vs. single dtyped frames - df = DataFrame(dict(A=np.arange(5, dtype='int64'), - B=np.arange(5, 10, dtype='int64'))) - df.iloc[2:4] = [[10, 11], [12, 13]] - expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) - tm.assert_frame_equal(df, expected) - - df = DataFrame( - dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) - df.iloc[2:4] = [['x', 11], ['y', 13]] - expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], - B=[5, 6, 11, 13, 9])) - tm.assert_frame_equal(df, expected) - - def test_ix_general(self): - - # ix general issues - - # GH 2817 - data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, - 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, - 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} - df = DataFrame(data).set_index(keys=['col', 'year']) - key = 4.0, 2012 - - # emits a PerformanceWarning, ok - with self.assert_produces_warning(PerformanceWarning): - tm.assert_frame_equal(df.loc[key], df.iloc[2:]) - - # this is ok - df.sort_index(inplace=True) - res = df.loc[key] - - # col has float dtype, result should be Float64Index - index = MultiIndex.from_arrays([[4.] 
* 3, [2012] * 3], - names=['col', 'year']) - expected = DataFrame({'amount': [222, 333, 444]}, index=index) - tm.assert_frame_equal(res, expected) - - def test_ix_weird_slicing(self): - # http://stackoverflow.com/q/17056560/1240268 - df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], - 'two': [1, 2, 3, 4, 5]}) - df.loc[df['one'] > 1, 'two'] = -df['two'] - - expected = DataFrame({'one': {0: 1.0, - 1: 2.0, - 2: 3.0, - 3: nan, - 4: nan}, - 'two': {0: 1, - 1: -2, - 2: -3, - 3: 4, - 4: 5}}) - tm.assert_frame_equal(df, expected) - - def test_loc_coerceion(self): - - # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) - expected = df.dtypes - - result = df.iloc[[0]] - tm.assert_series_equal(result.dtypes, expected) + # len of indexer vs length of the 1d ndarray + df = DataFrame(index=Index(lrange(1, 11))) + df['foo'] = np.zeros(10, dtype=np.float64) + df['bar'] = np.zeros(10, dtype=np.complex) - result = df.iloc[[1]] - tm.assert_series_equal(result.dtypes, expected) + # invalid + def f(): + df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j, + 2.2, 1.0]) - # 12045 - import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) - expected = df.dtypes + self.assertRaises(ValueError, f) - result = df.iloc[[0]] - tm.assert_series_equal(result.dtypes, expected) + # valid + df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j, + 2.2, 1.0]) - result = df.iloc[[1]] - tm.assert_series_equal(result.dtypes, expected) + result = df.loc[df.index[2:6], 'bar'] + expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], + name='bar') + tm.assert_series_equal(result, expected) - # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) - expected = df.dtypes + # dtype getting changed? 
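# --- editorial sketch (illustrative only, not part of the patch): the
# GH 12411 expectation quoted above (test_loc_coerceion) is that .iloc
# row selection preserves column dtypes, even for a tz-aware column
# holding NaT. A standalone check; df_sketch is our own name:
import pandas as pd

df_sketch = pd.DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
                                   pd.NaT]})
assert (df_sketch.iloc[[0]].dtypes == df_sketch.dtypes).all()
assert (df_sketch.iloc[[1]].dtypes == df_sketch.dtypes).all()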
+ df = DataFrame(index=Index(lrange(1, 11))) + df['foo'] = np.zeros(10, dtype=np.float64) + df['bar'] = np.zeros(10, dtype=np.complex) - result = df.iloc[0:2] - tm.assert_series_equal(result.dtypes, expected) + def f(): + df[2:5] = np.arange(1, 4) * 1j - result = df.iloc[3:] - tm.assert_series_equal(result.dtypes, expected) + self.assertRaises(ValueError, f) def test_setitem_dtype_upcast(self): @@ -1683,19 +99,6 @@ def test_setitem_dtype_upcast(self): self.assertTrue(is_float_dtype(left['foo'])) self.assertTrue(is_float_dtype(left['baz'])) - def test_setitem_iloc(self): - - # setitem with an iloc list - df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], - columns=["A", "B", "C"]) - df.iloc[[0, 1], [1, 2]] - df.iloc[[0, 1], [1, 2]] += 100 - - expected = DataFrame( - np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), - index=["A", "B", "C"], columns=["A", "B", "C"]) - tm.assert_frame_equal(df, expected) - def test_dups_fancy_indexing(self): # GH 3455 @@ -1757,23 +160,24 @@ def test_dups_fancy_indexing(self): # inconsistent returns for unique/duplicate indices when values are # missing - df = DataFrame(randn(4, 3), index=list('ABCD')) - expected = df.ix[['E']] + df = DataFrame(np.random.randn(4, 3), index=list('ABCD')) + expected = df.reindex(['E']) - dfnu = DataFrame(randn(5, 3), index=list('AABCD')) - result = dfnu.ix[['E']] + dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) + with catch_warnings(record=True): + result = dfnu.ix[['E']] tm.assert_frame_equal(result, expected) # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) - result = df.ix[[0, 8, 0]] + result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) - result = df.ix[[0, 8, 0]] + result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) @@ -1781,7 +185,7 @@ def test_dups_fancy_indexing(self): df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) - result = df.ix[['A', 'A', 'E']] + result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected) # GH 5835 @@ -1790,9 +194,9 @@ def test_dups_fancy_indexing(self): np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A']) expected = pd.concat( - [df.ix[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], - index=df.index)], axis=1) - result = df.ix[:, ['A', 'B', 'C']] + [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], + index=df.index)], axis=1) + result = df.loc[:, ['A', 'B', 'C']] tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing @@ -1822,8 +226,8 @@ def test_indexing_mixed_frame_bug(self): # this does not work, ie column test is not changed idx = df['test'] == '_' - temp = df.ix[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x) - df.ix[idx, 'test'] = temp + temp = df.loc[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x) + df.loc[idx, 'test'] = temp self.assertEqual(df.iloc[0, 2], '-----') # if I look at df, then element [0,2] equals '_'. 
If instead I type @@ -1859,17 +263,17 @@ def test_set_index_nan(self): 'QC': {17: 0.0, 18: 0.0, 19: 0.0, - 20: nan, - 21: nan, - 22: nan, - 23: nan, + 20: np.nan, + 21: np.nan, + 22: np.nan, + 23: np.nan, 24: 1.0, - 25: nan, - 26: nan, - 27: nan, - 28: nan, - 29: nan, - 30: nan}, + 25: np.nan, + 26: np.nan, + 27: np.nan, + 28: np.nan, + 29: np.nan, + 30: np.nan}, 'data': {17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, @@ -1925,14 +329,14 @@ def test_multi_assign(self): 'PF': [0, 0, 0, 0, 1, 1], 'col1': lrange(6), 'col2': lrange(6, 12)}) - df.ix[1, 0] = np.nan + df.iloc[1, 0] = np.nan df2 = df.copy() mask = ~df2.FC.isnull() cols = ['col1', 'col2'] dft = df2 * 2 - dft.ix[3, 3] = np.nan + dft.iloc[3, 3] = np.nan expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], 'PF': [0, 0, 0, 0, 1, 1], @@ -1940,17 +344,17 @@ def test_multi_assign(self): 'col2': [12, 7, 16, np.nan, 20, 22]}) # frame on rhs - df2.ix[mask, cols] = dft.ix[mask, cols] + df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) - df2.ix[mask, cols] = dft.ix[mask, cols] + df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) # with an ndarray on rhs df2 = df.copy() - df2.ix[mask, cols] = dft.ix[mask, cols].values + df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) - df2.ix[mask, cols] = dft.ix[mask, cols].values + df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required @@ -1965,79 +369,18 @@ def test_multi_assign(self): df.loc[df['A'] == 0, ['A', 'B']] = df['D'] tm.assert_frame_equal(df, expected) - def test_ix_assign_column_mixed(self): - # GH #1142 - df = DataFrame(tm.getSeriesData()) - df['foo'] = 'bar' - - orig = df.ix[:, 'B'].copy() - df.ix[:, 'B'] = df.ix[:, 'B'] + 1 - tm.assert_series_equal(df.B, orig + 1) - - # GH 3668, mixed frame with series value - df = DataFrame({'x': lrange(10), 'y': lrange(10, 20), 'z': 'bar'}) - expected = df.copy() - - for i in range(5): - indexer = i * 2 - v = 1000 + i * 200 - expected.ix[indexer, 'y'] = v - self.assertEqual(expected.ix[indexer, 'y'], v) - - df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 - tm.assert_frame_equal(df, expected) - - # GH 4508, making sure consistency of assignments - df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]}) - df.ix[[0, 2, ], 'b'] = [100, -100] - expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]}) - tm.assert_frame_equal(df, expected) - - df = pd.DataFrame({'a': lrange(4)}) - df['b'] = np.nan - df.ix[[1, 3], 'b'] = [100, -100] - expected = DataFrame({'a': [0, 1, 2, 3], - 'b': [np.nan, 100, np.nan, -100]}) - tm.assert_frame_equal(df, expected) - - # ok, but chained assignments are dangerous - # if we turn off chained assignement it will work - with option_context('chained_assignment', None): - df = pd.DataFrame({'a': lrange(4)}) - df['b'] = np.nan - df['b'].ix[[1, 3]] = [100, -100] - tm.assert_frame_equal(df, expected) - - def test_ix_get_set_consistency(self): - - # GH 4544 - # ix/loc get/set not consistent when - # a mixed int/string index - df = DataFrame(np.arange(16).reshape((4, 4)), - columns=['a', 'b', 8, 'c'], - index=['e', 7, 'f', 'g']) - - self.assertEqual(df.ix['e', 8], 2) - self.assertEqual(df.loc['e', 8], 2) - - df.ix['e', 8] = 42 - self.assertEqual(df.ix['e', 8], 42) - self.assertEqual(df.loc['e', 8], 42) - - df.loc['e', 8] = 45 - self.assertEqual(df.ix['e', 8], 45) - self.assertEqual(df.loc['e', 8], 45) - def test_setitem_list(self): # GH 6043 
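# --- editorial sketch (illustrative only, not part of the patch): per
# GH 15114 (see test_ix_deprecation in the new test_ix.py further below),
# .ix now emits a DeprecationWarning, which is why the edits in this hunk
# wrap every remaining .ix access in catch_warnings(record=True). The
# pattern, standalone; df_sketch is our own name:
from warnings import catch_warnings
import pandas as pd

df_sketch = pd.DataFrame({'A': [1, 2, 3]})
with catch_warnings(record=True):
    df_sketch.ix[1, 'A']     # deprecated spelling; warning is recorded, not shown
df_sketch.loc[1, 'A']        # label-based replacement
df_sketch.iloc[1, 0]         # position-based replacement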
# ix with a list df = DataFrame(index=[0, 1], columns=[0]) - df.ix[1, 0] = [1, 2, 3] - df.ix[1, 0] = [1, 2] + with catch_warnings(record=True): + df.ix[1, 0] = [1, 2, 3] + df.ix[1, 0] = [1, 2] result = DataFrame(index=[0, 1], columns=[0]) - result.ix[1, 0] = [1, 2] + with catch_warnings(record=True): + result.ix[1, 0] = [1, 2] tm.assert_frame_equal(result, df) @@ -2059,187 +402,25 @@ def view(self): return self df = DataFrame(index=[0, 1], columns=[0]) - df.ix[1, 0] = TO(1) - df.ix[1, 0] = TO(2) + with catch_warnings(record=True): + df.ix[1, 0] = TO(1) + df.ix[1, 0] = TO(2) result = DataFrame(index=[0, 1], columns=[0]) - result.ix[1, 0] = TO(2) + with catch_warnings(record=True): + result.ix[1, 0] = TO(2) tm.assert_frame_equal(result, df) # remains object dtype even after setting it back df = DataFrame(index=[0, 1], columns=[0]) - df.ix[1, 0] = TO(1) - df.ix[1, 0] = np.nan + with catch_warnings(record=True): + df.ix[1, 0] = TO(1) + df.ix[1, 0] = np.nan result = DataFrame(index=[0, 1], columns=[0]) tm.assert_frame_equal(result, df) - def test_iloc_mask(self): - - # GH 3631, iloc with a mask (of a series) should raise - df = DataFrame(lrange(5), list('ABCDE'), columns=['a']) - mask = (df.a % 2 == 0) - self.assertRaises(ValueError, df.iloc.__getitem__, tuple([mask])) - mask.index = lrange(len(mask)) - self.assertRaises(NotImplementedError, df.iloc.__getitem__, - tuple([mask])) - - # ndarray ok - result = df.iloc[np.array([True] * len(mask), dtype=bool)] - tm.assert_frame_equal(result, df) - - # the possibilities - locs = np.arange(4) - nums = 2 ** locs - reps = lmap(bin, nums) - df = DataFrame({'locs': locs, 'nums': nums}, reps) - - expected = { - (None, ''): '0b1100', - (None, '.loc'): '0b1100', - (None, '.iloc'): '0b1100', - ('index', ''): '0b11', - ('index', '.loc'): '0b11', - ('index', '.iloc'): ('iLocation based boolean indexing ' - 'cannot use an indexable as a mask'), - ('locs', ''): 'Unalignable boolean Series provided as indexer ' - '(index of the boolean Series and of the indexed ' - 'object do not match', - ('locs', '.loc'): 'Unalignable boolean Series provided as indexer ' - '(index of the boolean Series and of the ' - 'indexed object do not match', - ('locs', '.iloc'): ('iLocation based boolean indexing on an ' - 'integer type is not available'), - } - - # UserWarnings from reindex of a boolean mask - with warnings.catch_warnings(record=True): - result = dict() - for idx in [None, 'index', 'locs']: - mask = (df.nums > 2).values - if idx: - mask = Series(mask, list(reversed(getattr(df, idx)))) - for method in ['', '.loc', '.iloc']: - try: - if method: - accessor = getattr(df, method[1:]) - else: - accessor = df - ans = str(bin(accessor[mask]['nums'].sum())) - except Exception as e: - ans = str(e) - - key = tuple([idx, method]) - r = expected.get(key) - if r != ans: - raise AssertionError( - "[%s] does not match [%s], received [%s]" - % (key, ans, r)) - - def test_ix_slicing_strings(self): - # GH3836 - data = {'Classification': - ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'], - 'Random': [1, 2, 3, 4, 5], - 'X': ['correct', 'wrong', 'correct', 'correct', 'wrong']} - df = DataFrame(data) - x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY', 'SA SSF' - ])] - df.ix[x.index, 'X'] = df['Classification'] - - expected = DataFrame({'Classification': {0: 'SA EQUITY CFD', - 1: 'bbb', - 2: 'SA EQUITY', - 3: 'SA SSF', - 4: 'aaa'}, - 'Random': {0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5}, - 'X': {0: 'correct', - 1: 'bbb', - 2: 'correct', - 3: 'correct', - 4: 'aaa'}}) # bug was 4: 
'bbb' - - tm.assert_frame_equal(df, expected) - - def test_non_unique_loc(self): - # GH3659 - # non-unique indexer with loc slice - # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs - - # these are going to raise becuase the we are non monotonic - df = DataFrame({'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) - self.assertRaises(KeyError, df.loc.__getitem__, - tuple([slice(1, None)])) - self.assertRaises(KeyError, df.loc.__getitem__, - tuple([slice(0, None)])) - self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, 2)])) - - # monotonic are ok - df = DataFrame({'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, - index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0) - result = df.loc[1:] - expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]}, - index=[1, 1, 2, 3]) - tm.assert_frame_equal(result, expected) - - result = df.loc[0:] - tm.assert_frame_equal(result, df) - - result = df.loc[1:2] - expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, - index=[1, 1, 2]) - tm.assert_frame_equal(result, expected) - - def test_loc_name(self): - # GH 3880 - df = DataFrame([[1, 1], [1, 1]]) - df.index.name = 'index_name' - result = df.iloc[[0, 1]].index.name - self.assertEqual(result, 'index_name') - - result = df.ix[[0, 1]].index.name - self.assertEqual(result, 'index_name') - - result = df.loc[[0, 1]].index.name - self.assertEqual(result, 'index_name') - - def test_iloc_non_unique_indexing(self): - - # GH 4017, non-unique indexing (on the axis) - df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000}) - idx = np.array(lrange(30)) * 99 - expected = df.iloc[idx] - - df3 = pd.concat([df, 2 * df, 3 * df]) - result = df3.iloc[idx] - - tm.assert_frame_equal(result, expected) - - df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) - df2 = pd.concat([df2, 2 * df2, 3 * df2]) - - sidx = df2.index.to_series() - expected = df2.iloc[idx[idx <= sidx.max()]] - - new_list = [] - for r, s in expected.iterrows(): - new_list.append(s) - new_list.append(s * 2) - new_list.append(s * 3) - - expected = DataFrame(new_list) - expected = pd.concat([expected, DataFrame(index=idx[idx > sidx.max()]) - ]) - result = df2.loc[idx] - tm.assert_frame_equal(result, expected, check_index_type=False) - def test_string_slice(self): # GH 14424 # string indexing against datetimelike with object @@ -2300,43 +481,6 @@ def test_mi_access(self): result = df2['A']['B2'] tm.assert_frame_equal(result, expected) - def test_non_unique_loc_memory_error(self): - - # GH 4280 - # non_unique index with a large selection triggers a memory error - - columns = list('ABCDEFG') - - def gen_test(l, l2): - return pd.concat([DataFrame(randn(l, len(columns)), - index=lrange(l), columns=columns), - DataFrame(np.ones((l2, len(columns))), - index=[0] * l2, columns=columns)]) - - def gen_expected(df, mask): - l = len(mask) - return pd.concat([df.take([0], convert=False), - DataFrame(np.ones((l, len(columns))), - index=[0] * l, - columns=columns), - df.take(mask[1:], convert=False)]) - - df = gen_test(900, 100) - self.assertFalse(df.index.is_unique) - - mask = np.arange(100) - result = df.loc[mask] - expected = gen_expected(df, mask) - tm.assert_frame_equal(result, expected) - - df = gen_test(900000, 100000) - self.assertFalse(df.index.is_unique) - - mask = np.arange(100000) - result = df.loc[mask] - expected = gen_expected(df, mask) - tm.assert_frame_equal(result, expected) - def test_astype_assignment(self): # GH4312 (iloc) @@ -2395,745 +539,79 @@ def test_astype_assignment_with_dups(self): # result = 
df.get_dtype_counts().sort_index() # expected = Series({'float64': 2, 'object': 1}).sort_index() - def test_dups_loc(self): - - # GH4726 - # dup indexing with iloc/loc - df = DataFrame([[1, 2, 'foo', 'bar', Timestamp('20130101')]], - columns=['a', 'a', 'a', 'a', 'a'], index=[1]) - expected = Series([1, 2, 'foo', 'bar', Timestamp('20130101')], - index=['a', 'a', 'a', 'a', 'a'], name=1) - - result = df.iloc[0] - tm.assert_series_equal(result, expected) - - result = df.loc[1] - tm.assert_series_equal(result, expected) - - def test_partial_setting(self): - - # GH2578, allow ix and friends to partially set - - # series - s_orig = Series([1, 2, 3]) - - s = s_orig.copy() - s[5] = 5 - expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) - tm.assert_series_equal(s, expected) - - s = s_orig.copy() - s.loc[5] = 5 - expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) - tm.assert_series_equal(s, expected) - - s = s_orig.copy() - s[5] = 5. - expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) - tm.assert_series_equal(s, expected) - - s = s_orig.copy() - s.loc[5] = 5. - expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) - tm.assert_series_equal(s, expected) - - # iloc/iat raise - s = s_orig.copy() - - def f(): - s.iloc[3] = 5. - - self.assertRaises(IndexError, f) - - def f(): - s.iat[3] = 5. - - self.assertRaises(IndexError, f) - - # ## frame ## - - df_orig = DataFrame( - np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64') - - # iloc/iat raise - df = df_orig.copy() - - def f(): - df.iloc[4, 2] = 5. - - self.assertRaises(IndexError, f) - - def f(): - df.iat[4, 2] = 5. - - self.assertRaises(IndexError, f) - - # row setting where it exists - expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) - df = df_orig.copy() - df.iloc[1] = df.iloc[2] - tm.assert_frame_equal(df, expected) - - expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) - df = df_orig.copy() - df.loc[1] = df.loc[2] - tm.assert_frame_equal(df, expected) - - # like 2578, partial setting with dtype preservation - expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})) - df = df_orig.copy() - df.loc[3] = df.loc[2] - tm.assert_frame_equal(df, expected) - - # single dtype frame, overwrite - expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]})) - df = df_orig.copy() - df.ix[:, 'B'] = df.ix[:, 'A'] - tm.assert_frame_equal(df, expected) - - # mixed dtype frame, overwrite - expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])})) - df = df_orig.copy() - df['B'] = df['B'].astype(np.float64) - df.ix[:, 'B'] = df.ix[:, 'A'] - tm.assert_frame_equal(df, expected) - - # single dtype frame, partial setting - expected = df_orig.copy() - expected['C'] = df['A'] - df = df_orig.copy() - df.ix[:, 'C'] = df.ix[:, 'A'] - tm.assert_frame_equal(df, expected) - - # mixed frame, partial setting - expected = df_orig.copy() - expected['C'] = df['A'] - df = df_orig.copy() - df.ix[:, 'C'] = df.ix[:, 'A'] - tm.assert_frame_equal(df, expected) - - # ## panel ## - p_orig = Panel(np.arange(16).reshape(2, 4, 2), - items=['Item1', 'Item2'], - major_axis=pd.date_range('2001/1/12', periods=4), - minor_axis=['A', 'B'], dtype='float64') - - # panel setting via item - p_orig = Panel(np.arange(16).reshape(2, 4, 2), - items=['Item1', 'Item2'], - major_axis=pd.date_range('2001/1/12', periods=4), - minor_axis=['A', 'B'], dtype='float64') - expected = p_orig.copy() - expected['Item3'] = expected['Item1'] - p = p_orig.copy() - p.loc['Item3'] = p['Item1'] - tm.assert_panel_equal(p, expected) - - # panel with aligned series - expected 
= p_orig.copy() - expected = expected.transpose(2, 1, 0) - expected['C'] = DataFrame({'Item1': [30, 30, 30, 30], - 'Item2': [32, 32, 32, 32]}, - index=p_orig.major_axis) - expected = expected.transpose(2, 1, 0) - p = p_orig.copy() - p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items) - tm.assert_panel_equal(p, expected) - - # GH 8473 - dates = date_range('1/1/2000', periods=8) - df_orig = DataFrame(np.random.randn(8, 4), index=dates, - columns=['A', 'B', 'C', 'D']) - - expected = pd.concat([df_orig, DataFrame( - {'A': 7}, index=[dates[-1] + 1])]) - df = df_orig.copy() - df.loc[dates[-1] + 1, 'A'] = 7 - tm.assert_frame_equal(df, expected) - df = df_orig.copy() - df.at[dates[-1] + 1, 'A'] = 7 - tm.assert_frame_equal(df, expected) - - exp_other = DataFrame({0: 7}, index=[dates[-1] + 1]) - expected = pd.concat([df_orig, exp_other], axis=1) - - df = df_orig.copy() - df.loc[dates[-1] + 1, 0] = 7 - tm.assert_frame_equal(df, expected) - df = df_orig.copy() - df.at[dates[-1] + 1, 0] = 7 - tm.assert_frame_equal(df, expected) - - def test_partial_setting_mixed_dtype(self): - - # in a mixed dtype environment, try to preserve dtypes - # by appending - df = DataFrame([[True, 1], [False, 2]], columns=["female", "fitness"]) - - s = df.loc[1].copy() - s.name = 2 - expected = df.append(s) - - df.loc[2] = df.loc[1] - tm.assert_frame_equal(df, expected) - - # columns will align - df = DataFrame(columns=['A', 'B']) - df.loc[0] = Series(1, index=range(4)) - tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) - - # columns will align - df = DataFrame(columns=['A', 'B']) - df.loc[0] = Series(1, index=['B']) - - exp = DataFrame([[np.nan, 1]], columns=['A', 'B'], - index=[0], dtype='float64') - tm.assert_frame_equal(df, exp) - - # list-like must conform - df = DataFrame(columns=['A', 'B']) - - def f(): - df.loc[0] = [1, 2, 3] - - self.assertRaises(ValueError, f) - - # these are coerced to float unavoidably (as its a list-like to begin) - df = DataFrame(columns=['A', 'B']) - df.loc[3] = [6, 7] - - exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], - dtype='float64') - tm.assert_frame_equal(df, exp) - - def test_series_partial_set(self): - # partial set with new index - # Regression from GH4825 - ser = Series([0.1, 0.2], index=[1, 2]) - - # loc - expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) - result = ser.loc[[3, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) - result = ser.loc[[3, 2, 3, 'x']] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) - result = ser.loc[[2, 2, 1]] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) - result = ser.loc[[2, 2, 'x', 1]] - tm.assert_series_equal(result, expected, check_index_type=True) - - # raises as nothing in in the index - self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) - - expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) - result = ser.loc[[2, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) - result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) - result = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]).loc[[5, 3, 3]] - tm.assert_series_equal(result, 
expected, check_index_type=True) - - expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) - result = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]).loc[[5, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) - result = Series([0.1, 0.2, 0.3, 0.4], - index=[4, 5, 6, 7]).loc[[7, 2, 2]] - tm.assert_series_equal(result, expected, check_index_type=True) - - expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) - result = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]).loc[[4, 5, 5]] - tm.assert_series_equal(result, expected, check_index_type=True) - - # iloc - expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1]) - result = ser.iloc[[1, 1, 0, 0]] - tm.assert_series_equal(result, expected, check_index_type=True) - - def test_series_partial_set_with_name(self): - # GH 11497 - - idx = Index([1, 2], dtype='int64', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') - - # loc - exp_idx = Index([3, 2, 3], dtype='int64', name='idx') - expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') - result = ser.loc[[3, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') - expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, - name='s') - result = ser.loc[[3, 2, 3, 'x']] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([2, 2, 1], dtype='int64', name='idx') - expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') - result = ser.loc[[2, 2, 1]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') - expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') - result = ser.loc[[2, 2, 'x', 1]] - tm.assert_series_equal(result, expected, check_index_type=True) - - # raises as nothing in in the index - self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) - - exp_idx = Index([2, 2, 3], dtype='int64', name='idx') - expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') - result = ser.loc[[2, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([3, 4, 4], dtype='int64', name='idx') - expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([1, 2, 3], dtype='int64', name='idx') - result = Series([0.1, 0.2, 0.3], index=idx, name='s').loc[[3, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([5, 3, 3], dtype='int64', name='idx') - expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[5, 3, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([5, 4, 4], dtype='int64', name='idx') - expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[5, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([7, 2, 2], dtype='int64', name='idx') - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([4, 5, 6, 7], dtype='int64', name='idx') - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[7, 2, 2]] - tm.assert_series_equal(result, expected, check_index_type=True) - - exp_idx = Index([4, 5, 5], dtype='int64', 
name='idx') - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[4, 5, 5]] - tm.assert_series_equal(result, expected, check_index_type=True) - - # iloc - exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') - expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') - result = ser.iloc[[1, 1, 0, 0]] - tm.assert_series_equal(result, expected, check_index_type=True) - - def test_partial_set_invalid(self): - - # GH 4940 - # allow only setting of 'valid' values - - orig = tm.makeTimeDataFrame() - df = orig.copy() - - # don't allow not string inserts - def f(): - df.loc[100.0, :] = df.ix[0] - - self.assertRaises(TypeError, f) - - def f(): - df.loc[100, :] = df.ix[0] - - self.assertRaises(TypeError, f) - - def f(): - df.ix[100.0, :] = df.ix[0] - - self.assertRaises(TypeError, f) - - def f(): - df.ix[100, :] = df.ix[0] - - self.assertRaises(ValueError, f) - - # allow object conversion here - df = orig.copy() - df.loc['a', :] = df.ix[0] - exp = orig.append(pd.Series(df.ix[0], name='a')) - tm.assert_frame_equal(df, exp) - tm.assert_index_equal(df.index, - pd.Index(orig.index.tolist() + ['a'])) - self.assertEqual(df.index.dtype, 'object') - - def test_partial_set_empty_series(self): - - # GH5226 - - # partially set with an empty object series - s = Series() - s.loc[1] = 1 - tm.assert_series_equal(s, Series([1], index=[1])) - s.loc[3] = 3 - tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) - - s = Series() - s.loc[1] = 1. - tm.assert_series_equal(s, Series([1.], index=[1])) - s.loc[3] = 3. - tm.assert_series_equal(s, Series([1., 3.], index=[1, 3])) - - s = Series() - s.loc['foo'] = 1 - tm.assert_series_equal(s, Series([1], index=['foo'])) - s.loc['bar'] = 3 - tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) - s.loc[3] = 4 - tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) - - def test_partial_set_empty_frame(self): - - # partially set with an empty object - # frame - df = DataFrame() - - def f(): - df.loc[1] = 1 - - self.assertRaises(ValueError, f) - - def f(): - df.loc[1] = Series([1], index=['foo']) - - self.assertRaises(ValueError, f) - - def f(): - df.loc[:, 1] = 1 - - self.assertRaises(ValueError, f) - - # these work as they don't really change - # anything but the index - # GH5632 - expected = DataFrame(columns=['foo'], index=pd.Index( - [], dtype='int64')) - - def f(): - df = DataFrame() - df['foo'] = Series([], dtype='object') - return df - - tm.assert_frame_equal(f(), expected) - - def f(): - df = DataFrame() - df['foo'] = Series(df.index) - return df - - tm.assert_frame_equal(f(), expected) - - def f(): - df = DataFrame() - df['foo'] = df.index - return df - - tm.assert_frame_equal(f(), expected) - - expected = DataFrame(columns=['foo'], - index=pd.Index([], dtype='int64')) - expected['foo'] = expected['foo'].astype('float64') - - def f(): - df = DataFrame() - df['foo'] = [] - return df - - tm.assert_frame_equal(f(), expected) - - def f(): - df = DataFrame() - df['foo'] = Series(range(len(df))) - return df - - tm.assert_frame_equal(f(), expected) - - def f(): - df = DataFrame() - tm.assert_index_equal(df.index, pd.Index([], dtype='object')) - df['foo'] = range(len(df)) - return df - - expected = DataFrame(columns=['foo'], - index=pd.Index([], dtype='int64')) - expected['foo'] = expected['foo'].astype('float64') - tm.assert_frame_equal(f(), expected) - - df = DataFrame() - 
tm.assert_index_equal(df.columns, pd.Index([], dtype=object)) - df2 = DataFrame() - df2[1] = Series([1], index=['foo']) - df.loc[:, 1] = Series([1], index=['foo']) - tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) - tm.assert_frame_equal(df, df2) - - # no index to start - expected = DataFrame({0: Series(1, index=range(4))}, - columns=['A', 'B', 0]) - - df = DataFrame(columns=['A', 'B']) - df[0] = Series(1, index=range(4)) - df.dtypes - str(df) - tm.assert_frame_equal(df, expected) - - df = DataFrame(columns=['A', 'B']) - df.loc[:, 0] = Series(1, index=range(4)) - df.dtypes - str(df) - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame_row(self): - # GH5720, GH5744 - # don't create rows when empty - expected = DataFrame(columns=['A', 'B', 'New'], - index=pd.Index([], dtype='int64')) - expected['A'] = expected['A'].astype('int64') - expected['B'] = expected['B'].astype('float64') - expected['New'] = expected['New'].astype('float64') - - df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) - y = df[df.A > 5] - y['New'] = np.nan - tm.assert_frame_equal(y, expected) - # tm.assert_frame_equal(y,expected) - - expected = DataFrame(columns=['a', 'b', 'c c', 'd']) - expected['d'] = expected['d'].astype('int64') - df = DataFrame(columns=['a', 'b', 'c c']) - df['d'] = 3 - tm.assert_frame_equal(df, expected) - tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object)) - - # reindex columns is ok - df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) - y = df[df.A > 5] - result = y.reindex(columns=['A', 'B', 'C']) - expected = DataFrame(columns=['A', 'B', 'C'], - index=pd.Index([], dtype='int64')) - expected['A'] = expected['A'].astype('int64') - expected['B'] = expected['B'].astype('float64') - expected['C'] = expected['C'].astype('float64') - tm.assert_frame_equal(result, expected) - - def test_partial_set_empty_frame_set_series(self): - # GH 5756 - # setting with empty Series - df = DataFrame(Series()) - tm.assert_frame_equal(df, DataFrame({0: Series()})) - - df = DataFrame(Series(name='foo')) - tm.assert_frame_equal(df, DataFrame({'foo': Series()})) - - def test_partial_set_empty_frame_empty_copy_assignment(self): - # GH 5932 - # copy on empty with assignment fails - df = DataFrame(index=[0]) - df = df.copy() - df['a'] = 0 - expected = DataFrame(0, index=[0], columns=['a']) - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame_empty_consistencies(self): - # GH 6171 - # consistency on empty frames - df = DataFrame(columns=['x', 'y']) - df['x'] = [1, 2] - expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) - tm.assert_frame_equal(df, expected, check_dtype=False) + def test_index_type_coercion(self): - df = DataFrame(columns=['x', 'y']) - df['x'] = ['1', '2'] - expected = DataFrame( - dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object) - tm.assert_frame_equal(df, expected) + with catch_warnings(record=True): - df = DataFrame(columns=['x', 'y']) - df.loc[0, 'x'] = 1 - expected = DataFrame(dict(x=[1], y=[np.nan])) - tm.assert_frame_equal(df, expected, check_dtype=False) - - def test_cache_updating(self): - # GH 4939, make sure to update the cache on setitem - - df = tm.makeDataFrame() - df['A'] # cache series - df.ix["Hello Friend"] = df.ix[0] - self.assertIn("Hello Friend", df['A'].index) - self.assertIn("Hello Friend", df['B'].index) - - panel = tm.makePanel() - panel.ix[0] # get first item into cache - panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1 - self.assertIn("A+1", panel.ix[0].columns) - self.assertIn("A+1", 
panel.ix[1].columns) - - # 5216 - # make sure that we don't try to set a dead cache - a = np.random.rand(10, 3) - df = DataFrame(a, columns=['x', 'y', 'z']) - tuples = [(i, j) for i in range(5) for j in range(2)] - index = MultiIndex.from_tuples(tuples) - df.index = index - - # setting via chained assignment - # but actually works, since everything is a view - df.loc[0]['z'].iloc[0] = 1. - result = df.loc[(0, 0), 'z'] - self.assertEqual(result, 1) - - # correct setting - df.loc[(0, 0), 'z'] = 2 - result = df.loc[(0, 0), 'z'] - self.assertEqual(result, 2) - - # 10264 - df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[ - 'a', 'b', 'c', 'd', 'e'], index=range(5)) - df['f'] = 0 - df.f.values[3] = 1 + # GH 11836 + # if we have an index type and set it with something that looks + # to numpy like the same, but is actually, not + # (e.g. setting with a float or string '0') + # then we need to coerce to object - # TODO(wesm): unused? - # y = df.iloc[np.arange(2, len(df))] + # integer indexes + for s in [Series(range(5)), + Series(range(5), index=range(1, 6))]: - df.f.values[3] = 2 - expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ - 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5)) - expected.at[3, 'f'] = 2 - tm.assert_frame_equal(df, expected) - expected = Series([0, 0, 0, 2, 0], name='f') - tm.assert_series_equal(df.f, expected) - - def test_set_ix_out_of_bounds_axis_0(self): - df = pd.DataFrame( - randn(2, 5), index=["row%s" % i for i in range(2)], - columns=["col%s" % i for i in range(5)]) - self.assertRaises(ValueError, df.ix.__setitem__, (2, 0), 100) - - def test_set_ix_out_of_bounds_axis_1(self): - df = pd.DataFrame( - randn(5, 2), index=["row%s" % i for i in range(5)], - columns=["col%s" % i for i in range(2)]) - self.assertRaises(ValueError, df.ix.__setitem__, (0, 2), 100) - - def test_iloc_empty_list_indexer_is_ok(self): - from pandas.util.testing import makeCustomDataframe as mkdf - df = mkdf(5, 2) - # vertical empty - tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) - # horizontal empty - tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) - # horizontal empty - tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) - - def test_loc_empty_list_indexer_is_ok(self): - from pandas.util.testing import makeCustomDataframe as mkdf - df = mkdf(5, 2) - # vertical empty - tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) - # horizontal empty - tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) - # horizontal empty - tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) - - def test_ix_empty_list_indexer_is_ok(self): - from pandas.util.testing import makeCustomDataframe as mkdf - df = mkdf(5, 2) - # vertical empty - tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0], - check_index_type=True, - check_column_type=True) - # horizontal empty - tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) - # horizontal empty - tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + self.assertTrue(s.index.is_integer()) - def test_index_type_coercion(self): + for indexer in [lambda x: x.ix, + lambda x: x.loc, + lambda x: x]: + s2 = s.copy() + indexer(s2)[0.1] = 0 + 
self.assertTrue(s2.index.is_floating()) + self.assertTrue(indexer(s2)[0.1] == 0) - # GH 11836 - # if we have an index type and set it with something that looks - # to numpy like the same, but is actually, not - # (e.g. setting with a float or string '0') - # then we need to coerce to object + s2 = s.copy() + indexer(s2)[0.0] = 0 + exp = s.index + if 0 not in s: + exp = Index(s.index.tolist() + [0]) + tm.assert_index_equal(s2.index, exp) - # integer indexes - for s in [Series(range(5)), - Series(range(5), index=range(1, 6))]: + s2 = s.copy() + indexer(s2)['0'] = 0 + self.assertTrue(s2.index.is_object()) - self.assertTrue(s.index.is_integer()) + for s in [Series(range(5), index=np.arange(5.))]: - for indexer in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: - s2 = s.copy() - indexer(s2)[0.1] = 0 - self.assertTrue(s2.index.is_floating()) - self.assertTrue(indexer(s2)[0.1] == 0) + self.assertTrue(s.index.is_floating()) - s2 = s.copy() - indexer(s2)[0.0] = 0 - exp = s.index - if 0 not in s: - exp = Index(s.index.tolist() + [0]) - tm.assert_index_equal(s2.index, exp) + for idxr in [lambda x: x.ix, + lambda x: x.loc, + lambda x: x]: - s2 = s.copy() - indexer(s2)['0'] = 0 - self.assertTrue(s2.index.is_object()) + s2 = s.copy() + idxr(s2)[0.1] = 0 + self.assertTrue(s2.index.is_floating()) + self.assertTrue(idxr(s2)[0.1] == 0) - for s in [Series(range(5), index=np.arange(5.))]: + s2 = s.copy() + idxr(s2)[0.0] = 0 + tm.assert_index_equal(s2.index, s.index) - self.assertTrue(s.index.is_floating()) + s2 = s.copy() + idxr(s2)['0'] = 0 + self.assertTrue(s2.index.is_object()) - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: - s2 = s.copy() - idxr(s2)[0.1] = 0 - self.assertTrue(s2.index.is_floating()) - self.assertTrue(idxr(s2)[0.1] == 0) +class TestMisc(Base, tm.TestCase): - s2 = s.copy() - idxr(s2)[0.0] = 0 - tm.assert_index_equal(s2.index, s.index) + def test_indexer_caching(self): + # GH5727 + # make sure that indexers are in the _internal_names_set + n = 1000001 + arrays = [lrange(n), lrange(n)] + index = MultiIndex.from_tuples(lzip(*arrays)) + s = Series(np.zeros(n), index=index) + str(s) - s2 = s.copy() - idxr(s2)['0'] = 0 - self.assertTrue(s2.index.is_object()) + # setitem + expected = Series(np.ones(n), index=index) + s = Series(np.zeros(n), index=index) + s[s == 0] = 1 + tm.assert_series_equal(s, expected) def test_float_index_to_mixed(self): df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)}) @@ -3143,13 +621,6 @@ def test_float_index_to_mixed(self): 'a': [10] * 10}), df) - def test_duplicate_ix_returns_series(self): - df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2], - columns=list('abc')) - r = df.ix[0.2, 'a'] - e = df.loc[0.2, 'a'] - tm.assert_series_equal(r, e) - def test_float_index_non_scalar_assignment(self): df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.]) df.loc[df.index[:2]] = 1 @@ -3185,15 +656,18 @@ def run_tests(df, rhs, right): tm.assert_frame_equal(left, right) left = df.copy() - left.ix[s, l] = rhs + with catch_warnings(record=True): + left.ix[s, l] = rhs tm.assert_frame_equal(left, right) left = df.copy() - left.ix[i, j] = rhs + with catch_warnings(record=True): + left.ix[i, j] = rhs tm.assert_frame_equal(left, right) left = df.copy() - left.ix[r, c] = rhs + with catch_warnings(record=True): + left.ix[r, c] = rhs tm.assert_frame_equal(left, right) xs = np.arange(20).reshape(5, 4) @@ -3226,7 +700,7 @@ def assert_slices_equivalent(l_slc, i_slc): if not idx.is_integer: # For integer indices, ix and plain getitem 
are position-based. tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) - tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) for idx in [_mklbl('A', 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: @@ -3243,8 +717,9 @@ def test_slice_with_zero_step_raises(self): lambda: s[::0]) self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: s.loc[::0]) - self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', - lambda: s.ix[::0]) + with catch_warnings(record=True): + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: s.ix[::0]) def test_indexing_assignment_dict_already_exists(self): df = pd.DataFrame({'x': [1, 2, 6], @@ -3259,11 +734,13 @@ def test_indexing_assignment_dict_already_exists(self): def test_indexing_dtypes_on_empty(self): # Check that .iloc and .ix return correct dtypes GH9983 df = DataFrame({'a': [1, 2, 3], 'b': ['b', 'b2', 'b3']}) - df2 = df.ix[[], :] + with catch_warnings(record=True): + df2 = df.ix[[], :] self.assertEqual(df2.loc[:, 'a'].dtype, np.int64) tm.assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) - tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) + with catch_warnings(record=True): + tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) def test_range_in_series_indexing(self): # range can cause an indexing error diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py new file mode 100644 index 0000000000000..e68e8015a2f39 --- /dev/null +++ b/pandas/tests/indexing/test_ix.py @@ -0,0 +1,333 @@ +""" test indexing with ix """ + +from warnings import catch_warnings + +import numpy as np +import pandas as pd + +from pandas.types.common import is_scalar +from pandas.compat import lrange +from pandas import Series, DataFrame, option_context, MultiIndex +from pandas.util import testing as tm +from pandas.core.common import PerformanceWarning + + +class TestIX(tm.TestCase): + + def test_ix_deprecation(self): + # GH 15114 + + df = DataFrame({'A': [1, 2, 3]}) + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + df.ix[1, 'A'] + + def test_ix_loc_setitem_consistency(self): + + # GH 5771 + # loc with slice and series + s = Series(0, index=[4, 5, 6]) + s.loc[4:5] += 1 + expected = Series([1, 1, 0], index=[4, 5, 6]) + tm.assert_series_equal(s, expected) + + # GH 5928 + # chained indexing assignment + df = DataFrame({'a': [0, 1, 2]}) + expected = df.copy() + with catch_warnings(record=True): + expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] + + with catch_warnings(record=True): + df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] + tm.assert_frame_equal(df, expected) + + df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) + with catch_warnings(record=True): + df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype( + 'float64') + 0.5 + expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) + tm.assert_frame_equal(df, expected) + + # GH 8607 + # ix setitem consistency + df = DataFrame({'timestamp': [1413840976, 1413842580, 1413760580], + 'delta': [1174, 904, 161], + 'elapsed': [7673, 9277, 1470]}) + expected = DataFrame({'timestamp': pd.to_datetime( + [1413840976, 1413842580, 1413760580], unit='s'), + 'delta': [1174, 904, 161], + 'elapsed': [7673, 9277, 1470]}) + + df2 = df.copy() + df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + tm.assert_frame_equal(df2, expected) + + df2 = df.copy() + df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + tm.assert_frame_equal(df2, expected) + + df2 
= df.copy() + with catch_warnings(record=True): + df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') + tm.assert_frame_equal(df2, expected) + + def test_ix_loc_consistency(self): + + # GH 8613 + # some edge cases where ix/loc should return the same + # this is not an exhaustive case + + def compare(result, expected): + if is_scalar(expected): + self.assertEqual(result, expected) + else: + self.assertTrue(expected.equals(result)) + + # failure cases for .loc, but these work for .ix + df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD')) + for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]), + tuple([slice(0, 2), df.columns[0:2]])]: + + for index in [tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex, + tm.makeTimedeltaIndex]: + df.index = index(len(df.index)) + with catch_warnings(record=True): + df.ix[key] + + self.assertRaises(TypeError, lambda: df.loc[key]) + + df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), + index=pd.date_range('2012-01-01', periods=5)) + + for key in ['2012-01-03', + '2012-01-31', + slice('2012-01-03', '2012-01-03'), + slice('2012-01-03', '2012-01-04'), + slice('2012-01-03', '2012-01-06', 2), + slice('2012-01-03', '2012-01-31'), + tuple([[True, True, True, False, True]]), ]: + + # getitem + + # if the expected raises, then compare the exceptions + try: + with catch_warnings(record=True): + expected = df.ix[key] + except KeyError: + self.assertRaises(KeyError, lambda: df.loc[key]) + continue + + result = df.loc[key] + compare(result, expected) + + # setitem + df1 = df.copy() + df2 = df.copy() + + with catch_warnings(record=True): + df1.ix[key] = 10 + df2.loc[key] = 10 + compare(df2, df1) + + # edge cases + s = Series([1, 2, 3, 4], index=list('abde')) + + result1 = s['a':'c'] + with catch_warnings(record=True): + result2 = s.ix['a':'c'] + result3 = s.loc['a':'c'] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + # now work rather than raising KeyError + s = Series(range(5), [-2, -1, 1, 2, 3]) + + with catch_warnings(record=True): + result1 = s.ix[-10:3] + result2 = s.loc[-10:3] + tm.assert_series_equal(result1, result2) + + with catch_warnings(record=True): + result1 = s.ix[0:3] + result2 = s.loc[0:3] + tm.assert_series_equal(result1, result2) + + def test_ix_weird_slicing(self): + # http://stackoverflow.com/q/17056560/1240268 + df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], + 'two': [1, 2, 3, 4, 5]}) + df.loc[df['one'] > 1, 'two'] = -df['two'] + + expected = DataFrame({'one': {0: 1.0, + 1: 2.0, + 2: 3.0, + 3: np.nan, + 4: np.nan}, + 'two': {0: 1, + 1: -2, + 2: -3, + 3: 4, + 4: 5}}) + tm.assert_frame_equal(df, expected) + + def test_ix_general(self): + + # ix general issues + + # GH 2817 + data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} + df = DataFrame(data).set_index(keys=['col', 'year']) + key = 4.0, 2012 + + # emits a PerformanceWarning, ok + with self.assert_produces_warning(PerformanceWarning): + tm.assert_frame_equal(df.loc[key], df.iloc[2:]) + + # this is ok + df.sort_index(inplace=True) + res = df.loc[key] + + # col has float dtype, result should be Float64Index + index = MultiIndex.from_arrays([[4.] 
* 3, [2012] * 3],
+                                       names=['col', 'year'])
+        expected = DataFrame({'amount': [222, 333, 444]}, index=index)
+        tm.assert_frame_equal(res, expected)
+
+    def test_ix_assign_column_mixed(self):
+        # GH #1142
+        df = DataFrame(tm.getSeriesData())
+        df['foo'] = 'bar'
+
+        orig = df.loc[:, 'B'].copy()
+        df.loc[:, 'B'] = df.loc[:, 'B'] + 1
+        tm.assert_series_equal(df.B, orig + 1)
+
+        # GH 3668, mixed frame with series value
+        df = DataFrame({'x': lrange(10), 'y': lrange(10, 20), 'z': 'bar'})
+        expected = df.copy()
+
+        for i in range(5):
+            indexer = i * 2
+            v = 1000 + i * 200
+            expected.loc[indexer, 'y'] = v
+            self.assertEqual(expected.loc[indexer, 'y'], v)
+
+        df.loc[df.x % 2 == 0, 'y'] = df.loc[df.x % 2 == 0, 'y'] * 100
+        tm.assert_frame_equal(df, expected)
+
+        # GH 4508, making sure consistency of assignments
+        df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]})
+        df.loc[[0, 2, ], 'b'] = [100, -100]
+        expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]})
+        tm.assert_frame_equal(df, expected)
+
+        df = pd.DataFrame({'a': lrange(4)})
+        df['b'] = np.nan
+        df.loc[[1, 3], 'b'] = [100, -100]
+        expected = DataFrame({'a': [0, 1, 2, 3],
+                              'b': [np.nan, 100, np.nan, -100]})
+        tm.assert_frame_equal(df, expected)
+
+        # ok, but chained assignments are dangerous
+        # if we turn off chained assignment it will work
+        with option_context('chained_assignment', None):
+            df = pd.DataFrame({'a': lrange(4)})
+            df['b'] = np.nan
+            df['b'].loc[[1, 3]] = [100, -100]
+            tm.assert_frame_equal(df, expected)
+
+    def test_ix_get_set_consistency(self):
+
+        # GH 4544
+        # ix/loc get/set not consistent when
+        # a mixed int/string index
+        df = DataFrame(np.arange(16).reshape((4, 4)),
+                       columns=['a', 'b', 8, 'c'],
+                       index=['e', 7, 'f', 'g'])
+
+        with catch_warnings(record=True):
+            self.assertEqual(df.ix['e', 8], 2)
+        self.assertEqual(df.loc['e', 8], 2)
+
+        with catch_warnings(record=True):
+            df.ix['e', 8] = 42
+            self.assertEqual(df.ix['e', 8], 42)
+        self.assertEqual(df.loc['e', 8], 42)
+
+        df.loc['e', 8] = 45
+        with catch_warnings(record=True):
+            self.assertEqual(df.ix['e', 8], 45)
+        self.assertEqual(df.loc['e', 8], 45)
+
+    def test_ix_slicing_strings(self):
+        # GH3836
+        data = {'Classification':
+                ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'],
+                'Random': [1, 2, 3, 4, 5],
+                'X': ['correct', 'wrong', 'correct', 'correct', 'wrong']}
+        df = DataFrame(data)
+        x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY', 'SA SSF'
+                                        ])]
+        with catch_warnings(record=True):
+            df.ix[x.index, 'X'] = df['Classification']
+
+        expected = DataFrame({'Classification': {0: 'SA EQUITY CFD',
+                                                 1: 'bbb',
+                                                 2: 'SA EQUITY',
+                                                 3: 'SA SSF',
+                                                 4: 'aaa'},
+                              'Random': {0: 1,
+                                         1: 2,
+                                         2: 3,
+                                         3: 4,
+                                         4: 5},
+                              'X': {0: 'correct',
+                                    1: 'bbb',
+                                    2: 'correct',
+                                    3: 'correct',
+                                    4: 'aaa'}})  # bug was 4: 'bbb'
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_ix_setitem_out_of_bounds_axis_0(self):
+        df = pd.DataFrame(
+            np.random.randn(2, 5), index=["row%s" % i for i in range(2)],
+            columns=["col%s" % i for i in range(5)])
+        with catch_warnings(record=True):
+            self.assertRaises(ValueError, df.ix.__setitem__, (2, 0), 100)
+
+    def test_ix_setitem_out_of_bounds_axis_1(self):
+        df = pd.DataFrame(
+            np.random.randn(5, 2), index=["row%s" % i for i in range(5)],
+            columns=["col%s" % i for i in range(2)])
+        with catch_warnings(record=True):
+            self.assertRaises(ValueError, df.ix.__setitem__, (0, 2), 100)
+
+    def test_ix_empty_list_indexer_is_ok(self):
+        with catch_warnings(record=True):
+            from pandas.util.testing import makeCustomDataframe as mkdf
+            df = mkdf(5, 2)
+            # vertical empty
+            tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0],
+                                  check_index_type=True,
+                                  check_column_type=True)
+            # horizontal empty
+            tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :],
+                                  check_index_type=True,
+                                  check_column_type=True)
+            # horizontal empty
+            tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :],
+                                  check_index_type=True,
+                                  check_column_type=True)
+
+    def test_ix_duplicate_returns_series(self):
+        df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2],
+                       columns=list('abc'))
+        with catch_warnings(record=True):
+            r = df.ix[0.2, 'a']
+        e = df.loc[0.2, 'a']
+        tm.assert_series_equal(r, e)
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
new file mode 100644
index 0000000000000..af9d3ffdf6671
--- /dev/null
+++ b/pandas/tests/indexing/test_loc.py
@@ -0,0 +1,630 @@
+""" test label based indexing with loc """
+
+import itertools
+from warnings import catch_warnings
+import numpy as np
+
+import pandas as pd
+from pandas.compat import lrange, StringIO
+from pandas import (Series, DataFrame, Timestamp,
+                    date_range, MultiIndex)
+from pandas.util import testing as tm
+from pandas.tests.indexing.common import Base
+
+
+class TestLoc(Base, tm.TestCase):
+
+    def test_loc_getitem_dups(self):
+        # GH 5678
+        # repeated getitems on a dup index returning an ndarray
+        df = DataFrame(
+            np.random.random_sample((20, 5)),
+            index=['ABCDE' [x % 5] for x in range(20)])
+        expected = df.loc['A', 0]
+        result = df.loc[:, 0].loc['A']
+        tm.assert_series_equal(result, expected)
+
+    def test_loc_getitem_dups2(self):
+
+        # GH4726
+        # dup indexing with iloc/loc
+        df = DataFrame([[1, 2, 'foo', 'bar', Timestamp('20130101')]],
+                       columns=['a', 'a', 'a', 'a', 'a'], index=[1])
+        expected = Series([1, 2, 'foo', 'bar', Timestamp('20130101')],
+                          index=['a', 'a', 'a', 'a', 'a'], name=1)
+
+        result = df.iloc[0]
+        tm.assert_series_equal(result, expected)
+
+        result = df.loc[1]
+        tm.assert_series_equal(result, expected)
+
+    def test_loc_setitem_dups(self):
+
+        # GH 6541
+        df_orig = DataFrame(
+            {'me': list('rttti'),
+             'foo': list('aaade'),
+             'bar': np.arange(5, dtype='float64') * 1.34 + 2,
+             'bar2': np.arange(5, dtype='float64') * -.34 + 2}).set_index('me')
+
+        indexer = tuple(['r', ['bar', 'bar2']])
+        df = df_orig.copy()
+        df.loc[indexer] *= 2.0
+        tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+        indexer = tuple(['r', 'bar'])
+        df = df_orig.copy()
+        df.loc[indexer] *= 2.0
+        self.assertEqual(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+        indexer = tuple(['t', ['bar', 'bar2']])
+        df = df_orig.copy()
+        df.loc[indexer] *= 2.0
+        tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+    def test_loc_setitem_slice(self):
+        # GH10503
+
+        # assigning the same type should not change the type
+        df1 = DataFrame({'a': [0, 1, 1],
+                         'b': Series([100, 200, 300], dtype='uint32')})
+        ix = df1['a'] == 1
+        newb1 = df1.loc[ix, 'b'] + 1
+        df1.loc[ix, 'b'] = newb1
+        expected = DataFrame({'a': [0, 1, 1],
+                              'b': Series([100, 201, 301], dtype='uint32')})
+        tm.assert_frame_equal(df1, expected)
+
+        # assigning a new type should get the inferred type
+        df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]},
+                        dtype='uint64')
+        ix = df1['a'] == 1
+        newb2 = df2.loc[ix, 'b']
+        df1.loc[ix, 'b'] = newb2
+        expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]},
+                             dtype='uint64')
+        tm.assert_frame_equal(df2, expected)
+
+    def test_loc_getitem_int(self):
+
+        # int label
+        self.check_result('int label', 'loc', 2, 'ix', 2,
+                          typs=['ints', 'uints'], axes=0)
+
self.check_result('int label', 'loc', 3, 'ix', 3, + typs=['ints', 'uints'], axes=1) + self.check_result('int label', 'loc', 4, 'ix', 4, + typs=['ints', 'uints'], axes=2) + self.check_result('int label', 'loc', 2, 'ix', 2, + typs=['label'], fails=KeyError) + + def test_loc_getitem_label(self): + + # label + self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['labels'], + axes=0) + self.check_result('label', 'loc', 'null', 'ix', 'null', typs=['mixed'], + axes=0) + self.check_result('label', 'loc', 8, 'ix', 8, typs=['mixed'], axes=0) + self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, + typs=['ts'], axes=0) + self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['empty'], + fails=KeyError) + + def test_loc_getitem_label_out_of_range(self): + + # out of range label + self.check_result('label range', 'loc', 'f', 'ix', 'f', + typs=['ints', 'uints', 'labels', 'mixed', 'ts'], + fails=KeyError) + self.check_result('label range', 'loc', 'f', 'ix', 'f', + typs=['floats'], fails=TypeError) + self.check_result('label range', 'loc', 20, 'ix', 20, + typs=['ints', 'uints', 'mixed'], fails=KeyError) + self.check_result('label range', 'loc', 20, 'ix', 20, + typs=['labels'], fails=TypeError) + self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'], + axes=0, fails=TypeError) + self.check_result('label range', 'loc', 20, 'ix', 20, typs=['floats'], + axes=0, fails=TypeError) + + def test_loc_getitem_label_list(self): + + # list of labels + self.check_result('list lbl', 'loc', [0, 2, 4], 'ix', [0, 2, 4], + typs=['ints', 'uints'], axes=0) + self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9], + typs=['ints', 'uints'], axes=1) + self.check_result('list lbl', 'loc', [4, 8, 12], 'ix', [4, 8, 12], + typs=['ints', 'uints'], axes=2) + self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix', + ['a', 'b', 'd'], typs=['labels'], axes=0) + self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix', + ['A', 'B', 'C'], typs=['labels'], axes=1) + self.check_result('list lbl', 'loc', ['Z', 'Y', 'W'], 'ix', + ['Z', 'Y', 'W'], typs=['labels'], axes=2) + self.check_result('list lbl', 'loc', [2, 8, 'null'], 'ix', + [2, 8, 'null'], typs=['mixed'], axes=0) + self.check_result('list lbl', 'loc', + [Timestamp('20130102'), Timestamp('20130103')], 'ix', + [Timestamp('20130102'), Timestamp('20130103')], + typs=['ts'], axes=0) + + self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2], + typs=['empty'], fails=KeyError) + self.check_result('list lbl', 'loc', [0, 2, 3], 'ix', [0, 2, 3], + typs=['ints', 'uints'], axes=0, fails=KeyError) + self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7], + typs=['ints', 'uints'], axes=1, fails=KeyError) + self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], + typs=['ints', 'uints'], axes=2, fails=KeyError) + + def test_loc_getitem_label_list_fails(self): + # fails + self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], + typs=['ints', 'uints'], axes=1, fails=KeyError) + self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], + typs=['ints', 'uints'], axes=2, fails=KeyError) + + def test_loc_getitem_label_array_like(self): + # array like + self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, + 'ix', [0, 2, 4], typs=['ints', 'uints'], axes=0) + self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index, + 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) + self.check_result('array like', 'loc', Series(index=[4, 8, 12]).index, + 'ix', [4, 8, 12], typs=['ints', 
'uints'], axes=2) + + def test_loc_getitem_bool(self): + # boolean indexers + b = [True, False, True, False] + self.check_result('bool', 'loc', b, 'ix', b, + typs=['ints', 'uints', 'labels', + 'mixed', 'ts', 'floats']) + self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'], + fails=KeyError) + + def test_loc_getitem_int_slice(self): + + # ok + self.check_result('int slice2', 'loc', slice(2, 4), 'ix', [2, 4], + typs=['ints', 'uints'], axes=0) + self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6], + typs=['ints', 'uints'], axes=1) + self.check_result('int slice2', 'loc', slice(4, 8), 'ix', [4, 8], + typs=['ints', 'uints'], axes=2) + + # GH 3053 + # loc should treat integer slices like label slices + + index = MultiIndex.from_tuples([t for t in itertools.product( + [6, 7, 8], ['a', 'b'])]) + df = DataFrame(np.random.randn(6, 6), index, index) + result = df.loc[6:8, :] + expected = df + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([t + for t in itertools.product( + [10, 20, 30], ['a', 'b'])]) + df = DataFrame(np.random.randn(6, 6), index, index) + result = df.loc[20:30, :] + expected = df.iloc[2:] + tm.assert_frame_equal(result, expected) + + # doc examples + result = df.loc[10, :] + expected = df.iloc[0:2] + expected.index = ['a', 'b'] + tm.assert_frame_equal(result, expected) + + result = df.loc[:, 10] + # expected = df.ix[:,10] (this fails) + expected = df[10] + tm.assert_frame_equal(result, expected) + + def test_loc_to_fail(self): + + # GH3449 + df = DataFrame(np.random.random((3, 3)), + index=['a', 'b', 'c'], + columns=['e', 'f', 'g']) + + # raise a KeyError? + self.assertRaises(KeyError, df.loc.__getitem__, + tuple([[1, 2], [1, 2]])) + + # GH 7496 + # loc should not fallback + + s = Series() + s.loc[1] = 1 + s.loc['a'] = 2 + + self.assertRaises(KeyError, lambda: s.loc[-1]) + self.assertRaises(KeyError, lambda: s.loc[[-1, -2]]) + + self.assertRaises(KeyError, lambda: s.loc[['4']]) + + s.loc[-1] = 3 + result = s.loc[[-1, -2]] + expected = Series([3, np.nan], index=[-1, -2]) + tm.assert_series_equal(result, expected) + + s['a'] = 2 + self.assertRaises(KeyError, lambda: s.loc[[-2]]) + + del s['a'] + + def f(): + s.loc[[-2]] = 0 + + self.assertRaises(KeyError, f) + + # inconsistency between .loc[values] and .loc[values,:] + # GH 7999 + df = DataFrame([['a'], ['b']], index=[1, 2], columns=['value']) + + def f(): + df.loc[[3], :] + + self.assertRaises(KeyError, f) + + def f(): + df.loc[[3]] + + self.assertRaises(KeyError, f) + + def test_loc_getitem_label_slice(self): + + # label slices (with ints) + self.check_result('lab slice', 'loc', slice(1, 3), + 'ix', slice(1, 3), + typs=['labels', 'mixed', 'empty', 'ts', 'floats'], + fails=TypeError) + + # real label slices + self.check_result('lab slice', 'loc', slice('a', 'c'), + 'ix', slice('a', 'c'), typs=['labels'], axes=0) + self.check_result('lab slice', 'loc', slice('A', 'C'), + 'ix', slice('A', 'C'), typs=['labels'], axes=1) + self.check_result('lab slice', 'loc', slice('W', 'Z'), + 'ix', slice('W', 'Z'), typs=['labels'], axes=2) + + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=0) + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=1, fails=TypeError) + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=2, fails=TypeError) + + # GH 14316 + self.check_result('ts 
slice rev', 'loc', slice('20130104', '20130102'), + 'indexer', [0, 1, 2], typs=['ts_rev'], axes=0) + + self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), + typs=['mixed'], axes=0, fails=TypeError) + self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), + typs=['mixed'], axes=1, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), + typs=['mixed'], axes=2, fails=KeyError) + + self.check_result('mixed slice', 'loc', slice(2, 4, 2), 'ix', slice( + 2, 4, 2), typs=['mixed'], axes=0, fails=TypeError) + + def test_loc_general(self): + + df = DataFrame( + np.random.rand(4, 4), columns=['A', 'B', 'C', 'D'], + index=['A', 'B', 'C', 'D']) + + # want this to work + result = df.loc[:, "A":"B"].iloc[0:2, :] + self.assertTrue((result.columns == ['A', 'B']).all()) + self.assertTrue((result.index == ['A', 'B']).all()) + + # mixed type + result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0] + expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, object) + + def test_loc_setitem_consistency(self): + # GH 6149 + # coerce similary for setitem and loc when rows have a null-slice + expected = DataFrame({'date': Series(0, index=range(5), + dtype=np.int64), + 'val': Series(range(5), dtype=np.int64)}) + + df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), + 'val': Series( + range(5), dtype=np.int64)}) + df.loc[:, 'date'] = 0 + tm.assert_frame_equal(df, expected) + + df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), + 'val': Series(range(5), dtype=np.int64)}) + df.loc[:, 'date'] = np.array(0, dtype=np.int64) + tm.assert_frame_equal(df, expected) + + df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), + 'val': Series(range(5), dtype=np.int64)}) + df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64) + tm.assert_frame_equal(df, expected) + + expected = DataFrame({'date': Series('foo', index=range(5)), + 'val': Series(range(5), dtype=np.int64)}) + df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), + 'val': Series(range(5), dtype=np.int64)}) + df.loc[:, 'date'] = 'foo' + tm.assert_frame_equal(df, expected) + + expected = DataFrame({'date': Series(1.0, index=range(5)), + 'val': Series(range(5), dtype=np.int64)}) + df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), + 'val': Series(range(5), dtype=np.int64)}) + df.loc[:, 'date'] = 1.0 + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_consistency_empty(self): + # empty (essentially noops) + expected = DataFrame(columns=['x', 'y']) + expected['x'] = expected['x'].astype(np.int64) + df = DataFrame(columns=['x', 'y']) + df.loc[:, 'x'] = 1 + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=['x', 'y']) + df['x'] = 1 + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_consistency_slice_column_len(self): + # .loc[:,column] setting with slice == len of the column + # GH10408 + data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat +Level_1,,,Something,StartDate,EndDate,Yes/No,SomethingElse +Region,Site,RespondentID,,,,, +Region_1,Site_1,3987227376,A,5/25/2015 10:59,5/25/2015 11:22,Yes, +Region_1,Site_1,3980680971,A,5/21/2015 9:40,5/21/2015 9:52,Yes,Yes +Region_1,Site_2,3977723249,A,5/20/2015 8:27,5/20/2015 8:41,Yes, +Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No""" + + df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2]) + df.loc[:, 
('Respondent', 'StartDate')] = pd.to_datetime(df.loc[:, ( + 'Respondent', 'StartDate')]) + df.loc[:, ('Respondent', 'EndDate')] = pd.to_datetime(df.loc[:, ( + 'Respondent', 'EndDate')]) + df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( + 'Respondent', 'EndDate')] - df.loc[:, ('Respondent', 'StartDate')] + + df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( + 'Respondent', 'Duration')].astype('timedelta64[s]') + expected = Series([1380, 720, 840, 2160.], index=df.index, + name=('Respondent', 'Duration')) + tm.assert_series_equal(df[('Respondent', 'Duration')], expected) + + def test_loc_setitem_frame(self): + df = self.frame_labels + + result = df.iloc[0, 0] + + df.loc['a', 'A'] = 1 + result = df.loc['a', 'A'] + self.assertEqual(result, 1) + + result = df.iloc[0, 0] + self.assertEqual(result, 1) + + df.loc[:, 'B':'D'] = 0 + expected = df.loc[:, 'B':'D'] + result = df.iloc[:, 1:] + tm.assert_frame_equal(result, expected) + + # GH 6254 + # setting issue + df = DataFrame(index=[3, 5, 4], columns=['A']) + df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64') + expected = DataFrame(dict(A=Series( + [1, 2, 3], index=[4, 3, 5]))).reindex(index=[3, 5, 4]) + tm.assert_frame_equal(df, expected) + + # GH 6252 + # setting with an empty frame + keys1 = ['@' + str(i) for i in range(5)] + val1 = np.arange(5, dtype='int64') + + keys2 = ['@' + str(i) for i in range(4)] + val2 = np.arange(4, dtype='int64') + + index = list(set(keys1).union(keys2)) + df = DataFrame(index=index) + df['A'] = np.nan + df.loc[keys1, 'A'] = val1 + + df['B'] = np.nan + df.loc[keys2, 'B'] = val2 + + expected = DataFrame(dict(A=Series(val1, index=keys1), B=Series( + val2, index=keys2))).reindex(index=index) + tm.assert_frame_equal(df, expected) + + # GH 8669 + # invalid coercion of nan -> int + df = DataFrame({'A': [1, 2, 3], 'B': np.nan}) + df.loc[df.B > df.A, 'B'] = df.A + expected = DataFrame({'A': [1, 2, 3], 'B': np.nan}) + tm.assert_frame_equal(df, expected) + + # GH 6546 + # setting with mixed labels + df = DataFrame({1: [1, 2], 2: [3, 4], 'a': ['a', 'b']}) + + result = df.loc[0, [1, 2]] + expected = Series([1, 3], index=[1, 2], dtype=object, name=0) + tm.assert_series_equal(result, expected) + + expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']}) + df.loc[0, [1, 2]] = [5, 6] + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_frame_multiples(self): + # multiple setting + df = DataFrame({'A': ['foo', 'bar', 'baz'], + 'B': Series( + range(3), dtype=np.int64)}) + rhs = df.loc[1:2] + rhs.index = df.index[0:2] + df.loc[0:1] = rhs + expected = DataFrame({'A': ['bar', 'baz', 'baz'], + 'B': Series( + [1, 2, 2], dtype=np.int64)}) + tm.assert_frame_equal(df, expected) + + # multiple setting with frame on rhs (with M8) + df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), + 'val': Series( + range(5), dtype=np.int64)}) + expected = DataFrame({'date': [Timestamp('20000101'), Timestamp( + '20000102'), Timestamp('20000101'), Timestamp('20000102'), + Timestamp('20000103')], + 'val': Series( + [0, 1, 0, 1, 2], dtype=np.int64)}) + rhs = df.loc[0:2] + rhs.index = df.index[2:5] + df.loc[2:4] = rhs + tm.assert_frame_equal(df, expected) + + def test_loc_coerceion(self): + + # 12411 + df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) + expected = df.dtypes + + result = df.iloc[[0]] + tm.assert_series_equal(result.dtypes, expected) + + result = df.iloc[[1]] + tm.assert_series_equal(result.dtypes, expected) + + # 12045 + import datetime + df = DataFrame({'date': 
[datetime.datetime(2012, 1, 1),
+                                 datetime.datetime(1012, 1, 2)]})
+        expected = df.dtypes
+
+        result = df.iloc[[0]]
+        tm.assert_series_equal(result.dtypes, expected)
+
+        result = df.iloc[[1]]
+        tm.assert_series_equal(result.dtypes, expected)
+
+        # 11594
+        df = DataFrame({'text': ['some words'] + [None] * 9})
+        expected = df.dtypes
+
+        result = df.iloc[0:2]
+        tm.assert_series_equal(result.dtypes, expected)
+
+        result = df.iloc[3:]
+        tm.assert_series_equal(result.dtypes, expected)
+
+    def test_loc_non_unique(self):
+        # GH3659
+        # non-unique indexer with loc slice
+        # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs
+
+        # these are going to raise because we are non-monotonic
+        df = DataFrame({'A': [1, 2, 3, 4, 5, 6],
+                        'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3])
+        self.assertRaises(KeyError, df.loc.__getitem__,
+                          tuple([slice(1, None)]))
+        self.assertRaises(KeyError, df.loc.__getitem__,
+                          tuple([slice(0, None)]))
+        self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, 2)]))
+
+        # monotonic are ok
+        df = DataFrame({'A': [1, 2, 3, 4, 5, 6],
+                        'B': [3, 4, 5, 6, 7, 8]},
+                       index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0)
+        result = df.loc[1:]
+        expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]},
+                             index=[1, 1, 2, 3])
+        tm.assert_frame_equal(result, expected)
+
+        result = df.loc[0:]
+        tm.assert_frame_equal(result, df)
+
+        result = df.loc[1:2]
+        expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]},
+                             index=[1, 1, 2])
+        tm.assert_frame_equal(result, expected)
+
+    def test_loc_non_unique_memory_error(self):
+
+        # GH 4280
+        # non_unique index with a large selection triggers a memory error
+
+        columns = list('ABCDEFG')
+
+        def gen_test(l, l2):
+            return pd.concat([
+                DataFrame(np.random.randn(l, len(columns)),
+                          index=lrange(l), columns=columns),
+                DataFrame(np.ones((l2, len(columns))),
+                          index=[0] * l2, columns=columns)])
+
+        def gen_expected(df, mask):
+            l = len(mask)
+            return pd.concat([df.take([0], convert=False),
+                              DataFrame(np.ones((l, len(columns))),
+                                        index=[0] * l,
+                                        columns=columns),
+                              df.take(mask[1:], convert=False)])
+
+        df = gen_test(900, 100)
+        self.assertFalse(df.index.is_unique)
+
+        mask = np.arange(100)
+        result = df.loc[mask]
+        expected = gen_expected(df, mask)
+        tm.assert_frame_equal(result, expected)
+
+        df = gen_test(900000, 100000)
+        self.assertFalse(df.index.is_unique)
+
+        mask = np.arange(100000)
+        result = df.loc[mask]
+        expected = gen_expected(df, mask)
+        tm.assert_frame_equal(result, expected)
+
+    def test_loc_name(self):
+        # GH 3880
+        df = DataFrame([[1, 1], [1, 1]])
+        df.index.name = 'index_name'
+        result = df.iloc[[0, 1]].index.name
+        self.assertEqual(result, 'index_name')
+
+        with catch_warnings(record=True):
+            result = df.ix[[0, 1]].index.name
+        self.assertEqual(result, 'index_name')
+
+        result = df.loc[[0, 1]].index.name
+        self.assertEqual(result, 'index_name')
+
+    def test_loc_empty_list_indexer_is_ok(self):
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        df = mkdf(5, 2)
+        # vertical empty
+        tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0],
+                              check_index_type=True, check_column_type=True)
+        # horizontal empty
+        tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :],
+                              check_index_type=True, check_column_type=True)
+        # horizontal empty
+        tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :],
+                              check_index_type=True,
+                              check_column_type=True)
diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py
index b40f0b8cd9976..ed943202872a7 100644
--- 
a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -46,101 +46,103 @@ def test_iloc_getitem_multiindex2(self): tm.assert_frame_equal(rs, xp) def test_setitem_multiindex(self): - for index_fn in ('ix', 'loc'): - - def check(target, indexers, value, compare_fn, expected=None): - fn = getattr(target, index_fn) - fn.__setitem__(indexers, value) - result = fn.__getitem__(indexers) - if expected is None: - expected = value - compare_fn(result, expected) - # GH7190 - index = pd.MultiIndex.from_product([np.arange(0, 100), - np.arange(0, 80)], - names=['time', 'firm']) - t, n = 0, 2 - df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=0, - compare_fn=self.assertEqual) - - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=1, - compare_fn=self.assertEqual) - - df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=2, - compare_fn=self.assertEqual) - - # GH 7218, assinging with 0-dim arrays - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, - indexers=((t, n), 'X'), - value=np.array(3), - compare_fn=self.assertEqual, - expected=3, ) - - # GH5206 - df = pd.DataFrame(np.arange(25).reshape(5, 5), - columns='A,B,C,D,E'.split(','), dtype=float) - df['F'] = 99 - row_selection = df['A'] % 2 == 0 - col_selection = ['B', 'C'] - with catch_warnings(record=True): - df.ix[row_selection, col_selection] = df['F'] - output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) - with catch_warnings(record=True): - tm.assert_frame_equal(df.ix[row_selection, col_selection], - output) - check(target=df, - indexers=(row_selection, col_selection), - value=df['F'], - compare_fn=tm.assert_frame_equal, - expected=output, ) - - # GH11372 - idx = pd.MultiIndex.from_product([ - ['A', 'B', 'C'], - pd.date_range('2015-01-01', '2015-04-01', freq='MS')]) - cols = pd.MultiIndex.from_product([ - ['foo', 'bar'], - pd.date_range('2016-01-01', '2016-02-01', freq='MS')]) - - df = pd.DataFrame(np.random.random((12, 4)), - index=idx, columns=cols) - - subidx = pd.MultiIndex.from_tuples( - [('A', pd.Timestamp('2015-01-01')), - ('A', pd.Timestamp('2015-02-01'))]) - subcols = pd.MultiIndex.from_tuples( - [('foo', pd.Timestamp('2016-01-01')), - ('foo', pd.Timestamp('2016-02-01'))]) - - vals = pd.DataFrame(np.random.random((2, 2)), - index=subidx, columns=subcols) - check(target=df, - indexers=(subidx, subcols), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # set all columns - vals = pd.DataFrame( - np.random.random((2, 4)), index=subidx, columns=cols) - check(target=df, - indexers=(subidx, slice(None, None, None)), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # identity - copy = df.copy() - check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=tm.assert_frame_equal, expected=copy) + with catch_warnings(record=True): + + for index_fn in ('ix', 'loc'): + + def check(target, indexers, value, compare_fn, expected=None): + fn = getattr(target, index_fn) + fn.__setitem__(indexers, value) + result = fn.__getitem__(indexers) + if expected is None: + expected = value + compare_fn(result, expected) + # GH7190 + index = pd.MultiIndex.from_product([np.arange(0, 100), + np.arange(0, 80)], + names=['time', 'firm']) + t, n = 0, 2 + df = DataFrame(np.nan, 
columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=0, + compare_fn=self.assertEqual) + + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=1, + compare_fn=self.assertEqual) + + df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=2, + compare_fn=self.assertEqual) + + # GH 7218, assinging with 0-dim arrays + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, + indexers=((t, n), 'X'), + value=np.array(3), + compare_fn=self.assertEqual, + expected=3, ) + + # GH5206 + df = pd.DataFrame(np.arange(25).reshape(5, 5), + columns='A,B,C,D,E'.split(','), dtype=float) + df['F'] = 99 + row_selection = df['A'] % 2 == 0 + col_selection = ['B', 'C'] + with catch_warnings(record=True): + df.ix[row_selection, col_selection] = df['F'] + output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) + with catch_warnings(record=True): + tm.assert_frame_equal(df.ix[row_selection, col_selection], + output) + check(target=df, + indexers=(row_selection, col_selection), + value=df['F'], + compare_fn=tm.assert_frame_equal, + expected=output, ) + + # GH11372 + idx = pd.MultiIndex.from_product([ + ['A', 'B', 'C'], + pd.date_range('2015-01-01', '2015-04-01', freq='MS')]) + cols = pd.MultiIndex.from_product([ + ['foo', 'bar'], + pd.date_range('2016-01-01', '2016-02-01', freq='MS')]) + + df = pd.DataFrame(np.random.random((12, 4)), + index=idx, columns=cols) + + subidx = pd.MultiIndex.from_tuples( + [('A', pd.Timestamp('2015-01-01')), + ('A', pd.Timestamp('2015-02-01'))]) + subcols = pd.MultiIndex.from_tuples( + [('foo', pd.Timestamp('2016-01-01')), + ('foo', pd.Timestamp('2016-02-01'))]) + + vals = pd.DataFrame(np.random.random((2, 2)), + index=subidx, columns=subcols) + check(target=df, + indexers=(subidx, subcols), + value=vals, + compare_fn=tm.assert_frame_equal, ) + # set all columns + vals = pd.DataFrame( + np.random.random((2, 4)), index=subidx, columns=cols) + check(target=df, + indexers=(subidx, slice(None, None, None)), + value=vals, + compare_fn=tm.assert_frame_equal, ) + # identity + copy = df.copy() + check(target=df, indexers=(df.index, df.columns), value=df, + compare_fn=tm.assert_frame_equal, expected=copy) def test_loc_getitem_series(self): # GH14730 @@ -559,32 +561,37 @@ def test_multiindex_assignment(self): df['d'] = np.nan arr = np.array([0., 1.]) - df.ix[4, 'd'] = arr - tm.assert_series_equal(df.ix[4, 'd'], - Series(arr, index=[8, 10], name='d')) + with catch_warnings(record=True): + df.ix[4, 'd'] = arr + tm.assert_series_equal(df.ix[4, 'd'], + Series(arr, index=[8, 10], name='d')) # single dtype df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), columns=list('abc'), index=[[4, 4, 8], [8, 10, 12]]) - df.ix[4, 'c'] = arr - exp = Series(arr, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) + with catch_warnings(record=True): + df.ix[4, 'c'] = arr + exp = Series(arr, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) # scalar ok - df.ix[4, 'c'] = 10 - exp = Series(10, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) + with catch_warnings(record=True): + df.ix[4, 'c'] = 10 + exp = Series(10, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) # 
invalid assignments
         def f():
-            df.ix[4, 'c'] = [0, 1, 2, 3]
+            with catch_warnings(record=True):
+                df.ix[4, 'c'] = [0, 1, 2, 3]
 
         self.assertRaises(ValueError, f)
 
         def f():
-            df.ix[4, 'c'] = [0]
+            with catch_warnings(record=True):
+                df.ix[4, 'c'] = [0]
 
         self.assertRaises(ValueError, f)
 
@@ -614,7 +621,8 @@ def f(name, df2):
         # but in this case, that's ok
         for name, df2 in grp:
             new_vals = np.arange(df2.shape[0])
-            df.ix[name, 'new_col'] = new_vals
+            with catch_warnings(record=True):
+                df.ix[name, 'new_col'] = new_vals
 
     def test_multiindex_label_slicing_with_negative_step(self):
         s = Series(np.arange(20),
@@ -624,7 +632,8 @@ def test_multiindex_label_slicing_with_negative_step(self):
         def assert_slices_equivalent(l_slc, i_slc):
             tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc])
             tm.assert_series_equal(s[l_slc], s.iloc[i_slc])
-            tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc])
+            with catch_warnings(record=True):
+                tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc])
 
         assert_slices_equivalent(SLC[::-1], SLC[::-1])
diff --git a/pandas/tests/indexing/test_panel.py b/pandas/tests/indexing/test_panel.py
index 5ec3076af599a..0677ea498c282 100644
--- a/pandas/tests/indexing/test_panel.py
+++ b/pandas/tests/indexing/test_panel.py
@@ -1,3 +1,5 @@
+from warnings import catch_warnings
+
 import numpy as np
 from pandas.util import testing as tm
 from pandas import Panel, date_range, DataFrame
@@ -112,8 +114,8 @@ def test_panel_getitem(self):
             len(ind), 5), index=ind, columns=list('ABCDE'))
         panel = Panel(dict([('frame_' + c, df) for c in list('ABC')]))
 
-        test2 = panel.ix[:, "2002":"2002-12-31"]
-        test1 = panel.ix[:, "2002"]
+        test2 = panel.loc[:, "2002":"2002-12-31"]
+        test1 = panel.loc[:, "2002"]
         tm.assert_panel_equal(test1, test2)
 
         # GH8710
@@ -134,10 +136,8 @@ def test_panel_getitem(self):
         result = panel.loc['ItemA':'ItemB']
         tm.assert_panel_equal(result, expected)
 
-        result = panel.ix['ItemA':'ItemB']
-        tm.assert_panel_equal(result, expected)
-
-        result = panel.ix[['ItemA', 'ItemB']]
+        with catch_warnings(record=True):
+            result = panel.ix[['ItemA', 'ItemB']]
         tm.assert_panel_equal(result, expected)
 
         # with an object-like
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
new file mode 100644
index 0000000000000..a00f880ff6591
--- /dev/null
+++ b/pandas/tests/indexing/test_partial.py
@@ -0,0 +1,587 @@
+"""
+test setting *parts* of objects both positionally and label based
+
+TODO: these should be split among the indexer tests
+"""
+from warnings import catch_warnings
+import numpy as np
+
+import pandas as pd
+from pandas import Series, DataFrame, Panel, Index, date_range
+from pandas.util import testing as tm
+
+
+class TestPartialSetting(tm.TestCase):
+
+    def test_partial_setting(self):
+
+        # GH2578, allow ix and friends to partially set
+
+        # series
+        s_orig = Series([1, 2, 3])
+
+        s = s_orig.copy()
+        s[5] = 5
+        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
+        tm.assert_series_equal(s, expected)
+
+        s = s_orig.copy()
+        s.loc[5] = 5
+        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
+        tm.assert_series_equal(s, expected)
+
+        s = s_orig.copy()
+        s[5] = 5.
+        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
+        tm.assert_series_equal(s, expected)
+
+        s = s_orig.copy()
+        s.loc[5] = 5.
+        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
+        tm.assert_series_equal(s, expected)
+
+        # iloc/iat raise
+        s = s_orig.copy()
+
+        def f():
+            s.iloc[3] = 5.
+
+        self.assertRaises(IndexError, f)
+
+        def f():
+            s.iat[3] = 5.
+ + self.assertRaises(IndexError, f) + + # ## frame ## + + df_orig = DataFrame( + np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64') + + # iloc/iat raise + df = df_orig.copy() + + def f(): + df.iloc[4, 2] = 5. + + self.assertRaises(IndexError, f) + + def f(): + df.iat[4, 2] = 5. + + self.assertRaises(IndexError, f) + + # row setting where it exists + expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) + df = df_orig.copy() + df.iloc[1] = df.iloc[2] + tm.assert_frame_equal(df, expected) + + expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) + df = df_orig.copy() + df.loc[1] = df.loc[2] + tm.assert_frame_equal(df, expected) + + # like 2578, partial setting with dtype preservation + expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})) + df = df_orig.copy() + df.loc[3] = df.loc[2] + tm.assert_frame_equal(df, expected) + + # single dtype frame, overwrite + expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]})) + df = df_orig.copy() + with catch_warnings(record=True): + df.ix[:, 'B'] = df.ix[:, 'A'] + tm.assert_frame_equal(df, expected) + + # mixed dtype frame, overwrite + expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])})) + df = df_orig.copy() + df['B'] = df['B'].astype(np.float64) + with catch_warnings(record=True): + df.ix[:, 'B'] = df.ix[:, 'A'] + tm.assert_frame_equal(df, expected) + + # single dtype frame, partial setting + expected = df_orig.copy() + expected['C'] = df['A'] + df = df_orig.copy() + with catch_warnings(record=True): + df.ix[:, 'C'] = df.ix[:, 'A'] + tm.assert_frame_equal(df, expected) + + # mixed frame, partial setting + expected = df_orig.copy() + expected['C'] = df['A'] + df = df_orig.copy() + with catch_warnings(record=True): + df.ix[:, 'C'] = df.ix[:, 'A'] + tm.assert_frame_equal(df, expected) + + # ## panel ## + p_orig = Panel(np.arange(16).reshape(2, 4, 2), + items=['Item1', 'Item2'], + major_axis=pd.date_range('2001/1/12', periods=4), + minor_axis=['A', 'B'], dtype='float64') + + # panel setting via item + p_orig = Panel(np.arange(16).reshape(2, 4, 2), + items=['Item1', 'Item2'], + major_axis=pd.date_range('2001/1/12', periods=4), + minor_axis=['A', 'B'], dtype='float64') + expected = p_orig.copy() + expected['Item3'] = expected['Item1'] + p = p_orig.copy() + p.loc['Item3'] = p['Item1'] + tm.assert_panel_equal(p, expected) + + # panel with aligned series + expected = p_orig.copy() + expected = expected.transpose(2, 1, 0) + expected['C'] = DataFrame({'Item1': [30, 30, 30, 30], + 'Item2': [32, 32, 32, 32]}, + index=p_orig.major_axis) + expected = expected.transpose(2, 1, 0) + p = p_orig.copy() + p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items) + tm.assert_panel_equal(p, expected) + + # GH 8473 + dates = date_range('1/1/2000', periods=8) + df_orig = DataFrame(np.random.randn(8, 4), index=dates, + columns=['A', 'B', 'C', 'D']) + + expected = pd.concat([df_orig, DataFrame( + {'A': 7}, index=[dates[-1] + 1])]) + df = df_orig.copy() + df.loc[dates[-1] + 1, 'A'] = 7 + tm.assert_frame_equal(df, expected) + df = df_orig.copy() + df.at[dates[-1] + 1, 'A'] = 7 + tm.assert_frame_equal(df, expected) + + exp_other = DataFrame({0: 7}, index=[dates[-1] + 1]) + expected = pd.concat([df_orig, exp_other], axis=1) + + df = df_orig.copy() + df.loc[dates[-1] + 1, 0] = 7 + tm.assert_frame_equal(df, expected) + df = df_orig.copy() + df.at[dates[-1] + 1, 0] = 7 + tm.assert_frame_equal(df, expected) + + def test_partial_setting_mixed_dtype(self): + + # in a mixed dtype environment, try to preserve dtypes + # by 
appending + df = DataFrame([[True, 1], [False, 2]], columns=["female", "fitness"]) + + s = df.loc[1].copy() + s.name = 2 + expected = df.append(s) + + df.loc[2] = df.loc[1] + tm.assert_frame_equal(df, expected) + + # columns will align + df = DataFrame(columns=['A', 'B']) + df.loc[0] = Series(1, index=range(4)) + tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) + + # columns will align + df = DataFrame(columns=['A', 'B']) + df.loc[0] = Series(1, index=['B']) + + exp = DataFrame([[np.nan, 1]], columns=['A', 'B'], + index=[0], dtype='float64') + tm.assert_frame_equal(df, exp) + + # list-like must conform + df = DataFrame(columns=['A', 'B']) + + def f(): + df.loc[0] = [1, 2, 3] + + self.assertRaises(ValueError, f) + + # these are coerced to float unavoidably (as its a list-like to begin) + df = DataFrame(columns=['A', 'B']) + df.loc[3] = [6, 7] + + exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], + dtype='float64') + tm.assert_frame_equal(df, exp) + + def test_series_partial_set(self): + # partial set with new index + # Regression from GH4825 + ser = Series([0.1, 0.2], index=[1, 2]) + + # loc + expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) + result = ser.loc[[3, 2, 3]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) + result = ser.loc[[3, 2, 3, 'x']] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) + result = ser.loc[[2, 2, 1]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) + result = ser.loc[[2, 2, 'x', 1]] + tm.assert_series_equal(result, expected, check_index_type=True) + + # raises as nothing in in the index + self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) + + expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) + result = ser.loc[[2, 2, 3]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) + result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) + result = Series([0.1, 0.2, 0.3, 0.4], + index=[1, 2, 3, 4]).loc[[5, 3, 3]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) + result = Series([0.1, 0.2, 0.3, 0.4], + index=[1, 2, 3, 4]).loc[[5, 4, 4]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) + result = Series([0.1, 0.2, 0.3, 0.4], + index=[4, 5, 6, 7]).loc[[7, 2, 2]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) + result = Series([0.1, 0.2, 0.3, 0.4], + index=[1, 2, 3, 4]).loc[[4, 5, 5]] + tm.assert_series_equal(result, expected, check_index_type=True) + + # iloc + expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1]) + result = ser.iloc[[1, 1, 0, 0]] + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_series_partial_set_with_name(self): + # GH 11497 + + idx = Index([1, 2], dtype='int64', name='idx') + ser = Series([0.1, 0.2], index=idx, name='s') + + # loc + exp_idx = Index([3, 2, 3], dtype='int64', name='idx') + expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') + result = ser.loc[[3, 2, 3]] + 
tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') + expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, + name='s') + result = ser.loc[[3, 2, 3, 'x']] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([2, 2, 1], dtype='int64', name='idx') + expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') + result = ser.loc[[2, 2, 1]] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') + expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') + result = ser.loc[[2, 2, 'x', 1]] + tm.assert_series_equal(result, expected, check_index_type=True) + + # raises as nothing in in the index + self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) + + exp_idx = Index([2, 2, 3], dtype='int64', name='idx') + expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') + result = ser.loc[[2, 2, 3]] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([3, 4, 4], dtype='int64', name='idx') + expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') + idx = Index([1, 2, 3], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3], index=idx, name='s').loc[[3, 4, 4]] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([5, 3, 3], dtype='int64', name='idx') + expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') + idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, + name='s').loc[[5, 3, 3]] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([5, 4, 4], dtype='int64', name='idx') + expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') + idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, + name='s').loc[[5, 4, 4]] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([7, 2, 2], dtype='int64', name='idx') + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') + idx = Index([4, 5, 6, 7], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, + name='s').loc[[7, 2, 2]] + tm.assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([4, 5, 5], dtype='int64', name='idx') + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') + idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, + name='s').loc[[4, 5, 5]] + tm.assert_series_equal(result, expected, check_index_type=True) + + # iloc + exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') + expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') + result = ser.iloc[[1, 1, 0, 0]] + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_partial_set_invalid(self): + + # GH 4940 + # allow only setting of 'valid' values + + orig = tm.makeTimeDataFrame() + df = orig.copy() + + # don't allow not string inserts + def f(): + with catch_warnings(record=True): + df.loc[100.0, :] = df.ix[0] + + self.assertRaises(TypeError, f) + + def f(): + with catch_warnings(record=True): + df.loc[100, :] = df.ix[0] + + self.assertRaises(TypeError, f) + + def f(): + with catch_warnings(record=True): + df.ix[100.0, :] = df.ix[0] + + self.assertRaises(TypeError, f) + + def f(): + with catch_warnings(record=True): + df.ix[100, :] = df.ix[0] + + 
self.assertRaises(ValueError, f) + + # allow object conversion here + df = orig.copy() + with catch_warnings(record=True): + df.loc['a', :] = df.ix[0] + exp = orig.append(pd.Series(df.ix[0], name='a')) + tm.assert_frame_equal(df, exp) + tm.assert_index_equal(df.index, + pd.Index(orig.index.tolist() + ['a'])) + self.assertEqual(df.index.dtype, 'object') + + def test_partial_set_empty_series(self): + + # GH5226 + + # partially set with an empty object series + s = Series() + s.loc[1] = 1 + tm.assert_series_equal(s, Series([1], index=[1])) + s.loc[3] = 3 + tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) + + s = Series() + s.loc[1] = 1. + tm.assert_series_equal(s, Series([1.], index=[1])) + s.loc[3] = 3. + tm.assert_series_equal(s, Series([1., 3.], index=[1, 3])) + + s = Series() + s.loc['foo'] = 1 + tm.assert_series_equal(s, Series([1], index=['foo'])) + s.loc['bar'] = 3 + tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) + s.loc[3] = 4 + tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + + def test_partial_set_empty_frame(self): + + # partially set with an empty object + # frame + df = DataFrame() + + def f(): + df.loc[1] = 1 + + self.assertRaises(ValueError, f) + + def f(): + df.loc[1] = Series([1], index=['foo']) + + self.assertRaises(ValueError, f) + + def f(): + df.loc[:, 1] = 1 + + self.assertRaises(ValueError, f) + + # these work as they don't really change + # anything but the index + # GH5632 + expected = DataFrame(columns=['foo'], index=pd.Index( + [], dtype='int64')) + + def f(): + df = DataFrame() + df['foo'] = Series([], dtype='object') + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame() + df['foo'] = Series(df.index) + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame() + df['foo'] = df.index + return df + + tm.assert_frame_equal(f(), expected) + + expected = DataFrame(columns=['foo'], + index=pd.Index([], dtype='int64')) + expected['foo'] = expected['foo'].astype('float64') + + def f(): + df = DataFrame() + df['foo'] = [] + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame() + df['foo'] = Series(range(len(df))) + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame() + tm.assert_index_equal(df.index, pd.Index([], dtype='object')) + df['foo'] = range(len(df)) + return df + + expected = DataFrame(columns=['foo'], + index=pd.Index([], dtype='int64')) + expected['foo'] = expected['foo'].astype('float64') + tm.assert_frame_equal(f(), expected) + + df = DataFrame() + tm.assert_index_equal(df.columns, pd.Index([], dtype=object)) + df2 = DataFrame() + df2[1] = Series([1], index=['foo']) + df.loc[:, 1] = Series([1], index=['foo']) + tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) + tm.assert_frame_equal(df, df2) + + # no index to start + expected = DataFrame({0: Series(1, index=range(4))}, + columns=['A', 'B', 0]) + + df = DataFrame(columns=['A', 'B']) + df[0] = Series(1, index=range(4)) + df.dtypes + str(df) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=['A', 'B']) + df.loc[:, 0] = Series(1, index=range(4)) + df.dtypes + str(df) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_row(self): + # GH5720, GH5744 + # don't create rows when empty + expected = DataFrame(columns=['A', 'B', 'New'], + index=pd.Index([], dtype='int64')) + expected['A'] = expected['A'].astype('int64') + expected['B'] = expected['B'].astype('float64') + expected['New'] = 
expected['New'].astype('float64') + + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + y['New'] = np.nan + tm.assert_frame_equal(y, expected) + # tm.assert_frame_equal(y,expected) + + expected = DataFrame(columns=['a', 'b', 'c c', 'd']) + expected['d'] = expected['d'].astype('int64') + df = DataFrame(columns=['a', 'b', 'c c']) + df['d'] = 3 + tm.assert_frame_equal(df, expected) + tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object)) + + # reindex columns is ok + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + result = y.reindex(columns=['A', 'B', 'C']) + expected = DataFrame(columns=['A', 'B', 'C'], + index=pd.Index([], dtype='int64')) + expected['A'] = expected['A'].astype('int64') + expected['B'] = expected['B'].astype('float64') + expected['C'] = expected['C'].astype('float64') + tm.assert_frame_equal(result, expected) + + def test_partial_set_empty_frame_set_series(self): + # GH 5756 + # setting with empty Series + df = DataFrame(Series()) + tm.assert_frame_equal(df, DataFrame({0: Series()})) + + df = DataFrame(Series(name='foo')) + tm.assert_frame_equal(df, DataFrame({'foo': Series()})) + + def test_partial_set_empty_frame_empty_copy_assignment(self): + # GH 5932 + # copy on empty with assignment fails + df = DataFrame(index=[0]) + df = df.copy() + df['a'] = 0 + expected = DataFrame(0, index=[0], columns=['a']) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_empty_consistencies(self): + # GH 6171 + # consistency on empty frames + df = DataFrame(columns=['x', 'y']) + df['x'] = [1, 2] + expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) + tm.assert_frame_equal(df, expected, check_dtype=False) + + df = DataFrame(columns=['x', 'y']) + df['x'] = ['1', '2'] + expected = DataFrame( + dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=['x', 'y']) + df.loc[0, 'x'] = 1 + expected = DataFrame(dict(x=[1], y=[np.nan])) + tm.assert_frame_equal(df, expected, check_dtype=False) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py new file mode 100644 index 0000000000000..4e81cd01cd5d2 --- /dev/null +++ b/pandas/tests/indexing/test_scalar.py @@ -0,0 +1,156 @@ +""" test scalar indexing, including at and iat """ + +import numpy as np + +from pandas import (Series, DataFrame, Timestamp, + Timedelta, date_range) +from pandas.util import testing as tm +from pandas.tests.indexing.common import Base + + +class TestScalar(Base, tm.TestCase): + + def test_at_and_iat_get(self): + def _check(f, func, values=False): + + if f is not None: + indicies = self.generate_indices(f, values) + for i in indicies: + result = getattr(f, func)[i] + expected = self.get_value(f, i, values) + tm.assert_almost_equal(result, expected) + + for o in self._objs: + + d = getattr(self, o) + + # iat + for f in [d['ints'], d['uints']]: + _check(f, 'iat', values=True) + + for f in [d['labels'], d['ts'], d['floats']]: + if f is not None: + self.assertRaises(ValueError, self.check_values, f, 'iat') + + # at + for f in [d['ints'], d['uints'], d['labels'], + d['ts'], d['floats']]: + _check(f, 'at') + + def test_at_and_iat_set(self): + def _check(f, func, values=False): + + if f is not None: + indicies = self.generate_indices(f, values) + for i in indicies: + getattr(f, func)[i] = 1 + expected = self.get_value(f, i, values) + tm.assert_almost_equal(expected, 1) + + for t in self._objs: + + d = getattr(self, t) + + # iat + for f in 
[d['ints'], d['uints']]: + _check(f, 'iat', values=True) + + for f in [d['labels'], d['ts'], d['floats']]: + if f is not None: + self.assertRaises(ValueError, _check, f, 'iat') + + # at + for f in [d['ints'], d['uints'], d['labels'], + d['ts'], d['floats']]: + _check(f, 'at') + + def test_at_iat_coercion(self): + + # as timestamp is not a tuple! + dates = date_range('1/1/2000', periods=8) + df = DataFrame(np.random.randn(8, 4), + index=dates, + columns=['A', 'B', 'C', 'D']) + s = df['A'] + + result = s.at[dates[5]] + xp = s.values[5] + self.assertEqual(result, xp) + + # GH 7729 + # make sure we are boxing the returns + s = Series(['2014-01-01', '2014-02-02'], dtype='datetime64[ns]') + expected = Timestamp('2014-02-02') + + for r in [lambda: s.iat[1], lambda: s.iloc[1]]: + result = r() + self.assertEqual(result, expected) + + s = Series(['1 days', '2 days'], dtype='timedelta64[ns]') + expected = Timedelta('2 days') + + for r in [lambda: s.iat[1], lambda: s.iloc[1]]: + result = r() + self.assertEqual(result, expected) + + def test_iat_invalid_args(self): + pass + + def test_imethods_with_dups(self): + + # GH6493 + # iat/iloc with dups + + s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64') + result = s.iloc[2] + self.assertEqual(result, 2) + result = s.iat[2] + self.assertEqual(result, 2) + + self.assertRaises(IndexError, lambda: s.iat[10]) + self.assertRaises(IndexError, lambda: s.iat[-10]) + + result = s.iloc[[2, 3]] + expected = Series([2, 3], [2, 2], dtype='int64') + tm.assert_series_equal(result, expected) + + df = s.to_frame() + result = df.iloc[2] + expected = Series(2, index=[0], name=2) + tm.assert_series_equal(result, expected) + + result = df.iat[2, 0] + expected = 2 + self.assertEqual(result, 2) + + def test_at_to_fail(self): + # at should not fallback + # GH 7814 + s = Series([1, 2, 3], index=list('abc')) + result = s.at['a'] + self.assertEqual(result, 1) + self.assertRaises(ValueError, lambda: s.at[0]) + + df = DataFrame({'A': [1, 2, 3]}, index=list('abc')) + result = df.at['a', 'A'] + self.assertEqual(result, 1) + self.assertRaises(ValueError, lambda: df.at['a', 0]) + + s = Series([1, 2, 3], index=[3, 2, 1]) + result = s.at[1] + self.assertEqual(result, 3) + self.assertRaises(ValueError, lambda: s.at['a']) + + df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1]) + result = df.at[1, 0] + self.assertEqual(result, 3) + self.assertRaises(ValueError, lambda: df.at['a', 0]) + + # GH 13822, incorrect error string with non-unique columns when missing + # column is accessed + df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]}) + df.columns = ['x', 'x', 'z'] + + # Check that we get the correct value in the KeyError + self.assertRaisesRegexp(KeyError, r"\['y'\] not in index", + lambda: df[['x', 'y', 'z']]) From b28ac2733ba867315d7f24a16dd0c169cbce3bad Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 10 Mar 2017 06:25:22 -0500 Subject: [PATCH 270/338] BUG: Incorrect value updating for groupby.cummin/max (#15635) closes #15635 Author: Matt Roeschke Closes #15642 from mroeschke/fix_15635 and squashes the following commits: b92b81a [Matt Roeschke] BUG: Incorrect value updating for groupby.cummin/max (#15635) --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/_libs/algos_groupby_helper.pxi.in | 20 ++++++++++---------- pandas/tests/groupby/test_groupby.py | 11 +++++++++++ 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cf3dddc3a2933..47aa4450b897f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ 
b/doc/source/whatsnew/v0.20.0.txt @@ -716,7 +716,7 @@ Performance Improvements - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). -- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`) +- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`, :issue:`15635`) - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`) - Improved performance of ``.rank()`` for categorical data (:issue:`15498`) diff --git a/pandas/_libs/algos_groupby_helper.pxi.in b/pandas/_libs/algos_groupby_helper.pxi.in index 9552b4299fe6a..e2c263f49b110 100644 --- a/pandas/_libs/algos_groupby_helper.pxi.in +++ b/pandas/_libs/algos_groupby_helper.pxi.in @@ -603,7 +603,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, min_val = 0 + {{dest_type2}} val, mval ndarray[{{dest_type2}}, ndim=2] accum int64_t lab @@ -628,10 +628,10 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{else}} if val == val: {{endif}} - if val < accum[lab, j]: - min_val = val - accum[lab, j] = min_val - out[i, j] = accum[lab, j] + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval @cython.boundscheck(False) @@ -645,7 +645,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, max_val = 0 + {{dest_type2}} val, mval ndarray[{{dest_type2}}, ndim=2] accum int64_t lab @@ -669,10 +669,10 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{else}} if val == val: {{endif}} - if val > accum[lab, j]: - max_val = val - accum[lab, j] = max_val - out[i, j] = accum[lab, j] + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval {{endfor}} diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e846963732883..d7fa3beda0abf 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4303,6 +4303,17 @@ def test_cummin_cummax(self): result = getattr(df.groupby('a')['b'], method)() tm.assert_series_equal(expected, result) + # GH 15635 + df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) + result = df.groupby('a').b.cummax() + expected = pd.Series([2, 1, 2], name='b') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) + result = df.groupby('a').b.cummin() + expected = pd.Series([1, 2, 1], name='b') + tm.assert_series_equal(result, expected) + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From fd9c9edbc60a61cd4306993c49d513ab3f3926b3 Mon Sep 17 00:00:00 2001 From: linebp Date: Fri, 10 Mar 2017 13:07:10 +0100 Subject: [PATCH 271/338] DOC GH15643 Removed pytest-xdist from requirements_dev.txt file (#15646) --- ci/requirements_dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index b0a8adc8df5cb..1e051802ec9f8 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ 
-4,5 +4,4 @@ numpy cython pytest pytest-cov -pytest-xdist flake8 From 33c05cd1a92bcbe83809f32f3e44392c92f5056a Mon Sep 17 00:00:00 2001 From: JennaVergeynst Date: Fri, 10 Mar 2017 14:44:26 +0100 Subject: [PATCH 272/338] DOC: add examples to DataFrame.dropna (#15620) --- pandas/core/frame.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2062f301b9e0e..987eb10101f12 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3084,6 +3084,50 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, Returns ------- dropped : DataFrame + + Examples + -------- + >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, 5]], + ... columns=list('ABCD')) + >>> df + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 NaN NaN NaN 5 + + Drop the columns where all elements are nan: + + >>> df.dropna(axis=1, how='all') + A B D + 0 NaN 2.0 0 + 1 3.0 4.0 1 + 2 NaN NaN 5 + + Drop the columns where any of the elements is nan + + >>> df.dropna(axis=1, how='any') + D + 0 0 + 1 1 + 2 5 + + Drop the rows where all of the elements are nan + (there is no row to drop, so df stays the same): + + >>> df.dropna(axis=0, how='all') + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 NaN NaN NaN 5 + + Keep only the rows with at least 2 non-na values: + + >>> df.dropna(thresh=2) + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + """ inplace = validate_bool_kwarg(inplace, 'inplace') if isinstance(axis, (tuple, list)): From c1065da72996165f792c17f0effec07c1a2f7225 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 10 Mar 2017 09:24:46 -0500 Subject: [PATCH 273/338] DOC: doc warnings Author: Jeff Reback Closes #15647 from jreback/doc and squashes the following commits: 6afb394 [Jeff Reback] gbq install adjustment 0fd4499 [Jeff Reback] maybe d5ec228 [Jeff Reback] DOC: fixup some doc-links b7ea898 [Jeff Reback] DOC: some deprecation warnings removed --- doc/source/index.rst.template | 1 - doc/source/install.rst | 2 +- doc/source/whatsnew/v0.10.0.txt | 5 ++--- doc/source/whatsnew/v0.10.1.txt | 4 ++-- doc/source/whatsnew/v0.17.0.txt | 2 +- doc/source/whatsnew/v0.18.0.txt | 2 +- doc/source/whatsnew/v0.19.0.txt | 2 +- doc/source/whatsnew/v0.20.0.txt | 6 +++--- pandas/io/gbq.py | 5 +++-- 9 files changed, 14 insertions(+), 15 deletions(-) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 67072ff9fb224..0bfb2b635f53a 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -116,7 +116,6 @@ See the package overview for more detail about what's in the library. whatsnew install contributing - faq overview 10min tutorials diff --git a/doc/source/install.rst b/doc/source/install.rst index fe2a9fa4ba509..578caae605471 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -260,7 +260,7 @@ Optional Dependencies `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* For Google BigQuery I/O - see :ref:`here `. +* For Google BigQuery I/O - see `here `__ * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. 
* One of the following combinations of libraries is needed to use the diff --git a/doc/source/whatsnew/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.txt index fed3ba3ce3a84..cf5369466308c 100644 --- a/doc/source/whatsnew/v0.10.0.txt +++ b/doc/source/whatsnew/v0.10.0.txt @@ -303,11 +303,10 @@ Updated PyTables Support store.append('wp',wp) # selecting via A QUERY - store.select('wp', - [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) + store.select('wp', "major_axis>20000102 and minor_axis=['A','B']") # removing data from tables - store.remove('wp', Term('major_axis>20000103')) + store.remove('wp', "major_axis>20000103") store.select('wp') # deleting a store diff --git a/doc/source/whatsnew/v0.10.1.txt b/doc/source/whatsnew/v0.10.1.txt index edc628fe85027..d5880e44e46c6 100644 --- a/doc/source/whatsnew/v0.10.1.txt +++ b/doc/source/whatsnew/v0.10.1.txt @@ -58,7 +58,7 @@ perform queries on a table, by passing a list to ``data_columns`` # on-disk operations store.append('df', df, data_columns = ['B','C','string','string2']) - store.select('df',[ 'B > 0', 'string == foo' ]) + store.select('df', "B>0 and string=='foo'") # this is in-memory version of this type of selection df[(df.B > 0) & (df.string == 'foo')] @@ -110,7 +110,7 @@ columns, this is equivalent to passing a store.select('mi') # the levels are automatically included as data columns - store.select('mi', Term('foo=bar')) + store.select('mi', "foo='bar'") Multi-table creation via ``append_to_multiple`` and selection via ``select_as_multiple`` can create/select from multiple tables and return a diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9cb299593076d..a3bbaf73c01ca 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -329,7 +329,7 @@ has been changed to make this keyword unnecessary - the change is shown below. Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added ability to automatically create a table/dataset using the :func:`pandas.io.gbq.to_gbq` function if the destination table/dataset does not exist. (:issue:`8325`, :issue:`11121`). -- Added ability to replace an existing table and schema when calling the :func:`pandas.io.gbq.to_gbq` function via the ``if_exists`` argument. See the :ref:`docs ` for more details (:issue:`8325`). +- Added ability to replace an existing table and schema when calling the :func:`pandas.io.gbq.to_gbq` function via the ``if_exists`` argument. See the `docs `__ for more details (:issue:`8325`). - ``InvalidColumnOrder`` and ``InvalidPageToken`` in the gbq module will raise ``ValueError`` instead of ``IOError``. - The ``generate_bq_schema()`` function is now deprecated and will be removed in a future version (:issue:`11121`) - The gbq module will now support Python 3 (:issue:`11094`). diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 893922b719b34..4b27cf706f9b2 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -518,7 +518,7 @@ Other enhancements - Added ``DataFrame.style.format`` for more flexible formatting of cell values (:issue:`11692`) - ``DataFrame.select_dtypes`` now allows the ``np.float16`` typecode (:issue:`11990`) - ``pivot_table()`` now accepts most iterables for the ``values`` parameter (:issue:`12017`) -- Added Google ``BigQuery`` service account authentication support, which enables authentication on remote servers. (:issue:`11881`, :issue:`12572`). 
For further details see :ref:`here ` +- Added Google ``BigQuery`` service account authentication support, which enables authentication on remote servers. (:issue:`11881`, :issue:`12572`). For further details see `here `__ - ``HDFStore`` is now iterable: ``for k in store`` is equivalent to ``for k in store.keys()`` (:issue:`12221`). - Add missing methods/fields to ``.dt`` for ``Period`` (:issue:`8848`) - The entire codebase has been ``PEP``-ified (:issue:`12096`) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8e7e95c071ea4..9b003034aa94a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -377,7 +377,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- The :func:`read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). +- The :func:`read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the `docs `__ for more details (:issue:`13615`). - The :func:`~DataFrame.to_gbq` method now allows the DataFrame column order to differ from the destination table schema (:issue:`11359`). .. _whatsnew_0190.errstate: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 47aa4450b897f..7b24264cd09db 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -203,7 +203,7 @@ New Behavior: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() -.. _whatsnew_0200.enhancements.table_schema +.. _whatsnew_0200.enhancements.table_schema: Table Schema Output ^^^^^^^^^^^^^^^^^^^ @@ -337,7 +337,7 @@ Using ``.iloc``. Here we will get the location of the 'A' column, then use *posi df.iloc[[0, 2], df.columns.get_loc('A')] -.. _whatsnew.api_breaking.io_compat +.. _whatsnew.api_breaking.io_compat: Possible incompat for HDF5 formats for pandas < 0.13.0 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -758,7 +758,7 @@ Bug Fixes - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) -- Bug in ``.asfreq()``, where frequency was not set for empty ``Series` (:issue:`14320`) +- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. 
(:issue:`14956`) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 9cfb27a92bfef..b4dc9173f11ba 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -14,8 +14,9 @@ def _try_import(): "the pandas-gbq package is not installed\n" "see the docs: https://pandas-gbq.readthedocs.io\n" "\n" - "you can install via:\n" - "pip install pandas-gbq\n") + "you can install via pip or conda:\n" + "pip install pandas-gbq\n" + "conda install pandas-gbq -c conda-forge\n") return pandas_gbq From adaca513606e3a7cd72f6bbcc00e41810a7b4a18 Mon Sep 17 00:00:00 2001 From: Kernc Date: Fri, 10 Mar 2017 09:27:45 -0500 Subject: [PATCH 274/338] ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame closes #4343 Author: Kernc Closes #15497 from kernc/scipy-sparse and squashes the following commits: a0f2208 [Kernc] DOC: Fix some whatsnew/v0.20.0.txt sphinx warnings e72e594 [Kernc] ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame --- doc/source/api.rst | 11 ++- doc/source/sparse.rst | 32 +++++++- doc/source/whatsnew/v0.20.0.txt | 29 +++++++- pandas/sparse/array.py | 9 ++- pandas/sparse/frame.py | 107 ++++++++++++++++++++++----- pandas/tests/sparse/common.py | 10 +++ pandas/tests/sparse/test_frame.py | 62 ++++++++++++++++ pandas/tests/types/test_inference.py | 9 +++ pandas/types/common.py | 14 ++++ pandas/util/testing.py | 5 ++ 10 files changed, 266 insertions(+), 22 deletions(-) create mode 100644 pandas/tests/sparse/common.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 7e297a15055a0..f6bf480bebcfc 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -711,8 +711,8 @@ Serialization / IO / Conversion Series.to_string Series.to_clipboard -Sparse methods -~~~~~~~~~~~~~~ +Sparse +~~~~~~ .. autosummary:: :toctree: generated/ @@ -1030,6 +1030,13 @@ Serialization / IO / Conversion DataFrame.to_string DataFrame.to_clipboard +Sparse +~~~~~~ +.. autosummary:: + :toctree: generated/ + + SparseDataFrame.to_coo + .. _api.panel: Panel diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 2bc5d3f6dd0f5..b4884cf1c4141 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -186,7 +186,37 @@ the correct dense result. Interaction with scipy.sparse ----------------------------- -Experimental api to transform between sparse pandas and scipy.sparse structures. +SparseDataFrame +~~~~~~~~~~~~~~~ + +.. versionadded:: 0.20.0 + +Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices. + +.. ipython:: python + + from scipy.sparse import csr_matrix + + arr = np.random.random(size=(1000, 5)) + arr[arr < .9] = 0 + + sp_arr = csr_matrix(arr) + sp_arr + + sdf = pd.SparseDataFrame(sp_arr) + sdf + +All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. +To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method: + +.. ipython:: python + + sdf.to_coo() + +SparseSeries +~~~~~~~~~~~~ + +.. versionadded:: 0.16.0 A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7b24264cd09db..3c82e533dd158 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -237,10 +237,37 @@ You must enable this by setting the ``display.html.table_schema`` option to True .. 
_Table Schema: http://specs.frictionlessdata.io/json-table-schema/ .. _nteract: http://nteract.io/ +.. _whatsnew_0200.enhancements.scipy_sparse: + +SciPy sparse matrix from/to SparseDataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. +See the :ref:`documentation ` for more information. (:issue:`4343`) + +All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. + +.. ipython:: python + + from scipy.sparse import csr_matrix + arr = np.random.random(size=(1000, 5)) + arr[arr < .9] = 0 + sp_arr = csr_matrix(arr) + sp_arr + sdf = pd.SparseDataFrame(sp_arr) + sdf + +To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you can use: + +.. ipython:: python + + sdf.to_coo() + .. _whatsnew_0200.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ + - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) @@ -752,7 +779,6 @@ Bug Fixes - Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - Bug in ``pd.cut()`` with a single bin on an all 0s array (:issue:`15428`) - Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) -- Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) @@ -783,6 +809,7 @@ Bug Fixes - Bug in ``to_sql`` when writing a DataFrame with numeric index names (:issue:`15404`). - Bug in ``Series.iloc`` where a ``Categorical`` object for list-like indexes input was returned, where a ``Series`` was expected. 
(:issue:`14580`) - Bug in repr-formatting a ``SparseDataFrame`` after a value was set on (a copy of) one of its series (:issue:`15488`) +- Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) - Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 762b6d869eae0..5f4c07971d37e 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -20,6 +20,7 @@ is_integer_dtype, is_bool_dtype, is_list_like, + is_string_dtype, is_scalar, is_dtype_equal) from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, _astype_nansafe, _find_common_type) @@ -769,6 +770,12 @@ def make_sparse(arr, kind='block', fill_value=None): if isnull(fill_value): mask = notnull(arr) else: + # For str arrays in NumPy 1.12.0, operator!= below isn't + # element-wise but just returns False if fill_value is not str, + # so cast to object comparison to be safe + if is_string_dtype(arr): + arr = arr.astype(object) + mask = arr != fill_value length = len(arr) @@ -776,7 +783,7 @@ def make_sparse(arr, kind='block', fill_value=None): # the arr is a SparseArray indices = mask.sp_index.indices else: - indices = np.arange(length, dtype=np.int32)[mask] + indices = mask.nonzero()[0].astype(np.int32) index = _make_index(length, indices, kind) sparsified_values = arr[mask] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 61b8434b0ea09..a21f64f524a0a 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -11,8 +11,8 @@ import numpy as np from pandas.types.missing import isnull, notnull -from pandas.types.cast import _maybe_upcast -from pandas.types.common import _ensure_platform_int +from pandas.types.cast import _maybe_upcast, _find_common_type +from pandas.types.common import _ensure_platform_int, is_scipy_sparse from pandas.core.common import _try_sort from pandas.compat.numpy import function as nv @@ -25,6 +25,7 @@ create_block_manager_from_arrays) import pandas.core.generic as generic from pandas.sparse.series import SparseSeries, SparseArray +from pandas.sparse.libsparse import BlockIndex, get_blocks from pandas.util.decorators import Appender import pandas.core.ops as ops @@ -39,15 +40,15 @@ class SparseDataFrame(DataFrame): Parameters ---------- - data : same types as can be passed to DataFrame + data : same types as can be passed to DataFrame or scipy.sparse.spmatrix index : array-like, optional column : array-like, optional default_kind : {'block', 'integer'}, default 'block' Default sparse kind for converting Series to SparseSeries. Will not override SparseSeries passed into constructor default_fill_value : float - Default fill_value for converting Series to SparseSeries. Will not - override SparseSeries passed in + Default fill_value for converting Series to SparseSeries + (default: nan). Will not override SparseSeries passed in. 
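+ + Examples + -------- + A minimal sketch of the new ``scipy.sparse`` constructor path (the + identity-matrix input below is illustrative only, not part of the API): + + >>> import numpy as np + >>> from scipy.sparse import csr_matrix + >>> sdf = SparseDataFrame(csr_matrix(np.eye(3))) + >>> sdf.shape + (3, 3)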
""" _constructor_sliced = SparseSeries _subtyp = 'sparse_frame' @@ -84,22 +85,19 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, self._default_kind = default_kind self._default_fill_value = default_fill_value - if isinstance(data, dict): - mgr = self._init_dict(data, index, columns) - if dtype is not None: - mgr = mgr.astype(dtype) + if is_scipy_sparse(data): + mgr = self._init_spmatrix(data, index, columns, dtype=dtype, + fill_value=default_fill_value) + elif isinstance(data, dict): + mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, (np.ndarray, list)): - mgr = self._init_matrix(data, index, columns) - if dtype is not None: - mgr = mgr.astype(dtype) + mgr = self._init_matrix(data, index, columns, dtype=dtype) elif isinstance(data, SparseDataFrame): mgr = self._init_mgr(data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, DataFrame): - mgr = self._init_dict(data, data.index, data.columns) - if dtype is not None: - mgr = mgr.astype(dtype) + mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) elif isinstance(data, BlockManager): mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) @@ -174,7 +172,43 @@ def _init_dict(self, data, index, columns, dtype=None): return to_manager(sdict, columns, index) def _init_matrix(self, data, index, columns, dtype=None): + """ Init self from ndarray or list of lists """ data = _prep_ndarray(data, copy=False) + index, columns = self._prep_index(data, index, columns) + data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) + return self._init_dict(data, index, columns, dtype) + + def _init_spmatrix(self, data, index, columns, dtype=None, + fill_value=None): + """ Init self from scipy.sparse matrix """ + index, columns = self._prep_index(data, index, columns) + data = data.tocoo() + N = len(index) + + # Construct a dict of SparseSeries + sdict = {} + values = Series(data.data, index=data.row, copy=False) + for col, rowvals in values.groupby(data.col): + # get_blocks expects int32 row indices in sorted order + rows = rowvals.index.values.astype(np.int32) + rows.sort() + blocs, blens = get_blocks(rows) + + sdict[columns[col]] = SparseSeries( + rowvals.values, index=index, + fill_value=fill_value, + sparse_index=BlockIndex(N, blocs, blens)) + + # Add any columns that were empty and thus not grouped on above + sdict.update({column: SparseSeries(index=index, + fill_value=fill_value, + sparse_index=BlockIndex(N, [], [])) + for column in columns + if column not in sdict}) + + return self._init_dict(sdict, index, columns, dtype) + + def _prep_index(self, data, index, columns): N, K = data.shape if index is None: index = _default_index(N) @@ -187,9 +221,48 @@ def _init_matrix(self, data, index, columns, dtype=None): if len(index) != N: raise ValueError('Index length mismatch: %d vs. %d' % (len(index), N)) + return index, columns - data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) - return self._init_dict(data, index, columns, dtype) + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.20.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. 
+ + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 + and uint64 will result in a float64 dtype. + """ + try: + from scipy.sparse import coo_matrix + except ImportError: + raise ImportError('Scipy is not installed') + + dtype = _find_common_type(self.dtypes) + cols, rows, datas = [], [], [] + for col, name in enumerate(self): + s = self[name] + row = s.sp_index.to_int_index().indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + datas.append(s.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + datas = np.concatenate(datas) + return coo_matrix((datas, (rows, cols)), shape=self.shape) def __array_wrap__(self, result): return self._constructor( diff --git a/pandas/tests/sparse/common.py b/pandas/tests/sparse/common.py new file mode 100644 index 0000000000000..3aeef8d436e1a --- /dev/null +++ b/pandas/tests/sparse/common.py @@ -0,0 +1,10 @@ +import pytest + +import pandas.util.testing as tm + + +@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil']) +def spmatrix(request): + tm._skip_if_no_scipy() + from scipy import sparse + return getattr(sparse, request.param + '_matrix') diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index a7dd7f2e81033..4cd5a643ce4be 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -2,11 +2,17 @@ import operator +import pytest + from numpy import nan import numpy as np import pandas as pd from pandas import Series, DataFrame, bdate_range, Panel +from pandas.types.common import (is_bool_dtype, + is_float_dtype, + is_object_dtype, + is_float) from pandas.tseries.index import DatetimeIndex from pandas.tseries.offsets import BDay import pandas.util.testing as tm @@ -18,6 +24,8 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray from pandas.tests.frame.test_misc_api import SharedWithSparse +from pandas.tests.sparse.common import spmatrix # noqa: F401 + class TestSparseDataFrame(tm.TestCase, SharedWithSparse): @@ -1118,6 +1126,60 @@ def test_isnotnull(self): tm.assert_frame_equal(res.to_dense(), exp) +@pytest.mark.parametrize('index', [None, list('ab')]) # noqa: F811 +@pytest.mark.parametrize('columns', [None, list('cd')]) +@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) +@pytest.mark.parametrize('dtype', [object, bool, int, float, np.uint16]) +def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): + # GH 4343 + tm._skip_if_no_scipy() + + # Make one ndarray and from it one sparse matrix, both to be used for + # constructing frames and comparing results + arr = np.eye(2, dtype=dtype) + try: + spm = spmatrix(arr) + assert spm.dtype == arr.dtype + except (TypeError, AssertionError): + # If conversion to sparse fails for this spmatrix type and arr.dtype, + # then the combination is not currently supported in NumPy, so we + # can just skip testing it thoroughly + return + + sdf = pd.SparseDataFrame(spm, index=index, columns=columns, + default_fill_value=fill_value) + + # Expected result construction is kind of tricky for all + # dtype-fill_value combinations; easiest to cast to something generic + # and except later on + rarr = arr.astype(object) + rarr[arr == 0] = np.nan + expected = 
pd.SparseDataFrame(rarr, index=index, columns=columns).fillna( + fill_value if fill_value is not None else np.nan) + + # Assert frame is as expected + sdf_obj = sdf.astype(object) + tm.assert_sp_frame_equal(sdf_obj, expected) + tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense()) + + # Assert spmatrices equal + tm.assert_equal(dict(sdf.to_coo().todok()), dict(spm.todok())) + + # Ensure dtype is preserved if possible + was_upcast = ((fill_value is None or is_float(fill_value)) and + not is_object_dtype(dtype) and + not is_float_dtype(dtype)) + res_dtype = (bool if is_bool_dtype(dtype) else + float if was_upcast else + dtype) + tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)}) + tm.assert_equal(sdf.to_coo().dtype, res_dtype) + + # However, adding a str column results in an upcast to object + sdf['strings'] = np.arange(len(sdf)).astype(str) + tm.assert_equal(sdf.to_coo().dtype, np.object_) + + class TestSparseDataFrameArithmetic(tm.TestCase): def test_numeric_op_scalar(self): diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index a36a77a70f9ad..b41df0da45234 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -30,11 +30,14 @@ is_float, is_bool, is_scalar, + is_scipy_sparse, _ensure_int32, _ensure_categorical) from pandas.types.missing import isnull from pandas.util import testing as tm +from pandas.tests.sparse.test_frame import spmatrix # noqa: F401 + def test_is_sequence(): is_seq = inference.is_sequence @@ -946,6 +949,12 @@ def test_nan_to_nat_conversions(): assert (s[8].value == np.datetime64('NaT').astype(np.int64)) +def test_is_scipy_sparse(spmatrix): # noqa: F811 + tm._skip_if_no_scipy() + assert is_scipy_sparse(spmatrix([[0, 1]])) + assert not is_scipy_sparse(np.array([1])) + + def test_ensure_int32(): values = np.arange(10, dtype=np.int32) result = _ensure_int32(values) diff --git a/pandas/types/common.py b/pandas/types/common.py index 1be5b5f6f1368..a1f03e59a5e6e 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -23,6 +23,9 @@ _TD_DTYPE = np.dtype('m8[ns]') _INT64_DTYPE = np.dtype(np.int64) +# oh the troubles to reduce import time +_is_scipy_sparse = None + _ensure_float64 = algos.ensure_float64 _ensure_float32 = algos.ensure_float32 @@ -59,6 +62,17 @@ def is_sparse(array): return isinstance(array, (ABCSparseArray, ABCSparseSeries)) +def is_scipy_sparse(array): + """ return if we are a scipy.sparse.spmatrix """ + global _is_scipy_sparse + if _is_scipy_sparse is None: + try: + from scipy.sparse import issparse as _is_scipy_sparse + except ImportError: + _is_scipy_sparse = lambda _: False + return _is_scipy_sparse(array) + + def is_categorical(array): """ return if we are a categorical possibility """ return isinstance(array, ABCCategorical) or is_categorical_dtype(array) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index b68bf55a347b2..ec30a9376a9da 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -297,6 +297,11 @@ def _skip_if_no_scipy(): except ImportError: import pytest pytest.skip('scipy.interpolate missing') + try: + import scipy.sparse # noqa + except ImportError: + import pytest + pytest.skip('scipy.sparse missing') def _skip_if_scipy_0_17(): From e1996ce71cb75d2ad71f2636b05c12b78726bd52 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 10 Mar 2017 09:38:25 -0500 Subject: [PATCH 275/338] BUG: Error when specifying int index containing NaN xref #15187. 
Author: gfyoung Closes #15616 from gfyoung/nan-int-index and squashes the following commits: 195b830 [gfyoung] BUG: Error when specifying int index containing NaN --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/indexes/base.py | 27 +++++++++++++++++++++++---- pandas/tests/indexes/test_base.py | 17 +++++++++++++++++ pandas/tests/indexes/test_numeric.py | 27 ++++++++++++++++++++++++++- pandas/tests/indexes/test_range.py | 28 +++++++++++++++++++++++++++- 5 files changed, 94 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3c82e533dd158..dd081ea605c01 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -792,6 +792,7 @@ Bug Fixes - Bug in ``DataFrame.isin`` comparing datetimelike to empty frame (:issue:`15473`) - Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) +- Bug in ``Index`` construction with ``NaN`` elements and integer dtype specified (:issue:`15187`) - Bug in ``Series`` construction with a datetimetz (:issue:`14928`) - Bug in output formatting of a ``MultiIndex`` when names are integers (:issue:`12223`, :issue:`15262`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 607a463083fdd..7f46f437489a1 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -203,6 +203,9 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if inferred == 'integer': data = np.array(data, copy=copy, dtype=dtype) elif inferred in ['floating', 'mixed-integer-float']: + if isnull(data).any(): + raise ValueError('cannot convert float ' + 'NaN to integer') # If we are actually all equal to integers, # then coerce to integer. @@ -230,8 +233,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, else: data = np.array(data, dtype=dtype, copy=copy) - except (TypeError, ValueError): - pass + except (TypeError, ValueError) as e: + msg = str(e) + if 'cannot convert float' in msg: + raise # maybe coerce to a sub-class from pandas.tseries.period import (PeriodIndex, @@ -585,7 +590,14 @@ def where(self, cond, other=None): if other is None: other = self._na_value values = np.where(cond, self.values, other) - return self._shallow_copy_with_infer(values, dtype=self.dtype) + + dtype = self.dtype + if self._is_numeric_dtype and np.any(isnull(values)): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return self._shallow_copy_with_infer(values, dtype=dtype) def ravel(self, order='C'): """ @@ -689,7 +701,14 @@ def _coerce_scalar_to_index(self, item): ---------- item : scalar item to coerce """ - return Index([item], dtype=self.dtype, **self._get_attributes_dict()) + dtype = self.dtype + + if self._is_numeric_dtype and isnull(item): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return Index([item], dtype=dtype, **self._get_attributes_dict()) _index_shared_docs['copy'] = """ Make a copy of this object. 
Name and dtype sets those attributes on diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8c0a399cb58b3..05d3478ab0705 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -199,6 +199,23 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) self.assert_index_equal(result, expected) + def test_constructor_int_dtype_nan(self): + # see gh-15187 + data = [np.nan] + msg = "cannot convert" + + with tm.assertRaisesRegexp(ValueError, msg): + Index(data, dtype='int64') + + with tm.assertRaisesRegexp(ValueError, msg): + Index(data, dtype='uint64') + + # This, however, should not break + # because NaN is float. + expected = Float64Index(data) + result = Index(data, dtype='float') + tm.assert_index_equal(result, expected) + def test_index_ctor_infer_nan_nat(self): # GH 13467 exp = pd.Float64Index([np.nan, np.nan]) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e23e7c19ed799..d0ce34169f79e 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -5,7 +5,7 @@ import numpy as np -from pandas import (date_range, Series, Index, Float64Index, +from pandas import (date_range, notnull, Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex) import pandas.util.testing as tm @@ -686,6 +686,31 @@ def test_coerce_list(self): arr = Index([1, 2, 3, 4], dtype=object) tm.assertIsInstance(arr, Index) + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + _nan = i._na_value + cond = [False] + [True] * len(i[1:]) + expected = pd.Index([_nan] + i[1:].tolist()) + + result = i.where(cond) + tm.assert_index_equal(result, expected) + + def test_where_array_like(self): + i = self.create_index() + + _nan = i._na_value + cond = [False] + [True] * (len(i) - 1) + klasses = [list, tuple, np.array, pd.Series] + expected = pd.Index([_nan] + i[1:].tolist()) + + for klass in klasses: + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 38e715fce2720..53c88897d6764 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -8,7 +8,8 @@ import numpy as np -from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex) +from pandas import (notnull, Series, Index, Float64Index, + Int64Index, RangeIndex) from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -915,3 +916,28 @@ def test_len_specialised(self): i = RangeIndex(0, 5, step) self.assertEqual(len(i), 0) + + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + _nan = i._na_value + cond = [False] + [True] * len(i[1:]) + expected = pd.Index([_nan] + i[1:].tolist()) + + result = i.where(cond) + tm.assert_index_equal(result, expected) + + def test_where_array_like(self): + i = self.create_index() + + _nan = i._na_value + cond = [False] + [True] * (len(i) - 1) + klasses = [list, tuple, np.array, pd.Series] + expected = pd.Index([_nan] + i[1:].tolist()) + + for klass in klasses: + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) From 482b28d9e8521f1d13217b1359970bab499f8a03 Mon Sep 17 00:00:00 2001 From: gfyoung Date: 
Fri, 10 Mar 2017 16:25:29 -0500 Subject: [PATCH 276/338] API: Drop DataFrame.iterkv() Deprecated since 0.17.0 xref #10711 Author: gfyoung Closes #15650 from gfyoung/df-iterkv-remove and squashes the following commits: e40fc9e [gfyoung] API: Drop DataFrame.iterkv() --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/generic.py | 10 ---------- pandas/tests/frame/test_misc_api.py | 6 +----- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index dd081ea605c01..f42dfb80924e0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -692,6 +692,7 @@ Other API Changes - Reorganization of timeseries development tests (:issue:`14854`) - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`) - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) +- ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) .. _whatsnew_0200.deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a0111cb9ef7ec..1db9677659ca3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -899,16 +899,6 @@ def iteritems(self): for h in self._info_axis: yield h, self[h] - # originally used to get around 2to3's changes to iteritems. - # Now unnecessary. Sidenote: don't want to deprecate this for a while, - # otherwise libraries that use 2to3 will have issues. - def iterkv(self, *args, **kwargs): - "iteritems alias used to get around 2to3. Deprecated" - warnings.warn("iterkv is deprecated and will be removed in a future " - "release, use ``iteritems`` instead.", FutureWarning, - stacklevel=2) - return self.iteritems(*args, **kwargs) - def __len__(self): """Returns length of info axis""" return len(self._info_axis) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 674202980807a..321d46739b24c 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -389,11 +389,7 @@ def test_repr_with_mi_nat(self): exp = ' X\nNaT a 1\n2013-01-01 b 2' self.assertEqual(res, exp) - def test_iterkv_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - self.mixed_float.iterkv() - - def test_iterkv_names(self): + def test_iteritems_names(self): for k, v in compat.iteritems(self.mixed_frame): self.assertEqual(v.name, k) From 1607abe10825fd7760e8fddee66fd5244a850ef6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 10 Mar 2017 18:04:41 -0500 Subject: [PATCH 277/338] BUG/API: .merge() and .join() on category dtype columns will now preserve category dtype closes #10409 Author: Jeff Reback Closes #15321 from jreback/merge_cat and squashes the following commits: 3671dad [Jeff Reback] DOC: merge docs a4b2ee6 [Jeff Reback] BUG/API: .merge() and .join() on category dtype columns will now preserve the category dtype when possible --- asv_bench/benchmarks/join_merge.py | 36 +++++- doc/source/categorical.rst | 3 + doc/source/merging.rst | 73 +++++++++++ doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/core/internals.py | 2 + pandas/tests/test_categorical.py | 3 + pandas/tests/tools/test_merge.py | 177 +++++++++++++++++++++----- pandas/tests/tools/test_merge_asof.py | 1 + pandas/tests/types/test_common.py | 50 ++++++-- pandas/tools/merge.py | 86 ++++++++++--- 10 files changed, 364 insertions(+), 71 deletions(-) diff --git 
a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index d9c631fa92efd..776316343e009 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -6,7 +6,7 @@ from pandas import ordered_merge as merge_ordered -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Append class Append(object): @@ -35,7 +35,7 @@ def time_append_mixed(self): self.mdf1.append(self.mdf2) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Concat class Concat(object): @@ -120,7 +120,7 @@ def time_f_ordered_axis1(self): concat(self.frames_f, axis=1, ignore_index=True) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Joins class Join(object): @@ -202,7 +202,7 @@ def time_join_non_unique_equal(self): (self.fracofday * self.temp[self.fracofday.index]) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Merges class Merge(object): @@ -257,7 +257,31 @@ def time_i8merge(self): merge(self.left, self.right, how='outer') -#---------------------------------------------------------------------- +class MergeCategoricals(object): + goal_time = 0.2 + + def setup(self): + self.left_object = pd.DataFrame( + {'X': np.random.choice(range(0, 10), size=(10000,)), + 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))}) + + self.right_object = pd.DataFrame( + {'X': np.random.choice(range(0, 10), size=(10000,)), + 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))}) + + self.left_cat = self.left_object.assign( + Y=self.left_object['Y'].astype('category')) + self.right_cat = self.right_object.assign( + Z=self.right_object['Z'].astype('category')) + + def time_merge_object(self): + merge(self.left_object, self.right_object, on='X') + + def time_merge_cat(self): + merge(self.left_cat, self.right_cat, on='X') + + +# ---------------------------------------------------------------------- # Ordered merge class MergeOrdered(object): @@ -332,7 +356,7 @@ def time_multiby(self): merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2']) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # data alignment class Align(object): diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index db974922e1d76..6d85e1a6560b0 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -646,6 +646,9 @@ In this case the categories are not the same and so an error is raised: The same applies to ``df.append(df_different)``. +See also the section on :ref:`merge dtypes` for notes about preserving merge dtypes and performance. + + .. _categorical.union: Unioning diff --git a/doc/source/merging.rst b/doc/source/merging.rst index f732f0a4cc749..70d2ce5b1a664 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -746,6 +746,79 @@ The ``indicator`` argument will also accept string arguments, in which case the pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') +.. _merging.dtypes: + +Merge Dtypes +~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +Merging will preserve the dtype of the join keys. + +.. 
ipython:: python + + left = pd.DataFrame({'key': [1], 'v1': [10]}) + left + right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + right + +We are able to preserve the dtype of the join keys: + +.. ipython:: python + + pd.merge(left, right, how='outer') + pd.merge(left, right, how='outer').dtypes + +Of course, if you have missing values that are introduced, then the +resulting dtype will be upcast. + +.. ipython:: python + + pd.merge(left, right, how='outer', on='key') + pd.merge(left, right, how='outer', on='key').dtypes + +.. versionadded:: 0.20.0 + +Merging will preserve ``category`` dtypes of the merge operands. + +The left frame: + +.. ipython:: python + + X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) + X = X.astype('category', categories=['foo', 'bar']) + + left = DataFrame({'X': X, + 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) + left + left.dtypes + +The right frame: + +.. ipython:: python + + right = DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), + 'Z': [1, 2]}) + right + right.dtypes + +The merged result: + +.. ipython:: python + + result = pd.merge(left, right, how='outer') + result + result.dtypes + +.. note:: + + The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute. + Otherwise the result will coerce to ``object`` dtype. + +.. note:: + + Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging. + .. _merging.join.index: Joining on index diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f42dfb80924e0..e392023423eb0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -692,7 +692,7 @@ Other API Changes - Reorganization of timeseries development tests (:issue:`14854`) - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`) - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) -- ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) +- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`) .. _whatsnew_0200.deprecations: @@ -733,6 +733,7 @@ Removal of prior version deprecations/changes - ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:`15098`) - The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). +- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) .. _whatsnew_0200.performance: @@ -749,6 +750,7 @@ Performance Improvements - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`) - Improved performance of ``.rank()`` for categorical data (:issue:`15498`) - Improved performance when using ``.unstack()`` (:issue:`15503`) +- Improved performance of merge/join on ``category`` columns (:issue:`10409`) ..
_whatsnew_0200.bug_fixes: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4b43574f49820..aa954fbee9a60 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5227,6 +5227,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values + elif self.block.is_categorical: + values = self.block.values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cc99cf0f830aa..2d5e98d49e152 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4097,9 +4097,12 @@ def test_merge(self): expected = df.copy() # object-cat + # note that we propogate the category + # because we don't have any matching rows cright = right.copy() cright['d'] = cright['d'].astype('category') result = pd.merge(left, cright, how='left', left_on='b', right_on='c') + expected['d'] = expected['d'].astype('category', categories=['null']) tm.assert_frame_equal(result, expected) # cat-object diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index b3b5e7e29319b..ff27500355998 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1,5 +1,6 @@ # pylint: disable=E1103 +import pytest from datetime import datetime from numpy.random import randn from numpy import nan @@ -11,6 +12,8 @@ from pandas.tools.concat import concat from pandas.tools.merge import merge, MergeError from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import is_categorical_dtype, is_object_dtype from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm @@ -1024,38 +1027,6 @@ def test_left_join_index_multi_match(self): expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected) - def test_join_multi_dtypes(self): - - # test with multi dtypes in the join index - def _test(dtype1, dtype2): - left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) - - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame( - {'v2': np.array([5, 7], dtype=dtype2)}, index=index) - - result = left.join(right, on=['k1', 'k2']) - - expected = left.copy() - - if dtype2.kind == 'i': - dtype2 = np.dtype('float64') - expected['v2'] = np.array(np.nan, dtype=dtype2) - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 - - tm.assert_frame_equal(result, expected) - - result = left.join(right, on=['k1', 'k2'], sort=True) - expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) - tm.assert_frame_equal(result, expected) - - for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]: - for d2 in [np.int64, np.float64, np.float32, np.float16]: - _test(np.dtype(d1), np.dtype(d2)) - def test_left_merge_na_buglet(self): left = DataFrame({'id': list('abcde'), 'v1': randn(5), 'v2': randn(5), 'dummy': list('abcde'), @@ -1242,3 +1213,145 @@ def f(): def f(): household.join(log_return, how='outer') self.assertRaises(NotImplementedError, f) + + +@pytest.fixture +def df(): + return DataFrame( + {'A': ['foo', 'bar'], + 'B': 
Series(['foo', 'bar']).astype('category'), + 'C': [1, 2], + 'D': [1.0, 2.0], + 'E': Series([1, 2], dtype='uint64'), + 'F': Series([1, 2], dtype='int32')}) + + +class TestMergeDtypes(object): + + def test_different(self, df): + + # we expect differences by kind + # to be ok, while other differences should return object + + left = df + for col in df.columns: + right = DataFrame({'A': df[col]}) + result = pd.merge(left, right, on='A') + assert is_object_dtype(result.A.dtype) + + @pytest.mark.parametrize('d1', [np.int64, np.int32, + np.int16, np.int8, np.uint8]) + @pytest.mark.parametrize('d2', [np.int64, np.float64, + np.float32, np.float16]) + def test_join_multi_dtypes(self, d1, d2): + + dtype1 = np.dtype(d1) + dtype2 = np.dtype(d2) + + left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), + 'k2': ['foo', 'bar'] * 12, + 'v': np.array(np.arange(24), dtype=np.int64)}) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index) + + result = left.join(right, on=['k1', 'k2']) + + expected = left.copy() + + if dtype2.kind == 'i': + dtype2 = np.dtype('float64') + expected['v2'] = np.array(np.nan, dtype=dtype2) + expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=['k1', 'k2'], sort=True) + expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def left(): + np.random.seed(1234) + return DataFrame( + {'X': Series(np.random.choice( + ['foo', 'bar'], + size=(10,))).astype('category', categories=['foo', 'bar']), + 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) + + +@pytest.fixture +def right(): + np.random.seed(1234) + return DataFrame( + {'X': Series(['foo', 'bar']).astype('category', + categories=['foo', 'bar']), + 'Z': [1, 2]}) + + +class TestMergeCategorical(object): + + def test_identical(self, left): + # merging on the same, should preserve dtypes + merged = pd.merge(left, left, on='X') + result = merged.dtypes.sort_index() + expected = Series([CategoricalDtype(), + np.dtype('O'), + np.dtype('O')], + index=['X', 'Y_x', 'Y_y']) + assert_series_equal(result, expected) + + def test_basic(self, left, right): + # we have matching Categorical dtypes in X + # so should preserve the merged column + merged = pd.merge(left, right, on='X') + result = merged.dtypes.sort_index() + expected = Series([CategoricalDtype(), + np.dtype('O'), + np.dtype('int64')], + index=['X', 'Y', 'Z']) + assert_series_equal(result, expected) + + def test_other_columns(self, left, right): + # non-merge columns should preserve if possible + right = right.assign(Z=right.Z.astype('category')) + + merged = pd.merge(left, right, on='X') + result = merged.dtypes.sort_index() + expected = Series([CategoricalDtype(), + np.dtype('O'), + CategoricalDtype()], + index=['X', 'Y', 'Z']) + assert_series_equal(result, expected) + + # categories are preserved + assert left.X.values.is_dtype_equal(merged.X.values) + assert right.Z.values.is_dtype_equal(merged.Z.values) + + @pytest.mark.parametrize( + 'change', [lambda x: x, + lambda x: x.astype('category', + categories=['bar', 'foo']), + lambda x: x.astype('category', + categories=['foo', 'bar', 'bah']), + lambda x: x.astype('category', ordered=True)]) + @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) + def test_dtype_on_merged_different(self, 
change, how, left, right): + # our merging columns, X now has 2 different dtypes + # so we must be object as a result + + X = change(right.X.astype('object')) + right = right.assign(X=X) + assert is_categorical_dtype(left.X.values) + assert not left.X.values.is_dtype_equal(right.X.values) + + merged = pd.merge(left, right, on='X', how=how) + + result = merged.dtypes.sort_index() + expected = Series([np.dtype('O'), + np.dtype('O'), + np.dtype('int64')], + index=['X', 'Y', 'Z']) + assert_series_equal(result, expected) diff --git a/pandas/tests/tools/test_merge_asof.py b/pandas/tests/tools/test_merge_asof.py index 76798b3c895ea..cdff8f0349c15 100644 --- a/pandas/tests/tools/test_merge_asof.py +++ b/pandas/tests/tools/test_merge_asof.py @@ -147,6 +147,7 @@ def test_basic_categorical(self): trades.ticker = trades.ticker.astype('category') quotes = self.quotes.copy() quotes.ticker = quotes.ticker.astype('category') + expected.ticker = expected.ticker.astype('category') result = merge_asof(trades, quotes, on='time', diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py index 4667bbd47ad18..c15f219c8fad6 100644 --- a/pandas/tests/types/test_common.py +++ b/pandas/tests/types/test_common.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import pytest import numpy as np from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype @@ -38,17 +39,44 @@ def test_period_dtype(self): self.assertEqual(pandas_dtype(dtype), dtype) -def test_dtype_equal(): - assert is_dtype_equal(np.int64, np.int64) - assert not is_dtype_equal(np.int64, np.float64) +dtypes = dict(datetime_tz=pandas_dtype('datetime64[ns, US/Eastern]'), + datetime=pandas_dtype('datetime64[ns]'), + timedelta=pandas_dtype('timedelta64[ns]'), + period=PeriodDtype('D'), + integer=np.dtype(np.int64), + float=np.dtype(np.float64), + object=np.dtype(np.object), + category=pandas_dtype('category')) - p1 = PeriodDtype('D') - p2 = PeriodDtype('D') - assert is_dtype_equal(p1, p2) - assert not is_dtype_equal(np.int64, p1) - p3 = PeriodDtype('2D') - assert not is_dtype_equal(p1, p3) +@pytest.mark.parametrize('name1,dtype1', + list(dtypes.items()), + ids=lambda x: str(x)) +@pytest.mark.parametrize('name2,dtype2', + list(dtypes.items()), + ids=lambda x: str(x)) +def test_dtype_equal(name1, dtype1, name2, dtype2): - assert not DatetimeTZDtype.is_dtype(np.int64) - assert not PeriodDtype.is_dtype(np.int64) + # match equal to self, but not equal to other + assert is_dtype_equal(dtype1, dtype1) + if name1 != name2: + assert not is_dtype_equal(dtype1, dtype2) + + +def test_dtype_equal_strict(): + + # we are strict on kind equality + for dtype in [np.int8, np.int16, np.int32]: + assert not is_dtype_equal(np.int64, dtype) + + for dtype in [np.float32]: + assert not is_dtype_equal(np.float64, dtype) + + # strict w.r.t. PeriodDtype + assert not is_dtype_equal(PeriodDtype('D'), + PeriodDtype('2D')) + + # strict w.r.t. 
datetime64 + assert not is_dtype_equal( + pandas_dtype('datetime64[ns, US/Eastern]'), + pandas_dtype('datetime64[ns, CET]')) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3f1e7640ba538..d02f4c5b26c86 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -18,8 +18,10 @@ is_datetime64_dtype, needs_i8_conversion, is_int64_dtype, + is_categorical_dtype, is_integer_dtype, is_float_dtype, + is_numeric_dtype, is_integer, is_int_or_datetime_dtype, is_dtype_equal, @@ -37,7 +39,7 @@ from pandas.core.sorting import is_int64_overflow_possible import pandas.core.algorithms as algos import pandas.core.common as com -from pandas._libs import hashtable as libhashtable, join as libjoin +from pandas._libs import hashtable as libhashtable, join as libjoin, lib # back-compat of pseudo-public API @@ -570,6 +572,10 @@ def __init__(self, left, right, how='inner', on=None, self.right_join_keys, self.join_names) = self._get_merge_keys() + + # validate the merge keys' dtypes. We may need to coerce + # to avoid incompat dtypes + self._maybe_coerce_merge_keys() + def get_result(self): if self.indicator: self.left, self.right = self._indicator_pre_merge( @@ -760,26 +766,6 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _get_merge_data(self): - """ - Handles overlapping column names etc. - """ - ldata, rdata = self.left._data, self.right._data - lsuf, rsuf = self.suffixes - - llabels, rlabels = items_overlap_with_suffix( - ldata.items, lsuf, rdata.items, rsuf) - - if not llabels.equals(ldata.items): - ldata = ldata.copy(deep=False) - ldata.set_axis(0, llabels) - - if not rlabels.equals(rdata.items): - rdata = rdata.copy(deep=False) - rdata.set_axis(0, rlabels) - - return ldata, rdata - def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) @@ -891,6 +877,51 @@ def _get_merge_keys(self): return left_keys, right_keys, join_names + def _maybe_coerce_merge_keys(self): + # we have valid merge keys, but we may have to further + # coerce these if they are originally incompatible types + # + # for example if these are categorical, but are not dtype_equal + # or if we have object and integer dtypes + + for lk, rk, name in zip(self.left_join_keys, + self.right_join_keys, + self.join_names): + if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): + continue + + # if either left or right is a categorical + # then they must match exactly in categories & ordered + if is_categorical_dtype(lk) and is_categorical_dtype(rk): + if lk.is_dtype_equal(rk): + continue + elif is_categorical_dtype(lk) or is_categorical_dtype(rk): + pass + + elif is_dtype_equal(lk.dtype, rk.dtype): + continue + + # if we are numeric, then allow differing + # kinds to proceed, e.g. int64 and int8 + # further if we are object, but we infer to + # the same, then proceed + if (is_numeric_dtype(lk) and is_numeric_dtype(rk)): + if lk.dtype.kind == rk.dtype.kind: + continue + + # let's infer and see if we are ok + if lib.infer_dtype(lk) == lib.infer_dtype(rk): + continue + + # Houston, we have a problem! + # let's coerce to object + if name in self.left.columns: + self.left = self.left.assign( + **{name: self.left[name].astype(object)}) + if name in self.right.columns: + self.right = self.right.assign( + **{name: self.right[name].astype(object)}) + def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
if self.on is None and self.left_on is None and self.right_on is None: @@ -942,9 +973,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', Parameters ---------- + left_keys: ndarray, Index, Series + right_keys: ndarray, Index, Series + sort: boolean, default False + how: string {'inner', 'outer', 'left', 'right'}, default 'inner' Returns ------- + tuple of (left_indexer, right_indexer) + indexers into the left_keys, right_keys """ from functools import partial @@ -1349,6 +1386,13 @@ def _factorize_keys(lk, rk, sort=True): if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values + + # if we exactly match in categories, allow us to use codes + if (is_categorical_dtype(lk) and + is_categorical_dtype(rk) and + lk.is_dtype_equal(rk)): + return lk.codes, rk.codes, len(lk.categories) + if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): klass = libhashtable.Int64Factorizer lk = _ensure_int64(com._values_from_object(lk)) From eb9a201beff224973f245ecd300ccd3bfa727ff3 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 11 Mar 2017 12:20:46 -0500 Subject: [PATCH 278/338] API: Drop the name parameter from Categorical Deprecated in 0.17.0 xref #10632 Author: gfyoung Closes #15654 from gfyoung/categorical-name-drop and squashes the following commits: 7e1e7d8 [gfyoung] API: Drop the name parameter from Categorical --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/categorical.py | 17 ++--------------- pandas/io/packers.py | 3 +-- pandas/tests/io/test_pickle.py | 16 ++++------------ pandas/tests/test_categorical.py | 11 +---------- 5 files changed, 9 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e392023423eb0..f6d5e3df814fc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -734,7 +734,7 @@ Removal of prior version deprecations/changes - The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). - The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) - +- The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`) .. _whatsnew_0200.performance: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 47db86ce1e73e..c1e5904693d1c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -231,8 +231,7 @@ class Categorical(PandasObject): __array_priority__ = 1000 _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, - name=None, fastpath=False): + def __init__(self, values, categories=None, ordered=False, fastpath=False): self._validate_ordered(ordered) @@ -244,12 +243,6 @@ def __init__(self, values, categories=None, ordered=False, self._ordered = ordered return - if name is not None: - msg = ("the 'name' keyword is removed, use 'name' with consumers " - "of the categorical instead (e.g. 'Series(cat, " - "name=\"something\")'") - warn(msg, UserWarning, stacklevel=2) - # sanitize input if is_categorical_dtype(values): @@ -431,7 +424,7 @@ def from_array(cls, data, **kwargs): return cls(data, **kwargs) @classmethod - def from_codes(cls, codes, categories, ordered=False, name=None): + def from_codes(cls, codes, categories, ordered=False): """ Make a Categorical type from codes and categories arrays. @@ -454,12 +447,6 @@ def from_codes(cls, codes, categories, ordered=False, name=None): categorical. 
If not given, the resulting categorical will be unordered. """ - if name is not None: - msg = ("the 'name' keyword is removed, use 'name' with consumers " - "of the categorical instead (e.g. 'Series(cat, " - "name=\"something\")'") - warn(msg, UserWarning, stacklevel=2) - try: codes = np.asarray(codes, np.int64) except: diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 404be758a7fbe..4662e8b635d3f 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -589,8 +589,7 @@ def decode(obj): from_codes = globals()[obj[u'klass']].from_codes return from_codes(codes=obj[u'codes'], categories=obj[u'categories'], - ordered=obj[u'ordered'], - name=obj[u'name']) + ordered=obj[u'ordered']) elif typ == u'series': dtype = dtype_for(obj[u'dtype']) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 91e70e942089c..fad6237d851fb 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -265,12 +265,8 @@ def python_unpickler(path): def test_pickle_v0_14_1(): - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) + cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, + categories=['a', 'b', 'c', 'd']) pickle_path = os.path.join(tm.get_data_path(), 'categorical_0_14_1.pickle') # This code was executed once on v0.14.1 to generate the pickle: @@ -286,12 +282,8 @@ def test_pickle_v0_15_2(): # ordered -> _ordered # GH 9347 - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) + cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, + categories=['a', 'b', 'c', 'd']) pickle_path = os.path.join(tm.get_data_path(), 'categorical_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2d5e98d49e152..6c8aeba704c7b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -682,7 +682,7 @@ def test_print(self): def test_big_print(self): factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'], - name='cat', fastpath=True) + fastpath=True) expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Length: 600", "Categories (3, object): [a, b, c]"] expected = "\n".join(expected) @@ -1635,15 +1635,6 @@ def test_deprecated_from_array(self): with tm.assert_produces_warning(FutureWarning): Categorical.from_array([0, 1]) - def test_removed_names_produces_warning(self): - - # 10482 - with tm.assert_produces_warning(UserWarning): - Categorical([0, 1], name="a") - - with tm.assert_produces_warning(UserWarning): - Categorical.from_codes([1, 2], ["a", "b", "c"], name="a") - def test_datetime_categorical_comparison(self): dt_cat = pd.Categorical( pd.date_range('2014-01-01', periods=3), ordered=True) From 326d738f4dfad55e50dd9ffda72ebf0cb9250cc0 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 11 Mar 2017 12:24:10 -0500 Subject: [PATCH 279/338] MAINT: Remove testing.assert_isinstance (#15652) Deprecated in 0.17.0 xref gh-10458 --- pandas/tests/test_testing.py | 3 --- pandas/util/testing.py | 5 ----- 2 files changed, 8 deletions(-) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 2fb58ef70e3cb..e5cb953cb35a5 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -765,9 
+765,6 @@ def test_warning(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.assertNotAlmostEquals(1, 2) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_isinstance(Series([1, 2]), Series, msg='xxx') - class TestLocale(tm.TestCase): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index ec30a9376a9da..74ff480a9c198 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -991,11 +991,6 @@ def assertIsInstance(obj, cls, msg=''): raise AssertionError(err_msg.format(msg, cls, type(obj))) -def assert_isinstance(obj, class_type_or_tuple, msg=''): - return deprecate('assert_isinstance', assertIsInstance)( - obj, class_type_or_tuple, msg=msg) - - def assertNotIsInstance(obj, cls, msg=''): """Test that obj is not an instance of cls (which can be a class or a tuple of classes, From 4cb1cd70433092fc60e410a7a641b4deaea47994 Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Sun, 12 Mar 2017 11:15:17 -0400 Subject: [PATCH 280/338] CLN: Cleanup tests for .rank() closes #15640 Author: Rouz Azari Closes #15658 from rouzazari/GH15640 and squashes the following commits: d0a2abc [Rouz Azari] Fixed linting error with datetime.datetime import 9580af0 [Rouz Azari] CLN: Cleanup tests for .rank() --- pandas/tests/frame/test_analytics.py | 169 +------------- pandas/tests/frame/test_rank.py | 268 +++++++++++++++++++++ pandas/tests/series/test_analytics.py | 201 ---------------- pandas/tests/series/test_rank.py | 323 ++++++++++++++++++++++++++ pandas/tests/test_stats.py | 185 --------------- 5 files changed, 592 insertions(+), 554 deletions(-) create mode 100644 pandas/tests/frame/test_rank.py create mode 100644 pandas/tests/series/test_rank.py delete mode 100644 pandas/tests/test_stats.py diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4758ee1323ca0..6c917444f9f43 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2,7 +2,7 @@ from __future__ import print_function -from datetime import timedelta, datetime +from datetime import timedelta from distutils.version import LooseVersion import sys import pytest @@ -642,173 +642,6 @@ def test_cumprod(self): df.cumprod(0) df.cumprod(1) - def test_rank(self): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - self.frame['A'][::2] = np.nan - self.frame['B'][::3] = np.nan - self.frame['C'][::4] = np.nan - self.frame['D'][::5] = np.nan - - ranks0 = self.frame.rank() - ranks1 = self.frame.rank(1) - mask = np.isnan(self.frame.values) - - fvals = self.frame.fillna(np.inf).values - - exp0 = np.apply_along_axis(rankdata, 0, fvals) - exp0[mask] = np.nan - - exp1 = np.apply_along_axis(rankdata, 1, fvals) - exp1[mask] = np.nan - - tm.assert_almost_equal(ranks0.values, exp0) - tm.assert_almost_equal(ranks1.values, exp1) - - # integers - df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4))) - - result = df.rank() - exp = df.astype(float).rank() - tm.assert_frame_equal(result, exp) - - result = df.rank(1) - exp = df.astype(float).rank(1) - tm.assert_frame_equal(result, exp) - - def test_rank2(self): - df = DataFrame([[1, 3, 2], [1, 2, 3]]) - expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 - result = df.rank(1, pct=True) - tm.assert_frame_equal(result, expected) - - df = DataFrame([[1, 3, 2], [1, 2, 3]]) - expected = df.rank(0) / 2.0 - result = df.rank(0, pct=True) - tm.assert_frame_equal(result, expected) - - df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) - expected = 
DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) - result = df.rank(1, numeric_only=False) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) - result = df.rank(0, numeric_only=False) - tm.assert_frame_equal(result, expected) - - df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) - expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) - result = df.rank(1, numeric_only=False) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) - result = df.rank(0, numeric_only=False) - tm.assert_frame_equal(result, expected) - - # f7u12, this does not work without extensive workaround - data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 1)]] - df = DataFrame(data) - - # check the rank - expected = DataFrame([[2., nan, 1.], - [2., 3., 1.]]) - result = df.rank(1, numeric_only=False, ascending=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[1., nan, 2.], - [2., 1., 3.]]) - result = df.rank(1, numeric_only=False, ascending=False) - tm.assert_frame_equal(result, expected) - - # mixed-type frames - self.mixed_frame['datetime'] = datetime.now() - self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1) - - result = self.mixed_frame.rank(1) - expected = self.mixed_frame.rank(1, numeric_only=True) - tm.assert_frame_equal(result, expected) - - df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, - 1e60, 1e80, 1e-30]}) - exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]}) - tm.assert_frame_equal(df.rank(), exp) - - def test_rank_na_option(self): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - self.frame['A'][::2] = np.nan - self.frame['B'][::3] = np.nan - self.frame['C'][::4] = np.nan - self.frame['D'][::5] = np.nan - - # bottom - ranks0 = self.frame.rank(na_option='bottom') - ranks1 = self.frame.rank(1, na_option='bottom') - - fvals = self.frame.fillna(np.inf).values - - exp0 = np.apply_along_axis(rankdata, 0, fvals) - exp1 = np.apply_along_axis(rankdata, 1, fvals) - - tm.assert_almost_equal(ranks0.values, exp0) - tm.assert_almost_equal(ranks1.values, exp1) - - # top - ranks0 = self.frame.rank(na_option='top') - ranks1 = self.frame.rank(1, na_option='top') - - fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values - fval1 = self.frame.T - fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T - fval1 = fval1.fillna(np.inf).values - - exp0 = np.apply_along_axis(rankdata, 0, fval0) - exp1 = np.apply_along_axis(rankdata, 1, fval1) - - tm.assert_almost_equal(ranks0.values, exp0) - tm.assert_almost_equal(ranks1.values, exp1) - - # descending - - # bottom - ranks0 = self.frame.rank(na_option='top', ascending=False) - ranks1 = self.frame.rank(1, na_option='top', ascending=False) - - fvals = self.frame.fillna(np.inf).values - - exp0 = np.apply_along_axis(rankdata, 0, -fvals) - exp1 = np.apply_along_axis(rankdata, 1, -fvals) - - tm.assert_almost_equal(ranks0.values, exp0) - tm.assert_almost_equal(ranks1.values, exp1) - - # descending - - # top - ranks0 = self.frame.rank(na_option='bottom', ascending=False) - ranks1 = self.frame.rank(1, na_option='bottom', ascending=False) - - fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values - fval1 = self.frame.T - fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T - fval1 = fval1.fillna(np.inf).values - - exp0 = np.apply_along_axis(rankdata, 0, -fval0) - exp1 = np.apply_along_axis(rankdata, 1, -fval1) - - 
tm.assert_numpy_array_equal(ranks0.values, exp0) - tm.assert_numpy_array_equal(ranks1.values, exp1) - - def test_rank_axis(self): - # check if using axes' names gives the same result - df = pd.DataFrame([[2, 1], [4, 3]]) - tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index')) - tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) - def test_sem(self): alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) self._check_stat_op('sem', alt) diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py new file mode 100644 index 0000000000000..151a89888c329 --- /dev/null +++ b/pandas/tests/frame/test_rank.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- +from datetime import timedelta, datetime +from distutils.version import LooseVersion +from numpy import nan +import numpy as np + +from pandas import Series, DataFrame + +from pandas.compat import product +from pandas.util.testing import assert_frame_equal +import pandas.util.testing as tm +from pandas.tests.frame.common import TestData + + +class TestRank(tm.TestCase, TestData): + s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + df = DataFrame({'A': s, 'B': s}) + + results = { + 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, + 3.5, 1.5, 8.0, nan, 5.5]), + 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + } + + def test_rank(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + self.frame['A'][::2] = np.nan + self.frame['B'][::3] = np.nan + self.frame['C'][::4] = np.nan + self.frame['D'][::5] = np.nan + + ranks0 = self.frame.rank() + ranks1 = self.frame.rank(1) + mask = np.isnan(self.frame.values) + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp0[mask] = np.nan + + exp1 = np.apply_along_axis(rankdata, 1, fvals) + exp1[mask] = np.nan + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # integers + df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4))) + + result = df.rank() + exp = df.astype(float).rank() + tm.assert_frame_equal(result, exp) + + result = df.rank(1) + exp = df.astype(float).rank(1) + tm.assert_frame_equal(result, exp) + + def test_rank2(self): + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 + result = df.rank(1, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = df.rank(0) / 2.0 + result = df.rank(0, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) + expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) + expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + # f7u12, this does not work without extensive workaround + data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 
1, 1)]] + df = DataFrame(data) + + # check the rank + expected = DataFrame([[2., nan, 1.], + [2., 3., 1.]]) + result = df.rank(1, numeric_only=False, ascending=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[1., nan, 2.], + [2., 1., 3.]]) + result = df.rank(1, numeric_only=False, ascending=False) + tm.assert_frame_equal(result, expected) + + # mixed-type frames + self.mixed_frame['datetime'] = datetime.now() + self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1) + + result = self.mixed_frame.rank(1) + expected = self.mixed_frame.rank(1, numeric_only=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, + 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]}) + tm.assert_frame_equal(df.rank(), exp) + + def test_rank_na_option(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + self.frame['A'][::2] = np.nan + self.frame['B'][::3] = np.nan + self.frame['C'][::4] = np.nan + self.frame['D'][::5] = np.nan + + # bottom + ranks0 = self.frame.rank(na_option='bottom') + ranks1 = self.frame.rank(1, na_option='bottom') + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp1 = np.apply_along_axis(rankdata, 1, fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # top + ranks0 = self.frame.rank(na_option='top') + ranks1 = self.frame.rank(1, na_option='top') + + fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values + fval1 = self.frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fval0) + exp1 = np.apply_along_axis(rankdata, 1, fval1) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # bottom + ranks0 = self.frame.rank(na_option='top', ascending=False) + ranks1 = self.frame.rank(1, na_option='top', ascending=False) + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, -fvals) + exp1 = np.apply_along_axis(rankdata, 1, -fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # top + ranks0 = self.frame.rank(na_option='bottom', ascending=False) + ranks1 = self.frame.rank(1, na_option='bottom', ascending=False) + + fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values + fval1 = self.frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, -fval0) + exp1 = np.apply_along_axis(rankdata, 1, -fval1) + + tm.assert_numpy_array_equal(ranks0.values, exp0) + tm.assert_numpy_array_equal(ranks1.values, exp1) + + def test_rank_axis(self): + # check if using axes' names gives the same result + df = DataFrame([[2, 1], [4, 3]]) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index')) + tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) + + def test_rank_methods_frame(self): + tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + import scipy + from scipy.stats import rankdata + + xs = np.random.randint(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 + cols = [chr(ord('z') - i) for i in range(xs.shape[1])] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + df = DataFrame(vals, columns=cols) + + for ax in [0, 1]: + for m in ['average', 'min', 'max', 'first', 'dense']: + result = df.rank(axis=ax, method=m) + sprank = 
np.apply_along_axis( + rankdata, ax, vals, + m if m != 'first' else 'ordinal') + sprank = sprank.astype(np.float64) + expected = DataFrame(sprank, columns=cols) + + if LooseVersion(scipy.__version__) >= '0.17.0': + expected = expected.astype('float64') + tm.assert_frame_equal(result, expected) + + def test_rank_descending(self): + dtypes = ['O', 'f8', 'i8'] + + for dtype, method in product(dtypes, self.results): + if 'i' in dtype: + df = self.df.dropna() + else: + df = self.df.astype(dtype) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + assert_frame_equal(res, expected) + + if method == 'first' and dtype == 'O': + continue + + expected = (df.max() - df).rank(method=method) + + if dtype != 'O': + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, + numeric_only=False) + assert_frame_equal(res3, expected) + + def test_rank_2d_tie_methods(self): + df = self.df + + def _check2d(df, expected, method='average', axis=0): + exp_df = DataFrame({'A': expected, 'B': expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + assert_frame_equal(result, exp_df) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, axis, dtype in product(results, [0, 1], dtypes): + if (dtype, method) in disabled: + continue + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, results[method], method=method, axis=axis) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b6985abb64e40..c2543581dca50 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -969,207 +969,6 @@ def test_drop_duplicates(self): sc.drop_duplicates(keep=False, inplace=True) assert_series_equal(sc, s[~expected]) - def test_rank(self): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - self.ts[::2] = np.nan - self.ts[:10][::3] = 4. 
- - ranks = self.ts.rank() - oranks = self.ts.astype('O').rank() - - assert_series_equal(ranks, oranks) - - mask = np.isnan(self.ts) - filled = self.ts.fillna(np.inf) - - # rankdata returns a ndarray - exp = Series(rankdata(filled), index=filled.index, name='ts') - exp[mask] = np.nan - - tm.assert_series_equal(ranks, exp) - - iseries = Series(np.arange(5).repeat(2)) - - iranks = iseries.rank() - exp = iseries.astype(float).rank() - assert_series_equal(iranks, exp) - iseries = Series(np.arange(5)) + 1.0 - exp = iseries / 5.0 - iranks = iseries.rank(pct=True) - - assert_series_equal(iranks, exp) - - iseries = Series(np.repeat(1, 100)) - exp = Series(np.repeat(0.505, 100)) - iranks = iseries.rank(pct=True) - assert_series_equal(iranks, exp) - - iseries[1] = np.nan - exp = Series(np.repeat(50.0 / 99.0, 100)) - exp[1] = np.nan - iranks = iseries.rank(pct=True) - assert_series_equal(iranks, exp) - - iseries = Series(np.arange(5)) + 1.0 - iseries[4] = np.nan - exp = iseries / 4.0 - iranks = iseries.rank(pct=True) - assert_series_equal(iranks, exp) - - iseries = Series(np.repeat(np.nan, 100)) - exp = iseries.copy() - iranks = iseries.rank(pct=True) - assert_series_equal(iranks, exp) - - iseries = Series(np.arange(5)) + 1 - iseries[4] = np.nan - exp = iseries / 4.0 - iranks = iseries.rank(pct=True) - assert_series_equal(iranks, exp) - - rng = date_range('1/1/1990', periods=5) - iseries = Series(np.arange(5), rng) + 1 - iseries.iloc[4] = np.nan - exp = iseries / 4.0 - iranks = iseries.rank(pct=True) - assert_series_equal(iranks, exp) - - iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1]) - exp = Series([2, 1, 3, 5, 4, 6.0]) - iranks = iseries.rank() - assert_series_equal(iranks, exp) - - # GH 5968 - iseries = Series(['3 day', '1 day 10m', '-2 day', pd.NaT], - dtype='m8[ns]') - exp = Series([3, 2, 1, np.nan]) - iranks = iseries.rank() - assert_series_equal(iranks, exp) - - values = np.array( - [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40 - ], dtype='float64') - random_order = np.random.permutation(len(values)) - iseries = Series(values[random_order]) - exp = Series(random_order + 1.0, dtype='float64') - iranks = iseries.rank() - assert_series_equal(iranks, exp) - - def test_rank_categorical(self): - # GH issue #15420 rank incorrectly orders ordered categories - - # Test ascending/descending ranking for ordered categoricals - exp = pd.Series([1., 2., 3., 4., 5., 6.]) - exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) - ordered = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ).astype( - 'category', - categories=['first', 'second', 'third', - 'fourth', 'fifth', 'sixth'], - ordered=True - ) - assert_series_equal(ordered.rank(), exp) - assert_series_equal(ordered.rank(ascending=False), exp_desc) - - # Unordered categoricals should be ranked as objects - unordered = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], - ).astype( - 'category', - categories=['first', 'second', 'third', - 'fourth', 'fifth', 'sixth'], - ordered=False - ) - exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.]) - res = unordered.rank() - assert_series_equal(res, exp_unordered) - - unordered1 = pd.Series( - [1, 2, 3, 4, 5, 6], - ).astype( - 'category', - categories=[1, 2, 3, 4, 5, 6], - ordered=False - ) - exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.]) - res1 = unordered1.rank() - assert_series_equal(res1, exp_unordered1) - - # Test na_option for rank data - na_ser = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] - 
).astype( - 'category', - categories=[ - 'first', 'second', 'third', 'fourth', - 'fifth', 'sixth', 'seventh' - ], - ordered=True - ) - - exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.]) - exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.]) - exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN]) - - assert_series_equal(na_ser.rank(na_option='top'), exp_top) - assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) - assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) - - # Test na_option for rank data with ascending False - exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.]) - exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.]) - exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN]) - - assert_series_equal( - na_ser.rank(na_option='top', ascending=False), - exp_top - ) - assert_series_equal( - na_ser.rank(na_option='bottom', ascending=False), - exp_bot - ) - assert_series_equal( - na_ser.rank(na_option='keep', ascending=False), - exp_keep - ) - - # Test with pct=True - na_ser = pd.Series( - ['first', 'second', 'third', 'fourth', np.NaN], - ).astype( - 'category', - categories=['first', 'second', 'third', 'fourth'], - ordered=True - ) - exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2]) - exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.]) - exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN]) - - assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) - assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) - assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) - - def test_rank_signature(self): - s = Series([0, 1]) - s.rank(method='average') - self.assertRaises(ValueError, s.rank, 'average') - - def test_rank_inf(self): - pytest.skip('DataFrame.rank does not currently rank ' - 'np.inf and -np.inf properly') - - values = np.array( - [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, - 2, 40, np.inf], dtype='float64') - random_order = np.random.permutation(len(values)) - iseries = Series(values[random_order]) - exp = Series(random_order + 1.0, dtype='float64') - iranks = iseries.rank() - assert_series_equal(iranks, exp) - def test_clip(self): val = self.ts.median() diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py new file mode 100644 index 0000000000000..99257b343310f --- /dev/null +++ b/pandas/tests/series/test_rank.py @@ -0,0 +1,323 @@ +# -*- coding: utf-8 -*- +from pandas import compat + +import pytest + +from distutils.version import LooseVersion +from numpy import nan +import numpy as np + +from pandas import (Series, date_range, NaT) + +from pandas.compat import product +from pandas.util.testing import assert_series_equal +import pandas.util.testing as tm +from pandas.tests.series.common import TestData + + +class TestSeriesRank(tm.TestCase, TestData): + s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + + results = { + 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, + 3.5, 1.5, 8.0, nan, 5.5]), + 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + } + + def test_rank(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + self.ts[::2] = np.nan + self.ts[:10][::3] = 4. 
+ + ranks = self.ts.rank() + oranks = self.ts.astype('O').rank() + + assert_series_equal(ranks, oranks) + + mask = np.isnan(self.ts) + filled = self.ts.fillna(np.inf) + + # rankdata returns a ndarray + exp = Series(rankdata(filled), index=filled.index, name='ts') + exp[mask] = np.nan + + tm.assert_series_equal(ranks, exp) + + iseries = Series(np.arange(5).repeat(2)) + + iranks = iseries.rank() + exp = iseries.astype(float).rank() + assert_series_equal(iranks, exp) + iseries = Series(np.arange(5)) + 1.0 + exp = iseries / 5.0 + iranks = iseries.rank(pct=True) + + assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(1, 100)) + exp = Series(np.repeat(0.505, 100)) + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries[1] = np.nan + exp = Series(np.repeat(50.0 / 99.0, 100)) + exp[1] = np.nan + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1.0 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(np.nan, 100)) + exp = iseries.copy() + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + rng = date_range('1/1/1990', periods=5) + iseries = Series(np.arange(5), rng) + 1 + iseries.iloc[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1]) + exp = Series([2, 1, 3, 5, 4, 6.0]) + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + # GH 5968 + iseries = Series(['3 day', '1 day 10m', '-2 day', NaT], + dtype='m8[ns]') + exp = Series([3, 2, 1, np.nan]) + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + values = np.array( + [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40 + ], dtype='float64') + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype='float64') + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + def test_rank_categorical(self): + # GH issue #15420 rank incorrectly orders ordered categories + + # Test ascending/descending ranking for ordered categoricals + exp = Series([1., 2., 3., 4., 5., 6.]) + exp_desc = Series([6., 5., 4., 3., 2., 1.]) + ordered = Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] + ).astype( + 'category', + categories=['first', 'second', 'third', + 'fourth', 'fifth', 'sixth'], + ordered=True + ) + assert_series_equal(ordered.rank(), exp) + assert_series_equal(ordered.rank(ascending=False), exp_desc) + + # Unordered categoricals should be ranked as objects + unordered = Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype( + 'category', + categories=['first', 'second', 'third', + 'fourth', 'fifth', 'sixth'], + ordered=False + ) + exp_unordered = Series([2., 4., 6., 3., 1., 5.]) + res = unordered.rank() + assert_series_equal(res, exp_unordered) + + unordered1 = Series( + [1, 2, 3, 4, 5, 6], + ).astype( + 'category', + categories=[1, 2, 3, 4, 5, 6], + ordered=False + ) + exp_unordered1 = Series([1., 2., 3., 4., 5., 6.]) + res1 = unordered1.rank() + assert_series_equal(res1, exp_unordered1) + + # Test na_option for rank data + na_ser = Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] + ).astype( + 'category', + 
categories=[ + 'first', 'second', 'third', 'fourth', + 'fifth', 'sixth', 'seventh' + ], + ordered=True + ) + + exp_top = Series([2., 3., 4., 5., 6., 7., 1.]) + exp_bot = Series([1., 2., 3., 4., 5., 6., 7.]) + exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top'), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) + + # Test na_option for rank data with ascending False + exp_top = Series([7., 6., 5., 4., 3., 2., 1.]) + exp_bot = Series([6., 5., 4., 3., 2., 1., 7.]) + exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN]) + + assert_series_equal( + na_ser.rank(na_option='top', ascending=False), + exp_top + ) + assert_series_equal( + na_ser.rank(na_option='bottom', ascending=False), + exp_bot + ) + assert_series_equal( + na_ser.rank(na_option='keep', ascending=False), + exp_keep + ) + + # Test with pct=True + na_ser = Series( + ['first', 'second', 'third', 'fourth', np.NaN], + ).astype( + 'category', + categories=['first', 'second', 'third', 'fourth'], + ordered=True + ) + exp_top = Series([0.4, 0.6, 0.8, 1., 0.2]) + exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.]) + exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + + def test_rank_signature(self): + s = Series([0, 1]) + s.rank(method='average') + self.assertRaises(ValueError, s.rank, 'average') + + def test_rank_inf(self): + pytest.skip('DataFrame.rank does not currently rank ' + 'np.inf and -np.inf properly') + + values = np.array( + [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, + 2, 40, np.inf], dtype='float64') + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype='float64') + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + def test_rank_tie_methods(self): + s = self.s + + def _check(s, expected, method='average'): + result = s.rank(method=method) + tm.assert_series_equal(result, Series(expected)) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, dtype in product(results, dtypes): + if (dtype, method) in disabled: + continue + series = s if dtype is None else s.astype(dtype) + _check(series, results[method], method=method) + + def test_rank_methods_series(self): + tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + import scipy + from scipy.stats import rankdata + + xs = np.random.randn(9) + xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates + np.random.shuffle(xs) + + index = [chr(ord('a') + i) for i in range(len(xs))] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + ts = Series(vals, index=index) + + for m in ['average', 'min', 'max', 'first', 'dense']: + result = ts.rank(method=m) + sprank = rankdata(vals, m if m != 'first' else 'ordinal') + expected = Series(sprank, index=index) + + if LooseVersion(scipy.__version__) >= '0.17.0': + expected = expected.astype('float64') + tm.assert_series_equal(result, expected) + + def test_rank_dense_method(self): + dtypes = ['O', 'f8', 'i8'] + in_out = [([1], [1]), + ([2], [1]), + ([0], [1]), + ([2, 2], [1, 1]), + ([1, 2, 3], [1, 2, 3]), + ([4, 2, 1], [3, 2, 1],), + ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), + ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] + + for ser, exp 
in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense') + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + def test_rank_descending(self): + dtypes = ['O', 'f8', 'i8'] + + for dtype, method in product(dtypes, self.results): + if 'i' in dtype: + s = self.s.dropna() + else: + s = self.s.astype(dtype) + + res = s.rank(ascending=False) + expected = (s.max() - s).rank() + assert_series_equal(res, expected) + + if method == 'first' and dtype == 'O': + continue + + expected = (s.max() - s).rank(method=method) + res2 = s.rank(method=method, ascending=False) + assert_series_equal(res2, expected) + + def test_rank_int(self): + s = self.s.dropna().astype('i8') + + for method, res in compat.iteritems(self.results): + result = s.rank(method=method) + expected = Series(res).dropna() + expected.index = result.index + assert_series_equal(result, expected) + + def test_rank_object_bug(self): + # GH 13445 + + # smoke tests + Series([np.nan] * 32).astype(object).rank(ascending=True) + Series([np.nan] * 32).astype(object).rank(ascending=False) diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py deleted file mode 100644 index 118c4147a2019..0000000000000 --- a/pandas/tests/test_stats.py +++ /dev/null @@ -1,185 +0,0 @@ -# -*- coding: utf-8 -*- -from pandas import compat - -from distutils.version import LooseVersion -from numpy import nan -import numpy as np - -from pandas import Series, DataFrame - -from pandas.compat import product -from pandas.util.testing import (assert_frame_equal, assert_series_equal) -import pandas.util.testing as tm - - -class TestRank(tm.TestCase): - s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) - df = DataFrame({'A': s, 'B': s}) - - results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, - 3.5, 1.5, 8.0, nan, 5.5]), - 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), - 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), - 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), - } - - def test_rank_tie_methods(self): - s = self.s - - def _check(s, expected, method='average'): - result = s.rank(method=method) - tm.assert_series_equal(result, Series(expected)) - - dtypes = [None, object] - disabled = set([(object, 'first')]) - results = self.results - - for method, dtype in product(results, dtypes): - if (dtype, method) in disabled: - continue - series = s if dtype is None else s.astype(dtype) - _check(series, results[method], method=method) - - def test_rank_methods_series(self): - tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') - import scipy - from scipy.stats import rankdata - - xs = np.random.randn(9) - xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates - np.random.shuffle(xs) - - index = [chr(ord('a') + i) for i in range(len(xs))] - - for vals in [xs, xs + 1e6, xs * 1e-6]: - ts = Series(vals, index=index) - - for m in ['average', 'min', 'max', 'first', 'dense']: - result = ts.rank(method=m) - sprank = rankdata(vals, m if m != 'first' else 'ordinal') - expected = Series(sprank, index=index) - - if LooseVersion(scipy.__version__) >= '0.17.0': - expected = expected.astype('float64') - tm.assert_series_equal(result, expected) - - def test_rank_methods_frame(self): - tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') - import scipy - from scipy.stats import rankdata - - xs = np.random.randint(0, 21, (100, 26)) - xs = (xs - 10.0) / 10.0 - cols = [chr(ord('z') - i) for i in 
range(xs.shape[1])] - - for vals in [xs, xs + 1e6, xs * 1e-6]: - df = DataFrame(vals, columns=cols) - - for ax in [0, 1]: - for m in ['average', 'min', 'max', 'first', 'dense']: - result = df.rank(axis=ax, method=m) - sprank = np.apply_along_axis( - rankdata, ax, vals, - m if m != 'first' else 'ordinal') - sprank = sprank.astype(np.float64) - expected = DataFrame(sprank, columns=cols) - - if LooseVersion(scipy.__version__) >= '0.17.0': - expected = expected.astype('float64') - tm.assert_frame_equal(result, expected) - - def test_rank_dense_method(self): - dtypes = ['O', 'f8', 'i8'] - in_out = [([1], [1]), - ([2], [1]), - ([0], [1]), - ([2, 2], [1, 1]), - ([1, 2, 3], [1, 2, 3]), - ([4, 2, 1], [3, 2, 1],), - ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), - ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] - - for ser, exp in in_out: - for dtype in dtypes: - s = Series(ser).astype(dtype) - result = s.rank(method='dense') - expected = Series(exp).astype(result.dtype) - assert_series_equal(result, expected) - - def test_rank_descending(self): - dtypes = ['O', 'f8', 'i8'] - - for dtype, method in product(dtypes, self.results): - if 'i' in dtype: - s = self.s.dropna() - df = self.df.dropna() - else: - s = self.s.astype(dtype) - df = self.df.astype(dtype) - - res = s.rank(ascending=False) - expected = (s.max() - s).rank() - assert_series_equal(res, expected) - - res = df.rank(ascending=False) - expected = (df.max() - df).rank() - assert_frame_equal(res, expected) - - if method == 'first' and dtype == 'O': - continue - - expected = (s.max() - s).rank(method=method) - res2 = s.rank(method=method, ascending=False) - assert_series_equal(res2, expected) - - expected = (df.max() - df).rank(method=method) - - if dtype != 'O': - res2 = df.rank(method=method, ascending=False, - numeric_only=True) - assert_frame_equal(res2, expected) - - res3 = df.rank(method=method, ascending=False, - numeric_only=False) - assert_frame_equal(res3, expected) - - def test_rank_2d_tie_methods(self): - df = self.df - - def _check2d(df, expected, method='average', axis=0): - exp_df = DataFrame({'A': expected, 'B': expected}) - - if axis == 1: - df = df.T - exp_df = exp_df.T - - result = df.rank(method=method, axis=axis) - assert_frame_equal(result, exp_df) - - dtypes = [None, object] - disabled = set([(object, 'first')]) - results = self.results - - for method, axis, dtype in product(results, [0, 1], dtypes): - if (dtype, method) in disabled: - continue - frame = df if dtype is None else df.astype(dtype) - _check2d(frame, results[method], method=method, axis=axis) - - def test_rank_int(self): - s = self.s.dropna().astype('i8') - - for method, res in compat.iteritems(self.results): - result = s.rank(method=method) - expected = Series(res).dropna() - expected.index = result.index - assert_series_equal(result, expected) - - def test_rank_object_bug(self): - # GH 13445 - - # smoke tests - Series([np.nan] * 32).astype(object).rank(ascending=True) - Series([np.nan] * 32).astype(object).rank(ascending=False) From 971908ed72247f75d44bcdfdc60897f31825bac0 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Stewart" Date: Sun, 12 Mar 2017 15:55:32 -0500 Subject: [PATCH 281/338] DOC: fix typo in timeseries documentation (#15666) --- doc/source/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index e09d240ed91b7..c0c178ad2fb49 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -610,7 +610,7 @@ There are several time/date properties that one can access from ``Timestamp`` or dayofweek,"The numer of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" weekday_name,"The name of the day in a week (ex: Friday)" - quarter,"Quarter of the date: Jan=Mar = 1, Apr-Jun = 2, etc." + quarter,"Quarter of the date: Jan-Mar = 1, Apr-Jun = 2, etc." days_in_month,"The number of days in the month of the datetime" is_month_start,"Logical indicating if first day of month (defined by frequency)" is_month_end,"Logical indicating if last day of month (defined by frequency)" From 8f3a2d66643e3fcbc04c11e492e9d58fb40da0d5 Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Sun, 12 Mar 2017 17:40:05 -0500 Subject: [PATCH 282/338] Fix another typo in the timeseries documentation (#15667) --- doc/source/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index c0c178ad2fb49..7136b15a7633a 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -607,7 +607,7 @@ There are several time/date properties that one can access from ``Timestamp`` or dayofyear,"The ordinal day of year" weekofyear,"The week ordinal of the year" week,"The week ordinal of the year" - dayofweek,"The numer of the day of the week with Monday=0, Sunday=6" + dayofweek,"The number of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" weekday_name,"The name of the day in a week (ex: Friday)" quarter,"Quarter of the date: Jan-Mar = 1, Apr-Jun = 2, etc." 
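As a quick illustration of the properties in the corrected table (values shown reflect pandas of this era; weekday_name was later deprecated in favor of Timestamp.day_name()):

    import pandas as pd

    ts = pd.Timestamp('2017-03-12')   # a Sunday in the first quarter
    ts.dayofweek      # 6  (Monday=0, Sunday=6)
    ts.weekday_name   # 'Sunday'
    ts.quarter        # 1  (Jan-Mar = 1, Apr-Jun = 2, etc.)
    ts.days_in_month  # 31 (days in March)
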
From 94d9188e90cb9600d1fb5f70d8b68cbbfe52196f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Mar 2017 14:01:40 -0400 Subject: [PATCH 283/338] BLD: make 3.6 use *only* conda-forge channels --- .travis.yml | 1 + ci/install_travis.sh | 7 +++++-- ci/requirements-3.6.run | 1 + ci/requirements-3.6.sh | 7 ------- 4 files changed, 7 insertions(+), 9 deletions(-) delete mode 100644 ci/requirements-3.6.sh diff --git a/.travis.yml b/.travis.yml index 97bf881f3b6fc..b0331941e2a1e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -86,6 +86,7 @@ matrix: - JOB_NAME: "36" - TEST_ARGS="--skip-slow --skip-network" - PANDAS_TESTING_MODE="deprecate" + - CONDA_FORGE=true addons: apt: packages: diff --git a/ci/install_travis.sh b/ci/install_travis.sh index b337f6e443be2..12202b4ceee70 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -53,14 +53,17 @@ conda config --set ssl_verify false || exit 1 conda config --set always_yes true --set changeps1 false || exit 1 conda update -q conda +echo "[add channels]" # add the pandas channel to take priority # to add extra packages -echo "[add channels]" conda config --add channels pandas || exit 1 conda config --remove channels defaults || exit 1 conda config --add channels defaults || exit 1 -conda install anaconda-client +if [ "$CONDA_FORGE" ]; then + # add conda-forge channel as priority + conda config --add channels conda-forge || exit 1 +fi # Useful for debugging any issues with conda conda info -a || exit 1 diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index 9a6c1c7edbc5e..41c9680ce1b7e 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -14,6 +14,7 @@ html5lib jinja2 sqlalchemy pymysql +feather-format # psycopg2 (not avail on defaults ATM) beautifulsoup4 s3fs diff --git a/ci/requirements-3.6.sh b/ci/requirements-3.6.sh deleted file mode 100644 index 7d88ede751ec8..0000000000000 --- a/ci/requirements-3.6.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install 36" - -conda install -n pandas -c conda-forge feather-format From e17ba3ddd603ffd64cb5d9b40d5d33257bfdb7e3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Mar 2017 19:27:30 -0400 Subject: [PATCH 284/338] TST: skip scipy tests for >= 0.19.0 as needed in interpolation / window / sparse closes #15668 --- pandas/tests/frame/test_missing.py | 3 +- pandas/tests/frame/test_rank.py | 3 +- pandas/tests/series/test_missing.py | 7 +++-- pandas/tests/series/test_rank.py | 3 +- pandas/tests/sparse/test_frame.py | 49 +++++++++++++++++++++++++++-- pandas/tests/test_nanops.py | 14 +++------ pandas/tests/test_window.py | 10 +++--- pandas/util/testing.py | 28 ++++++++--------- 8 files changed, 80 insertions(+), 37 deletions(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 80ea01d3a05aa..923ed2e7c3444 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -548,7 +548,8 @@ def test_interp_nan_idx(self): df.interpolate(method='values') def test_interp_various(self): - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') + df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], 'C': [1, 2, 3, 5, 8, 13, 21]}) df = df.set_index('C') diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index 151a89888c329..b115218d76958 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -193,7 +193,8 @@ def test_rank_axis(self): tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) 
def test_rank_methods_frame(self): - tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + tm.skip_if_no_package('scipy', min_version='0.13', + app='scipy.stats.rankdata') import scipy from scipy.stats import rankdata diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 87cfcf32229b4..9e997da517bf6 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -827,7 +827,8 @@ def test_interp_quad(self): assert_series_equal(result, expected) def test_interp_scipy_basic(self): - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') + s = Series([1, 3, np.nan, 12, np.nan, 25]) # slinear expected = Series([1., 3., 7.5, 12., 18.5, 25.]) @@ -1027,8 +1028,8 @@ def test_spline(self): def test_spline_extrapolate(self): tm.skip_if_no_package( - 'scipy', '0.15', - 'setting ext on scipy.interpolate.UnivariateSpline') + 'scipy', min_version='0.15', + app='setting ext on scipy.interpolate.UnivariateSpline') s = Series([1, 2, 3, 4, np.nan, 6, np.nan]) result3 = s.interpolate(method='spline', order=1, ext=3) expected3 = Series([1., 2., 3., 4., 5., 6., 6.]) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 99257b343310f..f47eae3adc3ae 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -246,7 +246,8 @@ def _check(s, expected, method='average'): _check(series, results[method], method=method) def test_rank_methods_series(self): - tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + tm.skip_if_no_package('scipy', min_version='0.13', + app='scipy.stats.rankdata') import scipy from scipy.stats import rankdata diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 4cd5a643ce4be..c0c678c184ee8 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1129,10 +1129,10 @@ def test_isnotnull(self): @pytest.mark.parametrize('index', [None, list('ab')]) # noqa: F811 @pytest.mark.parametrize('columns', [None, list('cd')]) @pytest.mark.parametrize('fill_value', [None, 0, np.nan]) -@pytest.mark.parametrize('dtype', [object, bool, int, float, np.uint16]) +@pytest.mark.parametrize('dtype', [bool, int, float, np.uint16]) def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # GH 4343 - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy') # Make one ndarray and from it one sparse matrix, both to be used for # constructing frames and comparing results @@ -1180,6 +1180,51 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): tm.assert_equal(sdf.to_coo().dtype, np.object_) +@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) # noqa: F811 +def test_from_to_scipy_object(spmatrix, fill_value): + # GH 4343 + dtype = object + columns = list('cd') + index = list('ab') + tm.skip_if_no_package('scipy', max_version='0.19.0') + + # Make one ndarray and from it one sparse matrix, both to be used for + # constructing frames and comparing results + arr = np.eye(2, dtype=dtype) + try: + spm = spmatrix(arr) + assert spm.dtype == arr.dtype + except (TypeError, AssertionError): + # If conversion to sparse fails for this spmatrix type and arr.dtype, + # then the combination is not currently supported in NumPy, so we + # can just skip testing it thoroughly + return + + sdf = pd.SparseDataFrame(spm, index=index, columns=columns, + default_fill_value=fill_value) + + # Expected result construction is kind of tricky for all + # dtype-fill_value 
combinations; easiest to cast to something generic + # and except later on + rarr = arr.astype(object) + rarr[arr == 0] = np.nan + expected = pd.SparseDataFrame(rarr, index=index, columns=columns).fillna( + fill_value if fill_value is not None else np.nan) + + # Assert frame is as expected + sdf_obj = sdf.astype(object) + tm.assert_sp_frame_equal(sdf_obj, expected) + tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense()) + + # Assert spmatrices equal + tm.assert_equal(dict(sdf.to_coo().todok()), dict(spm.todok())) + + # Ensure dtype is preserved if possible + res_dtype = object + tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)}) + tm.assert_equal(sdf.to_coo().dtype, res_dtype) + + class TestSparseDataFrameArithmetic(tm.TestCase): def test_numeric_op_scalar(self): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 937c20d009b6b..75a7555d58ca5 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,7 +5,7 @@ import warnings import numpy as np -from pandas import Series, isnull +from pandas import Series, isnull, _np_version_under1p9 from pandas.types.common import is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm @@ -338,8 +338,7 @@ def test_nanmean_overflow(self): # is now consistent with numpy # numpy < 1.9.0 is not computing this correctly - from distutils.version import LooseVersion - if LooseVersion(np.__version__) >= '1.9.0': + if not _np_version_under1p9: for a in [2 ** 55, -2 ** 55, 20150515061816532]: s = Series(a, index=range(500), dtype=np.int64) result = s.mean() @@ -388,8 +387,7 @@ def test_nanstd(self): allow_tdelta=True, allow_obj='convert') def test_nansem(self): - tm.skip_if_no_package('scipy.stats') - tm._skip_if_scipy_0_17() + tm.skip_if_no_package('scipy', min_version='0.17.0') from scipy.stats import sem self.check_funs_ddof(nanops.nansem, sem, allow_complex=False, allow_str=False, allow_date=False, @@ -448,16 +446,14 @@ def _skew_kurt_wrap(self, values, axis=None, func=None): return result def test_nanskew(self): - tm.skip_if_no_package('scipy.stats') - tm._skip_if_scipy_0_17() + tm.skip_if_no_package('scipy', min_version='0.17.0') from scipy.stats import skew func = partial(self._skew_kurt_wrap, func=skew) self.check_funs(nanops.nanskew, func, allow_complex=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nankurt(self): - tm.skip_if_no_package('scipy.stats') - tm._skip_if_scipy_0_17() + tm.skip_if_no_package('scipy', min_version='0.17.0') from scipy.stats import kurtosis func1 = partial(kurtosis, fisher=True) func = partial(self._skew_kurt_wrap, func=func1) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3f2973a9834ca..b7164d31b2a5e 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -905,7 +905,7 @@ def test_cmov_window_na_min_periods(self): def test_cmov_window_regular(self): # GH 8238 - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] @@ -938,7 +938,7 @@ def test_cmov_window_regular(self): def test_cmov_window_regular_linear_range(self): # GH 8238 - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] @@ -955,7 +955,7 @@ def test_cmov_window_regular_linear_range(self): def test_cmov_window_regular_missing_data(self): # GH 
8238 - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] @@ -988,7 +988,7 @@ def test_cmov_window_regular_missing_data(self): def test_cmov_window_special(self): # GH 8238 - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., @@ -1015,7 +1015,7 @@ def test_cmov_window_special(self): def test_cmov_window_special_linear_range(self): # GH 8238 - tm._skip_if_no_scipy() + tm.skip_if_no_package('scipy', max_version='0.19.0') win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 74ff480a9c198..529ecef3e2d6a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -304,14 +304,6 @@ def _skip_if_no_scipy(): pytest.skip('scipy.sparse missing') -def _skip_if_scipy_0_17(): - import scipy - v = scipy.__version__ - if v >= LooseVersion("0.17.0"): - import pytest - pytest.skip("scipy 0.17") - - def _check_if_lzma(): try: return compat.import_lzma() @@ -2020,15 +2012,18 @@ def __init__(self, *args, **kwargs): # Dependency checks. Copied this from Nipy/Nipype (Copyright of # respective developers, license: BSD-3) -def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion): - """Check that the minimal version of the required package is installed. +def package_check(pkg_name, min_version=None, max_version=None, app='pandas', + checker=LooseVersion): + """Check that the min/max version of the required package is installed. Parameters ---------- pkg_name : string Name of the required package. - version : string, optional + min_version : string, optional Minimal version number for required package. + max_version : string, optional + Max version number for required package. app : string, optional Application that is performing the check. 
For instance, the name of the tutorial being executed that depends on specific @@ -2040,7 +2035,6 @@ def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion): Examples -------- package_check('numpy', '1.3') - package_check('networkx', '1.0', 'tutorial1') """ @@ -2049,8 +2043,10 @@ def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion): msg = '%s requires %s' % (app, pkg_name) else: msg = 'module requires %s' % pkg_name - if version: - msg += ' with version >= %s' % (version,) + if min_version: + msg += ' with version >= %s' % (min_version,) + if max_version: + msg += ' with version < %s' % (max_version,) try: mod = __import__(pkg_name) except ImportError: @@ -2059,7 +2055,9 @@ def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion): have_version = mod.__version__ except AttributeError: pytest.skip('Cannot find version for %s' % pkg_name) - if version and checker(have_version) < checker(version): + if min_version and checker(have_version) < checker(min_version): + pytest.skip(msg) + if max_version and checker(have_version) >= checker(max_version): pytest.skip(msg) From 6bc9ea940ca224fdcc1dea4a8d24a054fe349195 Mon Sep 17 00:00:00 2001 From: mattip Date: Sun, 12 Mar 2017 20:54:52 -0400 Subject: [PATCH 285/338] COMPAT: free parser memory at close() for non-refcnt gc relying on __dealloc__ to clean up malloc() ed memory can lead to a perceived "leak" on PyPy since the garbage collector will not necessarily collect the object as soon as its refcnt reaches 0. Instead, pre-emptively release memory when close() is called The code still maintains backward compatibility for the case where close() is never called Author: mattip Closes #15665 from mattip/pypy-compat and squashes the following commits: eaf50fe [mattip] COMPAT: free parser memory at close() for non-refcnt gc --- pandas/_libs/src/parser/tokenizer.c | 4 ++++ pandas/_libs/src/parser/tokenizer.h | 2 ++ pandas/io/parsers.pyx | 18 ++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 916f06d357473..6b0775e54da0c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -162,6 +162,7 @@ int parser_cleanup(parser_t *self) { if (self->cb_cleanup(self->source) < 0) { status = -1; } + self->cb_cleanup = NULL; } return status; @@ -239,6 +240,9 @@ int parser_init(parser_t *self) { void parser_free(parser_t *self) { // opposite of parser_init parser_cleanup(self); +} + +void parser_del(parser_t *self) { free(self); } diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 9853b5149bee3..b4344e8a6c070 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -243,6 +243,8 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows); void parser_free(parser_t *self); +void parser_del(parser_t *self); + void parser_set_default_options(parser_t *self); void debug_print_parser(parser_t *self); diff --git a/pandas/io/parsers.pyx b/pandas/io/parsers.pyx index a5858accbb6f5..3728cda559050 100644 --- a/pandas/io/parsers.pyx +++ b/pandas/io/parsers.pyx @@ -214,6 +214,7 @@ cdef extern from "parser/tokenizer.h": int parser_init(parser_t *self) nogil void parser_free(parser_t *self) nogil + void parser_del(parser_t *self) nogil int parser_add_skiprow(parser_t *self, int64_t row) int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) @@ -573,8 +574,13 @@ cdef class TextReader: 
def __dealloc__(self):
         parser_free(self.parser)
-        kh_destroy_str(self.true_set)
-        kh_destroy_str(self.false_set)
+        if self.true_set:
+            kh_destroy_str(self.true_set)
+            self.true_set = NULL
+        if self.false_set:
+            kh_destroy_str(self.false_set)
+            self.false_set = NULL
+        parser_del(self.parser)

     def close(self):
         # we need to properly close an open derived
@@ -584,6 +590,14 @@ cdef class TextReader:
                 self.handle.close()
             except:
                 pass
+        # also preemptively free all allocated memory
+        parser_free(self.parser)
+        if self.true_set:
+            kh_destroy_str(self.true_set)
+            self.true_set = NULL
+        if self.false_set:
+            kh_destroy_str(self.false_set)
+            self.false_set = NULL

     def set_error_bad_lines(self, int status):
         self.parser.error_bad_lines = status

From 42fe25f79d08aba1ccb7f86195b97fee7680aafa Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 13 Mar 2017 10:08:01 -0400
Subject: [PATCH 286/338] DOC: typo in merge.rst

---
 doc/source/categorical.rst |  1 +
 doc/source/merging.rst     | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 6d85e1a6560b0..2203737ecd7b5 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -617,6 +617,7 @@ Assigning a `Categorical` to parts of a column of other types will use the value
    df
    df.dtypes

+.. _categorical.merge:

 Merging
 ~~~~~~~
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index 70d2ce5b1a664..0b7f9f18190a4 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -779,7 +779,7 @@ resulting dtype will be upcast.

 .. versionadded:: 0.20.0

-Merging will preserve ``category`` dtypes of the mergands.
+Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals <categorical.merge>`

 The left frame.

 .. ipython:: python

    X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
    X = X.astype('category', categories=['foo', 'bar'])

-   left = DataFrame({'X': X,
-                     'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
+   left = pd.DataFrame({'X': X,
+                        'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})

    left
    left.dtypes

@@ -797,8 +797,8 @@ The right frame.

 ..
ipython:: python - right = DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + right = pd.DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), + 'Z': [1, 2]}) right right.dtypes From 3bca592596bbe95848f9848c6be8139875be4a43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 13 Mar 2017 15:47:49 +0100 Subject: [PATCH 287/338] DOC: correct whatsnew note of #15515 --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f6d5e3df814fc..8a4f2f47b9853 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -880,7 +880,7 @@ Bug Fixes - Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) - Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`, :issue:`11444`) -- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 0.2.0``) (:issue:`9351`) +- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) From a4c8c601a8be8eafc6904be1c4872538f1eab217 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 13 Mar 2017 11:10:38 -0400 Subject: [PATCH 288/338] DOC: typo in merge.rst --- doc/source/merging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 0b7f9f18190a4..fb020727d077e 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -797,7 +797,7 @@ The right frame. .. 
ipython:: python - right = pd.DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), + right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), 'Z': [1, 2]}) right right.dtypes From b8420bafc61cdc6c67b150b7ce4eb1c09d0c34dc Mon Sep 17 00:00:00 2001 From: Aleksey Bilogur Date: Mon, 13 Mar 2017 19:04:39 -0400 Subject: [PATCH 289/338] TST: fix errant tight_layout test (#15671) closes #9351 --- pandas/tests/plotting/common.py | 1 + pandas/tests/plotting/test_hist_method.py | 4 ++-- pandas/tools/plotting.py | 8 ++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 92e2dc7b5d934..c31d8b539ae6f 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -53,6 +53,7 @@ def setUp(self): self.mpl_ge_1_4_0 = plotting._mpl_ge_1_4_0() self.mpl_ge_1_5_0 = plotting._mpl_ge_1_5_0() self.mpl_ge_2_0_0 = plotting._mpl_ge_2_0_0() + self.mpl_ge_2_0_1 = plotting._mpl_ge_2_0_1() if self.mpl_ge_1_4_0: self.bp_n_objects = 7 diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 22de7055e3cea..380bdc12abce4 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -241,8 +241,8 @@ def test_hist_layout(self): @slow # GH 9351 def test_tight_layout(self): - if self.mpl_ge_2_0_0: - df = DataFrame(randn(100, 2)) + if self.mpl_ge_2_0_1: + df = DataFrame(randn(100, 3)) _check_plot_works(df.hist) self.plt.tight_layout() diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index d46c38c117445..d311b0e6d83eb 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -150,6 +150,14 @@ def _mpl_ge_2_0_0(): return False +def _mpl_ge_2_0_1(): + try: + import matplotlib + return matplotlib.__version__ >= LooseVersion('2.0.1') + except ImportError: + return False + + if _mpl_ge_1_5_0(): # Compat with mp 1.5, which uses cycler. 
import cycler From 36cfda492696990c37ff80eb423941dd0e424de0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 13 Mar 2017 19:49:42 -0400 Subject: [PATCH 290/338] DOC: use shared docs on Index._convert_list_indexer (#15678) CLN: push key coercion to the indexes themselves to simplify a bit --- pandas/core/indexing.py | 86 ++++++++++---------------------------- pandas/indexes/base.py | 37 ++++++++++++++++ pandas/indexes/category.py | 19 ++++++--- pandas/indexes/multi.py | 33 +++++++++++++++ pandas/indexes/numeric.py | 1 + 5 files changed, 106 insertions(+), 70 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 546cbd8337e7e..19b7771251da3 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -7,7 +7,6 @@ from pandas.types.generic import ABCDataFrame, ABCPanel, ABCSeries from pandas.types.common import (is_integer_dtype, is_integer, is_float, - is_categorical_dtype, is_list_like, is_sequence, is_iterator, @@ -1087,51 +1086,24 @@ def _getitem_iterable(self, key, axis=0): inds, = key.nonzero() return self.obj.take(inds, axis=axis, convert=False) else: - if isinstance(key, Index): - keyarr = labels._convert_index_indexer(key) - else: - keyarr = _asarray_tuplesafe(key) - keyarr = labels._convert_arr_indexer(keyarr) - - if is_categorical_dtype(labels): - keyarr = labels._shallow_copy(keyarr) - - # have the index handle the indexer and possibly return - # an indexer or raising - indexer = labels._convert_list_indexer(keyarr, kind=self.name) + # Have the index compute an indexer or return None + # if it cannot handle + indexer, keyarr = labels._convert_listlike_indexer( + key, kind=self.name) if indexer is not None: return self.obj.take(indexer, axis=axis) - # this is not the most robust, but... - if (isinstance(labels, MultiIndex) and len(keyarr) and - not isinstance(keyarr[0], tuple)): - level = 0 - else: - level = None - # existing labels are unique and indexer are unique if labels.is_unique and Index(keyarr).is_unique: try: - result = self.obj.reindex_axis(keyarr, axis=axis, - level=level) - - # this is an error as we are trying to find - # keys in a multi-index that don't exist - if isinstance(labels, MultiIndex) and level is not None: - if (hasattr(result, 'ndim') and - not np.prod(result.shape) and len(keyarr)): - raise KeyError("cannot index a multi-index axis " - "with these keys") - - return result - + return self.obj.reindex_axis(keyarr, axis=axis) except AttributeError: # Series if axis != 0: raise AssertionError('axis must be 0') - return self.obj.reindex(keyarr, level=level) + return self.obj.reindex(keyarr) # existing labels are non-unique else: @@ -1225,49 +1197,33 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): if is_nested_tuple(obj, labels): return labels.get_locs(obj) + elif is_list_like_indexer(obj): + if is_bool_indexer(obj): obj = check_bool_indexer(labels, obj) inds, = obj.nonzero() return inds else: - if isinstance(obj, Index): - # want Index objects to pass through untouched - objarr = obj - else: - objarr = _asarray_tuplesafe(obj) - # The index may want to handle a list indexer differently - # by returning an indexer or raising - indexer = labels._convert_list_indexer(objarr, kind=self.name) + # Have the index compute an indexer or return None + # if it cannot handle + indexer, objarr = labels._convert_listlike_indexer( + obj, kind=self.name) if indexer is not None: return indexer - # this is not the most robust, but... 
- if (isinstance(labels, MultiIndex) and - not isinstance(objarr[0], tuple)): - level = 0 - _, indexer = labels.reindex(objarr, level=level) + # unique index + if labels.is_unique: + indexer = check = labels.get_indexer(objarr) - # take all - if indexer is None: - indexer = np.arange(len(labels)) - - check = labels.levels[0].get_indexer(objarr) + # non-unique (dups) else: - level = None - - # unique index - if labels.is_unique: - indexer = check = labels.get_indexer(objarr) - - # non-unique (dups) - else: - (indexer, - missing) = labels.get_indexer_non_unique(objarr) - # 'indexer' has dupes, create 'check' using 'missing' - check = np.zeros_like(objarr) - check[missing] = -1 + (indexer, + missing) = labels.get_indexer_non_unique(objarr) + # 'indexer' has dupes, create 'check' using 'missing' + check = np.zeros_like(objarr) + check[missing] = -1 mask = check == -1 if mask.any(): diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 7f46f437489a1..5b942e2565c29 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1339,6 +1339,27 @@ def is_int(v): return indexer + def _convert_listlike_indexer(self, keyarr, kind=None): + """ + Parameters + ---------- + keyarr : list-like + Indexer to convert. + + Returns + ------- + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys + """ + if isinstance(keyarr, Index): + keyarr = self._convert_index_indexer(keyarr) + else: + keyarr = self._convert_arr_indexer(keyarr) + + indexer = self._convert_list_indexer(keyarr, kind=kind) + return indexer, keyarr + _index_shared_docs['_convert_arr_indexer'] = """ Convert an array-like indexer to the appropriate dtype. @@ -1354,6 +1375,7 @@ def is_int(v): @Appender(_index_shared_docs['_convert_arr_indexer']) def _convert_arr_indexer(self, keyarr): + keyarr = _asarray_tuplesafe(keyarr) return keyarr _index_shared_docs['_convert_index_indexer'] = """ @@ -1373,6 +1395,21 @@ def _convert_arr_indexer(self, keyarr): def _convert_index_indexer(self, keyarr): return keyarr + _index_shared_docs['_convert_list_indexer'] = """ + Convert a list-like indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. + kind : iloc, ix, loc, optional + + Returns + ------- + positional indexer or None + """ + + @Appender(_index_shared_docs['_convert_list_indexer']) def _convert_list_indexer(self, keyarr, kind=None): """ passed a key that is tuplesafe that is integer based diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 3d8f76fc56b01..923dd4ec785c5 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -18,6 +18,8 @@ import pandas.core.base as base import pandas.core.missing as missing import pandas.indexes.base as ibase +from pandas.core.common import _asarray_tuplesafe + _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -458,12 +460,10 @@ def get_indexer_non_unique(self, target): codes = self.categories.get_indexer(target) return self._engine.get_indexer_non_unique(codes) + @Appender(_index_shared_docs['_convert_list_indexer']) def _convert_list_indexer(self, keyarr, kind=None): - """ - we are passed a list indexer. 
- Return our indexer or raise if all of the values are not included in - the categories - """ + # Return our indexer or raise if all of the values are not included in + # the categories codes = self.categories.get_indexer(keyarr) if (codes == -1).any(): raise KeyError("a list-indexer must only include values that are " @@ -471,6 +471,15 @@ def _convert_list_indexer(self, keyarr, kind=None): return None + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + keyarr = _asarray_tuplesafe(keyarr) + return self._shallow_copy(keyarr) + + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + return self._shallow_copy(keyarr) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index bca1db83b6645..1c1609fed1dd1 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1568,6 +1568,39 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): return new_index, indexer + def _convert_listlike_indexer(self, keyarr, kind=None): + """ + Parameters + ---------- + keyarr : list-like + Indexer to convert. + + Returns + ------- + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys + """ + indexer, keyarr = super(MultiIndex, self)._convert_listlike_indexer( + keyarr, kind=kind) + + # are we indexing a specific level + if indexer is None and len(keyarr) and not isinstance(keyarr[0], + tuple): + level = 0 + _, indexer = self.reindex(keyarr, level=level) + + # take all + if indexer is None: + indexer = np.arange(len(self)) + + check = self.levels[0].get_indexer(keyarr) + mask = check == -1 + if mask.any(): + raise KeyError('%s not in index' % keyarr[mask]) + + return indexer, keyarr + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 9bb70feb2501f..2f897c81975c2 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -203,6 +203,7 @@ def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are # also uint64. 
+ keyarr = _asarray_tuplesafe(keyarr) if is_integer_dtype(keyarr): return _asarray_tuplesafe(keyarr, dtype=np.uint64) return keyarr From 632652100921907ece4ddd0fc163714a91af764e Mon Sep 17 00:00:00 2001 From: Jaehoon Hwang Date: Tue, 14 Mar 2017 08:28:05 -0400 Subject: [PATCH 291/338] BUG: upcasting on reshaping ops #13247 Original work done by @jennolsen84, in #13337 closes #13247 Author: Jaehoon Hwang Author: Jae Closes #15594 from jaehoonhwang/Bug13247 and squashes the following commits: 3cd1734 [Jaehoon Hwang] Pass the non-related tests in test_partial and test_reshape 1fa578b [Jaehoon Hwang] Applying request changes removing unnecessary test and renameing 6744636 [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' into Bug13247 5bb72c7 [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' into Bug13247 a1d5d40 [Jaehoon Hwang] Completed pytest 8122359 [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' into Bug13247 0e52b74 [Jaehoon Hwang] Working: Except for pytest 8fec07c [Jaehoon Hwang] Fix: test_concat.py and internals.py 4f6c03e [Jaehoon Hwang] Fix: is_float_dtypes and is_numeric_dtype wrong place d3476c0 [Jaehoon Hwang] Merge branch 'master' into Bug13247 b977615 [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' 4b1e5c6 [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' into Bug13247 45f7ae9 [Jaehoon Hwang] Added pytest function 468baee [Jae] BUG: upcasting on reshaping ops #13247 --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/core/internals.py | 20 ++++++++++++++++---- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/test_internals.py | 2 +- pandas/tests/test_reshape.py | 1 + pandas/tests/tools/test_concat.py | 14 ++++++++++++++ 6 files changed, 35 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8a4f2f47b9853..097efdd097eec 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -886,3 +886,5 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) + +- Concating multiple objects will no longer result in automatically upcast to `float64`, and instead try to find the smallest `dtype` that would suffice (:issue:`13247`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index aa954fbee9a60..1c070b3ed34a9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -21,6 +21,7 @@ is_datetime64tz_dtype, is_object_dtype, is_datetimelike_v_numeric, + is_float_dtype, is_numeric_dtype, is_numeric_v_string_like, is_extension_type, is_list_like, is_re, @@ -4522,6 +4523,8 @@ def _interleaved_dtype(blocks): return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) return lcd + elif have_int and have_float and not have_complex: + return np.dtype('float64') elif have_complex: return np.dtype('c16') else: @@ -4891,6 +4894,8 @@ def get_empty_dtype_and_na(join_units): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name else: upcast_cls = 'float' @@ -4915,8 +4920,6 @@ def get_empty_dtype_and_na(join_units): return np.dtype(np.bool_), None elif 'category' in upcast_classes: return 
np.dtype(np.object_), np.nan - elif 'float' in upcast_classes: - return np.dtype(np.float64), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslib.iNaT @@ -4925,7 +4928,17 @@ def get_empty_dtype_and_na(join_units): elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslib.iNaT else: # pragma - raise AssertionError("invalid dtype determination in get_concat_dtype") + g = np.find_common_type(upcast_classes, []) + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None + else: + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) def concatenate_join_units(join_units, concat_axis, copy): @@ -5190,7 +5203,6 @@ def is_null(self): return True def get_reindexed_values(self, empty_dtype, upcasted_na): - if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index a00f880ff6591..b92ffbfb6fe59 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -210,7 +210,7 @@ def f(): df.loc[3] = [6, 7] exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], - dtype='float64') + dtype='object') tm.assert_frame_equal(df, exp) def test_series_partial_set(self): diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 5ab2bbc4ac6ba..df5e843097514 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -651,7 +651,7 @@ def test_interleave(self): mgr = create_mgr('a: f8; b: i8') self.assertEqual(mgr.as_matrix().dtype, 'f8') mgr = create_mgr('a: f4; b: i8') - self.assertEqual(mgr.as_matrix().dtype, 'f4') + self.assertEqual(mgr.as_matrix().dtype, 'f8') mgr = create_mgr('a: f4; b: i8; d: object') self.assertEqual(mgr.as_matrix().dtype, 'object') mgr = create_mgr('a: bool; b: i8') diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index d587e4ea6a1fa..24e26be15a44b 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -250,6 +250,7 @@ def test_basic_types(self): self.assertEqual(type(r), exp_df_type) r = get_dummies(s_df, sparse=self.sparse, columns=['a']) + exp_blk_type = pd.core.internals.IntBlock self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type) self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type) self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index a2b5773f551c9..a0b22892e74c5 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -13,6 +13,8 @@ makeCustomDataframe as mkdf, assert_almost_equal) +import pytest + class ConcatenateBase(tm.TestCase): @@ -1899,3 +1901,15 @@ def test_concat_multiindex_dfs_with_deepcopy(self): tm.assert_frame_equal(result_copy, expected) result_no_copy = pd.concat(example_dict, names=['testname']) tm.assert_frame_equal(result_no_copy, expected) + + +@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) +@pytest.mark.parametrize('dt', np.sctypes['float']) +def test_concat_no_unnecessary_upcast(dt, pdt): + # GH 13247 + dims = pdt().ndim + dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims))] + x = pd.concat(dfs) + assert x.values.dtype == dt From 
eae80ec4867ef5f38b887bb32bbccbedd76d1d92 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Mar 2017 08:35:38 -0400 Subject: [PATCH 292/338] DOC/TST: clean up docs & tests, xref #15594 BUG: default_fill_value for get_dummies will be 0 --- doc/source/whatsnew/v0.20.0.txt | 37 +++++++++++++++++++++-- pandas/core/internals.py | 6 ++-- pandas/core/reshape.py | 3 +- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/test_reshape.py | 42 +++++++++++++++------------ pandas/tests/tools/test_concat.py | 11 +++++++ 6 files changed, 74 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 097efdd097eec..a509e45b13d9a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -516,6 +516,39 @@ New Behavior: In [5]: df['a']['2011-12-31 23:59:59'] Out[5]: 1 +.. _whatsnew_0200.api_breaking.concat_dtypes: + +Concat of different float dtypes will not automatically upcast +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, ``concat`` of multiple objects with different ``float`` dtypes would automatically upcast results to a dtype of ``float64``. +Now the smallest acceptable dtype will be used (:issue:`13247`) + +.. ipython:: python + + df1 = pd.DataFrame(np.array([1.0], dtype=np.float32, ndmin=2)) + df1.dtypes + +.. ipython:: python + + df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2)) + df2.dtypes + +Previous Behavior: + +.. code-block:: ipython + + In [7]: pd.concat([df1,df2]).dtypes + Out[7]: + 0 float64 + dtype: object + +New Behavior: + +.. ipython:: python + + pd.concat([df1,df2]).dtypes + .. _whatsnew_0200.api_breaking.gbq: Pandas Google BigQuery support has moved @@ -693,6 +726,7 @@ Other API Changes - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`) - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) - ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`) +- ``SparseDataFrame.default_fill_value`` will be 0, previously was ``nan`` in the return from ``pd.get_dummies(..., sparse=True)`` (:issue:`15594`) .. 
_whatsnew_0200.deprecations: @@ -784,7 +818,6 @@ Bug Fixes - Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) - - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) @@ -886,5 +919,3 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) - -- Concating multiple objects will no longer result in automatically upcast to `float64`, and instead try to find the smallest `dtype` that would suffice (:issue:`13247`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1c070b3ed34a9..0e6c176d950a1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4936,9 +4936,9 @@ def get_empty_dtype_and_na(join_units): return np.float64, np.nan else: return g, None - else: - msg = "invalid dtype determination in get_concat_dtype" - raise AssertionError(msg) + + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) def concatenate_join_units(join_units, concat_axis, copy): diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 3279a8f2be39d..1e685ae6895ad 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1308,7 +1308,7 @@ def get_empty_Frame(data, sparse): if not sparse: return DataFrame(index=index) else: - return SparseDataFrame(index=index) + return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: @@ -1357,6 +1357,7 @@ def get_empty_Frame(data, sparse): sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, + default_fill_value=0, dtype=np.uint8) return out diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b92ffbfb6fe59..31fadcc88583c 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -205,7 +205,7 @@ def f(): self.assertRaises(ValueError, f) - # these are coerced to float unavoidably (as its a list-like to begin) + # TODO: #15657, these are left as object and not coerced df = DataFrame(columns=['A', 'B']) df.loc[3] = [6, 7] diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 24e26be15a44b..7ba743a6c425c 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -2,7 +2,6 @@ # pylint: disable-msg=W0612,E1101 from pandas import DataFrame, Series -from pandas.core.sparse import SparseDataFrame import pandas as pd from numpy import nan @@ -234,26 +233,31 @@ def test_basic_types(self): 'b': ['A', 'A', 'B', 'C', 'C'], 'c': [2, 3, 3, 3, 2]}) + expected = DataFrame({'a': [1, 0, 0], + 'b': [0, 1, 0], + 'c': [0, 0, 1]}, + dtype='uint8', + columns=list('abc')) if not self.sparse: - exp_df_type = DataFrame - exp_blk_type = pd.core.internals.IntBlock + compare = tm.assert_frame_equal else: - exp_df_type = SparseDataFrame - exp_blk_type = pd.core.internals.SparseBlock - - self.assertEqual( - type(get_dummies(s_list, sparse=self.sparse)), exp_df_type) - self.assertEqual( - type(get_dummies(s_series, sparse=self.sparse)), 
exp_df_type) - - r = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) - self.assertEqual(type(r), exp_df_type) - - r = get_dummies(s_df, sparse=self.sparse, columns=['a']) - exp_blk_type = pd.core.internals.IntBlock - self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type) - self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type) - self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type) + expected = expected.to_sparse(fill_value=0, kind='integer') + compare = tm.assert_sp_frame_equal + + result = get_dummies(s_list, sparse=self.sparse) + compare(result, expected) + + result = get_dummies(s_series, sparse=self.sparse) + compare(result, expected) + + result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) + tm.assert_series_equal(result.get_dtype_counts(), + Series({'uint8': 8})) + + result = get_dummies(s_df, sparse=self.sparse, columns=['a']) + expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values() + tm.assert_series_equal(result.get_dtype_counts().sort_values(), + expected) def test_just_na(self): just_na_list = [np.nan] diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index a0b22892e74c5..392036a99a297 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1913,3 +1913,14 @@ def test_concat_no_unnecessary_upcast(dt, pdt): pdt(np.array([5], dtype=dt, ndmin=dims))] x = pd.concat(dfs) assert x.values.dtype == dt + + +@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) +@pytest.mark.parametrize('dt', np.sctypes['int']) +def test_concat_will_upcast(dt, pdt): + dims = pdt().ndim + dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims))] + x = pd.concat(dfs) + assert x.values.dtype == 'float64' From 8f410a41c58b9017d905ee8834caba10bd89b77c Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Tue, 14 Mar 2017 10:05:38 -0400 Subject: [PATCH 293/338] BUG: Allow multiple 'by' parameters in merge_asof() when DataFrames are indexed (#15676) closes #15676 Author: Christopher C. Aycock Closes #15679 from chrisaycock/GH15676 and squashes the following commits: 965caf2 [Christopher C. Aycock] Verify that 'by' parameters are the same length 4a2cc09 [Christopher C. 
Aycock] BUG: Allow multiple 'by' parameters in merge_asof() when DataFrames are indexed (#15676)

---
 doc/source/whatsnew/v0.20.0.txt       |  1 +
 pandas/tests/tools/test_merge_asof.py | 35 +++++++++++++++++++++++++++
 pandas/tools/merge.py                 | 25 +++++++++++++------
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index a509e45b13d9a..3548cbf6eb4a7 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -884,6 +884,7 @@ Bug Fixes
 - Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`)
+- Bug in ``pd.merge_asof()`` where ``left_index`` or ``right_index`` caused a failure when multiple ``by`` was specified (:issue:`15676`)
 - Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`)
 - Bug in ``DataFrame.pivot_table()`` where ``dropna=True`` would not drop all-NaN columns when the columns was a ``category`` dtype (:issue:`15193`)

diff --git a/pandas/tests/tools/test_merge_asof.py b/pandas/tests/tools/test_merge_asof.py
index cdff8f0349c15..c9460cc74c94a 100644
--- a/pandas/tests/tools/test_merge_asof.py
+++ b/pandas/tests/tools/test_merge_asof.py
@@ -368,6 +368,41 @@ def test_multiby_heterogeneous_types(self):
                            by=['ticker', 'exch'])
         assert_frame_equal(result, expected)

+    def test_multiby_indexed(self):
+        # GH15676
+        left = pd.DataFrame([
+            [pd.to_datetime('20160602'), 1, 'a'],
+            [pd.to_datetime('20160602'), 2, 'a'],
+            [pd.to_datetime('20160603'), 1, 'b'],
+            [pd.to_datetime('20160603'), 2, 'b']],
+            columns=['time', 'k1', 'k2']).set_index('time')
+
+        right = pd.DataFrame([
+            [pd.to_datetime('20160502'), 1, 'a', 1.0],
+            [pd.to_datetime('20160502'), 2, 'a', 2.0],
+            [pd.to_datetime('20160503'), 1, 'b', 3.0],
+            [pd.to_datetime('20160503'), 2, 'b', 4.0]],
+            columns=['time', 'k1', 'k2', 'value']).set_index('time')
+
+        expected = pd.DataFrame([
+            [pd.to_datetime('20160602'), 1, 'a', 1.0],
+            [pd.to_datetime('20160602'), 2, 'a', 2.0],
+            [pd.to_datetime('20160603'), 1, 'b', 3.0],
+            [pd.to_datetime('20160603'), 2, 'b', 4.0]],
+            columns=['time', 'k1', 'k2', 'value']).set_index('time')
+
+        result = pd.merge_asof(left,
+                               right,
+                               left_index=True,
+                               right_index=True,
+                               by=['k1', 'k2'])
+
+        assert_frame_equal(expected, result)
+
+        with self.assertRaises(MergeError):
+            pd.merge_asof(left, right, left_index=True, right_index=True,
+                          left_by=['k1', 'k2'], right_by=['k1'])
+
     def test_basic2(self):
         expected = self.read_data('asof2.csv')

diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index d02f4c5b26c86..261884bba54bd 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -1165,7 +1165,7 @@ def _validate_specification(self):
         if self.left_by is not None and self.right_by is None:
             raise MergeError('missing right_by')

-        # add by to our key-list so we can have it in the
+        # add 'by' to our key-list so we can have it in the
         # output as a key
         if self.left_by is not None:
             if not is_list_like(self.left_by):
@@ -1173,6 +1173,9 @@
             if not is_list_like(self.right_by):
                 self.right_by = [self.right_by]

+            if len(self.left_by) != len(self.right_by):
+                raise MergeError('left_by and right_by must be same length')
+
             self.left_on = self.left_by + list(self.left_on)
             self.right_on = self.right_by + list(self.right_on)

@@ -1264,13 +1267,21 @@ def flip(xs):

         # a "by" parameter requires special handling
         if self.left_by is not None:
-            if len(self.left_join_keys) > 2:
-                # get tuple
representation of values if more than one
-                left_by_values = flip(self.left_join_keys[0:-1])
-                right_by_values = flip(self.right_join_keys[0:-1])
+            # remove 'on' parameter from values if one existed
+            if self.left_index and self.right_index:
+                left_by_values = self.left_join_keys
+                right_by_values = self.right_join_keys
+            else:
+                left_by_values = self.left_join_keys[0:-1]
+                right_by_values = self.right_join_keys[0:-1]
+
+            # get tuple representation of values if more than one
+            if len(left_by_values) == 1:
+                left_by_values = left_by_values[0]
+                right_by_values = right_by_values[0]
             else:
-                left_by_values = self.left_join_keys[0]
-                right_by_values = self.right_join_keys[0]
+                left_by_values = flip(left_by_values)
+                right_by_values = flip(right_by_values)

             # upcast 'by' parameter because HashTable is limited
             by_type = _get_cython_type_upcast(left_by_values.dtype)

From 883536ade974f2c6544df4bfd14b25144f0d3b63 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 14 Mar 2017 11:06:20 -0400
Subject: [PATCH 294/338] DOC: elevate deprecations / removals to top-level of
 whatsnew doc to promote visibility

---
 doc/source/whatsnew/v0.20.0.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 3548cbf6eb4a7..9c6f5d3e0596d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -731,7 +731,7 @@ Other API Changes
 .. _whatsnew_0200.deprecations:

 Deprecations
-^^^^^^^^^^^^
+~~~~~~~~~~~~

 - ``SparseArray.to_dense()`` has deprecated the ``fill`` parameter, as that parameter was not being respected (:issue:`14647`)
 - ``SparseSeries.to_dense()`` has deprecated the ``sparse_only`` parameter (:issue:`14647`)
@@ -753,7 +753,7 @@ Deprecations
 .. _whatsnew_0200.prior_deprecations:

 Removal of prior version deprecations/changes
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 - The ``pandas.rpy`` module is removed. Similar functionality can be accessed through the `rpy2 <https://rpy2.readthedocs.io/>`__ project.
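To make the merge_asof() fix from PATCH 293 above concrete, here is a minimal sketch adapted from the test_multiby_indexed test that patch adds. The frame contents and column names are illustrative only and are not part of any patch:

    import pandas as pd

    # Both frames are indexed by time and share the grouping columns k1/k2.
    left = pd.DataFrame({'k1': [1, 2], 'k2': ['a', 'a'], 'lv': [1.0, 2.0]},
                        index=pd.to_datetime(['20160602', '20160602']))
    right = pd.DataFrame({'k1': [1, 2], 'k2': ['a', 'a'], 'rv': [10.0, 20.0]},
                         index=pd.to_datetime(['20160502', '20160502']))

    # Before the fix, combining left_index=True/right_index=True with a
    # list-valued `by` failed; now each (k1, k2) group is matched as-of
    # on the DatetimeIndex, so each left row picks up the earlier right value.
    result = pd.merge_asof(left, right,
                           left_index=True, right_index=True,
                           by=['k1', 'k2'])
    print(result)

A mismatched pair such as left_by=['k1', 'k2'] with right_by=['k1'] now raises MergeError, per the length check added in _validate_specification.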
From c556177d51909281b8e7a7bd59f1bf39328edbe0 Mon Sep 17 00:00:00 2001 From: Yimeng Zhang Date: Wed, 15 Mar 2017 09:26:37 -0400 Subject: [PATCH 295/338] compatibility with scipy 0.19 fix #15662 Author: Yimeng Zhang Closes #15689 from zym1010/fix_scipy019 and squashes the following commits: 3cc6528 [Yimeng Zhang] doc and PEP8 9ed7524 [Yimeng Zhang] fix interpolation related issue with scipy 0.19 ca09705 [Yimeng Zhang] get symmetric window --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/window.py | 3 ++- pandas/tests/frame/test_missing.py | 33 ++++++++++++++++++++--------- pandas/tests/series/test_missing.py | 17 +++++++++++++-- pandas/tests/test_window.py | 10 ++++----- 5 files changed, 46 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9c6f5d3e0596d..2a6c8a1e26955 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -816,6 +816,7 @@ Bug Fixes - Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - Bug in ``pd.cut()`` with a single bin on an all 0s array (:issue:`15428`) - Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) +- Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) diff --git a/pandas/core/window.py b/pandas/core/window.py index 6fda60c449f42..9c9f861451309 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -544,7 +544,8 @@ def _pop_args(win_type, arg_names, kwargs): return all_args win_type = _validate_win_type(self.win_type, kwargs) - return sig.get_window(win_type, window).astype(float) + # GH #15662. `False` makes symmetric window, rather than periodic. + return sig.get_window(win_type, window, False).astype(float) def _apply_window(self, mean=True, how=None, **kwargs): """ diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 923ed2e7c3444..93c3ba78a0abf 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -19,6 +19,13 @@ from pandas.tests.frame.common import TestData, _check_mixed_float +try: + import scipy + _is_scipy_ge_0190 = scipy.__version__ >= LooseVersion('0.19.0') +except: + _is_scipy_ge_0190 = False + + def _skip_if_no_pchip(): try: from scipy.interpolate import pchip_interpolate # noqa @@ -548,7 +555,7 @@ def test_interp_nan_idx(self): df.interpolate(method='values') def test_interp_various(self): - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], 'C': [1, 2, 3, 5, 8, 13, 21]}) @@ -561,8 +568,15 @@ def test_interp_various(self): assert_frame_equal(result, expected) result = df.interpolate(method='cubic') - expected.A.loc[3] = 2.81621174 - expected.A.loc[13] = 5.64146581 + # GH #15662. + # new cubic and quadratic interpolation algorithms from scipy 0.19.0. + # previously `splmake` was used. 
See scipy/scipy#6710 + if _is_scipy_ge_0190: + expected.A.loc[3] = 2.81547781 + expected.A.loc[13] = 5.52964175 + else: + expected.A.loc[3] = 2.81621174 + expected.A.loc[13] = 5.64146581 assert_frame_equal(result, expected) result = df.interpolate(method='nearest') @@ -571,8 +585,12 @@ def test_interp_various(self): assert_frame_equal(result, expected, check_dtype=False) result = df.interpolate(method='quadratic') - expected.A.loc[3] = 2.82533638 - expected.A.loc[13] = 6.02817974 + if _is_scipy_ge_0190: + expected.A.loc[3] = 2.82150771 + expected.A.loc[13] = 6.12648668 + else: + expected.A.loc[3] = 2.82533638 + expected.A.loc[13] = 6.02817974 assert_frame_equal(result, expected) result = df.interpolate(method='slinear') @@ -585,11 +603,6 @@ def test_interp_various(self): expected.A.loc[13] = 5 assert_frame_equal(result, expected, check_dtype=False) - result = df.interpolate(method='quadratic') - expected.A.loc[3] = 2.82533638 - expected.A.loc[13] = 6.02817974 - assert_frame_equal(result, expected) - def test_interp_alt_scipy(self): tm._skip_if_no_scipy() df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 9e997da517bf6..7174283494fe7 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -4,6 +4,7 @@ import pytz from datetime import timedelta, datetime +from distutils.version import LooseVersion from numpy import nan import numpy as np import pandas as pd @@ -17,6 +18,12 @@ from .common import TestData +try: + import scipy + _is_scipy_ge_0190 = scipy.__version__ >= LooseVersion('0.19.0') +except: + _is_scipy_ge_0190 = False + def _skip_if_no_pchip(): try: @@ -827,7 +834,7 @@ def test_interp_quad(self): assert_series_equal(result, expected) def test_interp_scipy_basic(self): - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() s = Series([1, 3, np.nan, 12, np.nan, 25]) # slinear @@ -852,7 +859,13 @@ def test_interp_scipy_basic(self): result = s.interpolate(method='zero', downcast='infer') assert_series_equal(result, expected) # quadratic - expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) + # GH #15662. + # new cubic and quadratic interpolation algorithms from scipy 0.19.0. + # previously `splmake` was used. 
See scipy/scipy#6710 + if _is_scipy_ge_0190: + expected = Series([1, 3., 6.823529, 12., 18.058824, 25.]) + else: + expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) result = s.interpolate(method='quadratic') assert_series_equal(result, expected) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index b7164d31b2a5e..3f2973a9834ca 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -905,7 +905,7 @@ def test_cmov_window_na_min_periods(self): def test_cmov_window_regular(self): # GH 8238 - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] @@ -938,7 +938,7 @@ def test_cmov_window_regular(self): def test_cmov_window_regular_linear_range(self): # GH 8238 - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] @@ -955,7 +955,7 @@ def test_cmov_window_regular_linear_range(self): def test_cmov_window_regular_missing_data(self): # GH 8238 - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] @@ -988,7 +988,7 @@ def test_cmov_window_regular_missing_data(self): def test_cmov_window_special(self): # GH 8238 - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., @@ -1015,7 +1015,7 @@ def test_cmov_window_special(self): def test_cmov_window_special_linear_range(self): # GH 8238 - tm.skip_if_no_package('scipy', max_version='0.19.0') + tm._skip_if_no_scipy() win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., From c3efe0ebca3df77f74068926cf9ee105d0d00321 Mon Sep 17 00:00:00 2001 From: John Zwinck Date: Wed, 15 Mar 2017 12:04:12 -0400 Subject: [PATCH 296/338] ENH: use constant f32 eps, not np.finfo() during import NumPy docs for np.finfo() say not to call it during import (at module scope). It's a relatively expensive call, and it modifies the GIL state. Now we just hard-code it, because it is always the value anyway. This avoids touching the GIL at import, which helps avoid deadlocks in practice. 
closes #14641 Author: John Zwinck Closes #15691 from jzwinck/patch-1 and squashes the following commits: dadb97c [John Zwinck] DOC: mention #14641 in 0.20.0 whatsnew e565230 [John Zwinck] ENH: use constant f32 eps, not np.finfo() during import --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/indexing.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2a6c8a1e26955..41b6519eb740f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -905,6 +905,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) +- Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`) - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 19b7771251da3..c80e8c34aa88f 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1852,7 +1852,7 @@ def _convert_key(self, key, is_setter=False): # 32-bit floating point machine epsilon -_eps = np.finfo('f4').eps +_eps = 1.1920929e-07 def length_of_indexer(indexer, target=None): From cc7a9994ad08ab39f58c62fed933c1fb8ff31529 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Mar 2017 19:15:36 -0400 Subject: [PATCH 297/338] TST: reorg tests_multilevel.py tests --- pandas/tests/test_multilevel.py | 813 ++++++++++++++++---------------- 1 file changed, 411 insertions(+), 402 deletions(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d1b7fdadce6ae..d7b115d808312 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -12,18 +12,15 @@ from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp from pandas.types.common import is_float_dtype, is_integer_dtype -from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assertRaisesRegexp) import pandas.core.common as com import pandas.util.testing as tm from pandas.compat import (range, lrange, StringIO, lzip, u, product as cart_product, zip) import pandas as pd - import pandas._libs.index as _index -class TestMultiLevel(tm.TestCase): +class Base(object): def setUp(self): @@ -58,6 +55,9 @@ def setUp(self): inplace=True) self.ymd.index.set_names(['year', 'month', 'day'], inplace=True) + +class TestMultiLevel(Base, tm.TestCase): + def test_append(self): a, b = self.frame[:5], self.frame[5:] @@ -87,19 +87,19 @@ def test_append_index(self): (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz)), (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz))] expected = Index([1.1, 1.2, 1.3] + expected_tuples) - self.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) result = midx_lv2.append(idx1) expected = Index(expected_tuples + [1.1, 1.2, 1.3]) - self.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv2) expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) - self.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv3) - self.assert_index_equal(result, expected) + 
tm.assert_index_equal(result, expected) result = midx_lv3.append(midx_lv2) expected = Index._simple_new( @@ -107,7 +107,7 @@ def test_append_index(self): (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz), 'B'), (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz), 'C')] + expected_tuples), None) - self.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) def test_dataframe_constructor(self): multi = DataFrame(np.random.randn(4, 4), @@ -139,18 +139,18 @@ def test_reindex_level(self): result = month_sums.reindex(self.ymd.index, level=1) expected = self.ymd.groupby(level='month').transform(np.sum) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Series result = month_sums['A'].reindex(self.ymd.index, level=1) expected = self.ymd['A'].groupby(level='month').transform(np.sum) - assert_series_equal(result, expected, check_names=False) + tm.assert_series_equal(result, expected, check_names=False) # axis=1 month_sums = self.ymd.T.sum(axis=1, level='month') result = month_sums.reindex(columns=self.ymd.index, level=1) expected = self.ymd.groupby(level='month').transform(np.sum).T - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_binops_level(self): def _check_op(opname): @@ -160,7 +160,7 @@ def _check_op(opname): broadcasted = self.ymd.groupby(level='month').transform(np.sum) expected = op(self.ymd, broadcasted) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Series op = getattr(Series, opname) @@ -169,7 +169,7 @@ def _check_op(opname): np.sum) expected = op(self.ymd['A'], broadcasted) expected.name = 'A' - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) _check_op('sub') _check_op('add') @@ -179,7 +179,7 @@ def _check_op(opname): def test_pickle(self): def _test_roundtrip(frame): unpickled = self.round_trip_pickle(frame) - assert_frame_equal(frame, unpickled) + tm.assert_frame_equal(frame, unpickled) _test_roundtrip(self.frame) _test_roundtrip(self.frame.T) @@ -189,11 +189,11 @@ def _test_roundtrip(frame): def test_reindex(self): expected = self.frame.iloc[[0, 3]] reindexed = self.frame.loc[[('foo', 'one'), ('bar', 'one')]] - assert_frame_equal(reindexed, expected) + tm.assert_frame_equal(reindexed, expected) with catch_warnings(record=True): reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] - assert_frame_equal(reindexed, expected) + tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] @@ -214,50 +214,6 @@ def test_reindex_preserve_levels(self): chunk = ymdT.loc[:, new_index] self.assertIs(chunk.columns, new_index) - def test_sort_index_preserve_levels(self): - result = self.frame.sort_index() - self.assertEqual(result.index.names, self.frame.index.names) - - def test_sorting_repr_8017(self): - - np.random.seed(0) - data = np.random.randn(3, 4) - - for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4), - ([Timestamp('20130101'), Timestamp('20130103'), - Timestamp('20130102'), Timestamp('20130105')], - Timestamp('20130104')), - (['1one', '3one', '2one', '5one'], '4one')]: - columns = MultiIndex.from_tuples([('red', i) for i in gen]) - df = DataFrame(data, index=list('def'), columns=columns) - df2 = pd.concat([df, - DataFrame('world', index=list('def'), - columns=MultiIndex.from_tuples( - [('red', extra)]))], axis=1) - - # check that the repr is good - # make sure that we have a correct sparsified repr - # e.g. 
only 1 header of read - self.assertEqual(str(df2).splitlines()[0].split(), ['red']) - - # GH 8017 - # sorting fails after columns added - - # construct single-dtype then sort - result = df.copy().sort_index(axis=1) - expected = df.iloc[:, [0, 2, 1, 3]] - assert_frame_equal(result, expected) - - result = df2.sort_index(axis=1) - expected = df2.iloc[:, [0, 2, 1, 4, 3]] - assert_frame_equal(result, expected) - - # setitem then sort - result = df.copy() - result[('red', extra)] = 'world' - result = result.sort_index(axis=1) - assert_frame_equal(result, expected) - def test_repr_to_string(self): repr(self.frame) repr(self.ymd) @@ -283,9 +239,11 @@ def test_getitem_simple(self): df = self.frame.T col = df['foo', 'one'] - assert_almost_equal(col.values, df.values[:, 0]) - self.assertRaises(KeyError, df.__getitem__, ('foo', 'four')) - self.assertRaises(KeyError, df.__getitem__, 'foobar') + tm.assert_almost_equal(col.values, df.values[:, 0]) + with pytest.raises(KeyError): + df[('foo', 'four')] + with pytest.raises(KeyError): + df['foobar'] def test_series_getitem(self): s = self.ymd['A'] @@ -297,7 +255,7 @@ def test_series_getitem(self): expected = s.reindex(s.index[42:65]) expected.index = expected.index.droplevel(0).droplevel(0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s[2000, 3, 10] expected = s[49] @@ -306,11 +264,11 @@ def test_series_getitem(self): # fancy expected = s.reindex(s.index[49:51]) result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) with catch_warnings(record=True): result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # key error self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) @@ -325,7 +283,7 @@ def test_series_getitem_corner(self): # generator result = s[(x > 0 for x in s)] expected = s[s > 0] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_series_setitem(self): s = self.ymd['A'] @@ -347,29 +305,29 @@ def test_frame_getitem_setitem_boolean(self): result = df[df > 0] expected = df.where(df > 0) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df[df > 0] = 5 values[values > 0] = 5 - assert_almost_equal(df.values, values) + tm.assert_almost_equal(df.values, values) df[df == 5] = 0 values[values == 5] = 0 - assert_almost_equal(df.values, values) + tm.assert_almost_equal(df.values, values) # a df that needs alignment first df[df[:-1] < 0] = 2 np.putmask(values[:-1], values[:-1] < 0, 2) - assert_almost_equal(df.values, values) + tm.assert_almost_equal(df.values, values) - with assertRaisesRegexp(TypeError, 'boolean values only'): + with tm.assertRaisesRegexp(TypeError, 'boolean values only'): df[df * 0] = 2 def test_frame_getitem_setitem_slice(self): # getitem result = self.frame.iloc[:4] expected = self.frame[:4] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # setitem cp = self.frame.copy() @@ -385,25 +343,25 @@ def test_frame_getitem_setitem_multislice(self): df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) result = df.loc[:, 'value'] - assert_series_equal(df['value'], result) + tm.assert_series_equal(df['value'], result) with catch_warnings(record=True): result = df.ix[:, 'value'] - assert_series_equal(df['value'], result) + tm.assert_series_equal(df['value'], result) result = df.loc[df.index[1:3], 'value'] - assert_series_equal(df['value'][1:3], result) + 
tm.assert_series_equal(df['value'][1:3], result) result = df.loc[:, :] - assert_frame_equal(df, result) + tm.assert_frame_equal(df, result) result = df df.loc[:, 'value'] = 10 result['value'] = 10 - assert_frame_equal(df, result) + tm.assert_frame_equal(df, result) df.loc[:, :] = 10 - assert_frame_equal(df, result) + tm.assert_frame_equal(df, result) def test_frame_getitem_multicolumn_empty_level(self): f = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']}) @@ -413,7 +371,7 @@ def test_frame_getitem_multicolumn_empty_level(self): result = f['level1 item1'] expected = DataFrame([['1'], ['2'], ['3']], index=f.index, columns=['level3 item1']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_frame_setitem_multi_column(self): df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], @@ -421,12 +379,12 @@ def test_frame_setitem_multi_column(self): cp = df.copy() cp['a'] = cp['b'] - assert_frame_equal(cp['a'], cp['b']) + tm.assert_frame_equal(cp['a'], cp['b']) # set with ndarray cp = df.copy() cp['a'] = cp['b'].values - assert_frame_equal(cp['a'], cp['b']) + tm.assert_frame_equal(cp['a'], cp['b']) # --------------------------------------- # #1803 @@ -444,8 +402,8 @@ def test_frame_setitem_multi_column(self): sliced_a1 = df['A', '1'] sliced_a2 = df['A', '2'] sliced_b1 = df['B', '1'] - assert_series_equal(sliced_a1, sliced_b1, check_names=False) - assert_series_equal(sliced_a2, sliced_b1, check_names=False) + tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) + tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) self.assertEqual(sliced_a1.name, ('A', '1')) self.assertEqual(sliced_a2.name, ('A', '2')) self.assertEqual(sliced_b1.name, ('B', '1')) @@ -465,9 +423,9 @@ def test_getitem_tuple_plus_slice(self): with catch_warnings(record=True): expected3 = idf.ix[0, 0] - assert_series_equal(result, expected) - assert_series_equal(result, expected2) - assert_series_equal(result, expected3) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected2) + tm.assert_series_equal(result, expected3) def test_getitem_setitem_tuple_plus_columns(self): # GH #1013 @@ -476,26 +434,14 @@ def test_getitem_setitem_tuple_plus_columns(self): result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] expected = df.loc[2000, 1, 6][['A', 'B', 'C']] - assert_series_equal(result, expected) - - def test_getitem_multilevel_index_tuple_unsorted(self): - index_columns = list("abc") - df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], - columns=index_columns + ["data"]) - df = df.set_index(index_columns) - query_index = df.index[:1] - rs = df.loc[query_index, "data"] - - xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) - xp = Series(['x'], index=xp_idx, name='data') - assert_series_equal(rs, xp) + tm.assert_series_equal(result, expected) def test_xs(self): xs = self.frame.xs(('bar', 'two')) xs2 = self.frame.loc[('bar', 'two')] - assert_series_equal(xs, xs2) - assert_almost_equal(xs.values, self.frame.values[4]) + tm.assert_series_equal(xs, xs2) + tm.assert_almost_equal(xs.values, self.frame.values[4]) # GH 6574 # missing values in returned index should be preserrved @@ -514,18 +460,18 @@ def test_xs(self): ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) result = df.xs('z', level='a1') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_xs_partial(self): result = self.frame.xs('foo') result2 = self.frame.loc['foo'] expected = self.frame.T['foo'].T - assert_frame_equal(result, expected) - 
assert_frame_equal(result, result2) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) result = self.ymd.xs((2000, 4)) expected = self.ymd.loc[2000, 4] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # ex from #1796 index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], @@ -537,14 +483,14 @@ def test_xs_partial(self): result = df.xs(['foo', 'one']) expected = df.loc['foo', 'one'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_xs_level(self): result = self.frame.xs('two', level='second') expected = self.frame[self.frame.index.get_level_values(1) == 'two'] expected.index = expected.index.droplevel(1) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), ( 'p', 'q', 'r')]) @@ -552,7 +498,7 @@ def test_xs_level(self): result = df.xs('c', level=2) expected = df[1:2] expected.index = expected.index.droplevel(2) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # this is a copy in 0.14 result = self.frame.xs('two', level='second') @@ -576,7 +522,7 @@ def test_xs_level_multiple(self): result = df.xs(('a', 4), level=['one', 'four']) expected = df.xs('a').xs(4, level='four') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # this is a copy in 0.14 result = df.xs(('a', 4), level=['one', 'four']) @@ -597,7 +543,7 @@ def f(x): rs = df.xs(20111201, level='date') xp = df.loc[20111201, :] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) def test_xs_level0(self): from pandas import read_table @@ -612,18 +558,18 @@ def test_xs_level0(self): result = df.xs('a', level=0) expected = df.xs('a') self.assertEqual(len(result), 2) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_xs_level_series(self): s = self.frame['A'] result = s[:, 'two'] expected = self.frame.xs('two', level=1)['A'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s = self.ymd['A'] result = s[2000, 5] expected = self.ymd.loc[2000, 5]['A'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # not implementing this for now @@ -633,7 +579,7 @@ def test_xs_level_series(self): # lv =s.index.get_level_values(1) # expected = s[(lv == 3) | (lv == 4)] # expected.index = expected.index.droplevel(0) - # assert_series_equal(result, expected) + # tm.assert_series_equal(result, expected) # can do this though @@ -649,15 +595,15 @@ def test_getitem_toplevel(self): result = df['foo'] expected = df.reindex(columns=df.columns[:3]) expected.columns = expected.columns.droplevel(0) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df['bar'] result2 = df.loc[:, 'bar'] expected = df.reindex(columns=df.columns[3:5]) expected.columns = expected.columns.droplevel(0) - assert_frame_equal(result, expected) - assert_frame_equal(result, result2) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) def test_getitem_setitem_slice_integers(self): index = MultiIndex(levels=[[0, 1, 2], [0, 2]], @@ -667,7 +613,7 @@ def test_getitem_setitem_slice_integers(self): columns=['a', 'b', 'c', 'd']) res = frame.loc[1:2] exp = frame.reindex(frame.index[2:]) - assert_frame_equal(res, exp) + tm.assert_frame_equal(res, exp) frame.loc[1:2] = 7 self.assertTrue((frame.loc[1:2] == 7).values.all()) @@ -676,7 +622,7 @@ def 
test_getitem_setitem_slice_integers(self): res = series.loc[1:2] exp = series.reindex(series.index[2:]) - assert_series_equal(res, exp) + tm.assert_series_equal(res, exp) series.loc[1:2] = 7 self.assertTrue((series.loc[1:2] == 7).values.all()) @@ -691,7 +637,7 @@ def test_getitem_int(self): result = frame.loc[1] expected = frame[-3:] expected.index = expected.index.droplevel(0) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # raises exception self.assertRaises(KeyError, frame.loc.__getitem__, 3) @@ -699,7 +645,7 @@ def test_getitem_int(self): # however this will work result = self.frame.iloc[2] expected = self.frame.xs(self.frame.index[2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_getitem_partial(self): ymd = self.ymd.T @@ -707,25 +653,17 @@ def test_getitem_partial(self): expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) expected.columns = expected.columns.droplevel(0).droplevel(0) - assert_frame_equal(result, expected) - - def test_getitem_slice_not_sorted(self): - df = self.frame.sort_index(level=1).T - - # buglet with int typechecking - result = df.iloc[:, :np.int32(3)] - expected = df.reindex(columns=df.columns[:3]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_setitem_change_dtype(self): dft = self.frame.T s = dft['foo', 'two'] dft['foo', 'two'] = s > s.median() - assert_series_equal(dft['foo', 'two'], s > s.median()) + tm.assert_series_equal(dft['foo', 'two'], s > s.median()) # tm.assertIsInstance(dft._data.blocks[1].items, MultiIndex) reindexed = dft.reindex(columns=[('foo', 'two')]) - assert_series_equal(reindexed['foo', 'two'], s > s.median()) + tm.assert_series_equal(reindexed['foo', 'two'], s > s.median()) def test_frame_setitem_ix(self): self.frame.loc[('bar', 'two'), 'B'] = 5 @@ -746,12 +684,12 @@ def test_frame_setitem_ix(self): def test_fancy_slice_partial(self): result = self.frame.loc['bar':'baz'] expected = self.frame[3:7] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.ymd.loc[(2000, 2):(2000, 4)] lev = self.ymd.index.labels[1] expected = self.ymd[(lev >= 1) & (lev <= 3)] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_getitem_partial_column_select(self): idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], @@ -760,55 +698,19 @@ def test_getitem_partial_column_select(self): result = df.loc[('a', 'y'), :] expected = df.loc[('a', 'y')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[('a', 'y'), [1, 0]] expected = df.loc[('a', 'y')][[1, 0]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) with catch_warnings(record=True): result = df.ix[('a', 'y'), [1, 0]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) self.assertRaises(KeyError, df.loc.__getitem__, (('a', 'foo'), slice(None, None))) - def test_sort_index_level(self): - df = self.frame.copy() - df.index = np.arange(len(df)) - - # axis=1 - - # series - a_sorted = self.frame['A'].sort_index(level=0) - - # preserve names - self.assertEqual(a_sorted.index.names, self.frame.index.names) - - # inplace - rs = self.frame.copy() - rs.sort_index(level=0, inplace=True) - assert_frame_equal(rs, self.frame.sort_index(level=0)) - - def test_sort_index_level_large_cardinality(self): - - # #2684 (int64) - index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = 
DataFrame(np.random.randn(4000), index=index, dtype=np.int64) - - # it works! - result = df.sort_index(level=0) - self.assertTrue(result.index.lexsort_depth == 3) - - # #2684 (int32) - index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) - - # it works! - result = df.sort_index(level=0) - self.assertTrue((result.dtypes.values == df.dtypes.values).all()) - self.assertTrue(result.index.lexsort_depth == 3) - def test_delevel_infer_dtype(self): tuples = [tuple for tuple in cart_product( @@ -832,28 +734,6 @@ def test_reset_index_with_drop(self): deleveled = self.series.reset_index(drop=True) tm.assertIsInstance(deleveled, Series) - def test_sort_index_level_by_name(self): - self.frame.index.names = ['first', 'second'] - result = self.frame.sort_index(level='second') - expected = self.frame.sort_index(level=1) - assert_frame_equal(result, expected) - - def test_sort_index_level_mixed(self): - sorted_before = self.frame.sort_index(level=1) - - df = self.frame.copy() - df['foo'] = 'bar' - sorted_after = df.sort_index(level=1) - assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) - - dft = self.frame.T - sorted_before = dft.sort_index(level=1, axis=1) - dft['foo', 'three'] = 'bar' - - sorted_after = dft.sort_index(level=1, axis=1) - assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), - sorted_after.drop([('foo', 'three')], axis=1)) - def test_count_level(self): def _check_counts(frame, axis=0): index = frame._get_axis(axis) @@ -861,7 +741,7 @@ def _check_counts(frame, axis=0): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count() expected = expected.reindex_like(result).astype('i8') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) self.frame.iloc[1, [1, 2]] = np.nan self.frame.iloc[7, [0, 1]] = np.nan @@ -875,7 +755,7 @@ def _check_counts(frame, axis=0): # can't call with level on regular DataFrame df = tm.makeTimeDataFrame() - assertRaisesRegexp(TypeError, 'hierarchical', df.count, level=0) + tm.assertRaisesRegexp(TypeError, 'hierarchical', df.count, level=0) self.frame['D'] = 'foo' result = self.frame.count(level=0, numeric_only=True) @@ -891,30 +771,30 @@ def test_count_level_series(self): result = s.count(level=0) expected = s.groupby(level=0).count() - assert_series_equal(result.astype('f8'), - expected.reindex(result.index).fillna(0)) + tm.assert_series_equal( + result.astype('f8'), expected.reindex(result.index).fillna(0)) result = s.count(level=1) expected = s.groupby(level=1).count() - assert_series_equal(result.astype('f8'), - expected.reindex(result.index).fillna(0)) + tm.assert_series_equal( + result.astype('f8'), expected.reindex(result.index).fillna(0)) def test_count_level_corner(self): s = self.frame['A'][:0] result = s.count(level=0) expected = Series(0, index=s.index.levels[0], name='A') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = self.frame[:0] result = df.count(level=0) expected = DataFrame({}, index=s.index.levels[0], columns=df.columns).fillna(0).astype(np.int64) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_get_level_number_out_of_bounds(self): - with assertRaisesRegexp(IndexError, "Too many levels"): + with tm.assertRaisesRegexp(IndexError, "Too many levels"): self.frame.index._get_level_number(2) - with assertRaisesRegexp(IndexError, "not a valid level number"): + with tm.assertRaisesRegexp(IndexError, "not a valid 
level number"): self.frame.index._get_level_number(-3) def test_unstack(self): @@ -936,56 +816,56 @@ def test_unstack_multiple_no_empty_columns(self): unstacked = s.unstack([1, 2]) expected = unstacked.dropna(axis=1, how='all') - assert_frame_equal(unstacked, expected) + tm.assert_frame_equal(unstacked, expected) def test_stack(self): # regular roundtrip unstacked = self.ymd.unstack() restacked = unstacked.stack() - assert_frame_equal(restacked, self.ymd) + tm.assert_frame_equal(restacked, self.ymd) unlexsorted = self.ymd.sort_index(level=2) unstacked = unlexsorted.unstack(2) restacked = unstacked.stack() - assert_frame_equal(restacked.sort_index(level=0), self.ymd) + tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd) unlexsorted = unlexsorted[::-1] unstacked = unlexsorted.unstack(1) restacked = unstacked.stack().swaplevel(1, 2) - assert_frame_equal(restacked.sort_index(level=0), self.ymd) + tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd) unlexsorted = unlexsorted.swaplevel(0, 1) unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) restacked = unstacked.stack(0).swaplevel(1, 2) - assert_frame_equal(restacked.sort_index(level=0), self.ymd) + tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd) # columns unsorted unstacked = self.ymd.unstack() unstacked = unstacked.sort_index(axis=1, ascending=False) restacked = unstacked.stack() - assert_frame_equal(restacked, self.ymd) + tm.assert_frame_equal(restacked, self.ymd) # more than 2 levels in the columns unstacked = self.ymd.unstack(1).unstack(1) result = unstacked.stack(1) expected = self.ymd.unstack() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = unstacked.stack(2) expected = self.ymd.unstack(1) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = unstacked.stack(0) expected = self.ymd.stack().unstack(1).unstack(1) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # not all levels present in each echelon unstacked = self.ymd.unstack(2).loc[:, ::3] stacked = unstacked.stack().stack() ymd_stacked = self.ymd.stack() - assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) + tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) # stack with negative number result = self.ymd.unstack(0).stack(-2) @@ -993,7 +873,7 @@ def test_stack(self): # GH10417 def check(left, right): - assert_series_equal(left, right) + tm.assert_series_equal(left, right) self.assertFalse(left.index.is_unique) li, ri = left.index, right.index tm.assert_index_equal(li, ri) @@ -1049,7 +929,7 @@ def test_unstack_odd_failure(self): result = df.unstack(2) recons = result.stack() - assert_frame_equal(recons, df) + tm.assert_frame_equal(recons, df) def test_stack_mixed_dtype(self): df = self.frame.T @@ -1058,7 +938,7 @@ def test_stack_mixed_dtype(self): stacked = df.stack() result = df['foo'].stack() - assert_series_equal(stacked['foo'], result, check_names=False) + tm.assert_series_equal(stacked['foo'], result, check_names=False) self.assertIs(result.name, None) self.assertEqual(stacked['bar'].dtype, np.float_) @@ -1074,8 +954,8 @@ def test_unstack_bug(self): unstacked = result.unstack() restacked = unstacked.stack() - assert_series_equal(restacked, - result.reindex(restacked.index).astype(float)) + tm.assert_series_equal( + restacked, result.reindex(restacked.index).astype(float)) def test_stack_unstack_preserve_names(self): unstacked = self.frame.unstack() @@ -1088,59 +968,59 @@ def 
test_stack_unstack_preserve_names(self): def test_unstack_level_name(self): result = self.frame.unstack('second') expected = self.frame.unstack(level=1) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_stack_level_name(self): unstacked = self.frame.unstack('second') result = unstacked.stack('exp') expected = self.frame.unstack().stack(0) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.frame.stack('exp') expected = self.frame.stack() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_stack_unstack_multiple(self): unstacked = self.ymd.unstack(['year', 'month']) expected = self.ymd.unstack('year').unstack('month') - assert_frame_equal(unstacked, expected) + tm.assert_frame_equal(unstacked, expected) self.assertEqual(unstacked.columns.names, expected.columns.names) # series s = self.ymd['A'] s_unstacked = s.unstack(['year', 'month']) - assert_frame_equal(s_unstacked, expected['A']) + tm.assert_frame_equal(s_unstacked, expected['A']) restacked = unstacked.stack(['year', 'month']) restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) restacked = restacked.sort_index(level=0) - assert_frame_equal(restacked, self.ymd) + tm.assert_frame_equal(restacked, self.ymd) self.assertEqual(restacked.index.names, self.ymd.index.names) # GH #451 unstacked = self.ymd.unstack([1, 2]) expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all') - assert_frame_equal(unstacked, expected) + tm.assert_frame_equal(unstacked, expected) unstacked = self.ymd.unstack([2, 1]) expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') - assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) + tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) def test_stack_names_and_numbers(self): unstacked = self.ymd.unstack(['year', 'month']) # Can't use mixture of names and numbers to stack - with assertRaisesRegexp(ValueError, "level should contain"): + with tm.assertRaisesRegexp(ValueError, "level should contain"): unstacked.stack([0, 'month']) def test_stack_multiple_out_of_bounds(self): # nlevels == 3 unstacked = self.ymd.unstack(['year', 'month']) - with assertRaisesRegexp(IndexError, "Too many levels"): + with tm.assertRaisesRegexp(IndexError, "Too many levels"): unstacked.stack([2, 3]) - with assertRaisesRegexp(IndexError, "not a valid level number"): + with tm.assertRaisesRegexp(IndexError, "not a valid level number"): unstacked.stack([-4, -3]) def test_unstack_period_series(self): @@ -1163,9 +1043,9 @@ def test_unstack_period_series(self): columns=['A', 'B']) expected.columns.name = 'str' - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected.T) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected.T) idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', '2013-03', '2013-03'], freq='M', name='period1') @@ -1189,9 +1069,9 @@ def test_unstack_period_series(self): [6, 5, np.nan, np.nan, np.nan, np.nan]], index=e_idx, columns=e_cols) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected.T) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected.T) def test_unstack_period_frame(self): # GH 4342 @@ -1216,8 +1096,8 @@ def test_unstack_period_frame(self): expected = DataFrame([[5, 
1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) e_1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-01', '2014-02'], freq='M', name='period1') @@ -1227,7 +1107,7 @@ def test_unstack_period_frame(self): expected = DataFrame([[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols) - assert_frame_equal(result3, expected) + tm.assert_frame_equal(result3, expected) def test_stack_multiple_bug(self): """ bug when some uniques are not present in the data #3170""" @@ -1245,7 +1125,7 @@ def test_stack_multiple_bug(self): rs = down.stack('ID') xp = unst.loc[:, ['VAR1']].resample('W-THU').mean().stack('ID') xp.columns.name = 'Params' - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) def test_stack_dropna(self): # GH #3997 @@ -1256,7 +1136,7 @@ def test_stack_dropna(self): self.assertTrue(len(stacked) > len(stacked.dropna())) stacked = df.unstack().stack(dropna=True) - assert_frame_equal(stacked, stacked.dropna()) + tm.assert_frame_equal(stacked, stacked.dropna()) def test_unstack_multiple_hierarchical(self): df = DataFrame(index=[[0, 0, 0, 0, 1, 1, 1, 1], @@ -1279,7 +1159,7 @@ def test_groupby_transform(self): applied = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) result = applied.reindex(expected.index) - assert_series_equal(result, expected, check_names=False) + tm.assert_series_equal(result, expected, check_names=False) def test_unstack_sparse_keyspace(self): # memory problems with naive impl #2278 @@ -1311,7 +1191,7 @@ def test_unstack_unobserved_keys(self): self.assertEqual(len(result.columns), 4) recons = result.stack() - assert_frame_equal(recons, df) + tm.assert_frame_equal(recons, df) def test_groupby_corner(self): midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], @@ -1344,8 +1224,8 @@ def test_join(self): self.assertFalse(np.isnan(joined.values).all()) - assert_frame_equal(joined, expected, check_names=False - ) # TODO what should join do with names ? + # TODO what should join do with names ? 
+ tm.assert_frame_equal(joined, expected, check_names=False) def test_swaplevel(self): swapped = self.frame['A'].swaplevel() @@ -1353,23 +1233,23 @@ def test_swaplevel(self): swapped3 = self.frame['A'].swaplevel(0, 1) swapped4 = self.frame['A'].swaplevel('first', 'second') self.assertFalse(swapped.index.equals(self.frame.index)) - assert_series_equal(swapped, swapped2) - assert_series_equal(swapped, swapped3) - assert_series_equal(swapped, swapped4) + tm.assert_series_equal(swapped, swapped2) + tm.assert_series_equal(swapped, swapped3) + tm.assert_series_equal(swapped, swapped4) back = swapped.swaplevel() back2 = swapped.swaplevel(0) back3 = swapped.swaplevel(0, 1) back4 = swapped.swaplevel('second', 'first') self.assertTrue(back.index.equals(self.frame.index)) - assert_series_equal(back, back2) - assert_series_equal(back, back3) - assert_series_equal(back, back4) + tm.assert_series_equal(back, back2) + tm.assert_series_equal(back, back3) + tm.assert_series_equal(back, back4) ft = self.frame.T swapped = ft.swaplevel('first', 'second', axis=1) exp = self.frame.swaplevel('first', 'second').T - assert_frame_equal(swapped, exp) + tm.assert_frame_equal(swapped, exp) def test_swaplevel_panel(self): panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2}) @@ -1384,20 +1264,20 @@ def test_swaplevel_panel(self): def test_reorder_levels(self): result = self.ymd.reorder_levels(['month', 'day', 'year']) expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.ymd['A'].reorder_levels(['month', 'day', 'year']) expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1) expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) - with assertRaisesRegexp(TypeError, 'hierarchical axis'): + with tm.assertRaisesRegexp(TypeError, 'hierarchical axis'): self.ymd.reorder_levels([1, 2], axis=1) - with assertRaisesRegexp(IndexError, 'Too many levels'): + with tm.assertRaisesRegexp(IndexError, 'Too many levels'): self.ymd.index.reorder_levels([1, 2, 3]) def test_insert_index(self): @@ -1416,29 +1296,13 @@ def test_alignment(self): res = x - y exp_index = x.index.union(y.index) exp = x.reindex(exp_index) - y.reindex(exp_index) - assert_series_equal(res, exp) + tm.assert_series_equal(res, exp) # hit non-monotonic code path res = x[::-1] - y[::-1] exp_index = x.index.union(y.index) exp = x.reindex(exp_index) - y.reindex(exp_index) - assert_series_equal(res, exp) - - def test_is_lexsorted(self): - levels = [[0, 1], [0, 1, 2]] - - index = MultiIndex(levels=levels, - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) - self.assertTrue(index.is_lexsorted()) - - index = MultiIndex(levels=levels, - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) - self.assertFalse(index.is_lexsorted()) - - index = MultiIndex(levels=levels, - labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) - self.assertFalse(index.is_lexsorted()) - self.assertEqual(index.lexsort_depth, 0) + tm.assert_series_equal(res, exp) def test_frame_getitem_view(self): df = self.frame.T.copy() @@ -1465,66 +1329,29 @@ def f(): pass self.assertTrue((df['foo', 'one'] == 0).all()) - def test_frame_getitem_not_sorted(self): - df = self.frame.T - df['foo', 'four'] = 'foo' - - arrays = [np.array(x) for x in zip(*df.columns.values)] - - result = 
df['foo'] - result2 = df.loc[:, 'foo'] - expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) - expected.columns = expected.columns.droplevel(0) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - - df = df.T - result = df.xs('foo') - result2 = df.loc['foo'] - expected = df.reindex(df.index[arrays[0] == 'foo']) - expected.index = expected.index.droplevel(0) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - - def test_series_getitem_not_sorted(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - tuples = lzip(*arrays) - index = MultiIndex.from_tuples(tuples) - s = Series(randn(8), index=index) - - arrays = [np.array(x) for x in zip(*index.values)] - - result = s['qux'] - result2 = s.loc['qux'] - expected = s[arrays[0] == 'qux'] - expected.index = expected.index.droplevel(0) - assert_series_equal(result, expected) - assert_series_equal(result2, expected) - def test_count(self): frame = self.frame.copy() frame.index.names = ['a', 'b'] result = frame.count(level='b') expect = self.frame.count(level=1) - assert_frame_equal(result, expect, check_names=False) + tm.assert_frame_equal(result, expect, check_names=False) result = frame.count(level='a') expect = self.frame.count(level=0) - assert_frame_equal(result, expect, check_names=False) + tm.assert_frame_equal(result, expect, check_names=False) series = self.series.copy() series.index.names = ['a', 'b'] result = series.count(level='b') expect = self.series.count(level=1) - assert_series_equal(result, expect, check_names=False) + tm.assert_series_equal(result, expect, check_names=False) self.assertEqual(result.index.name, 'b') result = series.count(level='a') expect = self.series.count(level=0) - assert_series_equal(result, expect, check_names=False) + tm.assert_series_equal(result, expect, check_names=False) self.assertEqual(result.index.name, 'a') self.assertRaises(KeyError, series.count, 'x') @@ -1541,7 +1368,7 @@ def test_series_group_min_max(self): # skipna=True leftside = grouped.agg(aggf) rightside = getattr(self.series, op)(level=level, skipna=skipna) - assert_series_equal(leftside, rightside) + tm.assert_series_equal(leftside, rightside) def test_frame_group_ops(self): self.frame.iloc[1, [1, 2]] = np.nan @@ -1550,6 +1377,7 @@ def test_frame_group_ops(self): for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2), lrange(2), [False, True]): + if axis == 0: frame = self.frame else: @@ -1570,17 +1398,17 @@ def aggf(x): # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level] - self.assert_index_equal(leftside._get_axis(axis), level_index) - self.assert_index_equal(rightside._get_axis(axis), level_index) + tm.assert_index_equal(leftside._get_axis(axis), level_index) + tm.assert_index_equal(rightside._get_axis(axis), level_index) - assert_frame_equal(leftside, rightside) + tm.assert_frame_equal(leftside, rightside) def test_stat_op_corner(self): obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) result = obj.sum(level=0) expected = Series([10.0], index=[2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_frame_any_all_group(self): df = DataFrame( @@ -1591,11 +1419,11 @@ def test_frame_any_all_group(self): result = df.any(level=0) ex = DataFrame({'data': [False, True]}, index=['one', 'two']) - assert_frame_equal(result, ex) + tm.assert_frame_equal(result, ex) result = df.all(level=0) ex = 
DataFrame({'data': [False, False]}, index=['one', 'two']) - assert_frame_equal(result, ex) + tm.assert_frame_equal(result, ex) def test_std_var_pass_ddof(self): index = MultiIndex.from_arrays([np.arange(5).repeat(10), np.tile( @@ -1608,20 +1436,20 @@ def test_std_var_pass_ddof(self): result = getattr(df[0], meth)(level=0, ddof=ddof) expected = df[0].groupby(level=0).agg(alt) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = getattr(df, meth)(level=0, ddof=ddof) expected = df.groupby(level=0).agg(alt) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_frame_series_agg_multiple_levels(self): result = self.ymd.sum(level=['year', 'month']) expected = self.ymd.groupby(level=['year', 'month']).sum() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.ymd['A'].sum(level=['year', 'month']) expected = self.ymd['A'].groupby(level=['year', 'month']).sum() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_groupby_multilevel(self): result = self.ymd.groupby(level=[0, 1]).mean() @@ -1631,12 +1459,12 @@ def test_groupby_multilevel(self): expected = self.ymd.groupby([k1, k2]).mean() - assert_frame_equal(result, expected, check_names=False - ) # TODO groupby with level_values drops names + # TODO groupby with level_values drops names + tm.assert_frame_equal(result, expected, check_names=False) self.assertEqual(result.index.names, self.ymd.index.names[:2]) result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean() - assert_frame_equal(result, result2) + tm.assert_frame_equal(result, result2) def test_groupby_multilevel_with_transform(self): pass @@ -1665,15 +1493,15 @@ def test_partial_set(self): exp = self.ymd.copy() df.loc[2000, 4] = 0 exp.loc[2000, 4].values[:] = 0 - assert_frame_equal(df, exp) + tm.assert_frame_equal(df, exp) df['A'].loc[2000, 4] = 1 exp['A'].loc[2000, 4].values[:] = 1 - assert_frame_equal(df, exp) + tm.assert_frame_equal(df, exp) df.loc[2000] = 5 exp.loc[2000].values[:] = 5 - assert_frame_equal(df, exp) + tm.assert_frame_equal(df, exp) # this works...for now df['A'].iloc[14] = 5 @@ -1702,7 +1530,7 @@ def test_unstack_group_index_overflow(self): # test roundtrip stacked = result.stack() - assert_series_equal(s, stacked.reindex(s.index)) + tm.assert_series_equal(s, stacked.reindex(s.index)) # put it at beginning index = MultiIndex(levels=[[0, 1]] + [level] * 8, @@ -1737,7 +1565,7 @@ def test_partial_ix_missing(self): result = self.ymd.loc[2000, 0] expected = self.ymd.loc[2000]['A'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # need to put in some work here @@ -1767,8 +1595,8 @@ def test_level_with_tuples(self): result2 = series.loc[('foo', 'bar', 0)] expected = series[:2] expected.index = expected.index.droplevel(0) - assert_series_equal(result, expected) - assert_series_equal(result2, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2)) @@ -1776,8 +1604,8 @@ def test_level_with_tuples(self): result2 = frame.xs(('foo', 'bar', 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), ( 'foo', 'qux')], [0, 1]], @@ -1790,30 +1618,30 @@ 
def test_level_with_tuples(self): result2 = series.loc[('foo', 'bar')] expected = series[:2] expected.index = expected.index.droplevel(0) - assert_series_equal(result, expected) - assert_series_equal(result2, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) result = frame.loc[('foo', 'bar')] result2 = frame.xs(('foo', 'bar')) expected = frame[:2] expected.index = expected.index.droplevel(0) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) def test_int_series_slicing(self): s = self.ymd['A'] result = s[5:] expected = s.reindex(s.index[5:]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) exp = self.ymd['A'].copy() s[5:] = 0 exp.values[5:] = 0 - self.assert_numpy_array_equal(s.values, exp.values) + tm.assert_numpy_array_equal(s.values, exp.values) result = self.ymd[5:] expected = self.ymd.reindex(s.index[5:]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_mixed_depth_get(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], @@ -1826,12 +1654,12 @@ def test_mixed_depth_get(self): result = df['a'] expected = df['a', '', ''] - assert_series_equal(result, expected, check_names=False) + tm.assert_series_equal(result, expected, check_names=False) self.assertEqual(result.name, 'a') result = df['routine1', 'result1'] expected = df['routine1', 'result1', ''] - assert_series_equal(result, expected, check_names=False) + tm.assert_series_equal(result, expected, check_names=False) self.assertEqual(result.name, ('routine1', 'result1')) def test_mixed_depth_insert(self): @@ -1847,7 +1675,7 @@ def test_mixed_depth_insert(self): expected = df.copy() result['b'] = [1, 2, 3, 4] expected['b', '', ''] = [1, 2, 3, 4] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_mixed_depth_drop(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], @@ -1860,16 +1688,16 @@ def test_mixed_depth_drop(self): result = df.drop('a', axis=1) expected = df.drop([('a', '', '')], axis=1) - assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) result = df.drop(['top'], axis=1) expected = df.drop([('top', 'OD', 'wx')], axis=1) expected = expected.drop([('top', 'OD', 'wy')], axis=1) - assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) result = df.drop(('top', 'OD', 'wx'), axis=1) expected = df.drop([('top', 'OD', 'wx')], axis=1) - assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) expected = df.drop([('top', 'OD', 'wy')], axis=1) expected = df.drop('top', axis=1) @@ -1877,7 +1705,7 @@ def test_mixed_depth_drop(self): result = df.drop('result1', level=1, axis=1) expected = df.drop([('routine1', 'result1', ''), ('routine2', 'result1', '')], axis=1) - assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) def test_drop_nonunique(self): df = DataFrame([["x-a", "x", "a", 1.5], ["x-a", "x", "a", 1.2], @@ -1898,7 +1726,7 @@ def test_drop_nonunique(self): result.index = expected.index - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_mixed_depth_pop(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], @@ -1913,32 +1741,32 @@ def test_mixed_depth_pop(self): df2 = df.copy() result = df1.pop('a') expected = df2.pop(('a', '', '')) - assert_series_equal(expected, result, 
check_names=False) - assert_frame_equal(df1, df2) + tm.assert_series_equal(expected, result, check_names=False) + tm.assert_frame_equal(df1, df2) self.assertEqual(result.name, 'a') expected = df1['top'] df1 = df1.drop(['top'], axis=1) result = df2.pop('top') - assert_frame_equal(expected, result) - assert_frame_equal(df1, df2) + tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(df1, df2) def test_reindex_level_partial_selection(self): result = self.frame.reindex(['foo', 'qux'], level=0) expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.frame.T.reindex_axis(['foo', 'qux'], axis=1, level=0) - assert_frame_equal(result, expected.T) + tm.assert_frame_equal(result, expected.T) result = self.frame.loc[['foo', 'qux']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.frame['A'].loc[['foo', 'qux']] - assert_series_equal(result, expected['A']) + tm.assert_series_equal(result, expected['A']) result = self.frame.T.loc[:, ['foo', 'qux']] - assert_frame_equal(result, expected.T) + tm.assert_frame_equal(result, expected.T) def test_setitem_multiple_partial(self): expected = self.frame.copy() @@ -1946,45 +1774,45 @@ def test_setitem_multiple_partial(self): result.loc[['foo', 'bar']] = 0 expected.loc['foo'] = 0 expected.loc['bar'] = 0 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = self.frame.copy() result = self.frame.copy() result.loc['foo':'bar'] = 0 expected.loc['foo'] = 0 expected.loc['bar'] = 0 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = self.frame['A'].copy() result = self.frame['A'].copy() result.loc[['foo', 'bar']] = 0 expected.loc['foo'] = 0 expected.loc['bar'] = 0 - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = self.frame['A'].copy() result = self.frame['A'].copy() result.loc['foo':'bar'] = 0 expected.loc['foo'] = 0 expected.loc['bar'] = 0 - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_drop_level(self): result = self.frame.drop(['bar', 'qux'], level='first') expected = self.frame.iloc[[0, 1, 2, 5, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.frame.drop(['two'], level='second') expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') expected = self.frame.iloc[[0, 1, 2, 5, 6]].T - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.frame.T.drop(['two'], axis=1, level='second') expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_drop_level_nonunique_datetime(self): # GH 12701 @@ -2003,7 +1831,7 @@ def test_drop_level_nonunique_datetime(self): result = df.drop(ts, level='tstamp') expected = df.loc[idx != 4] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_drop_preserve_names(self): index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], @@ -2089,7 +1917,7 @@ def test_indexing_ambiguity_bug_1678(self): result = frame.iloc[:, 1] exp = frame.loc[:, ('Ohio', 'Red')] tm.assertIsInstance(result, Series) - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) def test_nonunique_assignment_1750(self): df = 
DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], @@ -2181,7 +2009,7 @@ def test_duplicate_mi(self): ['foo', 'bar', 5.0, 5]], columns=list('ABCD')).set_index(['A', 'B']) result = df.loc[('foo', 'bar')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_duplicated_drop_duplicates(self): # GH 4060 @@ -2242,8 +2070,8 @@ def test_datetimeindex(self): expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'], tz='Asia/Tokyo') - self.assert_index_equal(idx.levels[0], expected1) - self.assert_index_equal(idx.levels[1], idx2) + tm.assert_index_equal(idx.levels[0], expected1) + tm.assert_index_equal(idx.levels[1], idx2) # from datetime combos # GH 7888 @@ -2289,14 +2117,14 @@ def test_set_index_datetime(self): expected = expected.tz_localize('UTC').tz_convert('US/Pacific') df = df.set_index('label', append=True) - self.assert_index_equal(df.index.levels[0], expected) - self.assert_index_equal(df.index.levels[1], - pd.Index(['a', 'b'], name='label')) + tm.assert_index_equal(df.index.levels[0], expected) + tm.assert_index_equal(df.index.levels[1], + pd.Index(['a', 'b'], name='label')) df = df.swaplevel(0, 1) - self.assert_index_equal(df.index.levels[0], - pd.Index(['a', 'b'], name='label')) - self.assert_index_equal(df.index.levels[1], expected) + tm.assert_index_equal(df.index.levels[0], + pd.Index(['a', 'b'], name='label')) + tm.assert_index_equal(df.index.levels[1], expected) df = DataFrame(np.random.random(6)) idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', @@ -2319,14 +2147,14 @@ def test_set_index_datetime(self): expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], tz='US/Eastern') - self.assert_index_equal(df.index.levels[0], expected1) - self.assert_index_equal(df.index.levels[1], expected2) - self.assert_index_equal(df.index.levels[2], idx3) + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) # GH 7092 - self.assert_index_equal(df.index.get_level_values(0), idx1) - self.assert_index_equal(df.index.get_level_values(1), idx2) - self.assert_index_equal(df.index.get_level_values(2), idx3) + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) def test_reset_index_datetime(self): # GH 3950 @@ -2351,7 +2179,7 @@ def test_reset_index_datetime(self): expected['idx1'] = expected['idx1'].apply( lambda d: pd.Timestamp(d, tz=tz)) - assert_frame_equal(df.reset_index(), expected) + tm.assert_frame_equal(df.reset_index(), expected) idx3 = pd.date_range('1/1/2012', periods=5, freq='MS', tz='Europe/Paris', name='idx3') @@ -2378,7 +2206,7 @@ def test_reset_index_datetime(self): lambda d: pd.Timestamp(d, tz=tz)) expected['idx3'] = expected['idx3'].apply( lambda d: pd.Timestamp(d, tz='Europe/Paris')) - assert_frame_equal(df.reset_index(), expected) + tm.assert_frame_equal(df.reset_index(), expected) # GH 7793 idx = pd.MultiIndex.from_product([['a', 'b'], pd.date_range( @@ -2396,7 +2224,7 @@ def test_reset_index_datetime(self): columns=['level_0', 'level_1', 'a']) expected['level_1'] = expected['level_1'].apply( lambda d: pd.Timestamp(d, freq='D', tz=tz)) - assert_frame_equal(df.reset_index(), expected) + tm.assert_frame_equal(df.reset_index(), expected) def test_reset_index_period(self): # GH 7746 @@ -2415,7 +2243,7 @@ def test_reset_index_period(self): 
'feature': ['a', 'b', 'c'] * 3, 'a': np.arange(9, dtype='int64') }, columns=['month', 'feature', 'a']) - assert_frame_equal(df.reset_index(), expected) + tm.assert_frame_equal(df.reset_index(), expected) def test_set_index_period(self): # GH 6631 df = DataFrame(np.random.random(6)) idx1 = pd.period_range('2011-01-01', periods=3, freq='M') idx1 = idx1.append(idx1) idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') idx2 = idx2.append(idx2).append(idx2) idx3 = pd.period_range('2005', periods=6, freq='A') df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) @@ -2433,13 +2261,13 @@ def test_set_index_period(self): expected1 = pd.period_range('2011-01-01', periods=3, freq='M') expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') - self.assert_index_equal(df.index.levels[0], expected1) - self.assert_index_equal(df.index.levels[1], expected2) - self.assert_index_equal(df.index.levels[2], idx3) + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) - self.assert_index_equal(df.index.get_level_values(0), idx1) - self.assert_index_equal(df.index.get_level_values(1), idx2) - self.assert_index_equal(df.index.get_level_values(2), idx3) + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) def test_repeat(self): # GH 9361 @@ -2475,4 +2303,185 @@ def test_iloc_mi(self): result = pd.DataFrame([[df_mi.iloc[r, c] for c in range(2)] for r in range(5)]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +class TestSorted(Base, tm.TestCase): + """ everything you wanted to test about sorting """ + + def test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + self.assertEqual(result.index.names, self.frame.index.names) + + def test_sorting_repr_8017(self): + + np.random.seed(0) + data = np.random.randn(3, 4) + + for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4), + ([Timestamp('20130101'), Timestamp('20130103'), + Timestamp('20130102'), Timestamp('20130105')], + Timestamp('20130104')), + (['1one', '3one', '2one', '5one'], '4one')]: + columns = MultiIndex.from_tuples([('red', i) for i in gen]) + df = DataFrame(data, index=list('def'), columns=columns) + df2 = pd.concat([df, + DataFrame('world', index=list('def'), + columns=MultiIndex.from_tuples( + [('red', extra)]))], axis=1) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. only 1 header of 'red' + self.assertEqual(str(df2).splitlines()[0].split(), ['red']) + + # GH 8017 + # sorting fails after columns added + + # construct single-dtype then sort + result = df.copy().sort_index(axis=1) + expected = df.iloc[:, [0, 2, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df2.sort_index(axis=1) + expected = df2.iloc[:, [0, 2, 1, 4, 3]] + tm.assert_frame_equal(result, expected) + + # setitem then sort + result = df.copy() + result[('red', extra)] = 'world' + + result = result.sort_index(axis=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level(self): + df = self.frame.copy() + df.index = np.arange(len(df)) + + # axis=1 + + # series + a_sorted = self.frame['A'].sort_index(level=0) + + # preserve names + self.assertEqual(a_sorted.index.names, self.frame.index.names) + + # inplace + rs = self.frame.copy() + rs.sort_index(level=0, inplace=True) + tm.assert_frame_equal(rs, self.frame.sort_index(level=0)) + + def test_sort_index_level_large_cardinality(self): + + # #2684 (int64) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) + + # it works!
+ result = df.sort_index(level=0) + self.assertTrue(result.index.lexsort_depth == 3) + + # #2684 (int32) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) + + # it works! + result = df.sort_index(level=0) + self.assertTrue((result.dtypes.values == df.dtypes.values).all()) + self.assertTrue(result.index.lexsort_depth == 3) + + def test_sort_index_level_by_name(self): + self.frame.index.names = ['first', 'second'] + result = self.frame.sort_index(level='second') + expected = self.frame.sort_index(level=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level_mixed(self): + sorted_before = self.frame.sort_index(level=1) + + df = self.frame.copy() + df['foo'] = 'bar' + sorted_after = df.sort_index(level=1) + tm.assert_frame_equal(sorted_before, + sorted_after.drop(['foo'], axis=1)) + + dft = self.frame.T + sorted_before = dft.sort_index(level=1, axis=1) + dft['foo', 'three'] = 'bar' + + sorted_after = dft.sort_index(level=1, axis=1) + tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), + sorted_after.drop([('foo', 'three')], axis=1)) + + def test_is_lexsorted(self): + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + self.assertTrue(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) + self.assertFalse(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) + self.assertFalse(index.is_lexsorted()) + self.assertEqual(index.lexsort_depth, 0) + + def test_getitem_multilevel_index_tuple_not_sorted(self): + index_columns = list("abc") + df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], + columns=index_columns + ["data"]) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.loc[query_index, "data"] + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) + xp = Series(['x'], index=xp_idx, name='data') + tm.assert_series_equal(rs, xp) + + def test_getitem_slice_not_sorted(self): + df = self.frame.sort_index(level=1).T + + # buglet with int typechecking + result = df.iloc[:, :np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted(self): + df = self.frame.T + df['foo', 'four'] = 'foo' + + arrays = [np.array(x) for x in zip(*df.columns.values)] + + result = df['foo'] + result2 = df.loc[:, 'foo'] + expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + df = df.T + result = df.xs('foo') + result2 = df.loc['foo'] + expected = df.reindex(df.index[arrays[0] == 'foo']) + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = lzip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index.values)] + + result = s['qux'] + result2 = s.loc['qux'] + expected = s[arrays[0] == 'qux'] + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) From 
7d3333c33872f99daa34762c8f8e9f54b268347d Mon Sep 17 00:00:00 2001 From: Greg Williams Date: Thu, 16 Mar 2017 07:56:46 -0400 Subject: [PATCH 298/338] BUG: Group-by numeric type-coercion with datetime closes #14423 closes #15421 closes #15670 During a group-by/apply on a DataFrame, in the presence of one or more DateTime-like columns, Pandas would incorrectly coerce the type of all other columns to numeric. E.g. a String column would be coerced to numeric, producing NaNs. Author: Greg Williams Closes #15680 from gwpdt/bugfix14423 and squashes the following commits: e1ed104 [Greg Williams] TST: Rename and expand test_numeric_coercion 0a15674 [Greg Williams] CLN: move import, add whatsnew entry c8844e0 [Greg Williams] CLN: PEP8 (whitespace fixes) 46d12c2 [Greg Williams] BUG: Group-by numeric type-coericion with datetime --- doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/core/groupby.py | 5 ++- pandas/tests/groupby/test_groupby.py | 48 ++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 41b6519eb740f..a56212328f5c3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -850,7 +850,8 @@ Bug Fixes - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) -- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) +- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) +- Bug in ``groupby.apply()`` coercing ``object`` dtypes to numeric types, when not all values were numeric (:issue:`14423`, :issue:`15421`, :issue:`15670`) - Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a10be078a8f96..7a017ffae284c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -10,6 +10,7 @@ zip, range, lzip, callable, map ) + from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat.numpy import _np_version_under1p8 @@ -3424,6 +3425,7 @@ def _decide_output_index(self, output, labels): def _wrap_applied_output(self, keys, values, not_indexed_same=False): from pandas.core.index import _all_indexes_same + from pandas.tools.util import to_numeric if len(keys) == 0: return DataFrame(index=keys) @@ -3566,7 +3568,8 @@ def first_non_None_value(values): # as we are stacking can easily have object dtypes here so = self._selected_obj if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()): - result = result._convert(numeric=True) + result = result.apply( + lambda x: to_numeric(x, errors='ignore')) date_cols = self._selected_obj.select_dtypes( include=['datetime', 'timedelta']).columns date_cols = date_cols.intersection(result.columns) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d7fa3beda0abf..c25974c94bfd1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4314,6 +4314,54 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) + def test_apply_numeric_coercion_when_datetime(self): + # In the past, group-by/apply operations have been over-eager + # in converting dtypes to numeric, in the presence of datetime + # columns. Various GH issues were filed, the reproductions + # for which are here. 
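+ # (note: the fix routes conversion through to_numeric(errors='ignore'),
+ # which leaves a column untouched whenever any of its values fails to
+ # parse, instead of coercing the whole column and manufacturing NaNs)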
+ + # GH 15670 + df = pd.DataFrame({'Number': [1, 2], + 'Date': ["2017-03-02"] * 2, + 'Str': ["foo", "inf"]}) + expected = df.groupby(['Number']).apply(lambda x: x.iloc[0]) + df.Date = pd.to_datetime(df.Date) + result = df.groupby(['Number']).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(result['Str'], expected['Str']) + + # GH 15421 + df = pd.DataFrame({'A': [10, 20, 30], + 'B': ['foo', '3', '4'], + 'T': [pd.Timestamp("12:31:22")] * 3}) + + def get_B(g): + return g.iloc[0][['B']] + result = df.groupby('A').apply(get_B)['B'] + expected = df.B + expected.index = df.A + tm.assert_series_equal(result, expected) + + # GH 14423 + def predictions(tool): + out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object) + if 'step1' in list(tool.State): + out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0]) + if 'step2' in list(tool.State): + out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0]) + out['useTime'] = str( + tool[tool.State == 'step2'].oTime.values[0]) + return out + df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'], + 'State': ['step1', 'step2', 'step1', 'step2'], + 'oTime': ['', '2016-09-19 05:24:33', + '', '2016-09-19 23:59:04'], + 'Machine': ['23', '36L', '36R', '36R']}) + df2 = df1.copy() + df2.oTime = pd.to_datetime(df2.oTime) + expected = df1.groupby('Key').apply(predictions).p1 + result = df2.groupby('Key').apply(predictions).p1 + tm.assert_series_equal(expected, result) + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From ca223b59ec21ad66d6fb8a5c4f72d83a3e395067 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Mar 2017 12:20:37 -0400 Subject: [PATCH 299/338] CI: remove dev-scipy from testing on numpy-dev build, as the wheels there are really old (#15699) closes #15696 --- ci/requirements-3.5_NUMPY_DEV.build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.5_NUMPY_DEV.build.sh b/ci/requirements-3.5_NUMPY_DEV.build.sh index 91fa15491bbf7..b6c8a477e6f5e 100644 --- a/ci/requirements-3.5_NUMPY_DEV.build.sh +++ b/ci/requirements-3.5_NUMPY_DEV.build.sh @@ -8,6 +8,6 @@ echo "install numpy master wheel" pip uninstall numpy -y # install numpy wheel from master -pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy scipy +pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy true From 7e4615a0045055929b9705c0d09840fdfa1a3df8 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Thu, 16 Mar 2017 12:12:08 -0700 Subject: [PATCH 300/338] MAINT: test with manylinux numpy/scipy pre-release (#15702) Numpy is switching to daily manylinux wheels built from trunk, instead of building wheels specific to Ubuntu 12.04 for every commit. Use these new wheels for numpy pre-release testing.
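As a quick sanity check that a pre-release wheel was actually picked up (an illustrative aside, not part of the patch), the installed version string can be inspected; wheels built from trunk carry a PEP 440 dev marker:

    import numpy as np
    # a trunk build reports something like '1.13.0.dev0+<githash>'
    assert '.dev' in np.__version__, np.__version__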
--- .travis.yml | 10 ---------- ci/requirements-3.5_NUMPY_DEV.build.sh | 3 ++- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index b0331941e2a1e..ee093e5bf0e60 100644 --- a/.travis.yml +++ b/.travis.yml @@ -123,11 +123,6 @@ matrix: - PANDAS_TESTING_MODE="deprecate" - CACHE_NAME="35_numpy_dev" - USE_CACHE=true - addons: - apt: - packages: - - libatlas-base-dev - - gfortran # In allow_failures - python: 3.5 env: @@ -167,11 +162,6 @@ matrix: - PANDAS_TESTING_MODE="deprecate" - CACHE_NAME="35_numpy_dev" - USE_CACHE=true - addons: - apt: - packages: - - libatlas-base-dev - - gfortran - python: 3.5 env: - PYTHON_VERSION=3.5 diff --git a/ci/requirements-3.5_NUMPY_DEV.build.sh b/ci/requirements-3.5_NUMPY_DEV.build.sh index b6c8a477e6f5e..4af1307f26a18 100644 --- a/ci/requirements-3.5_NUMPY_DEV.build.sh +++ b/ci/requirements-3.5_NUMPY_DEV.build.sh @@ -8,6 +8,7 @@ echo "install numpy master wheel" pip uninstall numpy -y # install numpy wheel from master -pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy +PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" +pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy true From 3dd9a8d3445241d31b35c8aedb4b43837a5c7018 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Mar 2017 16:46:21 -0400 Subject: [PATCH 301/338] TST: missing __init__.py file in pandas/tests/io/sas --- pandas/tests/io/sas/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/sas/__init__.py diff --git a/pandas/tests/io/sas/__init__.py b/pandas/tests/io/sas/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 05be2bd729af188a446db12ba13c150b938b6d90 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Mar 2017 18:28:56 -0400 Subject: [PATCH 302/338] TST: report the exit code on pandas.test() exit --- pandas/util/_tester.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 8d9701e0b4672..aeb4259a9edae 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -2,6 +2,7 @@ Entrypoint for testing from the top-level namespace """ import os +import sys PKG = os.path.dirname(os.path.dirname(__file__)) @@ -20,7 +21,7 @@ def test(extra_args=None): cmd = extra_args cmd += [PKG] print("running: pytest {}".format(' '.join(cmd))) - pytest.main(cmd) + sys.exit(pytest.main(cmd)) __all__ = ['test'] From 4dbe10eba17396f2feb20ff98f0fd08eb87f0d39 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Mar 2017 08:37:10 -0400 Subject: [PATCH 303/338] CI: re-enable miniconda cache --- ci/install_travis.sh | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 12202b4ceee70..aad87ea37439f 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -33,20 +33,26 @@ home_dir=$(pwd) echo "[home_dir: $home_dir]" # install miniconda -echo "[Using clean Miniconda install]" - MINICONDA_DIR="$HOME/miniconda3" -if [ -d "$MINICONDA_DIR" ]; then - rm -rf "$MINICONDA_DIR" -fi -# install miniconda -if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 +if [ "$USE_CACHE" ] && [ -d "$MINICONDA_DIR" ]; then + echo "[Using cached Miniconda install]" + else - wget 
http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + echo "[Using clean Miniconda install]" + + if [ -d "$MINICONDA_DIR" ]; then + rm -rf "$MINICONDA_DIR" + fi + + # install miniconda + if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + else + wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + fi + bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 fi -bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 echo "[update conda]" conda config --set ssl_verify false || exit 1 From 50bbb0934c3a97e4e8c80b04bb1517f48b2f5d26 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 17 Mar 2017 08:50:27 -0400 Subject: [PATCH 304/338] TST: Replace package_check with skip_if_no_package (#15709) skip_if_no_package literally just called package_check with no additional decorations. --- pandas/tests/io/test_pytables.py | 3 ++- pandas/util/testing.py | 27 ++++++++++----------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5592c564e51df..8ea8088a297b8 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -728,7 +728,8 @@ def test_put_compression(self): format='fixed', complib='zlib') def test_put_compression_blosc(self): - tm.skip_if_no_package('tables', '2.2', app='blosc support') + tm.skip_if_no_package('tables', min_version='2.2', + app='blosc support') if skip_compression: pytest.skip("skipping on windows/PY3") diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 529ecef3e2d6a..154476ce8340a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2010,12 +2010,16 @@ def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) -# Dependency checks. Copied this from Nipy/Nipype (Copyright of -# respective developers, license: BSD-3) -def package_check(pkg_name, min_version=None, max_version=None, app='pandas', - checker=LooseVersion): +# Dependency checker when running tests. +# +# Copied this from nipy/nipype +# Copyright of respective developers, License: BSD-3 +def skip_if_no_package(pkg_name, min_version=None, max_version=None, + app='pandas', checker=LooseVersion): """Check that the min/max version of the required package is installed. + If the package check fails, the test is automatically skipped. + Parameters ---------- pkg_name : string @@ -2025,11 +2029,11 @@ def package_check(pkg_name, min_version=None, max_version=None, app='pandas', max_version : string, optional Max version number for required package. app : string, optional - Application that is performing the check. For instance, the + Application that is performing the check. For instance, the name of the tutorial being executed that depends on specific packages. checker : object, optional - The class that will perform the version checking. Default is + The class that will perform the version checking. Default is distutils.version.LooseVersion.
Examples @@ -2061,17 +2065,6 @@ def package_check(pkg_name, min_version=None, max_version=None, app='pandas', pytest.skip(msg) -def skip_if_no_package(*args, **kwargs): - """pytest.skip() if package_check fails - - Parameters - ---------- - *args Positional parameters passed to `package_check` - *kwargs Keyword parameters passed to `package_check` - """ - package_check(*args, **kwargs) - - def optional_args(decorator): """allows a decorator to take optional positional and keyword arguments. Assumes that taking a single, callable, positional argument means that From 9630d4cb94703749aaeacdccbbb6a1110fa845d0 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 17 Mar 2017 08:50:55 -0400 Subject: [PATCH 305/338] DOC: Add gotcha about flake8-ing diff `flake8`-ing the diff will not catch any import-style errors. I added an alternative check that is more comprehensive but will take longer to run, since it checks entire files instead of just the diff. Author: gfyoung Closes #15712 from gfyoung/pep8-diff-gotcha and squashes the following commits: 42c13de [gfyoung] DOC: Add gotcha about flake8-ing diff --- doc/source/contributing.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 83f99b4f01b26..7961780d0c79b 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -520,6 +520,15 @@ submitting code to run the check yourself on the diff:: git diff master | flake8 --diff +This command will catch any stylistic errors in your changes specifically, but +be aware it may not catch all of them. For example, if you delete the only +usage of an imported function, the import becomes unused, which is itself a +style error. However, style-checking the diff will not catch this because +the actual import is not part of the diff.
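+(Concretely: the unused import would normally be flagged at the import line
+as an F401 error, but that line never appears in the diff, so ``flake8 --diff``
+stays silent about it.)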
Thus, for completeness, you should +run this command, though it will take longer:: + + git diff master --name-only -- '*.py' | grep 'pandas' | xargs -r flake8 + Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ From 7fc34342a869c95ddf18229f72dced8a9629c552 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Mar 2017 09:06:27 -0400 Subject: [PATCH 306/338] TST: don't catch, but suppress warnings in panel4d/panelnd (#15705) --- pandas/core/categorical.py | 4 +- pandas/io/pytables.py | 18 ++- pandas/tests/io/test_pytables.py | 221 ++++++++++++------------------ pandas/tests/test_panel.py | 3 +- pandas/tests/test_panel4d.py | 187 +++++++++++++------------ pandas/tests/test_panelnd.py | 7 +- pandas/tests/tools/test_concat.py | 5 +- 7 files changed, 210 insertions(+), 235 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c1e5904693d1c..af51c7f2e2dc1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -550,8 +550,8 @@ def _validate_categories(cls, categories, fastpath=False): # we don't allow NaNs in the categories themselves if categories.hasnans: - # NaNs in cats deprecated in 0.17, - # remove in 0.18 or 0.19 GH 10748 + # NaNs in cats deprecated in 0.17 + # GH 10748 msg = ('\nSetting NaNs in `categories` is deprecated and ' 'will be removed in a future version of pandas.') warn(msg, FutureWarning, stacklevel=3) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e4f72f901a417..10fe678b487f9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2097,7 +2097,17 @@ def convert(self, values, nan_rep, encoding): # we have a categorical categories = self.metadata - self.data = Categorical.from_codes(self.data.ravel(), + codes = self.data.ravel() + + # if we have stored a NaN in the categories + # then strip it; in theory we could have BOTH + # -1s in the codes and nulls :< + mask = isnull(categories) + if mask.any(): + categories = categories[~mask] + codes[codes != -1] -= mask.astype(int).cumsum().values + + self.data = Categorical.from_codes(codes, categories=categories, ordered=self.ordered) @@ -3407,10 +3417,12 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if existing_table is not None: indexer = len(self.non_index_axes) exist_axis = existing_table.non_index_axes[indexer][1] - if append_axis != exist_axis: + if not array_equivalent(np.array(append_axis), + np.array(exist_axis)): # ahah!
-> reindex - if sorted(append_axis) == sorted(exist_axis): + if array_equivalent(np.array(sorted(append_axis)), + np.array(sorted(exist_axis))): append_axis = exist_axis # the non_index_axes info diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 8ea8088a297b8..40866b8702fe2 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1,11 +1,12 @@ import pytest import sys import os -import warnings +from warnings import catch_warnings import tempfile from contextlib import contextmanager import datetime +from datetime import timedelta import numpy as np import pandas @@ -22,7 +23,7 @@ from pandas.io.pytables import TableIterator from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf, IncompatibilityWarning, PerformanceWarning, - AttributeConflictWarning, DuplicateWarning, + AttributeConflictWarning, PossibleDataLossError, ClosedFileError) from pandas.io import pytables as pytables @@ -31,7 +32,6 @@ assert_panel_equal, assert_frame_equal, assert_series_equal, - assert_produces_warning, set_timezone) from pandas import concat, Timestamp from pandas import compat @@ -123,17 +123,6 @@ def _maybe_remove(store, key): pass -@contextmanager -def compat_assert_produces_warning(w): - """ don't produce a warning under PY3 """ - if compat.PY3: - yield - else: - with tm.assert_produces_warning(expected_warning=w, - check_stacklevel=False): - yield - - class Base(tm.TestCase): @classmethod @@ -151,8 +140,6 @@ def tearDownClass(cls): tm.set_testing_mode() def setUp(self): - warnings.filterwarnings(action='ignore', category=FutureWarning) - self.path = 'tmp.__%s__.h5' % tm.rands(10) def tearDown(self): @@ -420,9 +407,9 @@ def test_repr(self): df.loc[3:6, ['obj1']] = np.nan df = df._consolidate()._convert(datetime=True) - warnings.filterwarnings('ignore', category=PerformanceWarning) - store['df'] = df - warnings.filterwarnings('always', category=PerformanceWarning) + # PerformanceWarning + with catch_warnings(record=True): + store['df'] = df # make a random group in hdf space store._handle.create_group(store._handle.root, 'bah') @@ -455,9 +442,9 @@ def test_contains(self): self.assertNotIn('bar', store) # GH 2694 - warnings.filterwarnings( - 'ignore', category=tables.NaturalNameWarning) - store['node())'] = tm.makeDataFrame() + # tables.NaturalNameWarning + with catch_warnings(record=True): + store['node())'] = tm.makeDataFrame() self.assertIn('node())', store) def test_versioning(self): @@ -768,11 +755,8 @@ def test_put_mixed_type(self): with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df') - # cannot use assert_produces_warning here for some reason - # a PendingDeprecationWarning is also raised? 
- warnings.filterwarnings('ignore', category=PerformanceWarning) - store.put('df', df) - warnings.filterwarnings('always', category=PerformanceWarning) + with catch_warnings(record=True): + store.put('df', df) expected = store.get('df') tm.assert_frame_equal(expected, df) @@ -797,8 +781,8 @@ def test_append(self): tm.assert_frame_equal(store['df3'], df) # this is allowed but you almost always don't want to do it - with tm.assert_produces_warning( - expected_warning=tables.NaturalNameWarning): + # tables.NaturalNameWarning): + with catch_warnings(record=True): _maybe_remove(store, '/df3 foo') store.append('/df3 foo', df[:10]) store.append('/df3 foo', df[10:]) @@ -812,8 +796,7 @@ def test_append(self): assert_panel_equal(store['wp1'], wp) # ndim - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D() _maybe_remove(store, 'p4d') store.append('p4d', p4d.iloc[:, :, :10, :]) @@ -901,12 +884,12 @@ def test_append_series(self): # select on the values expected = ns[ns > 60] - result = store.select('ns', Term('foo>60')) + result = store.select('ns', 'foo>60') tm.assert_series_equal(result, expected) # select on the index and values expected = ns[(ns > 70) & (ns.index < 90)] - result = store.select('ns', [Term('foo>70'), Term('index<90')]) + result = store.select('ns', 'foo>70 and index<90') tm.assert_series_equal(result, expected) # multi-index @@ -1228,7 +1211,7 @@ def test_append_with_different_block_ordering(self): def test_ndim_indexables(self): # test using ndim tables in new ways - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): with ensure_clean_store(self.path) as store: p4d = tm.makePanel4D() @@ -1888,8 +1871,7 @@ def test_append_misc(self): with ensure_clean_store(self.path) as store: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): # unsupported data types for non-tables p4d = tm.makePanel4D() @@ -1930,7 +1912,7 @@ def check(obj, comparator): p = tm.makePanel() check(p, assert_panel_equal) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D() check(p4d, assert_panel4d_equal) @@ -2058,8 +2040,8 @@ def test_table_values_dtypes_roundtrip(self): expected = Series({'float32': 2, 'float64': 1, 'int32': 1, 'bool': 1, 'int16': 1, 'int8': 1, 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) - result.sort() - expected.sort() + result = result.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_table_mixed_dtypes(self): @@ -2098,7 +2080,8 @@ def test_table_mixed_dtypes(self): store.append('p1_mixed', wp) assert_panel_equal(store.select('p1_mixed'), wp) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): + # ndim wp = tm.makePanel4D() wp['obj1'] = 'foo' @@ -2170,7 +2153,6 @@ def test_append_with_timedelta(self): # GH 3577 # append timedelta - from datetime import timedelta df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp( '20130101') + timedelta(days=i, seconds=10) for i in range(10)])) df['C'] = df['A'] - df['B'] @@ -2184,12 +2166,9 @@ def test_append_with_timedelta(self): result = store.select('df') assert_frame_equal(result, df) - result = store.select('df', Term("C<100000")) + result = store.select('df', "C<100000") assert_frame_equal(result, df) - result = store.select('df', Term("C", "<", -3 * 86400)) -
assert_frame_equal(result, df.iloc[3:]) - result = store.select('df', "C<'-3D'") assert_frame_equal(result, df.iloc[3:]) @@ -2432,7 +2411,7 @@ def test_invalid_terms(self): with ensure_clean_store(self.path) as store: - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): df = tm.makeTimeDataFrame() df['string'] = 'foo' @@ -2490,7 +2469,7 @@ def test_terms(self): 0: tm.makeDataFrame(), 1: tm.makeDataFrame()}) - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): p4d = tm.makePanel4D() store.put('p4d', p4d, format='table') @@ -2499,39 +2478,23 @@ def test_terms(self): store.put('wpneg', wpneg, format='table') # panel - result = store.select('wp', [Term( - 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) + result = store.select( + 'wp', "major_axis<'20000108' and minor_axis=['A', 'B']") expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) assert_panel_equal(result, expected) - # with deprecation - result = store.select('wp', [Term( - 'major_axis', '<', "20000108"), Term("minor_axis=['A', 'B']")]) - expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) - tm.assert_panel_equal(result, expected) - # p4d - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): result = store.select('p4d', - [Term('major_axis<"20000108"'), - Term("minor_axis=['A', 'B']"), - Term("items=['ItemA', 'ItemB']")]) + ("major_axis<'20000108' and " + "minor_axis=['A', 'B'] and " + "items=['ItemA', 'ItemB']")) expected = p4d.truncate(after='20000108').reindex( minor=['A', 'B'], items=['ItemA', 'ItemB']) assert_panel4d_equal(result, expected) - # back compat invalid terms - terms = [dict(field='major_axis', op='>', value='20121114'), - [dict(field='major_axis', op='>', value='20121114')], - ["minor_axis=['A','B']", - dict(field='major_axis', op='>', value='20121114')]] - for t in terms: - with tm.assert_produces_warning(expected_warning=FutureWarning, - check_stacklevel=False): - Term(t) - - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): # valid terms terms = [('major_axis=20121114'), @@ -2582,13 +2545,13 @@ def test_term_compat(self): minor_axis=['A', 'B', 'C', 'D']) store.append('wp', wp) - result = store.select('wp', [Term('major_axis>20000102'), - Term('minor_axis', '=', ['A', 'B'])]) + result = store.select( + 'wp', "major_axis>20000102 and minor_axis=['A', 'B']") expected = wp.loc[:, wp.major_axis > Timestamp('20000102'), ['A', 'B']] assert_panel_equal(result, expected) - store.remove('wp', Term('major_axis>20000103')) + store.remove('wp', 'major_axis>20000103') result = store.select('wp') expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :] assert_panel_equal(result, expected) @@ -2602,25 +2565,23 @@ def test_term_compat(self): # stringified datetimes result = store.select( - 'wp', [Term('major_axis', '>', datetime.datetime(2000, 1, 2))]) + 'wp', "major_axis>datetime.datetime(2000, 1, 2)") expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] assert_panel_equal(result, expected) result = store.select( - 'wp', [Term('major_axis', '>', - datetime.datetime(2000, 1, 2, 0, 0))]) + 'wp', "major_axis>datetime.datetime(2000, 1, 2, 0, 0)") expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] assert_panel_equal(result, expected) result = store.select( - 'wp', [Term('major_axis', '=', - [datetime.datetime(2000, 1, 2, 0, 0), - datetime.datetime(2000, 1, 3, 0, 0)])]) + 'wp', ("major_axis=[datetime.datetime(2000, 1, 2, 0, 0), " + 
"datetime.datetime(2000, 1, 3, 0, 0)]")) expected = wp.loc[:, [Timestamp('20000102'), Timestamp('20000103')]] assert_panel_equal(result, expected) - result = store.select('wp', [Term('minor_axis', '=', ['A', 'B'])]) + result = store.select('wp', "minor_axis=['A', 'B']") expected = wp.loc[:, :, ['A', 'B']] assert_panel_equal(result, expected) @@ -2631,8 +2592,7 @@ def test_backwards_compat_without_term_object(self): major_axis=date_range('1/1/2000', periods=5), minor_axis=['A', 'B', 'C', 'D']) store.append('wp', wp) - with assert_produces_warning(expected_warning=FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): result = store.select('wp', [('major_axis>20000102'), ('minor_axis', '=', ['A', 'B'])]) expected = wp.loc[:, @@ -2653,24 +2613,21 @@ def test_backwards_compat_without_term_object(self): store.append('wp', wp) # stringified datetimes - with assert_produces_warning(expected_warning=FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): result = store.select('wp', [('major_axis', '>', datetime.datetime(2000, 1, 2))]) expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] assert_panel_equal(result, expected) - with assert_produces_warning(expected_warning=FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): result = store.select('wp', [('major_axis', '>', datetime.datetime(2000, 1, 2, 0, 0))]) expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] assert_panel_equal(result, expected) - with assert_produces_warning(expected_warning=FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): result = store.select('wp', [('major_axis', '=', @@ -2769,9 +2726,7 @@ def test_tuple_index(self): data = np.random.randn(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) - expected_warning = Warning if PY35 else PerformanceWarning - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): self._check_roundtrip(DF, tm.assert_frame_equal) def test_index_types(self): @@ -2783,30 +2738,23 @@ def test_index_types(self): check_index_type=True, check_series_type=True) - # nose has a deprecation warning in 3.5 - expected_warning = Warning if PY35 else PerformanceWarning - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): ser = Series(values, [datetime.date.today(), 'a']) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): ser = Series(values, [1.23, 'b']) self._check_roundtrip(ser, func) @@ -3054,7 +3002,7 @@ def test_wide_table_dups(self): store.put('panel', wp, format='table') store.put('panel', wp, format='table', append=True) - with tm.assert_produces_warning(expected_warning=DuplicateWarning): + with 
catch_warnings(record=True): recons = store['panel'] assert_panel_equal(recons, wp) @@ -3648,6 +3596,7 @@ def test_retain_index_attributes(self): def test_retain_index_attributes2(self): with ensure_clean_path(self.path) as path: + expected_warning = Warning if PY35 else AttributeConflictWarning with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): @@ -3805,15 +3754,10 @@ def test_frame_select_complex2(self): hist.to_hdf(hh, 'df', mode='w', format='table') - expected = read_hdf(hh, 'df', where=Term('l1', '=', [2, 3, 4])) - - # list like - result = read_hdf(hh, 'df', where=Term( - 'l1', '=', selection.index.tolist())) - assert_frame_equal(result, expected) - l = selection.index.tolist() # noqa + expected = read_hdf(hh, 'df', where="l1=[2, 3, 4]") # sccope with list like + l = selection.index.tolist() # noqa store = HDFStore(hh) result = store.select('df', where='l1=l') assert_frame_equal(result, expected) @@ -3882,12 +3826,12 @@ def test_string_select(self): store.append('df', df, data_columns=['x']) - result = store.select('df', Term('x=none')) + result = store.select('df', 'x=none') expected = df[df.x == 'none'] assert_frame_equal(result, expected) try: - result = store.select('df', Term('x!=none')) + result = store.select('df', 'x!=none') expected = df[df.x != 'none'] assert_frame_equal(result, expected) except Exception as detail: @@ -3899,7 +3843,7 @@ def test_string_select(self): df2.loc[df2.x == '', 'x'] = np.nan store.append('df2', df2, data_columns=['x']) - result = store.select('df2', Term('x!=none')) + result = store.select('df2', 'x!=none') expected = df2[isnull(df2.x)] assert_frame_equal(result, expected) @@ -3909,11 +3853,11 @@ def test_string_select(self): store.append('df3', df, data_columns=['int']) - result = store.select('df3', Term('int=2')) + result = store.select('df3', 'int=2') expected = df[df.int == 2] assert_frame_equal(result, expected) - result = store.select('df3', Term('int!=2')) + result = store.select('df3', 'int!=2') expected = df[df.int != 2] assert_frame_equal(result, expected) @@ -4179,8 +4123,8 @@ def test_select_as_multiple(self): tm.assert_frame_equal(result, expected) # multiple (diff selector) - result = store.select_as_multiple(['df1', 'df2'], where=[Term( - 'index>df2.index[4]')], selector='df2') + result = store.select_as_multiple( + ['df1', 'df2'], where='index>df2.index[4]', selector='df2') expected = concat([df1, df2], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) @@ -4222,13 +4166,13 @@ def test_start_stop_table(self): store.append('df', df) result = store.select( - 'df', [Term("columns=['A']")], start=0, stop=5) + 'df', "columns=['A']", start=0, stop=5) expected = df.loc[0:4, ['A']] tm.assert_frame_equal(result, expected) # out of range result = store.select( - 'df', [Term("columns=['A']")], start=30, stop=40) + 'df', "columns=['A']", start=30, stop=40) self.assertTrue(len(result) == 0) expected = df.loc[30:40, ['A']] tm.assert_frame_equal(result, expected) @@ -4288,11 +4232,11 @@ def test_select_filter_corner(self): with ensure_clean_store(self.path) as store: store.put('frame', df, format='table') - crit = Term('columns=df.columns[:75]') + crit = 'columns=df.columns[:75]' result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) - crit = Term('columns=df.columns[:75:2]') + crit = 'columns=df.columns[:75:2]' result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) @@ -4471,16 +4415,16 @@ def 
test_legacy_table_read(self): with tm.assert_produces_warning( expected_warning=IncompatibilityWarning): self.assertRaises( - Exception, store.select, 'wp1', Term('minor_axis=B')) + Exception, store.select, 'wp1', 'minor_axis=B') df2 = store.select('df2') - result = store.select('df2', Term('index>df2.index[2]')) + result = store.select('df2', 'index>df2.index[2]') expected = df2[df2.index > df2.index[2]] assert_frame_equal(expected, result) def test_legacy_0_10_read(self): # legacy from 0.10 - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): path = tm.get_data_path('legacy_hdf/legacy_0.10.h5') with ensure_clean_store(path, mode='r') as store: str(store) @@ -4504,7 +4448,7 @@ def test_legacy_0_11_read(self): def test_copy(self): - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): def do_copy(f=None, new_f=None, keys=None, propindexes=True, **kwargs): @@ -4646,7 +4590,8 @@ def test_unicode_index(self): unicode_values = [u('\u03c3'), u('\u03c3\u03c3')] - with compat_assert_produces_warning(PerformanceWarning): + # PerformanceWarning + with catch_warnings(record=True): s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) @@ -4914,15 +4859,19 @@ def test_to_hdf_with_object_column_names(self): with self.assertRaises( ValueError, msg=("cannot have non-object label " "DataIndexableCol")): - df.to_hdf(path, 'df', format='table', data_columns=True) + with catch_warnings(record=True): + df.to_hdf(path, 'df', + format='table', + data_columns=True) for index in types_should_run: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = pd.read_hdf( - path, 'df', where="index = [{0}]".format(df.index[0])) - assert(len(result)) + with catch_warnings(record=True): + df.to_hdf(path, 'df', format='table', data_columns=True) + result = pd.read_hdf( + path, 'df', where="index = [{0}]".format(df.index[0])) + assert(len(result)) def test_read_hdf_open_store(self): # GH10330 @@ -5187,7 +5136,7 @@ def test_complex_mixed_table(self): with ensure_clean_store(self.path) as store: store.append('df', df, data_columns=['A', 'B']) - result = store.select('df', where=Term('A>2')) + result = store.select('df', where='A>2') assert_frame_equal(df.loc[df.A > 2], result) with ensure_clean_path(self.path) as path: @@ -5216,7 +5165,7 @@ def test_complex_across_dimensions(self): df = DataFrame({'A': s, 'B': s}) p = Panel({'One': df, 'Two': df}) - with compat_assert_produces_warning(FutureWarning): + with catch_warnings(record=True): p4d = pd.Panel4D({'i': p, 'ii': p}) objs = [df, p, p4d] @@ -5300,7 +5249,7 @@ def test_append_with_timezones_dateutil(self): # select with tz aware expected = df[df.A >= df.A[3]] - result = store.select('df_tz', where=Term('A>=df.A[3]')) + result = store.select('df_tz', where='A>=df.A[3]') self._compare_with_tz(result, expected) # ensure we include dates in DST and STD time here. @@ -5371,7 +5320,7 @@ def test_append_with_timezones_pytz(self): # select with tz aware self._compare_with_tz(store.select( - 'df_tz', where=Term('A>=df.A[3]')), df[df.A >= df.A[3]]) + 'df_tz', where='A>=df.A[3]'), df[df.A >= df.A[3]]) _maybe_remove(store, 'df_tz') # ensure we include dates in DST and STD time here. 
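The same recorder idiom recurs in every file below; as a standalone sketch of how it behaves (``simplefilter`` added here so repeated warnings are not deduplicated):

    import warnings
    from warnings import catch_warnings

    with catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        warnings.warn('Panel4D is deprecated', FutureWarning)
    # the warning is captured in ``w`` instead of reaching the test runner
    assert issubclass(w[-1].category, FutureWarning)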
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 373f590cbf9eb..ab0322abbcf06 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=W0612,E1101 +from warnings import catch_warnings from datetime import datetime import operator @@ -1272,7 +1273,7 @@ def test_apply_slabs(self): f = lambda x: ((x.T - x.mean(1)) / x.std(1)).T # make sure that we don't trigger any warnings - with tm.assert_produces_warning(False): + with catch_warnings(record=True): result = self.panel.apply(f, axis=['items', 'major_axis']) expected = Panel(dict([(ax, f(self.panel.loc[:, :, ax])) for ax in self.panel.minor_axis])) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 2491bac2a7f19..c0511581cd299 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -3,7 +3,7 @@ from pandas.compat import range, lrange import operator import pytest - +from warnings import catch_warnings import numpy as np from pandas.types.common import is_float_dtype @@ -129,17 +129,21 @@ def skipna_wrapper(x): def wrapper(x): return alternative(np.asarray(x)) - for i in range(obj.ndim): - result = f(axis=i, skipna=False) - assert_panel_equal(result, obj.apply(wrapper, axis=i)) + with catch_warnings(record=True): + for i in range(obj.ndim): + result = f(axis=i, skipna=False) + expected = obj.apply(wrapper, axis=i) + assert_panel_equal(result, expected) else: skipna_wrapper = alternative wrapper = alternative - for i in range(obj.ndim): - result = f(axis=i) - if not tm._incompat_bottleneck_version(name): - assert_panel_equal(result, obj.apply(skipna_wrapper, axis=i)) + with catch_warnings(record=True): + for i in range(obj.ndim): + result = f(axis=i) + if not tm._incompat_bottleneck_version(name): + expected = obj.apply(skipna_wrapper, axis=i) + assert_panel_equal(result, expected) self.assertRaises(Exception, f, axis=obj.ndim) @@ -161,32 +165,33 @@ def test_get_axis(self): assert(self.panel4d._get_axis(3) is self.panel4d.minor_axis) def test_set_axis(self): - new_labels = Index(np.arange(len(self.panel4d.labels))) + with catch_warnings(record=True): + new_labels = Index(np.arange(len(self.panel4d.labels))) - # TODO: unused? - # new_items = Index(np.arange(len(self.panel4d.items))) + # TODO: unused? + # new_items = Index(np.arange(len(self.panel4d.items))) - new_major = Index(np.arange(len(self.panel4d.major_axis))) - new_minor = Index(np.arange(len(self.panel4d.minor_axis))) + new_major = Index(np.arange(len(self.panel4d.major_axis))) + new_minor = Index(np.arange(len(self.panel4d.minor_axis))) - # ensure propagate to potentially prior-cached items too + # ensure propagate to potentially prior-cached items too - # TODO: unused? - # label = self.panel4d['l1'] + # TODO: unused? 
+ # label = self.panel4d['l1'] - self.panel4d.labels = new_labels + self.panel4d.labels = new_labels - if hasattr(self.panel4d, '_item_cache'): - self.assertNotIn('l1', self.panel4d._item_cache) - self.assertIs(self.panel4d.labels, new_labels) + if hasattr(self.panel4d, '_item_cache'): + self.assertNotIn('l1', self.panel4d._item_cache) + self.assertIs(self.panel4d.labels, new_labels) - self.panel4d.major_axis = new_major - self.assertIs(self.panel4d[0].major_axis, new_major) - self.assertIs(self.panel4d.major_axis, new_major) + self.panel4d.major_axis = new_major + self.assertIs(self.panel4d[0].major_axis, new_major) + self.assertIs(self.panel4d.major_axis, new_major) - self.panel4d.minor_axis = new_minor - self.assertIs(self.panel4d[0].minor_axis, new_minor) - self.assertIs(self.panel4d.minor_axis, new_minor) + self.panel4d.minor_axis = new_minor + self.assertIs(self.panel4d[0].minor_axis, new_minor) + self.assertIs(self.panel4d.minor_axis, new_minor) def test_get_axis_number(self): self.assertEqual(self.panel4d._get_axis_number('labels'), 0) @@ -201,7 +206,7 @@ def test_get_axis_name(self): self.assertEqual(self.panel4d._get_axis_name(3), 'minor_axis') def test_arith(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._test_op(self.panel4d, operator.add) self._test_op(self.panel4d, operator.sub) self._test_op(self.panel4d, operator.mul) @@ -233,16 +238,16 @@ def test_iteritems(self): len(self.panel4d.labels)) def test_combinePanel4d(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = self.panel4d.add(self.panel4d) self.assert_panel4d_equal(result, self.panel4d * 2) def test_neg(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.assert_panel4d_equal(-self.panel4d, self.panel4d * -1) def test_select(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p = self.panel4d @@ -283,7 +288,7 @@ def test_get_value(self): def test_abs(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = self.panel4d.abs() expected = np.abs(self.panel4d) self.assert_panel4d_equal(result, expected) @@ -306,7 +311,7 @@ def test_getitem(self): def test_delitem_and_pop(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): expected = self.panel4d['l2'] result = self.panel4d.pop('l2') assert_panel_equal(expected, result) @@ -351,40 +356,38 @@ def test_delitem_and_pop(self): assert_panel_equal(panel4dc[0], panel4d[0]) def test_setitem(self): - # LongPanel with one item - # lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() - # self.assertRaises(Exception, self.panel.__setitem__, - # 'ItemE', lp) + with catch_warnings(record=True): - # Panel - p = Panel(dict( - ItemA=self.panel4d['l1']['ItemA'][2:].filter(items=['A', 'B']))) - self.panel4d['l4'] = p - self.panel4d['l5'] = p + # Panel + p = Panel(dict( + ItemA=self.panel4d['l1']['ItemA'][2:].filter( + items=['A', 'B']))) + self.panel4d['l4'] = p + self.panel4d['l5'] = p - p2 = self.panel4d['l4'] + p2 = self.panel4d['l4'] - assert_panel_equal(p, p2.reindex(items=p.items, - major_axis=p.major_axis, - minor_axis=p.minor_axis)) + assert_panel_equal(p, p2.reindex(items=p.items, + major_axis=p.major_axis, + minor_axis=p.minor_axis)) - # scalar - self.panel4d['lG'] = 1 - 
self.panel4d['lE'] = True - self.assertEqual(self.panel4d['lG'].values.dtype, np.int64) - self.assertEqual(self.panel4d['lE'].values.dtype, np.bool_) + # scalar + self.panel4d['lG'] = 1 + self.panel4d['lE'] = True + self.assertEqual(self.panel4d['lG'].values.dtype, np.int64) + self.assertEqual(self.panel4d['lE'].values.dtype, np.bool_) - # object dtype - self.panel4d['lQ'] = 'foo' - self.assertEqual(self.panel4d['lQ'].values.dtype, np.object_) + # object dtype + self.panel4d['lQ'] = 'foo' + self.assertEqual(self.panel4d['lQ'].values.dtype, np.object_) - # boolean dtype - self.panel4d['lP'] = self.panel4d['l1'] > 0 - self.assertEqual(self.panel4d['lP'].values.dtype, np.bool_) + # boolean dtype + self.panel4d['lP'] = self.panel4d['l1'] > 0 + self.assertEqual(self.panel4d['lP'].values.dtype, np.bool_) def test_setitem_by_indexer(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): # Panel panel4dc = self.panel4d.copy() @@ -419,7 +422,7 @@ def func(): def test_setitem_by_indexer_mixed_type(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): # GH 8702 self.panel4d['foo'] = 'bar' @@ -433,7 +436,7 @@ def test_setitem_by_indexer_mixed_type(self): self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) def test_comparisons(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p1 = tm.makePanel4D() p2 = tm.makePanel4D() @@ -467,7 +470,8 @@ def test_major_xs(self): ref = self.panel4d['l1']['ItemA'] idx = self.panel4d.major_axis[5] - xs = self.panel4d.major_xs(idx) + with catch_warnings(record=True): + xs = self.panel4d.major_xs(idx) assert_series_equal(xs['l1'].T['ItemA'], ref.xs(idx), check_names=False) @@ -478,15 +482,17 @@ def test_major_xs(self): def test_major_xs_mixed(self): self.panel4d['l4'] = 'foo' - xs = self.panel4d.major_xs(self.panel4d.major_axis[0]) + with catch_warnings(record=True): + xs = self.panel4d.major_xs(self.panel4d.major_axis[0]) self.assertEqual(xs['l1']['A'].dtype, np.float64) self.assertEqual(xs['l4']['A'].dtype, np.object_) def test_minor_xs(self): ref = self.panel4d['l1']['ItemA'] - idx = self.panel4d.minor_axis[1] - xs = self.panel4d.minor_xs(idx) + with catch_warnings(record=True): + idx = self.panel4d.minor_axis[1] + xs = self.panel4d.minor_xs(idx) assert_series_equal(xs['l1'].T['ItemA'], ref[idx], check_names=False) @@ -496,7 +502,8 @@ def test_minor_xs(self): def test_minor_xs_mixed(self): self.panel4d['l4'] = 'foo' - xs = self.panel4d.minor_xs('D') + with catch_warnings(record=True): + xs = self.panel4d.minor_xs('D') self.assertEqual(xs['l1'].T['ItemA'].dtype, np.float64) self.assertEqual(xs['l4'].T['ItemA'].dtype, np.object_) @@ -512,11 +519,12 @@ def test_xs(self): # mixed-type self.panel4d['strings'] = 'foo' - result = self.panel4d.xs('D', axis=3) + with catch_warnings(record=True): + result = self.panel4d.xs('D', axis=3) self.assertIsNotNone(result.is_copy) def test_getitem_fancy_labels(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): panel4d = self.panel4d labels = panel4d.labels[[1, 0]] @@ -572,7 +580,7 @@ def test_get_value(self): def test_set_value(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): for label in self.panel4d.labels: for item in self.panel4d.items: @@ -603,13 +611,13 @@ def assert_panel4d_equal(cls, x, y): 
assert_panel4d_equal(x, y) def setUp(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.panel4d = tm.makePanel4D(nper=8) add_nans(self.panel4d) def test_constructor(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): panel4d = Panel4D(self.panel4d._data) self.assertIs(panel4d._data, self.panel4d._data) @@ -649,7 +657,7 @@ def test_constructor(self): assert_panel4d_equal(panel4d, expected) def test_constructor_cast(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): zero_filled = self.panel4d.fillna(0) casted = Panel4D(zero_filled._data, dtype=int) @@ -671,7 +679,7 @@ def test_constructor_cast(self): self.assertRaises(ValueError, Panel, data, dtype=float) def test_consolidate(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.assertTrue(self.panel4d._data.is_consolidated()) self.panel4d['foo'] = 1. @@ -681,7 +689,7 @@ def test_consolidate(self): self.assertTrue(panel4d._data.is_consolidated()) def test_ctor_dict(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): l1 = self.panel4d['l1'] l2 = self.panel4d['l2'] @@ -694,7 +702,7 @@ def test_ctor_dict(self): :, :]['ItemB']) def test_constructor_dict_mixed(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): data = dict((k, v.values) for k, v in self.panel4d.iteritems()) result = Panel4D(data) @@ -721,7 +729,7 @@ def test_constructor_dict_mixed(self): self.assertRaises(Exception, Panel4D, data) def test_constructor_resize(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): data = self.panel4d._data labels = self.panel4d.labels[:-1] items = self.panel4d.items[:-1] @@ -747,16 +755,19 @@ def test_constructor_resize(self): assert_panel4d_equal(result, expected) def test_conform(self): + with catch_warnings(record=True): - p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) - conformed = self.panel4d.conform(p) + p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) + conformed = self.panel4d.conform(p) - tm.assert_index_equal(conformed.items, self.panel4d.labels) - tm.assert_index_equal(conformed.major_axis, self.panel4d.major_axis) - tm.assert_index_equal(conformed.minor_axis, self.panel4d.minor_axis) + tm.assert_index_equal(conformed.items, self.panel4d.labels) + tm.assert_index_equal(conformed.major_axis, + self.panel4d.major_axis) + tm.assert_index_equal(conformed.minor_axis, + self.panel4d.minor_axis) def test_reindex(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): ref = self.panel4d['l2'] # labels @@ -810,14 +821,14 @@ def test_reindex(self): self.assertTrue(result is self.panel4d) def test_not_hashable(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4D_empty = Panel4D() self.assertRaises(TypeError, hash, p4D_empty) self.assertRaises(TypeError, hash, self.panel4d) def test_reindex_like(self): # reindex_like - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], items=self.panel4d.items[:-1], major=self.panel4d.major_axis[:-1], 
@@ -826,7 +837,7 @@ def test_reindex_like(self): assert_panel4d_equal(smaller, smaller_like) def test_sort_index(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): import random rlabels = list(self.panel4d.labels) @@ -844,7 +855,7 @@ def test_sort_index(self): def test_fillna(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.assertFalse(np.isfinite(self.panel4d.values).all()) filled = self.panel4d.fillna(0) self.assertTrue(np.isfinite(filled.values).all()) @@ -853,7 +864,7 @@ def test_fillna(self): self.panel4d.fillna, method='pad') def test_swapaxes(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = self.panel4d.swapaxes('labels', 'items') self.assertIs(result.items, self.panel4d.labels) @@ -880,7 +891,7 @@ def test_swapaxes(self): def test_update(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = Panel4D([[[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], @@ -913,12 +924,12 @@ def test_dtypes(self): assert_series_equal(result, expected) def test_repr_empty(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): empty = Panel4D() repr(empty) def test_rename(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): mapper = {'l1': 'foo', 'l2': 'bar', diff --git a/pandas/tests/test_panelnd.py b/pandas/tests/test_panelnd.py index 6a578d85d3ee3..7ecc773cd7bea 100644 --- a/pandas/tests/test_panelnd.py +++ b/pandas/tests/test_panelnd.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from warnings import catch_warnings from pandas.core import panelnd from pandas.core.panel import Panel @@ -13,7 +14,7 @@ def setUp(self): def test_4d_construction(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): # create a 4D Panel4D = panelnd.create_nd_panel_factory( @@ -29,7 +30,7 @@ def test_4d_construction(self): def test_4d_construction_alt(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): # create a 4D Panel4D = panelnd.create_nd_panel_factory( @@ -61,7 +62,7 @@ def test_4d_construction_error(self): def test_5d_construction(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): # create a 4D Panel4D = panelnd.create_nd_panel_factory( diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 392036a99a297..c41924a7987bd 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1,3 +1,4 @@ +from warnings import catch_warnings import numpy as np from numpy.random import randn @@ -1373,7 +1374,7 @@ def df(): concat([panel1, panel3], axis=1, verify_integrity=True) def test_panel4d_concat(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D() p1 = p4d.iloc[:, :, :5, :] @@ -1389,7 +1390,7 @@ def test_panel4d_concat(self): tm.assert_panel4d_equal(result, p4d) def test_panel4d_concat_mixed_type(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D() # if things are a bit misbehaved From 
87df521602c002c18abf5c6a04329e67a95f2f4d Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Fri, 17 Mar 2017 09:08:52 -0400 Subject: [PATCH 307/338] BUG: TZ-aware Series.where() appropriately handles default other=nan (#15701) closes #15701 Author: Christopher C. Aycock Closes #15711 from chrisaycock/GH15701 and squashes the following commits: b77f5ed [Christopher C. Aycock] BUG: TZ-aware Series.where() appropriately handles default other=nan (#15701) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/internals.py | 3 ++- pandas/tests/series/test_indexing.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a56212328f5c3..29d05ddcfb497 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -829,6 +829,7 @@ Bug Fixes - Bug in ``DataFrame.isin`` comparing datetimelike to empty frame (:issue:`15473`) - Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) +- Bug in ``Series.where()`` where TZ-aware data was converted to float representation (:issue:`15701`) - Bug in ``Index`` construction with ``NaN`` elements and integer dtype specified (:issue:`15187`) - Bug in ``Series`` construction with a datetimetz (:issue:`14928`) - Bug in output formatting of a ``MultiIndex`` when names are integers (:issue:`12223`, :issue:`15262`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 0e6c176d950a1..9db01713b05ed 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2440,7 +2440,8 @@ def _try_coerce_args(self, values, other): if isinstance(other, bool): raise TypeError - elif is_null_datelike_scalar(other): + elif (is_null_datelike_scalar(other) or + (is_scalar(other) and isnull(other))): other = tslib.iNaT other_mask = True elif isinstance(other, self._holder): diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 9d93d9f01b161..0b6c0c601ac72 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1385,6 +1385,14 @@ def test_where_datetime(self): expected = Series([10, None], dtype='datetime64[ns]') assert_series_equal(rs, expected) + # GH 15701 + timestamps = ['2016-12-31 12:00:04+00:00', + '2016-12-31 12:00:04.010000+00:00'] + s = Series([pd.Timestamp(t) for t in timestamps]) + rs = s.where(Series([False, True])) + expected = Series([pd.NaT, s[1]]) + assert_series_equal(rs, expected) + def test_where_timedelta(self): s = Series([1, 2], dtype='timedelta64[ns]') expected = Series([10, 10], dtype='timedelta64[ns]') From bcbb6738e3e5994dbc9bc62a27dc451dd1e7b35b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Mar 2017 10:08:48 -0400 Subject: [PATCH 308/338] TST: remove rest of yield warnings (#15708) * TST: remove yield warnings from test_internals.py * TST: remove yield warnings from test_windows.py --- pandas/tests/formats/test_format.py | 38 +- pandas/tests/test_internals.py | 567 ++++++++++++++-------------- pandas/tests/test_window.py | 2 +- pandas/util/testing.py | 14 +- setup.cfg | 1 - 5 files changed, 322 insertions(+), 300 deletions(-) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index b1f163ccf9429..44a7f2b45e759 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -1392,24 +1392,26 @@ def test_repr_html_long(self): assert u('2 columns') in long_repr def test_repr_html_float(self): - 
max_rows = get_option('display.max_rows') - h = max_rows - 1 - df = DataFrame({'idx': np.linspace(-10, 10, h), - 'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}).set_index('idx') - reg_repr = df._repr_html_() - assert '..' not in reg_repr - assert str(40 + h) in reg_repr - - h = max_rows + 1 - df = DataFrame({'idx': np.linspace(-10, 10, h), - 'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}).set_index('idx') - long_repr = df._repr_html_() - assert '..' in long_repr - assert '31' not in long_repr - assert u('%d rows ') % h in long_repr - assert u('2 columns') in long_repr + with option_context('display.max_rows', 60): + + max_rows = get_option('display.max_rows') + h = max_rows - 1 + df = DataFrame({'idx': np.linspace(-10, 10, h), + 'A': np.arange(1, 1 + h), + 'B': np.arange(41, 41 + h)}).set_index('idx') + reg_repr = df._repr_html_() + assert '..' not in reg_repr + assert str(40 + h) in reg_repr + + h = max_rows + 1 + df = DataFrame({'idx': np.linspace(-10, 10, h), + 'A': np.arange(1, 1 + h), + 'B': np.arange(41, 41 + h)}).set_index('idx') + long_repr = df._repr_html_() + assert '..' in long_repr + assert '31' not in long_repr + assert u('%d rows ') % h in long_repr + assert u('2 columns') in long_repr def test_repr_html_long_multiindex(self): max_rows = get_option('display.max_rows') diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index df5e843097514..29920b165d3f6 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -23,11 +23,19 @@ from pandas.compat import zip, u +@pytest.fixture +def mgr(): + return create_mgr( + 'a: f8; b: object; c: f8; d: object; e: f8;' + 'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;' + 'k: M8[ns, US/Eastern]; l: M8[ns, CET];') + + def assert_block_equal(left, right): tm.assert_numpy_array_equal(left.values, right.values) - assert (left.dtype == right.dtype) - tm.assertIsInstance(left.mgr_locs, lib.BlockPlacement) - tm.assertIsInstance(right.mgr_locs, lib.BlockPlacement) + assert left.dtype == right.dtype + assert isinstance(left.mgr_locs, lib.BlockPlacement) + assert isinstance(right.mgr_locs, lib.BlockPlacement) tm.assert_numpy_array_equal(left.mgr_locs.as_array, right.mgr_locs.as_array) @@ -197,11 +205,11 @@ def setUp(self): def test_constructor(self): int32block = create_block('i4', [0]) - self.assertEqual(int32block.dtype, np.int32) + assert int32block.dtype == np.int32 def test_pickle(self): def _check(blk): - assert_block_equal(self.round_trip_pickle(blk), blk) + assert_block_equal(tm.round_trip_pickle(blk), blk) _check(self.fblock) _check(self.cblock) @@ -209,14 +217,14 @@ def _check(blk): _check(self.bool_block) def test_mgr_locs(self): - tm.assertIsInstance(self.fblock.mgr_locs, lib.BlockPlacement) + assert isinstance(self.fblock.mgr_locs, lib.BlockPlacement) tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64)) def test_attrs(self): - self.assertEqual(self.fblock.shape, self.fblock.values.shape) - self.assertEqual(self.fblock.dtype, self.fblock.values.dtype) - self.assertEqual(len(self.fblock), len(self.fblock.values)) + assert self.fblock.shape == self.fblock.values.shape + assert self.fblock.dtype == self.fblock.values.dtype + assert len(self.fblock) == len(self.fblock.values) def test_merge(self): avals = randn(2, 10) @@ -251,26 +259,27 @@ def test_insert(self): def test_delete(self): newb = self.fblock.copy() newb.delete(0) - tm.assertIsInstance(newb.mgr_locs, lib.BlockPlacement) + assert isinstance(newb.mgr_locs, 
lib.BlockPlacement)
         tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
                                     np.array([2, 4], dtype=np.int64))
-        self.assertTrue((newb.values[0] == 1).all())
+        assert (newb.values[0] == 1).all()

         newb = self.fblock.copy()
         newb.delete(1)
-        tm.assertIsInstance(newb.mgr_locs, lib.BlockPlacement)
+        assert isinstance(newb.mgr_locs, lib.BlockPlacement)
         tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
                                     np.array([0, 4], dtype=np.int64))
-        self.assertTrue((newb.values[1] == 2).all())
+        assert (newb.values[1] == 2).all()

         newb = self.fblock.copy()
         newb.delete(2)
         tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
                                     np.array([0, 2], dtype=np.int64))
-        self.assertTrue((newb.values[1] == 1).all())
+        assert (newb.values[1] == 1).all()

         newb = self.fblock.copy()
-        self.assertRaises(Exception, newb.delete, 3)
+        with pytest.raises(Exception):
+            newb.delete(3)

     def test_split_block_at(self):

@@ -279,21 +288,21 @@ def test_split_block_at(self):
             pytest.skip("skipping for now")

         bs = list(self.fblock.split_block_at('a'))
-        self.assertEqual(len(bs), 1)
-        self.assertTrue(np.array_equal(bs[0].items, ['c', 'e']))
+        assert len(bs) == 1
+        assert np.array_equal(bs[0].items, ['c', 'e'])

         bs = list(self.fblock.split_block_at('c'))
-        self.assertEqual(len(bs), 2)
-        self.assertTrue(np.array_equal(bs[0].items, ['a']))
-        self.assertTrue(np.array_equal(bs[1].items, ['e']))
+        assert len(bs) == 2
+        assert np.array_equal(bs[0].items, ['a'])
+        assert np.array_equal(bs[1].items, ['e'])

         bs = list(self.fblock.split_block_at('e'))
-        self.assertEqual(len(bs), 1)
-        self.assertTrue(np.array_equal(bs[0].items, ['a', 'c']))
+        assert len(bs) == 1
+        assert np.array_equal(bs[0].items, ['a', 'c'])

         # bblock = get_bool_ex(['f'])
         # bs = list(bblock.split_block_at('f'))
-        # self.assertEqual(len(bs), 0)
+        # assert len(bs) == 0


 class TestDatetimeBlock(tm.TestCase):

@@ -303,50 +312,44 @@ def test_try_coerce_arg(self):

         # coerce None
         none_coerced = block._try_coerce_args(block.values, None)[2]
-        self.assertTrue(pd.Timestamp(none_coerced) is pd.NaT)
+        assert pd.Timestamp(none_coerced) is pd.NaT

         # coerce different types of date objects
         vals = (np.datetime64('2010-10-10'), datetime(2010, 10, 10),
                 date(2010, 10, 10))
         for val in vals:
             coerced = block._try_coerce_args(block.values, val)[2]
-            self.assertEqual(np.int64, type(coerced))
-            self.assertEqual(pd.Timestamp('2010-10-10'), pd.Timestamp(coerced))
-
+            assert np.int64 == type(coerced)
+            assert pd.Timestamp('2010-10-10') == pd.Timestamp(coerced)

-class TestBlockManager(tm.TestCase):
-    def setUp(self):
-        self.mgr = create_mgr(
-            'a: f8; b: object; c: f8; d: object; e: f8;'
-            'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;'
-            'k: M8[ns, US/Eastern]; l: M8[ns, CET];')
+class TestBlockManager(object):

     def test_constructor_corner(self):
         pass

     def test_attrs(self):
         mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2')
-        self.assertEqual(mgr.nblocks, 2)
-        self.assertEqual(len(mgr), 6)
+        assert mgr.nblocks == 2
+        assert len(mgr) == 6

     def test_is_mixed_dtype(self):
-        self.assertFalse(create_mgr('a,b:f8').is_mixed_type)
-        self.assertFalse(create_mgr('a:f8-1; b:f8-2').is_mixed_type)
+        assert not create_mgr('a,b:f8').is_mixed_type
+        assert not create_mgr('a:f8-1; b:f8-2').is_mixed_type

-        self.assertTrue(create_mgr('a,b:f8; c,d: f4').is_mixed_type)
-        self.assertTrue(create_mgr('a,b:f8; c,d: object').is_mixed_type)
+        assert create_mgr('a,b:f8; c,d: f4').is_mixed_type
+        assert create_mgr('a,b:f8; c,d: object').is_mixed_type

     def test_is_indexed_like(self):
         mgr1 = create_mgr('a,b: f8')
         mgr2 = create_mgr('a:i8; b:bool')
         mgr3 =
create_mgr('a,b,c: f8') - self.assertTrue(mgr1._is_indexed_like(mgr1)) - self.assertTrue(mgr1._is_indexed_like(mgr2)) - self.assertTrue(mgr1._is_indexed_like(mgr3)) + assert mgr1._is_indexed_like(mgr1) + assert mgr1._is_indexed_like(mgr2) + assert mgr1._is_indexed_like(mgr3) - self.assertFalse(mgr1._is_indexed_like(mgr1.get_slice( - slice(-1), axis=1))) + assert not mgr1._is_indexed_like(mgr1.get_slice( + slice(-1), axis=1)) def test_duplicate_ref_loc_failure(self): tmp_mgr = create_mgr('a:bool; a: f8') @@ -355,61 +358,63 @@ def test_duplicate_ref_loc_failure(self): blocks[0].mgr_locs = np.array([0]) blocks[1].mgr_locs = np.array([0]) + # test trying to create block manager with overlapping ref locs - self.assertRaises(AssertionError, BlockManager, blocks, axes) + with pytest.raises(AssertionError): + BlockManager(blocks, axes) blocks[0].mgr_locs = np.array([0]) blocks[1].mgr_locs = np.array([1]) mgr = BlockManager(blocks, axes) mgr.iget(1) - def test_contains(self): - self.assertIn('a', self.mgr) - self.assertNotIn('baz', self.mgr) + def test_contains(self, mgr): + assert 'a' in mgr + assert 'baz' not in mgr - def test_pickle(self): + def test_pickle(self, mgr): - mgr2 = self.round_trip_pickle(self.mgr) - assert_frame_equal(DataFrame(self.mgr), DataFrame(mgr2)) + mgr2 = tm.round_trip_pickle(mgr) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) # share ref_items # self.assertIs(mgr2.blocks[0].ref_items, mgr2.blocks[1].ref_items) # GH2431 - self.assertTrue(hasattr(mgr2, "_is_consolidated")) - self.assertTrue(hasattr(mgr2, "_known_consolidated")) + assert hasattr(mgr2, "_is_consolidated") + assert hasattr(mgr2, "_known_consolidated") # reset to False on load - self.assertFalse(mgr2._is_consolidated) - self.assertFalse(mgr2._known_consolidated) + assert not mgr2._is_consolidated + assert not mgr2._known_consolidated def test_non_unique_pickle(self): mgr = create_mgr('a,a,a:f8') - mgr2 = self.round_trip_pickle(mgr) + mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) mgr = create_mgr('a: f8; a: i8') - mgr2 = self.round_trip_pickle(mgr) + mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) def test_categorical_block_pickle(self): mgr = create_mgr('a: category') - mgr2 = self.round_trip_pickle(mgr) + mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) smgr = create_single_mgr('category') - smgr2 = self.round_trip_pickle(smgr) + smgr2 = tm.round_trip_pickle(smgr) assert_series_equal(Series(smgr), Series(smgr2)) - def test_get_scalar(self): - for item in self.mgr.items: - for i, index in enumerate(self.mgr.axes[1]): - res = self.mgr.get_scalar((item, index)) - exp = self.mgr.get(item, fastpath=False)[i] - self.assertEqual(res, exp) - exp = self.mgr.get(item).internal_values()[i] - self.assertEqual(res, exp) + def test_get_scalar(self, mgr): + for item in mgr.items: + for i, index in enumerate(mgr.axes[1]): + res = mgr.get_scalar((item, index)) + exp = mgr.get(item, fastpath=False)[i] + assert res == exp + exp = mgr.get(item).internal_values()[i] + assert res == exp def test_get(self): cols = Index(list('abc')) @@ -438,30 +443,21 @@ def test_set(self): tm.assert_numpy_array_equal(mgr.get('d').internal_values(), np.array(['foo'] * 3, dtype=np.object_)) - def test_insert(self): - self.mgr.insert(0, 'inserted', np.arange(N)) - - self.assertEqual(self.mgr.items[0], 'inserted') - assert_almost_equal(self.mgr.get('inserted'), np.arange(N)) + def test_set_change_dtype(self, mgr): + mgr.set('baz', 
np.zeros(N, dtype=bool)) - for blk in self.mgr.blocks: - yield self.assertIs, self.mgr.items, blk.ref_items + mgr.set('baz', np.repeat('foo', N)) + assert mgr.get('baz').dtype == np.object_ - def test_set_change_dtype(self): - self.mgr.set('baz', np.zeros(N, dtype=bool)) - - self.mgr.set('baz', np.repeat('foo', N)) - self.assertEqual(self.mgr.get('baz').dtype, np.object_) - - mgr2 = self.mgr.consolidate() + mgr2 = mgr.consolidate() mgr2.set('baz', np.repeat('foo', N)) - self.assertEqual(mgr2.get('baz').dtype, np.object_) + assert mgr2.get('baz').dtype == np.object_ mgr2.set('quux', randn(N).astype(int)) - self.assertEqual(mgr2.get('quux').dtype, np.int_) + assert mgr2.get('quux').dtype == np.int_ mgr2.set('quux', randn(N)) - self.assertEqual(mgr2.get('quux').dtype, np.float_) + assert mgr2.get('quux').dtype == np.float_ def test_set_change_dtype_slice(self): # GH8850 cols = MultiIndex.from_tuples([('1st', 'a'), ('2nd', 'b'), ('3rd', 'c') @@ -469,70 +465,69 @@ def test_set_change_dtype_slice(self): # GH8850 df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols) df['2nd'] = df['2nd'] * 2.0 - self.assertEqual(sorted(df.blocks.keys()), ['float64', 'int64']) + assert sorted(df.blocks.keys()) == ['float64', 'int64'] assert_frame_equal(df.blocks['float64'], DataFrame( [[1.0, 4.0], [4.0, 10.0]], columns=cols[:2])) assert_frame_equal(df.blocks['int64'], DataFrame( [[3], [6]], columns=cols[2:])) - def test_copy(self): - cp = self.mgr.copy(deep=False) - for blk, cp_blk in zip(self.mgr.blocks, cp.blocks): + def test_copy(self, mgr): + cp = mgr.copy(deep=False) + for blk, cp_blk in zip(mgr.blocks, cp.blocks): # view assertion - self.assertTrue(cp_blk.equals(blk)) - self.assertTrue(cp_blk.values.base is blk.values.base) + assert cp_blk.equals(blk) + assert cp_blk.values.base is blk.values.base - cp = self.mgr.copy(deep=True) - for blk, cp_blk in zip(self.mgr.blocks, cp.blocks): + cp = mgr.copy(deep=True) + for blk, cp_blk in zip(mgr.blocks, cp.blocks): # copy assertion we either have a None for a base or in case of # some blocks it is an array (e.g. datetimetz), but was copied - self.assertTrue(cp_blk.equals(blk)) + assert cp_blk.equals(blk) if cp_blk.values.base is not None and blk.values.base is not None: - self.assertFalse(cp_blk.values.base is blk.values.base) + assert cp_blk.values.base is not blk.values.base else: - self.assertTrue(cp_blk.values.base is None and blk.values.base - is None) + assert cp_blk.values.base is None and blk.values.base is None def test_sparse(self): mgr = create_mgr('a: sparse-1; b: sparse-2') # what to test here? - self.assertEqual(mgr.as_matrix().dtype, np.float64) + assert mgr.as_matrix().dtype == np.float64 def test_sparse_mixed(self): mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') - self.assertEqual(len(mgr.blocks), 3) - self.assertIsInstance(mgr, BlockManager) + assert len(mgr.blocks) == 3 + assert isinstance(mgr, BlockManager) # what to test here? 
def test_as_matrix_float(self): mgr = create_mgr('c: f4; d: f2; e: f8') - self.assertEqual(mgr.as_matrix().dtype, np.float64) + assert mgr.as_matrix().dtype == np.float64 mgr = create_mgr('c: f4; d: f2') - self.assertEqual(mgr.as_matrix().dtype, np.float32) + assert mgr.as_matrix().dtype == np.float32 def test_as_matrix_int_bool(self): mgr = create_mgr('a: bool-1; b: bool-2') - self.assertEqual(mgr.as_matrix().dtype, np.bool_) + assert mgr.as_matrix().dtype == np.bool_ mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') - self.assertEqual(mgr.as_matrix().dtype, np.int64) + assert mgr.as_matrix().dtype == np.int64 mgr = create_mgr('c: i4; d: i2; e: u1') - self.assertEqual(mgr.as_matrix().dtype, np.int32) + assert mgr.as_matrix().dtype == np.int32 def test_as_matrix_datetime(self): mgr = create_mgr('h: datetime-1; g: datetime-2') - self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]') + assert mgr.as_matrix().dtype == 'M8[ns]' def test_as_matrix_datetime_tz(self): mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') - self.assertEqual(mgr.get('h').dtype, 'datetime64[ns, US/Eastern]') - self.assertEqual(mgr.get('g').dtype, 'datetime64[ns, CET]') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]' + assert mgr.get('g').dtype == 'datetime64[ns, CET]' + assert mgr.as_matrix().dtype == 'object' def test_astype(self): # coerce all @@ -540,9 +535,9 @@ def test_astype(self): for t in ['float16', 'float32', 'float64', 'int32', 'int64']: t = np.dtype(t) tmgr = mgr.astype(t) - self.assertEqual(tmgr.get('c').dtype.type, t) - self.assertEqual(tmgr.get('d').dtype.type, t) - self.assertEqual(tmgr.get('e').dtype.type, t) + assert tmgr.get('c').dtype.type == t + assert tmgr.get('d').dtype.type == t + assert tmgr.get('e').dtype.type == t # mixed mgr = create_mgr('a,b: object; c: bool; d: datetime;' @@ -550,24 +545,24 @@ def test_astype(self): for t in ['float16', 'float32', 'float64', 'int32', 'int64']: t = np.dtype(t) tmgr = mgr.astype(t, errors='ignore') - self.assertEqual(tmgr.get('c').dtype.type, t) - self.assertEqual(tmgr.get('e').dtype.type, t) - self.assertEqual(tmgr.get('f').dtype.type, t) - self.assertEqual(tmgr.get('g').dtype.type, t) + assert tmgr.get('c').dtype.type == t + assert tmgr.get('e').dtype.type == t + assert tmgr.get('f').dtype.type == t + assert tmgr.get('g').dtype.type == t - self.assertEqual(tmgr.get('a').dtype.type, np.object_) - self.assertEqual(tmgr.get('b').dtype.type, np.object_) + assert tmgr.get('a').dtype.type == np.object_ + assert tmgr.get('b').dtype.type == np.object_ if t != np.int64: - self.assertEqual(tmgr.get('d').dtype.type, np.datetime64) + assert tmgr.get('d').dtype.type == np.datetime64 else: - self.assertEqual(tmgr.get('d').dtype.type, t) + assert tmgr.get('d').dtype.type == t def test_convert(self): def _compare(old_mgr, new_mgr): """ compare the blocks, numeric compare ==, object don't """ old_blocks = set(old_mgr.blocks) new_blocks = set(new_mgr.blocks) - self.assertEqual(len(old_blocks), len(new_blocks)) + assert len(old_blocks) == len(new_blocks) # compare non-numeric for b in old_blocks: @@ -576,7 +571,7 @@ def _compare(old_mgr, new_mgr): if (b.values == nb.values).all(): found = True break - self.assertTrue(found) + assert found for b in new_blocks: found = False @@ -584,7 +579,7 @@ def _compare(old_mgr, new_mgr): if (b.values == ob.values).all(): found = True break - self.assertTrue(found) + assert found # noops mgr = create_mgr('f: i8; g: f8') @@ -601,11 +596,11 @@ def _compare(old_mgr, 
new_mgr):
         mgr.set('b', np.array(['2.'] * N, dtype=np.object_))
         mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_))
         new_mgr = mgr.convert(numeric=True)
-        self.assertEqual(new_mgr.get('a').dtype, np.int64)
-        self.assertEqual(new_mgr.get('b').dtype, np.float64)
-        self.assertEqual(new_mgr.get('foo').dtype, np.object_)
-        self.assertEqual(new_mgr.get('f').dtype, np.int64)
-        self.assertEqual(new_mgr.get('g').dtype, np.float64)
+        assert new_mgr.get('a').dtype == np.int64
+        assert new_mgr.get('b').dtype == np.float64
+        assert new_mgr.get('foo').dtype == np.object_
+        assert new_mgr.get('f').dtype == np.int64
+        assert new_mgr.get('g').dtype == np.float64

         mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;'
                          'i: i8; g: f8; h: f2')
@@ -613,15 +608,15 @@ def _compare(old_mgr, new_mgr):
         mgr.set('b', np.array(['2.'] * N, dtype=np.object_))
         mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_))
         new_mgr = mgr.convert(numeric=True)
-        self.assertEqual(new_mgr.get('a').dtype, np.int64)
-        self.assertEqual(new_mgr.get('b').dtype, np.float64)
-        self.assertEqual(new_mgr.get('foo').dtype, np.object_)
-        self.assertEqual(new_mgr.get('f').dtype, np.int32)
-        self.assertEqual(new_mgr.get('bool').dtype, np.bool_)
-        self.assertEqual(new_mgr.get('dt').dtype.type, np.datetime64)
-        self.assertEqual(new_mgr.get('i').dtype, np.int64)
-        self.assertEqual(new_mgr.get('g').dtype, np.float64)
-        self.assertEqual(new_mgr.get('h').dtype, np.float16)
+        assert new_mgr.get('a').dtype == np.int64
+        assert new_mgr.get('b').dtype == np.float64
+        assert new_mgr.get('foo').dtype == np.object_
+        assert new_mgr.get('f').dtype == np.int32
+        assert new_mgr.get('bool').dtype == np.bool_
+        assert new_mgr.get('dt').dtype.type == np.datetime64
+        assert new_mgr.get('i').dtype == np.int64
+        assert new_mgr.get('g').dtype == np.float64
+        assert new_mgr.get('h').dtype == np.float16

     def test_interleave(self):

@@ -629,49 +624,49 @@ def test_interleave(self):
         for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]',
                       'm8[ns]']:
             mgr = create_mgr('a: {0}'.format(dtype))
-            self.assertEqual(mgr.as_matrix().dtype, dtype)
+            assert mgr.as_matrix().dtype == dtype
             mgr = create_mgr('a: {0}; b: {0}'.format(dtype))
-            self.assertEqual(mgr.as_matrix().dtype, dtype)
+            assert mgr.as_matrix().dtype == dtype

         # will be converted according to the actual dtype of the underlying
         mgr = create_mgr('a: category')
-        self.assertEqual(mgr.as_matrix().dtype, 'i8')
+        assert mgr.as_matrix().dtype == 'i8'
         mgr = create_mgr('a: category; b: category')
-        self.assertEqual(mgr.as_matrix().dtype, 'i8'),
+        assert mgr.as_matrix().dtype == 'i8'
         mgr = create_mgr('a: category; b: category2')
-        self.assertEqual(mgr.as_matrix().dtype, 'object')
+        assert mgr.as_matrix().dtype == 'object'
         mgr = create_mgr('a: category2')
-        self.assertEqual(mgr.as_matrix().dtype, 'object')
+        assert mgr.as_matrix().dtype == 'object'
         mgr = create_mgr('a: category2; b: category2')
-        self.assertEqual(mgr.as_matrix().dtype, 'object')
+        assert mgr.as_matrix().dtype == 'object'

         # combinations
         mgr = create_mgr('a: f8')
-        self.assertEqual(mgr.as_matrix().dtype, 'f8')
+        assert mgr.as_matrix().dtype == 'f8'
         mgr = create_mgr('a: f8; b: i8')
-        self.assertEqual(mgr.as_matrix().dtype, 'f8')
+        assert mgr.as_matrix().dtype == 'f8'
         mgr = create_mgr('a: f4; b: i8')
-        self.assertEqual(mgr.as_matrix().dtype, 'f8')
+        assert mgr.as_matrix().dtype == 'f8'
         mgr = create_mgr('a: f4; b: i8; d: object')
-        self.assertEqual(mgr.as_matrix().dtype, 'object')
+        assert mgr.as_matrix().dtype == 'object'
         mgr = create_mgr('a: bool; b:
i8') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: complex') - self.assertEqual(mgr.as_matrix().dtype, 'complex') + assert mgr.as_matrix().dtype == 'complex' mgr = create_mgr('a: f8; b: category') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: category') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: bool') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: i8') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: m8[ns]; b: bool') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: m8[ns]; b: i8') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: m8[ns]') - self.assertEqual(mgr.as_matrix().dtype, 'object') + assert mgr.as_matrix().dtype == 'object' def test_interleave_non_unique_cols(self): df = DataFrame([ @@ -682,26 +677,26 @@ def test_interleave_non_unique_cols(self): df_unique = df.copy() df_unique.columns = ['x', 'y'] - self.assertEqual(df_unique.values.shape, df.values.shape) + assert df_unique.values.shape == df.values.shape tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) def test_consolidate(self): pass - def test_consolidate_ordering_issues(self): - self.mgr.set('f', randn(N)) - self.mgr.set('d', randn(N)) - self.mgr.set('b', randn(N)) - self.mgr.set('g', randn(N)) - self.mgr.set('h', randn(N)) - - # we have datetime/tz blocks in self.mgr - cons = self.mgr.consolidate() - self.assertEqual(cons.nblocks, 4) - cons = self.mgr.consolidate().get_numeric_data() - self.assertEqual(cons.nblocks, 1) - tm.assertIsInstance(cons.blocks[0].mgr_locs, lib.BlockPlacement) + def test_consolidate_ordering_issues(self, mgr): + mgr.set('f', randn(N)) + mgr.set('d', randn(N)) + mgr.set('b', randn(N)) + mgr.set('g', randn(N)) + mgr.set('h', randn(N)) + + # we have datetime/tz blocks in mgr + cons = mgr.consolidate() + assert cons.nblocks == 4 + cons = mgr.consolidate().get_numeric_data() + assert cons.nblocks == 1 + assert isinstance(cons.blocks[0].mgr_locs, lib.BlockPlacement) tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64)) @@ -714,7 +709,7 @@ def test_reindex_items(self): 'f: bool; g: f8-2') reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0) - self.assertEqual(reindexed.nblocks, 2) + assert reindexed.nblocks == 2 tm.assert_index_equal(reindexed.items, pd.Index(['g', 'c', 'a', 'd'])) assert_almost_equal( mgr.get('g', fastpath=False), reindexed.get('g', fastpath=False)) @@ -748,9 +743,9 @@ def test_multiindex_xs(self): mgr.set_axis(1, index) result = mgr.xs('bar', axis=1) - self.assertEqual(result.shape, (6, 2)) - self.assertEqual(result.axes[1][0], ('bar', 'one')) - self.assertEqual(result.axes[1][1], ('bar', 'two')) + assert result.shape == (6, 2) + assert result.axes[1][0] == ('bar', 'one') + assert result.axes[1][1] == ('bar', 'two') def test_get_numeric_data(self): mgr = create_mgr('int: int; float: float; complex: complex;' @@ -826,11 +821,11 @@ def test_equals(self): # unique items bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') bm2 = 
BlockManager(bm1.blocks[::-1], bm1.axes) - self.assertTrue(bm1.equals(bm2)) + assert bm1.equals(bm2) bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2') bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) - self.assertTrue(bm1.equals(bm2)) + assert bm1.equals(bm2) def test_equals_block_order_different_dtypes(self): # GH 9330 @@ -848,19 +843,19 @@ def test_equals_block_order_different_dtypes(self): block_perms = itertools.permutations(bm.blocks) for bm_perm in block_perms: bm_this = BlockManager(bm_perm, bm.axes) - self.assertTrue(bm.equals(bm_this)) - self.assertTrue(bm_this.equals(bm)) + assert bm.equals(bm_this) + assert bm_this.equals(bm) def test_single_mgr_ctor(self): mgr = create_single_mgr('f8', num_rows=5) - self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.]) + assert mgr.as_matrix().tolist() == [0., 1., 2., 3., 4.] def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') for value in invalid_values: - with self.assertRaises(ValueError): + with pytest.raises(ValueError): bm1.replace_list([1], [2], inplace=value) @@ -918,32 +913,37 @@ def assert_slice_ok(mgr, axis, slobj): for mgr in self.MANAGERS: for ax in range(mgr.ndim): # slice - yield assert_slice_ok, mgr, ax, slice(None) - yield assert_slice_ok, mgr, ax, slice(3) - yield assert_slice_ok, mgr, ax, slice(100) - yield assert_slice_ok, mgr, ax, slice(1, 4) - yield assert_slice_ok, mgr, ax, slice(3, 0, -2) + assert_slice_ok(mgr, ax, slice(None)) + assert_slice_ok(mgr, ax, slice(3)) + assert_slice_ok(mgr, ax, slice(100)) + assert_slice_ok(mgr, ax, slice(1, 4)) + assert_slice_ok(mgr, ax, slice(3, 0, -2)) # boolean mask - yield assert_slice_ok, mgr, ax, np.array([], dtype=np.bool_) - yield (assert_slice_ok, mgr, ax, - np.ones(mgr.shape[ax], dtype=np.bool_)) - yield (assert_slice_ok, mgr, ax, - np.zeros(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok( + mgr, ax, np.array([], dtype=np.bool_)) + assert_slice_ok( + mgr, ax, + np.ones(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok( + mgr, ax, + np.zeros(mgr.shape[ax], dtype=np.bool_)) if mgr.shape[ax] >= 3: - yield (assert_slice_ok, mgr, ax, - np.arange(mgr.shape[ax]) % 3 == 0) - yield (assert_slice_ok, mgr, ax, np.array( - [True, True, False], dtype=np.bool_)) + assert_slice_ok( + mgr, ax, + np.arange(mgr.shape[ax]) % 3 == 0) + assert_slice_ok( + mgr, ax, np.array( + [True, True, False], dtype=np.bool_)) # fancy indexer - yield assert_slice_ok, mgr, ax, [] - yield assert_slice_ok, mgr, ax, lrange(mgr.shape[ax]) + assert_slice_ok(mgr, ax, []) + assert_slice_ok(mgr, ax, lrange(mgr.shape[ax])) if mgr.shape[ax] >= 3: - yield assert_slice_ok, mgr, ax, [0, 1, 2] - yield assert_slice_ok, mgr, ax, [-1, -2, -3] + assert_slice_ok(mgr, ax, [0, 1, 2]) + assert_slice_ok(mgr, ax, [-1, -2, -3]) def test_take(self): def assert_take_ok(mgr, axis, indexer): @@ -957,13 +957,13 @@ def assert_take_ok(mgr, axis, indexer): for mgr in self.MANAGERS: for ax in range(mgr.ndim): # take/fancy indexer - yield assert_take_ok, mgr, ax, [] - yield assert_take_ok, mgr, ax, [0, 0, 0] - yield assert_take_ok, mgr, ax, lrange(mgr.shape[ax]) + assert_take_ok(mgr, ax, []) + assert_take_ok(mgr, ax, [0, 0, 0]) + assert_take_ok(mgr, ax, lrange(mgr.shape[ax])) if mgr.shape[ax] >= 3: - yield assert_take_ok, mgr, ax, [0, 1, 2] - yield assert_take_ok, mgr, ax, [-1, -2, -3] + assert_take_ok(mgr, ax, [0, 1, 2]) + assert_take_ok(mgr, ax, [-1, -2, -3]) def test_reindex_axis(self): def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): @@ -981,25 +981,33 
@@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): for mgr in self.MANAGERS: for ax in range(mgr.ndim): for fill_value in (None, np.nan, 100.): - yield (assert_reindex_axis_is_ok, mgr, ax, - pd.Index([]), fill_value) - yield (assert_reindex_axis_is_ok, mgr, ax, mgr.axes[ax], - fill_value) - yield (assert_reindex_axis_is_ok, mgr, ax, - mgr.axes[ax][[0, 0, 0]], fill_value) - yield (assert_reindex_axis_is_ok, mgr, ax, - pd.Index(['foo', 'bar', 'baz']), fill_value) - yield (assert_reindex_axis_is_ok, mgr, ax, - pd.Index(['foo', mgr.axes[ax][0], 'baz']), - fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + pd.Index([]), fill_value) + assert_reindex_axis_is_ok( + mgr, ax, mgr.axes[ax], + fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + mgr.axes[ax][[0, 0, 0]], fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + pd.Index(['foo', 'bar', 'baz']), fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + pd.Index(['foo', mgr.axes[ax][0], 'baz']), + fill_value) if mgr.shape[ax] >= 3: - yield (assert_reindex_axis_is_ok, mgr, ax, - mgr.axes[ax][:-3], fill_value) - yield (assert_reindex_axis_is_ok, mgr, ax, - mgr.axes[ax][-3::-1], fill_value) - yield (assert_reindex_axis_is_ok, mgr, ax, - mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + mgr.axes[ax][:-3], fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + mgr.axes[ax][-3::-1], fill_value) + assert_reindex_axis_is_ok( + mgr, ax, + mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value) def test_reindex_indexer(self): @@ -1018,33 +1026,41 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, for mgr in self.MANAGERS: for ax in range(mgr.ndim): for fill_value in (None, np.nan, 100.): - yield (assert_reindex_indexer_is_ok, mgr, ax, - pd.Index([]), [], fill_value) - yield (assert_reindex_indexer_is_ok, mgr, ax, - mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value) - yield (assert_reindex_indexer_is_ok, mgr, ax, - pd.Index(['foo'] * mgr.shape[ax]), - np.arange(mgr.shape[ax]), fill_value) - - yield (assert_reindex_indexer_is_ok, mgr, ax, - mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), - fill_value) - yield (assert_reindex_indexer_is_ok, mgr, ax, mgr.axes[ax], - np.arange(mgr.shape[ax])[::-1], fill_value) - yield (assert_reindex_indexer_is_ok, mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [0, 0, 0], fill_value) - yield (assert_reindex_indexer_is_ok, mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [-1, 0, -1], fill_value) - yield (assert_reindex_indexer_is_ok, mgr, ax, - pd.Index(['foo', mgr.axes[ax][0], 'baz']), - [-1, -1, -1], fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + pd.Index([]), [], fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + pd.Index(['foo'] * mgr.shape[ax]), + np.arange(mgr.shape[ax]), fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), + fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, mgr.axes[ax], + np.arange(mgr.shape[ax])[::-1], fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + pd.Index(['foo', 'bar', 'baz']), + [0, 0, 0], fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + pd.Index(['foo', 'bar', 'baz']), + [-1, 0, -1], fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, + pd.Index(['foo', mgr.axes[ax][0], 'baz']), + [-1, -1, -1], fill_value) if mgr.shape[ax] >= 3: - yield (assert_reindex_indexer_is_ok, mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [0, 1, 2], fill_value) + 
assert_reindex_indexer_is_ok( + mgr, ax, + pd.Index(['foo', 'bar', 'baz']), + [0, 1, 2], fill_value) # test_get_slice(slice_like, axis) # take(indexer, axis) @@ -1055,21 +1071,23 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, class TestBlockPlacement(tm.TestCase): def test_slice_len(self): - self.assertEqual(len(BlockPlacement(slice(0, 4))), 4) - self.assertEqual(len(BlockPlacement(slice(0, 4, 2))), 2) - self.assertEqual(len(BlockPlacement(slice(0, 3, 2))), 2) + assert len(BlockPlacement(slice(0, 4))) == 4 + assert len(BlockPlacement(slice(0, 4, 2))) == 2 + assert len(BlockPlacement(slice(0, 3, 2))) == 2 - self.assertEqual(len(BlockPlacement(slice(0, 1, 2))), 1) - self.assertEqual(len(BlockPlacement(slice(1, 0, -1))), 1) + assert len(BlockPlacement(slice(0, 1, 2))) == 1 + assert len(BlockPlacement(slice(1, 0, -1))) == 1 def test_zero_step_raises(self): - self.assertRaises(ValueError, BlockPlacement, slice(1, 1, 0)) - self.assertRaises(ValueError, BlockPlacement, slice(1, 2, 0)) + with pytest.raises(ValueError): + BlockPlacement(slice(1, 1, 0)) + with pytest.raises(ValueError): + BlockPlacement(slice(1, 2, 0)) def test_unbounded_slice_raises(self): def assert_unbounded_slice_error(slc): - self.assertRaisesRegexp(ValueError, "unbounded slice", - lambda: BlockPlacement(slc)) + tm.assertRaisesRegexp(ValueError, "unbounded slice", + lambda: BlockPlacement(slc)) assert_unbounded_slice_error(slice(None, None)) assert_unbounded_slice_error(slice(10, None)) @@ -1087,7 +1105,7 @@ def assert_unbounded_slice_error(slc): def test_not_slice_like_slices(self): def assert_not_slice_like(slc): - self.assertTrue(not BlockPlacement(slc).is_slice_like) + assert not BlockPlacement(slc).is_slice_like assert_not_slice_like(slice(0, 0)) assert_not_slice_like(slice(100, 0)) @@ -1095,12 +1113,12 @@ def assert_not_slice_like(slc): assert_not_slice_like(slice(100, 100, -1)) assert_not_slice_like(slice(0, 100, -1)) - self.assertTrue(not BlockPlacement(slice(0, 0)).is_slice_like) - self.assertTrue(not BlockPlacement(slice(100, 100)).is_slice_like) + assert not BlockPlacement(slice(0, 0)).is_slice_like + assert not BlockPlacement(slice(100, 100)).is_slice_like def test_array_to_slice_conversion(self): def assert_as_slice_equals(arr, slc): - self.assertEqual(BlockPlacement(arr).as_slice, slc) + assert BlockPlacement(arr).as_slice == slc assert_as_slice_equals([0], slice(0, 1, 1)) assert_as_slice_equals([100], slice(100, 101, 1)) @@ -1115,7 +1133,7 @@ def assert_as_slice_equals(arr, slc): def test_not_slice_like_arrays(self): def assert_not_slice_like(arr): - self.assertTrue(not BlockPlacement(arr).is_slice_like) + assert not BlockPlacement(arr).is_slice_like assert_not_slice_like([]) assert_not_slice_like([-1]) @@ -1128,13 +1146,12 @@ def assert_not_slice_like(arr): assert_not_slice_like([1, 1, 1]) def test_slice_iter(self): - self.assertEqual(list(BlockPlacement(slice(0, 3))), [0, 1, 2]) - self.assertEqual(list(BlockPlacement(slice(0, 0))), []) - self.assertEqual(list(BlockPlacement(slice(3, 0))), []) + assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2] + assert list(BlockPlacement(slice(0, 0))) == [] + assert list(BlockPlacement(slice(3, 0))) == [] - self.assertEqual(list(BlockPlacement(slice(3, 0, -1))), [3, 2, 1]) - self.assertEqual(list(BlockPlacement(slice(3, None, -1))), - [3, 2, 1, 0]) + assert list(BlockPlacement(slice(3, 0, -1))) == [3, 2, 1] + assert list(BlockPlacement(slice(3, None, -1))) == [3, 2, 1, 0] def test_slice_to_array_conversion(self): def assert_as_array_equals(slc, 
asarray):
@@ -1152,13 +1169,13 @@ def assert_as_array_equals(slc, asarray):

     def test_blockplacement_add(self):
         bpl = BlockPlacement(slice(0, 5))
-        self.assertEqual(bpl.add(1).as_slice, slice(1, 6, 1))
-        self.assertEqual(bpl.add(np.arange(5)).as_slice, slice(0, 10, 2))
-        self.assertEqual(list(bpl.add(np.arange(5, 0, -1))), [5, 5, 5, 5, 5])
+        assert bpl.add(1).as_slice == slice(1, 6, 1)
+        assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
+        assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]

     def test_blockplacement_add_int(self):
         def assert_add_equals(val, inc, result):
-            self.assertEqual(list(BlockPlacement(val).add(inc)), result)
+            assert list(BlockPlacement(val).add(inc)) == result

         assert_add_equals(slice(0, 0), 0, [])
         assert_add_equals(slice(1, 4), 0, [1, 2, 3])
@@ -1177,9 +1194,9 @@ def assert_add_equals(val, inc, result):
         assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0])
         assert_add_equals([1, 2, 4], -1, [0, 1, 3])

-        self.assertRaises(ValueError,
-                          lambda: BlockPlacement(slice(1, 4)).add(-10))
-        self.assertRaises(ValueError,
-                          lambda: BlockPlacement([1, 2, 4]).add(-10))
-        self.assertRaises(ValueError,
-                          lambda: BlockPlacement(slice(2, None, -1)).add(-1))
+        with pytest.raises(ValueError):
+            BlockPlacement(slice(1, 4)).add(-10)
+        with pytest.raises(ValueError):
+            BlockPlacement([1, 2, 4]).add(-10)
+        with pytest.raises(ValueError):
+            BlockPlacement(slice(2, None, -1)).add(-1)
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 3f2973a9834ca..fe03d7886e661 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -646,7 +646,7 @@ def test_dtypes(self):
             f = self.funcs[f_name]
             d = self.data[d_name]
             exp = self.expects[d_name][f_name]
-            yield self.check_dtypes, f, f_name, d, d_name, exp
+            self.check_dtypes(f, f_name, d, d_name, exp)

     def check_dtypes(self, f, f_name, d, d_name, exp):
         roll = d.rolling(window=self.window)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 154476ce8340a..cf76f4ead77e3 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -93,11 +93,7 @@ def reset_display_options(self):
         pd.reset_option('^display.', silent=True)

     def round_trip_pickle(self, obj, path=None):
-        if path is None:
-            path = u('__%s__.pickle' % rands(10))
-        with ensure_clean(path) as path:
-            pd.to_pickle(obj, path)
-            return pd.read_pickle(path)
+        return round_trip_pickle(obj, path=path)

     # https://docs.python.org/3/library/unittest.html#deprecated-aliases
     def assertEquals(self, *args, **kwargs):
@@ -121,6 +117,14 @@ def assertNotAlmostEquals(self, *args, **kwargs):
                          self.assertNotAlmostEqual)(*args, **kwargs)


+def round_trip_pickle(obj, path=None):
+    if path is None:
+        path = u('__%s__.pickle' % rands(10))
+    with ensure_clean(path) as path:
+        pd.to_pickle(obj, path)
+        return pd.read_pickle(path)
+
+
 def assert_almost_equal(left, right, check_exact=False,
                         check_dtype='equiv', check_less_precise=False,
                         **kwargs):
diff --git a/setup.cfg b/setup.cfg
index 8de4fc955bd50..8b32f0f62fe28 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,7 +24,6 @@ split_penalty_logical_operator = 30

 [tool:pytest]
 # TODO: Change all yield-based (nose-style) fixtures to pytest fixtures
 # Silencing the warning until then
-addopts = --disable-pytest-warnings
 testpaths = pandas
 markers =
     single: mark a test as single cpu only

From ed9b01471d9cebf93a5f4d7668606e5e2bd09153 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Mar 2017 10:43:35 -0400
Subject: [PATCH 309/338] CI: actually use the miniconda cache :>

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index ee093e5bf0e60..af3098b3fc715 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,7 @@ language: python
 # The cache directories will be deleted if anything in ci/ changes in a commit
 cache:
     directories:
-        - $HOME/miniconda # miniconda cache
+        - $HOME/miniconda3 # miniconda cache
         - $HOME/.cache # cython cache
         - $HOME/.ccache # compiler cache

From 89ccfa48c20d2f570b501ff7c5012de8bea67e39 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Mar 2017 10:58:34 -0400
Subject: [PATCH 310/338] TST: only catch deprecation warnings for top-level module imports (#15718)

---
 pandas/tests/api/test_api.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index db92210478182..73222c246fc70 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -249,31 +249,26 @@ def test_groupby(self):
 class TestJson(tm.TestCase):

     def test_deprecation_access_func(self):
-        with tm.assert_produces_warning(FutureWarning,
-                                        check_stacklevel=False):
+        with catch_warnings(record=True):
             pd.json.dumps([])


 class TestParser(tm.TestCase):

     def test_deprecation_access_func(self):
-        with tm.assert_produces_warning(FutureWarning,
-                                        check_stacklevel=False):
+        with catch_warnings(record=True):
             pd.parser.na_values


 class TestLib(tm.TestCase):

     def test_deprecation_access_func(self):
-        with tm.assert_produces_warning(FutureWarning,
-                                        check_stacklevel=False):
+        with catch_warnings(record=True):
             pd.lib.infer_dtype


 class TestTSLib(tm.TestCase):

     def test_deprecation_access_func(self):
-        # some libraries may be imported before we
-        # test and could show the warning
         with catch_warnings(record=True):
             pd.tslib.Timestamp

From 880c9ab8ced4f83ccd78ead9462b80218bc236ad Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Mar 2017 11:08:38 -0400
Subject: [PATCH 311/338] CI: fix cache again

---
 ci/install_travis.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index aad87ea37439f..67b94da120d90 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -35,7 +35,7 @@ echo "[home_dir: $home_dir]"
 # install miniconda
 MINICONDA_DIR="$HOME/miniconda3"

-if [ "$USE_CACHE" ] && [ -d "$MINICONDA_DIR" ]; then
+if [ "$USE_CACHE" ] && [ -d "$MINICONDA_DIR/bin" ]; then
     echo "[Using cached Miniconda install]"

 else
@@ -54,6 +54,9 @@ else
     bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
 fi

+echo "[show conds]"
+which conda
+
 echo "[update conda]"
 conda config --set ssl_verify false || exit 1
 conda config --set always_yes true --set changeps1 false || exit 1

From 8809d9c6940d06e84b5dc79cdd29d002341f8063 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Mar 2017 11:28:44 -0400
Subject: [PATCH 312/338] CI: typo in using ccache

---
 ci/install_travis.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index 67b94da120d90..f0f4bc0873e05 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -54,7 +54,7 @@ else
     bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
 fi

-echo "[show conds]"
+echo "[show conda]"
 which conda

 echo "[update conda]"
@@ -78,7 +78,7 @@ fi
 conda info -a || exit 1

 # set the compiler cache to work
-if [ "$USE_CACHE" ] && "${TRAVIS_OS_NAME}" == "linux" ]; then
+if [ "$USE_CACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then
     echo "[Using ccache]"
     export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH
     gcc=$(which gcc)
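A note on the ``catch_warnings(record=True)`` idiom that the test conversions in patches 308 and 310 above rely on: it intercepts warnings and records them in a list for the duration of the block, whereas the ``tm.assert_produces_warning(FutureWarning)`` calls it replaces also asserted that the warning actually fired. A minimal standard-library sketch of the recording behaviour (the ``noisy`` function here is a made-up stand-in for a deprecated pandas API):

    import warnings

    def noisy():
        # stand-in for a deprecated API such as Panel4D
        warnings.warn("deprecated", FutureWarning)
        return 42

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")  # capture every warning, even repeats
        assert noisy() == 42
        # the recorded warnings stay available for explicit assertions
        assert any(issubclass(x.category, FutureWarning) for x in w)

Recording keeps FutureWarnings out of the test output while still letting a test inspect them when the warning itself is the behaviour under test.
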
From 66829d74b4d986feb96a27f8da95fcac685bf243 Mon Sep 17 00:00:00 2001
From: Lorenzo Cestaro
Date: Fri, 17 Mar 2017 11:41:26 -0400
Subject: [PATCH 313/338] DOC: Update broken link in cookbook.rst #15605

closes #15605

Author: Lorenzo Cestaro

Closes #15720 from LorenzoCestaro/fix-15605 and squashes the following commits:

006eefa [Lorenzo Cestaro] DOC: Update broken link in cookbook.rst #15605

---
 doc/source/cookbook.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 841195de3da47..8fa1283ffc924 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -905,7 +905,7 @@ CSV

 The :ref:`CSV ` docs

-`read_csv in action `__
+`read_csv in action `__

 `appending to a csv `__

From c09838209b9a26ba2e62380ac2d0c7e5c97319cb Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Mar 2017 12:35:27 -0400
Subject: [PATCH 314/338] CI: don't fail if our env already exists in caching

---
 ci/install_travis.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index f0f4bc0873e05..8bf6de3efe7c4 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -47,11 +47,11 @@ else

     # install miniconda
     if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-        wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1
+        time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1
     else
-        wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1
+        time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1
     fi
-    bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
+    time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
 fi

 echo "[show conda]"
 which conda
@@ -90,13 +90,16 @@ else
     echo "[Not using ccache]"
 fi

+echo "[create env]"
+
 # may have installation instructions for this build
 INSTALL="ci/install-${PYTHON_VERSION}${JOB_TAG}.sh"
 if [ -e ${INSTALL} ]; then
     time bash $INSTALL || exit 1
 else
     # create new env
-    time conda create -n pandas python=$PYTHON_VERSION pytest || exit 1
+    # this may already exist, in which case our caching worked
+    time conda create -n pandas python=$PYTHON_VERSION pytest
 fi

 # build deps

From c40dbc80122ab6c865069d8d01d3fc11badb954f Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 17 Mar 2017 13:11:55 -0400
Subject: [PATCH 315/338] CI: remove caching for miniconda itself (#15722)

---
 .travis.yml          |  1 -
 ci/install_travis.sh | 25 ++++++++++---------------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index af3098b3fc715..c1419dd0c5d3b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,6 @@ language: python
 # The cache directories will be deleted if anything in ci/ changes in a commit
 cache:
     directories:
-        - $HOME/miniconda3 # miniconda cache
         - $HOME/.cache # cython cache
         - $HOME/.ccache # compiler cache

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index 8bf6de3efe7c4..e59502b810975 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -35,24 +35,19 @@ echo "[home_dir: $home_dir]"
 # install miniconda
 MINICONDA_DIR="$HOME/miniconda3"

-if [ "$USE_CACHE" ] && [ -d "$MINICONDA_DIR/bin" ]; then
-    echo "[Using cached Miniconda install]"
+echo "[Using clean Miniconda install]"

-else
-    echo "[Using clean Miniconda install]"
-
-    if [ -d "$MINICONDA_DIR" ]; then
-        rm -rf "$MINICONDA_DIR"
-    fi
+if [ -d "$MINICONDA_DIR" ]; then
+    rm -rf
"$MINICONDA_DIR" +fi - # install miniconda - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 - else - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 - fi - time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 +# install miniconda +if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 +else + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 fi +time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 echo "[show conda]" which conda From a5e1ab8653f1229fc0ec234af74309c3c556bd71 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Mar 2017 15:24:35 -0400 Subject: [PATCH 316/338] CI: remove miniconda from actual cache scripts --- ci/check_cache.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/check_cache.sh b/ci/check_cache.sh index cd7a6e8f6b6f9..1c9de7b017569 100755 --- a/ci/check_cache.sh +++ b/ci/check_cache.sh @@ -12,14 +12,12 @@ else ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) fi -MINICONDA_DIR="$HOME/miniconda/" CACHE_DIR="$HOME/.cache/" CCACHE_DIR="$HOME/.ccache/" if [ $ci_changes -ne 0 ] then echo "Files have changed in ci/ deleting all caches" - rm -rf "$MINICONDA_DIR" rm -rf "$CACHE_DIR" rm -rf "$CCACHE_DIR" -fi \ No newline at end of file +fi From 69464392ad38016ffb583e760b57f6be0bd44d08 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Mar 2017 15:39:53 -0400 Subject: [PATCH 317/338] CI: install ccache on osx --- ci/install_travis.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index e59502b810975..610e6255e6832 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -32,6 +32,11 @@ edit_init home_dir=$(pwd) echo "[home_dir: $home_dir]" +if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + echo "[install ccache]" + time brew install ccache +fi + # install miniconda MINICONDA_DIR="$HOME/miniconda3" From eefe2669139875c39b0563aa7953873f3c08bccc Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 17 Mar 2017 20:11:31 -0400 Subject: [PATCH 318/338] MAINT: Drop take_last kwarg from method signatures Affected methods: 1) nlargest 2) nsmallest 3) duplicated 4) drop_duplicates xref #10236, #10792, #10920. 
Author: gfyoung Closes #15710 from gfyoung/create-last-kw-drop and squashes the following commits: b416290 [gfyoung] MAINT: Drop take_last kwarg from method signatures --- asv_bench/benchmarks/series_methods.py | 12 ++--- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/base.py | 6 --- pandas/core/frame.py | 9 +--- pandas/core/groupby.py | 28 +++------- pandas/core/series.py | 10 ---- pandas/indexes/base.py | 4 -- pandas/indexes/category.py | 5 +- pandas/indexes/multi.py | 2 - pandas/tests/frame/test_analytics.py | 75 -------------------------- pandas/tests/groupby/test_groupby.py | 7 +-- pandas/tests/series/test_analytics.py | 33 ------------ pandas/tests/test_base.py | 16 ------ pandas/tests/test_multilevel.py | 11 ---- vb_suite/series_methods.py | 16 +++--- 15 files changed, 26 insertions(+), 209 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 413c4e044fd3a..c66654ee1e006 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -68,8 +68,8 @@ def setup(self): self.s4 = self.s3.astype('object') def time_series_nlargest1(self): - self.s1.nlargest(3, take_last=True) - self.s1.nlargest(3, take_last=False) + self.s1.nlargest(3, keep='last') + self.s1.nlargest(3, keep='first') class series_nlargest2(object): @@ -83,8 +83,8 @@ def setup(self): self.s4 = self.s3.astype('object') def time_series_nlargest2(self): - self.s2.nlargest(3, take_last=True) - self.s2.nlargest(3, take_last=False) + self.s2.nlargest(3, keep='last') + self.s2.nlargest(3, keep='first') class series_nsmallest2(object): @@ -98,8 +98,8 @@ def setup(self): self.s4 = self.s3.astype('object') def time_series_nsmallest2(self): - self.s2.nsmallest(3, take_last=True) - self.s2.nsmallest(3, take_last=False) + self.s2.nsmallest(3, keep='last') + self.s2.nsmallest(3, keep='first') class series_dropna_int64(object): diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 29d05ddcfb497..9cf53300f8cca 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -769,6 +769,7 @@ Removal of prior version deprecations/changes in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). - The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) - The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`) +- The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`) .. _whatsnew_0200.performance: diff --git a/pandas/core/base.py b/pandas/core/base.py index d7c9e35ab6a51..bde60be3ddcff 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1065,7 +1065,6 @@ def searchsorted(self, value, side='left', sorter=None): - ``first`` : Drop duplicates except for the first occurrence. - ``last`` : Drop duplicates except for the last occurrence. - False : Drop all duplicates. 
- take_last : deprecated %(inplace)s Returns @@ -1073,8 +1072,6 @@ def searchsorted(self, value, side='left', sorter=None): deduplicated : %(klass)s """) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1100,15 +1097,12 @@ def drop_duplicates(self, keep='first', inplace=False): - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. - take_last : deprecated Returns ------- duplicated : %(duplicated)s """) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) def duplicated(self, keep='first'): from pandas.core.algorithms import duplicated diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 987eb10101f12..3696051b269e3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,8 +77,7 @@ OrderedDict, raise_with_traceback) from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import (deprecate_kwarg, Appender, - Substitution) +from pandas.util.decorators import Appender, Substitution from pandas.util.validators import validate_bool_kwarg from pandas.tseries.period import PeriodIndex @@ -3169,8 +3168,6 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) def drop_duplicates(self, subset=None, keep='first', inplace=False): """ Return DataFrame with duplicate rows removed, optionally only @@ -3185,7 +3182,6 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): - ``first`` : Drop duplicates except for the first occurrence. - ``last`` : Drop duplicates except for the last occurrence. - False : Drop all duplicates. - take_last : deprecated inplace : boolean, default False Whether to drop duplicates in place or to return a copy @@ -3203,8 +3199,6 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) def duplicated(self, subset=None, keep='first'): """ Return boolean Series denoting duplicate rows, optionally only @@ -3221,7 +3215,6 @@ def duplicated(self, subset=None, keep='first'): - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. 
- take_last : deprecated Returns ------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7a017ffae284c..4095a14aa5970 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -51,8 +51,8 @@ from pandas.core.sorting import (get_group_index_sorter, get_group_index, compress_group_index, get_flattened_iterator, decons_obs_group_ids, get_indexer_dict) -from pandas.util.decorators import (cache_readonly, Substitution, Appender, - make_signature, deprecate_kwarg) +from pandas.util.decorators import (cache_readonly, Substitution, + Appender, make_signature) from pandas.formats.printing import pprint_thing from pandas.util.validators import validate_kwargs @@ -94,12 +94,12 @@ 'corr', 'cov', 'diff', ]) | _plotting_methods -_series_apply_whitelist = \ - (_common_apply_whitelist - set(['boxplot'])) | \ - frozenset(['dtype', 'unique']) +_series_apply_whitelist = ((_common_apply_whitelist | + {'nlargest', 'nsmallest'}) - + {'boxplot'}) | frozenset(['dtype', 'unique']) -_dataframe_apply_whitelist = \ - _common_apply_whitelist | frozenset(['dtypes', 'corrwith']) +_dataframe_apply_whitelist = (_common_apply_whitelist | + frozenset(['dtypes', 'corrwith'])) _cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', 'cummin', 'cummax']) @@ -3025,20 +3025,6 @@ def nunique(self, dropna=True): index=ri, name=self.name) - @deprecate_kwarg('take_last', 'keep', - mapping={True: 'last', False: 'first'}) - @Appender(Series.nlargest.__doc__) - def nlargest(self, n=5, keep='first'): - # ToDo: When we remove deprecate_kwargs, we can remote these methods - # and include nlargest and nsmallest to _series_apply_whitelist - return self.apply(lambda x: x.nlargest(n=n, keep=keep)) - - @deprecate_kwarg('take_last', 'keep', - mapping={True: 'last', False: 'first'}) - @Appender(Series.nsmallest.__doc__) - def nsmallest(self, n=5, keep='first'): - return self.apply(lambda x: x.nsmallest(n=n, keep=keep)) - @Appender(Series.describe.__doc__) def describe(self, **kwargs): self._set_group_selection() diff --git a/pandas/core/series.py b/pandas/core/series.py index cfa25ca1299eb..7ee3b3e8fb519 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1211,14 +1211,10 @@ def unique(self): return result.asobject.values return result - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): return super(Series, self).drop_duplicates(keep=keep, inplace=inplace) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs) def duplicated(self, keep='first'): return super(Series, self).duplicated(keep=keep) @@ -1888,8 +1884,6 @@ def argsort(self, axis=0, kind='quicksort', order=None): np.argsort(values, kind=kind), index=self.index, dtype='int64').__finalize__(self) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) def nlargest(self, n=5, keep='first'): """Return the largest `n` elements. @@ -1901,7 +1895,6 @@ def nlargest(self, n=5, keep='first'): Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. 
- take_last : deprecated Returns ------- @@ -1938,8 +1931,6 @@ def nlargest(self, n=5, keep='first'): return algorithms.select_n_series(self, n=n, keep=keep, method='nlargest') - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) def nsmallest(self, n=5, keep='first'): """Return the smallest `n` elements. @@ -1951,7 +1942,6 @@ def nsmallest(self, n=5, keep='first'): Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. - take_last : deprecated Returns ------- diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 5b942e2565c29..381e4d5caa8ac 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -3500,14 +3500,10 @@ def unique(self): result = super(Index, self).unique() return self._shallow_copy(result) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs) def drop_duplicates(self, keep='first'): return super(Index, self).drop_duplicates(keep=keep) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): return super(Index, self).duplicated(keep=keep) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 923dd4ec785c5..7cfc95de5f538 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -11,8 +11,7 @@ from pandas.types.missing import array_equivalent -from pandas.util.decorators import (Appender, cache_readonly, - deprecate_kwarg) +from pandas.util.decorators import Appender, cache_readonly from pandas.core.config import get_option from pandas.indexes.base import Index, _index_shared_docs import pandas.core.base as base @@ -301,8 +300,6 @@ def unique(self): return self._shallow_copy(result, categories=result.categories, ordered=result.ordered) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas._libs.hashtable import duplicated_int64 diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 1c1609fed1dd1..978492131ca89 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -755,8 +755,6 @@ def f(k, stringify): for k, stringify in zip(key, self._have_mixed_levels)]) return hash_tuples(key) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', - False: 'first'}) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.core.sorting import get_group_index diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6c917444f9f43..4fb1d2222fa06 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1381,12 +1381,6 @@ def test_drop_duplicates(self): tm.assert_frame_equal(result, expected) self.assertEqual(len(result), 0) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df.drop_duplicates('AAA', take_last=True) - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - # multi column expected = df.loc[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(['AAA', 'B'])) @@ -1402,12 +1396,6 @@ def test_drop_duplicates(self): expected = df.loc[[0]] tm.assert_frame_equal(result, expected) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = 
df.drop_duplicates(('AAA', 'B'), take_last=True) - expected = df.loc[[0, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - # consider everything df2 = df.loc[:, ['AAA', 'B', 'C']] @@ -1424,13 +1412,6 @@ def test_drop_duplicates(self): expected = df2.drop_duplicates(['AAA', 'B'], keep=False) tm.assert_frame_equal(result, expected) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df2.drop_duplicates(take_last=True) - with tm.assert_produces_warning(FutureWarning): - expected = df2.drop_duplicates(['AAA', 'B'], take_last=True) - tm.assert_frame_equal(result, expected) - # integers result = df.drop_duplicates('C') expected = df.iloc[[0, 2]] @@ -1529,12 +1510,6 @@ def test_drop_duplicates_tuple(self): self.assertEqual(len(result), 0) tm.assert_frame_equal(result, expected) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df.drop_duplicates(('AA', 'AB'), take_last=True) - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - # multi column expected = df.loc[[0, 1, 2, 3]] result = df.drop_duplicates((('AA', 'AB'), 'B')) @@ -1563,12 +1538,6 @@ def test_drop_duplicates_NA(self): tm.assert_frame_equal(result, expected) self.assertEqual(len(result), 0) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df.drop_duplicates('A', take_last=True) - expected = df.loc[[1, 6, 7]] - tm.assert_frame_equal(result, expected) - # multi column result = df.drop_duplicates(['A', 'B']) expected = df.loc[[0, 2, 3, 6]] @@ -1582,12 +1551,6 @@ def test_drop_duplicates_NA(self): expected = df.loc[[6]] tm.assert_frame_equal(result, expected) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df.drop_duplicates(['A', 'B'], take_last=True) - expected = df.loc[[1, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - # nan df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -1610,12 +1573,6 @@ def test_drop_duplicates_NA(self): tm.assert_frame_equal(result, expected) self.assertEqual(len(result), 0) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df.drop_duplicates('C', take_last=True) - expected = df.loc[[3, 7]] - tm.assert_frame_equal(result, expected) - # multi column result = df.drop_duplicates(['C', 'B']) expected = df.loc[[0, 1, 2, 4]] @@ -1629,12 +1586,6 @@ def test_drop_duplicates_NA(self): expected = df.loc[[1]] tm.assert_frame_equal(result, expected) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - result = df.drop_duplicates(['C', 'B'], take_last=True) - expected = df.loc[[1, 3, 6, 7]] - tm.assert_frame_equal(result, expected) - def test_drop_duplicates_NA_for_take_all(self): # none df = DataFrame({'A': [None, None, 'foo', 'bar', @@ -1697,14 +1648,6 @@ def test_drop_duplicates_inplace(self): tm.assert_frame_equal(result, expected) self.assertEqual(len(df), 0) - # deprecate take_last - df = orig.copy() - with tm.assert_produces_warning(FutureWarning): - df.drop_duplicates('A', take_last=True, inplace=True) - expected = orig.loc[[6, 7]] - result = df - tm.assert_frame_equal(result, expected) - # multi column df = orig.copy() df.drop_duplicates(['A', 'B'], inplace=True) @@ -1724,14 +1667,6 @@ def test_drop_duplicates_inplace(self): result = df tm.assert_frame_equal(result, expected) - # deprecate take_last - df = orig.copy() - with tm.assert_produces_warning(FutureWarning): - df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) - expected = orig.loc[[0, 5, 6, 
7]] - result = df - tm.assert_frame_equal(result, expected) - # consider everything orig2 = orig.loc[:, ['A', 'B', 'C']].copy() @@ -1754,17 +1689,7 @@ def test_drop_duplicates_inplace(self): result = df2 tm.assert_frame_equal(result, expected) - # deprecate take_last - df2 = orig2.copy() - with tm.assert_produces_warning(FutureWarning): - df2.drop_duplicates(take_last=True, inplace=True) - with tm.assert_produces_warning(FutureWarning): - expected = orig2.drop_duplicates(['A', 'B'], take_last=True) - result = df2 - tm.assert_frame_equal(result, expected) - # Rounding - def test_round(self): # GH 2665 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c25974c94bfd1..a355dca3029c7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3816,7 +3816,8 @@ def test_groupby_whitelist(self): 'cov', 'diff', 'unique', - # 'nlargest', 'nsmallest', + 'nlargest', + 'nsmallest', ]) for obj, whitelist in zip((df, s), (df_whitelist, s_whitelist)): @@ -4025,8 +4026,6 @@ def test_nlargest(self): 3, 2, 1, 3, 3, 2 ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) assert_series_equal(gb.nlargest(3, keep='last'), e) - with tm.assert_produces_warning(FutureWarning): - assert_series_equal(gb.nlargest(3, take_last=True), e) def test_nsmallest(self): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) @@ -4044,8 +4043,6 @@ def test_nsmallest(self): 0, 1, 1, 0, 1, 2 ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) assert_series_equal(gb.nsmallest(3, keep='last'), e) - with tm.assert_produces_warning(FutureWarning): - assert_series_equal(gb.nsmallest(3, take_last=True), e) def test_transform_doesnt_clobber_ints(self): # GH 7972 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index c2543581dca50..dc71fafb1094f 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -917,17 +917,6 @@ def test_drop_duplicates(self): sc.drop_duplicates(keep='last', inplace=True) assert_series_equal(sc, s[~expected]) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - assert_series_equal(s.duplicated(take_last=True), expected) - with tm.assert_produces_warning(FutureWarning): - assert_series_equal( - s.drop_duplicates(take_last=True), s[~expected]) - sc = s.copy() - with tm.assert_produces_warning(FutureWarning): - sc.drop_duplicates(take_last=True, inplace=True) - assert_series_equal(sc, s[~expected]) - expected = Series([False, False, True, True]) assert_series_equal(s.duplicated(keep=False), expected) assert_series_equal(s.drop_duplicates(keep=False), s[~expected]) @@ -951,17 +940,6 @@ def test_drop_duplicates(self): sc.drop_duplicates(keep='last', inplace=True) assert_series_equal(sc, s[~expected]) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - assert_series_equal(s.duplicated(take_last=True), expected) - with tm.assert_produces_warning(FutureWarning): - assert_series_equal( - s.drop_duplicates(take_last=True), s[~expected]) - sc = s.copy() - with tm.assert_produces_warning(FutureWarning): - sc.drop_duplicates(take_last=True, inplace=True) - assert_series_equal(sc, s[~expected]) - expected = Series([False, True, True, False, True, True, False]) assert_series_equal(s.duplicated(keep=False), expected) assert_series_equal(s.drop_duplicates(keep=False), s[~expected]) @@ -1443,18 +1421,7 @@ def test_nsmallest_nlargest(self): for s in s_list: assert_series_equal(s.nsmallest(2), 
s.iloc[[2, 1]]) - assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]]) - with tm.assert_produces_warning(FutureWarning): - assert_series_equal( - s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) - - assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]]) - - assert_series_equal(s.nlargest(3, keep='last'), s.iloc[[4, 0, 3]]) - with tm.assert_produces_warning(FutureWarning): - assert_series_equal( - s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]]) empty = s.iloc[0:0] assert_series_equal(s.nsmallest(0), empty) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1d4dddf6477df..68db0d19344b9 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -816,15 +816,6 @@ def test_duplicated_drop_duplicates_index(self): result = idx.drop_duplicates(keep='last') tm.assert_index_equal(result, idx[~expected]) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - duplicated = idx.duplicated(take_last=True) - tm.assert_numpy_array_equal(duplicated, expected) - self.assertTrue(duplicated.dtype == bool) - with tm.assert_produces_warning(FutureWarning): - result = idx.drop_duplicates(take_last=True) - tm.assert_index_equal(result, idx[~expected]) - base = [False] * len(original) + [True, True] base[3] = True base[5] = True @@ -867,13 +858,6 @@ def test_duplicated_drop_duplicates_index(self): tm.assert_series_equal(s.drop_duplicates(keep='last'), s[~np.array(base)]) - # deprecate take_last - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal( - s.duplicated(take_last=True), expected) - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.drop_duplicates(take_last=True), - s[~np.array(base)]) base = [False] * len(original) + [True, True] base[3] = True base[5] = True diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d7b115d808312..fd5421abc89ad 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2037,17 +2037,6 @@ def test_duplicated_drop_duplicates(self): expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2])) tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) - # deprecate take_last - expected = np.array([True, False, False, False, False, False]) - with tm.assert_produces_warning(FutureWarning): - duplicated = idx.duplicated(take_last=True) - tm.assert_numpy_array_equal(duplicated, expected) - self.assertTrue(duplicated.dtype == bool) - expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) - with tm.assert_produces_warning(FutureWarning): - tm.assert_index_equal( - idx.drop_duplicates(take_last=True), expected) - def test_multiindex_set_index(self): # segfault in #3308 d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]} diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py index cd8688495fa09..c545f419c2dec 100644 --- a/vb_suite/series_methods.py +++ b/vb_suite/series_methods.py @@ -12,22 +12,22 @@ s4 = s3.astype('object') """ -series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);' - 's1.nlargest(3, take_last=False)', +series_nlargest1 = Benchmark("s1.nlargest(3, keep='last');" + "s1.nlargest(3, keep='first')", setup, start_date=datetime(2014, 1, 25)) -series_nlargest2 = Benchmark('s2.nlargest(3, take_last=True);' - 's2.nlargest(3, take_last=False)', +series_nlargest2 = Benchmark("s2.nlargest(3, keep='last');" + "s2.nlargest(3, keep='first')", setup, start_date=datetime(2014, 1, 25)) -series_nsmallest2 = Benchmark('s1.nsmallest(3, take_last=True);' - 's1.nsmallest(3, 
take_last=False)', +series_nsmallest2 = Benchmark("s1.nsmallest(3, keep='last');" + "s1.nsmallest(3, keep='first')", setup, start_date=datetime(2014, 1, 25)) -series_nsmallest2 = Benchmark('s2.nsmallest(3, take_last=True);' - 's2.nsmallest(3, take_last=False)', +series_nsmallest2 = Benchmark("s2.nsmallest(3, keep='last');" + "s2.nsmallest(3, keep='first')", setup, start_date=datetime(2014, 1, 25)) From 47943923ba7284994e8abe3f55eebe8ab51e5cd9 Mon Sep 17 00:00:00 2001 From: Jaehoon Hwang Date: Fri, 17 Mar 2017 21:55:38 -0400 Subject: [PATCH 319/338] TST: move pandas/tests/io/test_date_converters.py to pandas/tests/io/parsers/parse_dates.py closes #15519 Author: Jaehoon Hwang Closes #15707 from jaehoonhwang/TST15519 and squashes the following commits: 0b309d3 [Jaehoon Hwang] Fixed frame email and PEP8 ef6e8fa [Jaehoon Hwang] Fixing up few lines and imports e019e95 [Jaehoon Hwang] Imported read_table and using self.readcsv 3eb63c5 [Jaehoon Hwang] TST15519 Moving Unit tests to appropriate file 9b20caa [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' b977615 [Jaehoon Hwang] Merge remote-tracking branch 'pandas-dev/master' --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/io/parser/parse_dates.py | 148 ++++++++++++++++++++++- pandas/tests/io/test_date_converters.py | 150 ------------------------ 3 files changed, 147 insertions(+), 152 deletions(-) delete mode 100644 pandas/tests/io/test_date_converters.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9cf53300f8cca..4949b68d46723 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -723,6 +723,7 @@ Other API Changes - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) - Reorganization of timeseries development tests (:issue:`14854`) +- Reorganization of date converter tests (:issue:`15707`) - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`) - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) - ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index 4cba9276a9d1e..de4e3fbc0d943 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -6,7 +6,7 @@ """ from distutils.version import LooseVersion -from datetime import datetime +from datetime import datetime, date import pytest import numpy as np @@ -19,9 +19,10 @@ import pandas.util.testing as tm import pandas.io.date_converters as conv -from pandas import DataFrame, Series, Index, DatetimeIndex +from pandas import DataFrame, Series, Index, DatetimeIndex, MultiIndex from pandas import compat from pandas.compat import parse_date, StringIO, lrange +from pandas.compat.numpy import np_array_datetime64_compat from pandas.tseries.index import date_range @@ -510,3 +511,146 @@ def test_parse_date_time_multi_level_column_name(self): expected = DataFrame(expected_data, columns=['date_time', ('A', 'a'), ('B', 'b')]) tm.assert_frame_equal(result, expected) + + def test_parse_date_time(self): + dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) + 
times = np.array(['05:07:09', '06:08:00'], dtype=object) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), + datetime(2008, 2, 4, 6, 8, 0)]) + + result = conv.parse_date_time(dates, times) + self.assertTrue((result == expected).all()) + + data = """\ +date, time, a, b +2001-01-05, 10:00:00, 0.0, 10. +2001-01-05, 00:00:00, 1., 11. +""" + datecols = {'date_time': [0, 1]} + df = self.read_csv(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_date_time) + self.assertIn('date_time', df) + self.assertEqual(df.date_time.loc[0], datetime(2001, 1, 5, 10, 0, 0)) + + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, + date_parser=conv.parse_date_time) + + def test_parse_date_fields(self): + years = np.array([2007, 2008]) + months = np.array([1, 2]) + days = np.array([3, 4]) + result = conv.parse_date_fields(years, months, days) + expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) + self.assertTrue((result == expected).all()) + + data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n" + "2001 , 02 , 1 , 11.") + datecols = {'ymd': [0, 1, 2]} + df = self.read_csv(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_date_fields) + self.assertIn('ymd', df) + self.assertEqual(df.ymd.loc[0], datetime(2001, 1, 10)) + + def test_datetime_six_col(self): + years = np.array([2007, 2008]) + months = np.array([1, 2]) + days = np.array([3, 4]) + hours = np.array([5, 6]) + minutes = np.array([7, 8]) + seconds = np.array([9, 0]) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), + datetime(2008, 2, 4, 6, 8, 0)]) + + result = conv.parse_all_fields(years, months, days, + hours, minutes, seconds) + + self.assertTrue((result == expected).all()) + + data = """\ +year, month, day, hour, minute, second, a, b +2001, 01, 05, 10, 00, 0, 0.0, 10. +2001, 01, 5, 10, 0, 00, 1., 11. +""" + datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} + df = self.read_csv(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_all_fields) + self.assertIn('ymdHMS', df) + self.assertEqual(df.ymdHMS.loc[0], datetime(2001, 1, 5, 10, 0, 0)) + + def test_datetime_fractional_seconds(self): + data = """\ +year, month, day, hour, minute, second, a, b +2001, 01, 05, 10, 00, 0.123456, 0.0, 10. +2001, 01, 5, 10, 0, 0.500000, 1., 11. +""" + datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} + df = self.read_csv(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_all_fields) + self.assertIn('ymdHMS', df) + self.assertEqual(df.ymdHMS.loc[0], datetime(2001, 1, 5, 10, 0, 0, + microsecond=123456)) + self.assertEqual(df.ymdHMS.loc[1], datetime(2001, 1, 5, 10, 0, 0, + microsecond=500000)) + + def test_generic(self): + data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
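
Everything this suite exercises reduces to one pattern: hand ``read_csv`` a column grouping via ``parse_dates`` plus a callable via ``date_parser``. A condensed, standalone version of that pattern (inlined data for illustration):

    import pandas as pd
    import pandas.io.date_converters as conv
    from pandas.compat import StringIO

    data = ("date, time, a\n"
            "2001-01-05, 10:00:00, 0.0\n"
            "2001-01-05, 00:00:00, 1.")
    # columns 0 and 1 are combined and handed to the parser as string arrays
    df = pd.read_csv(StringIO(data), header=0,
                     parse_dates={'date_time': [0, 1]},
                     date_parser=conv.parse_date_time)
    # df.date_time.loc[0] -> Timestamp('2001-01-05 10:00:00')
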
+ datecols = {'ym': [0, 1]} + dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1) + df = self.read_csv(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=dateconverter) + self.assertIn('ym', df) + self.assertEqual(df.ym.loc[0], date(2001, 1, 1)) + + def test_dateparser_resolution_if_not_ns(self): + # GH 10245 + data = """\ +date,time,prn,rxstatus +2013-11-03,19:00:00,126,00E80000 +2013-11-03,19:00:00,23,00E80000 +2013-11-03,19:00:00,13,00E80000 +""" + + def date_parser(date, time): + datetime = np_array_datetime64_compat( + date + 'T' + time + 'Z', dtype='datetime64[s]') + return datetime + + df = self.read_csv(StringIO(data), date_parser=date_parser, + parse_dates={'datetime': ['date', 'time']}, + index_col=['datetime', 'prn']) + + datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3, + dtype='datetime64[s]') + df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3}, + index=MultiIndex.from_tuples( + [(datetimes[0], 126), + (datetimes[1], 23), + (datetimes[2], 13)], + names=['datetime', 'prn'])) + tm.assert_frame_equal(df, df_correct) + + def test_parse_date_column_with_empty_string(self): + # GH 6428 + data = """case,opdate + 7,10/18/2006 + 7,10/18/2008 + 621, """ + result = self.read_csv(StringIO(data), parse_dates=['opdate']) + expected_data = [[7, '10/18/2006'], + [7, '10/18/2008'], + [621, ' ']] + expected = DataFrame(expected_data, columns=['case', 'opdate']) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py deleted file mode 100644 index 5b54925c65fbd..0000000000000 --- a/pandas/tests/io/test_date_converters.py +++ /dev/null @@ -1,150 +0,0 @@ -from pandas.compat import StringIO -from datetime import date, datetime - -import numpy as np - -from pandas import DataFrame, MultiIndex -from pandas.io.parsers import (read_csv, read_table) -from pandas.util.testing import assert_frame_equal -import pandas.io.date_converters as conv -import pandas.util.testing as tm -from pandas.compat.numpy import np_array_datetime64_compat - - -class TestConverters(tm.TestCase): - - def setUp(self): - self.years = np.array([2007, 2008]) - self.months = np.array([1, 2]) - self.days = np.array([3, 4]) - self.hours = np.array([5, 6]) - self.minutes = np.array([7, 8]) - self.seconds = np.array([9, 0]) - self.dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) - self.times = np.array(['05:07:09', '06:08:00'], dtype=object) - self.expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) - - def test_parse_date_time(self): - result = conv.parse_date_time(self.dates, self.times) - self.assertTrue((result == self.expected).all()) - - data = """\ -date, time, a, b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. 
-""" - datecols = {'date_time': [0, 1]} - df = read_table(StringIO(data), sep=',', header=0, - parse_dates=datecols, date_parser=conv.parse_date_time) - self.assertIn('date_time', df) - self.assertEqual(df.date_time.loc[0], datetime(2001, 1, 5, 10, 0, 0)) - - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") - - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = read_csv(StringIO(data), header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) - - def test_parse_date_fields(self): - result = conv.parse_date_fields(self.years, self.months, self.days) - expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) - self.assertTrue((result == expected).all()) - - data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n" - "2001 , 02 , 1 , 11.") - datecols = {'ymd': [0, 1, 2]} - df = read_table(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_date_fields) - self.assertIn('ymd', df) - self.assertEqual(df.ymd.loc[0], datetime(2001, 1, 10)) - - def test_datetime_six_col(self): - result = conv.parse_all_fields(self.years, self.months, self.days, - self.hours, self.minutes, self.seconds) - self.assertTrue((result == self.expected).all()) - - data = """\ -year, month, day, hour, minute, second, a, b -2001, 01, 05, 10, 00, 0, 0.0, 10. -2001, 01, 5, 10, 0, 00, 1., 11. -""" - datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} - df = read_table(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_all_fields) - self.assertIn('ymdHMS', df) - self.assertEqual(df.ymdHMS.loc[0], datetime(2001, 1, 5, 10, 0, 0)) - - def test_datetime_fractional_seconds(self): - data = """\ -year, month, day, hour, minute, second, a, b -2001, 01, 05, 10, 00, 0.123456, 0.0, 10. -2001, 01, 5, 10, 0, 0.500000, 1., 11. -""" - datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} - df = read_table(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_all_fields) - self.assertIn('ymdHMS', df) - self.assertEqual(df.ymdHMS.loc[0], datetime(2001, 1, 5, 10, 0, 0, - microsecond=123456)) - self.assertEqual(df.ymdHMS.loc[1], datetime(2001, 1, 5, 10, 0, 0, - microsecond=500000)) - - def test_generic(self): - data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
- datecols = {'ym': [0, 1]} - dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1) - df = read_table(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=dateconverter) - self.assertIn('ym', df) - self.assertEqual(df.ym.loc[0], date(2001, 1, 1)) - - def test_dateparser_resolution_if_not_ns(self): - # issue 10245 - data = """\ -date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(date, time): - datetime = np_array_datetime64_compat( - date + 'T' + time + 'Z', dtype='datetime64[s]') - return datetime - - df = read_csv(StringIO(data), date_parser=date_parser, - parse_dates={'datetime': ['date', 'time']}, - index_col=['datetime', 'prn']) - - datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3, - dtype='datetime64[s]') - df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3}, - index=MultiIndex.from_tuples( - [(datetimes[0], 126), - (datetimes[1], 23), - (datetimes[2], 13)], - names=['datetime', 'prn'])) - assert_frame_equal(df, df_correct) - - def test_parse_date_column_with_empty_string(self): - # GH 6428 - data = """case,opdate - 7,10/18/2006 - 7,10/18/2008 - 621, """ - result = read_csv(StringIO(data), parse_dates=['opdate']) - expected_data = [[7, '10/18/2006'], - [7, '10/18/2008'], - [621, ' ']] - expected = DataFrame(expected_data, columns=['case', 'opdate']) - assert_frame_equal(result, expected) From 7e341905122d44acae2a400bdd57e2c614852602 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Mar 2017 22:51:36 -0400 Subject: [PATCH 320/338] CI: install nomkl to speed building (#15728) CI: use cache on all builds --- .travis.yml | 17 +++++++++-------- ci/install_travis.sh | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index c1419dd0c5d3b..cafe46059e6c0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -74,7 +74,7 @@ matrix: - CLIPBOARD=xsel - COVERAGE=true - CACHE_NAME="35_nslow" -# - USE_CACHE=true # Don't use cache for 35_nslow + - USE_CACHE=true addons: apt: packages: @@ -86,6 +86,7 @@ matrix: - TEST_ARGS="--skip-slow --skip-network" - PANDAS_TESTING_MODE="deprecate" - CONDA_FORGE=true + - USE_CACHE=true addons: apt: packages: @@ -154,13 +155,13 @@ matrix: - USE_CACHE=true - python: 3.5 env: - - PYTHON_VERSION=3.5 - - JOB_NAME: "35_numpy_dev" - - JOB_TAG=_NUMPY_DEV - - TEST_ARGS="--skip-slow --skip-network" - - PANDAS_TESTING_MODE="deprecate" - - CACHE_NAME="35_numpy_dev" - - USE_CACHE=true + - PYTHON_VERSION=3.5 + - JOB_NAME: "35_numpy_dev" + - JOB_TAG=_NUMPY_DEV + - TEST_ARGS="--skip-slow --skip-network" + - PANDAS_TESTING_MODE="deprecate" + - CACHE_NAME="35_numpy_dev" + - USE_CACHE=true - python: 3.5 env: - PYTHON_VERSION=3.5 diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 610e6255e6832..de3b3fb6a464e 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -99,7 +99,7 @@ if [ -e ${INSTALL} ]; then else # create new env # this may already exists, in which case our caching worked - time conda create -n pandas python=$PYTHON_VERSION pytest + time conda create -n pandas python=$PYTHON_VERSION pytest nomkl fi # build deps From 16a45a867c7bea93fbdcce5ec98cb5c03b0c5410 Mon Sep 17 00:00:00 2001 From: "Christopher C. 
Aycock" Date: Fri, 17 Mar 2017 23:32:46 -0400 Subject: [PATCH 321/338] DOC: Fix typos in merge_asof() docstring (#15729) --- pandas/tools/merge.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 261884bba54bd..60d523a8ea539 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -295,7 +295,7 @@ def merge_asof(left, right, on=None, - A "nearest" search selects the row in the right DataFrame whose 'on' key is closest in absolute distance to the left's key. - The default is "backward" and is the compatible in versions below 0.20.0. + The default is "backward" and is compatible in versions below 0.20.0. The direction parameter was added in version 0.20.0 and introduces "forward" and "nearest". @@ -340,13 +340,13 @@ def merge_asof(left, right, on=None, suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right - side, respectively + side, respectively. tolerance : integer or Timedelta, optional, default None - select asof tolerance within this range; must be compatible - to the merge index. + Select asof tolerance within this range; must be compatible + with the merge index. allow_exact_matches : boolean, default True - - If True, allow matching the same 'on' value + - If True, allow matching with the same 'on' value (i.e. less-than-or-equal-to / greater-than-or-equal-to) - If False, don't match the same 'on' value (i.e., stricly less-than / strictly greater-than) From 579710f84e7e9df9cfc486679638d3f38d466444 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Mar 2017 12:01:19 -0400 Subject: [PATCH 322/338] TST: move conftest.py to top-level (#15731) --- pandas/conftest.py => conftest.py | 0 pandas/tests/api/test_api.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename pandas/conftest.py => conftest.py (100%) diff --git a/pandas/conftest.py b/conftest.py similarity index 100% rename from pandas/conftest.py rename to conftest.py diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 73222c246fc70..2972427f1b245 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -29,7 +29,7 @@ class TestPDApi(Base, tm.TestCase): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'locale', 'conftest'] + ignored = ['tests', 'locale'] # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', From 6a5193b5e2e977f756c5773158fd36903e81e6e6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Mar 2017 12:02:09 -0400 Subject: [PATCH 323/338] CI: remove 3.5 appveyor build (#15730) --- appveyor.yml | 7 ------- ci/requirements-3.5-64.run | 13 ------------- 2 files changed, 20 deletions(-) delete mode 100644 ci/requirements-3.5-64.run diff --git a/appveyor.yml b/appveyor.yml index 1c14698430996..5d748ddf1a108 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,13 +30,6 @@ environment: CONDA_PY: "27" CONDA_NPY: "110" - - CONDA_ROOT: "C:\\Miniconda3_64" - PYTHON_VERSION: "3.5" - PYTHON_ARCH: "64" - CONDA_PY: "35" - CONDA_NPY: "111" - - # We always use a 64-bit machine, but can build x86 distributions # with the PYTHON_ARCH variable (which is used by CMD_IN_ENV). 
platform: diff --git a/ci/requirements-3.5-64.run b/ci/requirements-3.5-64.run deleted file mode 100644 index ad66f578d702a..0000000000000 --- a/ci/requirements-3.5-64.run +++ /dev/null @@ -1,13 +0,0 @@ -python-dateutil -pytz -numpy=1.11* -openpyxl -xlsxwriter -xlrd -xlwt -scipy -feather-format -numexpr -pytables -matplotlib -blosc From d7e19f6af5d2fca66fdcfeb831799c1ec478dc7e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Mar 2017 12:50:26 -0400 Subject: [PATCH 324/338] CI: turn on cache for osx (#15733) --- .travis.yml | 1 + ci/submit_cython_cache.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index cafe46059e6c0..88e1655363a4e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ matrix: os: osx compiler: clang osx_image: xcode6.4 + cache: ccache env: - PYTHON_VERSION=3.5 - JOB_NAME: "35_osx" diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh index cfbced4988357..b87acef0ba11c 100755 --- a/ci/submit_cython_cache.sh +++ b/ci/submit_cython_cache.sh @@ -9,7 +9,7 @@ rm -rf $PYX_CACHE_DIR home_dir=$(pwd) -mkdir $PYX_CACHE_DIR +mkdir -p $PYX_CACHE_DIR rsync -Rv $pyx_file_list $PYX_CACHE_DIR echo "pyx files:" From a5ea998e607c2f6c539cc9e21665a6803277ee9a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Mar 2017 16:44:46 -0400 Subject: [PATCH 325/338] Revert "TST: move conftest.py to top-level (#15731)" This reverts commit 6a52c15a4ac7b2228e7f8ca45412cacfe301b040. --- conftest.py => pandas/conftest.py | 0 pandas/tests/api/test_api.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename conftest.py => pandas/conftest.py (100%) diff --git a/conftest.py b/pandas/conftest.py similarity index 100% rename from conftest.py rename to pandas/conftest.py diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 2972427f1b245..73222c246fc70 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -29,7 +29,7 @@ class TestPDApi(Base, tm.TestCase): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'locale'] + ignored = ['tests', 'locale', 'conftest'] # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', From 232228d8977393933aad7262502a0d8d065e1843 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Mar 2017 22:00:00 -0400 Subject: [PATCH 326/338] TST: clean up build testing Author: Jeff Reback Closes #15734 from jreback/build and squashes the following commits: a99b713 [Jeff Reback] suppress the import json warning when generating _version ed7c526 [Jeff Reback] modify install tests 1cc5b67 [Jeff Reback] TST: have the build test exercise pandas.test() --- .travis.yml | 9 ++++---- ci/install_test.sh | 17 --------------- ci/install_travis.sh | 52 +++++++++++++++++++++++++------------------- ci/script_multi.sh | 3 ++- versioneer.py | 4 +++- 5 files changed, 39 insertions(+), 46 deletions(-) delete mode 100755 ci/install_test.sh diff --git a/.travis.yml b/.travis.yml index 88e1655363a4e..705b2380ac697 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,7 +33,6 @@ matrix: - PYTHON_VERSION=3.5 - JOB_NAME: "35_osx" - TEST_ARGS="--skip-slow --skip-network" - - BUILD_TYPE=conda - JOB_TAG=_OSX - TRAVIS_PYTHON_VERSION=3.5 - CACHE_NAME="35_osx" @@ -107,12 +106,12 @@ matrix: - python: 2.7 env: - PYTHON_VERSION=2.7 - - JOB_NAME: "27_build_test_conda" + - JOB_NAME: "27_build_test" - JOB_TAG=_BUILD_TEST - TEST_ARGS="--skip-slow" - FULL_DEPS=true - BUILD_TEST=true - - CACHE_NAME="27_build_test_conda" + - 
CACHE_NAME="27_build_test" - USE_CACHE=true # In allow_failures - python: 3.5 @@ -147,12 +146,12 @@ matrix: - python: 2.7 env: - PYTHON_VERSION=2.7 - - JOB_NAME: "27_build_test_conda" + - JOB_NAME: "27_build_test" - JOB_TAG=_BUILD_TEST - TEST_ARGS="--skip-slow" - FULL_DEPS=true - BUILD_TEST=true - - CACHE_NAME="27_build_test_conda" + - CACHE_NAME="27_build_test" - USE_CACHE=true - python: 3.5 env: diff --git a/ci/install_test.sh b/ci/install_test.sh deleted file mode 100755 index 9ace633d7f39d..0000000000000 --- a/ci/install_test.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -echo "inside $0" - -if [ "$INSTALL_TEST" ]; then - source activate pandas - echo "Starting installation test." - conda uninstall cython || exit 1 - python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1 - pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1 - pytest pandas/tests/test_series.py --junitxml=/tmp/pytest_install.xml -else - echo "Skipping installation test." -fi -RET="$?" - -exit "$RET" diff --git a/ci/install_travis.sh b/ci/install_travis.sh index de3b3fb6a464e..053a2d15a287c 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -131,10 +131,13 @@ fi if [ "$BUILD_TEST" ]; then - # build testing - pip uninstall --yes cython - pip install cython==0.23 - ( python setup.py build_ext --inplace && python setup.py develop ) || true + # build & install testing + echo ["Starting installation test."] + python setup.py clean + python setup.py build_ext --inplace + python setup.py sdist --formats=gztar + conda uninstall cython + pip install dist/*tar.gz || exit 1 else @@ -142,26 +145,31 @@ else echo "[build em]" time python setup.py build_ext --inplace || exit 1 - # we may have run installations - echo "[conda installs]" - REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.run" - if [ -e ${REQ} ]; then - time conda install -n pandas --file=${REQ} || exit 1 - fi +fi - # we may have additional pip installs - echo "[pip installs]" - REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip" - if [ -e ${REQ} ]; then - pip install -r $REQ - fi +# we may have run installations +echo "[conda installs]" +REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.run" +if [ -e ${REQ} ]; then + time conda install -n pandas --file=${REQ} || exit 1 +fi - # may have addtl installation instructions for this build - echo "[addtl installs]" - REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.sh" - if [ -e ${REQ} ]; then - time bash $REQ || exit 1 - fi +# we may have additional pip installs +echo "[pip installs]" +REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip" +if [ -e ${REQ} ]; then + pip install -r $REQ +fi + +# may have addtl installation instructions for this build +echo "[addtl installs]" +REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.sh" +if [ -e ${REQ} ]; then + time bash $REQ || exit 1 +fi + +# finish install if we are not doing a build-testk +if [ -z "$BUILD_TEST" ]; then # remove any installed pandas package # w/o removing anything else diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 41f71fd21f63f..2d1211b2f7b96 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -24,7 +24,8 @@ export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 429496 echo PYTHONHASHSEED=$PYTHONHASHSEED if [ "$BUILD_TEST" ]; then - echo "We are not running pytest as this is simply a build test." 
+ cd /tmp + python -c "import pandas; pandas.test(['-n 2'])" elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/cov.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/cov.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas diff --git a/versioneer.py b/versioneer.py index c010f63e3ead8..104e8e97c6bd6 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1130,7 +1130,9 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. -import json +from warnings import catch_warnings +with catch_warnings(record=True): + import json import sys version_json = ''' From c2e5beb568e300d8676b4ee7e43fe1b023a7e612 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 18 Mar 2017 22:02:20 -0400 Subject: [PATCH 327/338] MAINT: Drop order and sort from pandas objects (#15735) Affect classes: 1) Index 2) Series 2) DataFrame xref gh-10726 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 50 -------------------- pandas/core/series.py | 71 ---------------------------- pandas/indexes/base.py | 11 ----- pandas/tests/frame/test_analytics.py | 20 -------- pandas/tests/frame/test_sorting.py | 6 +-- pandas/tests/indexes/common.py | 6 --- pandas/tests/indexes/test_base.py | 15 ------ pandas/tests/series/test_sorting.py | 15 +----- 9 files changed, 4 insertions(+), 191 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4949b68d46723..680aefc4041fb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -771,6 +771,7 @@ Removal of prior version deprecations/changes - The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) - The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`) - The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`) +- ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`) .. _whatsnew_0200.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3696051b269e3..732d88b47ae2a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3304,56 +3304,6 @@ def trans(v): else: return self._constructor(new_data).__finalize__(self) - def sort(self, columns=None, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last', **kwargs): - """ - DEPRECATED: use :meth:`DataFrame.sort_values` - - Sort DataFrame either by labels (along either axis) or by the values in - column(s) - - Parameters - ---------- - columns : object - Column name(s) in frame. Accepts a column name or a list - for a nested sort. A tuple will be interpreted as the - levels of a multi-index. - ascending : boolean or list, default True - Sort ascending vs. descending. Specify list for multiple sort - orders - axis : {0 or 'index', 1 or 'columns'}, default 0 - Sort index/rows versus columns - inplace : boolean, default False - Sort the DataFrame without creating a new instance - kind : {'quicksort', 'mergesort', 'heapsort'}, optional - This option is only applied when sorting on a single column or - label. 
- na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - - Examples - -------- - >>> result = df.sort(['A', 'B'], ascending=[1, 0]) - - Returns - ------- - sorted : DataFrame - """ - nv.validate_sort(tuple(), kwargs) - - if columns is None: - warnings.warn("sort(....) is deprecated, use sort_index(.....)", - FutureWarning, stacklevel=2) - return self.sort_index(axis=axis, ascending=ascending, - inplace=inplace) - - warnings.warn("sort(columns=....) is deprecated, use " - "sort_values(by=.....)", FutureWarning, stacklevel=2) - return self.sort_values(by=columns, axis=axis, ascending=ascending, - inplace=inplace, kind=kind, - na_position=na_position) - @Appender(_shared_docs['sort_index'] % _shared_doc_kwargs) def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, diff --git a/pandas/core/series.py b/pandas/core/series.py index 7ee3b3e8fb519..4c51ced1845fe 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1777,77 +1777,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return result.__finalize__(self) - def sort(self, axis=0, ascending=True, kind='quicksort', - na_position='last', inplace=True): - """ - DEPRECATED: use :meth:`Series.sort_values(inplace=True)` for INPLACE - sorting - - Sort values and index labels by value. This is an inplace sort by - default. Series.order is the equivalent but returns a new Series. - - Parameters - ---------- - axis : int (can only be zero) - ascending : boolean, default True - Sort ascending. Passing False sorts descending - kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' - Choice of sorting algorithm. See np.sort for more - information. 'mergesort' is the only stable algorithm - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - inplace : boolean, default True - Do operation in place. - - See Also - -------- - Series.sort_values - """ - warnings.warn("sort is deprecated, use sort_values(inplace=True) for " - "INPLACE sorting", FutureWarning, stacklevel=2) - - return self.sort_values(ascending=ascending, kind=kind, - na_position=na_position, inplace=inplace) - - def order(self, na_last=None, ascending=True, kind='quicksort', - na_position='last', inplace=False): - """ - DEPRECATED: use :meth:`Series.sort_values` - - Sorts Series object, by value, maintaining index-value link. - This will return a new Series by default. Series.sort is the equivalent - but as an inplace method. - - Parameters - ---------- - na_last : boolean (optional, default=True)--DEPRECATED; use na_position - Put NaN's at beginning or end - ascending : boolean, default True - Sort ascending. Passing False sorts descending - kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' - Choice of sorting algorithm. See np.sort for more - information. 'mergesort' is the only stable algorithm - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - inplace : boolean, default False - Do operation in place. 
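
Each removed entry point already named its successor in its deprecation text; the one behavioural trap is the default, since ``Series.sort()`` mutated in place while ``sort_values()`` returns a copy. A minimal migration sketch across the three affected classes:

    import pandas as pd

    s = pd.Series([3, 1, 2])
    s.sort_values(inplace=True)           # previously: s.sort()
    r = s.sort_values(ascending=False)    # previously: r = s.order(ascending=False)

    df = pd.DataFrame({'A': [2, 1], 'B': [1, 2]})
    df.sort_values(by='A')                # previously: df.sort(columns='A')

    idx = pd.Index([3, 1, 2])
    idx.sort_values()                     # previously: idx.order()
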
- - Returns - ------- - y : Series - - See Also - -------- - Series.sort_values - """ - warnings.warn("order is deprecated, use sort_values(...)", - FutureWarning, stacklevel=2) - - return self.sort_values(ascending=ascending, kind=kind, - na_position=na_position, inplace=inplace) - def argsort(self, axis=0, kind='quicksort', order=None): """ Overrides ndarray.argsort. Argsorts the value, omitting NA/null values, diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 381e4d5caa8ac..d262ecd818f1d 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1912,17 +1912,6 @@ def sort_values(self, return_indexer=False, ascending=True): else: return sorted_index - def order(self, return_indexer=False, ascending=True): - """ - Return sorted copy of Index - - DEPRECATED: use :meth:`Index.sort_values` - """ - warnings.warn("order is deprecated, use sort_values(...)", - FutureWarning, stacklevel=2) - return self.sort_values(return_indexer=return_indexer, - ascending=ascending) - def sort(self, *args, **kwargs): raise TypeError("cannot sort an Index object in-place, use " "sort_values instead") diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4fb1d2222fa06..735d3786e6a54 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -660,26 +660,6 @@ def test_sem(self): self.assertFalse((result < 0).any()) nanops._USE_BOTTLENECK = True - def test_sort_invalid_kwargs(self): - df = DataFrame([1, 2, 3], columns=['a']) - - msg = r"sort\(\) got an unexpected keyword argument 'foo'" - tm.assertRaisesRegexp(TypeError, msg, df.sort, foo=2) - - # Neither of these should raise an error because they - # are explicit keyword arguments in the signature and - # hence should not be swallowed by the kwargs parameter - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.sort(axis=1) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.sort(kind='mergesort') - - msg = "the 'order' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, df.sort, order=2) - def test_skew(self): tm._skip_if_no_scipy() from scipy.stats import skew diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 7779afdc47b48..5108fc6080866 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -62,11 +62,7 @@ def test_sort(self): frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - frame.sort(columns='A') - with tm.assert_produces_warning(FutureWarning): - frame.sort() + # see gh-9816 with tm.assert_produces_warning(FutureWarning): frame.sortlevel() diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 3581f894e53a3..b1e6bd7520c69 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -346,12 +346,6 @@ def test_sort(self): for ind in self.indices.values(): self.assertRaises(TypeError, ind.sort) - def test_order(self): - for ind in self.indices.values(): - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - ind.order() - def test_mutability(self): for ind in self.indices.values(): if not len(ind): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 05d3478ab0705..7199a38bb7a80 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1808,21 +1808,6 @@ 
def setUp(self): def create_index(self): return self.mixedIndex - def test_order(self): - idx = self.create_index() - # 9816 deprecated - if PY36: - with tm.assertRaisesRegexp(TypeError, "'>' not supported"): - with tm.assert_produces_warning(FutureWarning): - idx.order() - elif PY3: - with tm.assertRaisesRegexp(TypeError, "unorderable types"): - with tm.assert_produces_warning(FutureWarning): - idx.order() - else: - with tm.assert_produces_warning(FutureWarning): - idx.order() - def test_argsort(self): idx = self.create_index() if PY36: diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 590a530a847bd..66ecba960ae0b 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -13,24 +13,13 @@ class TestSeriesSorting(TestData, tm.TestCase): - def test_sort(self): - + def test_sortlevel_deprecated(self): ts = self.ts.copy() - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - ts.sort() # sorts inplace - self.assert_series_equal(ts, self.ts.sort_values()) + # see gh-9816 with tm.assert_produces_warning(FutureWarning): ts.sortlevel() - def test_order(self): - - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - result = self.ts.order() - self.assert_series_equal(result, self.ts.sort_values()) - def test_sort_values(self): # check indexes are reordered corresponding with the values From 8e62346ea422bae2fca7bb18e253bc3aafc6e214 Mon Sep 17 00:00:00 2001 From: John Zwinck Date: Mon, 20 Mar 2017 16:13:52 +0800 Subject: [PATCH 328/338] DOC: Fix typo in docstring param name (#15739) --- pandas/tseries/holiday.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index d3d936693c266..9acb52ebe0e9f 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -365,7 +365,7 @@ def holidays(self, start=None, end=None, return_name=False): ---------- start : starting date, datetime-like, optional end : ending date, datetime-like, optional - return_names : bool, optional + return_name : bool, optional If True, return a series that has dates and holiday names. False will only return a DatetimeIndex of dates. From 5b75cf8998470eacef209439b3b2310ab3874fa5 Mon Sep 17 00:00:00 2001 From: Pankaj Pandey Date: Mon, 20 Mar 2017 09:44:29 -0400 Subject: [PATCH 329/338] BUG: Fix linux clipboard QApplication() creation closes #14372 A Qt application cannot instantiate multiple `QApplication` instances, so we create a new `QApplication` only when the global `QApplication.instance()` is None. Author: Pankaj Pandey Closes #14815 from pankajp/patch-2 and squashes the following commits: 40d70f9 [Pankaj Pandey] BUG: Fix linux clipboard QApplication() creation --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/util/clipboard/clipboards.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 680aefc4041fb..af0d0d7b04475 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -828,7 +828,7 @@ Bug Fixes - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. 
(:issue:`14956`) - +- Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`) - Bug in ``DataFrame.isin`` comparing datetimelike to empty frame (:issue:`15473`) - Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) diff --git a/pandas/util/clipboard/clipboards.py b/pandas/util/clipboard/clipboards.py index f73f4f191d577..bd5528334168f 100644 --- a/pandas/util/clipboard/clipboards.py +++ b/pandas/util/clipboard/clipboards.py @@ -50,7 +50,8 @@ def init_qt_clipboard(): # $DISPLAY should exist from PyQt4.QtGui import QApplication - app = QApplication([]) + # use the global instance if it exists + app = QApplication.instance() or QApplication([]) def copy_qt(text): cb = app.clipboard() From 25b8c49a52d8c00ba5ad7d38d2e1b4153d0becec Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 20 Mar 2017 10:19:18 -0400 Subject: [PATCH 330/338] BUG: replace coerces incorrect dtype closes #12747 Author: sinhrks This patch had conflicts when merged, resolved by Committer: Jeff Reback Closes #12780 from sinhrks/replace_type and squashes the following commits: f9154e8 [sinhrks] remove unnecessary comments 279fdf6 [sinhrks] remove import failure de44877 [sinhrks] BUG: replace coerces incorrect dtype --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/internals.py | 20 +++++++++-- pandas/tests/indexing/test_coercion.py | 50 ++++++++++++++++++++------ pandas/tests/series/test_replace.py | 4 +-- pandas/types/cast.py | 37 ++++++++++++++----- 5 files changed, 88 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index af0d0d7b04475..7c78132232077 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -823,6 +823,7 @@ Bug Fixes - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) +- Bug in ``.replace()`` may result in incorrect dtypes. 
(:issue:`12747`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9db01713b05ed..60684a929889b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1894,8 +1894,11 @@ def convert(self, *args, **kwargs): blocks.append(newb) else: - values = fn( - self.values.ravel(), **fn_kwargs).reshape(self.values.shape) + values = fn(self.values.ravel(), **fn_kwargs) + try: + values = values.reshape(self.values.shape) + except NotImplementedError: + pass blocks.append(make_block(values, ndim=self.ndim, placement=self.mgr_locs)) @@ -3238,6 +3241,16 @@ def comp(s): return _possibly_compare(values, getattr(s, 'asm8', s), operator.eq) + def _cast_scalar(block, scalar): + dtype, val = _infer_dtype_from_scalar(scalar, pandas_dtype=True) + if not is_dtype_equal(block.dtype, dtype): + dtype = _find_common_type([block.dtype, dtype]) + block = block.astype(dtype) + # use original value + val = scalar + + return block, val + masks = [comp(s) for i, s in enumerate(src_list)] result_blocks = [] @@ -3260,7 +3273,8 @@ def comp(s): # particular block m = masks[i][b.mgr_locs.indexer] if m.any(): - new_rb.extend(b.putmask(m, d, inplace=True)) + b, val = _cast_scalar(b, d) + new_rb.extend(b.putmask(m, val, inplace=True)) else: new_rb.append(b) rb = new_rb diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 38f8bb5355a69..df95f563c0832 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1153,12 +1153,27 @@ def setUp(self): self.rep['float64'] = [1.1, 2.2] self.rep['complex128'] = [1 + 1j, 2 + 2j] self.rep['bool'] = [True, False] + self.rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-03')] + + for tz in ['UTC', 'US/Eastern']: + # to test tz => different tz replacement + key = 'datetime64[ns, {0}]'.format(tz) + self.rep[key] = [pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz)] + + self.rep['timedelta64[ns]'] = [pd.Timedelta('1 day'), + pd.Timedelta('2 day')] def _assert_replace_conversion(self, from_key, to_key, how): index = pd.Index([3, 4], name='xxx') obj = pd.Series(self.rep[from_key], index=index, name='yyy') self.assertEqual(obj.dtype, from_key) + if (from_key.startswith('datetime') and to_key.startswith('datetime')): + # different tz, currently mask_missing raises SystemError + return + if how == 'dict': replacer = dict(zip(self.rep[from_key], self.rep[to_key])) elif how == 'series': @@ -1175,17 +1190,12 @@ def _assert_replace_conversion(self, from_key, to_key, how): pytest.skip("windows platform buggy: {0} -> {1}".format (from_key, to_key)) - if ((from_key == 'float64' and - to_key in ('bool', 'int64')) or - + if ((from_key == 'float64' and to_key in ('bool', 'int64')) or (from_key == 'complex128' and to_key in ('bool', 'int64', 'float64')) or - (from_key == 'int64' and - to_key in ('bool')) or - - # TODO_GH12747 The result must be int? - (from_key == 'bool' and to_key == 'int64')): + # GH12747 The result must be int? 
+ (from_key == 'int64' and to_key in ('bool'))): # buggy on 32-bit if tm.is_platform_32bit(): @@ -1248,13 +1258,31 @@ def test_replace_series_bool(self): self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_datetime64(self): - pass + from_key = 'datetime64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='dict') + + from_key = 'datetime64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_datetime64tz(self): - pass + from_key = 'datetime64[ns, US/Eastern]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='dict') + + from_key = 'datetime64[ns, US/Eastern]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_timedelta64(self): - pass + from_key = 'timedelta64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='dict') + + from_key = 'timedelta64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_period(self): pass diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 0acd03316339e..f5a25e93cc82d 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -132,8 +132,8 @@ def check_replace(to_rep, val, expected): tm.assert_series_equal(expected, r) tm.assert_series_equal(expected, sc) - # should NOT upcast to float - e = pd.Series([0, 1, 2, 3, 4]) + # MUST upcast to float + e = pd.Series([0., 1., 2., 3., 4.]) tr, v = [3], [3.0] check_replace(tr, v, e) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 1cd55274b9b49..11a837dd21159 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -21,7 +21,7 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _POSSIBLY_CAST_DTYPES) -from .dtypes import ExtensionDtype +from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .missing import isnull, notnull from .inference import is_list_like @@ -312,8 +312,17 @@ def _maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _infer_dtype_from_scalar(val): - """ interpret the dtype from a scalar """ +def _infer_dtype_from_scalar(val, pandas_dtype=False): + """ + interpret the dtype from a scalar + + Parameters + ---------- + pandas_dtype : bool, default False + whether to infer dtype including pandas extension types. 
+ If False, scalar belongs to pandas extension types is inferred as + object + """ dtype = np.object_ @@ -336,13 +345,20 @@ def _infer_dtype_from_scalar(val): dtype = np.object_ - elif isinstance(val, (np.datetime64, - datetime)) and getattr(val, 'tzinfo', None) is None: - val = lib.Timestamp(val).value - dtype = np.dtype('M8[ns]') + elif isinstance(val, (np.datetime64, datetime)): + val = tslib.Timestamp(val) + if val is tslib.NaT or val.tz is None: + dtype = np.dtype('M8[ns]') + else: + if pandas_dtype: + dtype = DatetimeTZDtype(unit='ns', tz=val.tz) + else: + # return datetimetz as object + return np.object_, val + val = val.value elif isinstance(val, (np.timedelta64, timedelta)): - val = lib.Timedelta(val).value + val = tslib.Timedelta(val).value dtype = np.dtype('m8[ns]') elif is_bool(val): @@ -363,6 +379,11 @@ def _infer_dtype_from_scalar(val): elif is_complex(val): dtype = np.complex_ + elif pandas_dtype: + if lib.is_period(val): + dtype = PeriodDtype(freq=val.freq) + val = val.ordinal + return dtype, val From f139a77179effab587fcdc7bd2424df41e25aa11 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 20 Mar 2017 13:45:49 -0400 Subject: [PATCH 331/338] BUG: tz aware Timestamp field accessors returns local values (#13303) closes #13303 Previously, calling a date/time attribute with Timestamp that's tz aware (e.g. `Timestamp('...', tz='...').dayofyear`) would return the attribute in UTC instead of the local tz. Author: Matt Roeschke Closes #15740 from mroeschke/fix_13303 and squashes the following commits: b78b333 [Matt Roeschke] BUG: tz aware Timestamp field accessors returns local values (#13303) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/_libs/tslib.pyx | 10 +- pandas/tests/indexes/datetimes/test_misc.py | 158 ++++++++++---------- pandas/tests/scalar/test_timestamp.py | 26 ++++ 4 files changed, 117 insertions(+), 78 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7c78132232077..98407aacb993b 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -796,6 +796,7 @@ Bug Fixes ~~~~~~~~~ - Bug in ``Timestamp.replace`` now raises ``TypeError`` when incorrect argument names are given; previously this raised ``ValueError`` (:issue:`15240`) +- Bug in ``Timestamp`` returning UTC based time/date attributes when a timezone was provided (:issue:`13303`) - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8ee92e9fb900d..055534bbdb7ee 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1233,7 +1233,10 @@ cdef class _Timestamp(datetime): return datetime.__sub__(self, other) cpdef _get_field(self, field): - out = get_date_field(np.array([self.value], dtype=np.int64), field) + val = self.value + if self.tz is not None and not _is_utc(self.tz): + val = tz_convert_single(self.value, 'UTC', self.tz) + out = get_date_field(np.array([val], dtype=np.int64), field) return int(out[0]) cpdef _get_start_end_field(self, field): @@ -1241,8 +1244,11 @@ cdef class _Timestamp(datetime): 'startingMonth', self.freq.kwds.get( 'month', 12)) if self.freq else 12 freqstr = self.freqstr if self.freq else None + val = self.value + if self.tz is not None and not _is_utc(self.tz): + val = tz_convert_single(self.value, 'UTC', 
self.tz) out = get_start_end_field( - np.array([self.value], dtype=np.int64), field, freqstr, month_kw) + np.array([val], dtype=np.int64), field, freqstr, month_kw) return out[0] property _repr_base: diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 6b0191edbda5a..e99f1d46637c2 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -172,82 +172,88 @@ def test_normalize(self): class TestDatetime64(tm.TestCase): def test_datetimeindex_accessors(self): - dti = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), periods=365) - - self.assertEqual(dti.year[0], 1998) - self.assertEqual(dti.month[0], 1) - self.assertEqual(dti.day[0], 1) - self.assertEqual(dti.hour[0], 0) - self.assertEqual(dti.minute[0], 0) - self.assertEqual(dti.second[0], 0) - self.assertEqual(dti.microsecond[0], 0) - self.assertEqual(dti.dayofweek[0], 3) - - self.assertEqual(dti.dayofyear[0], 1) - self.assertEqual(dti.dayofyear[120], 121) - - self.assertEqual(dti.weekofyear[0], 1) - self.assertEqual(dti.weekofyear[120], 18) - - self.assertEqual(dti.quarter[0], 1) - self.assertEqual(dti.quarter[120], 2) - - self.assertEqual(dti.days_in_month[0], 31) - self.assertEqual(dti.days_in_month[90], 30) - - self.assertEqual(dti.is_month_start[0], True) - self.assertEqual(dti.is_month_start[1], False) - self.assertEqual(dti.is_month_start[31], True) - self.assertEqual(dti.is_quarter_start[0], True) - self.assertEqual(dti.is_quarter_start[90], True) - self.assertEqual(dti.is_year_start[0], True) - self.assertEqual(dti.is_year_start[364], False) - self.assertEqual(dti.is_month_end[0], False) - self.assertEqual(dti.is_month_end[30], True) - self.assertEqual(dti.is_month_end[31], False) - self.assertEqual(dti.is_month_end[364], True) - self.assertEqual(dti.is_quarter_end[0], False) - self.assertEqual(dti.is_quarter_end[30], False) - self.assertEqual(dti.is_quarter_end[89], True) - self.assertEqual(dti.is_quarter_end[364], True) - self.assertEqual(dti.is_year_end[0], False) - self.assertEqual(dti.is_year_end[364], True) - - # GH 11128 - self.assertEqual(dti.weekday_name[4], u'Monday') - self.assertEqual(dti.weekday_name[5], u'Tuesday') - self.assertEqual(dti.weekday_name[6], u'Wednesday') - self.assertEqual(dti.weekday_name[7], u'Thursday') - self.assertEqual(dti.weekday_name[8], u'Friday') - self.assertEqual(dti.weekday_name[9], u'Saturday') - self.assertEqual(dti.weekday_name[10], u'Sunday') - - self.assertEqual(Timestamp('2016-04-04').weekday_name, u'Monday') - self.assertEqual(Timestamp('2016-04-05').weekday_name, u'Tuesday') - self.assertEqual(Timestamp('2016-04-06').weekday_name, u'Wednesday') - self.assertEqual(Timestamp('2016-04-07').weekday_name, u'Thursday') - self.assertEqual(Timestamp('2016-04-08').weekday_name, u'Friday') - self.assertEqual(Timestamp('2016-04-09').weekday_name, u'Saturday') - self.assertEqual(Timestamp('2016-04-10').weekday_name, u'Sunday') - - self.assertEqual(len(dti.year), 365) - self.assertEqual(len(dti.month), 365) - self.assertEqual(len(dti.day), 365) - self.assertEqual(len(dti.hour), 365) - self.assertEqual(len(dti.minute), 365) - self.assertEqual(len(dti.second), 365) - self.assertEqual(len(dti.microsecond), 365) - self.assertEqual(len(dti.dayofweek), 365) - self.assertEqual(len(dti.dayofyear), 365) - self.assertEqual(len(dti.weekofyear), 365) - self.assertEqual(len(dti.quarter), 365) - self.assertEqual(len(dti.is_month_start), 365) - self.assertEqual(len(dti.is_month_end), 365) - 
self.assertEqual(len(dti.is_quarter_start), 365) - self.assertEqual(len(dti.is_quarter_end), 365) - self.assertEqual(len(dti.is_year_start), 365) - self.assertEqual(len(dti.is_year_end), 365) - self.assertEqual(len(dti.weekday_name), 365) + dti_naive = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), + periods=365) + # GH 13303 + dti_tz = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), + periods=365, tz='US/Eastern') + for dti in [dti_naive, dti_tz]: + + self.assertEqual(dti.year[0], 1998) + self.assertEqual(dti.month[0], 1) + self.assertEqual(dti.day[0], 1) + self.assertEqual(dti.hour[0], 0) + self.assertEqual(dti.minute[0], 0) + self.assertEqual(dti.second[0], 0) + self.assertEqual(dti.microsecond[0], 0) + self.assertEqual(dti.dayofweek[0], 3) + + self.assertEqual(dti.dayofyear[0], 1) + self.assertEqual(dti.dayofyear[120], 121) + + self.assertEqual(dti.weekofyear[0], 1) + self.assertEqual(dti.weekofyear[120], 18) + + self.assertEqual(dti.quarter[0], 1) + self.assertEqual(dti.quarter[120], 2) + + self.assertEqual(dti.days_in_month[0], 31) + self.assertEqual(dti.days_in_month[90], 30) + + self.assertEqual(dti.is_month_start[0], True) + self.assertEqual(dti.is_month_start[1], False) + self.assertEqual(dti.is_month_start[31], True) + self.assertEqual(dti.is_quarter_start[0], True) + self.assertEqual(dti.is_quarter_start[90], True) + self.assertEqual(dti.is_year_start[0], True) + self.assertEqual(dti.is_year_start[364], False) + self.assertEqual(dti.is_month_end[0], False) + self.assertEqual(dti.is_month_end[30], True) + self.assertEqual(dti.is_month_end[31], False) + self.assertEqual(dti.is_month_end[364], True) + self.assertEqual(dti.is_quarter_end[0], False) + self.assertEqual(dti.is_quarter_end[30], False) + self.assertEqual(dti.is_quarter_end[89], True) + self.assertEqual(dti.is_quarter_end[364], True) + self.assertEqual(dti.is_year_end[0], False) + self.assertEqual(dti.is_year_end[364], True) + + # GH 11128 + self.assertEqual(dti.weekday_name[4], u'Monday') + self.assertEqual(dti.weekday_name[5], u'Tuesday') + self.assertEqual(dti.weekday_name[6], u'Wednesday') + self.assertEqual(dti.weekday_name[7], u'Thursday') + self.assertEqual(dti.weekday_name[8], u'Friday') + self.assertEqual(dti.weekday_name[9], u'Saturday') + self.assertEqual(dti.weekday_name[10], u'Sunday') + + self.assertEqual(Timestamp('2016-04-04').weekday_name, u'Monday') + self.assertEqual(Timestamp('2016-04-05').weekday_name, u'Tuesday') + self.assertEqual(Timestamp('2016-04-06').weekday_name, + u'Wednesday') + self.assertEqual(Timestamp('2016-04-07').weekday_name, u'Thursday') + self.assertEqual(Timestamp('2016-04-08').weekday_name, u'Friday') + self.assertEqual(Timestamp('2016-04-09').weekday_name, u'Saturday') + self.assertEqual(Timestamp('2016-04-10').weekday_name, u'Sunday') + + self.assertEqual(len(dti.year), 365) + self.assertEqual(len(dti.month), 365) + self.assertEqual(len(dti.day), 365) + self.assertEqual(len(dti.hour), 365) + self.assertEqual(len(dti.minute), 365) + self.assertEqual(len(dti.second), 365) + self.assertEqual(len(dti.microsecond), 365) + self.assertEqual(len(dti.dayofweek), 365) + self.assertEqual(len(dti.dayofyear), 365) + self.assertEqual(len(dti.weekofyear), 365) + self.assertEqual(len(dti.quarter), 365) + self.assertEqual(len(dti.is_month_start), 365) + self.assertEqual(len(dti.is_month_end), 365) + self.assertEqual(len(dti.is_quarter_start), 365) + self.assertEqual(len(dti.is_quarter_end), 365) + self.assertEqual(len(dti.is_year_start), 365) + self.assertEqual(len(dti.is_year_end), 
365) + self.assertEqual(len(dti.weekday_name), 365) dti = DatetimeIndex(freq='BQ-FEB', start=datetime(1998, 1, 1), periods=4) diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index d5d92dcf96eab..082f0fa9c40d5 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -550,6 +550,32 @@ def check(value, equal): check(ts.daysinmonth, 31) check(ts.daysinmonth, 31) + # GH 13303 + ts = Timestamp('2014-12-31 23:59:00-05:00', tz='US/Eastern') + check(ts.year, 2014) + check(ts.month, 12) + check(ts.day, 31) + check(ts.hour, 23) + check(ts.minute, 59) + check(ts.second, 0) + self.assertRaises(AttributeError, lambda: ts.millisecond) + check(ts.microsecond, 0) + check(ts.nanosecond, 0) + check(ts.dayofweek, 2) + check(ts.quarter, 4) + check(ts.dayofyear, 365) + check(ts.week, 1) + check(ts.daysinmonth, 31) + + ts = Timestamp('2014-01-01 00:00:00+01:00') + starts = ['is_month_start', 'is_quarter_start', 'is_year_start'] + for start in starts: + self.assertTrue(getattr(ts, start)) + ts = Timestamp('2014-12-31 23:59:59+01:00') + ends = ['is_month_end', 'is_year_end', 'is_quarter_end'] + for end in ends: + self.assertTrue(getattr(ts, end)) + def test_nat_fields(self): # GH 10050 ts = Timestamp('NaT') From fc0d670ccf4994d09a7592d03c9cfa29244ffb7e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 20 Mar 2017 15:38:35 -0400 Subject: [PATCH 332/338] DOC: Patch new flake8 command grep The grep was initially matching to "pandas," which is incorrect because that was also matching files containing "pandas" in the name but that were not in the main `pandas` directory (e.g. performance test code). This change enforces that we match to any Python files in the main `pandas` directory. Also picked up compatibility issue with OSX, in which the `-r` flag does not exist. However, `xargs` terminates if the argument list is empty, which was the whole point of passing in `-r` in the first place. Follow-up to #15712 Author: gfyoung Closes #15749 from gfyoung/flake8-diff-patch and squashes the following commits: d1543b5 [gfyoung] COMPAT: Do not run xargs with -r on OSX da57857 [gfyoung] DOC: Patch new flake8 command grep --- doc/source/contributing.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 7961780d0c79b..7ad5916a8809d 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -527,7 +527,12 @@ unused function. However, style-checking the diff will not catch this because the actual import is not part of the diff. Thus, for completeness, you should run this command, though it will take longer:: - git diff master --name-only -- '*.py' | grep 'pandas' | xargs -r flake8 + git diff master --name-only -- '*.py' | grep 'pandas/' | xargs -r flake8 + +Note that on OSX, the ``-r`` flag is not available, so you have to omit it and +run this slightly modified command:: + + git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8 Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ From 649cd39012cffe5dc928f7a8bea24ad808dce36a Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 20 Mar 2017 15:47:48 -0400 Subject: [PATCH 333/338] MAINT: Remove Long and WidePanel (#15748) Deprecated since 0.17.0. 
xref gh-10892 --- asv_bench/benchmarks/pandas_vb_common.py | 5 ---- bench/bench_join_panel.py | 4 +-- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/api.py | 2 +- pandas/core/panel.py | 23 ---------------- pandas/tests/api/test_api.py | 3 +- pandas/tests/io/test_pytables.py | 3 -- pandas/tests/test_panel.py | 35 ++++++++---------------- vb_suite/pandas_vb_common.py | 5 ---- 9 files changed, 17 insertions(+), 64 deletions(-) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 56ccc94c414fb..a7e530e7f5ef1 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -25,11 +25,6 @@ except: pass -try: - Panel = Panel -except Exception: - Panel = WidePanel - # didn't add to namespace until later try: from pandas.core.index import MultiIndex diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py index f3c3f8ba15f70..113b317dd8ff8 100644 --- a/bench/bench_join_panel.py +++ b/bench/bench_join_panel.py @@ -45,8 +45,8 @@ def reindex_on_axis(panels, axis, axis_reindex): return p -# does the job but inefficient (better to handle like you read a table in -# pytables...e.g create a LongPanel then convert to Wide) +# Does the job but inefficient. It is better to handle +# this like you read a table in pytables. def create_panels_join(cls, panels): """ given an array of panels's, create a single panel """ panels = [a for a in panels if a is not None] diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 98407aacb993b..ebdd4060f0588 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -772,6 +772,7 @@ Removal of prior version deprecations/changes - The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`) - The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`) - ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`) +- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) .. _whatsnew_0200.performance: diff --git a/pandas/core/api.py b/pandas/core/api.py index 65253dedb8b53..5018de39ca907 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -15,7 +15,7 @@ from pandas.core.series import Series from pandas.core.frame import DataFrame -from pandas.core.panel import Panel, WidePanel +from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D from pandas.core.reshape import (pivot_simple as pivot, get_dummies, lreshape, wide_to_long) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 4a6c6cf291316..5c7b66a2d1356 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -4,8 +4,6 @@ # pylint: disable=E1103,W0231,W0212,W0621 from __future__ import division -import warnings - import numpy as np from pandas.types.cast import (_infer_dtype_from_scalar, @@ -1556,24 +1554,3 @@ def f(self, other, axis=0): ops.add_special_arithmetic_methods(Panel, **ops.panel_special_funcs) Panel._add_aggregate_operations() Panel._add_numeric_operations() - - -# legacy -class WidePanel(Panel): - - def __init__(self, *args, **kwargs): - # deprecation, #10892 - warnings.warn("WidePanel is deprecated. 
Please use Panel", - FutureWarning, stacklevel=2) - - super(WidePanel, self).__init__(*args, **kwargs) - - -class LongPanel(DataFrame): - - def __init__(self, *args, **kwargs): - # deprecation, #10892 - warnings.warn("LongPanel is deprecated. Please use DataFrame", - FutureWarning, stacklevel=2) - - super(LongPanel, self).__init__(*args, **kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 73222c246fc70..2c7dcf2501f32 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -54,8 +54,7 @@ class TestPDApi(Base, tm.TestCase): 'TimedeltaIndex', 'Timestamp'] # these are already deprecated; awaiting removal - deprecated_classes = ['WidePanel', 'Panel4D', - 'SparseList', 'Expr', 'Term'] + deprecated_classes = ['Panel4D', 'SparseList', 'Expr', 'Term'] # these should be deprecated in the future deprecated_classes_in_future = ['Panel'] diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 40866b8702fe2..324160d5b1ae6 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -3017,9 +3017,6 @@ def _check(left, right): # empty # self._check_roundtrip(wp.to_frame()[:0], _check) - def test_longpanel(self): - pass - def test_overwrite_node(self): with ensure_clean_store(self.path) as store: diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index ab0322abbcf06..13e16f3b90730 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -178,10 +178,6 @@ def wrapper(x): class SafeForSparse(object): - @classmethod - def assert_panel_equal(cls, x, y): - assert_panel_equal(x, y) - def test_get_axis(self): assert (self.panel._get_axis(0) is self.panel.items) assert (self.panel._get_axis(1) is self.panel.major_axis) @@ -346,10 +342,10 @@ def check_op(op, name): def test_combinePanel(self): result = self.panel.add(self.panel) - self.assert_panel_equal(result, self.panel * 2) + assert_panel_equal(result, self.panel * 2) def test_neg(self): - self.assert_panel_equal(-self.panel, self.panel * -1) + assert_panel_equal(-self.panel, self.panel * -1) # issue 7692 def test_raise_when_not_implemented(self): @@ -369,22 +365,22 @@ def test_select(self): # select items result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') expected = p.reindex(items=['ItemA', 'ItemC']) - self.assert_panel_equal(result, expected) + assert_panel_equal(result, expected) # select major_axis result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] expected = p.reindex(major=new_major) - self.assert_panel_equal(result, expected) + assert_panel_equal(result, expected) # select minor_axis result = p.select(lambda x: x in ('D', 'A'), axis=2) expected = p.reindex(minor=['A', 'D']) - self.assert_panel_equal(result, expected) + assert_panel_equal(result, expected) # corner case, empty thing result = p.select(lambda x: x in ('foo', ), axis='items') - self.assert_panel_equal(result, p.reindex(items=[])) + assert_panel_equal(result, p.reindex(items=[])) def test_get_value(self): for item in self.panel.items: @@ -399,8 +395,8 @@ def test_abs(self): result = self.panel.abs() result2 = abs(self.panel) expected = np.abs(self.panel) - self.assert_panel_equal(result, expected) - self.assert_panel_equal(result2, expected) + assert_panel_equal(result, expected) + assert_panel_equal(result2, expected) df = self.panel['ItemA'] result = df.abs() @@ -867,10 +863,6 @@ def test_set_value(self): class TestPanel(tm.TestCase, 
PanelTests, CheckIndexing, SafeForLongAndSparse, SafeForSparse): - @classmethod - def assert_panel_equal(cls, x, y): - assert_panel_equal(x, y) - def setUp(self): self.panel = _panel.copy() self.panel.major_axis.name = None @@ -1967,7 +1959,7 @@ def test_round(self): major_axis=pd.date_range('1/1/2000', periods=5), minor_axis=['A', 'B']) result = p.round() - self.assert_panel_equal(expected, result) + assert_panel_equal(expected, result) def test_numpy_round(self): values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12], @@ -1983,7 +1975,7 @@ def test_numpy_round(self): major_axis=pd.date_range('1/1/2000', periods=5), minor_axis=['A', 'B']) result = np.round(p) - self.assert_panel_equal(expected, result) + assert_panel_equal(expected, result) msg = "the 'out' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.round, p, out=p) @@ -2270,15 +2262,12 @@ def test_all_any_unhandled(self): self.assertRaises(NotImplementedError, self.panel.any, bool_only=True) -class TestLongPanel(tm.TestCase): +class TestPanelFrame(tm.TestCase): """ - LongPanel no longer exists, but... + Check that conversions to and from Panel to DataFrame work. """ def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) - panel = tm.makePanel() tm.add_nans(panel) diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py index bd2e8a1c1d504..41e43d6ab10e5 100644 --- a/vb_suite/pandas_vb_common.py +++ b/vb_suite/pandas_vb_common.py @@ -18,11 +18,6 @@ except: import pandas._libs.lib as lib -try: - Panel = WidePanel -except Exception: - pass - # didn't add to namespace until later try: from pandas.core.index import MultiIndex From fcb6a9f0224d09a6e86be3d12acb8cf07d0a7d09 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 20 Mar 2017 18:45:19 -0400 Subject: [PATCH 334/338] PERF: Improve drop_duplicates for bool columns (#12963) closes #12963 Author: Matt Roeschke Closes #15738 from mroeschke/fix_12963 and squashes the following commits: a020c10 [Matt Roeschke] PERF: Improve drop_duplicates for bool columns (#12963) --- asv_bench/benchmarks/reindex.py | 5 +++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 6fe6c32a96df9..537d275e7c727 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -132,6 +132,9 @@ def setup(self): self.K = 10000 self.key1 = np.random.randint(0, self.K, size=self.N) self.df_int = DataFrame({'key1': self.key1}) + self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K, + dtype=bool) + for i in range(10)}) def time_frame_drop_dups(self): self.df.drop_duplicates(['key1', 'key2']) @@ -154,6 +157,8 @@ def time_series_drop_dups_string(self): def time_frame_drop_dups_int(self): self.df_int.drop_duplicates() + def time_frame_drop_dups_bool(self): + self.df_bool.drop_duplicates() #---------------------------------------------------------------------- # blog "pandas escaped the zoo" diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ebdd4060f0588..d036049e3ffdb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -789,6 +789,7 @@ Performance Improvements - Improved performance of ``.rank()`` for categorical data (:issue:`15498`) - Improved performance when using ``.unstack()`` (:issue:`15503`) - Improved performance of merge/join on ``category`` columns (:issue:`10409`) +- 
Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`) .. _whatsnew_0200.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6937675603c10..f9d4c9107d7cd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,6 +19,7 @@ is_period_dtype, is_period_arraylike, is_float_dtype, + is_bool_dtype, needs_i8_conversion, is_categorical, is_datetime64_dtype, @@ -325,8 +326,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ from pandas import Index, Series, DatetimeIndex, PeriodIndex - # handling two possibilities here + # handling possibilities here # - for a numpy datetimelike simply view as i8 then cast back + # - bool handled as uint8 then cast back # - for an extension datetimelike view as i8 then # reconstruct from boxed values to transfer metadata dtype = None @@ -341,6 +343,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # numpy dtype dtype = values.dtype vals = values.view(np.int64) + elif is_bool_dtype(values): + dtype = bool + vals = np.asarray(values).view('uint8') else: vals = np.asarray(values) From de15b6d15cc734653d25528dec3eee025ae8f6ff Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 20 Mar 2017 19:38:01 -0400 Subject: [PATCH 335/338] CI: trying for osx cache again --- .travis.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 705b2380ac697..67b37f1d58931 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,11 @@ matrix: os: osx compiler: clang osx_image: xcode6.4 - cache: ccache + cache: + ccache: true + directories: + - $HOME/.cache # cython cache + - $HOME/.ccache # compiler cache env: - PYTHON_VERSION=3.5 - JOB_NAME: "35_osx" From 2c4e37722915e61f3cdc9c16e6a857b6ef750028 Mon Sep 17 00:00:00 2001 From: Wiktor Tomczak Date: Tue, 21 Mar 2017 09:52:37 +0100 Subject: [PATCH 336/338] Fix num_days in PandasAutoDateLocator (#14716) --- pandas/tseries/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 1f99e88ce86d6..8aea14a2688d1 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -261,7 +261,7 @@ def get_locator(self, dmin, dmax): 'Pick the best locator based on a distance.' delta = relativedelta(dmax, dmin) - num_days = ((delta.years * 12.0) + delta.months * 31.0) + delta.days + num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds tot_sec = num_days * 86400. 
+ num_sec From 0fa874bbe4cf1a4032ce59fe1137d8013728c18f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 21 Mar 2017 08:21:25 -0400 Subject: [PATCH 337/338] CI: set path for osx ccache --- ci/install_travis.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 053a2d15a287c..c940083f5ae9e 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -32,11 +32,6 @@ edit_init home_dir=$(pwd) echo "[home_dir: $home_dir]" -if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - echo "[install ccache]" - time brew install ccache -fi - # install miniconda MINICONDA_DIR="$HOME/miniconda3" @@ -86,6 +81,14 @@ if [ "$USE_CACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then ccache=$(which ccache) echo "[ccache: $ccache]" export CC='ccache gcc' +elif [ "$USE_CACHE" ] && [ "${TRAVIS_OS_NAME}" == "osx" ]; then + echo "[Using ccache]" + time brew install ccache + export PATH=/usr/local/opt/ccache/libexec:$PATH + gcc=$(which gcc) + echo "[gcc: $gcc]" + ccache=$(which ccache) + echo "[ccache: $ccache]" else echo "[Not using ccache]" fi From fb72afc0203df9e9cbad86dbc7436c149adc7e2a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 21 Mar 2017 10:49:46 -0400 Subject: [PATCH 338/338] CLN: replace _interleave_dtype with _find_common_type xref #15736 xref #12780 Author: Jeff Reback Closes #15765 from jreback/common_types and squashes the following commits: d472646 [Jeff Reback] try removing restriction on windows 8d07cae [Jeff Reback] CLN: replace _interleave_dtype with _find_common_type --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/internals.py | 59 ++++---------------------- pandas/tests/indexing/test_coercion.py | 14 +----- pandas/tests/series/test_replace.py | 4 +- pandas/tests/types/test_cast.py | 14 ++++++ pandas/types/cast.py | 28 +++++++++++- 6 files changed, 55 insertions(+), 66 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d036049e3ffdb..e0d15c218ec85 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -826,7 +826,7 @@ Bug Fixes - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) -- Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`) +- Bug in ``.replace()`` may result in incorrect dtypes. 
(:issue:`12747`, :issue:`15765`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 60684a929889b..6487c2108028e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -9,7 +9,8 @@ from pandas.core.base import PandasObject -from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype +from pandas.types.dtypes import (ExtensionDtype, DatetimeTZDtype, + CategoricalDtype) from pandas.types.common import (_TD_DTYPE, _NS_DTYPE, _ensure_int64, _ensure_platform_int, is_integer, @@ -4496,55 +4497,13 @@ def _interleaved_dtype(blocks): if not len(blocks): return None - counts = defaultdict(list) - for x in blocks: - counts[type(x)].append(x) - - have_int = len(counts[IntBlock]) > 0 - have_bool = len(counts[BoolBlock]) > 0 - have_object = len(counts[ObjectBlock]) > 0 - have_float = len(counts[FloatBlock]) > 0 - have_complex = len(counts[ComplexBlock]) > 0 - have_dt64 = len(counts[DatetimeBlock]) > 0 - have_dt64_tz = len(counts[DatetimeTZBlock]) > 0 - have_td64 = len(counts[TimeDeltaBlock]) > 0 - have_cat = len(counts[CategoricalBlock]) > 0 - # TODO: have_sparse is not used - have_sparse = len(counts[SparseBlock]) > 0 # noqa - have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat - - if (have_object or - (have_bool and - (have_numeric or have_dt64 or have_dt64_tz or have_td64)) or - (have_numeric and has_non_numeric) or have_cat or have_dt64 or - have_dt64_tz or have_td64): - return np.dtype(object) - elif have_bool: - return np.dtype(bool) - elif have_int and not have_float and not have_complex: - # if we are mixing unsigned and signed, then return - # the next biggest int type (if we can) - lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) - kinds = set([i.dtype.kind for i in counts[IntBlock]]) - if len(kinds) == 1: - return lcd - - if lcd == 'uint64' or lcd == 'int64': - return np.dtype('int64') - - # return 1 bigger on the itemsize if unsinged - if lcd.kind == 'u': - return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) - return lcd - - elif have_int and have_float and not have_complex: - return np.dtype('float64') - elif have_complex: - return np.dtype('c16') - else: - introspection_blks = counts[FloatBlock] + counts[SparseBlock] - return _find_common_type([b.dtype for b in introspection_blks]) + dtype = _find_common_type([b.dtype for b in blocks]) + + # only numpy compat + if isinstance(dtype, ExtensionDtype): + dtype = np.object + + return dtype def _consolidate(blocks): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index df95f563c0832..7216c05657102 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1183,19 +1183,9 @@ def _assert_replace_conversion(self, from_key, to_key, how): result = obj.replace(replacer) - # buggy on windows for bool/int64 - if (from_key == 'bool' and - to_key == 'int64' and - tm.is_platform_windows()): - pytest.skip("windows platform buggy: {0} -> {1}".format - (from_key, to_key)) - - if ((from_key == 'float64' and to_key in ('bool', 'int64')) or + if ((from_key == 'float64' and to_key in ('int64')) or (from_key == 'complex128' and - to_key in ('bool', 'int64', 'float64')) or - - # GH12747 The result must be int? 
- (from_key == 'int64' and to_key in ('bool'))): + to_key in ('int64', 'float64'))): # buggy on 32-bit if tm.is_platform_32bit(): diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index f5a25e93cc82d..0a53581e24ba5 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -152,8 +152,8 @@ def check_replace(to_rep, val, expected): tr, v = [3, 4], [3.5, pd.Timestamp('20130101')] check_replace(tr, v, e) - # casts to float - e = pd.Series([0, 1, 2, 3.5, 1]) + # casts to object + e = pd.Series([0, 1, 2, 3.5, True], dtype='object') tr, v = [3, 4], [3.5, True] check_replace(tr, v, e) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 70f69cc7d5701..d7b086daea1e3 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -238,6 +238,20 @@ def test_numpy_dtypes(self): ((np.object, np.float32), np.object), ((np.object, np.int16), np.object), + # bool with int + ((np.dtype('bool'), np.int64), np.object), + ((np.dtype('bool'), np.int32), np.object), + ((np.dtype('bool'), np.int16), np.object), + ((np.dtype('bool'), np.int8), np.object), + ((np.dtype('bool'), np.uint64), np.object), + ((np.dtype('bool'), np.uint32), np.object), + ((np.dtype('bool'), np.uint16), np.object), + ((np.dtype('bool'), np.uint8), np.object), + + # bool with float + ((np.dtype('bool'), np.float64), np.object), + ((np.dtype('bool'), np.float32), np.object), + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')), np.dtype('datetime64[ns]')), ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')), diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 11a837dd21159..0e26cd085db5a 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -892,12 +892,28 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): def _find_common_type(types): - """Find a common data type among the given dtypes.""" + """ + Find a common data type among the given dtypes. + + Parameters + ---------- + types : list of dtypes + + Returns + ------- + pandas extension or numpy dtype + + See Also + -------- + numpy.find_common_type + + """ if len(types) == 0: raise ValueError('no types given') first = types[0] + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) # => object if all(is_dtype_equal(first, t) for t in types[1:]): @@ -912,4 +928,14 @@ def _find_common_type(types): if all(is_timedelta64_dtype(t) for t in types): return np.dtype('timedelta64[ns]') + # don't mix bool / int or float or complex + # this is different from numpy, which casts bool with float/int as int + has_bools = any(is_bool_dtype(t) for t in types) + if has_bools: + has_ints = any(is_integer_dtype(t) for t in types) + has_floats = any(is_float_dtype(t) for t in types) + has_complex = any(is_complex_dtype(t) for t in types) + if has_ints or has_floats or has_complex: + return np.object + return np.find_common_type(types, [])
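A minimal sketch (not part of the patch series) of the dtype rules that patches 330 and 338 converge on, seen from the public API alone. It assumes pandas 0.20.x; the expected dtypes are taken from the updated assertions in ``pandas/tests/series/test_replace.py``::

    import pandas as pd

    s = pd.Series([0, 1, 2, 3, 4])                  # int64

    # replace() now upcasts int64 -> float64 when the replacement
    # value is a float (patch 330)
    print(s.replace([3], [3.0]).dtype)              # float64

    # mixing bool into a numeric column now falls back to object,
    # mirroring the new _find_common_type rule that bool never
    # coerces with int/float/complex (patch 338)
    print(s.replace([3, 4], [3.5, True]).dtype)     # object

The second case is the behavioral consequence of ``_interleaved_dtype`` delegating to ``_find_common_type``: numpy would cast bool mixed with numerics to an integer, while pandas deliberately widens to object instead.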
[Truncated trailing patch: only fragments of a README.md badge-table diff survive extraction. Recoverable content: the "Conda" row's badge is relabeled from "conda downloads" to "conda default downloads", a new "Conda-forge" row with a "conda-forge downloads" badge is added, and the AppVeyor build-status row is replaced. The commit header and the HTML/URL details are not recoverable.]
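A similar sketch for patch 331 (tz-aware ``Timestamp`` field accessors), again an illustration rather than part of the series. The expected values come from the new assertions in ``pandas/tests/scalar/test_timestamp.py``; the pre-fix values in the final comment are inferred from the report for issue 13303, so treat them as an assumption::

    import pandas as pd

    ts = pd.Timestamp('2014-12-31 23:59:00', tz='US/Eastern')

    # field accessors now report local wall-clock time
    print(ts.hour)        # 23
    print(ts.dayofyear)   # 365
    print(ts.week)        # 1 (ISO week 1 of 2015)

    # before the fix these were computed from the UTC value,
    # 2015-01-01 04:59 UTC, giving hour 4 and dayofyear 1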