From c1b657e193cd59e3a302fbc594082bb07c7695ae Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Sun, 12 Feb 2017 00:21:37 +0000
Subject: [PATCH 01/52] get_indexer_non_unique for orderable indexes

---
 pandas/index.pyx       | 69 ++++++++++++++++++++++++++++++++++++++++++
 pandas/indexes/base.py | 13 +++++++-
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 0c975d1775a03..90ab6f5328df8 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -45,6 +45,47 @@ cdef extern from "Python.h":
     int PySlice_Check(object)
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets,
+                                        int64_t[:] idx0,
+                                        int64_t[:] idx1,
+                                        object[:] result, object[:] missing):
+    cdef:
+        Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0]
+
+    while i < n and j < n_t:
+
+        val0 = values[idx0[i]]
+        val1 = targets[idx1[j]]
+
+        if val0 == val1:
+
+            while i < n and values[idx0[i]] == val1:
+                result[idx1[j]].append(idx0[i])
+                i += 1
+
+            j += 1
+            while j < n_t and val0 == targets[idx1[j]]:
+                result[idx1[j]] = result[idx1[j - 1]]
+                j += 1
+
+        elif val0 > val1:
+
+            result[idx1[j]].append(-1)
+            missing[idx1[j]].append(idx1[j])
+            j += 1
+
+        else:
+            i += 1
+
+    while j < n_t:
+        result[idx1[j]].append(-1)
+        missing[idx1[j]].append(idx1[j])
+        j += 1
+
+
 cdef inline is_definitely_invalid_key(object val):
     if PyTuple_Check(val):
         try:
@@ -372,6 +413,34 @@ cdef class IndexEngine:
 
         return result[0:count], missing[0:count_missing]
 
+    def get_indexer_non_unique_orderable(self, ndarray targets,
+                                         int64_t[:] idx0,
+                                         int64_t[:] idx1):
+
+        cdef:
+            ndarray values
+            object val0, val1
+            Py_ssize_t i, n_t
+
+        self._ensure_mapping_populated()
+        values = self._get_index_values()
+        n_t = len(targets)
+
+        result = np.empty((n_t,), dtype=np.object_)
+        result.fill([])
+        result = np.frompyfunc(list,1,1)(result)
+
+        missing = np.empty((n_t,), dtype=np.object_)
+        missing.fill([])
+        missing = np.frompyfunc(list,1,1)(missing)
+
+        _indexer_non_unique_orderable_loop(values, targets, idx0, idx1,
+                                           result, missing)
+
+        result = np.concatenate(result)
+        missing = np.asarray(np.concatenate(missing), np.int64)
+
+        return result, missing
 
 
 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
     cdef:

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index bb2941a121452..81ae7ad1c5a0a 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2509,7 +2509,18 @@ def get_indexer_non_unique(self, target):
         else:
             tgt_values = target._values
 
-        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+        try:
+            if self.is_all_dates:
+                idx0 = np.argsort(self.asi8, kind='mergesort')
+            else:
+                idx0 = np.argsort(self._values, kind='mergesort')
+
+            idx1 = np.argsort(tgt_values, kind='mergesort')
+            indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
+
+        except TypeError:
+            indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+
         return Index(indexer), missing
 
     def get_indexer_for(self, target, **kwargs):

From 2f971a23a67ef9bc51453d94ae7b9626e12be006 Mon Sep 17 00:00:00 2001
From: "John W. O'Brien"
Date: Sat, 11 Feb 2017 21:21:56 -0500
Subject: [PATCH 02/52] BUG: Avoid grafting missing examples directory
 (#15373)

---
 MANIFEST.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 2d26fbfd6adaf..b7a7e6039ac9a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -7,7 +7,6 @@ include setup.py
 graft doc
 prune doc/build
 
-graft examples
 graft pandas
 
 global-exclude *.so

From 1bad601641cc024cc4d0c1215b12c9d0066b8103 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 11 Feb 2017 21:53:44 -0500
Subject: [PATCH 03/52] CLN: remove pandas/io/auth.py, from ga.py (now
 removed) (#15374)

---
 pandas/io/auth.py | 126 ----------------------------------------------
 1 file changed, 126 deletions(-)
 delete mode 100644 pandas/io/auth.py

diff --git a/pandas/io/auth.py b/pandas/io/auth.py
deleted file mode 100644
index e42df6a7309b7..0000000000000
--- a/pandas/io/auth.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from __future__ import print_function
-# see LICENSES directory for copyright and license
-import os
-import sys
-import logging
-
-import httplib2
-
-import apiclient.discovery as gapi
-import gflags
-import oauth2client.file as auth_file
-import oauth2client.client as oauth
-import oauth2client.tools as tools
-OOB_CALLBACK_URN = oauth.OOB_CALLBACK_URN
-
-
-class AuthenticationConfigError(ValueError):
-    pass
-
-FLOWS = {}
-FLAGS = gflags.FLAGS
-DEFAULT_SECRETS = os.path.join(
-    os.path.dirname(__file__), 'client_secrets.json')
-DEFAULT_SCOPE = 'https://www.googleapis.com/auth/analytics.readonly'
-DEFAULT_TOKEN_FILE = os.path.join(os.path.dirname(__file__), 'analytics.dat')
-MISSING_CLIENT_MSG = """
-WARNING: Please configure OAuth 2.0
-
-You need to populate the client_secrets.json file found at:
-
-    %s
-
-with information from the APIs Console
-.
-
-"""
-DOC_URL = ('https://developers.google.com/api-client-library/python/guide/'
-           'aaa_client_secrets')
-
-gflags.DEFINE_enum('logging_level', 'ERROR',
-                   ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
-                   'Set the level of logging detail.')
-
-# Name of file that will store the access and refresh tokens to access
-# the API without having to login each time. Make sure this file is in
-# a secure place.
-
-
-def process_flags(flags=None):
-    """Uses the command-line flags to set the logging level.
-
-    Args:
-        argv: List of command line arguments passed to the python script.
-    """
-    if flags is None:
-        flags = []
-
-    # Let the gflags module process the command-line arguments.
-    try:
-        FLAGS(flags)
-    except gflags.FlagsError as e:
-        print('%s\nUsage: %s ARGS\n%s' % (e, str(flags), FLAGS))
-        sys.exit(1)
-
-    # Set the logging according to the command-line flag.
-    logging.getLogger().setLevel(getattr(logging, FLAGS.logging_level))
-
-
-def get_flow(secret, scope, redirect):
-    """
-    Retrieve an authentication flow object based on the given
-    configuration in the secret file name, the authentication scope,
-    and a redirect URN
-    """
-    key = (secret, scope, redirect)
-    flow = FLOWS.get(key, None)
-    if flow is None:
-        msg = MISSING_CLIENT_MSG % secret
-        if not os.path.exists(secret):
-            raise AuthenticationConfigError(msg)
-        flow = oauth.flow_from_clientsecrets(secret, scope,
-                                             redirect_uri=redirect,
-                                             message=msg)
-        FLOWS[key] = flow
-    return flow
-
-
-def make_token_store(fpath=None):
-    """create token storage from give file name"""
-    if fpath is None:
-        fpath = DEFAULT_TOKEN_FILE
-    return auth_file.Storage(fpath)
-
-
-def authenticate(flow, storage=None):
-    """
-    Try to retrieve a valid set of credentials from the token store if possible
-    Otherwise use the given authentication flow to obtain new credentials
-    and return an authenticated http object
-
-    Parameters
-    ----------
-    flow : authentication workflow
-    storage: token storage, default None
-    """
-    http = httplib2.Http()
-
-    # Prepare credentials, and authorize HTTP object with them.
-    credentials = storage.get()
-    if credentials is None or credentials.invalid:
-        credentials = tools.run(flow, storage)
-
-    http = credentials.authorize(http)
-    return http
-
-
-def init_service(http):
-    """
-    Use the given http object to build the analytics service object
-    """
-    return gapi.build('analytics', 'v3', http=http)
-
-
-def reset_default_token_store():
-    import os
-    os.remove(DEFAULT_TOKEN_FILE)

From 5fb5228988832ff0328c4d830cb4e2609b882ab1 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sun, 12 Feb 2017 09:50:55 -0500
Subject: [PATCH 04/52] TST: consolidate remaining tests under pandas.tests

move all remaining tests so that ALL tests are now under pandas/tests

Author: Jeff Reback

Closes #15371 from jreback/tests and squashes the following commits:

43039e4 [Jeff Reback] add in data
118127b [Jeff Reback] wip
bfa6a9c [Jeff Reback] fix data locations
79a79e6 [Jeff Reback] fix import
57437bf [Jeff Reback] fixes
b407586 [Jeff Reback] move io
e13bfe3 [Jeff Reback] move tools
0194e31 [Jeff Reback] move computation
0e6bcb4 [Jeff Reback] rename test_msgpack -> msgpack
c5e4ab8 [Jeff Reback] move sparse
42e60e2 [Jeff Reback] move api tests
---
 pandas/{api/tests => tests/api}/__init__.py   | 0
 pandas/{api/tests => tests/api}/test_api.py   | 2 +-
 .../tests => tests/computation}/__init__.py   | 0
 .../computation}/test_compat.py               | 0
 .../tests => tests/computation}/test_eval.py  | 0
 pandas/tests/indexes/datetimes/test_ops.py    | 3 +-
 pandas/{io/tests => tests/io}/__init__.py     | 0
 .../{io/tests => tests/io}/data/S4_EDUC1.dta  | Bin
 .../{io/tests => tests/io}/data/banklist.csv  | 0
 .../{io/tests => tests/io}/data/banklist.html | 0
 pandas/{io/tests => tests/io}/data/blank.xls  | Bin
 pandas/{io/tests => tests/io}/data/blank.xlsm | Bin
 pandas/{io/tests => tests/io}/data/blank.xlsx | Bin
 .../io}/data/blank_with_header.xls            | Bin
 .../io}/data/blank_with_header.xlsm           | Bin
 .../io}/data/blank_with_header.xlsx           | Bin
 .../io}/data/categorical_0_14_1.pickle        | 0
 .../io}/data/categorical_0_15_2.pickle        | Bin
 .../io}/data/computer_sales_page.html         | 0
 .../tests => tests/io}/data/gbq_fake_job.txt  | 0
 .../data/html_encoding/chinese_utf-16.html    | Bin
 .../data/html_encoding/chinese_utf-32.html    | Bin
 .../io}/data/html_encoding/chinese_utf-8.html | 0
 .../io}/data/html_encoding/letz_latin1.html   | 0
 pandas/{io/tests => tests/io}/data/iris.csv   | 0
.../io}/data/legacy_hdf/datetimetz_object.h5 | Bin .../io}/data/legacy_hdf/legacy.h5 | Bin .../io}/data/legacy_hdf/legacy_0.10.h5 | Bin .../io}/data/legacy_hdf/legacy_table.h5 | Bin .../io}/data/legacy_hdf/legacy_table_0.11.h5 | Bin .../io}/data/legacy_hdf/pytables_native.h5 | Bin .../io}/data/legacy_hdf/pytables_native2.h5 | Bin .../0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack | Bin .../0.16.2_AMD64_windows_2.7.10.msgpack | Bin .../0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack | Bin .../0.16.2_x86_64_darwin_2.7.10.msgpack | Bin .../0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack | Bin .../0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack | Bin .../0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack | Bin .../0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack | Bin .../0.17.0_AMD64_windows_2.7.11.msgpack | Bin .../0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack | Bin .../0.17.0_x86_64_darwin_2.7.11.msgpack | Bin .../0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack | Bin .../0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack | Bin .../0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack | Bin .../0.17.1_AMD64_windows_2.7.11.msgpack | Bin .../0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack | Bin .../0.17.1_AMD64_windows_2.7.11.msgpack | Bin .../0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack | Bin .../0.17.1_x86_64_darwin_2.7.11.msgpack | Bin .../0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack | Bin .../0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack | Bin .../0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack | Bin .../0.18.0_AMD64_windows_2.7.11.msgpack | Bin .../0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack | Bin .../0.18.0_x86_64_darwin_2.7.11.msgpack | Bin .../0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack | Bin .../0.18.1_x86_64_darwin_2.7.12.msgpack | Bin .../0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack | Bin .../0.10.1/AMD64_windows_2.7.3.pickle | Bin .../0.10.1/x86_64_linux_2.7.3.pickle | Bin .../0.11.0/0.11.0_x86_64_linux_3.3.0.pickle | Bin .../0.11.0/x86_64_linux_2.7.3.pickle | Bin .../0.11.0/x86_64_linux_3.3.0.pickle | Bin .../0.12.0/0.12.0_AMD64_windows_2.7.3.pickle | Bin .../0.12.0/0.12.0_x86_64_linux_2.7.3.pickle | Bin .../0.13.0/0.13.0_AMD64_windows_2.7.3.pickle | Bin .../0.13.0/0.13.0_i686_linux_2.6.5.pickle | Bin .../0.13.0/0.13.0_i686_linux_2.7.3.pickle | Bin .../0.13.0/0.13.0_i686_linux_3.2.3.pickle | Bin .../0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle | Bin .../0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle | Bin .../0.13.0/0.13.0_x86_64_linux_2.7.3.pickle | Bin .../0.13.0/0.13.0_x86_64_linux_2.7.8.pickle | Bin .../0.13.0/0.13.0_x86_64_linux_3.3.0.pickle | Bin .../0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle | Bin .../0.14.0/0.14.0_x86_64_linux_2.7.8.pickle | Bin .../0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle | Bin .../0.14.1/0.14.1_x86_64_linux_2.7.8.pickle | Bin .../0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle | Bin .../0.15.0/0.15.0_x86_64_linux_2.7.8.pickle | Bin .../0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle | Bin .../0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle | Bin .../0.16.2/0.16.2_AMD64_windows_2.7.10.pickle | Bin .../0.16.2/0.16.2_AMD64_windows_3.4.3.pickle | Bin .../0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle | Bin .../0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle | Bin .../0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle | Bin .../0.16.2/0.16.2_x86_64_linux_2.7.10.pickle | Bin .../0.16.2/0.16.2_x86_64_linux_3.4.3.pickle | Bin .../0.17.0/0.17.0_AMD64_windows_2.7.11.pickle | Bin .../0.17.0/0.17.0_AMD64_windows_3.4.4.pickle | Bin .../0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle | Bin .../0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle | Bin .../0.17.0/0.17.0_x86_64_linux_2.7.11.pickle | Bin 
.../0.17.0/0.17.0_x86_64_linux_3.4.4.pickle | Bin .../0.17.0/0.17.1_AMD64_windows_2.7.11.pickle | Bin .../0.17.1/0.17.1_AMD64_windows_2.7.11.pickle | Bin .../0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle | Bin .../0.18.0/0.18.0_AMD64_windows_2.7.11.pickle | Bin .../0.18.0/0.18.0_AMD64_windows_3.5.1.pickle | Bin .../0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle | Bin .../0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle | Bin .../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin .../0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle | Bin pandas/{io/tests => tests/io}/data/macau.html | 0 .../{io/tests => tests/io}/data/nyse_wsj.html | 0 pandas/{io/tests => tests/io}/data/spam.html | 0 .../tests => tests/io}/data/stata10_115.dta | Bin .../tests => tests/io}/data/stata10_117.dta | Bin .../tests => tests/io}/data/stata11_115.dta | Bin .../tests => tests/io}/data/stata11_117.dta | Bin .../tests => tests/io}/data/stata12_117.dta | Bin .../tests => tests/io}/data/stata14_118.dta | Bin .../{io/tests => tests/io}/data/stata15.dta | Bin .../tests => tests/io}/data/stata1_114.dta | Bin .../tests => tests/io}/data/stata1_117.dta | Bin .../io}/data/stata1_encoding.dta | Bin .../tests => tests/io}/data/stata2_113.dta | Bin .../tests => tests/io}/data/stata2_114.dta | Bin .../tests => tests/io}/data/stata2_115.dta | Bin .../tests => tests/io}/data/stata2_117.dta | Bin pandas/{io/tests => tests/io}/data/stata3.csv | 0 .../tests => tests/io}/data/stata3_113.dta | Bin .../tests => tests/io}/data/stata3_114.dta | Bin .../tests => tests/io}/data/stata3_115.dta | Bin .../tests => tests/io}/data/stata3_117.dta | Bin .../tests => tests/io}/data/stata4_113.dta | Bin .../tests => tests/io}/data/stata4_114.dta | Bin .../tests => tests/io}/data/stata4_115.dta | Bin .../tests => tests/io}/data/stata4_117.dta | Bin pandas/{io/tests => tests/io}/data/stata5.csv | 0 .../tests => tests/io}/data/stata5_113.dta | Bin .../tests => tests/io}/data/stata5_114.dta | Bin .../tests => tests/io}/data/stata5_115.dta | Bin .../tests => tests/io}/data/stata5_117.dta | Bin pandas/{io/tests => tests/io}/data/stata6.csv | 0 .../tests => tests/io}/data/stata6_113.dta | Bin .../tests => tests/io}/data/stata6_114.dta | Bin .../tests => tests/io}/data/stata6_115.dta | Bin .../tests => tests/io}/data/stata6_117.dta | Bin .../tests => tests/io}/data/stata7_111.dta | Bin .../tests => tests/io}/data/stata7_115.dta | Bin .../tests => tests/io}/data/stata7_117.dta | Bin .../tests => tests/io}/data/stata8_113.dta | Bin .../tests => tests/io}/data/stata8_115.dta | Bin .../tests => tests/io}/data/stata8_117.dta | Bin .../tests => tests/io}/data/stata9_115.dta | Bin .../tests => tests/io}/data/stata9_117.dta | Bin pandas/{io/tests => tests/io}/data/test1.csv | 0 pandas/{io/tests => tests/io}/data/test1.xls | Bin pandas/{io/tests => tests/io}/data/test1.xlsm | Bin pandas/{io/tests => tests/io}/data/test1.xlsx | Bin pandas/{io/tests => tests/io}/data/test2.xls | Bin pandas/{io/tests => tests/io}/data/test2.xlsm | Bin pandas/{io/tests => tests/io}/data/test2.xlsx | Bin pandas/{io/tests => tests/io}/data/test3.xls | Bin pandas/{io/tests => tests/io}/data/test3.xlsm | Bin pandas/{io/tests => tests/io}/data/test3.xlsx | Bin pandas/{io/tests => tests/io}/data/test4.xls | Bin pandas/{io/tests => tests/io}/data/test4.xlsm | Bin pandas/{io/tests => tests/io}/data/test4.xlsx | Bin pandas/{io/tests => tests/io}/data/test5.xls | Bin pandas/{io/tests => tests/io}/data/test5.xlsm | Bin pandas/{io/tests => tests/io}/data/test5.xlsx | Bin .../io}/data/test_converters.xls | Bin 
.../io}/data/test_converters.xlsm | Bin .../io}/data/test_converters.xlsx | Bin .../io}/data/test_index_name_pre17.xls | Bin .../io}/data/test_index_name_pre17.xlsm | Bin .../io}/data/test_index_name_pre17.xlsx | Bin .../{io/tests => tests/io}/data/test_mmap.csv | 0 .../io}/data/test_multisheet.xls | Bin .../io}/data/test_multisheet.xlsm | Bin .../io}/data/test_multisheet.xlsx | Bin .../tests => tests/io}/data/test_squeeze.xls | Bin .../tests => tests/io}/data/test_squeeze.xlsm | Bin .../tests => tests/io}/data/test_squeeze.xlsx | Bin .../tests => tests/io}/data/test_types.xls | Bin .../tests => tests/io}/data/test_types.xlsm | Bin .../tests => tests/io}/data/test_types.xlsx | Bin .../io}/data/testdateoverflow.xls | Bin .../io}/data/testdateoverflow.xlsm | Bin .../io}/data/testdateoverflow.xlsx | Bin .../{io/tests => tests/io}/data/testdtype.xls | Bin .../tests => tests/io}/data/testdtype.xlsm | Bin .../tests => tests/io}/data/testdtype.xlsx | Bin .../io}/data/testmultiindex.xls | Bin .../io}/data/testmultiindex.xlsm | Bin .../io}/data/testmultiindex.xlsx | Bin .../tests => tests/io}/data/testskiprows.xls | Bin .../tests => tests/io}/data/testskiprows.xlsm | Bin .../tests => tests/io}/data/testskiprows.xlsx | Bin .../tests => tests/io}/data/times_1900.xls | Bin .../tests => tests/io}/data/times_1900.xlsm | Bin .../tests => tests/io}/data/times_1900.xlsx | Bin .../tests => tests/io}/data/times_1904.xls | Bin .../tests => tests/io}/data/times_1904.xlsm | Bin .../tests => tests/io}/data/times_1904.xlsx | Bin pandas/{io/tests => tests/io}/data/tips.csv | 0 .../tests => tests/io}/data/valid_markup.html | 0 .../io}/data/wikipedia_states.html | 0 .../io}/generate_legacy_storage_files.py | 0 .../{io/tests => tests/io}/json/__init__.py | 0 .../io}/json/data/tsframe_iso_v012.json | 0 .../io}/json/data/tsframe_v012.json | 0 .../tests => tests/io}/json/test_normalize.py | 0 .../tests => tests/io}/json/test_pandas.py | 0 .../{io/tests => tests/io}/json/test_ujson.py | 0 .../{io/tests => tests/io}/parser/__init__.py | 0 .../io}/parser/c_parser_only.py | 0 .../{io/tests => tests/io}/parser/comment.py | 0 .../{io/tests => tests/io}/parser/common.py | 0 .../tests => tests/io}/parser/compression.py | 0 .../tests => tests/io}/parser/converters.py | 0 .../tests => tests/io}/parser/data/iris.csv | 0 .../io}/parser/data/salaries.csv | 0 .../io}/parser/data/salaries.csv.bz2 | Bin .../io}/parser/data/salaries.csv.gz | Bin .../io}/parser/data/salaries.csv.xz | Bin .../io}/parser/data/salaries.csv.zip | Bin .../io}/parser/data/sauron.SHIFT_JIS.csv | 0 .../tests => tests/io}/parser/data/test1.csv | 0 .../io}/parser/data/test1.csv.bz2 | Bin .../io}/parser/data/test1.csv.gz | Bin .../tests => tests/io}/parser/data/test2.csv | 0 .../io}/parser/data/test_mmap.csv | 0 .../tests => tests/io}/parser/data/tips.csv | 0 .../io}/parser/data/unicode_series.csv | 0 .../io}/parser/data/utf16_ex.txt | Bin .../{io/tests => tests/io}/parser/dialect.py | 0 .../{io/tests => tests/io}/parser/dtypes.py | 0 .../{io/tests => tests/io}/parser/header.py | 0 .../tests => tests/io}/parser/index_col.py | 0 .../tests => tests/io}/parser/multithread.py | 0 .../tests => tests/io}/parser/na_values.py | 0 .../tests => tests/io}/parser/parse_dates.py | 0 .../io}/parser/python_parser_only.py | 0 .../{io/tests => tests/io}/parser/quoting.py | 0 .../{io/tests => tests/io}/parser/skiprows.py | 0 .../tests => tests/io}/parser/test_network.py | 0 .../tests => tests/io}/parser/test_parsers.py | 0 .../io}/parser/test_read_fwf.py | 0 
.../io}/parser/test_textreader.py | 0 .../io}/parser/test_unsupported.py | 0 .../{io/tests => tests/io}/parser/usecols.py | 0 .../tests => tests/io}/sas/data/DEMO_G.csv | 0 .../tests => tests/io}/sas/data/DEMO_G.xpt | Bin .../tests => tests/io}/sas/data/DRXFCD_G.csv | 0 .../tests => tests/io}/sas/data/DRXFCD_G.xpt | Bin .../tests => tests/io}/sas/data/SSHSV1_A.csv | 0 .../tests => tests/io}/sas/data/SSHSV1_A.xpt | Bin .../tests => tests/io}/sas/data/airline.csv | 0 .../io}/sas/data/airline.sas7bdat | Bin .../io}/sas/data/paxraw_d_short.csv | 0 .../io}/sas/data/paxraw_d_short.xpt | Bin .../io}/sas/data/productsales.csv | 0 .../io}/sas/data/productsales.sas7bdat | Bin .../io}/sas/data/test1.sas7bdat | Bin .../io}/sas/data/test10.sas7bdat | Bin .../io}/sas/data/test11.sas7bdat | Bin .../io}/sas/data/test12.sas7bdat | Bin .../io}/sas/data/test13.sas7bdat | Bin .../io}/sas/data/test14.sas7bdat | Bin .../io}/sas/data/test15.sas7bdat | Bin .../io}/sas/data/test16.sas7bdat | Bin .../io}/sas/data/test2.sas7bdat | Bin .../io}/sas/data/test3.sas7bdat | Bin .../io}/sas/data/test4.sas7bdat | Bin .../io}/sas/data/test5.sas7bdat | Bin .../io}/sas/data/test6.sas7bdat | Bin .../io}/sas/data/test7.sas7bdat | Bin .../io}/sas/data/test8.sas7bdat | Bin .../io}/sas/data/test9.sas7bdat | Bin .../io}/sas/data/test_12659.csv | 0 .../io}/sas/data/test_12659.sas7bdat | Bin .../io}/sas/data/test_sas7bdat_1.csv | 0 .../io}/sas/data/test_sas7bdat_2.csv | 0 pandas/{io/tests => tests/io}/sas/test_sas.py | 0 .../tests => tests/io}/sas/test_sas7bdat.py | 0 .../{io/tests => tests/io}/sas/test_xport.py | 0 .../{io/tests => tests/io}/test_clipboard.py | 0 pandas/{io/tests => tests/io}/test_common.py | 0 .../io}/test_date_converters.py | 0 pandas/{io/tests => tests/io}/test_excel.py | 0 pandas/{io/tests => tests/io}/test_feather.py | 0 pandas/{io/tests => tests/io}/test_gbq.py | 0 pandas/{io/tests => tests/io}/test_html.py | 0 pandas/{io/tests => tests/io}/test_packers.py | 2 +- pandas/{io/tests => tests/io}/test_pickle.py | 2 +- .../{io/tests => tests/io}/test_pytables.py | 0 pandas/{io/tests => tests/io}/test_s3.py | 0 pandas/{io/tests => tests/io}/test_sql.py | 0 pandas/{io/tests => tests/io}/test_stata.py | 0 .../tests => tests/msgpack}/__init__.py | 0 .../{test_msgpack => msgpack}/test_buffer.py | 0 .../{test_msgpack => msgpack}/test_case.py | 0 .../{test_msgpack => msgpack}/test_except.py | 0 .../test_extension.py | 0 .../{test_msgpack => msgpack}/test_format.py | 0 .../{test_msgpack => msgpack}/test_limits.py | 0 .../{test_msgpack => msgpack}/test_newspec.py | 0 .../{test_msgpack => msgpack}/test_obj.py | 0 .../{test_msgpack => msgpack}/test_pack.py | 0 .../test_read_size.py | 0 .../{test_msgpack => msgpack}/test_seq.py | 0 .../test_sequnpack.py | 0 .../{test_msgpack => msgpack}/test_subtype.py | 0 .../{test_msgpack => msgpack}/test_unpack.py | 0 .../test_unpack_raw.py | 0 .../{test_msgpack => sparse}/__init__.py | 0 .../sparse}/test_arithmetics.py | 0 .../tests => tests/sparse}/test_array.py | 0 .../sparse}/test_combine_concat.py | 0 .../tests => tests/sparse}/test_format.py | 0 .../tests => tests/sparse}/test_frame.py | 0 .../tests => tests/sparse}/test_groupby.py | 0 .../tests => tests/sparse}/test_indexing.py | 0 .../tests => tests/sparse}/test_libsparse.py | 0 .../tests => tests/sparse}/test_list.py | 0 .../tests => tests/sparse}/test_pivot.py | 0 .../tests => tests/sparse}/test_series.py | 0 .../{tools/tests => tests/tools}/__init__.py | 0 .../tools}/data/allow_exact_matches.csv | 0 
.../allow_exact_matches_and_tolerance.csv | 0 .../tests => tests/tools}/data/asof.csv | 0 .../tests => tests/tools}/data/asof2.csv | 0 .../tests => tests/tools}/data/cut_data.csv | 0 .../tests => tests/tools}/data/quotes.csv | 0 .../tests => tests/tools}/data/quotes2.csv | 0 .../tests => tests/tools}/data/tolerance.csv | 0 .../tests => tests/tools}/data/trades.csv | 0 .../tests => tests/tools}/data/trades2.csv | 0 .../tests => tests/tools}/test_concat.py | 0 .../tests => tests/tools}/test_hashing.py | 0 .../{tools/tests => tests/tools}/test_join.py | 2 +- .../tests => tests/tools}/test_merge.py | 0 .../tests => tests/tools}/test_merge_asof.py | 0 .../tools}/test_merge_ordered.py | 0 .../tests => tests/tools}/test_pivot.py | 0 .../{tools/tests => tests/tools}/test_tile.py | 0 .../{tools/tests => tests/tools}/test_util.py | 0 setup.py | 64 +++++++++--------- 344 files changed, 38 insertions(+), 37 deletions(-) rename pandas/{api/tests => tests/api}/__init__.py (100%) rename pandas/{api/tests => tests/api}/test_api.py (99%) rename pandas/{computation/tests => tests/computation}/__init__.py (100%) rename pandas/{computation/tests => tests/computation}/test_compat.py (100%) rename pandas/{computation/tests => tests/computation}/test_eval.py (100%) rename pandas/{io/tests => tests/io}/__init__.py (100%) rename pandas/{io/tests => tests/io}/data/S4_EDUC1.dta (100%) rename pandas/{io/tests => tests/io}/data/banklist.csv (100%) rename pandas/{io/tests => tests/io}/data/banklist.html (100%) rename pandas/{io/tests => tests/io}/data/blank.xls (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank.xlsm (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank.xlsx (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank_with_header.xls (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank_with_header.xlsm (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/blank_with_header.xlsx (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/categorical_0_14_1.pickle (100%) rename pandas/{io/tests => tests/io}/data/categorical_0_15_2.pickle (100%) rename pandas/{io/tests => tests/io}/data/computer_sales_page.html (100%) rename pandas/{io/tests => tests/io}/data/gbq_fake_job.txt (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/chinese_utf-16.html (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/chinese_utf-32.html (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/chinese_utf-8.html (100%) rename pandas/{io/tests => tests/io}/data/html_encoding/letz_latin1.html (100%) rename pandas/{io/tests => tests/io}/data/iris.csv (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/datetimetz_object.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy_0.10.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy_table.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/legacy_table_0.11.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/pytables_native.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_hdf/pytables_native2.h5 (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack (100%) rename pandas/{io/tests => 
tests/io}/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle (100%) rename 
pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle (100%) rename pandas/{io/tests => 
tests/io}/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle (100%) rename pandas/{io/tests => tests/io}/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle (100%) rename pandas/{io/tests => tests/io}/data/macau.html (100%) rename pandas/{io/tests => tests/io}/data/nyse_wsj.html (100%) rename pandas/{io/tests => tests/io}/data/spam.html (100%) rename pandas/{io/tests => tests/io}/data/stata10_115.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata10_117.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata11_115.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata11_117.dta (100%) mode change 100755 => 100644 rename pandas/{io/tests => tests/io}/data/stata12_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata14_118.dta (100%) rename pandas/{io/tests => tests/io}/data/stata15.dta (100%) rename pandas/{io/tests => tests/io}/data/stata1_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata1_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata1_encoding.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata2_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3.csv (100%) rename pandas/{io/tests => tests/io}/data/stata3_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata3_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata4_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5.csv (100%) rename pandas/{io/tests => tests/io}/data/stata5_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata5_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6.csv (100%) rename pandas/{io/tests => tests/io}/data/stata6_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6_114.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata6_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata7_111.dta (100%) rename pandas/{io/tests => tests/io}/data/stata7_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata7_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata8_113.dta (100%) rename pandas/{io/tests => tests/io}/data/stata8_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata8_117.dta (100%) rename pandas/{io/tests => tests/io}/data/stata9_115.dta (100%) rename pandas/{io/tests => tests/io}/data/stata9_117.dta (100%) rename pandas/{io/tests => tests/io}/data/test1.csv (100%) rename pandas/{io/tests => tests/io}/data/test1.xls (100%) rename pandas/{io/tests => tests/io}/data/test1.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test1.xlsx (100%) rename 
pandas/{io/tests => tests/io}/data/test2.xls (100%) rename pandas/{io/tests => tests/io}/data/test2.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test2.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test3.xls (100%) rename pandas/{io/tests => tests/io}/data/test3.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test3.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test4.xls (100%) rename pandas/{io/tests => tests/io}/data/test4.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test4.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test5.xls (100%) rename pandas/{io/tests => tests/io}/data/test5.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test5.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_converters.xls (100%) rename pandas/{io/tests => tests/io}/data/test_converters.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_converters.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_index_name_pre17.xls (100%) rename pandas/{io/tests => tests/io}/data/test_index_name_pre17.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_index_name_pre17.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_mmap.csv (100%) rename pandas/{io/tests => tests/io}/data/test_multisheet.xls (100%) rename pandas/{io/tests => tests/io}/data/test_multisheet.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_multisheet.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_squeeze.xls (100%) rename pandas/{io/tests => tests/io}/data/test_squeeze.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_squeeze.xlsx (100%) rename pandas/{io/tests => tests/io}/data/test_types.xls (100%) rename pandas/{io/tests => tests/io}/data/test_types.xlsm (100%) rename pandas/{io/tests => tests/io}/data/test_types.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testdateoverflow.xls (100%) rename pandas/{io/tests => tests/io}/data/testdateoverflow.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testdateoverflow.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testdtype.xls (100%) rename pandas/{io/tests => tests/io}/data/testdtype.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testdtype.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testmultiindex.xls (100%) rename pandas/{io/tests => tests/io}/data/testmultiindex.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testmultiindex.xlsx (100%) rename pandas/{io/tests => tests/io}/data/testskiprows.xls (100%) rename pandas/{io/tests => tests/io}/data/testskiprows.xlsm (100%) rename pandas/{io/tests => tests/io}/data/testskiprows.xlsx (100%) rename pandas/{io/tests => tests/io}/data/times_1900.xls (100%) rename pandas/{io/tests => tests/io}/data/times_1900.xlsm (100%) rename pandas/{io/tests => tests/io}/data/times_1900.xlsx (100%) rename pandas/{io/tests => tests/io}/data/times_1904.xls (100%) rename pandas/{io/tests => tests/io}/data/times_1904.xlsm (100%) rename pandas/{io/tests => tests/io}/data/times_1904.xlsx (100%) rename pandas/{io/tests => tests/io}/data/tips.csv (100%) rename pandas/{io/tests => tests/io}/data/valid_markup.html (100%) rename pandas/{io/tests => tests/io}/data/wikipedia_states.html (100%) rename pandas/{io/tests => tests/io}/generate_legacy_storage_files.py (100%) rename pandas/{io/tests => tests/io}/json/__init__.py (100%) rename pandas/{io/tests => tests/io}/json/data/tsframe_iso_v012.json (100%) rename pandas/{io/tests => tests/io}/json/data/tsframe_v012.json (100%) rename pandas/{io/tests => tests/io}/json/test_normalize.py 
(100%) rename pandas/{io/tests => tests/io}/json/test_pandas.py (100%) rename pandas/{io/tests => tests/io}/json/test_ujson.py (100%) rename pandas/{io/tests => tests/io}/parser/__init__.py (100%) rename pandas/{io/tests => tests/io}/parser/c_parser_only.py (100%) rename pandas/{io/tests => tests/io}/parser/comment.py (100%) rename pandas/{io/tests => tests/io}/parser/common.py (100%) rename pandas/{io/tests => tests/io}/parser/compression.py (100%) rename pandas/{io/tests => tests/io}/parser/converters.py (100%) rename pandas/{io/tests => tests/io}/parser/data/iris.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.bz2 (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.gz (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.xz (100%) rename pandas/{io/tests => tests/io}/parser/data/salaries.csv.zip (100%) rename pandas/{io/tests => tests/io}/parser/data/sauron.SHIFT_JIS.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/test1.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/test1.csv.bz2 (100%) rename pandas/{io/tests => tests/io}/parser/data/test1.csv.gz (100%) rename pandas/{io/tests => tests/io}/parser/data/test2.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/test_mmap.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/tips.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/unicode_series.csv (100%) rename pandas/{io/tests => tests/io}/parser/data/utf16_ex.txt (100%) rename pandas/{io/tests => tests/io}/parser/dialect.py (100%) rename pandas/{io/tests => tests/io}/parser/dtypes.py (100%) rename pandas/{io/tests => tests/io}/parser/header.py (100%) rename pandas/{io/tests => tests/io}/parser/index_col.py (100%) rename pandas/{io/tests => tests/io}/parser/multithread.py (100%) rename pandas/{io/tests => tests/io}/parser/na_values.py (100%) rename pandas/{io/tests => tests/io}/parser/parse_dates.py (100%) rename pandas/{io/tests => tests/io}/parser/python_parser_only.py (100%) rename pandas/{io/tests => tests/io}/parser/quoting.py (100%) rename pandas/{io/tests => tests/io}/parser/skiprows.py (100%) rename pandas/{io/tests => tests/io}/parser/test_network.py (100%) rename pandas/{io/tests => tests/io}/parser/test_parsers.py (100%) rename pandas/{io/tests => tests/io}/parser/test_read_fwf.py (100%) rename pandas/{io/tests => tests/io}/parser/test_textreader.py (100%) rename pandas/{io/tests => tests/io}/parser/test_unsupported.py (100%) rename pandas/{io/tests => tests/io}/parser/usecols.py (100%) rename pandas/{io/tests => tests/io}/sas/data/DEMO_G.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/DEMO_G.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/DRXFCD_G.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/DRXFCD_G.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/SSHSV1_A.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/SSHSV1_A.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/airline.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/airline.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/paxraw_d_short.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/paxraw_d_short.xpt (100%) rename pandas/{io/tests => tests/io}/sas/data/productsales.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/productsales.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test1.sas7bdat (100%) rename pandas/{io/tests => 
tests/io}/sas/data/test10.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test11.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test12.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test13.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test14.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test15.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test16.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test2.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test3.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test4.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test5.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test6.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test7.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test8.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test9.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test_12659.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/test_12659.sas7bdat (100%) rename pandas/{io/tests => tests/io}/sas/data/test_sas7bdat_1.csv (100%) rename pandas/{io/tests => tests/io}/sas/data/test_sas7bdat_2.csv (100%) rename pandas/{io/tests => tests/io}/sas/test_sas.py (100%) rename pandas/{io/tests => tests/io}/sas/test_sas7bdat.py (100%) rename pandas/{io/tests => tests/io}/sas/test_xport.py (100%) rename pandas/{io/tests => tests/io}/test_clipboard.py (100%) rename pandas/{io/tests => tests/io}/test_common.py (100%) rename pandas/{io/tests => tests/io}/test_date_converters.py (100%) rename pandas/{io/tests => tests/io}/test_excel.py (100%) rename pandas/{io/tests => tests/io}/test_feather.py (100%) rename pandas/{io/tests => tests/io}/test_gbq.py (100%) rename pandas/{io/tests => tests/io}/test_html.py (100%) rename pandas/{io/tests => tests/io}/test_packers.py (99%) rename pandas/{io/tests => tests/io}/test_pickle.py (99%) rename pandas/{io/tests => tests/io}/test_pytables.py (100%) rename pandas/{io/tests => tests/io}/test_s3.py (100%) rename pandas/{io/tests => tests/io}/test_sql.py (100%) rename pandas/{io/tests => tests/io}/test_stata.py (100%) rename pandas/{sparse/tests => tests/msgpack}/__init__.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_buffer.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_case.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_except.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_extension.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_format.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_limits.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_newspec.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_obj.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_pack.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_read_size.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_seq.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_sequnpack.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_subtype.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_unpack.py (100%) rename pandas/tests/{test_msgpack => msgpack}/test_unpack_raw.py (100%) rename pandas/tests/{test_msgpack => sparse}/__init__.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_arithmetics.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_array.py (100%) rename 
pandas/{sparse/tests => tests/sparse}/test_combine_concat.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_format.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_frame.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_groupby.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_indexing.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_libsparse.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_list.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_pivot.py (100%) rename pandas/{sparse/tests => tests/sparse}/test_series.py (100%) rename pandas/{tools/tests => tests/tools}/__init__.py (100%) rename pandas/{tools/tests => tests/tools}/data/allow_exact_matches.csv (100%) rename pandas/{tools/tests => tests/tools}/data/allow_exact_matches_and_tolerance.csv (100%) rename pandas/{tools/tests => tests/tools}/data/asof.csv (100%) rename pandas/{tools/tests => tests/tools}/data/asof2.csv (100%) rename pandas/{tools/tests => tests/tools}/data/cut_data.csv (100%) rename pandas/{tools/tests => tests/tools}/data/quotes.csv (100%) rename pandas/{tools/tests => tests/tools}/data/quotes2.csv (100%) rename pandas/{tools/tests => tests/tools}/data/tolerance.csv (100%) rename pandas/{tools/tests => tests/tools}/data/trades.csv (100%) rename pandas/{tools/tests => tests/tools}/data/trades2.csv (100%) rename pandas/{tools/tests => tests/tools}/test_concat.py (100%) rename pandas/{tools/tests => tests/tools}/test_hashing.py (100%) rename pandas/{tools/tests => tests/tools}/test_join.py (99%) rename pandas/{tools/tests => tests/tools}/test_merge.py (100%) rename pandas/{tools/tests => tests/tools}/test_merge_asof.py (100%) rename pandas/{tools/tests => tests/tools}/test_merge_ordered.py (100%) rename pandas/{tools/tests => tests/tools}/test_pivot.py (100%) rename pandas/{tools/tests => tests/tools}/test_tile.py (100%) rename pandas/{tools/tests => tests/tools}/test_util.py (100%) diff --git a/pandas/api/tests/__init__.py b/pandas/tests/api/__init__.py similarity index 100% rename from pandas/api/tests/__init__.py rename to pandas/tests/api/__init__.py diff --git a/pandas/api/tests/test_api.py b/pandas/tests/api/test_api.py similarity index 99% rename from pandas/api/tests/test_api.py rename to pandas/tests/api/test_api.py index 05cf5dc4b7e7b..90a0c1d5c9347 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/tests/api/test_api.py @@ -133,7 +133,7 @@ def test_api(self): class TestApi(Base, tm.TestCase): - allowed = ['tests', 'types'] + allowed = ['types'] def test_api(self): diff --git a/pandas/computation/tests/__init__.py b/pandas/tests/computation/__init__.py similarity index 100% rename from pandas/computation/tests/__init__.py rename to pandas/tests/computation/__init__.py diff --git a/pandas/computation/tests/test_compat.py b/pandas/tests/computation/test_compat.py similarity index 100% rename from pandas/computation/tests/test_compat.py rename to pandas/tests/computation/test_compat.py diff --git a/pandas/computation/tests/test_eval.py b/pandas/tests/computation/test_eval.py similarity index 100% rename from pandas/computation/tests/test_eval.py rename to pandas/tests/computation/test_eval.py diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 63bf07ec041d3..9a968a42c4247 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -1245,7 +1245,8 @@ def test_shift(self): self.assertEqual(shifted[0], self.rng[0]) 
self.assertEqual(shifted.offset, self.rng.offset) - with tm.assert_produces_warning(PerformanceWarning): + # PerformanceWarning + with warnings.catch_warnings(record=True): rng = date_range(START, END, freq=BMonthEnd()) shifted = rng.shift(1, freq=CDay()) self.assertEqual(shifted[0], rng[0] + CDay()) diff --git a/pandas/io/tests/__init__.py b/pandas/tests/io/__init__.py similarity index 100% rename from pandas/io/tests/__init__.py rename to pandas/tests/io/__init__.py diff --git a/pandas/io/tests/data/S4_EDUC1.dta b/pandas/tests/io/data/S4_EDUC1.dta similarity index 100% rename from pandas/io/tests/data/S4_EDUC1.dta rename to pandas/tests/io/data/S4_EDUC1.dta diff --git a/pandas/io/tests/data/banklist.csv b/pandas/tests/io/data/banklist.csv similarity index 100% rename from pandas/io/tests/data/banklist.csv rename to pandas/tests/io/data/banklist.csv diff --git a/pandas/io/tests/data/banklist.html b/pandas/tests/io/data/banklist.html similarity index 100% rename from pandas/io/tests/data/banklist.html rename to pandas/tests/io/data/banklist.html diff --git a/pandas/io/tests/data/blank.xls b/pandas/tests/io/data/blank.xls old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/blank.xls rename to pandas/tests/io/data/blank.xls diff --git a/pandas/io/tests/data/blank.xlsm b/pandas/tests/io/data/blank.xlsm old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/blank.xlsm rename to pandas/tests/io/data/blank.xlsm diff --git a/pandas/io/tests/data/blank.xlsx b/pandas/tests/io/data/blank.xlsx old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/blank.xlsx rename to pandas/tests/io/data/blank.xlsx diff --git a/pandas/io/tests/data/blank_with_header.xls b/pandas/tests/io/data/blank_with_header.xls old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/blank_with_header.xls rename to pandas/tests/io/data/blank_with_header.xls diff --git a/pandas/io/tests/data/blank_with_header.xlsm b/pandas/tests/io/data/blank_with_header.xlsm old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/blank_with_header.xlsm rename to pandas/tests/io/data/blank_with_header.xlsm diff --git a/pandas/io/tests/data/blank_with_header.xlsx b/pandas/tests/io/data/blank_with_header.xlsx old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/blank_with_header.xlsx rename to pandas/tests/io/data/blank_with_header.xlsx diff --git a/pandas/io/tests/data/categorical_0_14_1.pickle b/pandas/tests/io/data/categorical_0_14_1.pickle similarity index 100% rename from pandas/io/tests/data/categorical_0_14_1.pickle rename to pandas/tests/io/data/categorical_0_14_1.pickle diff --git a/pandas/io/tests/data/categorical_0_15_2.pickle b/pandas/tests/io/data/categorical_0_15_2.pickle similarity index 100% rename from pandas/io/tests/data/categorical_0_15_2.pickle rename to pandas/tests/io/data/categorical_0_15_2.pickle diff --git a/pandas/io/tests/data/computer_sales_page.html b/pandas/tests/io/data/computer_sales_page.html similarity index 100% rename from pandas/io/tests/data/computer_sales_page.html rename to pandas/tests/io/data/computer_sales_page.html diff --git a/pandas/io/tests/data/gbq_fake_job.txt b/pandas/tests/io/data/gbq_fake_job.txt similarity index 100% rename from pandas/io/tests/data/gbq_fake_job.txt rename to pandas/tests/io/data/gbq_fake_job.txt diff --git a/pandas/io/tests/data/html_encoding/chinese_utf-16.html 
b/pandas/tests/io/data/html_encoding/chinese_utf-16.html similarity index 100% rename from pandas/io/tests/data/html_encoding/chinese_utf-16.html rename to pandas/tests/io/data/html_encoding/chinese_utf-16.html diff --git a/pandas/io/tests/data/html_encoding/chinese_utf-32.html b/pandas/tests/io/data/html_encoding/chinese_utf-32.html similarity index 100% rename from pandas/io/tests/data/html_encoding/chinese_utf-32.html rename to pandas/tests/io/data/html_encoding/chinese_utf-32.html diff --git a/pandas/io/tests/data/html_encoding/chinese_utf-8.html b/pandas/tests/io/data/html_encoding/chinese_utf-8.html similarity index 100% rename from pandas/io/tests/data/html_encoding/chinese_utf-8.html rename to pandas/tests/io/data/html_encoding/chinese_utf-8.html diff --git a/pandas/io/tests/data/html_encoding/letz_latin1.html b/pandas/tests/io/data/html_encoding/letz_latin1.html similarity index 100% rename from pandas/io/tests/data/html_encoding/letz_latin1.html rename to pandas/tests/io/data/html_encoding/letz_latin1.html diff --git a/pandas/io/tests/data/iris.csv b/pandas/tests/io/data/iris.csv similarity index 100% rename from pandas/io/tests/data/iris.csv rename to pandas/tests/io/data/iris.csv diff --git a/pandas/io/tests/data/legacy_hdf/datetimetz_object.h5 b/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/datetimetz_object.h5 rename to pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 diff --git a/pandas/io/tests/data/legacy_hdf/legacy.h5 b/pandas/tests/io/data/legacy_hdf/legacy.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/legacy.h5 rename to pandas/tests/io/data/legacy_hdf/legacy.h5 diff --git a/pandas/io/tests/data/legacy_hdf/legacy_0.10.h5 b/pandas/tests/io/data/legacy_hdf/legacy_0.10.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/legacy_0.10.h5 rename to pandas/tests/io/data/legacy_hdf/legacy_0.10.h5 diff --git a/pandas/io/tests/data/legacy_hdf/legacy_table.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/legacy_table.h5 rename to pandas/tests/io/data/legacy_hdf/legacy_table.h5 diff --git a/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 rename to pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5 diff --git a/pandas/io/tests/data/legacy_hdf/pytables_native.h5 b/pandas/tests/io/data/legacy_hdf/pytables_native.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/pytables_native.h5 rename to pandas/tests/io/data/legacy_hdf/pytables_native.h5 diff --git a/pandas/io/tests/data/legacy_hdf/pytables_native2.h5 b/pandas/tests/io/data/legacy_hdf/pytables_native2.h5 similarity index 100% rename from pandas/io/tests/data/legacy_hdf/pytables_native2.h5 rename to pandas/tests/io/data/legacy_hdf/pytables_native2.h5 diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack 
similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack diff 
--git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack similarity index 100% rename from 
pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack similarity index 100% rename from pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack rename to pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle 
b/pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle rename to pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle rename to pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle 
b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle rename to pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle b/pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle rename to pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle rename to pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle diff --git 
a/pandas/io/tests/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle rename to pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle rename to pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle rename to 
pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle similarity index 100% 
rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle diff --git a/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle b/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle similarity index 100% rename from pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle rename to pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle diff --git a/pandas/io/tests/data/macau.html b/pandas/tests/io/data/macau.html similarity index 100% rename from pandas/io/tests/data/macau.html rename to pandas/tests/io/data/macau.html diff --git a/pandas/io/tests/data/nyse_wsj.html b/pandas/tests/io/data/nyse_wsj.html similarity index 100% rename from pandas/io/tests/data/nyse_wsj.html rename to pandas/tests/io/data/nyse_wsj.html diff --git a/pandas/io/tests/data/spam.html b/pandas/tests/io/data/spam.html similarity index 100% rename from pandas/io/tests/data/spam.html rename to pandas/tests/io/data/spam.html diff --git a/pandas/io/tests/data/stata10_115.dta b/pandas/tests/io/data/stata10_115.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata10_115.dta rename to pandas/tests/io/data/stata10_115.dta diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/tests/io/data/stata10_117.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata10_117.dta rename to pandas/tests/io/data/stata10_117.dta diff --git a/pandas/io/tests/data/stata11_115.dta b/pandas/tests/io/data/stata11_115.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata11_115.dta rename to pandas/tests/io/data/stata11_115.dta diff --git a/pandas/io/tests/data/stata11_117.dta b/pandas/tests/io/data/stata11_117.dta old mode 100755 new mode 100644 similarity index 100% rename from pandas/io/tests/data/stata11_117.dta rename to pandas/tests/io/data/stata11_117.dta diff --git a/pandas/io/tests/data/stata12_117.dta b/pandas/tests/io/data/stata12_117.dta similarity index 100% rename from pandas/io/tests/data/stata12_117.dta rename to pandas/tests/io/data/stata12_117.dta diff --git a/pandas/io/tests/data/stata14_118.dta b/pandas/tests/io/data/stata14_118.dta similarity index 100% rename from pandas/io/tests/data/stata14_118.dta rename to 
pandas/tests/io/data/stata14_118.dta diff --git a/pandas/io/tests/data/stata15.dta b/pandas/tests/io/data/stata15.dta similarity index 100% rename from pandas/io/tests/data/stata15.dta rename to pandas/tests/io/data/stata15.dta diff --git a/pandas/io/tests/data/stata1_114.dta b/pandas/tests/io/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1_114.dta rename to pandas/tests/io/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_117.dta b/pandas/tests/io/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_117.dta rename to pandas/tests/io/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata1_encoding.dta b/pandas/tests/io/data/stata1_encoding.dta similarity index 100% rename from pandas/io/tests/data/stata1_encoding.dta rename to pandas/tests/io/data/stata1_encoding.dta diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/tests/io/data/stata2_113.dta similarity index 100% rename from pandas/io/tests/data/stata2_113.dta rename to pandas/tests/io/data/stata2_113.dta diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/tests/io/data/stata2_114.dta similarity index 100% rename from pandas/io/tests/data/stata2_114.dta rename to pandas/tests/io/data/stata2_114.dta diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/tests/io/data/stata2_115.dta similarity index 100% rename from pandas/io/tests/data/stata2_115.dta rename to pandas/tests/io/data/stata2_115.dta diff --git a/pandas/io/tests/data/stata2_117.dta b/pandas/tests/io/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_117.dta rename to pandas/tests/io/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3.csv b/pandas/tests/io/data/stata3.csv similarity index 100% rename from pandas/io/tests/data/stata3.csv rename to pandas/tests/io/data/stata3.csv diff --git a/pandas/io/tests/data/stata3_113.dta b/pandas/tests/io/data/stata3_113.dta similarity index 100% rename from pandas/io/tests/data/stata3_113.dta rename to pandas/tests/io/data/stata3_113.dta diff --git a/pandas/io/tests/data/stata3_114.dta b/pandas/tests/io/data/stata3_114.dta similarity index 100% rename from pandas/io/tests/data/stata3_114.dta rename to pandas/tests/io/data/stata3_114.dta diff --git a/pandas/io/tests/data/stata3_115.dta b/pandas/tests/io/data/stata3_115.dta similarity index 100% rename from pandas/io/tests/data/stata3_115.dta rename to pandas/tests/io/data/stata3_115.dta diff --git a/pandas/io/tests/data/stata3_117.dta b/pandas/tests/io/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_117.dta rename to pandas/tests/io/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4_113.dta b/pandas/tests/io/data/stata4_113.dta similarity index 100% rename from pandas/io/tests/data/stata4_113.dta rename to pandas/tests/io/data/stata4_113.dta diff --git a/pandas/io/tests/data/stata4_114.dta b/pandas/tests/io/data/stata4_114.dta similarity index 100% rename from pandas/io/tests/data/stata4_114.dta rename to pandas/tests/io/data/stata4_114.dta diff --git a/pandas/io/tests/data/stata4_115.dta b/pandas/tests/io/data/stata4_115.dta similarity index 100% rename from pandas/io/tests/data/stata4_115.dta rename to pandas/tests/io/data/stata4_115.dta diff --git a/pandas/io/tests/data/stata4_117.dta b/pandas/tests/io/data/stata4_117.dta similarity index 100% rename from pandas/io/tests/data/stata4_117.dta rename to pandas/tests/io/data/stata4_117.dta diff --git a/pandas/io/tests/data/stata5.csv 
b/pandas/tests/io/data/stata5.csv similarity index 100% rename from pandas/io/tests/data/stata5.csv rename to pandas/tests/io/data/stata5.csv diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/tests/io/data/stata5_113.dta similarity index 100% rename from pandas/io/tests/data/stata5_113.dta rename to pandas/tests/io/data/stata5_113.dta diff --git a/pandas/io/tests/data/stata5_114.dta b/pandas/tests/io/data/stata5_114.dta similarity index 100% rename from pandas/io/tests/data/stata5_114.dta rename to pandas/tests/io/data/stata5_114.dta diff --git a/pandas/io/tests/data/stata5_115.dta b/pandas/tests/io/data/stata5_115.dta similarity index 100% rename from pandas/io/tests/data/stata5_115.dta rename to pandas/tests/io/data/stata5_115.dta diff --git a/pandas/io/tests/data/stata5_117.dta b/pandas/tests/io/data/stata5_117.dta similarity index 100% rename from pandas/io/tests/data/stata5_117.dta rename to pandas/tests/io/data/stata5_117.dta diff --git a/pandas/io/tests/data/stata6.csv b/pandas/tests/io/data/stata6.csv similarity index 100% rename from pandas/io/tests/data/stata6.csv rename to pandas/tests/io/data/stata6.csv diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/tests/io/data/stata6_113.dta similarity index 100% rename from pandas/io/tests/data/stata6_113.dta rename to pandas/tests/io/data/stata6_113.dta diff --git a/pandas/io/tests/data/stata6_114.dta b/pandas/tests/io/data/stata6_114.dta similarity index 100% rename from pandas/io/tests/data/stata6_114.dta rename to pandas/tests/io/data/stata6_114.dta diff --git a/pandas/io/tests/data/stata6_115.dta b/pandas/tests/io/data/stata6_115.dta similarity index 100% rename from pandas/io/tests/data/stata6_115.dta rename to pandas/tests/io/data/stata6_115.dta diff --git a/pandas/io/tests/data/stata6_117.dta b/pandas/tests/io/data/stata6_117.dta similarity index 100% rename from pandas/io/tests/data/stata6_117.dta rename to pandas/tests/io/data/stata6_117.dta diff --git a/pandas/io/tests/data/stata7_111.dta b/pandas/tests/io/data/stata7_111.dta similarity index 100% rename from pandas/io/tests/data/stata7_111.dta rename to pandas/tests/io/data/stata7_111.dta diff --git a/pandas/io/tests/data/stata7_115.dta b/pandas/tests/io/data/stata7_115.dta similarity index 100% rename from pandas/io/tests/data/stata7_115.dta rename to pandas/tests/io/data/stata7_115.dta diff --git a/pandas/io/tests/data/stata7_117.dta b/pandas/tests/io/data/stata7_117.dta similarity index 100% rename from pandas/io/tests/data/stata7_117.dta rename to pandas/tests/io/data/stata7_117.dta diff --git a/pandas/io/tests/data/stata8_113.dta b/pandas/tests/io/data/stata8_113.dta similarity index 100% rename from pandas/io/tests/data/stata8_113.dta rename to pandas/tests/io/data/stata8_113.dta diff --git a/pandas/io/tests/data/stata8_115.dta b/pandas/tests/io/data/stata8_115.dta similarity index 100% rename from pandas/io/tests/data/stata8_115.dta rename to pandas/tests/io/data/stata8_115.dta diff --git a/pandas/io/tests/data/stata8_117.dta b/pandas/tests/io/data/stata8_117.dta similarity index 100% rename from pandas/io/tests/data/stata8_117.dta rename to pandas/tests/io/data/stata8_117.dta diff --git a/pandas/io/tests/data/stata9_115.dta b/pandas/tests/io/data/stata9_115.dta similarity index 100% rename from pandas/io/tests/data/stata9_115.dta rename to pandas/tests/io/data/stata9_115.dta diff --git a/pandas/io/tests/data/stata9_117.dta b/pandas/tests/io/data/stata9_117.dta similarity index 100% rename from pandas/io/tests/data/stata9_117.dta rename to 
pandas/tests/io/data/stata9_117.dta diff --git a/pandas/io/tests/data/test1.csv b/pandas/tests/io/data/test1.csv similarity index 100% rename from pandas/io/tests/data/test1.csv rename to pandas/tests/io/data/test1.csv diff --git a/pandas/io/tests/data/test1.xls b/pandas/tests/io/data/test1.xls similarity index 100% rename from pandas/io/tests/data/test1.xls rename to pandas/tests/io/data/test1.xls diff --git a/pandas/io/tests/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm similarity index 100% rename from pandas/io/tests/data/test1.xlsm rename to pandas/tests/io/data/test1.xlsm diff --git a/pandas/io/tests/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx similarity index 100% rename from pandas/io/tests/data/test1.xlsx rename to pandas/tests/io/data/test1.xlsx diff --git a/pandas/io/tests/data/test2.xls b/pandas/tests/io/data/test2.xls similarity index 100% rename from pandas/io/tests/data/test2.xls rename to pandas/tests/io/data/test2.xls diff --git a/pandas/io/tests/data/test2.xlsm b/pandas/tests/io/data/test2.xlsm similarity index 100% rename from pandas/io/tests/data/test2.xlsm rename to pandas/tests/io/data/test2.xlsm diff --git a/pandas/io/tests/data/test2.xlsx b/pandas/tests/io/data/test2.xlsx similarity index 100% rename from pandas/io/tests/data/test2.xlsx rename to pandas/tests/io/data/test2.xlsx diff --git a/pandas/io/tests/data/test3.xls b/pandas/tests/io/data/test3.xls similarity index 100% rename from pandas/io/tests/data/test3.xls rename to pandas/tests/io/data/test3.xls diff --git a/pandas/io/tests/data/test3.xlsm b/pandas/tests/io/data/test3.xlsm similarity index 100% rename from pandas/io/tests/data/test3.xlsm rename to pandas/tests/io/data/test3.xlsm diff --git a/pandas/io/tests/data/test3.xlsx b/pandas/tests/io/data/test3.xlsx similarity index 100% rename from pandas/io/tests/data/test3.xlsx rename to pandas/tests/io/data/test3.xlsx diff --git a/pandas/io/tests/data/test4.xls b/pandas/tests/io/data/test4.xls similarity index 100% rename from pandas/io/tests/data/test4.xls rename to pandas/tests/io/data/test4.xls diff --git a/pandas/io/tests/data/test4.xlsm b/pandas/tests/io/data/test4.xlsm similarity index 100% rename from pandas/io/tests/data/test4.xlsm rename to pandas/tests/io/data/test4.xlsm diff --git a/pandas/io/tests/data/test4.xlsx b/pandas/tests/io/data/test4.xlsx similarity index 100% rename from pandas/io/tests/data/test4.xlsx rename to pandas/tests/io/data/test4.xlsx diff --git a/pandas/io/tests/data/test5.xls b/pandas/tests/io/data/test5.xls similarity index 100% rename from pandas/io/tests/data/test5.xls rename to pandas/tests/io/data/test5.xls diff --git a/pandas/io/tests/data/test5.xlsm b/pandas/tests/io/data/test5.xlsm similarity index 100% rename from pandas/io/tests/data/test5.xlsm rename to pandas/tests/io/data/test5.xlsm diff --git a/pandas/io/tests/data/test5.xlsx b/pandas/tests/io/data/test5.xlsx similarity index 100% rename from pandas/io/tests/data/test5.xlsx rename to pandas/tests/io/data/test5.xlsx diff --git a/pandas/io/tests/data/test_converters.xls b/pandas/tests/io/data/test_converters.xls similarity index 100% rename from pandas/io/tests/data/test_converters.xls rename to pandas/tests/io/data/test_converters.xls diff --git a/pandas/io/tests/data/test_converters.xlsm b/pandas/tests/io/data/test_converters.xlsm similarity index 100% rename from pandas/io/tests/data/test_converters.xlsm rename to pandas/tests/io/data/test_converters.xlsm diff --git a/pandas/io/tests/data/test_converters.xlsx b/pandas/tests/io/data/test_converters.xlsx 
similarity index 100% rename from pandas/io/tests/data/test_converters.xlsx rename to pandas/tests/io/data/test_converters.xlsx diff --git a/pandas/io/tests/data/test_index_name_pre17.xls b/pandas/tests/io/data/test_index_name_pre17.xls similarity index 100% rename from pandas/io/tests/data/test_index_name_pre17.xls rename to pandas/tests/io/data/test_index_name_pre17.xls diff --git a/pandas/io/tests/data/test_index_name_pre17.xlsm b/pandas/tests/io/data/test_index_name_pre17.xlsm similarity index 100% rename from pandas/io/tests/data/test_index_name_pre17.xlsm rename to pandas/tests/io/data/test_index_name_pre17.xlsm diff --git a/pandas/io/tests/data/test_index_name_pre17.xlsx b/pandas/tests/io/data/test_index_name_pre17.xlsx similarity index 100% rename from pandas/io/tests/data/test_index_name_pre17.xlsx rename to pandas/tests/io/data/test_index_name_pre17.xlsx diff --git a/pandas/io/tests/data/test_mmap.csv b/pandas/tests/io/data/test_mmap.csv similarity index 100% rename from pandas/io/tests/data/test_mmap.csv rename to pandas/tests/io/data/test_mmap.csv diff --git a/pandas/io/tests/data/test_multisheet.xls b/pandas/tests/io/data/test_multisheet.xls similarity index 100% rename from pandas/io/tests/data/test_multisheet.xls rename to pandas/tests/io/data/test_multisheet.xls diff --git a/pandas/io/tests/data/test_multisheet.xlsm b/pandas/tests/io/data/test_multisheet.xlsm similarity index 100% rename from pandas/io/tests/data/test_multisheet.xlsm rename to pandas/tests/io/data/test_multisheet.xlsm diff --git a/pandas/io/tests/data/test_multisheet.xlsx b/pandas/tests/io/data/test_multisheet.xlsx similarity index 100% rename from pandas/io/tests/data/test_multisheet.xlsx rename to pandas/tests/io/data/test_multisheet.xlsx diff --git a/pandas/io/tests/data/test_squeeze.xls b/pandas/tests/io/data/test_squeeze.xls similarity index 100% rename from pandas/io/tests/data/test_squeeze.xls rename to pandas/tests/io/data/test_squeeze.xls diff --git a/pandas/io/tests/data/test_squeeze.xlsm b/pandas/tests/io/data/test_squeeze.xlsm similarity index 100% rename from pandas/io/tests/data/test_squeeze.xlsm rename to pandas/tests/io/data/test_squeeze.xlsm diff --git a/pandas/io/tests/data/test_squeeze.xlsx b/pandas/tests/io/data/test_squeeze.xlsx similarity index 100% rename from pandas/io/tests/data/test_squeeze.xlsx rename to pandas/tests/io/data/test_squeeze.xlsx diff --git a/pandas/io/tests/data/test_types.xls b/pandas/tests/io/data/test_types.xls similarity index 100% rename from pandas/io/tests/data/test_types.xls rename to pandas/tests/io/data/test_types.xls diff --git a/pandas/io/tests/data/test_types.xlsm b/pandas/tests/io/data/test_types.xlsm similarity index 100% rename from pandas/io/tests/data/test_types.xlsm rename to pandas/tests/io/data/test_types.xlsm diff --git a/pandas/io/tests/data/test_types.xlsx b/pandas/tests/io/data/test_types.xlsx similarity index 100% rename from pandas/io/tests/data/test_types.xlsx rename to pandas/tests/io/data/test_types.xlsx diff --git a/pandas/io/tests/data/testdateoverflow.xls b/pandas/tests/io/data/testdateoverflow.xls similarity index 100% rename from pandas/io/tests/data/testdateoverflow.xls rename to pandas/tests/io/data/testdateoverflow.xls diff --git a/pandas/io/tests/data/testdateoverflow.xlsm b/pandas/tests/io/data/testdateoverflow.xlsm similarity index 100% rename from pandas/io/tests/data/testdateoverflow.xlsm rename to pandas/tests/io/data/testdateoverflow.xlsm diff --git a/pandas/io/tests/data/testdateoverflow.xlsx 
b/pandas/tests/io/data/testdateoverflow.xlsx similarity index 100% rename from pandas/io/tests/data/testdateoverflow.xlsx rename to pandas/tests/io/data/testdateoverflow.xlsx diff --git a/pandas/io/tests/data/testdtype.xls b/pandas/tests/io/data/testdtype.xls similarity index 100% rename from pandas/io/tests/data/testdtype.xls rename to pandas/tests/io/data/testdtype.xls diff --git a/pandas/io/tests/data/testdtype.xlsm b/pandas/tests/io/data/testdtype.xlsm similarity index 100% rename from pandas/io/tests/data/testdtype.xlsm rename to pandas/tests/io/data/testdtype.xlsm diff --git a/pandas/io/tests/data/testdtype.xlsx b/pandas/tests/io/data/testdtype.xlsx similarity index 100% rename from pandas/io/tests/data/testdtype.xlsx rename to pandas/tests/io/data/testdtype.xlsx diff --git a/pandas/io/tests/data/testmultiindex.xls b/pandas/tests/io/data/testmultiindex.xls similarity index 100% rename from pandas/io/tests/data/testmultiindex.xls rename to pandas/tests/io/data/testmultiindex.xls diff --git a/pandas/io/tests/data/testmultiindex.xlsm b/pandas/tests/io/data/testmultiindex.xlsm similarity index 100% rename from pandas/io/tests/data/testmultiindex.xlsm rename to pandas/tests/io/data/testmultiindex.xlsm diff --git a/pandas/io/tests/data/testmultiindex.xlsx b/pandas/tests/io/data/testmultiindex.xlsx similarity index 100% rename from pandas/io/tests/data/testmultiindex.xlsx rename to pandas/tests/io/data/testmultiindex.xlsx diff --git a/pandas/io/tests/data/testskiprows.xls b/pandas/tests/io/data/testskiprows.xls similarity index 100% rename from pandas/io/tests/data/testskiprows.xls rename to pandas/tests/io/data/testskiprows.xls diff --git a/pandas/io/tests/data/testskiprows.xlsm b/pandas/tests/io/data/testskiprows.xlsm similarity index 100% rename from pandas/io/tests/data/testskiprows.xlsm rename to pandas/tests/io/data/testskiprows.xlsm diff --git a/pandas/io/tests/data/testskiprows.xlsx b/pandas/tests/io/data/testskiprows.xlsx similarity index 100% rename from pandas/io/tests/data/testskiprows.xlsx rename to pandas/tests/io/data/testskiprows.xlsx diff --git a/pandas/io/tests/data/times_1900.xls b/pandas/tests/io/data/times_1900.xls similarity index 100% rename from pandas/io/tests/data/times_1900.xls rename to pandas/tests/io/data/times_1900.xls diff --git a/pandas/io/tests/data/times_1900.xlsm b/pandas/tests/io/data/times_1900.xlsm similarity index 100% rename from pandas/io/tests/data/times_1900.xlsm rename to pandas/tests/io/data/times_1900.xlsm diff --git a/pandas/io/tests/data/times_1900.xlsx b/pandas/tests/io/data/times_1900.xlsx similarity index 100% rename from pandas/io/tests/data/times_1900.xlsx rename to pandas/tests/io/data/times_1900.xlsx diff --git a/pandas/io/tests/data/times_1904.xls b/pandas/tests/io/data/times_1904.xls similarity index 100% rename from pandas/io/tests/data/times_1904.xls rename to pandas/tests/io/data/times_1904.xls diff --git a/pandas/io/tests/data/times_1904.xlsm b/pandas/tests/io/data/times_1904.xlsm similarity index 100% rename from pandas/io/tests/data/times_1904.xlsm rename to pandas/tests/io/data/times_1904.xlsm diff --git a/pandas/io/tests/data/times_1904.xlsx b/pandas/tests/io/data/times_1904.xlsx similarity index 100% rename from pandas/io/tests/data/times_1904.xlsx rename to pandas/tests/io/data/times_1904.xlsx diff --git a/pandas/io/tests/data/tips.csv b/pandas/tests/io/data/tips.csv similarity index 100% rename from pandas/io/tests/data/tips.csv rename to pandas/tests/io/data/tips.csv diff --git a/pandas/io/tests/data/valid_markup.html 
b/pandas/tests/io/data/valid_markup.html similarity index 100% rename from pandas/io/tests/data/valid_markup.html rename to pandas/tests/io/data/valid_markup.html diff --git a/pandas/io/tests/data/wikipedia_states.html b/pandas/tests/io/data/wikipedia_states.html similarity index 100% rename from pandas/io/tests/data/wikipedia_states.html rename to pandas/tests/io/data/wikipedia_states.html diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py similarity index 100% rename from pandas/io/tests/generate_legacy_storage_files.py rename to pandas/tests/io/generate_legacy_storage_files.py diff --git a/pandas/io/tests/json/__init__.py b/pandas/tests/io/json/__init__.py similarity index 100% rename from pandas/io/tests/json/__init__.py rename to pandas/tests/io/json/__init__.py diff --git a/pandas/io/tests/json/data/tsframe_iso_v012.json b/pandas/tests/io/json/data/tsframe_iso_v012.json similarity index 100% rename from pandas/io/tests/json/data/tsframe_iso_v012.json rename to pandas/tests/io/json/data/tsframe_iso_v012.json diff --git a/pandas/io/tests/json/data/tsframe_v012.json b/pandas/tests/io/json/data/tsframe_v012.json similarity index 100% rename from pandas/io/tests/json/data/tsframe_v012.json rename to pandas/tests/io/json/data/tsframe_v012.json diff --git a/pandas/io/tests/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py similarity index 100% rename from pandas/io/tests/json/test_normalize.py rename to pandas/tests/io/json/test_normalize.py diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py similarity index 100% rename from pandas/io/tests/json/test_pandas.py rename to pandas/tests/io/json/test_pandas.py diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py similarity index 100% rename from pandas/io/tests/json/test_ujson.py rename to pandas/tests/io/json/test_ujson.py diff --git a/pandas/io/tests/parser/__init__.py b/pandas/tests/io/parser/__init__.py similarity index 100% rename from pandas/io/tests/parser/__init__.py rename to pandas/tests/io/parser/__init__.py diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py similarity index 100% rename from pandas/io/tests/parser/c_parser_only.py rename to pandas/tests/io/parser/c_parser_only.py diff --git a/pandas/io/tests/parser/comment.py b/pandas/tests/io/parser/comment.py similarity index 100% rename from pandas/io/tests/parser/comment.py rename to pandas/tests/io/parser/comment.py diff --git a/pandas/io/tests/parser/common.py b/pandas/tests/io/parser/common.py similarity index 100% rename from pandas/io/tests/parser/common.py rename to pandas/tests/io/parser/common.py diff --git a/pandas/io/tests/parser/compression.py b/pandas/tests/io/parser/compression.py similarity index 100% rename from pandas/io/tests/parser/compression.py rename to pandas/tests/io/parser/compression.py diff --git a/pandas/io/tests/parser/converters.py b/pandas/tests/io/parser/converters.py similarity index 100% rename from pandas/io/tests/parser/converters.py rename to pandas/tests/io/parser/converters.py diff --git a/pandas/io/tests/parser/data/iris.csv b/pandas/tests/io/parser/data/iris.csv similarity index 100% rename from pandas/io/tests/parser/data/iris.csv rename to pandas/tests/io/parser/data/iris.csv diff --git a/pandas/io/tests/parser/data/salaries.csv b/pandas/tests/io/parser/data/salaries.csv similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv rename to 
pandas/tests/io/parser/data/salaries.csv diff --git a/pandas/io/tests/parser/data/salaries.csv.bz2 b/pandas/tests/io/parser/data/salaries.csv.bz2 similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.bz2 rename to pandas/tests/io/parser/data/salaries.csv.bz2 diff --git a/pandas/io/tests/parser/data/salaries.csv.gz b/pandas/tests/io/parser/data/salaries.csv.gz similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.gz rename to pandas/tests/io/parser/data/salaries.csv.gz diff --git a/pandas/io/tests/parser/data/salaries.csv.xz b/pandas/tests/io/parser/data/salaries.csv.xz similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.xz rename to pandas/tests/io/parser/data/salaries.csv.xz diff --git a/pandas/io/tests/parser/data/salaries.csv.zip b/pandas/tests/io/parser/data/salaries.csv.zip similarity index 100% rename from pandas/io/tests/parser/data/salaries.csv.zip rename to pandas/tests/io/parser/data/salaries.csv.zip diff --git a/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv b/pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv similarity index 100% rename from pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv rename to pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv diff --git a/pandas/io/tests/parser/data/test1.csv b/pandas/tests/io/parser/data/test1.csv similarity index 100% rename from pandas/io/tests/parser/data/test1.csv rename to pandas/tests/io/parser/data/test1.csv diff --git a/pandas/io/tests/parser/data/test1.csv.bz2 b/pandas/tests/io/parser/data/test1.csv.bz2 similarity index 100% rename from pandas/io/tests/parser/data/test1.csv.bz2 rename to pandas/tests/io/parser/data/test1.csv.bz2 diff --git a/pandas/io/tests/parser/data/test1.csv.gz b/pandas/tests/io/parser/data/test1.csv.gz similarity index 100% rename from pandas/io/tests/parser/data/test1.csv.gz rename to pandas/tests/io/parser/data/test1.csv.gz diff --git a/pandas/io/tests/parser/data/test2.csv b/pandas/tests/io/parser/data/test2.csv similarity index 100% rename from pandas/io/tests/parser/data/test2.csv rename to pandas/tests/io/parser/data/test2.csv diff --git a/pandas/io/tests/parser/data/test_mmap.csv b/pandas/tests/io/parser/data/test_mmap.csv similarity index 100% rename from pandas/io/tests/parser/data/test_mmap.csv rename to pandas/tests/io/parser/data/test_mmap.csv diff --git a/pandas/io/tests/parser/data/tips.csv b/pandas/tests/io/parser/data/tips.csv similarity index 100% rename from pandas/io/tests/parser/data/tips.csv rename to pandas/tests/io/parser/data/tips.csv diff --git a/pandas/io/tests/parser/data/unicode_series.csv b/pandas/tests/io/parser/data/unicode_series.csv similarity index 100% rename from pandas/io/tests/parser/data/unicode_series.csv rename to pandas/tests/io/parser/data/unicode_series.csv diff --git a/pandas/io/tests/parser/data/utf16_ex.txt b/pandas/tests/io/parser/data/utf16_ex.txt similarity index 100% rename from pandas/io/tests/parser/data/utf16_ex.txt rename to pandas/tests/io/parser/data/utf16_ex.txt diff --git a/pandas/io/tests/parser/dialect.py b/pandas/tests/io/parser/dialect.py similarity index 100% rename from pandas/io/tests/parser/dialect.py rename to pandas/tests/io/parser/dialect.py diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py similarity index 100% rename from pandas/io/tests/parser/dtypes.py rename to pandas/tests/io/parser/dtypes.py diff --git a/pandas/io/tests/parser/header.py b/pandas/tests/io/parser/header.py similarity index 100% rename from pandas/io/tests/parser/header.py 
rename to pandas/tests/io/parser/header.py diff --git a/pandas/io/tests/parser/index_col.py b/pandas/tests/io/parser/index_col.py similarity index 100% rename from pandas/io/tests/parser/index_col.py rename to pandas/tests/io/parser/index_col.py diff --git a/pandas/io/tests/parser/multithread.py b/pandas/tests/io/parser/multithread.py similarity index 100% rename from pandas/io/tests/parser/multithread.py rename to pandas/tests/io/parser/multithread.py diff --git a/pandas/io/tests/parser/na_values.py b/pandas/tests/io/parser/na_values.py similarity index 100% rename from pandas/io/tests/parser/na_values.py rename to pandas/tests/io/parser/na_values.py diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py similarity index 100% rename from pandas/io/tests/parser/parse_dates.py rename to pandas/tests/io/parser/parse_dates.py diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py similarity index 100% rename from pandas/io/tests/parser/python_parser_only.py rename to pandas/tests/io/parser/python_parser_only.py diff --git a/pandas/io/tests/parser/quoting.py b/pandas/tests/io/parser/quoting.py similarity index 100% rename from pandas/io/tests/parser/quoting.py rename to pandas/tests/io/parser/quoting.py diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/tests/io/parser/skiprows.py similarity index 100% rename from pandas/io/tests/parser/skiprows.py rename to pandas/tests/io/parser/skiprows.py diff --git a/pandas/io/tests/parser/test_network.py b/pandas/tests/io/parser/test_network.py similarity index 100% rename from pandas/io/tests/parser/test_network.py rename to pandas/tests/io/parser/test_network.py diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py similarity index 100% rename from pandas/io/tests/parser/test_parsers.py rename to pandas/tests/io/parser/test_parsers.py diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py similarity index 100% rename from pandas/io/tests/parser/test_read_fwf.py rename to pandas/tests/io/parser/test_read_fwf.py diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py similarity index 100% rename from pandas/io/tests/parser/test_textreader.py rename to pandas/tests/io/parser/test_textreader.py diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py similarity index 100% rename from pandas/io/tests/parser/test_unsupported.py rename to pandas/tests/io/parser/test_unsupported.py diff --git a/pandas/io/tests/parser/usecols.py b/pandas/tests/io/parser/usecols.py similarity index 100% rename from pandas/io/tests/parser/usecols.py rename to pandas/tests/io/parser/usecols.py diff --git a/pandas/io/tests/sas/data/DEMO_G.csv b/pandas/tests/io/sas/data/DEMO_G.csv similarity index 100% rename from pandas/io/tests/sas/data/DEMO_G.csv rename to pandas/tests/io/sas/data/DEMO_G.csv diff --git a/pandas/io/tests/sas/data/DEMO_G.xpt b/pandas/tests/io/sas/data/DEMO_G.xpt similarity index 100% rename from pandas/io/tests/sas/data/DEMO_G.xpt rename to pandas/tests/io/sas/data/DEMO_G.xpt diff --git a/pandas/io/tests/sas/data/DRXFCD_G.csv b/pandas/tests/io/sas/data/DRXFCD_G.csv similarity index 100% rename from pandas/io/tests/sas/data/DRXFCD_G.csv rename to pandas/tests/io/sas/data/DRXFCD_G.csv diff --git a/pandas/io/tests/sas/data/DRXFCD_G.xpt b/pandas/tests/io/sas/data/DRXFCD_G.xpt similarity index 100% rename from 
pandas/io/tests/sas/data/DRXFCD_G.xpt rename to pandas/tests/io/sas/data/DRXFCD_G.xpt diff --git a/pandas/io/tests/sas/data/SSHSV1_A.csv b/pandas/tests/io/sas/data/SSHSV1_A.csv similarity index 100% rename from pandas/io/tests/sas/data/SSHSV1_A.csv rename to pandas/tests/io/sas/data/SSHSV1_A.csv diff --git a/pandas/io/tests/sas/data/SSHSV1_A.xpt b/pandas/tests/io/sas/data/SSHSV1_A.xpt similarity index 100% rename from pandas/io/tests/sas/data/SSHSV1_A.xpt rename to pandas/tests/io/sas/data/SSHSV1_A.xpt diff --git a/pandas/io/tests/sas/data/airline.csv b/pandas/tests/io/sas/data/airline.csv similarity index 100% rename from pandas/io/tests/sas/data/airline.csv rename to pandas/tests/io/sas/data/airline.csv diff --git a/pandas/io/tests/sas/data/airline.sas7bdat b/pandas/tests/io/sas/data/airline.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/airline.sas7bdat rename to pandas/tests/io/sas/data/airline.sas7bdat diff --git a/pandas/io/tests/sas/data/paxraw_d_short.csv b/pandas/tests/io/sas/data/paxraw_d_short.csv similarity index 100% rename from pandas/io/tests/sas/data/paxraw_d_short.csv rename to pandas/tests/io/sas/data/paxraw_d_short.csv diff --git a/pandas/io/tests/sas/data/paxraw_d_short.xpt b/pandas/tests/io/sas/data/paxraw_d_short.xpt similarity index 100% rename from pandas/io/tests/sas/data/paxraw_d_short.xpt rename to pandas/tests/io/sas/data/paxraw_d_short.xpt diff --git a/pandas/io/tests/sas/data/productsales.csv b/pandas/tests/io/sas/data/productsales.csv similarity index 100% rename from pandas/io/tests/sas/data/productsales.csv rename to pandas/tests/io/sas/data/productsales.csv diff --git a/pandas/io/tests/sas/data/productsales.sas7bdat b/pandas/tests/io/sas/data/productsales.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/productsales.sas7bdat rename to pandas/tests/io/sas/data/productsales.sas7bdat diff --git a/pandas/io/tests/sas/data/test1.sas7bdat b/pandas/tests/io/sas/data/test1.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test1.sas7bdat rename to pandas/tests/io/sas/data/test1.sas7bdat diff --git a/pandas/io/tests/sas/data/test10.sas7bdat b/pandas/tests/io/sas/data/test10.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test10.sas7bdat rename to pandas/tests/io/sas/data/test10.sas7bdat diff --git a/pandas/io/tests/sas/data/test11.sas7bdat b/pandas/tests/io/sas/data/test11.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test11.sas7bdat rename to pandas/tests/io/sas/data/test11.sas7bdat diff --git a/pandas/io/tests/sas/data/test12.sas7bdat b/pandas/tests/io/sas/data/test12.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test12.sas7bdat rename to pandas/tests/io/sas/data/test12.sas7bdat diff --git a/pandas/io/tests/sas/data/test13.sas7bdat b/pandas/tests/io/sas/data/test13.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test13.sas7bdat rename to pandas/tests/io/sas/data/test13.sas7bdat diff --git a/pandas/io/tests/sas/data/test14.sas7bdat b/pandas/tests/io/sas/data/test14.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test14.sas7bdat rename to pandas/tests/io/sas/data/test14.sas7bdat diff --git a/pandas/io/tests/sas/data/test15.sas7bdat b/pandas/tests/io/sas/data/test15.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test15.sas7bdat rename to pandas/tests/io/sas/data/test15.sas7bdat diff --git a/pandas/io/tests/sas/data/test16.sas7bdat b/pandas/tests/io/sas/data/test16.sas7bdat similarity 
index 100% rename from pandas/io/tests/sas/data/test16.sas7bdat rename to pandas/tests/io/sas/data/test16.sas7bdat diff --git a/pandas/io/tests/sas/data/test2.sas7bdat b/pandas/tests/io/sas/data/test2.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test2.sas7bdat rename to pandas/tests/io/sas/data/test2.sas7bdat diff --git a/pandas/io/tests/sas/data/test3.sas7bdat b/pandas/tests/io/sas/data/test3.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test3.sas7bdat rename to pandas/tests/io/sas/data/test3.sas7bdat diff --git a/pandas/io/tests/sas/data/test4.sas7bdat b/pandas/tests/io/sas/data/test4.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test4.sas7bdat rename to pandas/tests/io/sas/data/test4.sas7bdat diff --git a/pandas/io/tests/sas/data/test5.sas7bdat b/pandas/tests/io/sas/data/test5.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test5.sas7bdat rename to pandas/tests/io/sas/data/test5.sas7bdat diff --git a/pandas/io/tests/sas/data/test6.sas7bdat b/pandas/tests/io/sas/data/test6.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test6.sas7bdat rename to pandas/tests/io/sas/data/test6.sas7bdat diff --git a/pandas/io/tests/sas/data/test7.sas7bdat b/pandas/tests/io/sas/data/test7.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test7.sas7bdat rename to pandas/tests/io/sas/data/test7.sas7bdat diff --git a/pandas/io/tests/sas/data/test8.sas7bdat b/pandas/tests/io/sas/data/test8.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test8.sas7bdat rename to pandas/tests/io/sas/data/test8.sas7bdat diff --git a/pandas/io/tests/sas/data/test9.sas7bdat b/pandas/tests/io/sas/data/test9.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test9.sas7bdat rename to pandas/tests/io/sas/data/test9.sas7bdat diff --git a/pandas/io/tests/sas/data/test_12659.csv b/pandas/tests/io/sas/data/test_12659.csv similarity index 100% rename from pandas/io/tests/sas/data/test_12659.csv rename to pandas/tests/io/sas/data/test_12659.csv diff --git a/pandas/io/tests/sas/data/test_12659.sas7bdat b/pandas/tests/io/sas/data/test_12659.sas7bdat similarity index 100% rename from pandas/io/tests/sas/data/test_12659.sas7bdat rename to pandas/tests/io/sas/data/test_12659.sas7bdat diff --git a/pandas/io/tests/sas/data/test_sas7bdat_1.csv b/pandas/tests/io/sas/data/test_sas7bdat_1.csv similarity index 100% rename from pandas/io/tests/sas/data/test_sas7bdat_1.csv rename to pandas/tests/io/sas/data/test_sas7bdat_1.csv diff --git a/pandas/io/tests/sas/data/test_sas7bdat_2.csv b/pandas/tests/io/sas/data/test_sas7bdat_2.csv similarity index 100% rename from pandas/io/tests/sas/data/test_sas7bdat_2.csv rename to pandas/tests/io/sas/data/test_sas7bdat_2.csv diff --git a/pandas/io/tests/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py similarity index 100% rename from pandas/io/tests/sas/test_sas.py rename to pandas/tests/io/sas/test_sas.py diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py similarity index 100% rename from pandas/io/tests/sas/test_sas7bdat.py rename to pandas/tests/io/sas/test_sas7bdat.py diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py similarity index 100% rename from pandas/io/tests/sas/test_xport.py rename to pandas/tests/io/sas/test_xport.py diff --git a/pandas/io/tests/test_clipboard.py b/pandas/tests/io/test_clipboard.py similarity index 100% rename from pandas/io/tests/test_clipboard.py rename to 
pandas/tests/io/test_clipboard.py diff --git a/pandas/io/tests/test_common.py b/pandas/tests/io/test_common.py similarity index 100% rename from pandas/io/tests/test_common.py rename to pandas/tests/io/test_common.py diff --git a/pandas/io/tests/test_date_converters.py b/pandas/tests/io/test_date_converters.py similarity index 100% rename from pandas/io/tests/test_date_converters.py rename to pandas/tests/io/test_date_converters.py diff --git a/pandas/io/tests/test_excel.py b/pandas/tests/io/test_excel.py similarity index 100% rename from pandas/io/tests/test_excel.py rename to pandas/tests/io/test_excel.py diff --git a/pandas/io/tests/test_feather.py b/pandas/tests/io/test_feather.py similarity index 100% rename from pandas/io/tests/test_feather.py rename to pandas/tests/io/test_feather.py diff --git a/pandas/io/tests/test_gbq.py b/pandas/tests/io/test_gbq.py similarity index 100% rename from pandas/io/tests/test_gbq.py rename to pandas/tests/io/test_gbq.py diff --git a/pandas/io/tests/test_html.py b/pandas/tests/io/test_html.py similarity index 100% rename from pandas/io/tests/test_html.py rename to pandas/tests/io/test_html.py diff --git a/pandas/io/tests/test_packers.py b/pandas/tests/io/test_packers.py similarity index 99% rename from pandas/io/tests/test_packers.py rename to pandas/tests/io/test_packers.py index 4bb6f4a69bab3..911cd8164571d 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -795,7 +795,7 @@ class TestMsgpack(): @classmethod def setup_class(cls): - from pandas.io.tests.generate_legacy_storage_files import ( + from pandas.tests.io.generate_legacy_storage_files import ( create_msgpack_data, create_data) cls.data = create_msgpack_data() cls.all_data = create_data() diff --git a/pandas/io/tests/test_pickle.py b/pandas/tests/io/test_pickle.py similarity index 99% rename from pandas/io/tests/test_pickle.py rename to pandas/tests/io/test_pickle.py index 588b2d5f04888..5445c506b050c 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -33,7 +33,7 @@ class TestPickle(): @classmethod def setup_class(cls): - from pandas.io.tests.generate_legacy_storage_files import ( + from pandas.tests.io.generate_legacy_storage_files import ( create_pickle_data) cls.data = create_pickle_data() cls.path = u('__%s__.pickle' % tm.rands(10)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/tests/io/test_pytables.py similarity index 100% rename from pandas/io/tests/test_pytables.py rename to pandas/tests/io/test_pytables.py diff --git a/pandas/io/tests/test_s3.py b/pandas/tests/io/test_s3.py similarity index 100% rename from pandas/io/tests/test_s3.py rename to pandas/tests/io/test_s3.py diff --git a/pandas/io/tests/test_sql.py b/pandas/tests/io/test_sql.py similarity index 100% rename from pandas/io/tests/test_sql.py rename to pandas/tests/io/test_sql.py diff --git a/pandas/io/tests/test_stata.py b/pandas/tests/io/test_stata.py similarity index 100% rename from pandas/io/tests/test_stata.py rename to pandas/tests/io/test_stata.py diff --git a/pandas/sparse/tests/__init__.py b/pandas/tests/msgpack/__init__.py similarity index 100% rename from pandas/sparse/tests/__init__.py rename to pandas/tests/msgpack/__init__.py diff --git a/pandas/tests/test_msgpack/test_buffer.py b/pandas/tests/msgpack/test_buffer.py similarity index 100% rename from pandas/tests/test_msgpack/test_buffer.py rename to pandas/tests/msgpack/test_buffer.py diff --git a/pandas/tests/test_msgpack/test_case.py b/pandas/tests/msgpack/test_case.py similarity index 
100% rename from pandas/tests/test_msgpack/test_case.py rename to pandas/tests/msgpack/test_case.py diff --git a/pandas/tests/test_msgpack/test_except.py b/pandas/tests/msgpack/test_except.py similarity index 100% rename from pandas/tests/test_msgpack/test_except.py rename to pandas/tests/msgpack/test_except.py diff --git a/pandas/tests/test_msgpack/test_extension.py b/pandas/tests/msgpack/test_extension.py similarity index 100% rename from pandas/tests/test_msgpack/test_extension.py rename to pandas/tests/msgpack/test_extension.py diff --git a/pandas/tests/test_msgpack/test_format.py b/pandas/tests/msgpack/test_format.py similarity index 100% rename from pandas/tests/test_msgpack/test_format.py rename to pandas/tests/msgpack/test_format.py diff --git a/pandas/tests/test_msgpack/test_limits.py b/pandas/tests/msgpack/test_limits.py similarity index 100% rename from pandas/tests/test_msgpack/test_limits.py rename to pandas/tests/msgpack/test_limits.py diff --git a/pandas/tests/test_msgpack/test_newspec.py b/pandas/tests/msgpack/test_newspec.py similarity index 100% rename from pandas/tests/test_msgpack/test_newspec.py rename to pandas/tests/msgpack/test_newspec.py diff --git a/pandas/tests/test_msgpack/test_obj.py b/pandas/tests/msgpack/test_obj.py similarity index 100% rename from pandas/tests/test_msgpack/test_obj.py rename to pandas/tests/msgpack/test_obj.py diff --git a/pandas/tests/test_msgpack/test_pack.py b/pandas/tests/msgpack/test_pack.py similarity index 100% rename from pandas/tests/test_msgpack/test_pack.py rename to pandas/tests/msgpack/test_pack.py diff --git a/pandas/tests/test_msgpack/test_read_size.py b/pandas/tests/msgpack/test_read_size.py similarity index 100% rename from pandas/tests/test_msgpack/test_read_size.py rename to pandas/tests/msgpack/test_read_size.py diff --git a/pandas/tests/test_msgpack/test_seq.py b/pandas/tests/msgpack/test_seq.py similarity index 100% rename from pandas/tests/test_msgpack/test_seq.py rename to pandas/tests/msgpack/test_seq.py diff --git a/pandas/tests/test_msgpack/test_sequnpack.py b/pandas/tests/msgpack/test_sequnpack.py similarity index 100% rename from pandas/tests/test_msgpack/test_sequnpack.py rename to pandas/tests/msgpack/test_sequnpack.py diff --git a/pandas/tests/test_msgpack/test_subtype.py b/pandas/tests/msgpack/test_subtype.py similarity index 100% rename from pandas/tests/test_msgpack/test_subtype.py rename to pandas/tests/msgpack/test_subtype.py diff --git a/pandas/tests/test_msgpack/test_unpack.py b/pandas/tests/msgpack/test_unpack.py similarity index 100% rename from pandas/tests/test_msgpack/test_unpack.py rename to pandas/tests/msgpack/test_unpack.py diff --git a/pandas/tests/test_msgpack/test_unpack_raw.py b/pandas/tests/msgpack/test_unpack_raw.py similarity index 100% rename from pandas/tests/test_msgpack/test_unpack_raw.py rename to pandas/tests/msgpack/test_unpack_raw.py diff --git a/pandas/tests/test_msgpack/__init__.py b/pandas/tests/sparse/__init__.py similarity index 100% rename from pandas/tests/test_msgpack/__init__.py rename to pandas/tests/sparse/__init__.py diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/tests/sparse/test_arithmetics.py similarity index 100% rename from pandas/sparse/tests/test_arithmetics.py rename to pandas/tests/sparse/test_arithmetics.py diff --git a/pandas/sparse/tests/test_array.py b/pandas/tests/sparse/test_array.py similarity index 100% rename from pandas/sparse/tests/test_array.py rename to pandas/tests/sparse/test_array.py diff --git 
a/pandas/sparse/tests/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py similarity index 100% rename from pandas/sparse/tests/test_combine_concat.py rename to pandas/tests/sparse/test_combine_concat.py diff --git a/pandas/sparse/tests/test_format.py b/pandas/tests/sparse/test_format.py similarity index 100% rename from pandas/sparse/tests/test_format.py rename to pandas/tests/sparse/test_format.py diff --git a/pandas/sparse/tests/test_frame.py b/pandas/tests/sparse/test_frame.py similarity index 100% rename from pandas/sparse/tests/test_frame.py rename to pandas/tests/sparse/test_frame.py diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/tests/sparse/test_groupby.py similarity index 100% rename from pandas/sparse/tests/test_groupby.py rename to pandas/tests/sparse/test_groupby.py diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/tests/sparse/test_indexing.py similarity index 100% rename from pandas/sparse/tests/test_indexing.py rename to pandas/tests/sparse/test_indexing.py diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/tests/sparse/test_libsparse.py similarity index 100% rename from pandas/sparse/tests/test_libsparse.py rename to pandas/tests/sparse/test_libsparse.py diff --git a/pandas/sparse/tests/test_list.py b/pandas/tests/sparse/test_list.py similarity index 100% rename from pandas/sparse/tests/test_list.py rename to pandas/tests/sparse/test_list.py diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/tests/sparse/test_pivot.py similarity index 100% rename from pandas/sparse/tests/test_pivot.py rename to pandas/tests/sparse/test_pivot.py diff --git a/pandas/sparse/tests/test_series.py b/pandas/tests/sparse/test_series.py similarity index 100% rename from pandas/sparse/tests/test_series.py rename to pandas/tests/sparse/test_series.py diff --git a/pandas/tools/tests/__init__.py b/pandas/tests/tools/__init__.py similarity index 100% rename from pandas/tools/tests/__init__.py rename to pandas/tests/tools/__init__.py diff --git a/pandas/tools/tests/data/allow_exact_matches.csv b/pandas/tests/tools/data/allow_exact_matches.csv similarity index 100% rename from pandas/tools/tests/data/allow_exact_matches.csv rename to pandas/tests/tools/data/allow_exact_matches.csv diff --git a/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv b/pandas/tests/tools/data/allow_exact_matches_and_tolerance.csv similarity index 100% rename from pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv rename to pandas/tests/tools/data/allow_exact_matches_and_tolerance.csv diff --git a/pandas/tools/tests/data/asof.csv b/pandas/tests/tools/data/asof.csv similarity index 100% rename from pandas/tools/tests/data/asof.csv rename to pandas/tests/tools/data/asof.csv diff --git a/pandas/tools/tests/data/asof2.csv b/pandas/tests/tools/data/asof2.csv similarity index 100% rename from pandas/tools/tests/data/asof2.csv rename to pandas/tests/tools/data/asof2.csv diff --git a/pandas/tools/tests/data/cut_data.csv b/pandas/tests/tools/data/cut_data.csv similarity index 100% rename from pandas/tools/tests/data/cut_data.csv rename to pandas/tests/tools/data/cut_data.csv diff --git a/pandas/tools/tests/data/quotes.csv b/pandas/tests/tools/data/quotes.csv similarity index 100% rename from pandas/tools/tests/data/quotes.csv rename to pandas/tests/tools/data/quotes.csv diff --git a/pandas/tools/tests/data/quotes2.csv b/pandas/tests/tools/data/quotes2.csv similarity index 100% rename from pandas/tools/tests/data/quotes2.csv rename to 
pandas/tests/tools/data/quotes2.csv diff --git a/pandas/tools/tests/data/tolerance.csv b/pandas/tests/tools/data/tolerance.csv similarity index 100% rename from pandas/tools/tests/data/tolerance.csv rename to pandas/tests/tools/data/tolerance.csv diff --git a/pandas/tools/tests/data/trades.csv b/pandas/tests/tools/data/trades.csv similarity index 100% rename from pandas/tools/tests/data/trades.csv rename to pandas/tests/tools/data/trades.csv diff --git a/pandas/tools/tests/data/trades2.csv b/pandas/tests/tools/data/trades2.csv similarity index 100% rename from pandas/tools/tests/data/trades2.csv rename to pandas/tests/tools/data/trades2.csv diff --git a/pandas/tools/tests/test_concat.py b/pandas/tests/tools/test_concat.py similarity index 100% rename from pandas/tools/tests/test_concat.py rename to pandas/tests/tools/test_concat.py diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tests/tools/test_hashing.py similarity index 100% rename from pandas/tools/tests/test_hashing.py rename to pandas/tests/tools/test_hashing.py diff --git a/pandas/tools/tests/test_join.py b/pandas/tests/tools/test_join.py similarity index 99% rename from pandas/tools/tests/test_join.py rename to pandas/tests/tools/test_join.py index fe5821a637205..ab42b1212301b 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tests/tools/test_join.py @@ -11,7 +11,7 @@ import pandas._join as _join import pandas.util.testing as tm -from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS +from pandas.tests.tools.test_merge import get_test_data, N, NGROUPS a_ = np.array diff --git a/pandas/tools/tests/test_merge.py b/pandas/tests/tools/test_merge.py similarity index 100% rename from pandas/tools/tests/test_merge.py rename to pandas/tests/tools/test_merge.py diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tests/tools/test_merge_asof.py similarity index 100% rename from pandas/tools/tests/test_merge_asof.py rename to pandas/tests/tools/test_merge_asof.py diff --git a/pandas/tools/tests/test_merge_ordered.py b/pandas/tests/tools/test_merge_ordered.py similarity index 100% rename from pandas/tools/tests/test_merge_ordered.py rename to pandas/tests/tools/test_merge_ordered.py diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tests/tools/test_pivot.py similarity index 100% rename from pandas/tools/tests/test_pivot.py rename to pandas/tests/tools/test_pivot.py diff --git a/pandas/tools/tests/test_tile.py b/pandas/tests/tools/test_tile.py similarity index 100% rename from pandas/tools/tests/test_tile.py rename to pandas/tests/tools/test_tile.py diff --git a/pandas/tools/tests/test_util.py b/pandas/tests/tools/test_util.py similarity index 100% rename from pandas/tools/tests/test_util.py rename to pandas/tests/tools/test_util.py diff --git a/setup.py b/setup.py index edec53e9cefb0..cbcadce459c67 100755 --- a/setup.py +++ b/setup.py @@ -622,12 +622,10 @@ def pxd(name): version=versioneer.get_version(), packages=['pandas', 'pandas.api', - 'pandas.api.tests', 'pandas.api.types', 'pandas.compat', 'pandas.compat.numpy', 'pandas.computation', - 'pandas.computation.tests', 'pandas.core', 'pandas.indexes', 'pandas.io', @@ -635,59 +633,61 @@ def pxd(name): 'pandas.io.sas', 'pandas.formats', 'pandas.sparse', - 'pandas.sparse.tests', 'pandas.stats', 'pandas.util', 'pandas.tests', + 'pandas.tests.api', + 'pandas.tests.computation', 'pandas.tests.frame', 'pandas.tests.indexes', 'pandas.tests.indexes.datetimes', 'pandas.tests.indexes.timedeltas', 'pandas.tests.indexes.period', + 'pandas.tests.io', + 
'pandas.tests.io.json', + 'pandas.tests.io.parser', + 'pandas.tests.io.sas', 'pandas.tests.groupby', 'pandas.tests.series', 'pandas.tests.formats', + 'pandas.tests.msgpack', 'pandas.tests.scalar', + 'pandas.tests.sparse', 'pandas.tests.tseries', + 'pandas.tests.tools', 'pandas.tests.types', - 'pandas.tests.test_msgpack', 'pandas.tests.plotting', 'pandas.tools', - 'pandas.tools.tests', 'pandas.tseries', 'pandas.types', - 'pandas.io.tests', - 'pandas.io.tests.json', - 'pandas.io.tests.parser', - 'pandas.io.tests.sas', 'pandas.msgpack', 'pandas.util.clipboard' ], - package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', - 'tests/data/legacy_pickle/*/*.pickle', - 'tests/data/legacy_msgpack/*/*.msgpack', - 'tests/data/*.csv*', - 'tests/data/*.dta', - 'tests/data/*.pickle', - 'tests/data/*.txt', - 'tests/data/*.xls', - 'tests/data/*.xlsx', - 'tests/data/*.xlsm', - 'tests/data/*.table', - 'tests/parser/data/*.csv', - 'tests/parser/data/*.gz', - 'tests/parser/data/*.bz2', - 'tests/parser/data/*.txt', - 'tests/sas/data/*.csv', - 'tests/sas/data/*.xpt', - 'tests/sas/data/*.sas7bdat', - 'tests/data/*.html', - 'tests/data/html_encoding/*.html', - 'tests/json/data/*.json'], - 'pandas.tools': ['tests/data/*.csv'], - 'pandas.tests': ['data/*.csv'], + package_data={'pandas.tests': ['data/*.csv'], 'pandas.tests.formats': ['data/*.csv'], 'pandas.tests.indexes': ['data/*.pickle'], + 'pandas.tests.io': ['data/legacy_hdf/*.h5', + 'data/legacy_pickle/*/*.pickle', + 'data/legacy_msgpack/*/*.msgpack', + 'data/*.csv*', + 'data/*.dta', + 'data/*.pickle', + 'data/*.txt', + 'data/*.xls', + 'data/*.xlsx', + 'data/*.xlsm', + 'data/*.table', + 'parser/data/*.csv', + 'parser/data/*.gz', + 'parser/data/*.bz2', + 'parser/data/*.txt', + 'sas/data/*.csv', + 'sas/data/*.xpt', + 'sas/data/*.sas7bdat', + 'data/*.html', + 'data/html_encoding/*.html', + 'json/data/*.json'], + 'pandas.tests.tools': ['data/*.csv'], 'pandas.tests.tseries': ['data/*.pickle'] }, ext_modules=extensions, From 1bcc10da51c61886362d9d4d4eeafe604ab288ea Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 10:09:27 -0500 Subject: [PATCH 05/52] TST: fix locations for github based url tests --- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/test_excel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 0671901fc170a..b667eed346355 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -617,7 +617,7 @@ def test_read_csv_parse_simple_list(self): def test_url(self): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/io/tests/parser/data/salaries.csv') + 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salaries.csv') diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index a22c89184f20d..0c2b443cffe52 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -581,7 +581,7 @@ def test_read_xlrd_Book(self): @tm.network def test_read_from_http_url(self): url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/io/tests/data/test1' + self.ext) + 'pandas/tests/io/data/test1' + self.ext) url_table = read_excel(url) local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) From f87db63d821f9b7bc347c3ed8e0f452859843081 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 10:31:47 -0500 Subject: [PATCH 06/52] 
DOC: fix path in whatsnew --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9f86c777c665d..aa620bce0df59 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -91,7 +91,7 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). url = 'https://github.com/{repo}/raw/{branch}/{path}'.format( repo = 'pandas-dev/pandas', branch = 'master', - path = 'pandas/io/tests/parser/data/salaries.csv.bz2', + path = 'pandas/tests/io/parser/data/salaries.csv.bz2', ) df = pd.read_table(url, compression='infer') # default, infer compression df = pd.read_table(url, compression='bz2') # explicitly specify compression From 1190ac6e19a431a596980c766ec1a3405a7d554a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 11 Feb 2017 16:17:27 -0500 Subject: [PATCH 07/52] TST: use xdist for multiple cpu testing closes #15369 --- .travis.yml | 3 +- ci/script_multi.sh | 32 +++ ci/{script.sh => script_single.sh} | 10 +- pandas/tests/indexes/datetimes/test_ops.py | 244 +++++++++++---------- pandas/tests/io/test_clipboard.py | 1 + pandas/tests/io/test_pytables.py | 7 +- pandas/tests/io/test_sql.py | 19 +- pandas/tests/test_window.py | 83 ++++--- setup.cfg | 2 + test_fast.sh | 3 +- 10 files changed, 223 insertions(+), 181 deletions(-) create mode 100755 ci/script_multi.sh rename ci/{script.sh => script_single.sh} (63%) diff --git a/.travis.yml b/.travis.yml index 2ff5d508d0371..6b90e49b336b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -320,7 +320,8 @@ before_script: script: - echo "script start" - ci/run_build_docs.sh - - ci/script.sh + - ci/script_single.sh + - ci/script_multi.sh - ci/lint.sh - echo "script done" diff --git a/ci/script_multi.sh b/ci/script_multi.sh new file mode 100755 index 0000000000000..83f8427cc57ad --- /dev/null +++ b/ci/script_multi.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +echo "[script multi]" + +source activate pandas + +# don't run the tests for the doc build +if [ x"$DOC_BUILD" != x"" ]; then + exit 0 +fi + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE"; + echo "Setting LC_ALL to $LOCALE_OVERRIDE" + + pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' + python -c "$pycmd" +fi + +if [ "$BUILD_TEST" ]; then + echo "We are not running pytest as this is simply a build test." +elif [ "$COVERAGE" ]; then + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas +else + echo pytest -n 2 -m "not single" $TEST_ARGS pandas + pytest -n 2 -m "not single" $TEST_ARGS pandas # TODO: doctest +fi + +RET="$?" + +exit "$RET" diff --git a/ci/script.sh b/ci/script_single.sh similarity index 63% rename from ci/script.sh rename to ci/script_single.sh index c52fa0fdb33a3..38021fcac5721 100755 --- a/ci/script.sh +++ b/ci/script_single.sh @@ -1,6 +1,6 @@ #!/bin/bash -echo "inside $0" +echo "[script_single]" source activate pandas @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." 
elif [ "$COVERAGE" ]; then - echo pytest -s --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas - pytest -s --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas else - echo pytest $TEST_ARGS pandas - pytest $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" $TEST_ARGS pandas + pytest -m "single" $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 9a968a42c4247..8eb9128d8d1c8 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -1,7 +1,9 @@ +import pytest import warnings import numpy as np from datetime import timedelta +from itertools import product import pandas as pd import pandas.tslib as tslib import pandas.util.testing as tm @@ -958,134 +960,134 @@ def test_second(self): tm.assert_index_equal(r1, r2) -class TestDatetimeIndex(tm.TestCase): - - # GH 10699 - def test_datetime64_with_DateOffset(self): - for klass, assert_func in zip([Series, DatetimeIndex], - [self.assert_series_equal, - tm.assert_index_equal]): - s = klass(date_range('2000-01-01', '2000-01-31'), name='a') - result = s + pd.DateOffset(years=1) - result2 = pd.DateOffset(years=1) + s - exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') +# GH 10699 +@pytest.mark.parametrize('klass,assert_func', zip([Series, DatetimeIndex], + [tm.assert_series_equal, + tm.assert_index_equal])) +def test_datetime64_with_DateOffset(klass, assert_func): + s = klass(date_range('2000-01-01', '2000-01-31'), name='a') + result = s + pd.DateOffset(years=1) + result2 = pd.DateOffset(years=1) + s + exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') + assert_func(result, exp) + assert_func(result2, exp) + + result = s - pd.DateOffset(years=1) + exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') + assert_func(result, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.Day() + result2 = pd.offsets.Day() + s + exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), + Timestamp('2000-02-16', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.MonthEnd() + result2 = pd.offsets.MonthEnd() + s + exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), + Timestamp('2000-02-29', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + # array of offsets - valid for Series only + if klass is Series: + with tm.assert_produces_warning(PerformanceWarning): + s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.MonthEnd()]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') + ]) assert_func(result, exp) - assert_func(result2, exp) - result = s - pd.DateOffset(years=1) - exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') + # same offset + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.DateOffset(years=1)]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) assert_func(result, exp) - s = klass([Timestamp('2000-01-15 00:15:00', 
tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')], name='a') - result = s + pd.offsets.Day() - result2 = pd.offsets.Day() + s - exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), - Timestamp('2000-02-16', tz='US/Central')], name='a') - assert_func(result, exp) - assert_func(result2, exp) - - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')], name='a') - result = s + pd.offsets.MonthEnd() - result2 = pd.offsets.MonthEnd() + s - exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], name='a') - assert_func(result, exp) - assert_func(result2, exp) - - # array of offsets - valid for Series only - if klass is Series: - with tm.assert_produces_warning(PerformanceWarning): - s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) - result = s + Series([pd.offsets.DateOffset(years=1), - pd.offsets.MonthEnd()]) - exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') - ]) - assert_func(result, exp) - - # same offset - result = s + Series([pd.offsets.DateOffset(years=1), - pd.offsets.DateOffset(years=1)]) - exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) - assert_func(result, exp) - - s = klass([Timestamp('2000-01-05 00:15:00'), + s = klass([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-03-31'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31'), + Timestamp('2000-05-15'), + Timestamp('2001-06-15')]) + + # DateOffset relativedelta fastpath + relative_kwargs = [('years', 2), ('months', 5), ('days', 3), + ('hours', 5), ('minutes', 10), ('seconds', 2), + ('microseconds', 5)] + for i, kwd in enumerate(relative_kwargs): + op = pd.DateOffset(**dict([kwd])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + + # assert these are equal on a piecewise basis + offsets = ['YearBegin', ('YearBegin', {'month': 5}), + 'YearEnd', ('YearEnd', {'month': 5}), + 'MonthBegin', 'MonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', + 'Week', ('Week', {'weekday': 3}), + 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', + 'CustomBusinessDay', 'CDay', 'CBMonthEnd', + 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', + 'BusinessHour', 'BYearBegin', 'BYearEnd', + 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}), + ('FY5253Quarter', {'qtr_with_extra_week': 1, + 'startingMonth': 1, + 'weekday': 2, + 'variation': 'nearest'}), + ('FY5253', {'weekday': 0, + 'startingMonth': 2, + 'variation': + 'nearest'}), + ('WeekOfMonth', {'weekday': 2, + 'week': 2}), + 'Easter', ('DateOffset', {'day': 4}), + ('DateOffset', {'month': 5})] + + with warnings.catch_warnings(record=True): + for normalize in (True, False): + for do in offsets: + if isinstance(do, tuple): + do, kwargs = do + else: + do = do + kwargs = {} + + for n in [0, 5]: + if (do in ['WeekOfMonth', 'LastWeekOfMonth', + 'FY5253Quarter', 'FY5253'] and n == 0): + continue + op = getattr(pd.offsets, do)(n, + normalize=normalize, + **kwargs) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + assert_func(klass([op + x for x in s]), op + s) + + +@pytest.mark.parametrize('years,months', product([-1, 0, 1], [-2, 0, 2])) +def test_shift_months(years, months): + s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), 
Timestamp('2000-01-31 00:23:00'), Timestamp('2000-01-01'), - Timestamp('2000-03-31'), Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) - - # DateOffset relativedelta fastpath - relative_kwargs = [('years', 2), ('months', 5), ('days', 3), - ('hours', 5), ('minutes', 10), ('seconds', 2), - ('microseconds', 5)] - for i, kwd in enumerate(relative_kwargs): - op = pd.DateOffset(**dict([kwd])) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - - # assert these are equal on a piecewise basis - offsets = ['YearBegin', ('YearBegin', {'month': 5}), 'YearEnd', - ('YearEnd', {'month': 5}), 'MonthBegin', 'MonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'Week', ('Week', { - 'weekday': 3 - }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', - 'CustomBusinessDay', 'CDay', 'CBMonthEnd', - 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', - 'BusinessHour', 'BYearBegin', 'BYearEnd', - 'BQuarterBegin', ('LastWeekOfMonth', { - 'weekday': 2 - }), ('FY5253Quarter', {'qtr_with_extra_week': 1, - 'startingMonth': 1, - 'weekday': 2, - 'variation': 'nearest'}), - ('FY5253', {'weekday': 0, - 'startingMonth': 2, - 'variation': - 'nearest'}), ('WeekOfMonth', {'weekday': 2, - 'week': 2}), - 'Easter', ('DateOffset', {'day': 4}), - ('DateOffset', {'month': 5})] - - with warnings.catch_warnings(record=True): - for normalize in (True, False): - for do in offsets: - if isinstance(do, tuple): - do, kwargs = do - else: - do = do - kwargs = {} - - for n in [0, 5]: - if (do in ['WeekOfMonth', 'LastWeekOfMonth', - 'FY5253Quarter', 'FY5253'] and n == 0): - continue - op = getattr(pd.offsets, do)(n, - normalize=normalize, - **kwargs) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - assert_func(klass([op + x for x in s]), op + s) - - def test_shift_months(self): - s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp( - '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( - '2000-02-29'), Timestamp('2000-12-31')]) - for years in [-1, 0, 1]: - for months in [-2, 0, 2]: - actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + - months)) - expected = DatetimeIndex([x + offsets.DateOffset( - years=years, months=months) for x in s]) - tm.assert_index_equal(actual, expected) + Timestamp('2000-12-31')]) + actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + + months)) + expected = DatetimeIndex([x + offsets.DateOffset( + years=years, months=months) for x in s]) + tm.assert_index_equal(actual, expected) class TestBusinessDatetimeIndex(tm.TestCase): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3abd1093362f4..2e701143357e3 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -20,6 +20,7 @@ _DEPS_INSTALLED = 0 +@pytest.mark.single @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") class TestClipboard(tm.TestCase): diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 3fa0eb2ef52dc..a840ff46aa845 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -36,12 +36,6 @@ from pandas import concat, Timestamp from pandas import compat from pandas.compat import range, lrange, u - -try: - import tables -except ImportError: - 
pytest.skip('no pytables') - from distutils.version import LooseVersion _default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' @@ -165,6 +159,7 @@ def tearDown(self): pass +@pytest.mark.single class TestHDFStore(Base, tm.TestCase): def test_factory_fun(self): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a6f4d96001021..78560611da7aa 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,13 +18,13 @@ """ from __future__ import print_function +import pytest import unittest import sqlite3 import csv import os import sys -import pytest import warnings import numpy as np import pandas as pd @@ -839,6 +839,7 @@ def test_unicode_column_name(self): df.to_sql('test_unicode', self.conn, index=False) +@pytest.mark.single class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi, unittest.TestCase): """ Test the public API as it would be used directly @@ -1024,10 +1025,12 @@ def tearDown(self): super(_EngineToConnMixin, self).tearDown() +@pytest.mark.single class TestSQLApiConn(_EngineToConnMixin, TestSQLApi, unittest.TestCase): pass +@pytest.mark.single class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi, unittest.TestCase): """ Test the public sqlite connection fallback API @@ -1875,30 +1878,36 @@ def test_schema_support(self): tm.assert_frame_equal(res1, res2) +@pytest.mark.single class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy, unittest.TestCase): pass +@pytest.mark.single class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn, unittest.TestCase): pass +@pytest.mark.single class TestPostgreSQLAlchemy(_TestPostgreSQLAlchemy, _TestSQLAlchemy, unittest.TestCase): pass +@pytest.mark.single class TestPostgreSQLAlchemyConn(_TestPostgreSQLAlchemy, _TestSQLAlchemyConn, unittest.TestCase): pass +@pytest.mark.single class TestSQLiteAlchemy(_TestSQLiteAlchemy, _TestSQLAlchemy, unittest.TestCase): pass +@pytest.mark.single class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn, unittest.TestCase): pass @@ -1907,6 +1916,7 @@ class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn, # ----------------------------------------------------------------------------- # -- Test Sqlite / MySQL fallback +@pytest.mark.single class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest, unittest.TestCase): """ Test the fallback mode against an in-memory sqlite database. 
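Note on the test-splitting scheme this patch applies across the I/O suites: any test that touches a shared external resource (a SQL connection, the system clipboard, HDF5 files, BigQuery) is tagged with the new `single` marker registered in setup.cfg, and everything untagged is fanned out across processes by pytest-xdist. A minimal sketch of how the marker and the two CI invocations fit together (the test bodies below are hypothetical; only the marker name and the command lines come from the patch):

    import pytest

    @pytest.mark.single            # registered in setup.cfg: "single: mark a test as single cpu only"
    def test_touches_shared_database():
        # two xdist workers running this at once would race on the shared resource
        assert True

    def test_pure_computation():   # unmarked, safe to distribute across workers
        assert 1 + 1 == 2

    # ci/script_single.sh runs the serial half:    pytest -m "single" pandas
    # ci/script_multi.sh runs the parallel half:   pytest -n 2 -m "not single" pandas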
@@ -2133,6 +2143,7 @@ def _skip_if_no_pymysql(): pytest.skip('pymysql not installed, skipping') +@pytest.mark.single class TestXSQLite(SQLiteMixIn, tm.TestCase): def setUp(self): @@ -2343,6 +2354,7 @@ def clean_up(test_table_to_drop): clean_up(table_name) +@pytest.mark.single class TestSQLFlavorDeprecation(tm.TestCase): """ gh-13611: test that the 'flavor' parameter @@ -2367,8 +2379,9 @@ def test_deprecated_flavor(self): getattr(sql, func)(self.con, flavor='sqlite') -@unittest.skip("gh-13611: there is no support for MySQL " - "if SQLAlchemy is not installed") +@pytest.mark.single +@pytest.mark.skip(reason="gh-13611: there is no support for MySQL " + "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn, tm.TestCase): @classmethod diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3add568c1ea99..1bb1f91423a9d 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2,6 +2,7 @@ import pytest import sys import warnings +from warnings import catch_warnings from datetime import datetime from numpy.random import randn @@ -291,8 +292,7 @@ def test_how_compat(self): for op in ['mean', 'sum', 'std', 'var', 'kurt', 'skew']: for t in ['rolling', 'expanding']: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): dfunc = getattr(pd, "{0}_{1}".format(t, op)) if dfunc is None: @@ -526,7 +526,7 @@ def setUp(self): def test_deprecations(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): mom.rolling_mean(np.ones(10), 3, center=True, axis=0) mom.rolling_mean(Series(np.ones(10)), 3, center=True, axis=0) @@ -791,7 +791,7 @@ def test_cmov_mean(self): xp = np.array([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, 12.818, 12.952, np.nan, np.nan]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_mean(vals, 5, center=True) tm.assert_almost_equal(xp, rs) @@ -808,7 +808,7 @@ def test_cmov_window(self): xp = np.array([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, 12.818, 12.952, np.nan, np.nan]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) tm.assert_almost_equal(xp, rs) @@ -823,19 +823,19 @@ def test_cmov_window_corner(self): # all nan vals = np.empty(10, dtype=float) vals.fill(np.nan) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) self.assertTrue(np.isnan(rs).all()) # empty vals = np.array([]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) self.assertEqual(len(rs), 0) # shorter than window vals = np.random.randn(5) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): rs = mom.rolling_window(vals, 10, 'boxcar') self.assertTrue(np.isnan(rs).all()) self.assertEqual(len(rs), 5) @@ -1014,16 +1014,16 @@ def test_cmov_window_special_linear_range(self): tm.assert_series_equal(xp, rs) def test_rolling_median(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_moment_func(mom.rolling_median, np.median, name='median') def test_rolling_min(self): - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_moment_func(mom.rolling_min, np.min, name='min') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): a = np.array([1, 2, 3, 4, 5]) b = mom.rolling_min(a, window=100, min_periods=1) tm.assert_almost_equal(b, np.ones(len(a))) @@ -1033,10 +1033,10 @@ def test_rolling_min(self): def test_rolling_max(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_moment_func(mom.rolling_max, np.max, name='max') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): a = np.array([1, 2, 3, 4, 5], dtype=np.float64) b = mom.rolling_max(a, window=100, min_periods=1) tm.assert_almost_equal(a, b) @@ -1102,11 +1102,11 @@ def test_rolling_apply_out_of_bounds(self): arr = np.arange(4) # it works! - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_apply(arr, 10, np.sum) self.assertTrue(isnull(result).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_apply(arr, 10, np.sum, min_periods=1) tm.assert_almost_equal(result, result) @@ -1117,19 +1117,19 @@ def test_rolling_std(self): name='std', ddof=0) def test_rolling_std_1obs(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1) expected = np.array([np.nan] * 5) tm.assert_almost_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1, ddof=0) expected = np.zeros(5) tm.assert_almost_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), 3, min_periods=2) self.assertTrue(np.isnan(result[2])) @@ -1142,11 +1142,11 @@ def test_rolling_std_neg_sqrt(self): a = np.array([0.0011448196318903589, 0.00028718669878572767, 0.00028718669878572767, 0.00028718669878572767, 0.00028718669878572767]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): b = mom.rolling_std(a, window=3) self.assertTrue(np.isfinite(b[2:]).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): b = mom.ewmstd(a, span=3) self.assertTrue(np.isfinite(b[2:]).all()) @@ -1184,25 +1184,25 @@ def test_fperr_robustness(self): if sys.byteorder != "little": arr = arr.byteswap().newbyteorder() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_sum(arr, 2) self.assertTrue((result[1:] >= 0).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_mean(arr, 2) self.assertTrue((result[1:] >= 0).all()) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_var(arr, 2) self.assertTrue((result[1:] >= 0).all()) # #2527, ugh arr = np.array([0.00012456, 0.0003, 0]) - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_mean(arr, 1) self.assertTrue(result[-1] >= 0) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.rolling_mean(-arr, 1) self.assertTrue(result[-1] <= 0) @@ -1327,15 +1327,13 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): # catch a freq deprecation warning if freq is provided and not # None - w = FutureWarning if freq is not None else None - with tm.assert_produces_warning(w, check_stacklevel=False): + with catch_warnings(record=True): r = obj.rolling(window=window, min_periods=min_periods, freq=freq, center=center) return getattr(r, name)(**kwargs) # check via the moments API - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): return f(obj, window=window, min_periods=min_periods, freq=freq, center=center, **kwargs) @@ -1419,7 +1417,7 @@ def test_ewma(self): arr = np.zeros(1000) arr[5] = 1 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): result = mom.ewma(arr, span=100, adjust=False).sum() self.assertTrue(np.abs(result - 1) < 1e-2) @@ -1506,7 +1504,7 @@ def test_ewmvol(self): self._check_ew(mom.ewmvol, name='vol') def test_ewma_span_com_args(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) tm.assert_almost_equal(A, B) @@ -1515,7 +1513,7 @@ def test_ewma_span_com_args(self): self.assertRaises(ValueError, mom.ewma, self.arr) def test_ewma_halflife_arg(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): A = mom.ewma(self.arr, com=13.932726172912965) B = mom.ewma(self.arr, halflife=10.0) tm.assert_almost_equal(A, B) @@ -1530,7 +1528,7 @@ def test_ewma_halflife_arg(self): def test_ewma_alpha_old_api(self): # GH 10789 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): a = mom.ewma(self.arr, alpha=0.61722699889169674) b = mom.ewma(self.arr, com=0.62014947789973052) c = mom.ewma(self.arr, span=2.240298955799461) @@ -1541,7 +1539,7 @@ def test_ewma_alpha_old_api(self): def test_ewma_alpha_arg_old_api(self): # GH 10789 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.assertRaises(ValueError, mom.ewma, self.arr) self.assertRaises(ValueError, mom.ewma, self.arr, com=10.0, alpha=0.5) @@ -1598,13 +1596,12 @@ def test_ew_empty_arrays(self): funcs = [mom.ewma, mom.ewmvol, mom.ewmvar] for f in funcs: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): result = f(arr, 3) tm.assert_almost_equal(result, arr) def _check_ew(self, func, name=None): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self._check_ew_ndarray(func, name=name) self._check_ew_structures(func, name=name) @@ -2870,7 +2867,7 @@ def test_rolling_max_gh6297(self): expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max() tm.assert_series_equal(expected, x) @@ -2889,14 +2886,14 @@ 
def test_rolling_max_how_resample(self): # Default how should be max expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max() tm.assert_series_equal(expected, x) # Now specify median (10.0) expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max(how='median') tm.assert_series_equal(expected, x) @@ -2904,7 +2901,7 @@ def test_rolling_max_how_resample(self): v = (4.0 + 10.0 + 20.0) / 3.0 expected = Series([0.0, 1.0, 2.0, 3.0, v], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').max(how='mean') tm.assert_series_equal(expected, x) @@ -2923,7 +2920,7 @@ def test_rolling_min_how_resample(self): # Default how should be min expected = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): r = series.rolling(window=1, freq='D') tm.assert_series_equal(expected, r.min()) @@ -2942,7 +2939,7 @@ def test_rolling_median_how_resample(self): # Default how should be median expected = Series([0.0, 1.0, 2.0, 3.0, 10], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): x = series.rolling(window=1, freq='D').median() tm.assert_series_equal(expected, x) diff --git a/setup.cfg b/setup.cfg index 45d98dd733f1f..b9de7a3532209 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,3 +25,5 @@ split_penalty_logical_operator = 30 # Silencing the warning until then addopts = --disable-pytest-warnings testpaths = pandas +markers = + single: mark a test as single cpu only diff --git a/test_fast.sh b/test_fast.sh index 0b394cffa3d74..43eb376f879cd 100755 --- a/test_fast.sh +++ b/test_fast.sh @@ -1,2 +1 @@ -# nosetests -A "not slow and not network" pandas --with-id $* -pytest pandas --skip-slow +pytest pandas --skip-slow --skip-network -m "not single" -n 4 From 0915857cc9209548d9c26122e822eaef841c6b24 Mon Sep 17 00:00:00 2001 From: Andrew Kittredge Date: Sun, 12 Feb 2017 12:37:13 -0500 Subject: [PATCH 08/52] Typo (#15377) --- doc/source/advanced.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 8833d73cb0a84..b6f015c15606d 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -59,7 +59,7 @@ Creating a MultiIndex (hierarchical index) object The ``MultiIndex`` object is the hierarchical analogue of the standard ``Index`` object which typically stores the axis labels in pandas objects. You -can think of ``MultiIndex`` an array of tuples where each tuple is unique. A +can think of ``MultiIndex`` as an array of tuples where each tuple is unique. 
A ``MultiIndex`` can be created from a list of arrays (using ``MultiIndex.from_arrays``), an array of tuples (using ``MultiIndex.from_tuples``), or a crossed set of iterables (using From a0f7fc061ca37ab992e320bd3d1b7b130e500469 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 11:46:48 -0500 Subject: [PATCH 09/52] TST: control skipping of numexpr tests if its installed / used --- pandas/tests/test_expressions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 0318757f76a11..3032a288032a2 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -20,9 +20,6 @@ import pandas.util.testing as tm -if not expr._USE_NUMEXPR: - numexpr = pytest.importorskip('numexpr') - _frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') _frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64') _mixed = DataFrame({'A': _frame['A'].copy(), @@ -50,6 +47,7 @@ _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) +@pytest.mark.skipif(not expr._USE_NUMEXPR, reason='not using numexpr') class TestExpressions(tm.TestCase): def setUp(self): From dda3c4292b28d4dbead8bb6ae9927373aea9fe23 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Feb 2017 12:51:11 -0500 Subject: [PATCH 10/52] TST: make test_gbq single cpu --- pandas/tests/io/test_gbq.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 0868edd2147b5..0317ebc49ad2c 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -253,6 +253,7 @@ def test_generate_bq_schema_deprecated(): gbq.generate_bq_schema(df) +@pytest.mark.single class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): @@ -298,6 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) +@pytest.mark.single class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common() @@ -329,6 +331,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) +@pytest.mark.single class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() @@ -360,6 +363,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) +@pytest.mark.single class GBQUnitTests(tm.TestCase): def setUp(self): @@ -446,6 +450,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self): private_key=re.sub('[a-z]', '9', _get_private_key_contents())) +@pytest.mark.single class TestReadGBQIntegration(tm.TestCase): @classmethod @@ -499,6 +504,7 @@ def test_should_read_as_service_account_with_key_contents(self): tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) +@pytest.mark.single class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): @classmethod @@ -901,6 +907,7 @@ def test_configuration_without_query(self): configuration=config) +@pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. 
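Aside on patch 09 above: the numexpr guard moves from a module-level `pytest.importorskip`, which aborts collection of the whole file when the guard fails, to a class-level `skipif`, which still collects the tests and reports each one as skipped with a reason. A sketch of the two styles (the `USE_NUMEXPR` flag is an illustrative stand-in for pandas' `expr._USE_NUMEXPR`):

    import pytest

    USE_NUMEXPR = False  # stand-in flag; the real check reads expr._USE_NUMEXPR

    # Collection-time skip: if numexpr is missing, the whole module vanishes.
    # numexpr = pytest.importorskip('numexpr')

    # Run-time skip: tests are collected, then individually reported as skipped.
    @pytest.mark.skipif(not USE_NUMEXPR, reason='not using numexpr')
    class TestExpressions:
        def test_example(self):
            assert 2 + 2 == 4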
@@ -1215,6 +1222,7 @@ def test_dataset_does_not_exist(self):
                                                 DATASET_ID + "_not_found"),
                         'Expected dataset not to exist')
 
 
+@pytest.mark.single
 class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1272,6 +1280,7 @@ def test_upload_data(self):
         self.assertEqual(result['num_rows'][0], test_size)
 
 
+@pytest.mark.single
 class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.

From 47f7ce325aa141d3c5abbc6ce512afefd25825ac Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Sun, 12 Feb 2017 20:49:08 +0000
Subject: [PATCH 11/52] C level list

---
 pandas/index.pyx       | 152 +++++++++++++++++++++++++++++++++++------
 pandas/indexes/base.py |  29 ++++--
 2 files changed, 155 insertions(+), 26 deletions(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 90ab6f5328df8..527a71684842f 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -3,7 +3,8 @@
 from numpy cimport ndarray
 
 from numpy cimport (float64_t, int32_t, int64_t, uint8_t,
-                    NPY_DATETIME, NPY_TIMEDELTA)
+                    NPY_DATETIME, NPY_TIMEDELTA, PyArray_SimpleNewFromData,
+                    NPY_INT64)
 
 cimport cython
 cimport numpy as cnp
@@ -23,7 +24,7 @@
 from pandas.tslib import Timestamp, Timedelta
 from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
                        pandas_datetimestruct)
 
-from cpython cimport PyTuple_Check, PyList_Check
+from cpython cimport PyTuple_Check, PyList_Check, PyMem_Malloc, PyMem_Free
 
 cdef extern from "datetime.h":
     bint PyDateTime_Check(object o)
@@ -44,6 +45,104 @@
 PyDateTime_IMPORT
 
 cdef extern from "Python.h":
     int PySlice_Check(object)
 
+ctypedef struct Int64ListNode:
+    int64_t value
+    Int64ListNode *next
+
+ctypedef struct Int64List:
+    Int64ListNode *root
+    Int64ListNode *last
+    Py_ssize_t n
+    bint owns
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef Int64List* Int64List_create_array(Py_ssize_t n):
+
+    cdef:
+        Int64List *lst = PyMem_Malloc(n * sizeof(Int64List))
+        Py_ssize_t i
+
+    for i in range(n):
+        lst[i].n = 0
+        lst[i].root = NULL
+        lst[i].last = NULL
+
+    return lst
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n):
+    cdef:
+        Int64ListNode *next
+        Int64ListNode *p
+        Py_ssize_t i
+
+    for i in range(n):
+        if lst[i].owns:
+            p = lst[i].root
+            while p is not NULL:
+                next = p[0].next
+                PyMem_Free(p)
+                p = next
+
+    PyMem_Free(lst)
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void _append(Int64List *lst, int64_t x):
+
+    cdef Int64ListNode *nn = PyMem_Malloc(sizeof(Int64ListNode))
+
+    nn[0].value = x
+    nn[0].next = NULL
+
+    if lst[0].root is NULL:
+        lst[0].root = nn
+        lst[0].owns = 1
+    else:
+        lst[0].last[0].next = nn
+
+    lst[0].last = nn
+    lst[0].n += 1
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void _copy_to(Int64List *dst, Int64List *src) nogil:
+    dst[0].root = src[0].root
+    dst[0].last = src[0].last
+    dst[0].n = src[0].n
+    dst[0].owns = 0
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
+                                     Py_ssize_t *nt):
+    nt[0] = 0
+    cdef:
+        Py_ssize_t last = 0
+        Int64ListNode* node
+
+    for i in range(n):
+        nt[0] += lst[i].n
+
+    cdef int64_t *data = PyMem_Malloc(nt[0] * sizeof(int64_t))
+
+    for i in range(n):
+
+        node = lst[i].root
+        while node is not NULL:
+            data[last] = node[0].value
+            last += 1
+            node = node[0].next
+
+    return data
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -51,7 +150,8 @@ cdef extern from "Python.h":
 cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets,
                                         int64_t[:] idx0,
                                         int64_t[:] idx1,
-                                        list[:] result, list[:] missing):
+                                        Int64List* result,
+                                        Int64List* missing):
     cdef:
         Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0]
@@ -63,26 +163,26 @@ cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets,
         if val0 == val1:
 
             while i < n and values[idx0[i]] == val1:
-                result[idx1[j]].append(idx0[i])
+                _append(&(result[idx1[j]]), idx0[i])
                 i += 1
 
             j += 1
             while j < n_t and val0 == targets[idx1[j]]:
-                result[idx1[j]] = result[idx1[j-1]]
+                _copy_to(&(result[idx1[j]]), &(result[idx1[j-1]]))
                 j += 1
 
         elif val0 > val1:
 
-            result[idx1[j]].append(-1)
-            missing[idx1[j]].append(idx1[j])
+            _append(&(result[idx1[j]]), -1)
+            _append(&(missing[idx1[j]]), idx1[j])
             j += 1
 
         else:
             i += 1
 
     while j < n_t:
-        result[idx1[j]].append(-1)
-        missing[idx1[j]].append(idx1[j])
+        _append(&(result[idx1[j]]), -1)
+        _append(&(missing[idx1[j]]), idx1[j])
         j += 1
 
@@ -420,27 +520,39 @@ cdef class IndexEngine:
 
         cdef:
             ndarray values
             object val0, val1
-            Py_ssize_t i, n_t
+            Py_ssize_t n_t
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
         n_t = len(targets)
 
-        result = np.empty((n_t,), dtype=np.object_)
-        result.fill([])
-        result = np.frompyfunc(list,1,1)(result)
-
-        missing = np.empty((n_t,), dtype=np.object_)
-        missing.fill([])
-        missing = np.frompyfunc(list,1,1)(missing)
+        cdef:
+            Int64List* result = Int64List_create_array(n_t)
+            Int64List* missing = Int64List_create_array(n_t)
 
         _indexer_non_unique_orderable_loop(values, targets, idx0, idx1,
                                            result, missing)
 
-        result = np.concatenate(result)
-        missing = np.asarray(np.concatenate(missing), np.int64)
+        cdef:
+            Py_ssize_t nres, nmis
+            int64_t *cresult
+            int64_t *cmissing
+
+        cresult = Int64List_concat_array(result, n_t, &nres)
+        cmissing = Int64List_concat_array(missing, n_t, &nmis)
 
-        return result, missing
+        Int64List_destroy_array(result, n_t)
+        Int64List_destroy_array(missing, n_t)
+
+        cdef:
+            cnp.npy_intp *dims0 = [nres]
+            cnp.npy_intp *dims1 = [nmis]
+            ndarray npy_result = PyArray_SimpleNewFromData(1, dims0,
+                                                           NPY_INT64, cresult)
+            ndarray npy_missing = PyArray_SimpleNewFromData(1, dims1,
+                                                            NPY_INT64, cmissing)
+
+        return npy_result, npy_missing
 
 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
     cdef:

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index 81ae7ad1c5a0a..ea9bb86dba358 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2506,19 +2506,36 @@ def get_indexer_non_unique(self, target):
         if self.is_all_dates:
             self = Index(self.asi8)
             tgt_values = target.asi8
+            src_values = self.asi8
         else:
             tgt_values = target._values
+            src_values = self._values
 
         try:
-            if self.is_all_dates:
-                idx0 = np.argsort(self.asi8, kind='mergesort')
+            src_values[0] < tgt_values[0]
+            src_values[0] > tgt_values[0]
+        except TypeError:
+            orderable = False
+        else:
+            try:
+                if self.is_monotonic_increasing:
+                    idx0 = np.arange(len(src_values))
+                else:
+                    idx0 = np.argsort(src_values, kind='mergesort')
+
+                if target.is_monotonic_increasing:
+                    idx1 = np.arange(len(tgt_values))
+                else:
+                    idx1 = np.argsort(tgt_values, kind='mergesort')
+
+            except TypeError:
+                orderable = False
             else:
-                idx0 = np.argsort(self._values, kind='mergesort')
+                orderable = True
 
-            idx1 = np.argsort(tgt_values, kind='mergesort')
+        if orderable:
             indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
-
-        except TypeError:
+        else:
             indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
 
         return Index(indexer), missing

From 09dd91bad3e550993c9e3924f8d02dbb4a312b39 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Sun, 12 Feb 2017 21:16:36 +0000
Subject: [PATCH 12/52] no gil

---
 pandas/index.pyx | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 527a71684842f..9c73ae1045b5c 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -1,6 +1,7 @@
 # cython: profile=False
 
 from numpy cimport ndarray
+from libc.stdlib cimport malloc, free
 
 from numpy cimport (float64_t, int32_t, int64_t, uint8_t,
                     NPY_DATETIME, NPY_TIMEDELTA, PyArray_SimpleNewFromData,
                     NPY_INT64)
@@ -58,10 +59,10 @@ ctypedef struct Int64List:
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Int64List* Int64List_create_array(Py_ssize_t n):
+cdef Int64List* Int64List_create_array(Py_ssize_t n) nogil:
 
     cdef:
-        Int64List *lst = PyMem_Malloc(n * sizeof(Int64List))
+        Int64List *lst = malloc(n * sizeof(Int64List))
         Py_ssize_t i
 
     for i in range(n):
@@ -74,7 +75,7 @@ cdef Int64List* Int64List_create_array(Py_ssize_t n):
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n):
+cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n) nogil:
     cdef:
         Int64ListNode *next
         Int64ListNode *p
@@ -85,17 +86,17 @@ cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n):
             p = lst[i].root
             while p is not NULL:
                 next = p[0].next
-                PyMem_Free(p)
+                free(p)
                 p = next
 
-    PyMem_Free(lst)
+    free(lst)
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void _append(Int64List *lst, int64_t x):
+cdef inline void _append(Int64List *lst, int64_t x) nogil:
 
-    cdef Int64ListNode *nn = PyMem_Malloc(sizeof(Int64ListNode))
+    cdef Int64ListNode *nn = malloc(sizeof(Int64ListNode))
 
     nn[0].value = x
     nn[0].next = NULL
@@ -112,7 +113,7 @@ cdef void _append(Int64List *lst, int64_t x):
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void _copy_to(Int64List *dst, Int64List *src) nogil:
+cdef inline void _copy_to(Int64List *dst, Int64List *src) nogil:
     dst[0].root = src[0].root
     dst[0].last = src[0].last
     dst[0].n = src[0].n
@@ -122,7 +123,7 @@ cdef void _copy_to(Int64List *dst, Int64List *src) nogil:
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
-                                     Py_ssize_t *nt):
+                                     Py_ssize_t *nt) nogil:
     nt[0] = 0
     cdef:
         Py_ssize_t last = 0
@@ -131,7 +132,7 @@ cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
     for i in range(n):
         nt[0] += lst[i].n
 
-    cdef int64_t *data = PyMem_Malloc(nt[0] * sizeof(int64_t))
+    cdef int64_t *data = malloc(nt[0] * sizeof(int64_t))
 
     for i in range(n):

From 010393c4cb650b78e3e51af417e7037737e8d3b6 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sun, 12 Feb 2017 21:43:50 -0500
Subject: [PATCH 13/52] ENH: expose Int64VectorData in hashtable.pxd

---
 pandas/hashtable.pxd                     | 14 ++++++++++++++
 pandas/src/hashtable_class_helper.pxi.in | 12 +++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd
index cd06b938310a8..cabfa43a76f26 100644
--- a/pandas/hashtable.pxd
+++ b/pandas/hashtable.pxd
@@ -1,5 +1,6 @@
 from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
                     kh_str_t, uint64_t, int64_t, float64_t)
+from numpy cimport ndarray
 
 # prototypes for sharing
 
@@ -35,3 +36,16 @@ cdef class StringHashTable(HashTable):
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
+
+cdef struct Int64VectorData:
+    int64_t *data
+    size_t n, m
+
+cdef class Int64Vector:
+    cdef Int64VectorData *data
+    cdef ndarray ao
+
+    cdef resize(self)
+    cpdef to_array(self)
+    cdef inline void append(self, int64_t x)
+    cdef extend(self, int64_t[:] x)

diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in
index 74c38dfdb393e..ef385ba7dca1c 100644
--- a/pandas/src/hashtable_class_helper.pxi.in
+++ b/pandas/src/hashtable_class_helper.pxi.in
@@ -24,10 +24,14 @@ dtypes = [('Float64', 'float64', 'float64_t'),
 
 {{for name, dtype, arg in dtypes}}
 
+{{if dtype != 'int64'}}
+
 ctypedef struct {{name}}VectorData:
     {{arg}} *data
     size_t n, m
 
+{{endif}}
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -65,9 +69,11 @@ dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
 
 cdef class {{name}}Vector:
 
+    {{if dtype != 'int64'}}
     cdef:
        {{name}}VectorData *data
        ndarray ao
+    {{endif}}
 
     def __cinit__(self):
         self.data = <{{name}}VectorData *>PyMem_Malloc(
@@ -92,7 +98,7 @@ cdef class {{name}}Vector:
     def __len__(self):
         return self.data.n
 
-    def to_array(self):
+    cpdef to_array(self):
         self.ao.resize(self.data.n)
         self.data.m = self.data.n
         return self.ao
@@ -104,6 +110,10 @@ cdef class {{name}}Vector:
 
         append_data_{{dtype}}(self.data, x)
 
+    cdef extend(self, {{arg}}[:] x):
+        for i in range(len(x)):
+            self.append(x[i])
+
 {{endfor}}
 
 cdef class StringVector:

From d9e75c7e724e5f7449c8c57624ce9395c9ffe11a Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sun, 12 Feb 2017 21:54:11 -0500
Subject: [PATCH 14/52] TST: xfail most test_gbq tests for now

---
 pandas/tests/io/test_gbq.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py
index 0317ebc49ad2c..316afaf306011 100644
--- a/pandas/tests/io/test_gbq.py
+++ b/pandas/tests/io/test_gbq.py
@@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated():
         gbq.generate_bq_schema(df)
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase):
 
     def setUp(self):
@@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self):
         self.assertTrue(isinstance(credentials, GoogleCredentials))
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase):
     def setUp(self):
         _setup_common()
@@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self):
         self.assertTrue(pages is not None)
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase):
     def setUp(self):
         _setup_common()
@@ -363,7 +363,6 @@ def test_should_be_able_to_get_results_from_query(self):
         self.assertTrue(pages is not None)
 
 
-@pytest.mark.single
 class GBQUnitTests(tm.TestCase):
 
     def setUp(self):
@@ -450,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
             private_key=re.sub('[a-z]', '9', _get_private_key_contents()))
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestReadGBQIntegration(tm.TestCase):
 
     @classmethod
@@ -504,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self):
         tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']}))
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase):
 
     @classmethod
@@ -907,7 +906,7 @@ def test_configuration_without_query(self):
                          configuration=config)
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1022,8 +1021,6 @@ def test_upload_data_if_table_exists_append(self):
 
     def test_upload_data_if_table_exists_replace(self):
 
-        pytest.skip("buggy test")
-
         destination_table = DESTINATION_TABLE + "4"
 
         test_size = 10
@@ -1222,7 +1219,7 @@ def test_dataset_does_not_exist(self):
                                                 DATASET_ID + "_not_found"),
                         'Expected dataset not to exist')
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1280,7 +1277,7 @@ def test_upload_data(self):
         self.assertEqual(result['num_rows'][0], test_size)
 
 
-@pytest.mark.single
+@pytest.mark.xfail(run=False, reason="flaky tests")
 class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.

From 2e55efcdf5141f16f623827a3ad5c3f792f8f664 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Mon, 13 Feb 2017 07:27:28 +0000
Subject: [PATCH 15/52] capture index error

---
 pandas/indexes/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index ea9bb86dba358..1ce58e87d2305 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2514,7 +2514,7 @@ def get_indexer_non_unique(self, target):
         try:
             src_values[0] < tgt_values[0]
             src_values[0] > tgt_values[0]
-        except TypeError:
+        except TypeError or IndexError:
             orderable = False
         else:
             try:

From 6916dad1a7219072621e8a9f38b68348de9321b6 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Mon, 13 Feb 2017 08:32:04 +0000
Subject: [PATCH 16/52] wrong exception handling

---
 pandas/indexes/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index 1ce58e87d2305..cd3304daec25b 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2514,7 +2514,7 @@ def get_indexer_non_unique(self, target):
         try:
             src_values[0] < tgt_values[0]
             src_values[0] > tgt_values[0]
-        except TypeError or IndexError:
+        except (TypeError, IndexError):
             orderable = False
         else:
             try:

From 86ca84d8ec79eba5fe31bf0d4cbb24ec78fc333a Mon Sep 17 00:00:00 2001
From: Anthonios Partheniou
Date: Tue, 14 Feb 2017 08:29:18 -0500
Subject: [PATCH 17/52] TST: Fix gbq integration tests. gbq._Dataset.datasets()
 would not return full results

This PR resolves an issue where `gbq._Dataset.datasets()` would not
return all datasets under a Google BigQuery project.
If `'nextPageToken'` is populated, then another `datasets().list()`
request should be sent with `'pageToken'` set to collect more results.

In the past few days, additional datasets were added under the Google
BigQuery project id used by pandas as part of the following github
project: https://github.com/pydata/pandas-gbq . The addition of
datasets caused many gbq unit tests to fail because in function
`clean_gbq_environment()`, we check to see if the dataset exists using
the incomplete results from `gbq._Dataset.datasets()` before we attempt
to delete it.

Author: Anthonios Partheniou

Closes #15381 from parthea/fix-broken-gbq-unit-tests and squashes the following commits:

61bc1e7 [Anthonios Partheniou] TST: Fix gbq tests. gbq.dataset()/gbq.tables would not return full results.

---
 pandas/io/gbq.py            | 67 ++++++++++++++++++++++++------------
 pandas/tests/io/test_gbq.py | 16 ++++-----
 2 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index 169a2b1df9b4c..0ffb6b4bf8c05 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -1056,21 +1056,32 @@ def datasets(self):
             List of datasets under the specific project
         """
 
-        try:
-            list_dataset_response = self.service.datasets().list(
-                projectId=self.project_id).execute().get('datasets', None)
+        dataset_list = []
+        next_page_token = None
+        first_query = True
 
-            if not list_dataset_response:
-                return []
+        while first_query or next_page_token:
+            first_query = False
 
-            dataset_list = list()
+            try:
+                list_dataset_response = self.service.datasets().list(
+                    projectId=self.project_id,
+                    pageToken=next_page_token).execute()
 
-            for row_num, raw_row in enumerate(list_dataset_response):
-                dataset_list.append(raw_row['datasetReference']['datasetId'])
+                dataset_response = list_dataset_response.get('datasets')
+                next_page_token = list_dataset_response.get('nextPageToken')
 
-            return dataset_list
-        except self.http_error as ex:
-            self.process_http_error(ex)
+                if not dataset_response:
+                    return dataset_list
+
+                for row_num, raw_row in enumerate(dataset_response):
+                    dataset_list.append(
+                        raw_row['datasetReference']['datasetId'])
+
+            except self.http_error as ex:
+                self.process_http_error(ex)
+
+        return dataset_list
 
     def create(self, dataset_id):
         """ Create a dataset in Google BigQuery
@@ -1140,19 +1151,29 @@ def tables(self, dataset_id):
             List of tables under the specific dataset
         """
 
-        try:
-            list_table_response = self.service.tables().list(
-                projectId=self.project_id,
-                datasetId=dataset_id).execute().get('tables', None)
+        table_list = []
+        next_page_token = None
+        first_query = True
 
-            if not list_table_response:
-                return []
+        while first_query or next_page_token:
+            first_query = False
 
-            table_list = list()
+            try:
+                list_table_response = self.service.tables().list(
+                    projectId=self.project_id,
+                    datasetId=dataset_id,
+                    pageToken=next_page_token).execute()
 
-            for row_num, raw_row in enumerate(list_table_response):
-                table_list.append(raw_row['tableReference']['tableId'])
+                table_response = list_table_response.get('tables')
+                next_page_token = list_table_response.get('nextPageToken')
 
-            return table_list
-        except self.http_error as ex:
-            self.process_http_error(ex)
+                if not table_response:
+                    return table_list
+
+                for row_num, raw_row in enumerate(table_response):
+                    table_list.append(raw_row['tableReference']['tableId'])
+
+            except self.http_error as ex:
+                self.process_http_error(ex)
+
+        return table_list

diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py
index 316afaf306011..dfbf3ca69b111 100644
--- a/pandas/tests/io/test_gbq.py
+++ b/pandas/tests/io/test_gbq.py
@@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated():
         gbq.generate_bq_schema(df)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase):
 
     def setUp(self):
@@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self):
         self.assertTrue(isinstance(credentials, GoogleCredentials))
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase):
     def setUp(self):
         _setup_common()
@@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self):
         self.assertTrue(pages is not None)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase):
     def setUp(self):
         _setup_common()
@@ -449,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
             private_key=re.sub('[a-z]', '9', _get_private_key_contents()))
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestReadGBQIntegration(tm.TestCase):
 
     @classmethod
@@ -503,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self):
         tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']}))
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase):
 
     @classmethod
@@ -906,7 +906,7 @@ def test_configuration_without_query(self):
                          configuration=config)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1219,7 +1219,7 @@ def test_dataset_does_not_exist(self):
                                                 DATASET_ID + "_not_found"),
                         'Expected dataset not to exist')
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1277,7 +1277,7 @@ def test_upload_data(self):
         self.assertEqual(result['num_rows'][0], test_size)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
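
[Note] The fix in the patch above follows the standard token-based pagination
pattern for Google API list endpoints: keep re-issuing the list() request,
feeding each response's 'nextPageToken' back in as 'pageToken', until no token
is returned. Below is a minimal standalone sketch of that same loop;
`list_all_datasets` is an illustrative name (not part of pandas), and the
`service` argument stands in for the authenticated BigQuery discovery client
that `gbq._Dataset` holds, so this is a sketch under those assumptions rather
than a definitive implementation.

    def list_all_datasets(service, project_id):
        # Accumulate dataset ids across pages; each list() response caps
        # the number of items and returns 'nextPageToken' while more remain.
        dataset_ids = []
        page_token = None
        while True:
            response = service.datasets().list(
                projectId=project_id, pageToken=page_token).execute()
            for raw_row in response.get('datasets', []):
                dataset_ids.append(raw_row['datasetReference']['datasetId'])
            page_token = response.get('nextPageToken')
            if not page_token:  # last page reached
                return dataset_ids

The patch applies the same loop shape to tables(); the only differences are
the extra datasetId parameter and the 'tables' response key.
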
From ff0deecbc8f8e9ae3d274e5e7cd7c0056de1a6c2 Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Tue, 14 Feb 2017 08:33:34 -0500
Subject: [PATCH 18/52] Bug: Raise ValueError with interpolate & fillna limit =
 0 (#9217)

closes #9217

Author: Matt Roeschke

Closes #14994 from mroeschke/fix_9217 and squashes the following commits:

c1790ee [Matt Roeschke] Unify ValueError message and correct cython limits
6f041e6 [Matt Roeschke] Bug: Raise ValueError with interpolate limit = 0

---
 doc/source/whatsnew/v0.20.0.txt       |  1 +
 pandas/core/generic.py                |  6 ++---
 pandas/core/internals.py              |  4 +++
 pandas/core/missing.py                |  8 ++++--
 pandas/src/algos_common_helper.pxi.in | 36 ++++++++++++++++-----------
 pandas/tests/series/test_missing.py   | 18 ++++++++++++++
 6 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index aa620bce0df59..d76e33caffbf1 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -421,6 +421,7 @@ Other API Changes
 - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
 - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
 - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`)
+- ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`)
 - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)
 - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`)
 - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 228dd2acd2124..20e6e027dbf09 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3262,7 +3262,7 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
             a gap with more than this number of consecutive NaNs, it will only
             be partially filled. If method is not specified, this is the
             maximum number of entries along the entire axis where NaNs will be
-            filled.
+            filled. Must be greater than 0 if not None.
         downcast : dict, default is None
             a dict of item->dtype of what to downcast if possible,
             or the string 'infer' which will try to downcast to an appropriate
@@ -3281,6 +3281,7 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
     def fillna(self, value=None, method=None, axis=None, inplace=False,
                limit=None, downcast=None):
         inplace = validate_bool_kwarg(inplace, 'inplace')
+
         if isinstance(value, (list, tuple)):
             raise TypeError('"value" parameter must be a scalar or dict, but '
                             'you passed a "{0}"'.format(type(value).__name__))
@@ -3292,7 +3293,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
             axis = 0
         axis = self._get_axis_number(axis)
         method = missing.clean_fill_method(method)
-
         from pandas import DataFrame
         if value is None:
             if method is None:
@@ -3687,7 +3687,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
             * 0: fill column-by-column
             * 1: fill row-by-row
         limit : int, default None.
-            Maximum number of consecutive NaNs to fill.
+            Maximum number of consecutive NaNs to fill. Must be greater than 0.
         limit_direction : {'forward', 'backward', 'both'}, default 'forward'
             If limit is specified, consecutive NaNs will be filled in this
             direction.

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index f0b1516d786c6..6cd5eceed5f2a 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -372,6 +372,10 @@ def fillna(self, value, limit=None, inplace=False, downcast=None,
         original_value = value
         mask = isnull(self.values)
         if limit is not None:
+            if not is_integer(limit):
+                raise ValueError('Limit must be an integer')
+            if limit < 1:
+                raise ValueError('Limit must be greater than 0')
             if self.ndim > 2:
                 raise NotImplementedError("number of dimensions for 'fillna' "
                                           "is currently limited to 2")

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index e83a0518d97f6..ffd0423572f5e 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -12,7 +12,7 @@
                                 is_float_dtype, is_datetime64_dtype,
                                 is_datetime64tz_dtype, is_integer_dtype,
                                 _ensure_float64, is_scalar,
-                                needs_i8_conversion)
+                                needs_i8_conversion, is_integer)
 from pandas.types.missing import isnull
 
@@ -169,7 +169,11 @@ def _interp_limit(invalid, fw_limit, bw_limit):
     # the beginning (see issues #9218 and #10420)
     violate_limit = sorted(start_nans)
 
-    if limit:
+    if limit is not None:
+        if not is_integer(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         if limit_direction == 'forward':
             violate_limit = sorted(start_nans | set(_interp_limit(invalid,
                                                                   limit, 0)))

diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in
index 5e87528943005..42089f9520ab6 100644
--- a/pandas/src/algos_common_helper.pxi.in
+++ b/pandas/src/algos_common_helper.pxi.in
@@ -83,8 +83,10 @@ def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
     if limit is None:
         lim = nright
     else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
+        if not util.is_integer_object(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         lim = limit
 
     if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
@@ -146,8 +148,10 @@ def pad_inplace_{{name}}(ndarray[{{c_type}}] values,
     if limit is None:
         lim = N
     else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
+        if not util.is_integer_object(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         lim = limit
 
     val = values[0]
@@ -180,8 +184,10 @@ def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
     if limit is None:
         lim = N
     else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
+        if not util.is_integer_object(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         lim = limit
 
     for j in range(K):
@@ -240,8 +246,10 @@ def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
     if limit is None:
         lim = nright
     else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
+        if not util.is_integer_object(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         lim = limit
 
     if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
@@ -304,8 +312,10 @@ def backfill_inplace_{{name}}(ndarray[{{c_type}}] values,
     if limit is None:
         lim = N
     else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
+        if not util.is_integer_object(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         lim = limit
 
     val = values[N - 1]
@@ -338,8 +348,10 @@ def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
     if limit is None:
         lim = N
     else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
+        if not util.is_integer_object(limit):
+            raise ValueError('Limit must be an integer')
+        if limit < 1:
+            raise ValueError('Limit must be greater than 0')
         lim = limit
 
     for j in range(K):

diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index 405d6c98a5d37..23eb6a40f5f1d 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -295,6 +295,13 @@ def test_fillna_raise(self):
         self.assertRaises(TypeError, s.fillna, [1, 2])
         self.assertRaises(TypeError, s.fillna, (1, 2))
 
+        # related GH 9217, make sure limit is an int and greater than 0
+        s = Series([1, 2, 3, None])
+        for limit in [-1, 0, 1., 2.]:
+            for method in ['backfill', 'bfill', 'pad', 'ffill', None]:
+                with tm.assertRaises(ValueError):
+                    s.fillna(1, limit=limit, method=method)
+
     def test_fillna_nat(self):
         series = Series([0, 1, 2, tslib.iNaT], dtype='M8[ns]')
 
@@ -865,6 +872,17 @@ def test_interp_limit(self):
         result = s.interpolate(method='linear', limit=2)
         assert_series_equal(result, expected)
 
+        # GH 9217, make sure limit is an int and greater than 0
+        methods = ['linear', 'time', 'index', 'values', 'nearest', 'zero',
+                   'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh',
+                   'polynomial', 'spline', 'piecewise_polynomial', None,
+                   'from_derivatives', 'pchip', 'akima']
+        s = pd.Series([1, 2, np.nan, np.nan, 5])
+        for limit in [-1, 0, 1., 2.]:
+            for method in methods:
+                with tm.assertRaises(ValueError):
+                    s.interpolate(limit=limit, method=method)
+
     def test_interp_limit_forward(self):
         s = Series([1, 3, np.nan, np.nan, np.nan, 11])

From 5959fe1fffe4b5749de63d6a26ac64349bc791ac Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 14 Feb 2017 17:35:05 -0500
Subject: [PATCH 19/52] CLN: create core/sorting.py

just a small reorg to put sorting / grouping utilities into a separate area

Author: Jeff Reback

Closes #15402 from jreback/sorting and squashes the following commits:

fdcf9a1 [Jeff Reback] change a couple of sorting.py functions to be non-private (public to pandas internals)
90ff22d [Jeff Reback] split up some value_counts groupby tests a bit
18ea902 [Jeff Reback] CLN: create core/sorting.py
92dcb07 [Jeff Reback] CLN: remove numpy_groupby as not used

---
 pandas/core/frame.py                      |  26 +-
 pandas/core/groupby.py                    | 376 +---------------------
 pandas/core/reshape.py                    |  13 +-
 pandas/core/series.py                     |  10 +-
 pandas/core/sorting.py                    | 357 ++++++++++++++++++++
 pandas/indexes/multi.py                   |  12 +-
 pandas/tests/groupby/test_filters.py      |  21 --
 pandas/tests/groupby/test_groupby.py      | 169 ----------
 pandas/tests/groupby/test_misc.py         | 101 ------
 pandas/tests/groupby/test_value_counts.py |  60 ++++
 pandas/tests/test_sorting.py              | 339 +++++++++++++++++++
 pandas/tests/tools/test_merge.py          | 135 +-------
 pandas/tools/merge.py                     |   4 +-
 13 files changed, 802 insertions(+), 821 deletions(-)
 create mode 100644 pandas/core/sorting.py
 delete mode 100644 pandas/tests/groupby/test_misc.py
 create mode 100644 pandas/tests/groupby/test_value_counts.py
 create mode 100644 pandas/tests/test_sorting.py

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index aa03bfb9a54b9..16f8d4658dc20 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3141,7 +3141,7 @@ def duplicated(self, subset=None, keep='first'):
         -------
         duplicated : Series
         """
-        from pandas.core.groupby import get_group_index
+        from pandas.core.sorting import get_group_index
         from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
 
         def f(vals):
@@ -3179,7 +3179,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
                 raise ValueError('Length of ascending (%d) != length of by (%d)' %
                                  (len(ascending), len(by)))
             if len(by) > 1:
-                from pandas.core.groupby import _lexsort_indexer
+                from pandas.core.sorting import lexsort_indexer
 
                 def trans(v):
                     if needs_i8_conversion(v):
@@ -3193,11 +3193,11 @@ def trans(v):
                         raise ValueError('Cannot sort by duplicate column %s'
                                          % str(x))
                     keys.append(trans(k))
-                indexer = _lexsort_indexer(keys, orders=ascending,
-                                           na_position=na_position)
+                indexer = lexsort_indexer(keys, orders=ascending,
+                                          na_position=na_position)
                 indexer = _ensure_platform_int(indexer)
             else:
-                from pandas.core.groupby import _nargsort
+                from pandas.core.sorting import nargsort
 
                 by = by[0]
                 k = self.xs(by, axis=other_axis).values
@@ -3214,8 +3214,8 @@ def trans(v):
                 if isinstance(ascending, (tuple, list)):
                     ascending = ascending[0]
 
-                indexer = _nargsort(k, kind=kind, ascending=ascending,
-                                    na_position=na_position)
+                indexer = nargsort(k, kind=kind, ascending=ascending,
+                                   na_position=na_position)
 
         new_data = self._data.take(indexer,
                                    axis=self._get_block_manager_axis(axis),
@@ -3300,17 +3300,17 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                                                sort_remaining=sort_remaining)
 
         elif isinstance(labels, MultiIndex):
-            from pandas.core.groupby import _lexsort_indexer
+            from pandas.core.sorting import lexsort_indexer
 
             # make sure that the axis is lexsorted to start
             # if not we need to reconstruct to get the correct indexer
             if not labels.is_lexsorted():
                 labels = MultiIndex.from_tuples(labels.values)
 
-            indexer = _lexsort_indexer(labels.labels, orders=ascending,
-                                       na_position=na_position)
+            indexer = lexsort_indexer(labels.labels, orders=ascending,
+                                      na_position=na_position)
         else:
-            from pandas.core.groupby import _nargsort
+            from pandas.core.sorting import nargsort
 
             # GH11080 - Check monotonic-ness before sort an index
             # if monotonic (already sorted), return None or copy() according
@@ -3322,8 +3322,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                 else:
                     return self.copy()
 
-            indexer = _nargsort(labels, kind=kind, ascending=ascending,
-                                na_position=na_position)
+            indexer = nargsort(labels, kind=kind, ascending=ascending,
+                               na_position=na_position)
 
         new_data = self._data.take(indexer,
                                    axis=self._get_block_manager_axis(axis),

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a228861270aea..23c835318b0e6 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -7,7 +7,7 @@
 import copy
 
 from pandas.compat import (
-    zip, range, long, lzip,
+    zip, range, lzip,
     callable, map
 )
 from pandas import compat
@@ -47,6 +47,9 @@
 from pandas.core.internals import BlockManager, make_block
 from pandas.core.series import Series
 from pandas.core.panel import Panel
+from pandas.core.sorting import (get_group_index_sorter, get_group_index,
+                                 compress_group_index, get_flattened_iterator,
+                                 decons_obs_group_ids, get_indexer_dict)
 from pandas.util.decorators import (cache_readonly, Substitution, Appender,
                                     make_signature, deprecate_kwarg)
 from pandas.formats.printing import pprint_thing
@@ -59,7 +62,6 @@
 from pandas.lib import Timestamp
 import pandas.tslib as tslib
 import pandas.algos as _algos
-import pandas.hashtable as _hash
 
 _doc_template = """
 
@@ -729,7 +731,7 @@ def _cumcount_array(self, ascending=True):
         (though the default is sort=True) for groupby in general
         """
         ids, _, ngroups = self.grouper.group_info
-        sorter = _get_group_index_sorter(ids, ngroups)
+        sorter = get_group_index_sorter(ids, ngroups)
         ids, count = ids[sorter], len(ids)
 
         if count == 0:
@@ -1616,9 +1618,12 @@ def _get_group_keys(self):
             return self.levels[0]
         else:
             comp_ids, _, ngroups = self.group_info
+
             # provide "flattened" iterator for multi-group setting
-            mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
-            return [mapper.get_key(i) for i in range(ngroups)]
+            return get_flattened_iterator(comp_ids,
+                                          ngroups,
+                                          self.levels,
+                                          self.labels)
 
     def apply(self, f, data, axis=0):
         mutated = self.mutated
@@ -1662,7 +1667,7 @@ def indices(self):
             label_list = [ping.labels for ping in self.groupings]
             keys = [_values_from_object(ping.group_index)
                     for ping in self.groupings]
-            return _get_indices_dict(label_list, keys)
+            return get_indexer_dict(label_list, keys)
 
     @property
     def labels(self):
@@ -1726,7 +1731,7 @@ def _get_compressed_labels(self):
         if len(all_labels) > 1:
             group_index = get_group_index(all_labels, self.shape,
                                           sort=True, xnull=True)
-            return _compress_group_index(group_index, sort=self.sort)
+            return compress_group_index(group_index, sort=self.sort)
 
         ping = self.groupings[0]
         return ping.labels, np.arange(len(ping.group_index))
@@ -2027,7 +2032,7 @@ def _aggregate_series_fast(self, obj, func):
 
         # avoids object / Series creation overhead
         dummy = obj._get_values(slice(None, 0)).to_dense()
-        indexer = _get_group_index_sorter(group_index, ngroups)
+        indexer = get_group_index_sorter(group_index, ngroups)
         obj = obj.take(indexer, convert=False)
         group_index = algos.take_nd(group_index, indexer, allow_fill=False)
         grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
@@ -2424,7 +2429,6 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
     a BaseGrouper.
 
     """
-
     group_axis = obj._get_axis(axis)
 
     # validate that the passed level is compatible with the passed
@@ -4206,7 +4210,7 @@ def slabels(self):
     @cache_readonly
     def sort_idx(self):
         # Counting sort indexer
-        return _get_group_index_sorter(self.labels, self.ngroups)
+        return get_group_index_sorter(self.labels, self.ngroups)
 
     def __iter__(self):
         sdata = self._get_sorted_data()
@@ -4302,355 +4306,3 @@ def get_splitter(data, *args, **kwargs):
         klass = NDFrameSplitter
 
     return klass(data, *args, **kwargs)
-
-
-# ----------------------------------------------------------------------
-# Misc utilities
-
-
-def get_group_index(labels, shape, sort, xnull):
-    """
-    For the particular label_list, gets the offsets into the hypothetical list
-    representing the totally ordered cartesian product of all possible label
-    combinations, *as long as* this space fits within int64 bounds;
-    otherwise, though group indices identify unique combinations of
-    labels, they cannot be deconstructed.
-      - If `sort`, rank of returned ids preserve lexical ranks of labels.
-        i.e. returned id's can be used to do lexical sort on labels;
-      - If `xnull` nulls (-1 labels) are passed through.
-
-    Parameters
-    ----------
-    labels: sequence of arrays
-        Integers identifying levels at each location
-    shape: sequence of ints same length as labels
-        Number of unique levels at each location
-    sort: boolean
-        If the ranks of returned ids should match lexical ranks of labels
-    xnull: boolean
-        If true nulls are excluded. i.e. -1 values in the labels are
-        passed through
-    Returns
-    -------
-    An array of type int64 where two elements are equal if their corresponding
-    labels are equal at all location.
-    """
-    def _int64_cut_off(shape):
-        acc = long(1)
-        for i, mul in enumerate(shape):
-            acc *= long(mul)
-            if not acc < _INT64_MAX:
-                return i
-        return len(shape)
-
-    def loop(labels, shape):
-        # how many levels can be done without overflow:
-        nlev = _int64_cut_off(shape)
-
-        # compute flat ids for the first `nlev` levels
-        stride = np.prod(shape[1:nlev], dtype='i8')
-        out = stride * labels[0].astype('i8', subok=False, copy=False)
-
-        for i in range(1, nlev):
-            if shape[i] == 0:
-                stride = 0
-            else:
-                stride //= shape[i]
-            out += labels[i] * stride
-
-        if xnull:  # exclude nulls
-            mask = labels[0] == -1
-            for lab in labels[1:nlev]:
-                mask |= lab == -1
-            out[mask] = -1
-
-        if nlev == len(shape):  # all levels done!
-            return out
-
-        # compress what has been done so far in order to avoid overflow
-        # to retain lexical ranks, obs_ids should be sorted
-        comp_ids, obs_ids = _compress_group_index(out, sort=sort)
-
-        labels = [comp_ids] + labels[nlev:]
-        shape = [len(obs_ids)] + shape[nlev:]
-
-        return loop(labels, shape)
-
-    def maybe_lift(lab, size):  # pormote nan values
-        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
-
-    labels = map(_ensure_int64, labels)
-    if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
-
-    return loop(list(labels), list(shape))
-
-
-_INT64_MAX = np.iinfo(np.int64).max
-
-
-def _int64_overflow_possible(shape):
-    the_prod = long(1)
-    for x in shape:
-        the_prod *= long(x)
-
-    return the_prod >= _INT64_MAX
-
-
-def decons_group_index(comp_labels, shape):
-    # reconstruct labels
-    if _int64_overflow_possible(shape):
-        # at some point group indices are factorized,
-        # and may not be deconstructed here! wrong path!
-        raise ValueError('cannot deconstruct factorized group indices!')
-
-    label_list = []
-    factor = 1
-    y = 0
-    x = comp_labels
-    for i in reversed(range(len(shape))):
-        labels = (x - y) % (factor * shape[i]) // factor
-        np.putmask(labels, comp_labels < 0, -1)
-        label_list.append(labels)
-        y = labels * factor
-        factor *= shape[i]
-    return label_list[::-1]
-
-
-def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
-    """
-    reconstruct labels from observed group ids
-
-    Parameters
-    ----------
-    xnull: boolean,
-        if nulls are excluded; i.e. -1 labels are passed through
-    """
-    from pandas.hashtable import unique_label_indices
-
-    if not xnull:
-        lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
-        shape = np.asarray(shape, dtype='i8') + lift
-
-    if not _int64_overflow_possible(shape):
-        # obs ids are deconstructable! take the fast route!
-        out = decons_group_index(obs_ids, shape)
-        return out if xnull or not lift.any() \
-            else [x - y for x, y in zip(out, lift)]
-
-    i = unique_label_indices(comp_ids)
-    i8copy = lambda a: a.astype('i8', subok=False, copy=True)
-    return [i8copy(lab[i]) for lab in labels]
-
-
-def _indexer_from_factorized(labels, shape, compress=True):
-    ids = get_group_index(labels, shape, sort=True, xnull=False)
-
-    if not compress:
-        ngroups = (ids.size and ids.max()) + 1
-    else:
-        ids, obs = _compress_group_index(ids, sort=True)
-        ngroups = len(obs)
-
-    return _get_group_index_sorter(ids, ngroups)
-
-
-def _lexsort_indexer(keys, orders=None, na_position='last'):
-    labels = []
-    shape = []
-    if isinstance(orders, bool):
-        orders = [orders] * len(keys)
-    elif orders is None:
-        orders = [True] * len(keys)
-
-    for key, order in zip(keys, orders):
-
-        # we are already a Categorical
-        if is_categorical_dtype(key):
-            c = key
-
-        # create the Categorical
-        else:
-            c = Categorical(key, ordered=True)
-
-        if na_position not in ['last', 'first']:
-            raise ValueError('invalid na_position: {!r}'.format(na_position))
-
-        n = len(c.categories)
-        codes = c.codes.copy()
-
-        mask = (c.codes == -1)
-        if order:  # ascending
-            if na_position == 'last':
-                codes = np.where(mask, n, codes)
-            elif na_position == 'first':
-                codes += 1
-        else:  # not order means descending
-            if na_position == 'last':
-                codes = np.where(mask, n, n - codes - 1)
-            elif na_position == 'first':
-                codes = np.where(mask, 0, n - codes)
-        if mask.any():
-            n += 1
-
-        shape.append(n)
-        labels.append(codes)
-
-    return _indexer_from_factorized(labels, shape)
-
-
-def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
-    """
-    This is intended to be a drop-in replacement for np.argsort which
-    handles NaNs. It adds ascending and na_position parameters.
-    GH #6399, #5231
-    """
-
-    # specially handle Categorical
-    if is_categorical_dtype(items):
-        return items.argsort(ascending=ascending)
-
-    items = np.asanyarray(items)
-    idx = np.arange(len(items))
-    mask = isnull(items)
-    non_nans = items[~mask]
-    non_nan_idx = idx[~mask]
-    nan_idx = np.nonzero(mask)[0]
-    if not ascending:
-        non_nans = non_nans[::-1]
-        non_nan_idx = non_nan_idx[::-1]
-    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
-    if not ascending:
-        indexer = indexer[::-1]
-    # Finally, place the NaNs at the end or the beginning according to
-    # na_position
-    if na_position == 'last':
-        indexer = np.concatenate([indexer, nan_idx])
-    elif na_position == 'first':
-        indexer = np.concatenate([nan_idx, indexer])
-    else:
-        raise ValueError('invalid na_position: {!r}'.format(na_position))
-    return indexer
-
-
-class _KeyMapper(object):
-
-    """
-    Ease my suffering. Map compressed group id -> key tuple
-    """
-
-    def __init__(self, comp_ids, ngroups, labels, levels):
-        self.levels = levels
-        self.labels = labels
-        self.comp_ids = comp_ids.astype(np.int64)
-
-        self.k = len(labels)
-        self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)]
-
-        self._populate_tables()
-
-    def _populate_tables(self):
-        for labs, table in zip(self.labels, self.tables):
-            table.map(self.comp_ids, labs.astype(np.int64))
-
-    def get_key(self, comp_id):
-        return tuple(level[table.get_item(comp_id)]
-                     for table, level in zip(self.tables, self.levels))
-
-
-def _get_indices_dict(label_list, keys):
-    shape = list(map(len, keys))
-
-    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
-    ngroups = ((group_index.size and group_index.max()) + 1) \
-        if _int64_overflow_possible(shape) \
-        else np.prod(shape, dtype='i8')
-
-    sorter = _get_group_index_sorter(group_index, ngroups)
-
-    sorted_labels = [lab.take(sorter) for lab in label_list]
-    group_index = group_index.take(sorter)
-
-    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
-
-
-# ----------------------------------------------------------------------
-# sorting levels...cleverly?
-
-def _get_group_index_sorter(group_index, ngroups):
-    """
-    _algos.groupsort_indexer implements `counting sort` and it is at least
-    O(ngroups), where
-        ngroups = prod(shape)
-        shape = map(len, keys)
-    that is, linear in the number of combinations (cartesian product) of unique
-    values of groupby keys. This can be huge when doing multi-key groupby.
-    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
-    length of the data-frame;
-    Both algorithms are `stable` sort and that is necessary for correctness of
-    groupby operations. e.g. consider:
-        df.groupby(key)[col].transform('first')
-    """
-    count = len(group_index)
-    alpha = 0.0  # taking complexities literally; there may be
-    beta = 1.0   # some room for fine-tuning these parameters
-    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
-                                   (count * np.log(count))))
-    if do_groupsort:
-        sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
-                                             ngroups)
-        return _ensure_platform_int(sorter)
-    else:
-        return group_index.argsort(kind='mergesort')
-
-
-def _compress_group_index(group_index, sort=True):
-    """
-    Group_index is offsets into cartesian product of all possible labels. This
-    space can be huge, so this function compresses it, by computing offsets
-    (comp_ids) into the list of unique labels (obs_group_ids).
- """ - - size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) - table = _hash.Int64HashTable(size_hint) - - group_index = _ensure_int64(group_index) - - # note, group labels come out ascending (ie, 1,2,3 etc) - comp_ids, obs_group_ids = table.get_labels_groupby(group_index) - - if sort and len(obs_group_ids) > 0: - obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - - return comp_ids, obs_group_ids - - -def _reorder_by_uniques(uniques, labels): - # sorter is index where elements ought to go - sorter = uniques.argsort() - - # reverse_indexer is where elements came from - reverse_indexer = np.empty(len(sorter), dtype=np.int64) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = labels < 0 - - # move labels to right locations (ie, unsort ascending labels) - labels = algos.take_nd(reverse_indexer, labels, allow_fill=False) - np.putmask(labels, mask, -1) - - # sort observed ids - uniques = algos.take_nd(uniques, sorter, allow_fill=False) - - return uniques, labels - - -def numpy_groupby(data, labels, axis=0): - s = np.argsort(labels) - keys, inv = np.unique(labels, return_inverse=True) - i = inv.take(s) - groups_at = np.where(i != np.concatenate(([-1], i[:-1])))[0] - ordered_data = data.take(s, axis=axis) - group_sums = np.add.reduceat(ordered_data, groups_at, axis=axis) - - return group_sums diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index cebaf4e3fd89b..5fc0d590a6885 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -20,7 +20,8 @@ from pandas._sparse import IntIndex from pandas.core.categorical import Categorical, _factorize_from_iterable -from pandas.core.groupby import get_group_index, _compress_group_index +from pandas.core.sorting import (get_group_index, compress_group_index, + decons_obs_group_ids) import pandas.core.algorithms as algos import pandas.algos as _algos @@ -156,7 +157,7 @@ def get_result(self): # filter out missing levels if values.shape[1] > 0: - col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1]) + col_inds, obs_ids = compress_group_index(self.sorted_labels[-1]) # rare case, level values not observed if len(obs_ids) < self.full_shape[1]: inds = (value_mask.sum(0) > 0).nonzero()[0] @@ -245,8 +246,6 @@ def get_new_index(self): def _unstack_multiple(data, clocs): - from pandas.core.groupby import decons_obs_group_ids - if len(clocs) == 0: return data @@ -268,7 +267,7 @@ def _unstack_multiple(data, clocs): shape = [len(x) for x in clevels] group_index = get_group_index(clabels, shape, sort=False, xnull=False) - comp_ids, obs_ids = _compress_group_index(group_index, sort=False) + comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, xnull=False) @@ -459,10 +458,8 @@ def _unstack_frame(obj, level, fill_value=None): def get_compressed_ids(labels, sizes): - from pandas.core.groupby import get_group_index - ids = get_group_index(labels, sizes, sort=True, xnull=False) - return _compress_group_index(ids, sort=True) + return compress_group_index(ids, sort=True) def stack(frame, level=-1, dropna=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index e1eac8f66017e..da47ab5dfb003 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1786,12 +1786,12 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): - from pandas.core.groupby import 
_lexsort_indexer - indexer = _lexsort_indexer(index.labels, orders=ascending) + from pandas.core.sorting import lexsort_indexer + indexer = lexsort_indexer(index.labels, orders=ascending) else: - from pandas.core.groupby import _nargsort - indexer = _nargsort(index, kind=kind, ascending=ascending, - na_position=na_position) + from pandas.core.sorting import nargsort + indexer = nargsort(index, kind=kind, ascending=ascending, + na_position=na_position) indexer = _ensure_platform_int(indexer) new_index = index.take(indexer) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py new file mode 100644 index 0000000000000..71314da7745c0 --- /dev/null +++ b/pandas/core/sorting.py @@ -0,0 +1,357 @@ +""" miscellaneous sorting / groupby utilities """ + +import numpy as np +from pandas.compat import long +from pandas.core.categorical import Categorical +from pandas.types.common import (_ensure_platform_int, + _ensure_int64, + is_categorical_dtype) +from pandas.types.missing import isnull +import pandas.core.algorithms as algos +import pandas.algos as _algos +import pandas.hashtable as _hash +from pandas import lib + + +_INT64_MAX = np.iinfo(np.int64).max + + +def get_group_index(labels, shape, sort, xnull): + """ + For the particular label_list, gets the offsets into the hypothetical list + representing the totally ordered cartesian product of all possible label + combinations, *as long as* this space fits within int64 bounds; + otherwise, though group indices identify unique combinations of + labels, they cannot be deconstructed. + - If `sort`, rank of returned ids preserve lexical ranks of labels. + i.e. returned id's can be used to do lexical sort on labels; + - If `xnull` nulls (-1 labels) are passed through. + + Parameters + ---------- + labels: sequence of arrays + Integers identifying levels at each location + shape: sequence of ints same length as labels + Number of unique levels at each location + sort: boolean + If the ranks of returned ids should match lexical ranks of labels + xnull: boolean + If true nulls are excluded. i.e. -1 values in the labels are + passed through + Returns + ------- + An array of type int64 where two elements are equal if their corresponding + labels are equal at all location. + """ + def _int64_cut_off(shape): + acc = long(1) + for i, mul in enumerate(shape): + acc *= long(mul) + if not acc < _INT64_MAX: + return i + return len(shape) + + def loop(labels, shape): + # how many levels can be done without overflow: + nlev = _int64_cut_off(shape) + + # compute flat ids for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype='i8') + out = stride * labels[0].astype('i8', subok=False, copy=False) + + for i in range(1, nlev): + if shape[i] == 0: + stride = 0 + else: + stride //= shape[i] + out += labels[i] * stride + + if xnull: # exclude nulls + mask = labels[0] == -1 + for lab in labels[1:nlev]: + mask |= lab == -1 + out[mask] = -1 + + if nlev == len(shape): # all levels done! 
+            return out
+
+        # compress what has been done so far in order to avoid overflow
+        # to retain lexical ranks, obs_ids should be sorted
+        comp_ids, obs_ids = compress_group_index(out, sort=sort)
+
+        labels = [comp_ids] + labels[nlev:]
+        shape = [len(obs_ids)] + shape[nlev:]
+
+        return loop(labels, shape)
+
+    def maybe_lift(lab, size):  # pormote nan values
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = map(_ensure_int64, labels)
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+    return loop(list(labels), list(shape))
+
+
+def is_int64_overflow_possible(shape):
+    the_prod = long(1)
+    for x in shape:
+        the_prod *= long(x)
+
+    return the_prod >= _INT64_MAX
+
+
+def decons_group_index(comp_labels, shape):
+    # reconstruct labels
+    if is_int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
+    label_list = []
+    factor = 1
+    y = 0
+    x = comp_labels
+    for i in reversed(range(len(shape))):
+        labels = (x - y) % (factor * shape[i]) // factor
+        np.putmask(labels, comp_labels < 0, -1)
+        label_list.append(labels)
+        y = labels * factor
+        factor *= shape[i]
+    return label_list[::-1]
+
+
+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
+    """
+    reconstruct labels from observed group ids
+
+    Parameters
+    ----------
+    xnull: boolean,
+        if nulls are excluded; i.e. -1 labels are passed through
+    """
+    from pandas.hashtable import unique_label_indices
+
+    if not xnull:
+        lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
+        shape = np.asarray(shape, dtype='i8') + lift
+
+    if not is_int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+ out = decons_group_index(obs_ids, shape) + return out if xnull or not lift.any() \ + else [x - y for x, y in zip(out, lift)] + + i = unique_label_indices(comp_ids) + i8copy = lambda a: a.astype('i8', subok=False, copy=True) + return [i8copy(lab[i]) for lab in labels] + + +def indexer_from_factorized(labels, shape, compress=True): + ids = get_group_index(labels, shape, sort=True, xnull=False) + + if not compress: + ngroups = (ids.size and ids.max()) + 1 + else: + ids, obs = compress_group_index(ids, sort=True) + ngroups = len(obs) + + return get_group_index_sorter(ids, ngroups) + + +def lexsort_indexer(keys, orders=None, na_position='last'): + labels = [] + shape = [] + if isinstance(orders, bool): + orders = [orders] * len(keys) + elif orders is None: + orders = [True] * len(keys) + + for key, order in zip(keys, orders): + + # we are already a Categorical + if is_categorical_dtype(key): + c = key + + # create the Categorical + else: + c = Categorical(key, ordered=True) + + if na_position not in ['last', 'first']: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + n = len(c.categories) + codes = c.codes.copy() + + mask = (c.codes == -1) + if order: # ascending + if na_position == 'last': + codes = np.where(mask, n, codes) + elif na_position == 'first': + codes += 1 + else: # not order means descending + if na_position == 'last': + codes = np.where(mask, n, n - codes - 1) + elif na_position == 'first': + codes = np.where(mask, 0, n - codes) + if mask.any(): + n += 1 + + shape.append(n) + labels.append(codes) + + return indexer_from_factorized(labels, shape) + + +def nargsort(items, kind='quicksort', ascending=True, na_position='last'): + """ + This is intended to be a drop-in replacement for np.argsort which + handles NaNs. It adds ascending and na_position parameters. + GH #6399, #5231 + """ + + # specially handle Categorical + if is_categorical_dtype(items): + return items.argsort(ascending=ascending) + + items = np.asanyarray(items) + idx = np.arange(len(items)) + mask = isnull(items) + non_nans = items[~mask] + non_nan_idx = idx[~mask] + nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + if not ascending: + indexer = indexer[::-1] + # Finally, place the NaNs at the end or the beginning according to + # na_position + if na_position == 'last': + indexer = np.concatenate([indexer, nan_idx]) + elif na_position == 'first': + indexer = np.concatenate([nan_idx, indexer]) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + return indexer + + +class _KeyMapper(object): + + """ + Ease my suffering. 
Map compressed group id -> key tuple
+    """
+
+    def __init__(self, comp_ids, ngroups, levels, labels):
+        self.levels = levels
+        self.labels = labels
+        self.comp_ids = comp_ids.astype(np.int64)
+
+        self.k = len(labels)
+        self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)]
+
+        self._populate_tables()
+
+    def _populate_tables(self):
+        for labs, table in zip(self.labels, self.tables):
+            table.map(self.comp_ids, labs.astype(np.int64))
+
+    def get_key(self, comp_id):
+        return tuple(level[table.get_item(comp_id)]
+                     for table, level in zip(self.tables, self.levels))
+
+
+def get_flattened_iterator(comp_ids, ngroups, levels, labels):
+    # provide "flattened" iterator for multi-group setting
+    mapper = _KeyMapper(comp_ids, ngroups, levels, labels)
+    return [mapper.get_key(i) for i in range(ngroups)]
+
+
+def get_indexer_dict(label_list, keys):
+    """ return a dict of {labels} -> {indexers} """
+    shape = list(map(len, keys))
+
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if is_int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')
+
+    sorter = get_group_index_sorter(group_index, ngroups)
+
+    sorted_labels = [lab.take(sorter) for lab in label_list]
+    group_index = group_index.take(sorter)
+
+    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
+
+
+# ----------------------------------------------------------------------
+# sorting levels...cleverly?
+
+def get_group_index_sorter(group_index, ngroups):
+    """
+    _algos.groupsort_indexer implements `counting sort` and it is at least
+    O(ngroups), where
+        ngroups = prod(shape)
+        shape = map(len, keys)
+    that is, linear in the number of combinations (cartesian product) of unique
+    values of groupby keys. This can be huge when doing multi-key groupby.
+    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
+    length of the data-frame;
+    Both algorithms are `stable` sorts, and that is necessary for correctness of
+    groupby operations. e.g. consider:
+        df.groupby(key)[col].transform('first')
+    """
+    count = len(group_index)
+    alpha = 0.0  # taking complexities literally; there may be
+    beta = 1.0   # some room for fine-tuning these parameters
+    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
+                                   (count * np.log(count))))
+    if do_groupsort:
+        sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
+                                             ngroups)
+        return _ensure_platform_int(sorter)
+    else:
+        return group_index.argsort(kind='mergesort')
+
+
+def compress_group_index(group_index, sort=True):
+    """
+    Group_index is offsets into cartesian product of all possible labels. This
+    space can be huge, so this function compresses it, by computing offsets
+    (comp_ids) into the list of unique labels (obs_group_ids).
+ """ + + size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) + table = _hash.Int64HashTable(size_hint) + + group_index = _ensure_int64(group_index) + + # note, group labels come out ascending (ie, 1,2,3 etc) + comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + + if sort and len(obs_group_ids) > 0: + obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) + + return comp_ids, obs_group_ids + + +def _reorder_by_uniques(uniques, labels): + # sorter is index where elements ought to go + sorter = uniques.argsort() + + # reverse_indexer is where elements came from + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + + # move labels to right locations (ie, unsort ascending labels) + labels = algos.take_nd(reverse_indexer, labels, allow_fill=False) + np.putmask(labels, mask, -1) + + # sort observed ids + uniques = algos.take_nd(uniques, sorter, allow_fill=False) + + return uniques, labels diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 9ab07d87fd13b..653ba1fee5691 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -663,7 +663,7 @@ def is_unique(self): False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) def duplicated(self, keep='first'): - from pandas.core.groupby import get_group_index + from pandas.core.sorting import get_group_index from pandas.hashtable import duplicated_int64 shape = map(len, self.levels) @@ -1405,7 +1405,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Indices of output values in original index """ - from pandas.core.groupby import _indexer_from_factorized + from pandas.core.sorting import indexer_from_factorized if isinstance(level, (compat.string_types, int)): level = [level] @@ -1417,8 +1417,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): if not len(level) == len(ascending): raise ValueError("level must have same length as ascending") - from pandas.core.groupby import _lexsort_indexer - indexer = _lexsort_indexer(self.labels, orders=ascending) + from pandas.core.sorting import lexsort_indexer + indexer = lexsort_indexer(self.labels, orders=ascending) # level ordering else: @@ -1436,8 +1436,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): else: sortorder = level[0] - indexer = _indexer_from_factorized(primary, primshp, - compress=False) + indexer = indexer_from_factorized(primary, primshp, + compress=False) if not ascending: indexer = indexer[::-1] diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 1640858802047..46ddb5a5318fb 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -616,24 +616,3 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): expected = f(df.groupby(tups)[field]) for k, v in compat.iteritems(expected): assert (result[k] == v) - - -def test_decons(): - from pandas.core.groupby import decons_group_index, get_group_index - - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) - - for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) - - shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] - testit(label_list, shape) - - shape = (10000, 10000) - label_list = 
[np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] - testit(label_list, shape) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d625fa07d932c..3a6a9eaaa8e72 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1510,59 +1510,6 @@ def check_nunique(df, keys, as_index=True): check_nunique(frame, ['jim'], as_index=False) check_nunique(frame, ['jim', 'joe'], as_index=False) - def test_series_groupby_value_counts(self): - from itertools import product - np.random.seed(1234) - - def rebuild_index(df): - arr = list(map(df.index.get_level_values, range(df.index.nlevels))) - df.index = MultiIndex.from_arrays(arr, names=df.index.names) - return df - - def check_value_counts(df, keys, bins): - for isort, normalize, sort, ascending, dropna \ - in product((False, True), repeat=5): - - kwargs = dict(normalize=normalize, sort=sort, - ascending=ascending, dropna=dropna, bins=bins) - - gr = df.groupby(keys, sort=isort) - left = gr['3rd'].value_counts(**kwargs) - - gr = df.groupby(keys, sort=isort) - right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] - - # have to sort on index because of unstable sort on values - left, right = map(rebuild_index, (left, right)) # xref GH9212 - assert_series_equal(left.sort_index(), right.sort_index()) - - def loop(df): - bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) - keys = '1st', '2nd', ('1st', '2nd') - for k, b in product(keys, bins): - check_value_counts(df, k, b) - - days = date_range('2015-08-24', periods=10) - - for n, m in product((100, 1000), (5, 20)): - frame = DataFrame({ - '1st': np.random.choice( - list('abcd'), n), - '2nd': np.random.choice(days, n), - '3rd': np.random.randint(1, m + 1, n) - }) - - loop(frame) - - frame.loc[1::11, '1st'] = nan - frame.loc[3::17, '2nd'] = nan - frame.loc[7::19, '3rd'] = nan - frame.loc[8::19, '3rd'] = nan - frame.loc[9::19, '3rd'] = nan - - loop(frame) - def test_multiindex_passthru(self): # GH 7997 @@ -3071,22 +3018,6 @@ def test_panel_groupby(self): agged = grouped.mean() self.assert_index_equal(agged.minor_axis, Index([0, 1])) - def test_numpy_groupby(self): - from pandas.core.groupby import numpy_groupby - - data = np.random.randn(100, 100) - labels = np.random.randint(0, 10, size=100) - - df = DataFrame(data) - - result = df.groupby(labels).sum().values - expected = numpy_groupby(data, labels) - assert_almost_equal(result, expected) - - result = df.groupby(labels, axis=1).sum().values - expected = numpy_groupby(data, labels, axis=1) - assert_almost_equal(result, expected) - def test_groupby_2d_malformed(self): d = DataFrame(index=lrange(2)) d['group'] = ['g1', 'g2'] @@ -3112,85 +3043,6 @@ def test_int32_overflow(self): right = df.groupby(['D', 'C', 'B', 'A']).sum() self.assertEqual(len(left), len(right)) - def test_int64_overflow(self): - from pandas.core.groupby import _int64_overflow_possible - - B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) - A = np.arange(2500) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': A, - 'F': B, - 'G': A, - 'H': B, - 'values': np.random.randn(2500)}) - - lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) - rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) - - left = lg.sum()['values'] - right = rg.sum()['values'] - - exp_index, _ = left.index.sortlevel() - self.assert_index_equal(left.index, exp_index) - - exp_index, _ = right.index.sortlevel(0) - 
self.assert_index_equal(right.index, exp_index) - - tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' - ]].values)) - tups = com._asarray_tuplesafe(tups) - - expected = df.groupby(tups).sum()['values'] - - for k, v in compat.iteritems(expected): - self.assertEqual(left[k], right[k[::-1]]) - self.assertEqual(left[k], v) - self.assertEqual(len(left), len(right)) - - # GH9096 - values = range(55109) - data = pd.DataFrame.from_dict({'a': values, - 'b': values, - 'c': values, - 'd': values}) - grouped = data.groupby(['a', 'b', 'c', 'd']) - self.assertEqual(len(grouped), len(values)) - - arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) - i = np.random.choice(len(arr), len(arr) * 4) - arr = np.vstack((arr, arr[i])) # add sume duplicate rows - - i = np.random.permutation(len(arr)) - arr = arr[i] # shuffle rows - - df = DataFrame(arr, columns=list('abcde')) - df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 - gr = df.groupby(list('abcde')) - - # verify this is testing what it is supposed to test! - self.assertTrue(_int64_overflow_possible(gr.grouper.shape)) - - # mannually compute groupings - jim, joe = defaultdict(list), defaultdict(list) - for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']): - jim[key].append(a) - joe[key].append(b) - - self.assertEqual(len(gr), len(jim)) - mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde')) - - def aggr(func): - f = lambda a: np.fromiter(map(func, a), dtype='f8') - arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=['jim', 'joe'], index=mi) - return res.sort_index() - - assert_frame_equal(gr.mean(), aggr(np.mean)) - assert_frame_equal(gr.median(), aggr(np.median)) - def test_groupby_sort_multi(self): df = DataFrame({'a': ['foo', 'bar', 'baz'], 'b': [3, 2, 1], @@ -4451,24 +4303,3 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): expected = f(df.groupby(tups)[field]) for k, v in compat.iteritems(expected): assert (result[k] == v) - - -def test_decons(): - from pandas.core.groupby import decons_group_index, get_group_index - - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) - - for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) - - shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] - testit(label_list, shape) - - shape = (10000, 10000) - label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] - testit(label_list, shape) diff --git a/pandas/tests/groupby/test_misc.py b/pandas/tests/groupby/test_misc.py deleted file mode 100644 index 9395304385681..0000000000000 --- a/pandas/tests/groupby/test_misc.py +++ /dev/null @@ -1,101 +0,0 @@ -""" misc non-groupby routines, as they are defined in core/groupby.py """ - -import pytest -import numpy as np -from numpy import nan -from pandas.util import testing as tm -from pandas.core.groupby import _nargsort, _lexsort_indexer - - -class TestSorting(tm.TestCase): - - def test_lexsort_indexer(self): - keys = [[nan] * 5 + list(range(100)) + [nan] * 5] - # orders=True, na_position='last' - result = _lexsort_indexer(keys, orders=True, na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=True, na_position='first' - result = _lexsort_indexer(keys, orders=True, 
na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='last' - result = _lexsort_indexer(keys, orders=False, na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='first' - result = _lexsort_indexer(keys, orders=False, na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - def test_nargsort(self): - # np.argsort(items) places NaNs last - items = [nan] * 5 + list(range(100)) + [nan] * 5 - # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype='O') - - try: - # GH 2785; due to a regression in NumPy1.6.2 - np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) - np.argsort(items2, kind='mergesort') - except TypeError: - pytest.skip('requested sort not available for type') - - # mergesort is the most difficult to get right because we want it to be - # stable. - - # According to numpy/core/tests/test_multiarray, """The number of - # sorted items must be greater than ~50 to check the actual algorithm - # because quick and merge sort fall over to insertion sort for small - # arrays.""" - - # mergesort, ascending=True, na_position='last' - result = _nargsort(items, kind='mergesort', ascending=True, - na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = _nargsort(items, kind='mergesort', ascending=True, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = _nargsort(items, kind='mergesort', ascending=False, - na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = _nargsort(items, kind='mergesort', ascending=False, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='last' - result = _nargsort(items2, kind='mergesort', ascending=True, - na_position='last') - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = _nargsort(items2, kind='mergesort', ascending=True, - na_position='first') - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = _nargsort(items2, kind='mergesort', ascending=False, - na_position='last') - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = _nargsort(items2, kind='mergesort', ascending=False, - na_position='first') - exp = list(range(5)) + list(range(105, 
110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py new file mode 100644 index 0000000000000..801d0da070112 --- /dev/null +++ b/pandas/tests/groupby/test_value_counts.py @@ -0,0 +1,60 @@ +import pytest + +from itertools import product +import numpy as np + +from pandas.util import testing as tm +from pandas import MultiIndex, DataFrame, Series, date_range + + +@pytest.mark.parametrize("n,m", product((100, 1000), (5, 20))) +def test_series_groupby_value_counts(n, m): + np.random.seed(1234) + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + def check_value_counts(df, keys, bins): + for isort, normalize, sort, ascending, dropna \ + in product((False, True), repeat=5): + + kwargs = dict(normalize=normalize, sort=sort, + ascending=ascending, dropna=dropna, bins=bins) + + gr = df.groupby(keys, sort=isort) + left = gr['3rd'].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr['3rd'].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ['3rd'] + + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) + + def loop(df): + bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) + keys = '1st', '2nd', ('1st', '2nd') + for k, b in product(keys, bins): + check_value_counts(df, k, b) + + days = date_range('2015-08-24', periods=10) + + frame = DataFrame({ + '1st': np.random.choice( + list('abcd'), n), + '2nd': np.random.choice(days, n), + '3rd': np.random.randint(1, m + 1, n) + }) + + loop(frame) + + frame.loc[1::11, '1st'] = np.nan + frame.loc[3::17, '2nd'] = np.nan + frame.loc[7::19, '3rd'] = np.nan + frame.loc[8::19, '3rd'] = np.nan + frame.loc[9::19, '3rd'] = np.nan + + loop(frame) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py new file mode 100644 index 0000000000000..99361695b2371 --- /dev/null +++ b/pandas/tests/test_sorting.py @@ -0,0 +1,339 @@ +import pytest +from itertools import product +from collections import defaultdict + +import numpy as np +from numpy import nan +import pandas as pd +from pandas.core import common as com +from pandas import DataFrame, MultiIndex, merge, concat, Series, compat +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.core.sorting import (is_int64_overflow_possible, + decons_group_index, + get_group_index, + nargsort, + lexsort_indexer) + + +class TestSorting(tm.TestCase): + + def test_int64_overflow(self): + + B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) + A = np.arange(2500) + df = DataFrame({'A': A, + 'B': B, + 'C': A, + 'D': B, + 'E': A, + 'F': B, + 'G': A, + 'H': B, + 'values': np.random.randn(2500)}) + + lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) + rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) + + left = lg.sum()['values'] + right = rg.sum()['values'] + + exp_index, _ = left.index.sortlevel() + self.assert_index_equal(left.index, exp_index) + + exp_index, _ = right.index.sortlevel(0) + self.assert_index_equal(right.index, exp_index) + + tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' + ]].values)) + tups = com._asarray_tuplesafe(tups) + 
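For reference (illustration, not patch content): the overflow predicate this
test asserts on below is nothing more than a product check against the int64
ceiling, sketched here with the assumption that _INT64_MAX is
np.iinfo(np.int64).max:

    import numpy as np

    _INT64_MAX = np.iinfo(np.int64).max

    def overflow_possible(shape):
        # the flattened group index needs prod(shape) distinct values; if
        # that exceeds the int64 range the fast encoding cannot be used
        prod = 1
        for n in shape:
            prod *= int(n)
        return prod >= _INT64_MAX

    assert not overflow_possible((1000, 1000))    # 10**6 fits easily
    assert overflow_possible((2 ** 32, 2 ** 32))  # 2**64 cannot fit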
+        expected = df.groupby(tups).sum()['values']
+
+        for k, v in compat.iteritems(expected):
+            self.assertEqual(left[k], right[k[::-1]])
+            self.assertEqual(left[k], v)
+        self.assertEqual(len(left), len(right))
+
+        # GH9096
+        values = range(55109)
+        data = pd.DataFrame.from_dict({'a': values,
+                                       'b': values,
+                                       'c': values,
+                                       'd': values})
+        grouped = data.groupby(['a', 'b', 'c', 'd'])
+        self.assertEqual(len(grouped), len(values))
+
+        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
+        i = np.random.choice(len(arr), len(arr) * 4)
+        arr = np.vstack((arr, arr[i]))  # add some duplicate rows
+
+        i = np.random.permutation(len(arr))
+        arr = arr[i]  # shuffle rows
+
+        df = DataFrame(arr, columns=list('abcde'))
+        df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
+        gr = df.groupby(list('abcde'))
+
+        # verify this is testing what it is supposed to test!
+        self.assertTrue(is_int64_overflow_possible(gr.grouper.shape))
+
+        # manually compute groupings
+        jim, joe = defaultdict(list), defaultdict(list)
+        for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
+            jim[key].append(a)
+            joe[key].append(b)
+
+        self.assertEqual(len(gr), len(jim))
+        mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))
+
+        def aggr(func):
+            f = lambda a: np.fromiter(map(func, a), dtype='f8')
+            arr = np.vstack((f(jim.values()), f(joe.values()))).T
+            res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
+            return res.sort_index()
+
+        assert_frame_equal(gr.mean(), aggr(np.mean))
+        assert_frame_equal(gr.median(), aggr(np.median))
+
+    def test_lexsort_indexer(self):
+        keys = [[nan] * 5 + list(range(100)) + [nan] * 5]
+        # orders=True, na_position='last'
+        result = lexsort_indexer(keys, orders=True, na_position='last')
+        exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+        # orders=True, na_position='first'
+        result = lexsort_indexer(keys, orders=True, na_position='first')
+        exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+        # orders=False, na_position='last'
+        result = lexsort_indexer(keys, orders=False, na_position='last')
+        exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+        # orders=False, na_position='first'
+        result = lexsort_indexer(keys, orders=False, na_position='first')
+        exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+    def test_nargsort(self):
+        # np.argsort(items) places NaNs last
+        items = [nan] * 5 + list(range(100)) + [nan] * 5
+        # np.argsort(items2) may not place NaNs first
+        items2 = np.array(items, dtype='O')
+
+        try:
+            # GH 2785; due to a regression in NumPy 1.6.2
+            np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i'))
+            np.argsort(items2, kind='mergesort')
+        except TypeError:
+            pytest.skip('requested sort not available for type')
+
+        # mergesort is the most difficult to get right because we want it to be
+        # stable.
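For orientation (sketch only, not from the diff): the nargsort contract
exercised below is "argsort the non-NaN values stably, then splice the NaN
positions onto whichever end na_position names". A rough pure-NumPy
equivalent of the ascending case:

    import numpy as np

    def nan_argsort(items, na_position='last'):
        items = np.asarray(items, dtype='float64')
        idx = np.arange(len(items))
        mask = np.isnan(items)
        # stable sort of the non-NaN values only
        keep = idx[~mask][np.argsort(items[~mask], kind='mergesort')]
        nans = idx[mask]
        if na_position == 'last':
            return np.concatenate([keep, nans])
        return np.concatenate([nans, keep])

    order = nan_argsort([np.nan, 3.0, 1.0, np.nan], na_position='first')
    assert order.tolist() == [0, 3, 2, 1]  # NaN slots first, then 1.0, 3.0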
+ + # According to numpy/core/tests/test_multiarray, """The number of + # sorted items must be greater than ~50 to check the actual algorithm + # because quick and merge sort fall over to insertion sort for small + # arrays.""" + + # mergesort, ascending=True, na_position='last' + result = nargsort(items, kind='mergesort', ascending=True, + na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = nargsort(items, kind='mergesort', ascending=True, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = nargsort(items, kind='mergesort', ascending=False, + na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = nargsort(items, kind='mergesort', ascending=False, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='last' + result = nargsort(items2, kind='mergesort', ascending=True, + na_position='last') + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = nargsort(items2, kind='mergesort', ascending=True, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = nargsort(items2, kind='mergesort', ascending=False, + na_position='last') + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = nargsort(items2, kind='mergesort', ascending=False, + na_position='first') + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + +class TestMerge(tm.TestCase): + + @pytest.mark.slow + def test_int64_overflow_issues(self): + + # #2690, combinatorial explosion + df1 = DataFrame(np.random.randn(1000, 7), + columns=list('ABCDEF') + ['G1']) + df2 = DataFrame(np.random.randn(1000, 7), + columns=list('ABCDEF') + ['G2']) + + # it works! 
+ result = merge(df1, df2, how='outer') + self.assertTrue(len(result) == 2000) + + low, high, n = -1 << 10, 1 << 10, 1 << 20 + left = DataFrame(np.random.randint(low, high, (n, 7)), + columns=list('ABCDEFG')) + left['left'] = left.sum(axis=1) + + # one-2-one match + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + right.columns = right.columns[:-1].tolist() + ['right'] + right.index = np.arange(len(right)) + right['right'] *= -1 + + out = merge(left, right, how='outer') + self.assertEqual(len(out), len(left)) + assert_series_equal(out['left'], - out['right'], check_names=False) + result = out.iloc[:, :-2].sum(axis=1) + assert_series_equal(out['left'], result, check_names=False) + self.assertTrue(result.name is None) + + out.sort_values(out.columns.tolist(), inplace=True) + out.index = np.arange(len(out)) + for how in ['left', 'right', 'outer', 'inner']: + assert_frame_equal(out, merge(left, right, how=how, sort=True)) + + # check that left merge w/ sort=False maintains left frame order + out = merge(left, right, how='left', sort=False) + assert_frame_equal(left, out[left.columns.tolist()]) + + out = merge(right, left, how='left', sort=False) + assert_frame_equal(right, out[right.columns.tolist()]) + + # one-2-many/none match + n = 1 << 11 + left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), + columns=list('ABCDEFG')) + + # confirm that this is checking what it is supposed to check + shape = left.apply(Series.nunique).values + self.assertTrue(is_int64_overflow_possible(shape)) + + # add duplicates to left frame + left = concat([left, left], ignore_index=True) + + right = DataFrame(np.random.randint(low, high, (n // 2, 7)) + .astype('int64'), + columns=list('ABCDEFG')) + + # add duplicates & overlap with left to the right frame + i = np.random.choice(len(left), n) + right = concat([right, right, left.iloc[i]], ignore_index=True) + + left['left'] = np.random.randn(len(left)) + right['right'] = np.random.randn(len(right)) + + # shuffle left & right frames + i = np.random.permutation(len(left)) + left = left.iloc[i].copy() + left.index = np.arange(len(left)) + + i = np.random.permutation(len(right)) + right = right.iloc[i].copy() + right.index = np.arange(len(right)) + + # manually compute outer merge + ldict, rdict = defaultdict(list), defaultdict(list) + + for idx, row in left.set_index(list('ABCDEFG')).iterrows(): + ldict[idx].append(row['left']) + + for idx, row in right.set_index(list('ABCDEFG')).iterrows(): + rdict[idx].append(row['right']) + + vals = [] + for k, lval in ldict.items(): + rval = rdict.get(k, [np.nan]) + for lv, rv in product(lval, rval): + vals.append(k + tuple([lv, rv])) + + for k, rval in rdict.items(): + if k not in ldict: + for rv in rval: + vals.append(k + tuple([np.nan, rv])) + + def align(df): + df = df.sort_values(df.columns.tolist()) + df.index = np.arange(len(df)) + return df + + def verify_order(df): + kcols = list('ABCDEFG') + assert_frame_equal(df[kcols].copy(), + df[kcols].sort_values(kcols, kind='mergesort')) + + out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) + out = align(out) + + jmask = {'left': out['left'].notnull(), + 'right': out['right'].notnull(), + 'inner': out['left'].notnull() & out['right'].notnull(), + 'outer': np.ones(len(out), dtype='bool')} + + for how in 'left', 'right', 'outer', 'inner': + mask = jmask[how] + frame = align(out[mask].copy()) + self.assertTrue(mask.all() ^ mask.any() or how == 'outer') + + for sort in [False, True]: + res = merge(left, right, how=how, sort=sort) + if 
sort: + verify_order(res) + + # as in GH9092 dtypes break with outer/right join + assert_frame_equal(frame, align(res), + check_dtype=how not in ('right', 'outer')) + + +def test_decons(): + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index d66cd793ec0be..472d8674f9f8d 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -10,9 +10,7 @@ from pandas.compat import lrange, lzip from pandas.tools.concat import concat from pandas.tools.merge import merge, MergeError -from pandas.util.testing import (assert_frame_equal, - assert_series_equal, - slow) +from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm @@ -1092,137 +1090,6 @@ def test_merge_na_keys(self): tm.assert_frame_equal(result, expected) - @slow - def test_int64_overflow_issues(self): - from itertools import product - from collections import defaultdict - from pandas.core.groupby import _int64_overflow_possible - - # #2690, combinatorial explosion - df1 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G1']) - df2 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G2']) - - # it works! 
- result = merge(df1, df2, how='outer') - self.assertTrue(len(result) == 2000) - - low, high, n = -1 << 10, 1 << 10, 1 << 20 - left = DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) - left['left'] = left.sum(axis=1) - - # one-2-one match - i = np.random.permutation(len(left)) - right = left.iloc[i].copy() - right.columns = right.columns[:-1].tolist() + ['right'] - right.index = np.arange(len(right)) - right['right'] *= -1 - - out = merge(left, right, how='outer') - self.assertEqual(len(out), len(left)) - assert_series_equal(out['left'], - out['right'], check_names=False) - result = out.iloc[:, :-2].sum(axis=1) - assert_series_equal(out['left'], result, check_names=False) - self.assertTrue(result.name is None) - - out.sort_values(out.columns.tolist(), inplace=True) - out.index = np.arange(len(out)) - for how in ['left', 'right', 'outer', 'inner']: - assert_frame_equal(out, merge(left, right, how=how, sort=True)) - - # check that left merge w/ sort=False maintains left frame order - out = merge(left, right, how='left', sort=False) - assert_frame_equal(left, out[left.columns.tolist()]) - - out = merge(right, left, how='left', sort=False) - assert_frame_equal(right, out[right.columns.tolist()]) - - # one-2-many/none match - n = 1 << 11 - left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), - columns=list('ABCDEFG')) - - # confirm that this is checking what it is supposed to check - shape = left.apply(Series.nunique).values - self.assertTrue(_int64_overflow_possible(shape)) - - # add duplicates to left frame - left = concat([left, left], ignore_index=True) - - right = DataFrame(np.random.randint(low, high, (n // 2, 7)) - .astype('int64'), - columns=list('ABCDEFG')) - - # add duplicates & overlap with left to the right frame - i = np.random.choice(len(left), n) - right = concat([right, right, left.iloc[i]], ignore_index=True) - - left['left'] = np.random.randn(len(left)) - right['right'] = np.random.randn(len(right)) - - # shuffle left & right frames - i = np.random.permutation(len(left)) - left = left.iloc[i].copy() - left.index = np.arange(len(left)) - - i = np.random.permutation(len(right)) - right = right.iloc[i].copy() - right.index = np.arange(len(right)) - - # manually compute outer merge - ldict, rdict = defaultdict(list), defaultdict(list) - - for idx, row in left.set_index(list('ABCDEFG')).iterrows(): - ldict[idx].append(row['left']) - - for idx, row in right.set_index(list('ABCDEFG')).iterrows(): - rdict[idx].append(row['right']) - - vals = [] - for k, lval in ldict.items(): - rval = rdict.get(k, [np.nan]) - for lv, rv in product(lval, rval): - vals.append(k + tuple([lv, rv])) - - for k, rval in rdict.items(): - if k not in ldict: - for rv in rval: - vals.append(k + tuple([np.nan, rv])) - - def align(df): - df = df.sort_values(df.columns.tolist()) - df.index = np.arange(len(df)) - return df - - def verify_order(df): - kcols = list('ABCDEFG') - assert_frame_equal(df[kcols].copy(), - df[kcols].sort_values(kcols, kind='mergesort')) - - out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) - out = align(out) - - jmask = {'left': out['left'].notnull(), - 'right': out['right'].notnull(), - 'inner': out['left'].notnull() & out['right'].notnull(), - 'outer': np.ones(len(out), dtype='bool')} - - for how in 'left', 'right', 'outer', 'inner': - mask = jmask[how] - frame = align(out[mask].copy()) - self.assertTrue(mask.all() ^ mask.any() or how == 'outer') - - for sort in [False, True]: - res = merge(left, right, how=how, sort=sort) - if 
sort: - verify_order(res) - - # as in GH9092 dtypes break with outer/right join - assert_frame_equal(frame, align(res), - check_dtype=how not in ('right', 'outer')) - def test_join_multi_levels(self): # GH 3662 diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d938c2eeacbef..e82e702cb6e55 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -34,6 +34,7 @@ concatenate_block_managers) from pandas.util.decorators import Appender, Substitution +from pandas.core.sorting import is_int64_overflow_possible import pandas.core.algorithms as algos import pandas.core.common as com @@ -1397,10 +1398,9 @@ def _sort_labels(uniques, left, right): def _get_join_keys(llab, rlab, shape, sort): - from pandas.core.groupby import _int64_overflow_possible # how many levels can be done without overflow - pred = lambda i: not _int64_overflow_possible(shape[:i]) + pred = lambda i: not is_int64_overflow_possible(shape[:i]) nlev = next(filter(pred, range(len(shape), 0, -1))) # get keys for the first `nlev` levels From 4b97db4caa94690691316df6303092f4954e7e6f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 19:57:51 -0500 Subject: [PATCH 20/52] TST: disable gbq tests again --- pandas/tests/io/test_gbq.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index dfbf3ca69b111..0a76267054ee6 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated(): gbq.generate_bq_schema(df) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase): def setUp(self): @@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self): self.assertTrue(isinstance(credentials, GoogleCredentials)) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase): def setUp(self): _setup_common() @@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self): self.assertTrue(pages is not None) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase): def setUp(self): _setup_common() @@ -449,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self): private_key=re.sub('[a-z]', '9', _get_private_key_contents())) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestReadGBQIntegration(tm.TestCase): @classmethod @@ -503,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self): tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): @classmethod @@ -906,7 +906,7 @@ def test_configuration_without_query(self): configuration=config) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. 
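The marker swap in this patch means the gbq suites are still collected but
never run: with run=False, pytest reports each test as an expected failure
("xfail") without executing its body. A minimal sketch of the behaviour
(hypothetical test, not from the patch):

    import pytest

    @pytest.mark.xfail(run=False, reason="intermittent failures")
    def test_flaky_service():
        # never executed: reported as xfail straight away, so an
        # intermittently failing backend cannot break the build
        raise RuntimeError("would flake if it ever ran")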
@@ -1219,7 +1219,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. @@ -1277,7 +1277,7 @@ def test_upload_data(self): self.assertEqual(result['num_rows'][0], test_size) -@pytest.mark.single +@pytest.mark.xfail(run=False, reason="intermittent failures") class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. From 25fb173dcaff5401f2b496e17beba28d14d54c66 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 20:15:20 -0500 Subject: [PATCH 21/52] TST: fix incorrect url in compressed url network tests in parser --- pandas/tests/io/parser/test_network.py | 53 ++++++++++---------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 4d75b59b09560..6e762368f82c5 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -7,7 +7,6 @@ import os import pytest -import functools from itertools import product import pandas.util.testing as tm @@ -15,42 +14,32 @@ from pandas.io.parsers import read_csv, read_table -class TestCompressedUrl(object): +@pytest.fixture(scope='module') +def salaries_table(): + path = os.path.join(tm.get_data_path(), 'salaries.csv') + return read_table(path) - compression_to_extension = { - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', - } - def setup(self): - path = os.path.join(tm.get_data_path(), 'salaries.csv') - self.local_table = read_table(path) - self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/' - 'pandas/io/tests/parser/data/salaries.csv') +@tm.network +@pytest.mark.parametrize( + "compression,extension", [('gzip', '.gz'), ('bz2', '.bz2'), + ('zip', '.zip'), ('xz', '.xz')]) +def test_compressed_urls(salaries_table, compression, extension): + # test reading compressed urls with various engines and + # extension inference + base_url = ('https://github.com/pandas-dev/pandas/raw/master/' + 'pandas/tests/io/parser/data/salaries.csv') + + url = base_url + extension + + # args is a (compression, engine) tuple + for (c, engine) in product([compression, 'infer'], ['python', 'c']): - @tm.network - def test_compressed_urls(self): - # Test reading compressed tables from URL. - msg = ('Test reading {}-compressed tables from URL: ' - 'compression="{}", engine="{}"') - - for compression, extension in self.compression_to_extension.items(): - url = self.base_url + extension - # args is a (compression, engine) tuple - for args in product([compression, 'infer'], ['python', 'c']): - # test_fxn is a workaround for more descriptive nose reporting. - # See http://stackoverflow.com/a/37393684/4651668. 
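The lines removed below are nose-era yield-test plumbing; under pytest the
same matrix of cases comes from the module-scoped fixture and
pytest.mark.parametrize added above. The pattern in isolation (hypothetical
names, sketch only):

    import pytest

    @pytest.fixture(scope='module')
    def reference_table():
        # built once per module, shared by every parametrized case
        return load_reference()  # hypothetical loader

    @pytest.mark.parametrize('compression,extension',
                             [('gzip', '.gz'), ('bz2', '.bz2')])
    def test_round_trip(reference_table, compression, extension):
        # one collected test per (compression, extension) pair
        assert round_trips(reference_table, compression, extension)  # hypothetical check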
- test_fxn = functools.partial(self.check_table) - test_fxn.description = msg.format(compression, *args) - yield (test_fxn, url) + args - - def check_table(self, url, compression, engine): if url.endswith('.xz'): tm._skip_if_no_lzma() - url_table = read_table(url, compression=compression, engine=engine) - tm.assert_frame_equal(url_table, self.local_table) + + url_table = read_table(url, compression=c, engine=engine) + tm.assert_frame_equal(url_table, salaries_table) class TestS3(tm.TestCase): From 03bb9003b3b3db92f3c20a60e88fd2001d6b3948 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 20:44:44 -0500 Subject: [PATCH 22/52] TST: incorrect skip in when --skip-network is run closes #15407 --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b3683de3a173b..623feb99e9cdc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -17,5 +17,5 @@ def pytest_runtest_setup(item): if 'slow' not in item.keywords and item.config.getoption("--only-slow"): pytest.skip("skipping due to --only-slow") - if 'skip' in item.keywords and item.config.getoption("--skip-network"): + if 'network' in item.keywords and item.config.getoption("--skip-network"): pytest.skip("skipping due to --skip-network") From bbb583c30bcee83ed3a2e9a3acfc83535f270632 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Feb 2017 22:25:23 -0500 Subject: [PATCH 23/52] TST: fix test_nework.py fixture under py27 --- pandas/tests/io/parser/test_network.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 6e762368f82c5..721d447262149 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -20,11 +20,15 @@ def salaries_table(): return read_table(path) -@tm.network @pytest.mark.parametrize( "compression,extension", [('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'), ('xz', '.xz')]) def test_compressed_urls(salaries_table, compression, extension): + check_compressed_urls(salaries_table, compression, extension) + + +@tm.network +def check_compressed_urls(salaries_table, compression, extension): # test reading compressed urls with various engines and # extension inference base_url = ('https://github.com/pandas-dev/pandas/raw/master/' From 2372d275b4b2565b4c406d3dfc7c4b4993f1e625 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 15 Feb 2017 10:20:27 -0500 Subject: [PATCH 24/52] BLD: Numexpr 2.4.6 required closes #15213 Author: Francesc Alted Closes #15383 from FrancescAlted/numexpr-2.4.6 and squashes the following commits: c417fe2 [Francesc Alted] Simplify and remove UserWarning testing on numexpr import e1b34a9 [Francesc Alted] Force a reload of pd.computation for actually triggering the UserWarning c081199 [Francesc Alted] Relax the exact message for the ImportError 73f0319 [Francesc Alted] numexpr requisite raised to 2.4.6 0d4ab9a [Francesc Alted] Restored the old numexpr version dependencies to adjust for old requirements c1aae19 [Francesc Alted] Fixed a lint error 7575ba2 [Francesc Alted] Using constants instead of literals for numexpr version 7a275ce [Francesc Alted] Fixed a typo 93f54aa [Francesc Alted] numexpr section moved to Other API changes section 3b6e58b [Francesc Alted] Removed recomendation for numexpr 2.6.2 f225598 [Francesc Alted] Updated test_compat for numexpr 2.4.6 8bd4ed1 [Francesc Alted] numexpr 2.4.6 requirement moved to other enhancements section e45b742 
[Francesc Alted] Moved pinned versions in CI folder to 2.4.6 6e12e29 [Francesc Alted] Added a notice on the recommended numexpr version ac62653 [Francesc Alted] Require numexpr 2.4.6 ab79c54 [Francesc Alted] Require numexpr 2.6.2 --- ci/requirements-3.4_SLOW.run | 2 +- doc/source/install.rst | 2 +- doc/source/whatsnew/v0.20.0.txt | 4 +++- pandas/computation/__init__.py | 17 +++++------------ pandas/tests/computation/test_compat.py | 15 ++++----------- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/ci/requirements-3.4_SLOW.run b/ci/requirements-3.4_SLOW.run index 39018439a1223..90156f62c6e71 100644 --- a/ci/requirements-3.4_SLOW.run +++ b/ci/requirements-3.4_SLOW.run @@ -9,7 +9,7 @@ html5lib patsy beautiful-soup scipy -numexpr=2.4.4 +numexpr=2.4.6 pytables matplotlib lxml diff --git a/doc/source/install.rst b/doc/source/install.rst index 1c7cbc9326614..80a5d7e7d375b 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -226,7 +226,7 @@ Recommended Dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.1 or higher (excluding a buggy 2.4.4). Version 2.4.6 or higher is highly recommended. + If installed, must be Version 2.4.6 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d76e33caffbf1..26006083d81b4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1,6 +1,6 @@ .. _whatsnew_0200: -v0.20.0 (????, 2016) +v0.20.0 (????, 2017) -------------------- This is a major release from 0.19 and includes a small number of API changes, several new features, @@ -158,6 +158,7 @@ Other enhancements .. _whatsnew_0200.api_breaking: + Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -429,6 +430,7 @@ Other API Changes - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`). - Reorganization of timeseries development tests (:issue:`14854`) +- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`). .. 
_whatsnew_0200.deprecations: diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py index 9e94215eecf62..e13faf890d1f8 100644 --- a/pandas/computation/__init__.py +++ b/pandas/computation/__init__.py @@ -3,26 +3,19 @@ from distutils.version import LooseVersion _NUMEXPR_INSTALLED = False +_MIN_NUMEXPR_VERSION = "2.4.6" try: import numexpr as ne ver = ne.__version__ - _NUMEXPR_INSTALLED = ver >= LooseVersion('2.1') + _NUMEXPR_INSTALLED = ver >= LooseVersion(_MIN_NUMEXPR_VERSION) - # we specifically disallow 2.4.4 as - # has some hard-to-diagnose bugs - if ver == LooseVersion('2.4.4'): - _NUMEXPR_INSTALLED = False - warnings.warn( - "The installed version of numexpr {ver} is not supported " - "in pandas and will be not be used\n".format(ver=ver), - UserWarning) - - elif not _NUMEXPR_INSTALLED: + if not _NUMEXPR_INSTALLED: warnings.warn( "The installed version of numexpr {ver} is not supported " "in pandas and will be not be used\nThe minimum supported " - "version is 2.1\n".format(ver=ver), UserWarning) + "version is {min_ver}\n".format( + ver=ver, min_ver=_MIN_NUMEXPR_VERSION), UserWarning) except ImportError: # pragma: no cover pass diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index 599d0c10336dc..77994ac6d2f53 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -10,6 +10,7 @@ from pandas.computation.engines import _engines import pandas.computation.expr as expr +from pandas.computation import _MIN_NUMEXPR_VERSION ENGINES_PARSERS = list(product(_engines, expr._parsers)) @@ -21,15 +22,10 @@ def test_compat(): try: import numexpr as ne ver = ne.__version__ - if ver == LooseVersion('2.4.4'): + if ver < LooseVersion(_MIN_NUMEXPR_VERSION): assert not _NUMEXPR_INSTALLED - elif ver < LooseVersion('2.1'): - with tm.assert_produces_warning(UserWarning, - check_stacklevel=False): - assert not _NUMEXPR_INSTALLED else: assert _NUMEXPR_INSTALLED - except ImportError: pytest.skip("not testing numexpr version compat") @@ -51,12 +47,9 @@ def testit(): except ImportError: pytest.skip("no numexpr") else: - if ne.__version__ < LooseVersion('2.1'): - with tm.assertRaisesRegexp(ImportError, "'numexpr' version is " - ".+, must be >= 2.1"): + if ne.__version__ < LooseVersion(_MIN_NUMEXPR_VERSION): + with tm.assertRaises(ImportError): testit() - elif ne.__version__ == LooseVersion('2.4.4'): - pytest.skip("numexpr version==2.4.4") else: testit() else: From b261dfe38f114b57e358ad09051501684d88587f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 10:23:36 -0500 Subject: [PATCH 25/52] TST: print skipped tests files xref #15341 Author: Jeff Reback Closes #15408 from jreback/skip and squashes the following commits: 547bee6 [Jeff Reback] TST: print skipped tests files --- .travis.yml | 3 ++- ci/install_travis.sh | 1 + ci/print_skipped.py | 7 ++++--- ci/script_multi.sh | 8 ++++---- ci/script_single.sh | 8 ++++---- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6b90e49b336b2..6245213cec06f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -332,5 +332,6 @@ after_script: - echo "after_script start" - ci/install_test.sh - source activate pandas && python -c "import pandas; pandas.show_versions();" - - ci/print_skipped.py /tmp/pytest.xml + - ci/print_skipped.py /tmp/single.xml + - ci/print_skipped.py /tmp/multiple.xml - echo "after_script done" diff --git a/ci/install_travis.sh b/ci/install_travis.sh index ad804b96a0d82..802d8c9f6b776 100755 --- 
a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -112,6 +112,7 @@ fi source activate pandas pip install pytest-xdist + if [ "$LINT" ]; then conda install flake8 pip install cpplint diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 9fb05df64bcea..dd2180f6eeb19 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -30,20 +30,21 @@ def parse_results(filename): i += 1 assert i - 1 == len(skipped) assert i - 1 == len(skipped) - assert len(skipped) == int(root.attrib['skip']) + # assert len(skipped) == int(root.attrib['skip']) return '\n'.join(skipped) def main(args): print('SKIPPED TESTS:') - print(parse_results(args.filename)) + for fn in args.filename: + print(parse_results(fn)) return 0 def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument('filename', help='XUnit file to parse') + parser.add_argument('filename', nargs='+', help='XUnit file to parse') return parser.parse_args() diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 83f8427cc57ad..f5fbcbbc12f83 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/cov.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-append --cov-report xml:/tmp/cov.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas else - echo pytest -n 2 -m "not single" $TEST_ARGS pandas - pytest -n 2 -m "not single" $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/ci/script_single.sh b/ci/script_single.sh index 38021fcac5721..2d7962352842b 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -20,11 +20,11 @@ fi if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is simply a build test." elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas - pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/pytest.xml $TEST_ARGS pandas + echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" $TEST_ARGS pandas - pytest -m "single" $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" From e351ed0fd211a204f960b9116bc13f75ed1f97c4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 10:24:45 -0500 Subject: [PATCH 26/52] PERF: high memory in MI closes #13904 Creates an efficient MultiIndexHashTable in cython. This allows us to efficiently store a multi-index for fast indexing (.get_loc() and .get_indexer()), with the current tuple-based (and gil holding) use of the PyObject Hash Table. This uses the pandas.tools.hashing routines to hash each of the 'values' of a MI to a single uint64. 
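In outline (a conceptual sketch, not the cython implementation): hash every
scalar of a MultiIndex entry, mix the per-level hashes into one 64-bit key,
and key a {uint64 -> position} table by that integer instead of by
materialized tuples. A real table must still re-check hits against the actual
index values to guard against hash collisions.

    # illustration only: positions keyed by a 64-bit mix of per-level hashes
    entries = [(1, 'a'), (1, 'b'), (2, 'a')]

    def hash_entry(entry):
        h = 0
        for item in entry:
            # fold each level's hash in (splitmix-style odd constant)
            h = (h * 0x9e3779b97f4a7c15 + hash(item)) & (2 ** 64 - 1)
        return h

    table = {hash_entry(e): i for i, e in enumerate(entries)}

    def get_loc(entry):
        return table[hash_entry(entry)]  # O(1), no tuples stored

    assert get_loc((2, 'a')) == 2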
So this makes MI more memory friendly and much more efficient. You get these speedups, because the creation of the hashtable is now much more efficient. Author: Jeff Reback Closes #15245 from jreback/mi and squashes the following commits: 7df6c34 [Jeff Reback] PERF: high memory in MI --- asv_bench/benchmarks/indexing.py | 30 +++- asv_bench/benchmarks/reindex.py | 4 +- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/core/algorithms.py | 3 +- pandas/core/frame.py | 3 +- pandas/hashtable.pxd | 8 + pandas/index.pyx | 39 ++++- pandas/indexes/base.py | 5 +- pandas/indexes/multi.py | 203 ++++++++++++++++++---- pandas/io/pytables.py | 4 +- pandas/src/algos_common_helper.pxi.in | 4 +- pandas/src/hashtable_class_helper.pxi.in | 152 +++++++++++++--- pandas/tests/frame/test_mutate_columns.py | 29 +++- pandas/tests/frame/test_repr_info.py | 32 ++++ pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/indexes/test_multi.py | 136 +++++++++++++-- pandas/tests/indexing/test_multiindex.py | 3 +- pandas/tests/test_multilevel.py | 4 +- pandas/tests/tools/test_hashing.py | 12 ++ pandas/tests/tools/test_join.py | 6 +- pandas/tools/hashing.py | 44 +++-- pandas/types/cast.py | 3 +- 22 files changed, 605 insertions(+), 125 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 27cd320c661e0..d938cc6a6dc4d 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -88,7 +88,7 @@ def setup(self): def time_getitem_scalar(self): self.ts[self.dt] - + class DataFrameIndexing(object): goal_time = 0.2 @@ -189,6 +189,15 @@ def setup(self): self.eps_C = 5 self.eps_D = 5000 self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() + self.miint = MultiIndex.from_product( + [np.arange(1000), + np.arange(1000)], names=['one', 'two']) + + import string + self.mistring = MultiIndex.from_product( + [np.arange(1000), + np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) def time_series_xs_mi_ix(self): self.s.ix[999] @@ -197,7 +206,24 @@ def time_frame_xs_mi_ix(self): self.df.ix[999] def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + self.mdt2.loc[self.idx[ + (self.test_A - self.eps_A):(self.test_A + self.eps_A), + (self.test_B - self.eps_B):(self.test_B + self.eps_B), + (self.test_C - self.eps_C):(self.test_C + self.eps_C), + (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + + def time_multiindex_get_indexer(self): + self.miint.get_indexer( + np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object)) + + def time_multiindex_string_get_loc(self): + self.mistring.get_loc((999, 19, 'Z')) + + def time_is_monotonic(self): + self.miint.is_monotonic class PanelIndexing(object): diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8db0cd7629332..6fe6c32a96df9 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -16,8 +16,8 @@ def setup(self): data=np.random.rand(10000, 30), columns=range(30)) # multi-index - N = 1000 - K = 20 + N = 5000 + K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) level2 = np.tile(tm.makeStringIndex(K).values, N) index = MultiIndex.from_arrays([level1, level2]) diff --git a/doc/source/whatsnew/v0.20.0.txt 
b/doc/source/whatsnew/v0.20.0.txt index 26006083d81b4..4708abe4d592e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -472,7 +472,7 @@ Performance Improvements - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - +- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. @@ -502,6 +502,8 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) + +- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 05cfb1bd9ec27..c922ac21e12eb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = _ensure_int64(indexer) + indexer = _ensure_int64(indexer, copy=False) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False @@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 16f8d4658dc20..9c66f6dbb273e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1752,7 +1752,8 @@ def _sizeof_fmt(num, size_qualifier): # all cases (e.g., it misses categorical data even with object # categories) deep = False - if 'object' in counts or is_object_dtype(self.index): + if ('object' in counts or + self.index._is_memory_usage_qualified()): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index cabfa43a76f26..9b352ae1c003b 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -31,6 +31,14 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) +cdef class MultiIndexHashTable(HashTable): + cdef: + kh_uint64_t *table + object mi + + cpdef get_item(self, object val) + cpdef set_item(self, object key, Py_ssize_t val) + cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/index.pyx b/pandas/index.pyx index 0c975d1775a03..37fe7d90bebe0 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -182,7 +182,7 @@ cdef class IndexEngine: Py_ssize_t i, n int last_true - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) n = len(values) result = np.empty(n, dtype=bool) @@ -284,7 +284,6 @@ cdef class IndexEngine: if not self.is_mapping_populated: values = self._get_index_values() - self.mapping = 
self._make_hash_table(len(values)) self.mapping.map_locations(values) @@ -322,7 +321,7 @@ cdef class IndexEngine: Py_ssize_t i, j, n, n_t, n_alloc self._ensure_mapping_populated() - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) stargets = set(targets) n = len(values) n_t = len(targets) @@ -554,5 +553,39 @@ cdef inline bint _is_utc(object tz): return tz is UTC or isinstance(tz, _du_utc) +cdef class MultiIndexEngine(IndexEngine): + + def _call_monotonic(self, object mi): + # defer these back to the mi itself + return (mi.is_monotonic_increasing, + mi.is_monotonic_decreasing, + mi.is_unique) + + def get_backfill_indexer(self, other, limit=None): + # we coerce to ndarray-of-tuples + values = np.array(self._get_index_values()) + return algos.backfill_object(values, other, limit=limit) + + def get_pad_indexer(self, other, limit=None): + # we coerce to ndarray-of-tuples + values = np.array(self._get_index_values()) + return algos.pad_object(values, other, limit=limit) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError("'{val}' is an invalid key".format(val=val)) + + self._ensure_mapping_populated() + if not self.unique: + return self._get_loc_duplicates(val) + + try: + return self.mapping.get_item(val) + except TypeError: + raise KeyError(val) + + cdef _make_hash_table(self, n): + return _hash.MultiIndexHashTable(n) + # Generated from template. include "index_class_helper.pxi" diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index bb2941a121452..c483fb0764a4c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1431,6 +1431,10 @@ def inferred_type(self): """ return a string of the type inferred from the values """ return lib.infer_dtype(self) + def _is_memory_usage_qualified(self): + """ return a boolean if we need a qualified .info display """ + return self.is_object() + def is_type_compatible(self, kind): return kind == self.inferred_type @@ -2446,7 +2450,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): 'if index and target are monotonic' % method) side = 'left' if method == 'pad' else 'right' - target = np.asarray(target) # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 653ba1fee5691..57739548a17d6 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -14,7 +14,6 @@ from pandas.compat.numpy import function as nv from pandas import compat - from pandas.types.common import (_ensure_int64, _ensure_platform_int, is_object_dtype, @@ -73,6 +72,7 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() _comparables = ['names'] + _engine_type = _index.MultiIndexEngine rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, @@ -114,7 +114,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, result._verify_integrity() if _set_identity: result._reset_identity() - return result def _verify_integrity(self, labels=None, levels=None): @@ -429,6 +428,12 @@ def _shallow_copy(self, values=None, **kwargs): def dtype(self): return np.dtype('O') + def _is_memory_usage_qualified(self): + """ return a boolean if we need a qualified .info display """ + def f(l): + return 'mixed' in l or 'string' in l or 'unicode' in l + return any([f(l) for l in self._inferred_type_levels]) + @Appender(Index.memory_usage.__doc__) def memory_usage(self, deep=False): # we are overwriting 
our base class to avoid @@ -619,6 +624,10 @@ def _get_level_number(self, level): _tuples = None + @cache_readonly + def _engine(self): + return self._engine_type(lambda: self, len(self)) + @property def values(self): if self._tuples is not None: @@ -655,10 +664,95 @@ def _has_complex_internals(self): # to disable groupby tricks return True + @cache_readonly + def is_monotonic(self): + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + return self.is_monotonic_increasing + + @cache_readonly + def is_monotonic_increasing(self): + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + + # reversed() because lexsort() wants the most significant key last. + values = [self._get_level_values(i) + for i in reversed(range(len(self.levels)))] + try: + sort_order = np.lexsort(values) + return Index(sort_order).is_monotonic + except TypeError: + + # we have mixed types and np.lexsort is not happy + return Index(self.values).is_monotonic + + @property + def is_monotonic_decreasing(self): + """ + return if the index is monotonic decreasing (only equal or + decreasing) values. + """ + return False + @cache_readonly def is_unique(self): return not self.duplicated().any() + @cache_readonly + def _have_mixed_levels(self): + """ return a boolean list indicating if we have mixed levels """ + return ['mixed' in l for l in self._inferred_type_levels] + + @cache_readonly + def _inferred_type_levels(self): + """ return a list of the inferred types, one for each level """ + return [i.inferred_type for i in self.levels] + + @cache_readonly + def _hashed_values(self): + """ return a uint64 ndarray of my hashed values """ + from pandas.tools.hashing import hash_tuples + return hash_tuples(self) + + def _hashed_indexing_key(self, key): + """ + validate and return the hash for the provided key + + *this is internal, for use by the cython routines* + + Parameters + ---------- + key : string or tuple + + Returns + ------- + np.uint64 + + Notes + ----- + we need to stringify if we have mixed levels + + """ + from pandas.tools.hashing import hash_tuples + + if not isinstance(key, tuple): + return hash_tuples(key) + + if not len(key) == self.nlevels: + raise KeyError + + def f(k, stringify): + if stringify and not isinstance(k, compat.string_types): + k = str(k) + return k + key = tuple([f(k, stringify) + for k, stringify in zip(key, self._have_mixed_levels)]) + return hash_tuples(key) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) @@ -748,26 +842,44 @@ def _try_mi(k): raise InvalidIndexError(key) - def get_level_values(self, level): + def _get_level_values(self, level): """ - Return vector of label values for requested level, equal to the length - of the index + Return vector of label values for requested level, + equal to the length of the index + + **this is an internal method** Parameters ---------- - level : int or level name + level : int level Returns ------- values : ndarray """ - num = self._get_level_number(level) - unique = self.levels[num] # .values - labels = self.labels[num] - filled = algos.take_1d(unique.values, labels, + + unique = self.levels[level] + labels = self.labels[level] + filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) - values = unique._shallow_copy(filled) - return values + return filled + + def get_level_values(self, level): + """ + Return vector of label values for requested level, 
equal to the length of the index + + Parameters + ---------- + level : int or level name + + Returns + ------- + values : Index + """ + level = self._get_level_number(level) + values = self._get_level_values(level) + return self.levels[level]._shallow_copy(values) def format(self, space=2, sparsify=None, adjoin=True, names=False, na_rep=None, formatter=None): @@ -852,7 +964,8 @@ def to_frame(self, index=True): from pandas import DataFrame result = DataFrame({(name or level): self.get_level_values(level) for name, level in - zip(self.names, range(len(self.levels)))}) + zip(self.names, range(len(self.levels)))}, + copy=False) if index: result.index = self return result @@ -1482,29 +1595,41 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = _ensure_index(target) - target_index = target - if isinstance(target, MultiIndex): - target_index = target._tuple_index + # empty indexer + if is_list_like(target) and not len(target): + return _ensure_platform_int(np.array([])) + + if not isinstance(target, MultiIndex): + try: + target = MultiIndex.from_tuples(target) + except (TypeError, ValueError): - if not is_object_dtype(target_index.dtype): - return np.ones(len(target_index)) * -1 + # let's instead try with a straight Index + if method is None: + return Index(self.values).get_indexer(target, + method=method, + limit=limit, + tolerance=tolerance) if not self.is_unique: raise Exception('Reindexing only valid with uniquely valued Index ' 'objects') - self_index = self._tuple_index - if method == 'pad' or method == 'backfill': if tolerance is not None: raise NotImplementedError("tolerance not implemented yet " 'for MultiIndex') - indexer = self_index._get_fill_indexer(target, method, limit) + indexer = self._get_fill_indexer(target, method, limit) elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for MultiIndex; see GitHub issue 9365') else: - indexer = self_index._engine.get_indexer(target._values) + # we may not compare equally because of hashing if we + # don't have the same dtypes + if self._inferred_type_levels != target._inferred_type_levels: + return Index(self.values).get_indexer(target.values) + + indexer = self._engine.get_indexer(target) return _ensure_platform_int(indexer) @@ -1571,17 +1696,6 @@ def reindex(self, target, method=None, level=None, limit=None, return target, indexer - @cache_readonly - def _tuple_index(self): - """ - Convert MultiIndex to an Index of tuples - - Returns - ------- - index : Index - """ - return Index(self._values) - def get_slice_bound(self, label, side, kind): if not isinstance(label, tuple): @@ -1828,8 +1942,9 @@ def partial_selection(key, indexer=None): key = tuple(self[indexer].tolist()[0]) - return (self._engine.get_loc(_values_from_object(key)), - None) + return (self._engine.get_loc( + _values_from_object(key)), None) + else: return partial_selection(key) else: @@ -2115,10 +2230,24 @@ def equals(self, other): return False for i in range(self.nlevels): + slabels = self.labels[i] + slabels = slabels[slabels != -1] svalues = algos.take_nd(np.asarray(self.levels[i]._values), - self.labels[i], allow_fill=False) + slabels, allow_fill=False) + + olabels = other.labels[i] + olabels = olabels[olabels != -1] ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - other.labels[i], allow_fill=False) + olabels, allow_fill=False) + + # since we use NaT for both datetime64 and timedelta64 + # we can have a situation where a level is typed say 
+ timedelta64 in self (IOW it has other values than NaT) + # but typed datetime64 in other (where it's all NaT) + # but these are equivalent + if len(svalues) == 0 and len(ovalues) == 0: + continue + if not array_equivalent(svalues, ovalues): return False diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9224f7d3d9a94..d8de1dcd61977 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3787,9 +3787,9 @@ def read(self, where=None, columns=None, **kwargs): lp = DataFrame(c.data, index=long_index, columns=c.values) # need a better algorithm - tuple_index = long_index._tuple_index + tuple_index = long_index.values - unique_tuples = lib.fast_unique(tuple_index.values) + unique_tuples = lib.fast_unique(tuple_index) unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 42089f9520ab6..b83dec1d26242 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -579,12 +579,12 @@ def get_dispatch(dtypes): {{for name, c_type, dtype in get_dispatch(dtypes)}} -cpdef ensure_{{name}}(object arr): +cpdef ensure_{{name}}(object arr, copy=True): if util.is_array(arr): if (<ndarray> arr).descr.type_num == NPY_{{c_type}}: return arr else: - return arr.astype(np.{{dtype}}) + return arr.astype(np.{{dtype}}, copy=copy) else: return np.array(arr, dtype=np.{{dtype}}) diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index ef385ba7dca1c..3ce82dace40a9 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -262,13 +262,6 @@ cdef class {{name}}HashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, {{dtype}}_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): cdef: khiter_t k @@ -501,18 +494,6 @@ cdef class StringHashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef: - Py_ssize_t i, val - char *v - - v = util.get_c_string(key) - - for i in range(iterations): - k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k @@ -755,15 +736,6 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k @@ -874,3 +846,127 @@ cdef class PyObjectHashTable(HashTable): count += 1 return np.asarray(labels) + + +cdef class MultiIndexHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_uint64() + self.mi = None + kh_resize_uint64(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + kh_destroy_uint64(self.table) + self.table = NULL + + def __len__(self): + return self.table.size + + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(uint64_t) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + + def 
_check_for_collisions(self, int64_t[:] locs, object mi): + # validate that the locs map to the actual values + # provided in the mi + # we can only check if we *don't* have any missing values + # :< + cdef: + ndarray[int64_t] alocs + + alocs = np.asarray(locs) + if (alocs != -1).all(): + + result = self.mi.take(locs) + if isinstance(mi, tuple): + from pandas import Index + mi = Index([mi]) + if not result.equals(mi): + raise AssertionError( + "hash collision\nlocs:\n{}\n" + "result:\n{}\nmi:\n{}".format(alocs, result, mi)) + + def __contains__(self, object key): + try: + self.get_item(key) + return True + except (KeyError, ValueError, TypeError): + return False + + cpdef get_item(self, object key): + cdef: + khiter_t k + uint64_t value + int64_t[:] locs + Py_ssize_t loc + + value = self.mi._hashed_indexing_key(key) + k = kh_get_uint64(self.table, value) + if k != self.table.n_buckets: + loc = self.table.vals[k] + locs = np.array([loc], dtype=np.int64) + self._check_for_collisions(locs, key) + return loc + else: + raise KeyError(key) + + cpdef set_item(self, object key, Py_ssize_t val): + raise NotImplementedError + + @cython.boundscheck(False) + def map_locations(self, object mi): + cdef: + Py_ssize_t i, n + ndarray[uint64_t] values + uint64_t val + int ret = 0 + khiter_t k + + self.mi = mi + n = len(mi) + values = mi._hashed_values + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_uint64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, object mi): + # look up with a target mi + cdef: + Py_ssize_t i, n + ndarray[uint64_t] values + int ret = 0 + uint64_t val + khiter_t k + int64_t[:] locs + + n = len(mi) + values = mi._hashed_values + + locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_uint64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + self._check_for_collisions(locs, mi) + return np.asarray(locs) + + def unique(self, object mi): + raise NotImplementedError + + def get_labels(self, object mi, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + raise NotImplementedError diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 6b4c56747c981..fe3f3c554a9b5 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import print_function - +import pytest from pandas.compat import range, lrange import numpy as np -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, MultiIndex from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -165,6 +165,31 @@ def test_delitem(self): del self.frame['A'] self.assertNotIn('A', self.frame) + def test_delitem_multiindex(self): + midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + df = DataFrame(np.random.randn(4, 4), columns=midx) + assert len(df.columns) == 4 + assert ('A', ) in df.columns + assert 'A' in df.columns + + result = df['A'] + assert isinstance(result, DataFrame) + del df['A'] + + assert len(df.columns) == 2 + + # A still in the levels, BUT get a KeyError if trying + # to delete + assert ('A', ) not in df.columns + with pytest.raises(KeyError): + del df[('A',)] + + # xref: https://github.com/pandas-dev/pandas/issues/2770 + # the 'A' is STILL in the columns! 
+ assert 'A' in df.columns + with pytest.raises(KeyError): + del df['A'] + def test_pop(self): self.frame.columns.name = 'baz' diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 2df297d03bcdf..024e11e63a924 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -301,10 +301,12 @@ def test_info_memory_usage(self): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) buf = StringIO() + # display memory usage case df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() self.assertTrue("memory usage: " in res[-1]) + # do not display memory usage case df.info(buf=buf, memory_usage=False) res = buf.getvalue().splitlines() @@ -312,11 +314,13 @@ def test_info_memory_usage(self): df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() + # memory usage is a lower bound, so print it as XYZ+ MB self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) df.iloc[:, :5].info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() + # excluded column with object dtype, so estimate is accurate self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1])) @@ -380,6 +384,34 @@ def test_info_memory_usage(self): diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df) self.assertTrue(abs(diff) < 100) + def test_info_memory_usage_qualified(self): + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=[1, 2, 3]) + df.info(buf=buf) + self.assertFalse('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=list('ABC')) + df.info(buf=buf) + self.assertTrue('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=pd.MultiIndex.from_product( + [range(3), range(3)])) + df.info(buf=buf) + self.assertFalse('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=pd.MultiIndex.from_product( + [range(3), ['foo', 'bar']])) + df.info(buf=buf) + self.assertTrue('+' in buf.getvalue()) + def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 # memory usage introspection should not materialize .values diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3a6a9eaaa8e72..d53446870beb1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1588,7 +1588,7 @@ def test_groupby_as_index_cython(self): result = grouped.mean() expected = data.groupby(['A', 'B']).mean() - arrays = lzip(*expected.index._tuple_index) + arrays = lzip(*expected.index.values) expected.insert(0, 'A', arrays[0]) expected.insert(1, 'B', arrays[1]) expected.index = np.arange(len(expected)) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 702c4758da245..5611492b4af1b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1046,6 +1046,21 @@ def test_contains(self): self.assertNotIn(('bar', 'two'), self.index) self.assertNotIn(None, self.index) + def test_contains_top_level(self): + midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + assert 'A' in midx + assert 'A' not in midx._engine + + def test_contains_with_nat(self): + # MI with a NaT + mi = MultiIndex(levels=[['C'], + pd.date_range('2012-01-01', periods=5)], + labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, 'B']) + self.assertTrue(('C', pd.Timestamp('2012-01-01')) in mi) + for val in mi.values: + self.assertTrue(val in mi) + def test_is_all_dates(self): 
self.assertFalse(self.index.is_all_dates) @@ -1102,6 +1117,17 @@ def test_get_loc_duplicates(self): xp = 0 assert (rs == xp) + def test_get_value_duplicates(self): + index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + + assert index.get_loc('D') == slice(0, 3) + with pytest.raises(KeyError): + index._engine.get_value(np.array([]), 'D') + def test_get_loc_level(self): index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( @@ -1294,7 +1320,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rbfill1) # pass non-MultiIndex - r1 = idx1.get_indexer(idx2._tuple_index) + r1 = idx1.get_indexer(idx2.values) rexp1 = idx1.get_indexer(idx2) assert_almost_equal(r1, rexp1) @@ -1316,6 +1342,19 @@ def test_get_indexer_nearest(self): with tm.assertRaises(NotImplementedError): midx.get_indexer(['a'], method='pad', tolerance=2) + def test_hash_collisions(self): + # non-smoke test that we don't get hash collisions + + index = MultiIndex.from_product([np.arange(1000), np.arange(1000)], + names=['one', 'two']) + result = index.get_indexer(index.values) + self.assert_numpy_array_equal(result, + np.arange(len(index), dtype='int64')) + + for i in [0, 1, len(index) - 2, len(index) - 1]: + result = index.get_loc(index[i]) + self.assertEqual(result, i) + def test_format(self): self.index.format() self.index[:0].format() @@ -1420,12 +1459,13 @@ def test_bounds(self): self.index._bounds def test_equals_multi(self): - self.assertTrue(self.index.equals(self.index)) - self.assertTrue(self.index.equal_levels(self.index)) - - self.assertFalse(self.index.equals(self.index[:-1])) + assert self.index.equals(self.index) + assert not self.index.equals(self.index.values) + assert self.index.equals(Index(self.index.values)) - self.assertTrue(self.index.equals(self.index._tuple_index)) + assert self.index.equal_levels(self.index) + assert not self.index.equals(self.index[:-1]) + assert not self.index.equals(self.index[-1]) # different number of levels index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( @@ -1433,8 +1473,8 @@ def test_equals_multi(self): [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) index2 = MultiIndex(levels=index.levels[:-1], labels=index.labels[:-1]) - self.assertFalse(index.equals(index2)) - self.assertFalse(index.equal_levels(index2)) + assert not index.equals(index2) + assert not index.equal_levels(index2) # levels are different major_axis = Index(lrange(4)) @@ -1445,8 +1485,8 @@ def test_equals_multi(self): index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) - self.assertFalse(self.index.equals(index)) - self.assertFalse(self.index.equal_levels(index)) + assert not self.index.equals(index) + assert not self.index.equal_levels(index) # some of the labels are different major_axis = Index(['foo', 'bar', 'baz', 'qux']) @@ -1457,7 +1497,16 @@ def test_equals_multi(self): index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) - self.assertFalse(self.index.equals(index)) + assert not self.index.equals(index) + + def test_equals_missing_values(self): + # make sure take is not using -1 + i = pd.MultiIndex.from_tuples([(0, pd.NaT), + (0, pd.Timestamp('20130101'))]) + result = i[0:1].equals(i[0]) + self.assertFalse(result) + result = i[1:2].equals(i[1]) + self.assertFalse(result) def 
test_identical(self): mi = self.index.copy() @@ -1510,7 +1559,7 @@ def test_union(self): the_union = piece1 | piece2 - tups = sorted(self.index._tuple_index) + tups = sorted(self.index.values) expected = MultiIndex.from_tuples(tups) self.assertTrue(the_union.equals(expected)) @@ -1523,7 +1572,7 @@ def test_union(self): self.assertIs(the_union, self.index) # won't work in python 3 - # tuples = self.index._tuple_index + # tuples = self.index.values # result = self.index[:4] | tuples[4:] # self.assertTrue(result.equals(tuples)) @@ -1543,7 +1592,7 @@ def test_intersection(self): piece2 = self.index[3:] the_int = piece1 & piece2 - tups = sorted(self.index[3:5]._tuple_index) + tups = sorted(self.index[3:5].values) expected = MultiIndex.from_tuples(tups) self.assertTrue(the_int.equals(expected)) @@ -1557,7 +1606,7 @@ def test_intersection(self): self.assertTrue(empty.equals(expected)) # can't do in python 3 - # tuples = self.index._tuple_index + # tuples = self.index.values # result = self.index & tuples # self.assertTrue(result.equals(tuples)) @@ -1616,7 +1665,7 @@ def test_difference(self): self.assertEqual(len(result), 0) # raise Exception called with non-MultiIndex - result = first.difference(first._tuple_index) + result = first.difference(first.values) self.assertTrue(result.equals(first[:0])) # name from empty array @@ -1642,7 +1691,7 @@ def test_from_tuples(self): def test_argsort(self): result = self.index.argsort() - expected = self.index._tuple_index.argsort() + expected = self.index.values.argsort() tm.assert_numpy_array_equal(result, expected) def test_sortlevel(self): @@ -2297,11 +2346,60 @@ def test_level_setting_resets_attributes(self): ind = MultiIndex.from_arrays([ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] ]) - assert ind.is_monotonic + self.assertTrue(ind.is_monotonic) ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], inplace=True) + # if this fails, probably didn't reset the cache correctly. 
- assert not ind.is_monotonic + self.assertFalse(ind.is_monotonic) + + def test_is_monotonic(self): + i = MultiIndex.from_product([np.arange(10), + np.arange(10)], names=['one', 'two']) + self.assertTrue(i.is_monotonic) + self.assertTrue(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([np.arange(10, 0, -1), + np.arange(10)], names=['one', 'two']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([np.arange(10), + np.arange(10, 0, -1)], + names=['one', 'two']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + # string ordering + i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], + ['mom', 'next', 'zenith']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.assertTrue(i.is_monotonic) + self.assertTrue(Index(i.values).is_monotonic) + + # mixed levels, hits the TypeError + i = MultiIndex( + levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237', + 'nl0000289783', + 'nl0000289965', 'nl0000301109']], + labels=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + names=['household_id', 'asset_id']) + + self.assertFalse(i.is_monotonic) def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 1e6ecbbcdc756..b6b9ac93b234c 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -413,9 +413,10 @@ def f(): df.loc[idx[:, :, 'Stock'], 'price'] *= 2 tm.assert_frame_equal(df, expected) - def test_getitem_multiindex(self): + def test_getitem_duplicates_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so it doesn't raise # the appropriate error, only in PY3 of course! 
+ index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8e0628eefa392..0f36af2c8c4e7 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1469,7 +1469,7 @@ def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' - arrays = [np.array(x) for x in zip(*df.columns._tuple_index)] + arrays = [np.array(x) for x in zip(*df.columns.values)] result = df['foo'] result2 = df.loc[:, 'foo'] @@ -1493,7 +1493,7 @@ def test_series_getitem_not_sorted(self): index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) - arrays = [np.array(x) for x in zip(*index._tuple_index)] + arrays = [np.array(x) for x in zip(*index.values)] result = s['qux'] result2 = s.loc['qux'] diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 05a352f259e8b..9bed0d428bc41 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -152,6 +152,18 @@ def test_categorical_consistency(self): tm.assert_series_equal(h1, h2) tm.assert_series_equal(h1, h3) + def test_categorical_with_nan_consistency(self): + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], + categories=pd.date_range('2012-01-01', periods=5, name='B')) + expected = hash_array(c, categorize=False) + c = pd.Categorical.from_codes( + [-1, 0], + categories=[pd.Timestamp('2012-01-01')]) + result = hash_array(c, categorize=False) + assert result[0] in expected + assert result[1] in expected + def test_pandas_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: diff --git a/pandas/tests/tools/test_join.py b/pandas/tests/tools/test_join.py index ab42b1212301b..ee6b3d57b852d 100644 --- a/pandas/tests/tools/test_join.py +++ b/pandas/tests/tools/test_join.py @@ -7,7 +7,7 @@ from pandas.compat import lrange import pandas.compat as compat from pandas.util.testing import assert_frame_equal -from pandas import DataFrame, MultiIndex, Series, merge, concat +from pandas import DataFrame, MultiIndex, Series, Index, merge, concat import pandas._join as _join import pandas.util.testing as tm @@ -368,7 +368,7 @@ def test_join_multiindex(self): df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index.union(index2._tuple_index) + ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) @@ -378,7 +378,7 @@ def test_join_multiindex(self): df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) - ex_index = index1._tuple_index.union(index2._tuple_index) + ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 800e0b8815443..ef863510cdd87 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -5,7 +5,6 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex -import pandas.core.algorithms as algos from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, @@ -142,20 +141,18 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): if not 
isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) - # create a list-of-ndarrays - def get_level_values(num): - unique = vals.levels[num] # .values - labels = vals.labels[num] - filled = algos.take_1d(unique._values, labels, - fill_value=unique._na_value) - return filled - - vals = [get_level_values(level) + # create a list-of-Categoricals + vals = [Categorical(vals.labels[level], + vals.levels[level], + ordered=False, + fastpath=True) for level in range(vals.nlevels)] # hash the list-of-ndarrays - hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) - for l in vals) + hashes = (_hash_categorical(cat, + encoding=encoding, + hash_key=hash_key) + for cat in vals) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] @@ -178,9 +175,26 @@ def _hash_categorical(c, encoding, hash_key): ------- ndarray of hashed values array, same size as len(c) """ - cat_hashed = hash_array(c.categories.values, encoding, hash_key, - categorize=False).astype(np.uint64, copy=False) - return c.rename_categories(cat_hashed).astype(np.uint64, copy=False) + hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False) + + # we have uint64, as we don't directly support missing values + # we don't want to use take_nd which will coerce to float + # instead, directly construct the result with a + # max(np.uint64) as the missing value indicator + # + # TODO: GH 15362 + + mask = c.isnull() + if len(hashed): + result = hashed.take(c.codes) + else: + result = np.zeros(len(mask), dtype='uint64') + + if mask.any(): + result[mask] = np.iinfo(np.uint64).max + + return result def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 6b1c3f9c00351..b1a17df64aecf 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -12,7 +12,8 @@ is_datetime64tz_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_dtype_equal, is_float_dtype, is_complex_dtype, - is_integer_dtype, is_datetime_or_timedelta_dtype, + is_integer_dtype, + is_datetime_or_timedelta_dtype, is_bool_dtype, is_scalar, _string_dtypes, _coerce_to_dtype, From 93f5e3a0c11c82ad6b7365e83637d133c1a6e8a5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 12:59:11 -0500 Subject: [PATCH 27/52] STYLE: flake8 upgraded to 3.3 on conda (#15412) fixes for E305, 2 blank lines after a class definition --- pandas/compat/numpy/__init__.py | 1 + pandas/compat/numpy/function.py | 7 +++++++ pandas/computation/expr.py | 1 + pandas/core/algorithms.py | 2 ++ pandas/core/config.py | 1 + pandas/core/config_init.py | 2 ++ pandas/core/frame.py | 2 +- pandas/core/indexing.py | 2 ++ pandas/formats/format.py | 2 +- pandas/indexes/numeric.py | 3 +++ pandas/indexes/range.py | 1 + pandas/io/common.py | 2 ++ pandas/io/excel.py | 5 +++++ pandas/io/gbq.py | 1 + pandas/io/packers.py | 2 ++ pandas/io/parsers.py | 2 ++ pandas/io/pytables.py | 4 ++++ pandas/io/sql.py | 1 + pandas/io/stata.py | 1 + pandas/msgpack/__init__.py | 1 + pandas/sparse/frame.py | 1 + pandas/sparse/series.py | 1 + pandas/stats/moments.py | 3 +++ pandas/tests/sparse/test_libsparse.py | 2 +- pandas/tests/test_generic.py | 1 + pandas/tools/merge.py | 5 +++++ pandas/tools/plotting.py | 1 + pandas/tseries/frequencies.py | 2 ++ pandas/tseries/holiday.py | 2 ++ pandas/tseries/index.py | 1 + pandas/tseries/interval.py | 3 --- pandas/tseries/offsets.py | 4 ++++ pandas/tseries/resample.py | 4 ++++ pandas/tseries/timedeltas.py | 1 + pandas/types/generic.py | 1 + pandas/util/print_versions.py | 1 + 
pandas/util/terminal.py | 1 + pandas/util/testing.py | 3 +++ 38 files changed, 74 insertions(+), 6 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index bfd770d7af2c6..4a9a2647ece0f 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -67,6 +67,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs): return np.array(arr, *args, **kwargs) + __all__ = ['np', '_np_version_under1p8', '_np_version_under1p9', diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index eb9e9ecc359b2..4053994efa005 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -55,6 +55,7 @@ def __call__(self, args, kwargs, fname=None, raise ValueError("invalid validation method " "'{method}'".format(method=method)) + ARGMINMAX_DEFAULTS = dict(out=None) validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin', method='both', max_fname_arg_count=1) @@ -97,6 +98,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): validate_argmax(args, kwargs) return skipna + ARGSORT_DEFAULTS = OrderedDict() ARGSORT_DEFAULTS['axis'] = -1 ARGSORT_DEFAULTS['kind'] = 'quicksort' @@ -121,6 +123,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs): validate_argsort(args, kwargs, max_fname_arg_count=1) return ascending + CLIP_DEFAULTS = dict(out=None) validate_clip = CompatValidator(CLIP_DEFAULTS, fname='clip', method='both', max_fname_arg_count=3) @@ -141,6 +144,7 @@ def validate_clip_with_axis(axis, args, kwargs): validate_clip(args, kwargs) return axis + COMPRESS_DEFAULTS = OrderedDict() COMPRESS_DEFAULTS['axis'] = None COMPRESS_DEFAULTS['out'] = None @@ -170,6 +174,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): validate_cum_func(args, kwargs, fname=name) return skipna + LOGICAL_FUNC_DEFAULTS = dict(out=None) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') @@ -236,6 +241,7 @@ def validate_take_with_convert(convert, args, kwargs): validate_take(args, kwargs, max_fname_arg_count=3, method='both') return convert + TRANSPOSE_DEFAULTS = dict(axes=None) validate_transpose = CompatValidator(TRANSPOSE_DEFAULTS, fname='transpose', method='both', max_fname_arg_count=0) @@ -318,6 +324,7 @@ def validate_groupby_func(name, args, kwargs, allowed=None): "with groupby. Use .groupby(...)." 
"{func}() instead".format(func=name))) + RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod', 'mean', 'std', 'var') diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f1cf210754d12..a782287175327 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -669,6 +669,7 @@ def visitor(x, y): operands = node.values return reduce(visitor, operands) + # ast.Call signature changed on 3.5, # conditionally change which methods is named # visit_Call depending on Python version, #11097 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c922ac21e12eb..4ae46fe33a5cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -926,6 +926,7 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr): else: return inds + _dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'} @@ -959,6 +960,7 @@ def _hashtable_algo(f, values, return_dtype=None): # use Object return f(htable.PyObjectHashTable, _ensure_object) + _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), diff --git a/pandas/core/config.py b/pandas/core/config.py index ed63c865ebfb4..1c0eb60b8ec2f 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -804,6 +804,7 @@ def inner(x): return inner + # common type validators, for convenience # usage: register_option(... , validator = is_int) is_int = is_type_factory(int) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index fe47391c9ff81..d3db633f3aa04 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -278,6 +278,7 @@ def mpl_style_cb(key): return val + with cf.config_prefix('display'): cf.register_option('precision', 6, pc_precision_doc, validator=is_int) cf.register_option('float_format', None, float_format_doc, @@ -380,6 +381,7 @@ def use_inf_as_null_cb(key): from pandas.types.missing import _use_inf_as_null _use_inf_as_null(key) + with cf.config_prefix('mode'): cf.register_option('use_inf_as_null', False, use_inf_as_null_doc, cb=use_inf_as_null_cb) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c66f6dbb273e..f7c306ea7ce95 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5741,9 +5741,9 @@ def _from_nested_dict(data): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) + # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - DataFrame.plot = base.AccessorProperty(gfx.FramePlotMethods, gfx.FramePlotMethods) DataFrame.hist = gfx.hist_frame diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6bb2d1c479844..66510a7708e64 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -36,6 +36,7 @@ def get_indexers_list(): ('iat', _iAtIndexer), ] + # "null slice" _NS = slice(None, None) @@ -1850,6 +1851,7 @@ def _convert_key(self, key, is_setter=False): "indexers") return key + # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 1a7a06199ad8a..6b235b5e1bc33 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2479,9 +2479,9 @@ def _has_names(index): else: return index.name is not None + # ----------------------------------------------------------------------------- # Global formatting options - _initial_defencoding = None diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 0b9b337731d7f..00ddf5b0c918d 100644 --- a/pandas/indexes/numeric.py 
+++ b/pandas/indexes/numeric.py @@ -159,6 +159,7 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + Int64Index._add_numeric_methods() Int64Index._add_logical_methods() @@ -238,6 +239,7 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + UInt64Index._add_numeric_methods() UInt64Index._add_logical_methods() @@ -391,5 +393,6 @@ def isin(self, values, level=None): return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) + Float64Index._add_numeric_methods() Float64Index._add_logical_methods_disabled() diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 7a7902b503bd6..cc78361f843bf 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -652,5 +652,6 @@ def _evaluate_numeric_binop(self, other): reversed=True, step=operator.div) + RangeIndex._add_numeric_methods() RangeIndex._add_logical_methods() diff --git a/pandas/io/common.py b/pandas/io/common.py index b24acb256c4a9..74c51b74ca18a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -75,6 +75,7 @@ class ParserError(ValueError): """ pass + # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -116,6 +117,7 @@ def __iter__(self): def __next__(self): raise AbstractMethodError(self) + if not compat.PY3: BaseIterator.next = lambda self: self.__next__() diff --git a/pandas/io/excel.py b/pandas/io/excel.py index f34ba65cf7b51..2821983213646 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -886,12 +886,14 @@ def _convert_to_style(cls, style_dict): return xls_style + register_writer(_Openpyxl1Writer) class _OpenpyxlWriter(_Openpyxl1Writer): engine = 'openpyxl' + register_writer(_OpenpyxlWriter) @@ -1368,6 +1370,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for k, v in style_kwargs.items(): setattr(xcell, k, v) + register_writer(_Openpyxl22Writer) @@ -1491,6 +1494,7 @@ def _convert_to_style(cls, style_dict, num_format_str=None): return style + register_writer(_XlwtWriter) @@ -1603,4 +1607,5 @@ def _convert_to_style(self, style_dict, num_format_str=None): return xl_format + register_writer(_XlsxWriter) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 0ffb6b4bf8c05..a5558866937cf 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -58,6 +58,7 @@ def _test_google_api_imports(): raise ImportError("Missing module required for Google BigQuery " "support: {0}".format(str(e))) + logger = logging.getLogger('pandas.io.gbq') logger.setLevel(logging.ERROR) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index ab44e46c96b77..3f4be6ad459d8 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -217,6 +217,7 @@ def read(fh): raise ValueError('path_or_buf needs to be a string file path or file-like') + dtype_dict = {21: np.dtype('M8[ns]'), u('datetime64[ns]'): np.dtype('M8[ns]'), u('datetime64[us]'): np.dtype('M8[us]'), @@ -237,6 +238,7 @@ def dtype_for(t): return dtype_dict[t] return np.typeDict.get(t, t) + c2f_dict = {'complex': np.float64, 'complex128': np.float64, 'complex64': np.float32} diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8905dfa315c4..88d0c6c12c04f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -409,6 +409,7 @@ def _read(filepath_or_buffer, kwds): return data + _parser_defaults = { 'delimiter': None, @@ -655,6 +656,7 @@ def parser_f(filepath_or_buffer, return parser_f + read_csv = _make_parser_function('read_csv', sep=',') 
read_csv = Appender(_read_csv_doc)(read_csv) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d8de1dcd61977..65ac4e5654dce 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -74,6 +74,7 @@ def _ensure_encoding(encoding): encoding = _default_encoding return encoding + Term = Expr @@ -112,6 +113,7 @@ class ClosedFileError(Exception): class IncompatibilityWarning(Warning): pass + incompatibility_doc = """ where criteria is being ignored as this version [%s] is too old (or not-defined), read the file in and write it out to a new file to upgrade (with @@ -122,6 +124,7 @@ class IncompatibilityWarning(Warning): class AttributeConflictWarning(Warning): pass + attribute_conflict_doc = """ the [%s] attribute of the existing index is [%s] which conflicts with the new [%s], resetting the attribute to None @@ -131,6 +134,7 @@ class AttributeConflictWarning(Warning): class DuplicateWarning(Warning): pass + duplicate_doc = """ duplicate entries in table, taking most recently appended """ diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9fa01c413aca8..55e145b493dd9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -495,6 +495,7 @@ def has_table(table_name, con, flavor=None, schema=None): pandas_sql = pandasSQL_builder(con, flavor=flavor, schema=schema) return pandas_sql.has_table(table_name) + table_exists = has_table diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2be7657883e88..1698ade4c0102 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -459,6 +459,7 @@ class PossiblePrecisionLoss(Warning): class ValueLabelTypeMismatch(Warning): pass + value_label_mismatch_doc = """ Stata value labels (pandas categories) must be strings. Column {0} contains non-string labels which will be converted to strings. Please check that the diff --git a/pandas/msgpack/__init__.py b/pandas/msgpack/__init__.py index 33d60a12ef0a3..4d6e241171281 100644 --- a/pandas/msgpack/__init__.py +++ b/pandas/msgpack/__init__.py @@ -41,6 +41,7 @@ def packb(o, **kwargs): """ return Packer(**kwargs).pack(o) + # alias for compatibility to simplejson/marshal/pickle. 
load = unpack loads = unpackb diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 1fc93a967bdbb..61b8434b0ea09 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -863,6 +863,7 @@ def homogenize(series_dict): return output + # use unaccelerated ops for sparse objects ops.add_flex_arithmetic_methods(SparseDataFrame, use_numexpr=False, **ops.frame_flex_funcs) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 2d3a9effe6939..dfdbb3c89814a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -832,6 +832,7 @@ def from_coo(cls, A, dense_index=False): """ return _coo_to_sparse_series(A, dense_index=dense_index) + # overwrite series methods with unaccelerated versions ops.add_special_arithmetic_methods(SparseSeries, use_numexpr=False, **ops.series_special_funcs) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 95b209aee0b0c..914c4c08863a2 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -385,6 +385,7 @@ def ewmstd(arg, com=None, span=None, halflife=None, alpha=None, min_periods=0, bias=bias, func_kw=['bias']) + ewmvol = ewmstd @@ -476,6 +477,7 @@ def f(arg, window, min_periods=None, freq=None, center=False, **kwargs) return f + rolling_max = _rolling_func('max', 'Moving maximum.', how='max') rolling_min = _rolling_func('min', 'Moving minimum.', how='min') rolling_sum = _rolling_func('sum', 'Moving sum.') @@ -683,6 +685,7 @@ def f(arg, min_periods=1, freq=None, **kwargs): **kwargs) return f + expanding_max = _expanding_func('max', 'Expanding maximum.') expanding_min = _expanding_func('min', 'Expanding minimum.') expanding_sum = _expanding_func('sum', 'Expanding sum.') diff --git a/pandas/tests/sparse/test_libsparse.py b/pandas/tests/sparse/test_libsparse.py index 4d5a93d77cf14..0435b732911da 100644 --- a/pandas/tests/sparse/test_libsparse.py +++ b/pandas/tests/sparse/test_libsparse.py @@ -560,8 +560,8 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): check_cases(_check_case) -# too cute? oh but how I abhor code duplication +# too cute? 
oh but how I abhor code duplication check_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 28f1dc61533c1..b087ca21d3c25 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1588,6 +1588,7 @@ def test_to_xarray(self): # non-convertible self.assertRaises(ValueError, lambda: result.to_pandas()) + # run all the tests, but wrap each in a warning catcher for t in ['test_rename', 'test_rename_axis', 'test_get_numeric_data', 'test_get_default', 'test_nonzero', diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index e82e702cb6e55..ba53d42fccec7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -53,6 +53,7 @@ def wrapper(*args, **kwargs): return pd.concat(*args, **kwargs) return wrapper + concat = concat_wrap() @@ -66,6 +67,8 @@ def merge(left, right, how='inner', on=None, left_on=None, right_on=None, right_index=right_index, sort=sort, suffixes=suffixes, copy=copy, indicator=indicator) return op.get_result() + + if __debug__: merge.__doc__ = _merge_doc % '\nleft : DataFrame' @@ -264,6 +267,7 @@ def _merger(x, y): result = _merger(left, right) return result + ordered_merge.__doc__ = merge_ordered.__doc__ @@ -1334,6 +1338,7 @@ def _right_outer_join(x, y, max_groups): right_indexer, left_indexer = _join.left_outer_join(y, x, max_groups) return left_indexer, right_indexer + _join_functions = { 'inner': _join.inner_join, 'left': _join.left_outer_join, diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 0b1ced97d2b81..b2050d7d8d81e 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -149,6 +149,7 @@ def _mpl_ge_2_0_0(): except ImportError: return False + if _mpl_ge_1_5_0(): # Compat with mp 1.5, which uses cycler. 
import cycler diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e0c602bf5a037..957a934d13f09 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -660,6 +660,7 @@ def get_standard_freq(freq): warnings.warn(msg, FutureWarning, stacklevel=2) return to_offset(freq).rule_code + # --------------------------------------------------------------------- # Period codes @@ -795,6 +796,7 @@ def infer_freq(index, warn=True): inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() + _ONE_MICRO = long(1000) _ONE_MILLI = _ONE_MICRO * 1000 _ONE_SECOND = _ONE_MILLI * 1000 diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 31e40c6bcbb2c..d3d936693c266 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -286,6 +286,7 @@ def _apply_rule(self, dates): dates += offset return dates + holiday_calendars = {} @@ -461,6 +462,7 @@ def merge(self, other, inplace=False): else: return holidays + USMemorialDay = Holiday('MemorialDay', month=5, day=31, offset=DateOffset(weekday=MO(-1))) USLaborDay = Holiday('Labor Day', month=9, day=1, diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 6cbb696783e09..5f00e8b648689 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -138,6 +138,7 @@ def _ensure_datetime64(other): return other raise TypeError('%s type object %s' % (type(other), str(other))) + _midnight = time(0, 0) diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py index 6698c7e924758..22801318a1853 100644 --- a/pandas/tseries/interval.py +++ b/pandas/tseries/interval.py @@ -33,6 +33,3 @@ def __new__(self, starts, ends): def dtype(self): return self.values.dtype - -if __name__ == '__main__': - pass diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 370dd00762896..79227f6de90a5 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1652,6 +1652,7 @@ class WeekDay(object): SAT = 5 SUN = 6 + _int_to_weekday = { WeekDay.MON: 'MON', WeekDay.TUE: 'TUE', @@ -1924,6 +1925,7 @@ def onOffset(self, dt): modMonth = (dt.month - self.startingMonth) % 3 return BMonthEnd().onOffset(dt) and modMonth == 0 + _int_to_month = tslib._MONTH_ALIASES _month_to_int = dict((v, k) for k, v in _int_to_month.items()) @@ -2799,6 +2801,7 @@ def _delta_to_tick(delta): else: # pragma: no cover return Nano(nanos) + _delta_to_nanoseconds = tslib._delta_to_nanoseconds @@ -2931,6 +2934,7 @@ def generate_range(start=None, end=None, periods=None, raise ValueError('Offset %s did not decrement date' % offset) cur = next_date + prefix_mapping = dict((offset._prefix, offset) for offset in [ YearBegin, # 'AS' YearEnd, # 'A' diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 5692d6c5cabde..a6a10c08966d6 100755 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -552,6 +552,8 @@ def var(self, ddof=1, *args, **kwargs): """ nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) + + Resampler._deprecated_valids += dir(Resampler) # downsample methods @@ -969,6 +971,8 @@ def resample(obj, kind=None, **kwds): """ create a TimeGrouper and return our resampler """ tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) + + resample.__doc__ = Resampler.__doc__ diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 9bf39652a4e00..5a5d1533bfa91 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -87,6 +87,7 @@ def to_timedelta(arg, 
unit='ns', box=True, errors='raise'): return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors) + _unit_map = { 'Y': 'Y', 'y': 'Y', diff --git a/pandas/types/generic.py b/pandas/types/generic.py index 756fb47596700..e7b54ccc6f25e 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -57,4 +57,5 @@ class _ABCGeneric(type): def __instancecheck__(cls, inst): return hasattr(inst, "_data") + ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index 7c5148caf7e74..b0f5d3994ed64 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -153,5 +153,6 @@ def main(): return 0 + if __name__ == "__main__": sys.exit(main()) diff --git a/pandas/util/terminal.py b/pandas/util/terminal.py index 6b8428ff75806..dadd09ae74ea4 100644 --- a/pandas/util/terminal.py +++ b/pandas/util/terminal.py @@ -115,6 +115,7 @@ def ioctl_GWINSZ(fd): return None return int(cr[1]), int(cr[0]) + if __name__ == "__main__": sizex, sizey = get_terminal_size() print('width = %s height = %s' % (sizex, sizey)) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 566ceec027b2b..cda386781e2ec 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -74,6 +74,7 @@ def reset_testing_mode(): if 'deprecate' in testing_mode: warnings.simplefilter('ignore', _testing_mode_warnings) + set_testing_mode() @@ -1381,6 +1382,7 @@ def assert_panelnd_equal(left, right, for i, item in enumerate(right._get_axis(0)): assert item in left, "non-matching item (left) '%s'" % item + # TODO: strangely check_names fails in py3 ? _panel_frame_equal = partial(assert_frame_equal, check_names=False) assert_panel_equal = partial(assert_panelnd_equal, @@ -2076,6 +2078,7 @@ def dec(f): return wrapper + # skip tests on exceptions with this message _network_error_messages = ( # 'urlopen error timed out', From 86ef3ca3ff7c836c5b7c01eb918201ec7c44c000 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 13:00:36 -0500 Subject: [PATCH 28/52] DOC: use shared_docs for Index.get_indexer, get_indexer_non_unique (#15411) * STYLE: flake8 upgraded to 3.3 on conda fixes for E305, 2 blank lines after a class definition * DOC: use shared_docs for Index.get_indexer, get_indexer_non_unique fix non-populated doc-strings for some methods in Index (take) --- pandas/indexes/base.py | 41 +++++++++++++++++++++++++++++--------- pandas/indexes/category.py | 40 +++++++------------------------------ pandas/indexes/multi.py | 40 ++++++++++--------------------------- pandas/tseries/period.py | 5 +++++ 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index c483fb0764a4c..e51824e72a2a0 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -65,6 +65,7 @@ _unsortable_types = frozenset(('mixed', 'mixed-integer')) _index_doc_kwargs = dict(klass='Index', inplace='', + target_klass='Index', unique='Index', duplicated='np.ndarray') _index_shared_docs = dict() @@ -1605,7 +1606,7 @@ def _append_same_dtype(self, to_concat, name): numpy.ndarray.take """ - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) @@ -2350,15 +2351,14 @@ def get_level_values(self, level): self._validate_index_level(level) return self - def get_indexer(self, target, method=None, limit=None, tolerance=None): - """ + 
_index_shared_docs['get_indexer'] = """
        Compute indexer and mask for new index given the current index. The
        indexer should be then used as an input to ndarray.take to align the
        current data to the new index.

        Parameters
        ----------
        target : %(target_klass)s
        method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
            * default: exact matches only.
            * pad / ffill: find the PREVIOUS index value if no exact match.
            * backfill / bfill: use NEXT index value if no exact match
            * nearest: use the NEAREST index value if no exact match. Tied
              distances are broken by preferring the larger index value.
@@ -2387,6 +2387,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
            positions matches the corresponding target values. Missing values
            in the target are marked by -1.
        """
+
+    @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+    def get_indexer(self, target, method=None, limit=None, tolerance=None):
         method = missing.clean_reindex_fill_method(method)
         target = _ensure_index(target)
         if tolerance is not None:
@@ -2496,11 +2499,28 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance):
         indexer = np.where(distance <= tolerance, indexer, -1)
         return indexer
 
+    _index_shared_docs['get_indexer_non_unique'] = """
+        Compute indexer and mask for new index given the current index. The
+        indexer should be then used as an input to ndarray.take to align the
+        current data to the new index.
+
+        Parameters
+        ----------
+        target : %(target_klass)s
+
+        Returns
+        -------
+        indexer : ndarray of int
+            Integers from 0 to n - 1 indicating that the index at these
+            positions matches the corresponding target values. Missing values
+            in the target are marked by -1.
+        missing : ndarray of int
+            An indexer into the target of the values not found.
+            These correspond to the -1 in the indexer array.
+        """
+
+    @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
     def get_indexer_non_unique(self, target):
-        """ return an indexer suitable for taking from a non unique index
-            return the labels in the same order as the target, and
-            return a missing indexer into the target (missing are marked as -1
-            in the indexer); target must be an iterable """
         target = _ensure_index(target)
         pself, ptarget = self._possibly_promote(target)
         if pself is not self or ptarget is not target:
@@ -2516,7 +2536,10 @@ def get_indexer_non_unique(self, target):
         return Index(indexer), missing
 
     def get_indexer_for(self, target, **kwargs):
-        """ guaranteed return of an indexer even when non-unique """
+        """
+        guaranteed return of an indexer even when non-unique
+        This dispatches to get_indexer or get_indexer_non_unique as appropriate.
+        """
         if self.is_unique:
             return self.get_indexer(target, **kwargs)
         indexer, _ = self.get_indexer_non_unique(target, **kwargs)
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
index e2e0fd056b111..acb2758641a62 100644
--- a/pandas/indexes/category.py
+++ b/pandas/indexes/category.py
@@ -18,6 +18,8 @@
 import pandas.core.base as base
 import pandas.core.missing as missing
 import pandas.indexes.base as ibase
+_index_doc_kwargs = dict(ibase._index_doc_kwargs)
+_index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
 
 
 class CategoricalIndex(Index, base.PandasDelegate):
@@ -289,7 +291,7 @@ def _engine(self):
     def is_unique(self):
         return not self.duplicated().any()
 
-    @Appender(base._shared_docs['unique'] % ibase._index_doc_kwargs)
+    @Appender(base._shared_docs['unique'] % _index_doc_kwargs)
     def unique(self):
         result = base.IndexOpsMixin.unique(self)
         # CategoricalIndex._shallow_copy uses keeps original categories
@@ -299,7 +301,7 @@ def unique(self):
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
False: 'first'}) - @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 codes = self.codes.astype('i8') @@ -425,34 +427,8 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): - """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. The mask determines whether labels are - found or not in the current index - - Parameters - ---------- - target : MultiIndex or Index (of tuples) - method : {'pad', 'ffill', 'backfill', 'bfill'} - pad / ffill: propagate LAST valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - - Notes - ----- - This is a low-level method and probably should be used at your own risk - - Examples - -------- - >>> indexer, mask = index.get_indexer(new_index) - >>> new_values = cur_values.take(indexer) - >>> new_values[-mask] = np.nan - - Returns - ------- - (indexer, mask) : (ndarray, ndarray) - """ method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) @@ -472,10 +448,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _ensure_platform_int(indexer) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): - """ this is the same for a CategoricalIndex for get_indexer; the API - returns the missing values as well - """ target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): @@ -497,7 +471,7 @@ def _convert_list_indexer(self, keyarr, kind=None): return None - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 57739548a17d6..18e1da7303d6d 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -43,6 +43,10 @@ _get_na_value, InvalidIndexError, _index_shared_docs) import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(klass='MultiIndex', + target_klass='MultiIndex or list of tuples')) class MultiIndex(Index): @@ -755,7 +759,7 @@ def f(k, stringify): @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) - @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.core.sorting import get_group_index from pandas.hashtable import duplicated_int64 @@ -1244,7 +1248,7 @@ def __getitem__(self, key): names=self.names, sortorder=sortorder, verify_integrity=False) - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) @@ -1564,34 +1568,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): return new_index, indexer + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, 
tolerance=None): - """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. The mask determines whether labels are - found or not in the current index - - Parameters - ---------- - target : MultiIndex or Index (of tuples) - method : {'pad', 'ffill', 'backfill', 'bfill'} - pad / ffill: propagate LAST valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - - Notes - ----- - This is a low-level method and probably should be used at your own risk - - Examples - -------- - >>> indexer, mask = index.get_indexer(new_index) - >>> new_values = cur_values.take(indexer) - >>> new_values[-mask] = np.nan - - Returns - ------- - (indexer, mask) : (ndarray, ndarray) - """ method = missing.clean_reindex_fill_method(method) target = _ensure_index(target) @@ -1633,6 +1611,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _ensure_platform_int(indexer) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + return super(MultiIndex, self).get_indexer_non_unique(target) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 98151d5b6130c..8a6b0c153bb50 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -44,6 +44,10 @@ from pandas.lib import infer_dtype import pandas.tslib as tslib from pandas.compat import zip, u +import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(target_klass='PeriodIndex or list of Periods')) def _field_accessor(name, alias, docstring=None): @@ -759,6 +763,7 @@ def get_value(self, series, key): return com._maybe_box(self, self._engine.get_value(s, key), series, key) + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = _ensure_index(target) From d6f8b460325fd79faa90858e2743878a7cc74dec Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Feb 2017 15:20:52 -0500 Subject: [PATCH 29/52] BLD: use latest conda version with latest miniconda installer on appveyor change 3.6 build to use numpy=1.12 & add back xlwt (was not on defaults for a while) Author: Jeff Reback Closes #15415 from jreback/appveyor and squashes the following commits: 2019f37 [Jeff Reback] force numpy version f82877b [Jeff Reback] remove extra conda list 3ace9f2 [Jeff Reback] CI: use numpy=1.12 on appveyor 6855a7b [Jeff Reback] BLD: use latest conda version with latest miniconda installer on appveyor --- appveyor.yml | 15 ++++++--------- ci/requirements-3.5-64.run | 2 +- ci/requirements-3.6-64.run | 4 ++-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 42c3be13af809..d96e1dfcf76de 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,19 +18,19 @@ environment: matrix: - - CONDA_ROOT: "C:\\Miniconda3.5_64" + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" - CONDA_NPY: "111" + CONDA_NPY: "112" - - CONDA_ROOT: "C:\\Miniconda3.5_64" + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" CONDA_PY: "27" CONDA_NPY: "110" - - CONDA_ROOT: "C:\\Miniconda3.5_64" + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "3.5" PYTHON_ARCH: "64" CONDA_PY: "35" @@ -66,8 +66,7 @@ install: # 
install our build environment - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false - #- cmd: conda update -q conda - - cmd: conda install conda=4.2.15 + - cmd: conda update -q conda - cmd: conda config --set ssl_verify false # add the pandas channel *before* defaults to have defaults take priority @@ -83,7 +82,7 @@ install: - cmd: '%CMD_IN_ENV% conda build ci\appveyor.recipe -q' # create our env - - cmd: conda create -q -n pandas python=%PYTHON_VERSION% nose pytest + - cmd: conda create -q -n pandas python=%PYTHON_VERSION% pytest - cmd: activate pandas - SET REQ=ci\requirements-%PYTHON_VERSION%-%PYTHON_ARCH%.run - cmd: echo "installing requirements from %REQ%" @@ -95,7 +94,5 @@ install: test_script: # tests - cmd: activate pandas - - cmd: conda list - cmd: cd \ - cmd: python -c "import pandas; pandas.test(['--skip-slow', '--skip-network'])" - diff --git a/ci/requirements-3.5-64.run b/ci/requirements-3.5-64.run index 905c2ff3625bd..ad66f578d702a 100644 --- a/ci/requirements-3.5-64.run +++ b/ci/requirements-3.5-64.run @@ -1,6 +1,6 @@ python-dateutil pytz -numpy +numpy=1.11* openpyxl xlsxwriter xlrd diff --git a/ci/requirements-3.6-64.run b/ci/requirements-3.6-64.run index 58ba103504b2c..840d2867e9297 100644 --- a/ci/requirements-3.6-64.run +++ b/ci/requirements-3.6-64.run @@ -1,10 +1,10 @@ python-dateutil pytz -numpy +numpy=1.12* openpyxl xlsxwriter xlrd -#xlwt +xlwt scipy feather-format numexpr From f2246cfa215d01b68aebd2da4afb836d912d248d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Feb 2017 09:12:16 -0500 Subject: [PATCH 30/52] TST: convert yield based test_pickle.py to parametrized to remove warnings xref #15341 Author: Jeff Reback Closes #15416 from jreback/warn and squashes the following commits: a6af576 [Jeff Reback] TST: convert yield based test_pickle.py to parametrized to remove warnings xref #15341 --- pandas/tests/io/test_pickle.py | 535 +++++++++++++++++---------------- 1 file changed, 277 insertions(+), 258 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 5445c506b050c..1e3816c1556f6 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -1,6 +1,17 @@ # pylint: disable=E1101,E1103,W0232 -""" manage legacy pickle tests """ +""" +manage legacy pickle tests + +How to add pickle tests: + +1. Install pandas version intended to output the pickle. + +2. Execute "generate_legacy_storage_files.py" to create the pickle. +$ python generate_legacy_storage_files.py pickle + +3. Move the created pickle to "data/legacy_pickle/" directory. 
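+
+(The compare() helpers below read each stored pickle back with the
+installed pandas and check the result against freshly generated data.)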
+""" import pytest import os @@ -9,277 +20,285 @@ import pandas as pd from pandas import Index -from pandas.compat import u, is_platform_little_endian +from pandas.compat import is_platform_little_endian import pandas import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd -class TestPickle(): - """ - How to add pickle tests: +@pytest.fixture(scope='module') +def current_pickle_data(): + # our current version pickle data + from pandas.tests.io.generate_legacy_storage_files import ( + create_pickle_data) + return create_pickle_data() + + +# --------------------- +# comparision functions +# --------------------- +def compare_element(result, expected, typ, version=None): + if isinstance(expected, Index): + tm.assert_index_equal(expected, result) + return + + if typ.startswith('sp_'): + comparator = getattr(tm, "assert_%s_equal" % typ) + comparator(result, expected, exact_indices=False) + elif typ == 'timestamp': + if expected is pd.NaT: + assert result is pd.NaT + else: + tm.assert_equal(result, expected) + tm.assert_equal(result.freq, expected.freq) + else: + comparator = getattr(tm, "assert_%s_equal" % + typ, tm.assert_almost_equal) + comparator(result, expected) + + +def compare(data, vf, version): + + # py3 compat when reading py2 pickle + try: + data = pandas.read_pickle(vf) + except (ValueError) as e: + if 'unsupported pickle protocol:' in str(e): + # trying to read a py3 pickle in py2 + return + else: + raise + + m = globals() + for typ, dv in data.items(): + for dt, result in dv.items(): + try: + expected = data[typ][dt] + except (KeyError): + if version in ('0.10.1', '0.11.0') and dt == 'reg': + break + else: + raise + + # use a specific comparator + # if available + comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) + + comparator = m.get(comparator, m['compare_element']) + comparator(result, expected, typ, version) + return data + + +def compare_sp_series_ts(res, exp, typ, version): + # SparseTimeSeries integrated into SparseSeries in 0.12.0 + # and deprecated in 0.17.0 + if version and LooseVersion(version) <= "0.12.0": + tm.assert_sp_series_equal(res, exp, check_series_type=False) + else: + tm.assert_sp_series_equal(res, exp) + + +def compare_series_ts(result, expected, typ, version): + # GH 7748 + tm.assert_series_equal(result, expected) + tm.assert_equal(result.index.freq, expected.index.freq) + tm.assert_equal(result.index.freq.normalize, False) + tm.assert_series_equal(result > 0, expected > 0) + + # GH 9291 + freq = result.index.freq + tm.assert_equal(freq + Day(1), Day(2)) + + res = freq + pandas.Timedelta(hours=1) + tm.assert_equal(isinstance(res, pandas.Timedelta), True) + tm.assert_equal(res, pandas.Timedelta(days=1, hours=1)) + + res = freq + pandas.Timedelta(nanoseconds=1) + tm.assert_equal(isinstance(res, pandas.Timedelta), True) + tm.assert_equal(res, pandas.Timedelta(days=1, nanoseconds=1)) + + +def compare_series_dt_tz(result, expected, typ, version): + # 8260 + # dtype is object < 0.17.0 + if LooseVersion(version) < '0.17.0': + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + else: + tm.assert_series_equal(result, expected) - 1. Install pandas version intended to output the pickle. - 2. Execute "generate_legacy_storage_files.py" to create the pickle. 
- $ python generate_legacy_storage_files.py pickle +def compare_series_cat(result, expected, typ, version): + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_series_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': + tm.assert_series_equal(result, expected, check_categorical=False) + else: + tm.assert_series_equal(result, expected) - 3. Move the created pickle to "data/legacy_pickle/" directory. - NOTE: TestPickle can't be a subclass of tm.Testcase to use test generator. - http://stackoverflow.com/questions/6689537/ - nose-test-generators-inside-class - """ +def compare_frame_dt_mixed_tzs(result, expected, typ, version): + # 8260 + # dtype is object < 0.17.0 + if LooseVersion(version) < '0.17.0': + expected = expected.astype(object) + tm.assert_frame_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) - @classmethod - def setup_class(cls): - from pandas.tests.io.generate_legacy_storage_files import ( - create_pickle_data) - cls.data = create_pickle_data() - cls.path = u('__%s__.pickle' % tm.rands(10)) - def compare_element(self, result, expected, typ, version=None): - if isinstance(expected, Index): - tm.assert_index_equal(expected, result) - return +def compare_frame_cat_onecol(result, expected, typ, version): + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_frame_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': + tm.assert_frame_equal(result, expected, check_categorical=False) + else: + tm.assert_frame_equal(result, expected) - if typ.startswith('sp_'): - comparator = getattr(tm, "assert_%s_equal" % typ) - comparator(result, expected, exact_indices=False) - elif typ == 'timestamp': - if expected is pd.NaT: - assert result is pd.NaT - else: - tm.assert_equal(result, expected) - tm.assert_equal(result.freq, expected.freq) - else: - comparator = getattr(tm, "assert_%s_equal" % - typ, tm.assert_almost_equal) - comparator(result, expected) - - def compare(self, vf, version): - - # py3 compat when reading py2 pickle - try: - data = pandas.read_pickle(vf) - except (ValueError) as e: - if 'unsupported pickle protocol:' in str(e): - # trying to read a py3 pickle in py2 - return - else: - raise - - for typ, dv in data.items(): - for dt, result in dv.items(): - try: - expected = self.data[typ][dt] - except (KeyError): - if version in ('0.10.1', '0.11.0') and dt == 'reg': - break - else: - raise - - # use a specific comparator - # if available - comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) - comparator = getattr(self, comparator, self.compare_element) - comparator(result, expected, typ, version) - return data - - def compare_sp_series_ts(self, res, exp, typ, version): - # SparseTimeSeries integrated into SparseSeries in 0.12.0 - # and deprecated in 0.17.0 - if version and LooseVersion(version) <= "0.12.0": - tm.assert_sp_series_equal(res, exp, check_series_type=False) - else: - tm.assert_sp_series_equal(res, exp) - def compare_series_ts(self, result, expected, typ, version): - # GH 7748 - tm.assert_series_equal(result, expected) - tm.assert_equal(result.index.freq, expected.index.freq) - tm.assert_equal(result.index.freq.normalize, False) - tm.assert_series_equal(result > 0, expected > 0) - - # GH 9291 - freq = result.index.freq - tm.assert_equal(freq + Day(1), Day(2)) - - res = freq + 
pandas.Timedelta(hours=1) - tm.assert_equal(isinstance(res, pandas.Timedelta), True) - tm.assert_equal(res, pandas.Timedelta(days=1, hours=1)) - - res = freq + pandas.Timedelta(nanoseconds=1) - tm.assert_equal(isinstance(res, pandas.Timedelta), True) - tm.assert_equal(res, pandas.Timedelta(days=1, nanoseconds=1)) - - def compare_series_dt_tz(self, result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < '0.17.0': - expected = expected.astype(object) - tm.assert_series_equal(result, expected) - else: - tm.assert_series_equal(result, expected) - - def compare_series_cat(self, result, expected, typ, version): - # Categorical dtype is added in 0.15.0 - # ordered is changed in 0.16.0 - if LooseVersion(version) < '0.15.0': - tm.assert_series_equal(result, expected, check_dtype=False, - check_categorical=False) - elif LooseVersion(version) < '0.16.0': - tm.assert_series_equal(result, expected, check_categorical=False) - else: - tm.assert_series_equal(result, expected) - - def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < '0.17.0': - expected = expected.astype(object) - tm.assert_frame_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) - - def compare_frame_cat_onecol(self, result, expected, typ, version): - # Categorical dtype is added in 0.15.0 - # ordered is changed in 0.16.0 - if LooseVersion(version) < '0.15.0': - tm.assert_frame_equal(result, expected, check_dtype=False, - check_categorical=False) - elif LooseVersion(version) < '0.16.0': - tm.assert_frame_equal(result, expected, check_categorical=False) - else: - tm.assert_frame_equal(result, expected) - - def compare_frame_cat_and_float(self, result, expected, typ, version): - self.compare_frame_cat_onecol(result, expected, typ, version) - - def compare_index_period(self, result, expected, typ, version): - tm.assert_index_equal(result, expected) - tm.assertIsInstance(result.freq, MonthEnd) - tm.assert_equal(result.freq, MonthEnd()) - tm.assert_equal(result.freqstr, 'M') - tm.assert_index_equal(result.shift(2), expected.shift(2)) - - def compare_sp_frame_float(self, result, expected, typ, version): - if LooseVersion(version) <= '0.18.1': - tm.assert_sp_frame_equal(result, expected, exact_indices=False, - check_dtype=False) - else: - tm.assert_sp_frame_equal(result, expected) - - def read_pickles(self, version): - if not is_platform_little_endian(): - pytest.skip("known failure on non-little endian") - - pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version))) - n = 0 - for f in os.listdir(pth): - vf = os.path.join(pth, f) - data = self.compare(vf, version) - - if data is None: - continue - n += 1 - assert n > 0, 'Pickle files are not tested' - - def test_pickles(self): - pickle_path = tm.get_data_path('legacy_pickle') - n = 0 - for v in os.listdir(pickle_path): - pth = os.path.join(pickle_path, v) - if os.path.isdir(pth): - yield self.read_pickles, v - n += 1 - assert n > 0, 'Pickle files are not tested' - - def test_round_trip_current(self): - - try: - import cPickle as c_pickle - - def c_pickler(obj, path): - with open(path, 'wb') as fh: - c_pickle.dump(obj, fh, protocol=-1) - - def c_unpickler(path): - with open(path, 'rb') as fh: - fh.seek(0) - return c_pickle.load(fh) - except: - c_pickler = None - c_unpickler = None - - import pickle as python_pickle - - def python_pickler(obj, path): +def compare_frame_cat_and_float(result, expected, typ, version): + 
compare_frame_cat_onecol(result, expected, typ, version) + + +def compare_index_period(result, expected, typ, version): + tm.assert_index_equal(result, expected) + tm.assertIsInstance(result.freq, MonthEnd) + tm.assert_equal(result.freq, MonthEnd()) + tm.assert_equal(result.freqstr, 'M') + tm.assert_index_equal(result.shift(2), expected.shift(2)) + + +def compare_sp_frame_float(result, expected, typ, version): + if LooseVersion(version) <= '0.18.1': + tm.assert_sp_frame_equal(result, expected, exact_indices=False, + check_dtype=False) + else: + tm.assert_sp_frame_equal(result, expected) + + +# --------------------- +# tests +# --------------------- +def legacy_pickle_versions(): + # yield the pickle versions + pickle_path = tm.get_data_path('legacy_pickle') + for v in os.listdir(pickle_path): + pth = os.path.join(pickle_path, v) + if os.path.isdir(pth): + yield v + + +@pytest.mark.parametrize('version', legacy_pickle_versions()) +def test_pickles(current_pickle_data, version): + if not is_platform_little_endian(): + pytest.skip("known failure on non-little endian") + + pth = tm.get_data_path('legacy_pickle/{0}'.format(version)) + n = 0 + for f in os.listdir(pth): + vf = os.path.join(pth, f) + data = compare(current_pickle_data, vf, version) + + if data is None: + continue + n += 1 + assert n > 0, 'Pickle files are not tested' + + +def test_round_trip_current(current_pickle_data): + + try: + import cPickle as c_pickle + + def c_pickler(obj, path): with open(path, 'wb') as fh: - python_pickle.dump(obj, fh, protocol=-1) + c_pickle.dump(obj, fh, protocol=-1) - def python_unpickler(path): + def c_unpickler(path): with open(path, 'rb') as fh: fh.seek(0) - return python_pickle.load(fh) - - for typ, dv in self.data.items(): - for dt, expected in dv.items(): - - for writer in [pd.to_pickle, c_pickler, python_pickler]: - if writer is None: - continue - - with tm.ensure_clean(self.path) as path: - - # test writing with each pickler - writer(expected, path) - - # test reading with each unpickler - result = pd.read_pickle(path) - self.compare_element(result, expected, typ) - - if c_unpickler is not None: - result = c_unpickler(path) - self.compare_element(result, expected, typ) - - result = python_unpickler(path) - self.compare_element(result, expected, typ) - - def test_pickle_v0_14_1(self): - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') - # This code was executed once on v0.14.1 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - - def test_pickle_v0_15_2(self): - # ordered -> _ordered - # GH 9347 - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') - # This code was executed once on v0.15.2 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + return 
c_pickle.load(fh) + except: + c_pickler = None + c_unpickler = None + + import pickle as python_pickle + + def python_pickler(obj, path): + with open(path, 'wb') as fh: + python_pickle.dump(obj, fh, protocol=-1) + + def python_unpickler(path): + with open(path, 'rb') as fh: + fh.seek(0) + return python_pickle.load(fh) + + data = current_pickle_data + for typ, dv in data.items(): + for dt, expected in dv.items(): + + for writer in [pd.to_pickle, c_pickler, python_pickler]: + if writer is None: + continue + + with tm.ensure_clean() as path: + + # test writing with each pickler + writer(expected, path) + + # test reading with each unpickler + result = pd.read_pickle(path) + compare_element(result, expected, typ) + + if c_unpickler is not None: + result = c_unpickler(path) + compare_element(result, expected, typ) + + result = python_unpickler(path) + compare_element(result, expected, typ) + + +def test_pickle_v0_14_1(): + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + +def test_pickle_v0_15_2(): + # ordered -> _ordered + # GH 9347 + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) From ddb22f578b7c7147fd8bcd9fb7c8504a8053e313 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Thu, 16 Feb 2017 09:13:42 -0500 Subject: [PATCH 31/52] TST: Parametrize simple yield tests xref #15341 Author: Elliott Sales de Andrade Closes #15406 from QuLogic/pytest-simple-yield and squashes the following commits: b002752 [Elliott Sales de Andrade] TST: Set PYTHONHASHSEED so xdist doesn't break. 8368772 [Elliott Sales de Andrade] TST: Use fixtures for engine/parser where possible. c6cd346 [Elliott Sales de Andrade] TST: Parametrize remaining simple yield tests. 47bf1a1 [Elliott Sales de Andrade] TST: Replace ENGINES_PARSERS by parametrize. 
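For readers new to the pattern, a minimal before/after sketch of the
conversion this patch applies throughout (hypothetical tests, not taken
from the pandas suite):

    # before: nose-style yield test, one generator emitting all cases
    def check_square_nonnegative(x):
        assert x * x >= 0

    def test_squares():
        for x in [-2, 0, 3]:
            yield check_square_nonnegative, x

    # after: pytest parametrization, each case collected as its own test
    import pytest

    @pytest.mark.parametrize('x', [-2, 0, 3])
    def test_square_nonnegative(x):
        assert x * x >= 0

Parametrized cases are fixed at collection time, which is also why the
patch pins PYTHONHASHSEED: pytest-xdist workers must all collect tests
in the same order.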
---
 ci/script_multi.sh | 6 +
 pandas/tests/computation/test_compat.py | 11 +-
 pandas/tests/computation/test_eval.py | 233 ++++++------------------
 pandas/tests/io/parser/test_network.py | 26 +--
 pandas/util/testing.py | 15 +-
 5 files changed, 92 insertions(+), 199 deletions(-)

diff --git a/ci/script_multi.sh b/ci/script_multi.sh
index f5fbcbbc12f83..41f71fd21f63f 100755
--- a/ci/script_multi.sh
+++ b/ci/script_multi.sh
@@ -17,6 +17,12 @@ if [ -n "$LOCALE_OVERRIDE" ]; then
     python -c "$pycmd"
 fi
 
+# Workaround for pytest-xdist flaky collection order
+# https://github.com/pytest-dev/pytest/issues/920
+# https://github.com/pytest-dev/pytest/issues/1075
+export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
+echo PYTHONHASHSEED=$PYTHONHASHSEED
+
 if [ "$BUILD_TEST" ]; then
     echo "We are not running pytest as this is simply a build test."
 elif [ "$COVERAGE" ]; then
diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py
index 77994ac6d2f53..59bdde83aedd8 100644
--- a/pandas/tests/computation/test_compat.py
+++ b/pandas/tests/computation/test_compat.py
@@ -12,8 +12,6 @@ import pandas.computation.expr as expr
 from pandas.computation import _MIN_NUMEXPR_VERSION
 
-ENGINES_PARSERS = list(product(_engines, expr._parsers))
-
 
 def test_compat():
     # test we have compat with our version of nu
@@ -30,12 +28,9 @@ def test_compat():
         pytest.skip("not testing numexpr version compat")
 
 
-def test_invalid_numexpr_version():
-    for engine, parser in ENGINES_PARSERS:
-        yield check_invalid_numexpr_version, engine, parser
-
-
-def check_invalid_numexpr_version(engine, parser):
+@pytest.mark.parametrize('engine', _engines)
+@pytest.mark.parametrize('parser', expr._parsers)
+def test_invalid_numexpr_version(engine, parser):
     def testit():
         a, b = 1, 2
         res = pd.eval('a + b', engine=engine, parser=parser)
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index ada714c8ac52e..b42f79fe5009b 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -20,6 +20,7 @@
 from pandas.computation import pytables
 from pandas.computation.engines import _engines, NumExprClobberingError
 from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor
+from pandas.computation.expressions import _USE_NUMEXPR, _NUMEXPR_INSTALLED
 from pandas.computation.ops import (_binary_ops_dict,
                                     _special_case_arith_ops_syms,
                                     _arith_ops_syms, _bool_ops_syms,
@@ -38,6 +39,23 @@
 _scalar_skip = 'in', 'not in'
 
 
+@pytest.fixture(params=(
+    pytest.mark.skipif(engine == 'numexpr' and not _USE_NUMEXPR,
+                       reason='numexpr enabled->{enabled}, '
+                              'installed->{installed}'.format(
+                                  enabled=_USE_NUMEXPR,
+                                  installed=_NUMEXPR_INSTALLED))(engine)
+    for engine in _engines
+))
+def engine(request):
+    return request.param
+
+
+@pytest.fixture(params=expr._parsers)
+def parser(request):
+    return request.param
+
+
 def engine_has_neg_frac(engine):
     return _engines[engine].has_neg_frac
 
@@ -774,17 +792,17 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
 
 f = lambda *args, **kwargs: np.random.randn()
 
-ENGINES_PARSERS = list(product(_engines, expr._parsers))
-
 
 #-------------------------------------
 # typecasting rules consistency with python
 # issue #12388
 
 class TestTypeCasting(object):
-
-    def check_binop_typecasting(self, engine, parser, op, dt):
-        tm.skip_if_no_ne(engine)
+    @pytest.mark.parametrize('op', ['+', '-', '*', '**', '/'])
+    # maybe someday...
numexpr has too many upcasting rules now + # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) + @pytest.mark.parametrize('dt', [np.float32, np.float64]) + def test_binop_typecasting(self, engine, parser, op, dt): df = mkdf(5, 3, data_gen_f=f, dtype=dt) s = 'df {} 3'.format(op) res = pd.eval(s, engine=engine, parser=parser) @@ -798,15 +816,6 @@ def check_binop_typecasting(self, engine, parser, op, dt): assert res.values.dtype == dt assert_frame_equal(res, eval(s)) - def test_binop_typecasting(self): - for engine, parser in ENGINES_PARSERS: - for op in ['+', '-', '*', '**', '/']: - # maybe someday... numexpr has too many upcasting rules now - # for dt in chain(*(np.sctypes[x] for x in ['uint', 'int', - # 'float'])): - for dt in [np.float32, np.float64]: - yield self.check_binop_typecasting, engine, parser, op, dt - #------------------------------------- # basic and complex alignment @@ -826,19 +835,13 @@ class TestAlignment(object): index_types = 'i', 'u', 'dt' lhs_index_types = index_types + ('s',) # 'p' - def check_align_nested_unary_op(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_align_nested_unary_op(self, engine, parser): s = 'df * ~2' df = mkdf(5, 3, data_gen_f=f) res = pd.eval(s, engine=engine, parser=parser) assert_frame_equal(res, df * ~2) - def test_align_nested_unary_op(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_align_nested_unary_op, engine, parser - - def check_basic_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_basic_frame_alignment(self, engine, parser): args = product(self.lhs_index_types, self.index_types, self.index_types) with warnings.catch_warnings(record=True): @@ -856,12 +859,7 @@ def check_basic_frame_alignment(self, engine, parser): res = pd.eval('df + df2', engine=engine, parser=parser) assert_frame_equal(res, df + df2) - def test_basic_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_basic_frame_alignment, engine, parser - - def check_frame_comparison(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_frame_comparison(self, engine, parser): args = product(self.lhs_index_types, repeat=2) for r_idx_type, c_idx_type in args: df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, @@ -874,12 +872,8 @@ def check_frame_comparison(self, engine, parser): res = pd.eval('df < df3', engine=engine, parser=parser) assert_frame_equal(res, df < df3) - def test_frame_comparison(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_frame_comparison, engine, parser - - def check_medium_complex_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) + @slow + def test_medium_complex_frame_alignment(self, engine, parser): args = product(self.lhs_index_types, self.index_types, self.index_types, self.index_types) @@ -899,14 +893,7 @@ def check_medium_complex_frame_alignment(self, engine, parser): engine=engine, parser=parser) assert_frame_equal(res, df + df2 + df3) - @slow - def test_medium_complex_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_medium_complex_frame_alignment, engine, parser - - def check_basic_frame_series_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) - + def test_basic_frame_series_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) @@ -932,13 +919,7 @@ def testit(r_idx_type, c_idx_type, index_name): for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, 
c_idx_type, index_name) - def test_basic_frame_series_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_basic_frame_series_alignment, engine, parser - - def check_basic_series_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) - + def test_basic_series_frame_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) @@ -968,12 +949,7 @@ def testit(r_idx_type, c_idx_type, index_name): for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, c_idx_type, index_name) - def test_basic_series_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_basic_series_frame_alignment, engine, parser - - def check_series_frame_commutativity(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_series_frame_commutativity(self, engine, parser): args = product(self.lhs_index_types, self.index_types, ('+', '*'), ('index', 'columns')) @@ -1000,13 +976,8 @@ def check_series_frame_commutativity(self, engine, parser): if engine == 'numexpr': assert_frame_equal(a, b) - def test_series_frame_commutativity(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_series_frame_commutativity, engine, parser - - def check_complex_series_frame_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) - + @slow + def test_complex_series_frame_alignment(self, engine, parser): import random args = product(self.lhs_index_types, self.index_types, self.index_types, self.index_types) @@ -1050,13 +1021,7 @@ def check_complex_series_frame_alignment(self, engine, parser): tm.assert_equal(res.shape, expected.shape) assert_frame_equal(res, expected) - @slow - def test_complex_series_frame_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield self.check_complex_series_frame_alignment, engine, parser - - def check_performance_warning_for_poor_alignment(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_performance_warning_for_poor_alignment(self, engine, parser): df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) if engine == 'numexpr': @@ -1098,11 +1063,6 @@ def check_performance_warning_for_poor_alignment(self, engine, parser): "".format(1, 'df', np.log10(s.size - df.shape[1]))) tm.assert_equal(msg, expected) - def test_performance_warning_for_poor_alignment(self): - for engine, parser in ENGINES_PARSERS: - yield (self.check_performance_warning_for_poor_alignment, engine, - parser) - #------------------------------------ # slightly more complex ops @@ -1762,18 +1722,12 @@ def setUpClass(cls): class TestScope(object): - def check_global_scope(self, e, engine, parser): - tm.skip_if_no_ne(engine) + def test_global_scope(self, engine, parser): + e = '_var_s * 2' tm.assert_numpy_array_equal(_var_s * 2, pd.eval(e, engine=engine, parser=parser)) - def test_global_scope(self): - e = '_var_s * 2' - for engine, parser in product(_engines, expr._parsers): - yield self.check_global_scope, e, engine, parser - - def check_no_new_locals(self, engine, parser): - tm.skip_if_no_ne(engine) + def test_no_new_locals(self, engine, parser): x = 1 lcls = locals().copy() pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) @@ -1781,22 +1735,13 @@ def check_no_new_locals(self, engine, parser): lcls2.pop('lcls') tm.assert_equal(lcls, lcls2) - def test_no_new_locals(self): - for engine, parser in product(_engines, expr._parsers): - yield self.check_no_new_locals, engine, parser - - def check_no_new_globals(self, engine, 
parser): - tm.skip_if_no_ne(engine) + def test_no_new_globals(self, engine, parser): x = 1 gbls = globals().copy() pd.eval('x + 1', engine=engine, parser=parser) gbls2 = globals().copy() tm.assert_equal(gbls, gbls2) - def test_no_new_globals(self): - for engine, parser in product(_engines, expr._parsers): - yield self.check_no_new_globals, engine, parser - def test_invalid_engine(): tm.skip_if_no_ne() @@ -1816,7 +1761,9 @@ def test_invalid_parser(): 'pandas': PandasExprVisitor} -def check_disallowed_nodes(engine, parser): +@pytest.mark.parametrize('engine', _parsers) +@pytest.mark.parametrize('parser', _parsers) +def test_disallowed_nodes(engine, parser): tm.skip_if_no_ne(engine) VisitorClass = _parsers[parser] uns_ops = VisitorClass.unsupported_nodes @@ -1827,38 +1774,19 @@ def check_disallowed_nodes(engine, parser): getattr(inst, ops)() -def test_disallowed_nodes(): - for engine, visitor in product(_parsers, repeat=2): - yield check_disallowed_nodes, engine, visitor - - -def check_syntax_error_exprs(engine, parser): - tm.skip_if_no_ne(engine) +def test_syntax_error_exprs(engine, parser): e = 's +' with pytest.raises(SyntaxError): pd.eval(e, engine=engine, parser=parser) -def test_syntax_error_exprs(): - for engine, parser in ENGINES_PARSERS: - yield check_syntax_error_exprs, engine, parser - - -def check_name_error_exprs(engine, parser): - tm.skip_if_no_ne(engine) +def test_name_error_exprs(engine, parser): e = 's + t' with tm.assertRaises(NameError): pd.eval(e, engine=engine, parser=parser) -def test_name_error_exprs(): - for engine, parser in ENGINES_PARSERS: - yield check_name_error_exprs, engine, parser - - -def check_invalid_local_variable_reference(engine, parser): - tm.skip_if_no_ne(engine) - +def test_invalid_local_variable_reference(engine, parser): a, b = 1, 2 exprs = 'a + @b', '@a + b', '@a + @b' for expr in exprs: @@ -1870,13 +1798,7 @@ def check_invalid_local_variable_reference(engine, parser): pd.eval(exprs, engine=engine, parser=parser) -def test_invalid_local_variable_reference(): - for engine, parser in ENGINES_PARSERS: - yield check_invalid_local_variable_reference, engine, parser - - -def check_numexpr_builtin_raises(engine, parser): - tm.skip_if_no_ne(engine) +def test_numexpr_builtin_raises(engine, parser): sin, dotted_line = 1, 2 if engine == 'numexpr': with tm.assertRaisesRegexp(NumExprClobberingError, @@ -1887,51 +1809,35 @@ def check_numexpr_builtin_raises(engine, parser): tm.assert_equal(res, sin + dotted_line) -def test_numexpr_builtin_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_numexpr_builtin_raises, engine, parser - - -def check_bad_resolver_raises(engine, parser): - tm.skip_if_no_ne(engine) +def test_bad_resolver_raises(engine, parser): cannot_resolve = 42, 3.0 with tm.assertRaisesRegexp(TypeError, 'Resolver of type .+'): pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, parser=parser) -def test_bad_resolver_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_bad_resolver_raises, engine, parser - - -def check_empty_string_raises(engine, parser): +def test_empty_string_raises(engine, parser): # GH 13139 - tm.skip_if_no_ne(engine) with tm.assertRaisesRegexp(ValueError, 'expr cannot be an empty string'): pd.eval('', engine=engine, parser=parser) -def test_empty_string_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_empty_string_raises, engine, parser - - -def check_more_than_one_expression_raises(engine, parser): - tm.skip_if_no_ne(engine) +def test_more_than_one_expression_raises(engine, parser): with 
tm.assertRaisesRegexp(SyntaxError, 'only a single expression is allowed'): pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) -def test_more_than_one_expression_raises(): - for engine, parser in ENGINES_PARSERS: - yield check_more_than_one_expression_raises, engine, parser +@pytest.mark.parametrize('cmp', ('and', 'or')) +@pytest.mark.parametrize('lhs', (int, float)) +@pytest.mark.parametrize('rhs', (int, float)) +def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): + gen = {int: lambda: np.random.randint(10), float: np.random.randn} + mid = gen[lhs]() + lhs = gen[lhs]() + rhs = gen[rhs]() -def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): - tm.skip_if_no_ne(engine) - mid = gen[type(lhs)]() ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) @@ -1940,32 +1846,14 @@ def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): pd.eval(ex, engine=engine, parser=parser) -def test_bool_ops_fails_on_scalars(): - _bool_ops_syms = 'and', 'or' - dtypes = int, float - gen = {int: lambda: np.random.randint(10), float: np.random.randn} - for engine, parser, dtype1, cmp, dtype2 in product(_engines, expr._parsers, - dtypes, _bool_ops_syms, - dtypes): - yield (check_bool_ops_fails_on_scalars, gen, gen[dtype1](), cmp, - gen[dtype2](), engine, parser) - - -def check_inf(engine, parser): - tm.skip_if_no_ne(engine) +def test_inf(engine, parser): s = 'inf + 1' expected = np.inf result = pd.eval(s, engine=engine, parser=parser) tm.assert_equal(result, expected) -def test_inf(): - for engine, parser in ENGINES_PARSERS: - yield check_inf, engine, parser - - -def check_negate_lt_eq_le(engine, parser): - tm.skip_if_no_ne(engine) +def test_negate_lt_eq_le(engine, parser): df = pd.DataFrame([[0, 10], [1, 20]], columns=['cat', 'count']) expected = df[~(df.cat > 0)] @@ -1980,11 +1868,6 @@ def check_negate_lt_eq_le(engine, parser): tm.assert_frame_equal(result, expected) -def test_negate_lt_eq_le(): - for engine, parser in product(_engines, expr._parsers): - yield check_negate_lt_eq_le, engine, parser - - class TestValidate(tm.TestCase): def test_validate_bool_args(self): diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 721d447262149..4d6b6c7daa3c6 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -7,7 +7,6 @@ import os import pytest -from itertools import product import pandas.util.testing as tm from pandas import DataFrame @@ -21,14 +20,18 @@ def salaries_table(): @pytest.mark.parametrize( - "compression,extension", [('gzip', '.gz'), ('bz2', '.bz2'), - ('zip', '.zip'), ('xz', '.xz')]) -def test_compressed_urls(salaries_table, compression, extension): - check_compressed_urls(salaries_table, compression, extension) + "compression,extension", + [('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'), + tm._mark_skipif_no_lzma(('xz', '.xz'))]) +@pytest.mark.parametrize('mode', ['explicit', 'infer']) +@pytest.mark.parametrize('engine', ['python', 'c']) +def test_compressed_urls(salaries_table, compression, extension, mode, engine): + check_compressed_urls(salaries_table, compression, extension, mode, engine) @tm.network -def check_compressed_urls(salaries_table, compression, extension): +def check_compressed_urls(salaries_table, compression, extension, mode, + engine): # test reading compressed urls with various engines and # extension inference base_url = 
('https://github.com/pandas-dev/pandas/raw/master/'
@@ -36,14 +39,11 @@ def check_compressed_urls(salaries_table, compression, extension):
 
     url = base_url + extension
 
-    # args is a (compression, engine) tuple
-    for (c, engine) in product([compression, 'infer'], ['python', 'c']):
+    if mode != 'explicit':
+        compression = mode
 
-        if url.endswith('.xz'):
-            tm._skip_if_no_lzma()
-
-        url_table = read_table(url, compression=c, engine=engine)
-        tm.assert_frame_equal(url_table, salaries_table)
+    url_table = read_table(url, compression=compression, engine=engine)
+    tm.assert_frame_equal(url_table, salaries_table)
 
 
 class TestS3(tm.TestCase):
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index cda386781e2ec..1bd539469dbe3 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -307,12 +307,21 @@ def _skip_if_scipy_0_17():
         pytest.skip("scipy 0.17")
 
 
-def _skip_if_no_lzma():
+def _check_if_lzma():
     try:
         return compat.import_lzma()
     except ImportError:
-        import pytest
-        pytest.skip('need backports.lzma to run')
+        return False
+
+
+def _skip_if_no_lzma():
+    return _check_if_lzma() or pytest.skip('need backports.lzma to run')
+
+
+_mark_skipif_no_lzma = pytest.mark.skipif(
+    not _check_if_lzma(),
+    reason='need backports.lzma to run'
+)
 
 
 def _skip_if_no_xarray():

From 5a8883b965610234366150897fe8963abffd6a7c Mon Sep 17 00:00:00 2001
From: Diego Fernandez
Date: Thu, 16 Feb 2017 09:21:03 -0500
Subject: [PATCH 32/52] BUG: Ensure the right values are set in SeriesGroupBy.nunique

closes #13453

Author: Diego Fernandez

Closes #15418 from aiguofer/gh_13453 and squashes the following commits:

c53bd70 [Diego Fernandez] Add test for #13453 in test_resample and add note to whatsnew
0daab80 [Diego Fernandez] Ensure the right values are set in SeriesGroupBy.nunique
---
 doc/source/whatsnew/v0.20.0.txt | 7 ++++---
 pandas/core/groupby.py | 2 +-
 pandas/tests/groupby/test_groupby.py | 13 +++++++++++++
 pandas/tests/tseries/test_resample.py | 20 ++++++++++++++++++++
 4 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 4708abe4d592e..09551cfc0bcf8 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -418,6 +418,7 @@ New Behavior:
 
 Other API Changes
 ^^^^^^^^^^^^^^^^^
 
+- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`).
 - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
 - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
 - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
@@ -428,9 +429,8 @@ Other API Changes
 - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
 - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
 - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
-- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53.
Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`).
+- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64`` which also caused precision loss (:issue:`14064`, :issue:`14305`).
 - Reorganization of timeseries development tests (:issue:`14854`)
-- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`).
 
 .. _whatsnew_0200.deprecations:
 
@@ -473,7 +473,7 @@ Performance Improvements
   (or with ``compat_x=True``) (:issue:`15073`).
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
 - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
-- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
+- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
 
 
@@ -553,6 +553,7 @@ Bug Fixes
 
 - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`)
 - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`)
+- Bug in ``groupby().nunique()`` with a datetime-like grouper where bin counts were incorrect (:issue:`13453`)
 - Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`)
 - Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 23c835318b0e6..ba2de295fa0a9 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3032,7 +3032,7 @@ def nunique(self, dropna=True):
 
         # we might have duplications among the bins
         if len(res) != len(ri):
             res, out = np.zeros(len(ri), dtype=out.dtype), res
-            res[ids] = out
+            res[ids[idx]] = out
 
         return Series(res,
                       index=ri,
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index d53446870beb1..59cbcab23b9e7 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4159,6 +4159,19 @@ def test_nunique_with_empty_series(self):
         expected = pd.Series(name='name', dtype='int64')
         tm.assert_series_equal(result, expected)
 
+    def test_nunique_with_timegrouper(self):
+        # GH 13453
+        test = pd.DataFrame({
+            'time': [Timestamp('2016-06-28 09:35:35'),
+                     Timestamp('2016-06-28 16:09:30'),
+                     Timestamp('2016-06-28 16:46:28')],
+            'data': ['1', '2', '3']}).set_index('time')
+        result = test.groupby(pd.TimeGrouper(freq='h'))['data'].nunique()
+        expected = test.groupby(
+            pd.TimeGrouper(freq='h')
+        )['data'].apply(pd.Series.nunique)
+        tm.assert_series_equal(result, expected)
+
     def test_numpy_compat(self):
         # see gh-12811
         df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py
index afb44887fe7d1..45bbc88ef711d 100755
--- a/pandas/tests/tseries/test_resample.py
+++ b/pandas/tests/tseries/test_resample.py
@@ -1939,6 +1939,26 @@ def test_resample_nunique(self):
 
         result = 
df.ID.groupby(pd.Grouper(freq='D')).nunique() assert_series_equal(result, expected) + def test_resample_nunique_with_date_gap(self): + # GH 13453 + index = pd.date_range('1-1-2000', '2-15-2000', freq='h') + index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h') + index3 = index.append(index2) + s = pd.Series(range(len(index3)), index=index3) + r = s.resample('M') + + # Since all elements are unique, these should all be the same + results = [ + r.count(), + r.nunique(), + r.agg(pd.Series.nunique), + r.agg('nunique') + ] + + assert_series_equal(results[0], results[1]) + assert_series_equal(results[0], results[2]) + assert_series_equal(results[0], results[3]) + def test_resample_group_info(self): # GH10914 for n, k in product((10000, 100000), (10, 100, 1000)): dr = date_range(start='2015-08-27', periods=n // 10, freq='T') From c7300ea9ccf6c8b4eeb5a4ae59dc2419753c9b18 Mon Sep 17 00:00:00 2001 From: abaldenko Date: Thu, 16 Feb 2017 12:39:27 -0500 Subject: [PATCH 33/52] BUG: Concat with inner join and empty DataFrame closes #15328 Author: abaldenko Closes #15397 from abaldenko/concat_empty_dataframe and squashes the following commits: 47c8735 [abaldenko] BUG: Concat with inner join and empty DataFrame fc473b7 [abaldenko] BUG: Concat with inner join and empty DataFrame b86dcb6 [abaldenko] BUG: Concat with inner join and empty DataFrame --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/tools/test_concat.py | 10 ++++++++++ pandas/tests/tools/test_merge.py | 8 ++++++++ pandas/tools/concat.py | 4 +++- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 09551cfc0bcf8..ddb9088035d89 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -576,7 +576,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - +- Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 87a0dda34a525..2a28fccdc9b94 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1825,6 +1825,16 @@ def test_concat_bug_3602(self): result = concat([df1, df2], axis=1) assert_frame_equal(result, expected) + def test_concat_inner_join_empty(self): + # GH 15328 + df_empty = pd.DataFrame() + df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') + df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + + for how, expected in [('inner', df_expected), ('outer', df_a)]: + result = pd.concat([df_a, df_empty], axis=1, join=how) + assert_frame_equal(result, expected) + def test_concat_series_axis1_same_names_ignore_index(self): dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] s1 = Series(randn(len(dates)), index=dates, name='value') diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index 472d8674f9f8d..b3b5e7e29319b 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -52,6 +52,14 @@ def setUp(self): self.right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) + def test_merge_inner_join_empty(self): + # GH 15328 + df_empty = pd.DataFrame() + df_a = pd.DataFrame({'a': [1, 2]}, 
index=[0, 1], dtype='int64') + result = pd.merge(df_empty, df_a, left_index=True, right_index=True) + expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + assert_frame_equal(result, expected) + def test_merge_common(self): joined = merge(self.df, self.df2) exp = merge(self.df, self.df2, on=['key1', 'key2']) diff --git a/pandas/tools/concat.py b/pandas/tools/concat.py index dbbc831b19d1d..31d7a9eb9a01a 100644 --- a/pandas/tools/concat.py +++ b/pandas/tools/concat.py @@ -284,7 +284,9 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, if sum(obj.shape) > 0 or isinstance(obj, Series)] if (len(non_empties) and (keys is None and names is None and - levels is None and join_axes is None)): + levels is None and + join_axes is None and + not self.intersect)): objs = non_empties sample = objs[0] From 9b5d8488e8184da0507c09482f23ebfff34ecc43 Mon Sep 17 00:00:00 2001 From: Jeff Carey Date: Thu, 16 Feb 2017 12:45:29 -0500 Subject: [PATCH 34/52] ENH: Added ability to freeze panes from DataFrame.to_excel() (#15160) closes #15160 Author: Jeff Carey Closes #15291 from jeffcarey/enh-15160 and squashes the following commits: cef8fce [Jeff Carey] ENH: Added ability to freeze panes from DataFrame.to_excel() --- doc/source/io.rst | 13 +++++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 19 ++++++++++++++++-- pandas/core/generic.py | 7 ++++++- pandas/io/excel.py | 34 ++++++++++++++++++++++++++------- pandas/tests/io/test_excel.py | 12 ++++++++++-- 6 files changed, 74 insertions(+), 12 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 22eac33a715ba..2d6ddf98437e5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2777,6 +2777,7 @@ Added support for Openpyxl >= 2.2 ``'xlsxwriter'`` will produce an Excel 2007-format workbook (xlsx). If omitted, an Excel 2007-formatted workbook is produced. + .. _io.excel.writers: Excel writer engines @@ -2823,6 +2824,18 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') +.. _io.excel.style: + +Style and Formatting +'''''''''''''''''''' + +The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method. + +- ``float_format`` : Format string for floating point numbers (default None) +- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will +freeze the first row and first column (default None) + + .. _io.clipboard: Clipboard diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ddb9088035d89..75a8752c9bfa4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -153,6 +153,7 @@ Other enhancements - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) +- ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f7c306ea7ce95..3ebdf72a5cde9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1390,7 +1390,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True): + merge_cells=True, encoding=None, inf_rep='inf', verbose=True, + freeze_panes=None): from pandas.io.excel import ExcelWriter need_save = False if encoding is None: @@ -1406,12 +1407,26 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', index_label=index_label, merge_cells=merge_cells, inf_rep=inf_rep) + formatted_cells = formatter.get_formatted_cells() + freeze_panes = self._validate_freeze_panes(freeze_panes) excel_writer.write_cells(formatted_cells, sheet_name, - startrow=startrow, startcol=startcol) + startrow=startrow, startcol=startcol, + freeze_panes=freeze_panes) if need_save: excel_writer.save() + def _validate_freeze_panes(self, freeze_panes): + if freeze_panes is not None: + if ( + len(freeze_panes) == 2 and + all(isinstance(item, int) for item in freeze_panes) + ): + return freeze_panes + + raise ValueError("freeze_panes must be of form (row, column)" + " where row and column are integers") + def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 20e6e027dbf09..204cd91ebfab0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1033,7 +1033,7 @@ def __setstate__(self, state): # I/O Methods _shared_docs['to_excel'] = """ - Write %(klass)s to a excel sheet + Write %(klass)s to an excel sheet %(versionadded_to_excel)s Parameters ---------- @@ -1072,6 +1072,11 @@ def __setstate__(self, state): inf_rep : string, default 'inf' Representation for infinity (there is no native representation for infinity in Excel) + freeze_panes : tuple of integer (length 2), default None + Specifies the bottommost row and rightmost column that + is to be frozen + + .. versionadded:: 0.20.0 Notes ----- diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 2821983213646..37a61b7dc9ab5 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -693,7 +693,8 @@ def engine(self): pass @abc.abstractmethod - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, + freeze_panes=None): """ Write given formated cells into Excel an excel sheet @@ -705,6 +706,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): Name of Excel sheet, if None, then use self.cur_sheet startrow: upper left cell row to dump data frame startcol: upper left cell column to dump data frame + freeze_panes: integer tuple of length 2 + contains the bottom-most row and right-most column to freeze """ pass @@ -804,7 +807,8 @@ def save(self): """ return self.book.save(self.path) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, + freeze_panes=None): # Write the frame cells using openpyxl. 
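# A minimal usage sketch of the freeze_panes parameter wired through above
# (hedged: assumes an Excel engine such as openpyxl or xlsxwriter is
# installed; 'frozen.xlsx' is a hypothetical writable path):
#
#   import pandas as pd
#   df = pd.DataFrame({'col1': [1, 3], 'col2': [2, 4]})
#   # freeze_panes is one-based: (1, 1) keeps the header row and the
#   # leftmost column visible while scrolling
#   df.to_excel('frozen.xlsx', sheet_name='Sheet1', freeze_panes=(1, 1))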
from openpyxl.cell import get_column_letter @@ -904,7 +908,8 @@ class _Openpyxl20Writer(_Openpyxl1Writer): engine = 'openpyxl20' openpyxl_majorver = 2 - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, + freeze_panes=None): # Write the frame cells using openpyxl. from openpyxl.cell import get_column_letter @@ -1311,7 +1316,8 @@ class _Openpyxl22Writer(_Openpyxl20Writer): engine = 'openpyxl22' openpyxl_majorver = 2 - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, + freeze_panes=None): # Write the frame cells using openpyxl. sheet_name = self._get_sheet_name(sheet_name) @@ -1324,6 +1330,10 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): wks.title = sheet_name self.sheets[sheet_name] = wks + if freeze_panes is not None: + wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1, + column=freeze_panes[1] + 1) + for cell in cells: xcell = wks.cell( row=startrow + cell.row + 1, @@ -1396,7 +1406,8 @@ def save(self): """ return self.book.save(self.path) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, + freeze_panes=None): # Write the frame cells using xlwt. sheet_name = self._get_sheet_name(sheet_name) @@ -1407,6 +1418,11 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): wks = self.book.add_sheet(sheet_name) self.sheets[sheet_name] = wks + if freeze_panes is not None: + wks.set_panes_frozen(True) + wks.set_horz_split_pos(freeze_panes[0]) + wks.set_vert_split_pos(freeze_panes[1]) + style_dict = {} for cell in cells: @@ -1518,11 +1534,12 @@ def save(self): """ Save workbook to disk. """ + return self.book.close() - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, + freeze_panes=None): # Write the frame cells using xlsxwriter. 
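# For orientation, the engine-level calls these writer classes delegate to --
# a sketch only, assuming worksheet objects ('wks') from the libraries this
# module already uses (openpyxl >= 2.2 and xlwt), mirroring the hunks above:
#
#   # openpyxl: freeze everything above and to the left of the given cell
#   wks.freeze_panes = wks.cell(row=2, column=2)   # freezes row 1, column A
#   # xlwt: enable frozen panes, then set the split positions separately
#   wks.set_panes_frozen(True)
#   wks.set_horz_split_pos(1)
#   wks.set_vert_split_pos(1)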
- sheet_name = self._get_sheet_name(sheet_name) if sheet_name in self.sheets: @@ -1533,6 +1550,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): style_dict = {} + if freeze_panes is not None: + wks.freeze_panes(*(freeze_panes)) + for cell in cells: val = _conv_value(cell.val) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 0c2b443cffe52..b66cb24bf44d8 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1836,6 +1836,14 @@ def test_true_and_false_value_options(self): false_values=['bar']) tm.assert_frame_equal(read_frame, expected) + def test_freeze_panes(self): + # GH15160 + expected = DataFrame([[1, 2], [3, 4]], columns=['col1', 'col2']) + with ensure_clean(self.ext) as path: + expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) + result = read_excel(path) + tm.assert_frame_equal(expected, result) + def raise_wrapper(major_ver): def versioned_raise_wrapper(orig_method): @@ -1873,7 +1881,7 @@ class OpenpyxlTests(ExcelWriterBase, tm.TestCase): def test_to_excel_styleconverter(self): _skip_if_no_openpyxl() if not openpyxl_compat.is_compat(major_ver=1): - pytest.skip('incompatiable openpyxl version') + pytest.skip('incompatible openpyxl version') import openpyxl @@ -2095,7 +2103,7 @@ def test_to_excel_styleconverter(self): def test_write_cells_merge_styled(self): if not openpyxl_compat.is_compat(major_ver=2): - pytest.skip('incompatiable openpyxl version') + pytest.skip('incompatible openpyxl version') from pandas.formats.format import ExcelCell From c588dd1d0b7ea2dffb4e9906b8455739c9055037 Mon Sep 17 00:00:00 2001 From: Jeff Carey Date: Fri, 17 Feb 2017 00:17:38 -0800 Subject: [PATCH 35/52] Documents touch-up for DataFrame.to_excel() freeze_panes option (#15436) --- doc/source/io.rst | 4 ++-- pandas/core/generic.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 2d6ddf98437e5..55ef2c09d43e4 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2832,8 +2832,8 @@ Style and Formatting The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method. - ``float_format`` : Format string for floating point numbers (default None) -- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will -freeze the first row and first column (default None) +- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default None) + .. _io.clipboard: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 204cd91ebfab0..26b9a880dd2c7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1073,7 +1073,7 @@ def __setstate__(self, state): Representation for infinity (there is no native representation for infinity in Excel) freeze_panes : tuple of integer (length 2), default None - Specifies the bottommost row and rightmost column that + Specifies the one-based bottommost row and rightmost column that is to be frozen .. 
versionadded:: 0.20.0 From f4e672ccc46da0a358c4729714b6343e39fafd7b Mon Sep 17 00:00:00 2001 From: Peter Date: Fri, 17 Feb 2017 13:09:20 +0000 Subject: [PATCH 36/52] BUG: to_sql convert index name to string (#15404) (#15423) * Converted index name to string to fix issue #15404 - BUG: to_sql errors with numeric index name - needs conversion to string * Additional int to string conversion added. Associated test cases added. * PEP 8 compliance edits * Removed extraneous brackets --- pandas/io/sql.py | 5 +++-- pandas/tests/io/test_sql.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 55e145b493dd9..bace43e785dff 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -750,7 +750,8 @@ def _get_column_names_and_types(self, dtype_mapper): for i, idx_label in enumerate(self.index): idx_type = dtype_mapper( self.frame.index.get_level_values(i)) - column_names_and_types.append((idx_label, idx_type, True)) + column_names_and_types.append((text_type(idx_label), + idx_type, True)) column_names_and_types += [ (text_type(self.frame.columns[i]), @@ -1220,7 +1221,7 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): def _get_unicode_name(name): try: - uname = name.encode("utf-8", "strict").decode("utf-8") + uname = text_type(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: raise ValueError("Cannot convert identifier to UTF-8: '%s'" % name) return uname diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 78560611da7aa..890f52e8c65e9 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -709,6 +709,21 @@ def test_to_sql_index_label(self): self.assertEqual(frame.columns[0], 'other_label', "Specified index_label not written to database") + # index name is integer + temp_frame.index.name = 0 + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace') + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], '0', + "Integer index label not written to database") + + temp_frame.index.name = None + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace', index_label=0) + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], '0', + "Integer index label not written to database") + def test_to_sql_index_label_multiindex(self): temp_frame = DataFrame({'col1': range(4)}, index=MultiIndex.from_product( From 54b6c6e1c443b992a1df3443669a59dbe430271f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Feb 2017 14:12:01 +0100 Subject: [PATCH 37/52] DOC: add whatsnew for #15423 --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 75a8752c9bfa4..c68af842a4f0c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -522,7 +522,7 @@ Bug Fixes - Bug in ``.groupby(...).rolling(...)`` when ``on`` is specified and using a ``DatetimeIndex`` (:issue:`15130`) - +- Bug in ``to_sql`` when writing a DataFrame with numeric index names (:issue:`15404`). - Bug in ``Series.iloc`` where a ``Categorical`` object for list-like indexes input was returned, where a ``Series`` was expected. 
(:issue:`14580`) From 763f42f7bba78acc0bf22f66281d1221b49c7238 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Feb 2017 09:51:46 -0500 Subject: [PATCH 38/52] TST: remove yielding tests from test_msgpacks.py (#15427) --- pandas/tests/io/test_packers.py | 88 ++++++++++++++++++--------------- pandas/tests/io/test_pickle.py | 8 +-- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 911cd8164571d..097c03937ca68 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -41,6 +41,22 @@ _ZLIB_INSTALLED = True +@pytest.fixture(scope='module') +def current_packers_data(): + # our current version packers data + from pandas.tests.io.generate_legacy_storage_files import ( + create_msgpack_data) + return create_msgpack_data() + + +@pytest.fixture(scope='module') +def all_packers_data(): + # our all of our current version packers data + from pandas.tests.io.generate_legacy_storage_files import ( + create_data) + return create_data() + + def check_arbitrary(a, b): if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)): @@ -778,7 +794,16 @@ def test_default_encoding(self): assert_frame_equal(result, frame) -class TestMsgpack(): +def legacy_packers_versions(): + # yield the packers versions + path = tm.get_data_path('legacy_msgpack') + for v in os.listdir(path): + p = os.path.join(path, v) + if os.path.isdir(p): + yield v + + +class TestMsgpack(object): """ How to add msgpack tests: @@ -788,48 +813,38 @@ class TestMsgpack(): $ python generate_legacy_storage_files.py msgpack 3. Move the created pickle to "data/legacy_msgpack/" directory. - - NOTE: TestMsgpack can't be a subclass of tm.Testcase to use test generator. - http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class """ - @classmethod - def setup_class(cls): - from pandas.tests.io.generate_legacy_storage_files import ( - create_msgpack_data, create_data) - cls.data = create_msgpack_data() - cls.all_data = create_data() - cls.path = u('__%s__.msgpack' % tm.rands(10)) - cls.minimum_structure = {'series': ['float', 'int', 'mixed', - 'ts', 'mi', 'dup'], - 'frame': ['float', 'int', 'mixed', 'mi'], - 'panel': ['float'], - 'index': ['int', 'date', 'period'], - 'mi': ['reg2']} - - def check_min_structure(self, data): + minimum_structure = {'series': ['float', 'int', 'mixed', + 'ts', 'mi', 'dup'], + 'frame': ['float', 'int', 'mixed', 'mi'], + 'panel': ['float'], + 'index': ['int', 'date', 'period'], + 'mi': ['reg2']} + + def check_min_structure(self, data, version): for typ, v in self.minimum_structure.items(): assert typ in data, '"{0}" not found in unpacked data'.format(typ) for kind in v: msg = '"{0}" not found in data["{1}"]'.format(kind, typ) assert kind in data[typ], msg - def compare(self, vf, version): + def compare(self, current_data, all_data, vf, version): # GH12277 encoding default used to be latin-1, now utf-8 if LooseVersion(version) < '0.18.0': data = read_msgpack(vf, encoding='latin-1') else: data = read_msgpack(vf) - self.check_min_structure(data) + self.check_min_structure(data, version) for typ, dv in data.items(): - assert typ in self.all_data, ('unpacked data contains ' - 'extra key "{0}"' - .format(typ)) + assert typ in all_data, ('unpacked data contains ' + 'extra key "{0}"' + .format(typ)) for dt, result in dv.items(): - assert dt in self.all_data[typ], ('data["{0}"] contains extra ' - 'key "{1}"'.format(typ, dt)) + assert dt in current_data[typ], ('data["{0}"] contains extra ' + 'key 
"{1}"'.format(typ, dt)) try: - expected = self.data[typ][dt] + expected = current_data[typ][dt] except KeyError: continue @@ -862,9 +877,11 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) - def read_msgpacks(self, version): + @pytest.mark.parametrize('version', legacy_packers_versions()) + def test_msgpacks_legacy(self, current_packers_data, all_packers_data, + version): - pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version))) + pth = tm.get_data_path('legacy_msgpack/{0}'.format(version)) n = 0 for f in os.listdir(pth): # GH12142 0.17 files packed in P2 can't be read in P3 @@ -873,19 +890,10 @@ def read_msgpacks(self, version): continue vf = os.path.join(pth, f) try: - self.compare(vf, version) + self.compare(current_packers_data, all_packers_data, + vf, version) except ImportError: # blosc not installed continue n += 1 assert n > 0, 'Msgpack files are not tested' - - def test_msgpack(self): - msgpack_path = tm.get_data_path('legacy_msgpack') - n = 0 - for v in os.listdir(msgpack_path): - pth = os.path.join(msgpack_path, v) - if os.path.isdir(pth): - yield self.read_msgpacks, v - n += 1 - assert n > 0, 'Msgpack files are not tested' diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 1e3816c1556f6..c736ec829808a 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -187,10 +187,10 @@ def compare_sp_frame_float(result, expected, typ, version): # --------------------- def legacy_pickle_versions(): # yield the pickle versions - pickle_path = tm.get_data_path('legacy_pickle') - for v in os.listdir(pickle_path): - pth = os.path.join(pickle_path, v) - if os.path.isdir(pth): + path = tm.get_data_path('legacy_pickle') + for v in os.listdir(path): + p = os.path.join(path, v) + if os.path.isdir(p): yield v From f65a6415f15d438432cc6954ead61b052c5d4d60 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Fri, 17 Feb 2017 10:07:11 -0500 Subject: [PATCH 39/52] ENH: Don't add rowspan/colspan if it's 1. Just a small thing I noticed in a [footnote here](https://danluu.com/web-bloat/#appendix-irony). Probably can't do much about the extra classes, but rowspan/colspan seem like easy fixes to save a few bytes per row/col and it's already done in the other code path. Author: Elliott Sales de Andrade Closes #15403 from QuLogic/no-extra-span and squashes the following commits: 9a8fcee [Elliott Sales de Andrade] Don't add rowspan/colspan if it's 1. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/formats/style.py | 55 ++++++++++++++++-------------- pandas/tests/formats/test_style.py | 38 +++++++-------------- 3 files changed, 43 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c68af842a4f0c..8e48dbbb083e8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -154,6 +154,7 @@ Other enhancements - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) +- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/formats/style.py b/pandas/formats/style.py index b3e0f0f6c7462..89712910a22e1 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -251,21 +251,23 @@ def format_attr(pair): "class": " ".join(cs), "is_visible": True}) - for c in range(len(clabels[0])): + for c, value in enumerate(clabels[r]): cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] cs.extend(cell_context.get( "col_headings", {}).get(r, {}).get(c, [])) - value = clabels[r][c] - row_es.append({"type": "th", - "value": value, - "display_value": value, - "class": " ".join(cs), - "is_visible": _is_visible(c, r, col_lengths), - "attributes": [ - format_attr({"key": "colspan", - "value": col_lengths.get( - (r, c), 1)}) - ]}) + es = { + "type": "th", + "value": value, + "display_value": value, + "class": " ".join(cs), + "is_visible": _is_visible(c, r, col_lengths), + } + colspan = col_lengths.get((r, c), 0) + if colspan > 1: + es["attributes"] = [ + format_attr({"key": "colspan", "value": colspan}) + ] + row_es.append(es) head.append(row_es) if self.data.index.names and not all(x is None @@ -289,19 +291,22 @@ def format_attr(pair): body = [] for r, idx in enumerate(self.data.index): - # cs.extend( - # cell_context.get("row_headings", {}).get(r, {}).get(c, [])) - row_es = [{"type": "th", - "is_visible": _is_visible(r, c, idx_lengths), - "attributes": [ - format_attr({"key": "rowspan", - "value": idx_lengths.get((c, r), 1)}) - ], - "value": rlabels[r][c], - "class": " ".join([ROW_HEADING_CLASS, "level%s" % c, - "row%s" % r]), - "display_value": rlabels[r][c]} - for c in range(len(rlabels[r]))] + row_es = [] + for c, value in enumerate(rlabels[r]): + es = { + "type": "th", + "is_visible": _is_visible(r, c, idx_lengths), + "value": value, + "display_value": value, + "class": " ".join([ROW_HEADING_CLASS, "level%s" % c, + "row%s" % r]), + } + rowspan = idx_lengths.get((c, r), 0) + if rowspan > 1: + es["attributes"] = [ + format_attr({"key": "rowspan", "value": rowspan}) + ] + row_es.append(es) for c, col in enumerate(self.data.columns): cs = [DATA_CLASS, "row%s" % r, "col%s" % c] diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py index 53bb3f9010f7e..44af0b8ebb085 100644 --- a/pandas/tests/formats/test_style.py +++ b/pandas/tests/formats/test_style.py @@ -141,21 +141,18 @@ def test_empty_index_name_doesnt_display(self): 'type': 'th', 'value': 'A', 'is_visible': True, - 'attributes': ["colspan=1"], }, {'class': 'col_heading level0 col1', 'display_value': 'B', 'type': 'th', 'value': 'B', 'is_visible': True, - 'attributes': ["colspan=1"], }, {'class': 'col_heading level0 col2', 'display_value': 'C', 'type': 'th', 'value': 'C', 'is_visible': True, - 'attributes': ["colspan=1"], }]] self.assertEqual(result['head'], expected) @@ -168,11 +165,9 @@ def test_index_name(self): expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', 'display_value': '', 'is_visible': True}, {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'B', 'display_value': 'B', - 'is_visible': True, 'attributes': ['colspan=1']}, + 'value': 'B', 'display_value': 'B', 'is_visible': True}, {'class': 'col_heading level0 col1', 'type': 'th', - 'value': 'C', 'display_value': 'C', - 'is_visible': True, 'attributes': ['colspan=1']}], + 'value': 'C', 'display_value': 'C', 'is_visible': True}], [{'class': 'index_name level0', 'type': 'th', 'value': 'A'}, {'class': 'blank', 'type': 'th', 'value': ''}, @@ -191,9 +186,7 @@ def 
test_multiindex_name(self): {'class': 'blank level0', 'type': 'th', 'value': '', 'display_value': '', 'is_visible': True}, {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'C', 'display_value': 'C', - 'is_visible': True, 'attributes': ['colspan=1'], - }], + 'value': 'C', 'display_value': 'C', 'is_visible': True}], [{'class': 'index_name level0', 'type': 'th', 'value': 'A'}, {'class': 'index_name level1', 'type': 'th', @@ -618,16 +611,14 @@ def test_mi_sparse(self): body_1 = result['body'][0][1] expected_1 = { "value": 0, "display_value": 0, "is_visible": True, - "type": "th", "attributes": ["rowspan=1"], - "class": "row_heading level1 row0", + "type": "th", "class": "row_heading level1 row0", } tm.assert_dict_equal(body_1, expected_1) body_10 = result['body'][1][0] expected_10 = { "value": 'a', "display_value": 'a', "is_visible": False, - "type": "th", "attributes": ["rowspan=1"], - "class": "row_heading level0 row1", + "type": "th", "class": "row_heading level0 row1", } tm.assert_dict_equal(body_10, expected_10) @@ -637,9 +628,8 @@ def test_mi_sparse(self): 'is_visible': True, "display_value": ''}, {'type': 'th', 'class': 'blank level0', 'value': '', 'is_visible': True, 'display_value': ''}, - {'attributes': ['colspan=1'], 'class': 'col_heading level0 col0', - 'is_visible': True, 'type': 'th', 'value': 'A', - 'display_value': 'A'}] + {'type': 'th', 'class': 'col_heading level0 col0', 'value': 'A', + 'is_visible': True, 'display_value': 'A'}] self.assertEqual(head, expected) def test_mi_sparse_disabled(self): @@ -650,7 +640,7 @@ def test_mi_sparse_disabled(self): result = df.style._translate() body = result['body'] for row in body: - self.assertEqual(row[0]['attributes'], ['rowspan=1']) + assert 'attributes' not in row[0] def test_mi_sparse_index_names(self): df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( @@ -686,28 +676,24 @@ def test_mi_sparse_column_names(self): 'type': 'th', 'is_visible': True}, {'class': 'index_name level1', 'value': 'col_1', 'display_value': 'col_1', 'is_visible': True, 'type': 'th'}, - {'attributes': ['colspan=1'], - 'class': 'col_heading level1 col0', + {'class': 'col_heading level1 col0', 'display_value': 1, 'is_visible': True, 'type': 'th', 'value': 1}, - {'attributes': ['colspan=1'], - 'class': 'col_heading level1 col1', + {'class': 'col_heading level1 col1', 'display_value': 0, 'is_visible': True, 'type': 'th', 'value': 0}, - {'attributes': ['colspan=1'], - 'class': 'col_heading level1 col2', + {'class': 'col_heading level1 col2', 'display_value': 1, 'is_visible': True, 'type': 'th', 'value': 1}, - {'attributes': ['colspan=1'], - 'class': 'col_heading level1 col3', + {'class': 'col_heading level1 col3', 'display_value': 0, 'is_visible': True, 'type': 'th', From a17a03a404649c0672b75983432759e8a29e0804 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 18 Feb 2017 11:52:01 +0100 Subject: [PATCH 40/52] DOC: correct rpy2 examples (GH15142) (#15450) --- doc/source/r_interface.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index b5d699cad69d5..88634d7f75c63 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -41,15 +41,17 @@ In the remainder of this page, a few examples of explicit conversion is given. 
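A minimal end-to-end sketch of the activated conversion used below (it assumes
rpy2 and its pandas interface are importable):

.. code-block:: python

   from rpy2.robjects import r, pandas2ri

   pandas2ri.activate()   # register automatic R <-> pandas conversion
   r.data('iris')         # load R's built-in iris data set
   df_iris = r['iris']    # returned directly as a pandas DataFrame
   df_iris.head()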
Transferring R data sets into Python
------------------------------------

-The ``pandas2ri.ri2py`` function retrieves an R data set and converts it to the
-appropriate pandas object (most likely a DataFrame):
+Once the pandas conversion is activated (``pandas2ri.activate()``), many conversions
+of R to pandas objects will be done automatically. For example, to obtain the 'iris' dataset as a pandas DataFrame:

 .. ipython:: python

    r.data('iris')
-   df_iris = pandas2ri.ri2py(r['iris'])
-   df_iris.head()
+   r['iris'].head()

+If the pandas conversion was not activated, the above could also be accomplished
+by explicitly converting it with the ``pandas2ri.ri2py`` function
+(``pandas2ri.ri2py(r['iris'])``).

 Converting DataFrames into R objects
 ------------------------------------
@@ -65,7 +67,6 @@ DataFrames into the equivalent R object (that is, **data.frame**):
    print(type(r_dataframe))
    print(r_dataframe)

-
 The DataFrame's index is stored as the ``rownames`` attribute of the
 data.frame instance.

From 29aeffb8d77f56c3a3862a6bfaee993aa7660500 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Sat, 18 Feb 2017 04:08:54 -0800
Subject: [PATCH 41/52] BUG: rolling not accepting Timedelta-like window args
 (#15443)

Remove unnecessary pd.Timedelta

---
 doc/source/whatsnew/v0.20.0.txt |  1 +
 pandas/core/window.py           |  4 +++-
 pandas/tests/test_window.py     | 20 +++++++++++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8e48dbbb083e8..ae4a3d3c3d97f 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -551,6 +551,7 @@ Bug Fixes
- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)
- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)
- Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`)
+- Bug in ``.rolling()`` where ``pd.Timedelta`` or ``datetime.timedelta`` was not accepted as a ``window`` argument (:issue:`15440`)
- Bug in ``DataFrame.resample().median()`` if duplicate column names are present (:issue:`14233`)
- Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`)
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 50de6b84d7cba..3f9aa2b0ff392 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -10,6 +10,7 @@
 import warnings
 import numpy as np
 from collections import defaultdict
+from datetime import timedelta

 from pandas.types.generic import (ABCSeries,
                                   ABCDataFrame,
@@ -1014,7 +1015,8 @@ def validate(self):

         # we allow rolling on a datetimelike index
         if (self.is_datetimelike and
-                isinstance(self.window, (compat.string_types, DateOffset))):
+                isinstance(self.window, (compat.string_types, DateOffset,
+                                         timedelta))):

             self._validate_monotonic()
             freq = self._validate_freq()
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 1bb1f91423a9d..452e8999ab13f 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -4,7 +4,7 @@
 import warnings
 from warnings import catch_warnings

-from datetime import datetime
+from datetime import datetime, timedelta
 from numpy.random import randn
 import numpy as np
 from distutils.version import LooseVersion
@@ -401,6 +401,24 @@ def test_constructor_with_win_type(self):
         with self.assertRaises(ValueError):
             c(-1,
win_type='boxcar') + def test_constructor_with_timedelta_window(self): + # GH 15440 + n = 10 + df = pd.DataFrame({'value': np.arange(n)}, + index=pd.date_range('2015-12-24', + periods=n, + freq="D")) + expected_data = np.append([0., 1.], np.arange(3., 27., 3)) + for window in [timedelta(days=3), pd.Timedelta(days=3)]: + result = df.rolling(window=window).sum() + expected = pd.DataFrame({'value': expected_data}, + index=pd.date_range('2015-12-24', + periods=n, + freq="D")) + tm.assert_frame_equal(result, expected) + expected = df.rolling('3D').sum() + tm.assert_frame_equal(result, expected) + def test_numpy_compat(self): # see gh-12811 r = rwindow.Rolling(Series([2, 4, 6]), window=2) From be4a63fe791e27c2f8a9ae4f3a419ccc255c1b5b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Feb 2017 12:04:48 -0500 Subject: [PATCH 42/52] BUG: testing on windows - we are passing builds which actually have an error - fix the small dtype issues Author: Jeff Reback Closes #15445 from jreback/windows and squashes the following commits: a5b7fb3 [Jeff Reback] change integer to power comparisions eab15c4 [Jeff Reback] don't force remove pandas cf3b9bd [Jeff Reback] more windows fixing efe6a76 [Jeff Reback] add cython to build 8194e63 [Jeff Reback] don't use appveyor recipe, just build inplace e064825 [Jeff Reback] TST: resample dtype issue xref #15418 10d9b26 [Jeff Reback] TST: run windows tests so failures show up in appeveyor --- appveyor.yml | 12 ++++---- ci/appveyor.recipe/bld.bat | 2 -- ci/appveyor.recipe/build.sh | 2 -- ci/appveyor.recipe/meta.yaml | 37 ------------------------- pandas/tests/indexing/test_timedelta.py | 3 +- pandas/tests/test_expressions.py | 10 +++---- pandas/tests/tseries/test_resample.py | 2 +- test.bat | 3 +- 8 files changed, 13 insertions(+), 58 deletions(-) delete mode 100644 ci/appveyor.recipe/bld.bat delete mode 100644 ci/appveyor.recipe/build.sh delete mode 100644 ci/appveyor.recipe/meta.yaml diff --git a/appveyor.yml b/appveyor.yml index d96e1dfcf76de..1c14698430996 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -78,21 +78,19 @@ install: # this is now the downloaded conda... 
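# A minimal sketch of the Timedelta-like rolling windows enabled by patch 41
# above (hedged: assumes a daily DatetimeIndex; all three spellings should
# agree after that change):
#
#   import pandas as pd
#   from datetime import timedelta
#   s = pd.Series(range(10),
#                 index=pd.date_range('2015-12-24', periods=10, freq='D'))
#   s.rolling(window=timedelta(days=3)).sum()
#   s.rolling(window=pd.Timedelta(days=3)).sum()
#   s.rolling('3D').sum()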
- cmd: conda info -a - # build em using the local source checkout in the correct windows env - - cmd: '%CMD_IN_ENV% conda build ci\appveyor.recipe -q' - # create our env - - cmd: conda create -q -n pandas python=%PYTHON_VERSION% pytest + - cmd: conda create -q -n pandas python=%PYTHON_VERSION% cython pytest - cmd: activate pandas - SET REQ=ci\requirements-%PYTHON_VERSION%-%PYTHON_ARCH%.run - cmd: echo "installing requirements from %REQ%" - cmd: conda install -n pandas -q --file=%REQ% - cmd: conda list -n pandas - cmd: echo "installing requirements from %REQ% - done" - - ps: conda install -n pandas (conda build ci\appveyor.recipe -q --output) + + # build em using the local source checkout in the correct windows env + - cmd: '%CMD_IN_ENV% python setup.py build_ext --inplace' test_script: # tests - cmd: activate pandas - - cmd: cd \ - - cmd: python -c "import pandas; pandas.test(['--skip-slow', '--skip-network'])" + - cmd: test.bat diff --git a/ci/appveyor.recipe/bld.bat b/ci/appveyor.recipe/bld.bat deleted file mode 100644 index 284926fae8c04..0000000000000 --- a/ci/appveyor.recipe/bld.bat +++ /dev/null @@ -1,2 +0,0 @@ -@echo off -%PYTHON% setup.py install diff --git a/ci/appveyor.recipe/build.sh b/ci/appveyor.recipe/build.sh deleted file mode 100644 index f341bce6fcf96..0000000000000 --- a/ci/appveyor.recipe/build.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -$PYTHON setup.py install diff --git a/ci/appveyor.recipe/meta.yaml b/ci/appveyor.recipe/meta.yaml deleted file mode 100644 index 777fd9d682d48..0000000000000 --- a/ci/appveyor.recipe/meta.yaml +++ /dev/null @@ -1,37 +0,0 @@ -package: - name: pandas - version: 0.20.0 - -build: - number: {{environ.get('APPVEYOR_BUILD_NUMBER', 0)}} # [win] - string: np{{ environ.get('CONDA_NPY') }}py{{ environ.get('CONDA_PY') }}_{{ environ.get('APPVEYOR_BUILD_NUMBER', 0) }} # [win] - -source: - - # conda-build needs a full clone - # rather than a shallow git_url type clone - # https://github.com/conda/conda-build/issues/780 - path: ../../ - -requirements: - build: - - python - - cython - - numpy x.x - - setuptools - - pytz - - python-dateutil - - run: - - python - - numpy x.x - - python-dateutil - - pytz - -test: - imports: - - pandas - -about: - home: http://pandas.pydata.org - license: BSD diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index e5ccd72cac20a..5f0088382ce57 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -13,8 +13,7 @@ def test_boolean_indexing(self): [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]] for cond, data in zip(conditions, expected_data): - result = df.copy() - result.loc[cond, 'x'] = 10 + result = df.assign(x=df.mask(cond, 10).astype('int64')) expected = pd.DataFrame(data, index=pd.to_timedelta(range(10), unit='s'), columns=['x']) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 3032a288032a2..f669ebe371f9d 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -12,7 +12,7 @@ from pandas.core.api import DataFrame, Panel from pandas.computation import expressions as expr -from pandas import compat, _np_version_under1p12 +from pandas import compat, _np_version_under1p11 from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_panel4d_equal, slow) @@ -70,10 +70,10 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False, operations.append('div') for arith in 
operations: - # numpy >= 1.12 doesn't handle integers + # numpy >= 1.11 doesn't handle integers # raised to integer powers # https://github.com/pandas-dev/pandas/issues/15363 - if arith == 'pow' and not _np_version_under1p12: + if arith == 'pow' and not _np_version_under1p11: continue operator_name = arith @@ -272,10 +272,10 @@ def testit(): for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), ('div', '/'), ('pow', '**')]: - # numpy >= 1.12 doesn't handle integers + # numpy >= 1.11 doesn't handle integers # raised to integer powers # https://github.com/pandas-dev/pandas/issues/15363 - if op == 'pow' and not _np_version_under1p12: + if op == 'pow' and not _np_version_under1p11: continue if op == 'div': diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py index 45bbc88ef711d..6e999c5b1d276 100755 --- a/pandas/tests/tseries/test_resample.py +++ b/pandas/tests/tseries/test_resample.py @@ -1944,7 +1944,7 @@ def test_resample_nunique_with_date_gap(self): index = pd.date_range('1-1-2000', '2-15-2000', freq='h') index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h') index3 = index.append(index2) - s = pd.Series(range(len(index3)), index=index3) + s = pd.Series(range(len(index3)), index=index3, dtype='int64') r = s.resample('M') # Since all elements are unique, these should all be the same diff --git a/test.bat b/test.bat index 7f9244abb2bc8..2c5f25c24a637 100644 --- a/test.bat +++ b/test.bat @@ -1,4 +1,3 @@ :: test on windows -:: nosetests --exe -A "not slow and not network and not disabled" pandas %* -pytest pandas +pytest --skip-slow --skip-network pandas From c7a1e009049c1e7da79c2ac8bc60a11ab948a5e0 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Sun, 12 Feb 2017 00:21:37 +0000 Subject: [PATCH 43/52] get_indexer_non_unique for orderable indexes C level list no gil capture index error wrong exception handling --- pandas/index.pyx | 186 ++++++++++++++++++++++++++++++++++++++++- pandas/indexes/base.py | 30 ++++++- 2 files changed, 213 insertions(+), 3 deletions(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 0c975d1775a03..9c73ae1045b5c 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -1,9 +1,11 @@ # cython: profile=False from numpy cimport ndarray +from libc.stdlib cimport malloc, free from numpy cimport (float64_t, int32_t, int64_t, uint8_t, - NPY_DATETIME, NPY_TIMEDELTA) + NPY_DATETIME, NPY_TIMEDELTA, PyArray_SimpleNewFromData, + NPY_INT64) cimport cython cimport numpy as cnp @@ -23,7 +25,7 @@ from pandas.tslib import Timestamp, Timedelta from datetime cimport (get_datetime64_value, _pydatetime_to_dts, pandas_datetimestruct) -from cpython cimport PyTuple_Check, PyList_Check +from cpython cimport PyTuple_Check, PyList_Check, PyMem_Malloc, PyMem_Free cdef extern from "datetime.h": bint PyDateTime_Check(object o) @@ -44,6 +46,146 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) +ctypedef struct Int64ListNode: + int64_t value + Int64ListNode *next + +ctypedef struct Int64List: + Int64ListNode *root + Int64ListNode *last + Py_ssize_t n + bint owns + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef Int64List* Int64List_create_array(Py_ssize_t n) nogil: + + cdef: + Int64List *lst = malloc(n * sizeof(Int64List)) + Py_ssize_t i + + for i in range(n): + lst[i].n = 0 + lst[i].root = NULL + lst[i].last = NULL + + return lst + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef void 
Int64List_destroy_array(Int64List *lst, Py_ssize_t n) nogil: + cdef: + Int64ListNode *next + Int64ListNode *p + Py_ssize_t i + + for i in range(n): + if lst[i].owns: + p = lst[i].root + while p is not NULL: + next = p[0].next + free(p) + p = next + + free(lst) + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef inline void _append(Int64List *lst, int64_t x) nogil: + + cdef Int64ListNode *nn = malloc(sizeof(Int64ListNode)) + + nn[0].value = x + nn[0].next = NULL + + if lst[0].root is NULL: + lst[0].root = nn + lst[0].owns = 1 + else: + lst[0].last[0].next = nn + + lst[0].last = nn + lst[0].n += 1 + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef inline void _copy_to(Int64List *dst, Int64List *src) nogil: + dst[0].root = src[0].root + dst[0].last = src[0].last + dst[0].n = src[0].n + dst[0].owns = 0 + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n, + Py_ssize_t *nt) nogil: + nt[0] = 0 + cdef: + Py_ssize_t last = 0 + Int64ListNode* node + + for i in range(n): + nt[0] += lst[i].n + + cdef int64_t *data = malloc(nt[0] * sizeof(int64_t)) + + for i in range(n): + + node = lst[i].root + while node is not NULL: + data[last] = node[0].value + last += 1 + node = node[0].next + + return data + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets, + int64_t[:] idx0, + int64_t[:] idx1, + Int64List* result, + Int64List* missing): + cdef: + Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0] + + while i < n and j < n_t: + + val0 = values[idx0[i]] + val1 = targets[idx1[j]] + + if val0 == val1: + + while i < n and values[idx0[i]] == val1: + _append(&(result[idx1[j]]), idx0[i]) + i += 1 + + j += 1 + while j < n_t and val0 == targets[idx1[j]]: + _copy_to(&(result[idx1[j]]), &(result[idx1[j-1]])) + j += 1 + + elif val0 > val1: + + _append(&(result[idx1[j]]), -1) + _append(&(missing[idx1[j]]), idx1[j]) + j += 1 + + else: + i += 1 + + while j < n_t: + _append(&(result[idx1[j]]), -1) + _append(&(missing[idx1[j]]), idx1[j]) + j += 1 + cdef inline is_definitely_invalid_key(object val): if PyTuple_Check(val): @@ -372,6 +514,46 @@ cdef class IndexEngine: return result[0:count], missing[0:count_missing] + def get_indexer_non_unique_orderable(self, ndarray targets, + int64_t[:] idx0, + int64_t[:] idx1): + + cdef: + ndarray values + object val0, val1 + Py_ssize_t n_t + + self._ensure_mapping_populated() + values = self._get_index_values() + n_t = len(targets) + + cdef: + Int64List* result = Int64List_create_array(n_t) + Int64List* missing = Int64List_create_array(n_t) + + _indexer_non_unique_orderable_loop(values, targets, idx0, idx1, + result, missing) + + cdef: + Py_ssize_t nres, nmis + int64_t *cresult + int64_t *cmissing + + cresult = Int64List_concat_array(result, n_t, &nres) + cmissing = Int64List_concat_array(missing, n_t, &nmis) + + Int64List_destroy_array(result, n_t) + Int64List_destroy_array(missing, n_t) + + cdef: + cnp.npy_intp *dims0 = [nres] + cnp.npy_intp *dims1 = [nmis] + ndarray npy_result = PyArray_SimpleNewFromData(1, dims0, + NPY_INT64, cresult) + ndarray npy_missing = PyArray_SimpleNewFromData(1, dims1, + NPY_INT64, cmissing) + + return npy_result, npy_missing cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: cdef: diff --git a/pandas/indexes/base.py 
b/pandas/indexes/base.py index bb2941a121452..cd3304daec25b 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2506,10 +2506,38 @@ def get_indexer_non_unique(self, target): if self.is_all_dates: self = Index(self.asi8) tgt_values = target.asi8 + src_values = self.asi8 else: tgt_values = target._values + src_values = self._values + + try: + src_values[0] < tgt_values[0] + src_values[0] > tgt_values[0] + except (TypeError, IndexError): + orderable = False + else: + try: + if self.is_monotonic_increasing: + idx0 = np.arange(len(src_values)) + else: + idx0 = np.argsort(src_values, kind='mergesort') + + if target.is_monotonic_increasing: + idx1 = np.arange(len(tgt_values)) + else: + idx1 = np.argsort(tgt_values, kind='mergesort') + + except TypeError: + orderable = False + else: + orderable = True + + if orderable: + indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1) + else: + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return Index(indexer), missing def get_indexer_for(self, target, **kwargs): From 390bfb293961c3508c9e3194ac44672ff42346e4 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Sun, 12 Feb 2017 00:21:37 +0000 Subject: [PATCH 44/52] get_indexer_non_unique for orderable indexes --- pandas/index.pyx | 69 ++++++++++++++++++++++++++++++++++++++++++ pandas/indexes/base.py | 13 +++++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 37fe7d90bebe0..8d96a39a4948c 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -45,6 +45,47 @@ cdef extern from "Python.h": int PySlice_Check(object) +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets, + int64_t[:] idx0, + int64_t[:] idx1, + list[:] result, list[:] missing): + cdef: + Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0] + + while i < n and j < n_t: + + val0 = values[idx0[i]] + val1 = targets[idx1[j]] + + if val0 == val1: + + while i < n and values[idx0[i]] == val1: + result[idx1[j]].append(idx0[i]) + i += 1 + + j += 1 + while j < n_t and val0 == targets[idx1[j]]: + result[idx1[j]] = result[idx1[j-1]] + j += 1 + + elif val0 > val1: + + result[idx1[j]].append(-1) + missing[idx1[j]].append(idx1[j]) + j += 1 + + else: + i += 1 + + while j < n_t: + result[idx1[j]].append(-1) + missing[idx1[j]].append(idx1[j]) + j += 1 + + cdef inline is_definitely_invalid_key(object val): if PyTuple_Check(val): try: @@ -371,6 +412,34 @@ cdef class IndexEngine: return result[0:count], missing[0:count_missing] + def get_indexer_non_unique_orderable(self, ndarray targets, + int64_t[:] idx0, + int64_t[:] idx1): + + cdef: + ndarray values + object val0, val1 + Py_ssize_t i, n_t + + self._ensure_mapping_populated() + values = self._get_index_values() + n_t = len(targets) + + result = np.empty((n_t,), dtype=np.object_) + result.fill([]) + result = np.frompyfunc(list,1,1)(result) + + missing = np.empty((n_t,), dtype=np.object_) + missing.fill([]) + missing = np.frompyfunc(list,1,1)(missing) + + _indexer_non_unique_orderable_loop(values, targets, idx0, idx1, + result, missing) + + result = np.concatenate(result) + missing = np.asarray(np.concatenate(missing), np.int64) + + return result, missing cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: cdef: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 
e51824e72a2a0..664840c4b83d7 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2532,7 +2532,18 @@ def get_indexer_non_unique(self, target): else: tgt_values = target._values - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + try: + if self.is_all_dates: + idx0 = np.argsort(self.asi8, kind='mergesort') + else: + idx0 = np.argsort(self._values, kind='mergesort') + + idx1 = np.argsort(tgt_values, kind='mergesort') + indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1) + + except TypeError: + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return Index(indexer), missing def get_indexer_for(self, target, **kwargs): From f38cf52d7e7a596bd25a258cedc02c37f59d78b7 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Sun, 12 Feb 2017 20:49:08 +0000 Subject: [PATCH 45/52] C level list --- pandas/index.pyx | 152 +++++++++++++++++++++++++++++++++++------ pandas/indexes/base.py | 29 ++++++-- 2 files changed, 155 insertions(+), 26 deletions(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 8d96a39a4948c..4532523c9842e 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -3,7 +3,8 @@ from numpy cimport ndarray from numpy cimport (float64_t, int32_t, int64_t, uint8_t, - NPY_DATETIME, NPY_TIMEDELTA) + NPY_DATETIME, NPY_TIMEDELTA, PyArray_SimpleNewFromData, + NPY_INT64) cimport cython cimport numpy as cnp @@ -23,7 +24,7 @@ from pandas.tslib import Timestamp, Timedelta from datetime cimport (get_datetime64_value, _pydatetime_to_dts, pandas_datetimestruct) -from cpython cimport PyTuple_Check, PyList_Check +from cpython cimport PyTuple_Check, PyList_Check, PyMem_Malloc, PyMem_Free cdef extern from "datetime.h": bint PyDateTime_Check(object o) @@ -44,6 +45,104 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) +ctypedef struct Int64ListNode: + int64_t value + Int64ListNode *next + +ctypedef struct Int64List: + Int64ListNode *root + Int64ListNode *last + Py_ssize_t n + bint owns + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef Int64List* Int64List_create_array(Py_ssize_t n): + + cdef: + Int64List *lst = PyMem_Malloc(n * sizeof(Int64List)) + Py_ssize_t i + + for i in range(n): + lst[i].n = 0 + lst[i].root = NULL + lst[i].last = NULL + + return lst + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n): + cdef: + Int64ListNode *next + Int64ListNode *p + Py_ssize_t i + + for i in range(n): + if lst[i].owns: + p = lst[i].root + while p is not NULL: + next = p[0].next + PyMem_Free(p) + p = next + + PyMem_Free(lst) + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef void _append(Int64List *lst, int64_t x): + + cdef Int64ListNode *nn = PyMem_Malloc(sizeof(Int64ListNode)) + + nn[0].value = x + nn[0].next = NULL + + if lst[0].root is NULL: + lst[0].root = nn + lst[0].owns = 1 + else: + lst[0].last[0].next = nn + + lst[0].last = nn + lst[0].n += 1 + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef void _copy_to(Int64List *dst, Int64List *src) nogil: + dst[0].root = src[0].root + dst[0].last = src[0].last + dst[0].n = src[0].n + dst[0].owns = 0 + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initializedcheck(False) +cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n, + Py_ssize_t *nt): + nt[0] = 0 + cdef: + Py_ssize_t last = 0 + 
Int64ListNode* node + + for i in range(n): + nt[0] += lst[i].n + + cdef int64_t *data = PyMem_Malloc(nt[0] * sizeof(int64_t)) + + for i in range(n): + + node = lst[i].root + while node is not NULL: + data[last] = node[0].value + last += 1 + node = node[0].next + + return data + @cython.boundscheck(False) @cython.wraparound(False) @@ -51,7 +150,8 @@ cdef extern from "Python.h": cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1, - list[:] result, list[:] missing): + Int64List* result, + Int64List* missing): cdef: Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0] @@ -63,26 +163,26 @@ cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets, if val0 == val1: while i < n and values[idx0[i]] == val1: - result[idx1[j]].append(idx0[i]) + _append(&(result[idx1[j]]), idx0[i]) i += 1 j += 1 while j < n_t and val0 == targets[idx1[j]]: - result[idx1[j]] = result[idx1[j-1]] + _copy_to(&(result[idx1[j]]), &(result[idx1[j-1]])) j += 1 elif val0 > val1: - result[idx1[j]].append(-1) - missing[idx1[j]].append(idx1[j]) + _append(&(result[idx1[j]]), -1) + _append(&(missing[idx1[j]]), idx1[j]) j += 1 else: i += 1 while j < n_t: - result[idx1[j]].append(-1) - missing[idx1[j]].append(idx1[j]) + _append(&(result[idx1[j]]), -1) + _append(&(missing[idx1[j]]), idx1[j]) j += 1 @@ -419,27 +519,39 @@ cdef class IndexEngine: cdef: ndarray values object val0, val1 - Py_ssize_t i, n_t + Py_ssize_t n_t self._ensure_mapping_populated() values = self._get_index_values() n_t = len(targets) - result = np.empty((n_t,), dtype=np.object_) - result.fill([]) - result = np.frompyfunc(list,1,1)(result) - - missing = np.empty((n_t,), dtype=np.object_) - missing.fill([]) - missing = np.frompyfunc(list,1,1)(missing) + cdef: + Int64List* result = Int64List_create_array(n_t) + Int64List* missing = Int64List_create_array(n_t) _indexer_non_unique_orderable_loop(values, targets, idx0, idx1, result, missing) - result = np.concatenate(result) - missing = np.asarray(np.concatenate(missing), np.int64) + cdef: + Py_ssize_t nres, nmis + int64_t *cresult + int64_t *cmissing + + cresult = Int64List_concat_array(result, n_t, &nres) + cmissing = Int64List_concat_array(missing, n_t, &nmis) - return result, missing + Int64List_destroy_array(result, n_t) + Int64List_destroy_array(missing, n_t) + + cdef: + cnp.npy_intp *dims0 = [nres] + cnp.npy_intp *dims1 = [nmis] + ndarray npy_result = PyArray_SimpleNewFromData(1, dims0, + NPY_INT64, cresult) + ndarray npy_missing = PyArray_SimpleNewFromData(1, dims1, + NPY_INT64, cmissing) + + return npy_result, npy_missing cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: cdef: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 664840c4b83d7..ac50402284303 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2529,19 +2529,36 @@ def get_indexer_non_unique(self, target): if self.is_all_dates: self = Index(self.asi8) tgt_values = target.asi8 + src_values = self.asi8 else: tgt_values = target._values + src_values = self._values try: - if self.is_all_dates: - idx0 = np.argsort(self.asi8, kind='mergesort') + src_values[0] < tgt_values[0] + src_values[0] > tgt_values[0] + except TypeError: + orderable = False + else: + try: + if self.is_monotonic_increasing: + idx0 = np.arange(len(src_values)) + else: + idx0 = np.argsort(src_values, kind='mergesort') + + if target.is_monotonic_increasing: + idx1 = np.arange(len(tgt_values)) + else: + idx1 = np.argsort(tgt_values, kind='mergesort') + + except 
From 9dabf34a7b62593a0ce16a971307e403222cfe3d Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Sun, 12 Feb 2017 21:16:36 +0000
Subject: [PATCH 46/52] no gil

---
 pandas/index.pyx | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 4532523c9842e..216d26eea2b52 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -1,6 +1,7 @@
 # cython: profile=False
 
 from numpy cimport ndarray
+from libc.stdlib cimport malloc, free
 
 from numpy cimport (float64_t, int32_t, int64_t, uint8_t,
                     NPY_DATETIME, NPY_TIMEDELTA, PyArray_SimpleNewFromData,
                     NPY_INT64)
@@ -58,10 +59,10 @@ ctypedef struct Int64List:
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Int64List* Int64List_create_array(Py_ssize_t n):
+cdef Int64List* Int64List_create_array(Py_ssize_t n) nogil:
 
     cdef:
-        Int64List *lst = <Int64List *>PyMem_Malloc(n * sizeof(Int64List))
+        Int64List *lst = <Int64List *>malloc(n * sizeof(Int64List))
         Py_ssize_t i
 
     for i in range(n):
@@ -74,7 +75,7 @@ cdef Int64List* Int64List_create_array(Py_ssize_t n):
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n):
+cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n) nogil:
     cdef:
         Int64ListNode *next
         Int64ListNode *p
         Py_ssize_t i
@@ -85,17 +86,17 @@ cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n):
             p = lst[i].root
             while p is not NULL:
                 next = p[0].next
-                PyMem_Free(p)
+                free(p)
                 p = next
 
-    PyMem_Free(lst)
+    free(lst)
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void _append(Int64List *lst, int64_t x):
+cdef inline void _append(Int64List *lst, int64_t x) nogil:
 
-    cdef Int64ListNode *nn = <Int64ListNode *>PyMem_Malloc(sizeof(Int64ListNode))
+    cdef Int64ListNode *nn = <Int64ListNode *>malloc(sizeof(Int64ListNode))
 
     nn[0].value = x
     nn[0].next = NULL
@@ -112,7 +113,7 @@ cdef void _append(Int64List *lst, int64_t x):
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void _copy_to(Int64List *dst, Int64List *src) nogil:
+cdef inline void _copy_to(Int64List *dst, Int64List *src) nogil:
     dst[0].root = src[0].root
     dst[0].last = src[0].last
     dst[0].n = src[0].n
@@ -122,7 +123,7 @@ cdef void _copy_to(Int64List *dst, Int64List *src) nogil:
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
-                                     Py_ssize_t *nt):
+                                     Py_ssize_t *nt) nogil:
     nt[0] = 0
     cdef:
         Py_ssize_t last = 0
@@ -131,7 +132,7 @@ cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
     for i in range(n):
         nt[0] += lst[i].n
 
-    cdef int64_t *data = <int64_t *>PyMem_Malloc(nt[0] * sizeof(int64_t))
+    cdef int64_t *data = <int64_t *>malloc(nt[0] * sizeof(int64_t))
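[Note] The allocator change in PATCH 46 is what makes the nogil annotations
legal: PyMem_Malloc and PyMem_Free may only be called while holding the GIL,
whereas libc malloc/free carry no such requirement. A minimal Cython sketch of
the pattern (illustrative, not patch code):

    from libc.stdlib cimport malloc, free
    from numpy cimport int64_t

    cdef int64_t fill_and_sum(Py_ssize_t n) nogil:
        # libc allocation is safe without the GIL; PyMem_Malloc is not,
        # hence the switch when these helpers gained `nogil`.
        cdef int64_t *buf = <int64_t *>malloc(n * sizeof(int64_t))
        cdef int64_t total = 0
        cdef Py_ssize_t i
        if buf == NULL:
            return -1
        for i in range(n):
            buf[i] = i
            total += buf[i]
        free(buf)
        return total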
From f61b98f64ac25f19bee1c72b12662e2c8ae7a53d Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Mon, 13 Feb 2017 07:27:28 +0000
Subject: [PATCH 47/52] capture index error

---
 pandas/indexes/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index ac50402284303..b8c98d010405c 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2537,7 +2537,7 @@ def get_indexer_non_unique(self, target):
         try:
             src_values[0] < tgt_values[0]
             src_values[0] > tgt_values[0]
-        except TypeError:
+        except TypeError or IndexError:
             orderable = False
         else:
             try:

From 6afb8c910259fe8b235a18cce0888e29533943d2 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Mon, 13 Feb 2017 08:32:04 +0000
Subject: [PATCH 48/52] wrong exception handling

---
 pandas/indexes/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index b8c98d010405c..f9a76dd58c989 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2537,7 +2537,7 @@ def get_indexer_non_unique(self, target):
         try:
             src_values[0] < tgt_values[0]
             src_values[0] > tgt_values[0]
-        except TypeError or IndexError:
+        except (TypeError, IndexError):
             orderable = False
         else:
             try:
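[Note] PATCHES 47 and 48 are a pair: the orderability probe `src_values[0]`
can raise IndexError on an empty index, but `except TypeError or IndexError:`
does not catch it, because the expression after `except` is evaluated first
and `TypeError or IndexError` is simply `TypeError` (a class is truthy). The
tuple form is the correct one:

    # `TypeError or IndexError` evaluates to TypeError, so PATCH 47
    # still only caught TypeError:
    assert (TypeError or IndexError) is TypeError

    try:
        [][0]                          # IndexError, as on an empty index
    except (TypeError, IndexError):    # the tuple form from PATCH 48
        pass                           # now caught as intended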
From bf4b3f585a8d65827bd984c76797c689e72c35fc Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Mon, 20 Feb 2017 10:53:17 +0000
Subject: [PATCH 49/52] fixed-size arrays for get_index mapping

---
 pandas/index.pyx | 187 ++++++++++++++++++-----------------------
 1 file changed, 70 insertions(+), 117 deletions(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 216d26eea2b52..566b0872641a6 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -59,133 +59,114 @@ ctypedef struct Int64List:
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Int64List* Int64List_create_array(Py_ssize_t n) nogil:
+cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
+            int64_t[:] mapping_count, int64_t[:] missing_count):
 
     cdef:
-        Int64List *lst = <Int64List *>malloc(n * sizeof(Int64List))
-        Py_ssize_t i
+        int64_t n_v = values.shape[0]
+        int64_t n_t = targets.shape[0]
+        int64_t i = 0
+        int64_t j = 0
 
-    for i in range(n):
-        lst[i].n = 0
-        lst[i].root = NULL
-        lst[i].last = NULL
+    while i < n_v and j < n_t:
 
-    return lst
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n) nogil:
-    cdef:
-        Int64ListNode *next
-        Int64ListNode *p
-        Py_ssize_t i
-
-    for i in range(n):
-        if lst[i].owns:
-            p = lst[i].root
-            while p is not NULL:
-                next = p[0].next
-                free(p)
-                p = next
+        val0 = values[idx0[i]]
+        val1 = targets[idx1[j]]
 
-    free(lst)
+        if val0 == val1:
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef inline void _append(Int64List *lst, int64_t x) nogil:
+            while i < n_v and values[idx0[i]] == val1:
+                i += 1
+                mapping_count[idx1[j]] += 1
 
-    cdef Int64ListNode *nn = <Int64ListNode *>malloc(sizeof(Int64ListNode))
+            j += 1
+            while j < n_t and val0 == targets[idx1[j]]:
+                mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
+                j += 1
 
-    nn[0].value = x
-    nn[0].next = NULL
+        elif val0 > val1:
 
-    if lst[0].root is NULL:
-        lst[0].root = nn
-        lst[0].owns = 1
-    else:
-        lst[0].last[0].next = nn
+            mapping_count[idx1[j]] += 1
+            missing_count[idx1[j]] = 1
+            j += 1
 
-    lst[0].last = nn
-    lst[0].n += 1
+        else:
+            i += 1
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef inline void _copy_to(Int64List *dst, Int64List *src) nogil:
-    dst[0].root = src[0].root
-    dst[0].last = src[0].last
-    dst[0].n = src[0].n
-    dst[0].owns = 0
+    while j < n_t:
+        mapping_count[idx1[j]] += 1
+        missing_count[idx1[j]] = 1
+        j += 1
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
-                                     Py_ssize_t *nt) nogil:
-    nt[0] = 0
-    cdef:
-        Py_ssize_t last = 0
-        Int64ListNode* node
+cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
+          int64_t[:] start_mapping, int64_t[:] start_missing,
+          int64_t[:] mapping, int64_t[:] missing):
 
-    for i in range(n):
-        nt[0] += lst[i].n
-
-    cdef int64_t *data = <int64_t *>malloc(nt[0] * sizeof(int64_t))
-
-    for i in range(n):
-
-        node = lst[i].root
-        while node is not NULL:
-            data[last] = node[0].value
-            last += 1
-            node = node[0].next
-
-    return data
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets,
-                                        int64_t[:] idx0,
-                                        int64_t[:] idx1,
-                                        Int64List* result,
-                                        Int64List* missing):
     cdef:
-        Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0]
+        int64_t n_v = values.shape[0]
+        int64_t n_t = targets.shape[0]
+        int64_t i = 0
+        int64_t j = 0
+        int64_t c
 
-    while i < n and j < n_t:
+    while i < n_v and j < n_t:
 
         val0 = values[idx0[i]]
         val1 = targets[idx1[j]]
 
         if val0 == val1:
 
-            while i < n and values[idx0[i]] == val1:
-                _append(&(result[idx1[j]]), idx0[i])
-                i += 1
+            c = 0
+            while i < n_v and values[idx0[i]] == val1:
+                mapping[start_mapping[idx1[j]] + c] = idx0[i]
+                i += 1
+                c += 1
 
             j += 1
             while j < n_t and val0 == targets[idx1[j]]:
-                _copy_to(&(result[idx1[j]]), &(result[idx1[j-1]]))
+                for ii in range(c):
+                    mapping[start_mapping[idx1[j]] + ii] = \
+                        mapping[start_mapping[idx1[j-1]] + ii]
                 j += 1
 
         elif val0 > val1:
 
-            _append(&(result[idx1[j]]), -1)
-            _append(&(missing[idx1[j]]), idx1[j])
+            mapping[start_mapping[idx1[j]]] = -1
+            missing[start_missing[idx1[j]]] = idx1[j]
             j += 1
 
         else:
             i += 1
 
     while j < n_t:
-        _append(&(result[idx1[j]]), -1)
-        _append(&(missing[idx1[j]]), idx1[j])
+
+        mapping[start_mapping[idx1[j]]] = -1
+        missing[start_missing[idx1[j]]] = idx1[j]
         j += 1
 
+def _map_targets_to_values(values, targets, idx0, idx1):
+    mapping_count = np.zeros(len(targets), int)
+    missing_count = np.zeros(len(targets), int)
+
+    _count(values, targets, idx0, idx1, mapping_count, missing_count)
+
+    np.cumsum(mapping_count, out=mapping_count)
+    np.cumsum(missing_count, out=missing_count)
+
+    mapping = np.empty(mapping_count[-1], int)
+    missing = np.empty(missing_count[-1], int)
+
+    mapping_count[1:] = mapping_count[:-1]
+    mapping_count[0] = 0
+    missing_count -= 1
+
+    _map(values, targets, idx0, idx1, mapping_count, missing_count, mapping,
+         missing)
+
+    return mapping, missing
 
 cdef inline is_definitely_invalid_key(object val):
     if PyTuple_Check(val):
@@ -524,35 +505,7 @@ cdef class IndexEngine:
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
-        n_t = len(targets)
-
-        cdef:
-            Int64List* result = Int64List_create_array(n_t)
-            Int64List* missing = Int64List_create_array(n_t)
-
-        _indexer_non_unique_orderable_loop(values, targets, idx0, idx1,
-                                           result, missing)
-
-        cdef:
-            Py_ssize_t nres, nmis
-            int64_t *cresult
-            int64_t *cmissing
-
-        cresult = Int64List_concat_array(result, n_t, &nres)
-        cmissing = Int64List_concat_array(missing, n_t, &nmis)
-
-        Int64List_destroy_array(result, n_t)
-        Int64List_destroy_array(missing, n_t)
-
-        cdef:
-            cnp.npy_intp *dims0 = [nres]
-            cnp.npy_intp *dims1 = [nmis]
-            ndarray npy_result = PyArray_SimpleNewFromData(1, dims0,
-                                                           NPY_INT64, cresult)
-            ndarray npy_missing = PyArray_SimpleNewFromData(1, dims1,
-                                                            NPY_INT64, cmissing)
-
-        return npy_result, npy_missing
+        return _map_targets_to_values(values, targets, idx0, idx1)
 
 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
     cdef:
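[Note] PATCH 49 retires the linked lists in favour of a two-pass layout:
_count measures how many result slots each target needs (a miss still needs
one slot for its -1), an in-place np.cumsum followed by the shift
`mapping_count[1:] = mapping_count[:-1]; mapping_count[0] = 0` turns those
counts into exclusive start offsets, and _map writes into a single
preallocated buffer. The 0/1 missing flags are instead cumsum'd and
decremented, giving each missing target its one slot. A compact NumPy sketch
of the count/cumsum/fill pattern (illustrative names; the quadratic lookups
here stand in for the patch's linear merge passes):

    import numpy as np

    def count_cumsum_fill(values, targets):
        # pass 1: slots needed per target (a miss still writes one -1)
        counts = np.array([max(int((values == t).sum()), 1) for t in targets],
                          dtype=np.int64)

        # exclusive prefix sum = start offset of each target's slot range
        starts = np.zeros(len(targets), dtype=np.int64)
        np.cumsum(counts[:-1], out=starts[1:])

        # pass 2: fill one preallocated buffer, no per-match allocation
        out = np.empty(int(counts.sum()), dtype=np.int64)
        for j, t in enumerate(targets):
            hits = np.flatnonzero(values == t)
            if hits.size:
                out[starts[j]:starts[j] + hits.size] = hits
            else:
                out[starts[j]] = -1
        return out

Beyond simplicity, the fixed-size buffers avoid one heap allocation per
matched position, which the linked-list version paid in _append.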
From 0f37a64c7b53a240703c4fd47c6f35652b1f0fd6 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Mon, 20 Feb 2017 12:12:58 +0000 Subject: [PATCH 50/52] dtype=np.int64 --- pandas/indexes/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index f9a76dd58c989..2ef5aaab7fb14 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2542,14 +2542,16 @@ def get_indexer_non_unique(self, target): else: try: if self.is_monotonic_increasing: - idx0 = np.arange(len(src_values)) + idx0 = np.arange(len(src_values), dtype=np.int64) else: idx0 = np.argsort(src_values, kind='mergesort') + idx0 = np.asarray(idx0, dtype=np.int64) if target.is_monotonic_increasing: - idx1 = np.arange(len(tgt_values)) + idx1 = np.arange(len(tgt_values), dtype=np.int64) else: idx1 = np.argsort(tgt_values, kind='mergesort') + idx1 = np.asarray(idx1, dtype=np.int64) except TypeError: orderable = False From 3c218cecd8182f9b80fa0728a1738b8613c2f854 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Mon, 20 Feb 2017 12:59:40 +0000 Subject: [PATCH 51/52] empty and zeros with np.int64 --- pandas/index.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 566b0872641a6..1c8ed72d07b3e 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -148,16 +148,16 @@ cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1, j += 1 def _map_targets_to_values(values, targets, idx0, idx1): - mapping_count = np.zeros(len(targets), int) - missing_count = np.zeros(len(targets), int) + mapping_count = np.zeros(len(targets), np.int64) + missing_count = np.zeros(len(targets), np.int64) _count(values, targets, idx0, idx1, mapping_count, missing_count) np.cumsum(mapping_count, out=mapping_count) np.cumsum(missing_count, out=missing_count) - mapping = np.empty(mapping_count[-1], int) - missing = np.empty(missing_count[-1], int) + mapping = np.empty(mapping_count[-1], np.int64) + missing = np.empty(missing_count[-1], np.int64) mapping_count[1:] = mapping_count[:-1] mapping_count[0] = 0 From 74ce239e4b3e306b1dba64a84086c5ae1e757ef5 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Mon, 20 Feb 2017 13:18:53 +0000 Subject: [PATCH 52/52] as array --- pandas/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 1c8ed72d07b3e..6e79008fcf4d2 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -504,7 +504,7 @@ cdef class IndexEngine: Py_ssize_t n_t self._ensure_mapping_populated() - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) return _map_targets_to_values(values, targets, idx0, idx1) cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
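[Note] PATCHES 50-52 pin every index buffer to np.int64: np.zeros(n, int) and
np.arange(n) use the platform default integer, which is 32-bit on some
platforms (notably 64-bit Windows), while the Cython signatures require
int64_t memoryviews; np.array(..., copy=False) likewise ensures
_map_targets_to_values receives a true ndarray without copying when the index
values already are one. End to end, the series computes, for each target, the
positions of every matching index value, or -1 (plus an entry in `missing`)
when the target is absent. A pure-NumPy reference for that contract
(illustrative only; it uses searchsorted instead of the patch's merge and
assumes the same orderability the patches test for):

    import numpy as np

    def get_indexer_non_unique_ref(values, targets):
        order = np.asarray(np.argsort(values, kind='mergesort'),
                           dtype=np.int64)
        svals = values[order]
        lo = np.searchsorted(svals, targets, side='left')
        hi = np.searchsorted(svals, targets, side='right')

        indexer, missing = [], []
        for j in range(len(targets)):
            if lo[j] == hi[j]:                 # target absent
                indexer.append(-1)
                missing.append(j)
            else:                              # all positions holding it
                indexer.extend(order[lo[j]:hi[j]])
        return (np.asarray(indexer, dtype=np.int64),
                np.asarray(missing, dtype=np.int64))

For example, values [5, 2, 2, 9] against targets [2, 3, 9] yields
indexer [1, 2, -1, 3] and missing [1].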