diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index dcea59545aae3..6a739873a032f 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -43,6 +43,7 @@ "numexpr": [], "pytables": [], "openpyxl": [], + "xlsxwriter": [], "xlrd": [], "xlwt": [] }, diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 9bece56e15c90..a04a9d0814a30 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -930,6 +930,16 @@ def time_frame_xs_row(self): self.df.xs(50000) +class frame_sort_index(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + + def time_frame_sort_index(self): + self.df.sort_index() + + class series_string_vector_slice(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 4b82781fc39d9..eeca2d54381b2 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -320,3 +320,49 @@ def time_nogil_kth_smallest(self): def run(arr): algos.kth_smallest(arr, self.k) run() + +class nogil_datetime_fields(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000000 + self.dti = pd.date_range('1900-01-01', periods=self.N, freq='T') + self.period = self.dti.to_period('D') + if (not have_real_test_parallel): + raise NotImplementedError + + def time_datetime_field_year(self): + @test_parallel(num_threads=2) + def run(dti): + dti.year + run(self.dti) + + def time_datetime_field_day(self): + @test_parallel(num_threads=2) + def run(dti): + dti.day + run(self.dti) + + def time_datetime_field_daysinmonth(self): + @test_parallel(num_threads=2) + def run(dti): + dti.days_in_month + run(self.dti) + + def time_datetime_field_normalize(self): + @test_parallel(num_threads=2) + def run(dti): + dti.normalize() + run(self.dti) + + def time_datetime_to_period(self): + @test_parallel(num_threads=2) + def run(dti): + dti.to_period('S') + run(self.dti) + + def time_period_to_datetime(self): + @test_parallel(num_threads=2) + def run(period): + period.to_timestamp() + run(self.period) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 37969a6949157..a40ed3f1d6482 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -71,3 +71,23 @@ def setup(self): def time_series_nsmallest2(self): self.s2.nsmallest(3, take_last=True) self.s2.nsmallest(3, take_last=False) + + +class series_dropna_int64(object): + goal_time = 0.2 + + def setup(self): + self.s = Series(np.random.randint(1, 10, 1000000)) + + def time_series_dropna_int64(self): + self.s.dropna() + +class series_dropna_datetime(object): + goal_time = 0.2 + + def setup(self): + self.s = Series(pd.date_range('2000-01-01', freq='S', periods=1000000)) + self.s[np.random.randint(1, 1000000, 100)] = pd.NaT + + def time_series_dropna_datetime(self): + self.s.dropna() diff --git a/ci/install_conda.sh b/ci/install_conda.sh index 8d99034a86109..6873a1656a8a4 100755 --- a/ci/install_conda.sh +++ b/ci/install_conda.sh @@ -73,7 +73,7 @@ bash miniconda.sh -b -p $HOME/miniconda || exit 1 conda config --set always_yes yes --set changeps1 no || exit 1 conda update -q conda || exit 1 conda config --add channels conda-forge || exit 1 -conda config --add channels http://conda.binstar.org/pandas || exit 1 +conda config --add channels http://conda.anaconda.org/pandas || exit 1 conda config --set ssl_verify false || exit 1 # Useful for debugging any issues with conda
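The ``nogil_datetime_fields`` benchmarks added above all follow one pattern: run the same datetime operation on two threads at once via the ``test_parallel`` decorator, so the timings only improve if the underlying Cython code actually releases the GIL. A minimal sketch of what such a decorator can look like, using only the standard library (``run_parallel`` is an illustrative name; the real ``test_parallel`` lives in pandas' test utilities and may differ in detail):

import threading

def run_parallel(num_threads=2):
    # hypothetical stand-in for pandas' test_parallel decorator
    def decorate(fn):
        def inner(*args, **kwargs):
            threads = [threading.Thread(target=fn, args=args, kwargs=kwargs)
                       for _ in range(num_threads)]
            for t in threads:
                t.start()
            for t in threads:
                # wall time stays roughly flat as num_threads grows only
                # if fn spends its time in GIL-releasing (nogil) code
                t.join()
        return inner
    return decorate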
diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 644457d69b37f..9bc533110cea3 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -2,3 +2,5 @@ blosc httplib2 google-api-python-client == 1.2 python-gflags == 2.0 +pathlib +py diff --git a/ci/requirements-2.7_SLOW.pip b/ci/requirements-2.7_SLOW.pip new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ci/requirements-3.4.build b/ci/requirements-3.4.build index 6fdffd41bd4c4..8e2a952b840f7 100644 --- a/ci/requirements-3.4.build +++ b/ci/requirements-3.4.build @@ -2,3 +2,4 @@ python-dateutil pytz numpy=1.8.1 cython +libgfortran diff --git a/doc/source/conf.py b/doc/source/conf.py index f2a033eb82d9c..23095b7f4d24b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -299,8 +299,9 @@ intersphinx_mapping = { 'statsmodels': ('http://statsmodels.sourceforge.net/devel/', None), 'matplotlib': ('http://matplotlib.org/', None), - 'python': ('http://docs.python.org/', None), - 'numpy': ('http://docs.scipy.org/doc/numpy', None) + 'python': ('http://docs.python.org/3', None), + 'numpy': ('http://docs.scipy.org/doc/numpy', None), + 'py': ('http://pylib.readthedocs.org/en/latest/', None) } import glob autosummary_generate = glob.glob("*.rst") diff --git a/doc/source/io.rst b/doc/source/io.rst index 014daa3f68dbb..a7c0d31189a75 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -79,9 +79,10 @@ for some advanced strategies They can take a number of arguments: - - ``filepath_or_buffer``: Either a string path to a file, URL + - ``filepath_or_buffer``: Either a path to a file (a :class:`python:str`, + :class:`python:pathlib.Path`, or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), or any object with a ``read`` - method (such as an open file or ``StringIO``). + method (such as an open file or :class:`~python:io.StringIO`). - ``sep`` or ``delimiter``: A delimiter / separator to split fields on. With ``sep=None``, ``read_csv`` will try to infer the delimiter automatically in some cases by "sniffing". diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 1eff7d01d9d91..6171e409652f3 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -17,6 +17,7 @@ Highlights include: Enhancements ~~~~~~~~~~~~ +- ``DatetimeIndex`` now supports conversion to strings with ``astype(str)`` (:issue:`10442`) - Support for ``compression`` (gzip/bz2) in :method:`DataFrame.to_csv` (:issue:`7615`) @@ -27,6 +28,10 @@ Enhancements Other Enhancements ^^^^^^^^^^^^^^^^^^ +- ``pd.read_*`` functions can now also accept :class:`python:pathlib.Path` or :class:`py:py._path.local.LocalPath` + objects for the ``filepath_or_buffer`` argument. (:issue:`11033`) +- Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`) + .. _whatsnew_0171.api: API changes @@ -37,17 +42,31 @@ API changes - Regression from 0.16.2 for output formatting of long floats/nan, restored in (:issue:`11302`) - Prettyprinting sets (e.g. in DataFrame cells) now uses set literal syntax (``{x, y}``) instead of Legacy Python syntax (``set([x, y])``) (:issue:`11215`) +- Indexing with a null key will raise a ``TypeError``, instead of a ``ValueError`` (:issue:`11356`)
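The last API change above (:issue:`11356`) is easiest to see with a small example; a hedged sketch of the intended 0.17.1 behavior:

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
try:
    s.loc[np.nan]  # label indexing with a null key
except TypeError:
    # 0.17.1 raises TypeError here; earlier versions raised ValueError
    print('null keys are rejected with a TypeError')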
.. _whatsnew_0171.deprecations: Deprecations ^^^^^^^^^^^^ +- The ``pandas.io.ga`` module which implements ``google-analytics`` support is deprecated and will be removed in a future version (:issue:`11308`) +- Deprecate the ``engine`` keyword from ``.to_csv()``, which will be removed in a future version (:issue:`11274`) + + .. _whatsnew_0171.performance: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Checking monotonic-ness before sorting on an index (:issue:`11080`) +- ``Series.dropna`` performance improvement when its dtype can't contain ``NaN`` (:issue:`11159`) + + +- Release the GIL on most datetime field operations (e.g. ``DatetimeIndex.year``, ``Series.dt.year``), normalization, and conversion to and from ``Period``, ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestamp`` (:issue:`11263`) + + +- Improved performance of ``to_excel`` (:issue:`11352`) + .. _whatsnew_0171.bug_fixes: Bug Fixes ~~~~~~~~~ @@ -58,13 +77,19 @@ Bug Fixes - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`) -- Bug in tz-conversions with an ambiguous time and ``.dt`` accessors (:issues:`11295`) + +- Bug in tz-conversions with an ambiguous time and ``.dt`` accessors (:issue:`11295`) +- Bug in comparisons of Series vs list-likes (:issue:`11339`) +- Bug in ``DataFrame.replace`` with a ``datetime64[ns, tz]`` and a non-compat to_replace (:issue:`11326`, :issue:`11153`) +- Bug in list-like indexing with a mixed-integer Index (:issue:`11320`) +- Bug in ``pivot_table`` with ``margins=True`` when indexes are of ``Categorical`` dtype (:issue:`10993`) +- Bug in ``DataFrame.plot`` could not use hex string colors (:issue:`10299`) @@ -88,5 +113,12 @@ Bug Fixes - Bugs in ``to_excel`` with duplicate columns (:issue:`11007`, :issue:`10982`, :issue:`10970`) + - Fixed a bug that prevented the construction of an empty series of dtype ``datetime64[ns, tz]`` (:issue:`11245`).
+ +- Bug in ``read_excel`` with multi-index containing integers (:issue:`11317`) + +- Bug in ``to_excel`` with openpyxl 2.2+ and merging (:issue:`11408`) + +- Bug in ``DataFrame.to_dict()`` producing a ``np.datetime64`` object instead of ``Timestamp`` when only datetime is present in data (:issue:`11327`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 724843d379f64..c2c50bce04309 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -5,8 +5,6 @@ import re import collections import numbers -import codecs -import csv import types from datetime import datetime, timedelta from functools import partial @@ -19,7 +17,7 @@ import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import StringIO, BytesIO, range, long, u, zip, map, string_types, iteritems +from pandas.compat import BytesIO, range, long, u, zip, map, string_types, iteritems from pandas.core.dtypes import CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType from pandas.core.config import get_option @@ -446,14 +444,24 @@ def mask_missing(arr, values_to_mask): mask = None for x in nonna: if mask is None: + + # numpy elementwise comparison warning + if is_numeric_v_string_like(arr, x): + mask = False + else: + mask = arr == x # if x is a string and arr is not, then we get False and we must # expand the mask to size arr.shape if np.isscalar(mask): mask = np.zeros(arr.shape, dtype=bool) else: + + # numpy elementwise comparison warning + if is_numeric_v_string_like(arr, x): + mask |= False + else: + mask |= arr == x if na_mask.any(): if mask is None: @@ -2384,6 +2392,9 @@ def _maybe_make_list(obj): is_complex = lib.is_complex +def is_string_like(obj): + return isinstance(obj, (compat.text_type, compat.string_types)) + def is_iterator(obj): # python 3 generators have __next__ instead of next return hasattr(obj, 'next') or hasattr(obj, '__next__') @@ -2527,6 +2538,27 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): return issubclass(tipo, (np.datetime64, np.timedelta64)) +def is_numeric_v_string_like(a, b): + """ + numpy doesn't like to compare numeric arrays vs scalar string-likes + + return a boolean result if this is the case for a, b or b, a + + """ + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + is_a_numeric_array = is_a_array and is_numeric_dtype(a) + is_b_numeric_array = is_b_array and is_numeric_dtype(b) + + is_a_scalar_string_like = not is_a_array and is_string_like(a) + is_b_scalar_string_like = not is_b_array and is_string_like(b) + + return ( + is_a_numeric_array and is_b_scalar_string_like) or ( + is_b_numeric_array and is_a_scalar_string_like + ) + def is_datetimelike_v_numeric(a, b): # return if we have an i8 convertible and numeric comparision if not hasattr(a,'dtype'): @@ -2808,154 +2840,6 @@ def _all_none(*args): return True -class UTF8Recoder: - - """ - Iterator that reads an encoded stream and reencodes the input to UTF-8 - """ - - def __init__(self, f, encoding): - self.reader = codecs.getreader(encoding)(f) - - def __iter__(self): - return self - - def read(self, bytes=-1): - return self.reader.read(bytes).encode('utf-8') - - def readline(self): - return self.reader.readline().encode('utf-8') - - def next(self): - return next(self.reader).encode("utf-8") - - # Python 3 iterator - __next__ = next - - -def _get_handle(path, mode, encoding=None, compression=None): - """Gets file handle for given path and mode.
- NOTE: Under Python 3.2, getting a compressed file handle means reading in - the entire file, decompressing it and decoding it to ``str`` all at once - and then wrapping it in a StringIO. - """ - if compression is not None: - if encoding is not None and not compat.PY3: - msg = 'encoding + compression not yet supported in Python 2' - raise ValueError(msg) - - if compression == 'gzip': - import gzip - f = gzip.GzipFile(path, mode) - elif compression == 'bz2': - import bz2 - f = bz2.BZ2File(path, mode) - else: - raise ValueError('Unrecognized compression type: %s' % - compression) - if compat.PY3: - from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding) - return f - else: - if compat.PY3: - if encoding: - f = open(path, mode, encoding=encoding) - else: - f = open(path, mode, errors='replace') - else: - f = open(path, mode) - - return f - - -if compat.PY3: # pragma: no cover - def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): - # ignore encoding - return csv.reader(f, dialect=dialect, **kwds) - - def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): - return csv.writer(f, dialect=dialect, **kwds) -else: - class UnicodeReader: - - """ - A CSV reader which will iterate over lines in the CSV file "f", - which is encoded in the given encoding. - - On Python 3, this is replaced (below) by csv.reader, which handles - unicode. - """ - - def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): - f = UTF8Recoder(f, encoding) - self.reader = csv.reader(f, dialect=dialect, **kwds) - - def next(self): - row = next(self.reader) - return [compat.text_type(s, "utf-8") for s in row] - - # python 3 iterator - __next__ = next - - def __iter__(self): # pragma: no cover - return self - - class UnicodeWriter: - - """ - A CSV writer which will write rows to CSV file "f", - which is encoded in the given encoding. - """ - - def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): - # Redirect output to a queue - self.queue = StringIO() - self.writer = csv.writer(self.queue, dialect=dialect, **kwds) - self.stream = f - self.encoder = codecs.getincrementalencoder(encoding)() - self.quoting = kwds.get("quoting", None) - - def writerow(self, row): - def _check_as_is(x): - return (self.quoting == csv.QUOTE_NONNUMERIC and - is_number(x)) or isinstance(x, str) - - row = [x if _check_as_is(x) - else pprint_thing(x).encode('utf-8') for x in row] - - self.writer.writerow([s for s in row]) - # Fetch UTF-8 output from the queue ... - data = self.queue.getvalue() - data = data.decode("utf-8") - # ... and reencode it into the target encoding - data = self.encoder.encode(data) - # write to the target stream - self.stream.write(data) - # empty queue - self.queue.truncate(0) - - def writerows(self, rows): - def _check_as_is(x): - return (self.quoting == csv.QUOTE_NONNUMERIC and - is_number(x)) or isinstance(x, str) - - for i, row in enumerate(rows): - rows[i] = [x if _check_as_is(x) - else pprint_thing(x).encode('utf-8') for x in row] - - self.writer.writerows([[s for s in row] for row in rows]) - # Fetch UTF-8 output from the queue ... - data = self.queue.getvalue() - data = data.decode("utf-8") - # ... 
and reencode it into the target encoding - data = self.encoder.encode(data) - # write to the target stream - self.stream.write(data) - # empty queue - self.queue.truncate(0) - - def get_dtype_kinds(l): """ Parameters diff --git a/pandas/core/format.py b/pandas/core/format.py index bf9b3bc8040de..efa4b182f1133 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -13,6 +13,7 @@ OrderedDict) from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option, set_option +from pandas.io.common import _get_handle, UnicodeWriter import pandas.core.common as com import pandas.lib as lib from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime @@ -23,6 +24,7 @@ import itertools import csv +import warnings common_docstring = """ Parameters @@ -1264,7 +1266,11 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): - self.engine = engine # remove for 0.13 + if engine is not None: + warnings.warn("'engine' keyword is deprecated and " + "will be removed in a future version", + FutureWarning, stacklevel=3) + self.engine = engine # remove for 0.18 self.obj = obj if path_or_buf is None: @@ -1470,8 +1476,8 @@ def save(self): f = self.path_or_buf close = False else: - f = com._get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, + f = _get_handle(self.path_or_buf, self.mode, + encoding=self.encoding, compression=self.compression) close = True @@ -1483,7 +1489,7 @@ def save(self): quotechar=self.quotechar) if self.encoding is not None: writer_kwargs['encoding'] = self.encoding - self.writer = com.UnicodeWriter(f, **writer_kwargs) + self.writer = UnicodeWriter(f, **writer_kwargs) else: self.writer = csv.writer(f, **writer_kwargs) @@ -1702,9 +1708,9 @@ def _format_value(self, val): if lib.checknull(val): val = self.na_rep elif com.is_float(val): - if np.isposinf(val): + if lib.isposinf_scalar(val): val = self.inf_rep - elif np.isneginf(val): + elif lib.isneginf_scalar(val): val = '-%s' % self.inf_rep elif self.float_format is not None: val = float(self.float_format % val) @@ -1723,7 +1729,7 @@ def _format_header_mi(self): return columns = self.columns - level_strs = columns.format(sparsify=True, adjoin=False, names=False) + level_strs = columns.format(sparsify=self.merge_cells, adjoin=False, names=False) level_lengths = _get_level_lengths(level_strs) coloffset = 0 lnum = 0 @@ -1867,8 +1873,9 @@ def _format_hierarchical_rows(self): # MultiIndex columns require an extra row # with index names (blank if None) for - # unambigous round-trip - if isinstance(self.columns, MultiIndex): + # unambiguous round-trip, unless not merging, + # in which case the names all go on one row Issue #11328 + if isinstance(self.columns, MultiIndex) and self.merge_cells: self.rowcounter += 1 # if index labels are not empty go ahead and dump diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 920d9ad96c5b6..827373c9a330b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -802,11 +802,12 @@ def to_dict(self, orient='dict'): elif orient.lower().startswith('sp'): return {'index': self.index.tolist(), 'columns': self.columns.tolist(), - 'data': self.values.tolist()} + 'data': lib.map_infer(self.values.ravel(), _maybe_box_datetimelike) + .reshape(self.values.shape).tolist()} elif orient.lower().startswith('s'): - return dict((k, v) for k, v in compat.iteritems(self)) + return dict((k, _maybe_box_datetimelike(v)) for k, v in compat.iteritems(self))
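# A quick sketch of what the _maybe_box_datetimelike boxing above buys
# (:issue:`11327`): datetimelike values coming out of to_dict() are returned
# as Timestamp rather than raw np.datetime64. Assuming pandas >= 0.17.1:
#
#     df = pd.DataFrame({'a': pd.to_datetime(['2015-01-01'])})
#     df.to_dict(orient='records')[0]['a']  # Timestamp, not np.datetime64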
elif orient.lower().startswith('r'): - return [dict((k, v) for k, v in zip(self.columns, row)) + return [dict((k, _maybe_box_datetimelike(v)) for k, v in zip(self.columns, row)) for row in self.values] elif orient.lower().startswith('i'): return dict((k, v.to_dict()) for k, v in self.iterrows()) @@ -3157,6 +3158,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: from pandas.core.groupby import _nargsort + # GH11080 - Check monotonic-ness before sorting an index + # if monotonic (already sorted), return None or copy() according to 'inplace' + if (ascending and labels.is_monotonic_increasing) or \ + (not ascending and labels.is_monotonic_decreasing): + if inplace: + return + else: + return self.copy() + indexer = _nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 98f9677fb6784..248203c259aaa 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2999,8 +2999,6 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, '{0!r}').format(type(to_replace).__name__) raise TypeError(msg) # pragma: no cover - new_data = new_data.convert(copy=not inplace, numeric=False) - if inplace: self._update_inplace(new_data) else: diff --git a/pandas/core/index.py b/pandas/core/index.py index 256ece6539b6f..7049ac33feac6 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -627,6 +627,10 @@ def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, dtype=dtype) + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self + def to_datetime(self, dayfirst=False): """ For an Index containing strings or datetime.datetime objects, attempt @@ -862,9 +866,10 @@ def to_int(): return self._invalid_indexer('label', key) if is_float(key): - if not self.is_floating(): - warnings.warn("scalar indexers for index type {0} should be integers and not floating point".format( - type(self).__name__), FutureWarning, stacklevel=3) + if isnull(key): + return self._invalid_indexer('label', key) + warnings.warn("scalar indexers for index type {0} should be integers and not floating point".format( + type(self).__name__), FutureWarning, stacklevel=3) return to_int() return key @@ -982,10 +987,6 @@ def _convert_list_indexer(self, keyarr, kind=None): if kind in [None, 'iloc', 'ix'] and is_integer_dtype(keyarr) \ and not self.is_floating() and not isinstance(keyarr, ABCPeriodIndex): - if self.inferred_type != 'integer': - keyarr = np.where(keyarr < 0, - len(self) + keyarr, keyarr) - if self.inferred_type == 'mixed-integer': indexer = self.get_indexer(keyarr) if (indexer >= 0).all(): @@ -998,6 +999,8 @@ def _convert_list_indexer(self, keyarr, kind=None): return maybe_convert_indices(indexer, len(self)) elif not self.inferred_type == 'integer': + keyarr = np.where(keyarr < 0, + len(self) + keyarr, keyarr) return keyarr return None @@ -3191,6 +3194,10 @@ def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 return duplicated_int64(self.codes.astype('i8'), keep) + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.astype('object') + def get_loc(self, key, method=None): """ Get integer location for requested label @@ -3723,9 +3730,23 @@ def astype(self, dtype): return Index(self._values, name=self.name, dtype=dtype) def _convert_scalar_indexer(self, key, kind=None): + """ + convert a scalar indexer + + Parameters +
---------- + key : label of the slice bound + kind : optional, type of the indexing operation (loc/ix/iloc/None) + + right now we are converting + floats -> ints if the index supports it + """ + if kind == 'iloc': - return super(Float64Index, self)._convert_scalar_indexer(key, - kind=kind) + if is_integer(key): + return key + return super(Float64Index, self)._convert_scalar_indexer(key, kind=kind) + return key def _convert_slice_indexer(self, key, kind=None): @@ -4278,7 +4299,7 @@ def _reference_duplicate_name(self, name): Returns True if the name refered to in self.names is duplicated. """ # count the times name equals an element in self.names. - return np.sum(name == np.asarray(self.names)) > 1 + return sum(name == n for n in self.names) > 1 def _format_native_types(self, **kwargs): return self.values @@ -4516,6 +4537,10 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, else: return result_levels + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.set_levels([ i._to_safe_for_reshape() for i in self.levels ]) + def to_hierarchical(self, n_repeat, n_shuffle=1): """ Return a MultiIndex reshaped to conform to the diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8b4528ef451ef..5eb25a53d4533 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1285,7 +1285,7 @@ def _has_valid_type(self, key, axis): def error(): if isnull(key): - raise ValueError( + raise TypeError( "cannot use label indexing with a null key") raise KeyError("the label [%s] is not in the [%s]" % (key, self.obj._get_axis_name(axis))) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c8c834180c9f6..f1d82ec1f3b2e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -17,7 +17,7 @@ is_datetime64tz_dtype, is_datetimetz, is_sparse, array_equivalent, _maybe_convert_string_to_object, is_categorical, needs_i8_conversion, is_datetimelike_v_numeric, - is_internal_type) + is_numeric_v_string_like, is_internal_type) from pandas.core.dtypes import DatetimeTZDtype from pandas.core.index import Index, MultiIndex, _ensure_index @@ -137,6 +137,11 @@ def get_values(self, dtype=None): def to_dense(self): return self.values.view() + def to_object_block(self, mgr): + """ return myself as an object block """ + values = self.get_values(dtype=object) + return self.make_block(values,klass=ObjectBlock) + @property def fill_value(self): return np.nan @@ -215,7 +220,7 @@ def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] - def reshape_nd(self, labels, shape, ref_items): + def reshape_nd(self, labels, shape, ref_items, mgr=None): """ Parameters ---------- @@ -312,7 +317,7 @@ def delete(self, loc): self.values = np.delete(self.values, loc, 0) self.mgr_locs = self.mgr_locs.delete(loc) - def apply(self, func, **kwargs): + def apply(self, func, mgr=None, **kwargs): """ apply the function to my values; return a block if we are not one """ result = func(self.values, **kwargs) if not isinstance(result, Block): @@ -320,13 +325,17 @@ def apply(self, func, **kwargs): return result - def fillna(self, value, limit=None, inplace=False, downcast=None): + def fillna(self, value, limit=None, inplace=False, downcast=None, mgr=None): + """ fillna on the block with the value. 
If we fail, then convert to ObjectBlock + and try again """ + if not self._can_hold_na: if inplace: - return [self] + return self else: - return [self.copy()] + return self.copy() + original_value = value mask = isnull(self.values) if limit is not None: if self.ndim > 2: @@ -334,9 +343,24 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): "is currently limited to 2") mask[mask.cumsum(self.ndim-1) > limit] = False - value = self._try_fill(value) - blocks = self.putmask(mask, value, inplace=inplace) - return self._maybe_downcast(blocks, downcast) + # fillna, but if we cannot coerce, then try again as an ObjectBlock + try: + values, _, value, _ = self._try_coerce_args(self.values, value) + blocks = self.putmask(mask, value, inplace=inplace) + blocks = [ b.make_block(values=self._try_coerce_result(b.values)) for b in blocks ] + return self._maybe_downcast(blocks, downcast) + except (TypeError, ValueError): + + # we can't process the value, but nothing to do + if not mask.any(): + return self if inplace else self.copy() + + # we cannot coerce the underlying object, so + # make an ObjectBlock + return self.to_object_block(mgr=mgr).fillna(original_value, + limit=limit, + inplace=inplace, + downcast=False) def _maybe_downcast(self, blocks, downcast=None): @@ -347,18 +371,14 @@ def _maybe_downcast(self, blocks, downcast=None): elif downcast is None and (self.is_timedelta or self.is_datetime): return blocks - result_blocks = [] - for b in blocks: - result_blocks.extend(b.downcast(downcast)) + return _extend_blocks([ b.downcast(downcast) for b in blocks ]) - return result_blocks - - def downcast(self, dtypes=None): + def downcast(self, dtypes=None, mgr=None): """ try to downcast each item to the dict of dtypes if present """ # turn it off completely if dtypes is False: - return [self] + return self values = self.values @@ -370,12 +390,12 @@ def downcast(self, dtypes=None): dtypes = 'infer' nv = _possibly_downcast_to_dtype(values, dtypes) - return [self.make_block(nv, - fastpath=True)] + return self.make_block(nv, + fastpath=True) # ndim > 1 if dtypes is None: - return [self] + return self if not (dtypes == 'infer' or isinstance(dtypes, dict)): raise ValueError("downcast must have a dictionary or 'infer' as " @@ -409,7 +429,7 @@ def astype(self, dtype, copy=False, raise_on_error=True, values=None, **kwargs): values=values, **kwargs) def _astype(self, dtype, copy=False, raise_on_error=True, values=None, - klass=None, **kwargs): + klass=None, mgr=None, **kwargs): """ Coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True @@ -474,7 +494,7 @@ def convert(self, copy=True, **kwargs): return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! 
""" - return [self.copy()] if copy else [self] + return self.copy() if copy else self def _can_hold_element(self, value): raise NotImplementedError() @@ -520,7 +540,7 @@ def _try_operate(self, values): def _try_coerce_args(self, values, other): """ provide coercion to our input arguments """ - return values, other + return values, False, other, False def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -551,7 +571,7 @@ def to_native_types(self, slicer=None, na_rep='nan', quoting=None, **kwargs): return values # block actions #### - def copy(self, deep=True): + def copy(self, deep=True, mgr=None): values = self.values if deep: values = values.copy() @@ -560,23 +580,45 @@ def copy(self, deep=True): fastpath=True) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False): + regex=False, convert=True, mgr=None): """ replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. It is used in ObjectBlocks. It is here for API compatibility.""" - mask = com.mask_missing(self.values, to_replace) - if filter is not None: - filtered_out = ~self.mgr_locs.isin(filter) - mask[filtered_out.nonzero()[0]] = False - if not mask.any(): - if inplace: - return [self] - return [self.copy()] - return self.putmask(mask, value, inplace=inplace) + original_to_replace = to_replace + + # try to replace, if we raise an error, convert to ObjectBlock and retry + try: + values, _, to_replace, _ = self._try_coerce_args(self.values, to_replace) + mask = com.mask_missing(values, to_replace) + if filter is not None: + filtered_out = ~self.mgr_locs.isin(filter) + mask[filtered_out.nonzero()[0]] = False + + blocks = self.putmask(mask, value, inplace=inplace) + if convert: + blocks = [ b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks ] + return blocks + except (TypeError, ValueError): + + # we can't process the value, but nothing to do + if not mask.any(): + return self if inplace else self.copy() - def setitem(self, indexer, value): + return self.to_object_block(mgr=mgr).replace(to_replace=original_to_replace, + value=value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert) + + + def _replace_single(self, *args, **kwargs): + """ no-op on a non-ObjectBlock """ + return self if kwargs['inplace'] else self.copy() + + def setitem(self, indexer, value, mgr=None): """ set the value inplace; return a new block (of a possibly different dtype) @@ -590,7 +632,7 @@ def setitem(self, indexer, value): value = np.nan # coerce args - values, value = self._try_coerce_args(self.values, value) + values, _, value, _ = self._try_coerce_args(self.values, value) arr_value = np.array(value) # cast the values to a type that can hold nan (if necessary) @@ -682,7 +724,7 @@ def _is_empty_indexer(indexer): return [self] def putmask(self, mask, new, align=True, inplace=False, - axis=0, transpose=False): + axis=0, transpose=False, mgr=None): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -797,7 +839,7 @@ def putmask(self, mask, new, align=True, inplace=False, def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, limit_direction='forward', - fill_value=None, coerce=False, downcast=None, **kwargs): + fill_value=None, coerce=False, downcast=None, mgr=None, **kwargs): def check_int_bool(self, inplace): # Only FloatBlocks will contain NaNs. 
@@ -824,7 +866,8 @@ def check_int_bool(self, inplace): limit=limit, fill_value=fill_value, coerce=coerce, - downcast=downcast) + downcast=downcast, + mgr=mgr) # try an interp method try: m = com._clean_interp_method(method, **kwargs) @@ -844,13 +887,14 @@ def check_int_bool(self, inplace): fill_value=fill_value, inplace=inplace, downcast=downcast, + mgr=mgr, **kwargs) raise ValueError("invalid method '{0}' to interpolate.".format(method)) def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, coerce=False, - downcast=None): + downcast=None, mgr=None): """ fillna but using the interpolate machinery """ # if we are coercing, then don't force the conversion @@ -862,8 +906,8 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, else: return [self.copy()] - fill_value = self._try_fill(fill_value) values = self.values if inplace else self.values.copy() + values, _, fill_value, _ = self._try_coerce_args(values, fill_value) values = self._try_operate(values) values = com.interpolate_2d(values, method=method, @@ -881,7 +925,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, limit_direction='forward', - inplace=False, downcast=None, **kwargs): + inplace=False, downcast=None, mgr=None, **kwargs): """ interpolate using scipy wrappers """ data = self.values if inplace else self.values.copy() @@ -957,13 +1001,13 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: return self.make_block_same_class(new_values, new_mgr_locs) - def diff(self, n, axis=1): + def diff(self, n, axis=1, mgr=None): """ return block for the diff of the values """ new_values = com.diff(self.values, n, axis=axis) return [self.make_block(values=new_values, fastpath=True)] - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, mgr=None): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. 
need to do a lot more than @@ -993,7 +1037,7 @@ def shift(self, periods, axis=0): return [self.make_block(new_values, fastpath=True)] - def eval(self, func, other, raise_on_error=True, try_cast=False): + def eval(self, func, other, raise_on_error=True, try_cast=False, mgr=None): """ evaluate the block; return result block from the result @@ -1003,6 +1047,7 @@ def eval(self, func, other, raise_on_error=True, try_cast=False): other : a ndarray/object raise_on_error : if True, raise when I can't perform the function, False by default (and just return the data that we had coming in) + try_cast : try casting the results to the input type Returns ------- @@ -1032,11 +1077,34 @@ def eval(self, func, other, raise_on_error=True, try_cast=False): transf = (lambda x: x.T) if is_transposed else (lambda x: x) # coerce/transpose the args if needed - values, other = self._try_coerce_args(transf(values), other) + values, values_mask, other, other_mask = self._try_coerce_args(transf(values), other) # get the result, may need to transpose the other def get_result(other): + + # avoid numpy warning of comparisons against None + if other is None: + result = not func.__name__ == 'eq' + + # avoid numpy warning of elementwise comparisons to object + elif is_numeric_v_string_like(values, other): + result = False + + else: + result = func(values, other) + + # mask if needed + if isinstance(values_mask, np.ndarray) and values_mask.any(): + result = result.astype('float64',copy=False) + result[values_mask] = np.nan + if other_mask is True: + result = result.astype('float64',copy=False) + result[:] = np.nan + elif isinstance(other_mask, np.ndarray) and other_mask.any(): + result = result.astype('float64',copy=False) + result[other_mask.ravel()] = np.nan + + return self._try_coerce_result(result) # error handler if we have an issue operating with the function def handle_error(): @@ -1086,7 +1154,7 @@ def handle_error(): fastpath=True,)] def where(self, other, cond, align=True, raise_on_error=True, - try_cast=False, axis=0, transpose=False): + try_cast=False, axis=0, transpose=False, mgr=None): """ evaluate the block; return result block(s) from the result @@ -1128,22 +1196,22 @@ def where(self, other, cond, align=True, raise_on_error=True, other = _maybe_convert_string_to_object(other) # our where function - def func(c, v, o): - if c.ravel().all(): - return v + def func(cond, values, other): + if cond.ravel().all(): + return values - v, o = self._try_coerce_args(v, o) + values, values_mask, other, other_mask = self._try_coerce_args(values, other) try: return self._try_coerce_result( - expressions.where(c, v, o, raise_on_error=True) + expressions.where(cond, values, other, raise_on_error=True) ) except Exception as detail: if raise_on_error: raise TypeError('Could not operate [%s] with block values ' - '[%s]' % (repr(o), str(detail))) + '[%s]' % (repr(other), str(detail))) else: # return the values - result = np.empty(v.shape, dtype='float64') + result = np.empty(values.shape, dtype='float64') result.fill(np.nan) return result @@ -1253,6 +1321,34 @@ def get(self, item): else: return self.values + def putmask(self, mask, new, align=True, inplace=False, + axis=0, transpose=False, mgr=None): + """ + putmask the data to the block; we must be a single block and not generate + other blocks + + return the resulting block + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True +
inplace : perform inplace modification, default is False + + Returns + ------- + a new block(s), the result of the putmask + """ + new_values = self.values if inplace else self.values.copy() + new_values, _, new, _ = self._try_coerce_args(new_values, new) + + if isinstance(new, np.ndarray) and len(new) == len(mask): + new = new[mask] + new_values[mask] = new + new_values = self._try_coerce_result(new_values) + return [self.make_block(values=new_values)] + def _slice(self, slicer): """ return a slice of my values (but densify first) """ return self.get_values()[slicer] @@ -1386,45 +1482,56 @@ class TimeDeltaBlock(IntBlock): def fill_value(self): return tslib.iNaT - def _try_fill(self, value): - """ if we are a NaT, return the actual fill value """ - if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all(): - value = tslib.iNaT - elif isinstance(value, Timedelta): - value = value.value - elif isinstance(value, np.timedelta64): - pass - elif com.is_integer(value): - # coerce to seconds of timedelta - value = np.timedelta64(int(value * 1e9)) - elif isinstance(value, timedelta): - value = np.timedelta64(value) + def fillna(self, value, **kwargs): - return value + # allow filling with integers to be + # interpreted as seconds + if not isinstance(value, np.timedelta64) and com.is_integer(value): + value = Timedelta(value,unit='s') + return super(TimeDeltaBlock, self).fillna(value, **kwargs) def _try_coerce_args(self, values, other): - """ Coerce values and other to float64, with null values converted to - NaN. values is always ndarray-like, other may not be """ - def masker(v): - mask = isnull(v) - v = v.astype('float64') - v[mask] = np.nan - return v - - values = masker(values) - - if is_null_datelike_scalar(other): - other = np.nan - elif isinstance(other, (np.timedelta64, Timedelta, timedelta)): - other = _coerce_scalar_to_timedelta_type(other, unit='s', box=False).item() - if other == tslib.iNaT: - other = np.nan - elif lib.isscalar(other): - other = np.float64(other) + """ + Coerce values and other to int64, with null values converted to + iNaT. 
values is always ndarray-like, other may not be + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + + values_mask = isnull(values) + values = values.view('i8') + other_mask = False + + if isinstance(other, bool): + raise TypeError + elif is_null_datelike_scalar(other): + other = tslib.iNaT + other_mask = True + elif isinstance(other, Timedelta): + other_mask = isnull(other) + other = other.value + elif isinstance(other, np.timedelta64): + other_mask = isnull(other) + other = other.view('i8') + elif isinstance(other, timedelta): + other = Timedelta(other).value + elif isinstance(other, np.ndarray): + other_mask = isnull(other) + other = other.astype('i8',copy=False).view('i8') else: - other = masker(other) + # scalar + other = Timedelta(other) + other_mask = isnull(other) + other = other.value - return values, other + return values, values_mask, other, other_mask def _try_operate(self, values): """ return a version to operate on """ @@ -1496,13 +1603,13 @@ def should_store(self, value): return issubclass(value.dtype.type, np.bool_) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False): + regex=False, mgr=None): to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self return super(BoolBlock, self).replace(to_replace, value, inplace=inplace, filter=filter, - regex=regex) + regex=regex, mgr=mgr) class ObjectBlock(Block): @@ -1609,10 +1716,7 @@ def _maybe_downcast(self, blocks, downcast=None): return blocks # split and convert the blocks - result_blocks = [] - for blk in blocks: - result_blocks.extend(blk.convert(datetime=True, numeric=False)) - return result_blocks + return _extend_blocks([ b.convert(datetime=True, numeric=False) for b in blocks ]) def _can_hold_element(self, element): return True @@ -1626,38 +1730,53 @@ def should_store(self, value): np.datetime64, np.bool_)) or is_internal_type(value)) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False): - blk = [self] + regex=False, convert=True, mgr=None): to_rep_is_list = com.is_list_like(to_replace) value_is_list = com.is_list_like(value) both_lists = to_rep_is_list and value_is_list either_list = to_rep_is_list or value_is_list + result_blocks = [] + blocks = [self] + if not either_list and com.is_re(to_replace): - blk[0], = blk[0]._replace_single(to_replace, value, - inplace=inplace, filter=filter, - regex=True) + return self._replace_single(to_replace, value, + inplace=inplace, filter=filter, + regex=True, convert=convert, mgr=mgr) elif not (either_list or regex): - blk = super(ObjectBlock, self).replace(to_replace, value, - inplace=inplace, - filter=filter, regex=regex) + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex, + convert=convert, mgr=mgr) elif both_lists: for to_rep, v in zip(to_replace, value): - blk[0], = blk[0]._replace_single(to_rep, v, inplace=inplace, - filter=filter, regex=regex) + result_blocks = [] + for b in blocks: + result = b._replace_single(to_rep, v, inplace=inplace, + filter=filter, regex=regex, + convert=convert, mgr=mgr) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + elif to_rep_is_list and regex: for to_rep in to_replace: - blk[0], = blk[0]._replace_single(to_rep, value, - inplace=inplace, - filter=filter, regex=regex) - else: - blk[0], = 
blk[0]._replace_single(to_replace, value, - inplace=inplace, filter=filter, - regex=regex) - return blk + result_blocks = [] + for b in blocks: + result = b._replace_single(to_rep, value, + inplace=inplace, + filter=filter, regex=regex, + convert=convert, mgr=mgr) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + return self._replace_single(to_replace, value, + inplace=inplace, filter=filter, + convert=convert, regex=regex, mgr=mgr) def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False): + regex=False, convert=True, mgr=None): # to_replace is regex compilable to_rep_re = regex and com.is_re_compilable(to_replace) @@ -1689,13 +1808,11 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, else: # if the thing to replace is not a string or compiled regex call # the superclass method -> to_replace is some kind of object - result = super(ObjectBlock, self).replace(to_replace, value, - inplace=inplace, - filter=filter, - regex=regex) - if not isinstance(result, list): - result = [result] - return result + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, + regex=regex, + mgr=mgr) new_values = self.values if inplace else self.values.copy() @@ -1725,9 +1842,12 @@ def re_replacer(s): new_values[filt] = f(new_values[filt]) - return [self if inplace else - self.make_block(new_values, - fastpath=True)] + # convert + block = self.make_block(new_values) + if convert: + block = block.convert(by_item=True,numeric=False) + + return block class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): __slots__ = () @@ -1753,7 +1873,7 @@ def to_dense(self): return self.values.to_dense().view() def convert(self, copy=True, **kwargs): - return [self.copy() if copy else self] + return self.copy() if copy else self @property def array_dtype(self): @@ -1767,16 +1887,16 @@ def _slice(self, slicer): # return same dims as we currently have return self.values._slice(slicer) - def fillna(self, value, limit=None, inplace=False, downcast=None): + def fillna(self, value, limit=None, inplace=False, downcast=None, mgr=None): # we may need to upcast our fill to match our dtype if limit is not None: raise NotImplementedError("specifying a limit for 'fillna' has " "not been implemented yet") values = self.values if inplace else self.values.copy() - return [self.make_block_same_class(values=values.fillna(value=value, - limit=limit), - placement=self.mgr_locs)] + values = self._try_coerce_result(values.fillna(value=value, + limit=limit)) + return [self.make_block(values=values)] def interpolate(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, **kwargs): @@ -1787,7 +1907,7 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=limit), placement=self.mgr_locs) - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, mgr=None): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) @@ -1815,30 +1935,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def putmask(self, mask, new, align=True, inplace=False, - axis=0, transpose=False): - """ putmask the data to the block; it is possible that we may create a - new dtype of block - - return the resulting block(s) - - Parameters - ---------- - mask : the condition to respect - new : a ndarray/object - align : boolean, perform alignment on other/cond, default is 
True - inplace : perform inplace modification, default is False - - Returns - ------- - a new block(s), the result of the putmask - """ - new_values = self.values if inplace else self.values.copy() - new_values[mask] = new - return [self.make_block_same_class(values=new_values, placement=self.mgr_locs)] - def _astype(self, dtype, copy=False, raise_on_error=True, values=None, - klass=None): + klass=None, mgr=None): """ Coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True @@ -1882,7 +1980,7 @@ def __init__(self, values, placement, fastpath=True, placement=placement, **kwargs) - def _astype(self, dtype, **kwargs): + def _astype(self, dtype, mgr=None, **kwargs): """ these automatically copy, so copy=True has no effect raise on an except if raise == True @@ -1921,22 +2019,52 @@ def _try_operate(self, values): return values.view('i8') def _try_coerce_args(self, values, other): - """ Coerce values and other to dtype 'i8'. NaN and NaT convert to - the smallest i8, and will correctly round-trip to NaT if converted - back in _try_coerce_result. values is always ndarray-like, other - may not be """ + """ + Coerce values and other to dtype 'i8'. NaN and NaT convert to + the smallest i8, and will correctly round-trip to NaT if converted + back in _try_coerce_result. values is always ndarray-like, other + may not be + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + + values_mask = isnull(values) values = values.view('i8') + other_mask = False - if is_null_datelike_scalar(other): + if isinstance(other, bool): + raise TypeError + elif is_null_datelike_scalar(other): other = tslib.iNaT + other_mask = True elif isinstance(other, (datetime, np.datetime64, date)): - other = lib.Timestamp(other).asm8.view('i8') + other = lib.Timestamp(other) + if getattr(other,'tz') is not None: + raise TypeError("cannot coerce a Timestamp with a tz on a naive Block") + other_mask = isnull(other) + other = other.asm8.view('i8') elif hasattr(other, 'dtype') and com.is_integer_dtype(other): other = other.view('i8') else: - other = np.array(other, dtype='i8') + try: + other = np.asarray(other) + other_mask = isnull(other) - return values, other + other = other.astype('i8',copy=False).view('i8') + except ValueError: + + # coercion issues + # let higher levels handle + raise TypeError + + return values, values_mask, other, other_mask def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -1951,52 +2079,6 @@ def _try_coerce_result(self, result): def fill_value(self): return tslib.iNaT - def _try_fill(self, value): - """ if we are a NaT, return the actual fill value """ - if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all(): - value = tslib.iNaT - return value - - def fillna(self, value, limit=None, - inplace=False, downcast=None): - - mask = isnull(self.values) - value = self._try_fill(value) - - if limit is not None: - if self.ndim > 2: - raise NotImplementedError("number of dimensions for 'fillna' " - "is currently limited to 2") - mask[mask.cumsum(self.ndim-1)>limit]=False - - if mask.any(): - try: - return self._fillna_mask(mask, value, inplace=inplace) - except TypeError: - pass - # _fillna_mask raises TypeError when it fails - # cannot perform inplace op because of object coercion - values = self.get_values(dtype=object) - np.putmask(values, mask, value) - return [self.make_block(values, fastpath=True)] - else: - return [self 
if inplace else self.copy()] - - def _fillna_mask(self, mask, value, inplace=False): - if getattr(value, 'tzinfo', None) is None: - # Series comes to this path - values = self.values - if not inplace: - values = values.copy() - try: - np.putmask(values, mask, value) - return [self if inplace else - self.make_block(values, fastpath=True)] - except (ValueError, TypeError): - # scalar causes ValueError, and array causes TypeError - pass - raise TypeError - def to_native_types(self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -2068,28 +2150,25 @@ def get_values(self, dtype=None): .reshape(self.values.shape) return self.values - def _fillna_mask(self, mask, value, inplace=False): - # cannot perform inplace op for internal DatetimeIndex - my_tz = tslib.get_timezone(self.values.tz) - value_tz = tslib.get_timezone(getattr(value, 'tzinfo', None)) - - if (my_tz == value_tz or self.dtype == getattr(value, 'dtype', None)): - if my_tz == value_tz: - # hack for PY2.6 / numpy 1.7.1. - # Other versions can directly use self.values.putmask - # -------------------------------------- - try: - value = value.asm8 - except AttributeError: - value = tslib.Timestamp(value).asm8 - ### ------------------------------------ + def to_object_block(self, mgr): + """ + return myself as an object block - try: - values = self.values.putmask(mask, value) - return [self.make_block(values, fastpath=True)] - except ValueError: - pass - raise TypeError + Since we keep the DTI as a 1-d object, this differs + depending on the BlockManager's ndim + """ + values = self.get_values(dtype=object) + kwargs = {} + if mgr.ndim > 1: + values = _block_shape(values,ndim=mgr.ndim) + kwargs['ndim'] = mgr.ndim + kwargs['placement']=[0] + return self.make_block(values, klass=ObjectBlock, **kwargs) + + def replace(self, *args, **kwargs): + # if we are forced to ObjectBlock, then don't coerce (to UTC) + kwargs['convert'] = False + return super(DatetimeTZBlock, self).replace(*args, **kwargs) def _slice(self, slicer): """ return a slice of my values """ @@ -2101,22 +2180,46 @@ def _slice(self, slicer): return self.values[slicer] def _try_coerce_args(self, values, other): - """ localize and return i8 for the values """ - values = values.tz_localize(None).asi8 + """ + localize and return i8 for the values + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar - if is_null_datelike_scalar(other): + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + values_mask = isnull(values) + values = values.tz_localize(None).asi8 + other_mask = False + + if isinstance(other, ABCSeries): + other = self._holder(other) + other_mask = isnull(other) + if isinstance(other, bool): + raise TypeError + elif is_null_datelike_scalar(other): other = tslib.iNaT + other_mask = True elif isinstance(other, self._holder): if other.tz != self.values.tz: raise ValueError("incompatible or non tz-aware value") other = other.tz_localize(None).asi8 - else: + other_mask = isnull(other) + elif isinstance(other, (np.datetime64, datetime, date)): other = lib.Timestamp(other) - if not getattr(other, 'tz', None): + tz = getattr(other, 'tz', None) + + # test we can have an equal time zone + if tz is None or str(tz) != str(self.values.tz): raise ValueError("incompatible or non tz-aware value") - other = other.value + other_mask = isnull(other) + other = other.tz_localize(None).value - return values, other + return values, values_mask, other,
other_mask def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2128,7 +2231,7 @@ def _try_coerce_result(self, result): result = lib.Timestamp(result, tz=self.values.tz) return result - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, mgr=None): """ shift the block by periods """ ### think about moving this to the DatetimeIndex. This is a non-freq (number of periods) shift ### @@ -2210,7 +2313,7 @@ def __len__(self): except: return 0 - def copy(self, deep=True): + def copy(self, deep=True, mgr=None): return self.make_block_same_class(values=self.values, sparse_index=self.sp_index, kind=self.kind, copy=deep, @@ -2259,7 +2362,7 @@ def interpolate(self, method='pad', axis=0, inplace=False, return self.make_block_same_class(values=values, placement=self.mgr_locs) - def fillna(self, value, limit=None, inplace=False, downcast=None): + def fillna(self, value, limit=None, inplace=False, downcast=None, mgr=None): # we may need to upcast our fill to match our dtype if limit is not None: raise NotImplementedError("specifying a limit for 'fillna' has " @@ -2271,7 +2374,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): fill_value=value, placement=self.mgr_locs)] - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, mgr=None): """ shift the block by periods """ N = len(self.values.T) indexer = np.zeros(N, dtype=int) @@ -2715,12 +2818,9 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, consolidate kwargs[k] = obj.reindex_axis(b_items, axis=axis, copy=align_copy) + kwargs['mgr'] = self applied = getattr(b, f)(**kwargs) - - if isinstance(applied, list): - result_blocks.extend(applied) - else: - result_blocks.append(applied) + result_blocks = _extend_blocks(applied, result_blocks) if len(result_blocks) == 0: return self.make_empty(axes or self.axes) @@ -2768,9 +2868,12 @@ def convert(self, **kwargs): def replace(self, **kwargs): return self.apply('replace', **kwargs) - def replace_list(self, src_list, dest_list, inplace=False, regex=False): + def replace_list(self, src_list, dest_list, inplace=False, regex=False, mgr=None): """ do a list replace """ + if mgr is None: + mgr = self + # figure out our mask a-priori to avoid repeated replacements values = self.as_matrix() @@ -2792,11 +2895,8 @@ def comp(s): for b in rb: if b.dtype == np.object_: result = b.replace(s, d, inplace=inplace, - regex=regex) - if isinstance(result, list): - new_rb.extend(result) - else: - new_rb.append(result) + regex=regex, mgr=mgr) + new_rb = _extend_blocks(result, new_rb) else: # get our mask for this element, sized to this # particular block @@ -2930,7 +3030,7 @@ def __contains__(self, item): def nblocks(self): return len(self.blocks) - def copy(self, deep=True): + def copy(self, deep=True, mgr=None): """ Make deep or shallow copy of BlockManager @@ -3122,7 +3222,7 @@ def get(self, item, fastpath=True): else: if isnull(item): - raise ValueError("cannot label index with a null key") + raise TypeError("cannot label index with a null key") indexer = self.items.get_indexer_for([item]) return self.reindex_indexer(new_axis=self.items[indexer], @@ -3327,6 +3427,9 @@ def insert(self, loc, item, value, allow_duplicates=False): if not isinstance(loc, int): raise TypeError("loc must be int") + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc+1)) @@ -3349,8 +3452,7 @@ def insert(self, loc, item, value, 
allow_duplicates=False): self._blklocs = np.insert(self._blklocs, loc, 0) self._blknos = np.insert(self._blknos, loc, len(self.blocks)) - self.axes[0] = self.items.insert(loc, item) - + self.axes[0] = new_axis self.blocks += (block,) self._shape = None @@ -4084,15 +4186,12 @@ def _consolidate(blocks): for (_can_consolidate, dtype), group_blocks in grouper: merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate) - if isinstance(merged_blocks, list): - new_blocks.extend(merged_blocks) - else: - new_blocks.append(merged_blocks) - + new_blocks = _extend_blocks(merged_blocks, new_blocks) return new_blocks def _merge_blocks(blocks, dtype=None, _can_consolidate=True): + if len(blocks) == 1: return blocks[0] @@ -4119,6 +4218,22 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): return blocks +def _extend_blocks(result, blocks=None): + """ return an extended list of blocks, given the result """ + if blocks is None: + blocks = [] + if isinstance(result, list): + for r in result: + if isinstance(r, list): + blocks.extend(r) + else: + blocks.append(r) + elif isinstance(result, BlockManager): + blocks.extend(result.blocks) + else: + blocks.append(result) + return blocks + def _block_shape(values, ndim=1, shape=None): """ guarantee the shape of the values to be at least 1 d """ if values.ndim <= ndim: @@ -4146,11 +4261,16 @@ def _possibly_compare(a, b, op): # numpy deprecation warning to have i8 vs integer comparisions if is_datetimelike_v_numeric(a, b): - res = False + result = False + + # numpy deprecation warning if comparing numeric vs string-like + elif is_numeric_v_string_like(a, b): + result = False + else: - res = op(a, b) + result = op(a, b) - if np.isscalar(res) and (is_a_array or is_b_array): + if lib.isscalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] if is_a_array: @@ -4160,7 +4280,7 @@ type_names[1] = 'ndarray(dtype=%s)' % b.dtype raise TypeError("Cannot compare types %r and %r" % tuple(type_names)) - return res + return result def _concat_indexes(indexes): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 5b3d6069f17ec..bf331ff1b781c 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -720,7 +720,7 @@ def wrapper(self, other, axis=None): res = op(self.values, other) else: values = self.get_values() - if is_list_like(other): + if isinstance(other, (list, np.ndarray)): other = np.asarray(other) res = na_op(values, other) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 08ef82835830c..da0ab7bc59440 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -680,8 +680,8 @@ def _combine(self, other, func, axis=0): elif np.isscalar(other): return self._combine_const(other, func) else: - raise NotImplementedError(str(type(other)) + - ' is not supported in combine operation with ' + + raise NotImplementedError(str(type(other)) + + ' is not supported in combine operation with ' + str(type(self))) def _combine_const(self, other, func): diff --git a/pandas/core/series.py b/pandas/core/series.py index f4e3374626011..2fc90ef8596f1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2501,11 +2501,19 @@ def dropna(self, axis=0, inplace=False, **kwargs): 'argument "{0}"'.format(list(kwargs.keys())[0])) axis = self._get_axis_number(axis or 0) - result = remove_na(self) - if inplace: - self._update_inplace(result) + + if self._can_hold_na: + result = remove_na(self) + if inplace: + self._update_inplace(result) + else: + return 
result else: - return result + if inplace: + # do nothing + pass + else: + return self.copy() valid = lambda self, inplace=False, **kwargs: self.dropna(inplace=inplace, **kwargs) diff --git a/pandas/io/common.py b/pandas/io/common.py index b9cdd44e52555..e46f609077810 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -2,11 +2,28 @@ import sys import os +import csv +import codecs import zipfile from contextlib import contextmanager, closing -from pandas.compat import StringIO, string_types, BytesIO +from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat +from pandas.core.common import pprint_thing, is_number + + +try: + import pathlib + _PATHLIB_INSTALLED = True +except ImportError: + _PATHLIB_INSTALLED = False + + +try: + from py.path import local as LocalPath + _PY_PATH_INSTALLED = True +except ImportError: + _PY_PATH_INSTALLED = False if compat.PY3: @@ -201,6 +218,25 @@ def _validate_header_arg(header): "header=int or list-like of ints to specify " "the row(s) making up the column names") +def _stringify_path(filepath_or_buffer): + """Return the argument coerced to a string if it was a pathlib.Path + or a py.path.local + + Parameters + ---------- + filepath_or_buffer : object to be converted + + Returns + ------- + str_filepath_or_buffer : the string version of the input path + """ + if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path): + return text_type(filepath_or_buffer) + if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath): + return filepath_or_buffer.strpath + return filepath_or_buffer + + def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): """ @@ -209,7 +245,8 @@ Parameters ---------- - filepath_or_buffer : a url, filepath, or buffer + filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), + or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' Returns @@ -257,6 +294,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, filepath_or_buffer = k return filepath_or_buffer, None, compression + # It is a pathlib.Path/py.path.local or string + filepath_or_buffer = _stringify_path(filepath_or_buffer) return _expand_user(filepath_or_buffer), None, compression @@ -284,3 +323,148 @@ def ZipFile(*args, **kwargs): yield zf else: ZipFile = zipfile.ZipFile + + +def _get_handle(path, mode, encoding=None, compression=None): + """Gets file handle for given path and mode. 
+ """ + if compression is not None: + if encoding is not None and not compat.PY3: + msg = 'encoding + compression not yet supported in Python 2' + raise ValueError(msg) + + if compression == 'gzip': + import gzip + f = gzip.GzipFile(path, mode) + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(path, mode) + else: + raise ValueError('Unrecognized compression type: %s' % + compression) + if compat.PY3: + from io import TextIOWrapper + f = TextIOWrapper(f, encoding=encoding) + return f + else: + if compat.PY3: + if encoding: + f = open(path, mode, encoding=encoding) + else: + f = open(path, mode, errors='replace') + else: + f = open(path, mode) + + return f + + +class UTF8Recoder: + + """ + Iterator that reads an encoded stream and reencodes the input to UTF-8 + """ + + def __init__(self, f, encoding): + self.reader = codecs.getreader(encoding)(f) + + def __iter__(self): + return self + + def read(self, bytes=-1): + return self.reader.read(bytes).encode("utf-8") + + def readline(self): + return self.reader.readline().encode("utf-8") + + def next(self): + return next(self.reader).encode("utf-8") + + # Python 3 iterator + __next__ = next + + +if compat.PY3: # pragma: no cover + def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): + # ignore encoding + return csv.reader(f, dialect=dialect, **kwds) + + def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): + return csv.writer(f, dialect=dialect, **kwds) +else: + class UnicodeReader: + + """ + A CSV reader which will iterate over lines in the CSV file "f", + which is encoded in the given encoding. + + On Python 3, this is replaced (below) by csv.reader, which handles + unicode. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + f = UTF8Recoder(f, encoding) + self.reader = csv.reader(f, dialect=dialect, **kwds) + + def next(self): + row = next(self.reader) + return [compat.text_type(s, "utf-8") for s in row] + + # python 3 iterator + __next__ = next + + def __iter__(self): # pragma: no cover + return self + + class UnicodeWriter: + + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + # Redirect output to a queue + self.queue = StringIO() + self.writer = csv.writer(self.queue, dialect=dialect, **kwds) + self.stream = f + self.encoder = codecs.getincrementalencoder(encoding)() + self.quoting = kwds.get("quoting", None) + + def writerow(self, row): + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and + is_number(x)) or isinstance(x, str) + + row = [x if _check_as_is(x) + else pprint_thing(x).encode("utf-8") for x in row] + + self.writer.writerow([s for s in row]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + def writerows(self, rows): + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and + is_number(x)) or isinstance(x, str) + + for i, row in enumerate(rows): + rows[i] = [x if _check_as_is(x) + else pprint_thing(x).encode("utf-8") for x in row] + + self.writer.writerows([[s for s in row] for row in rows]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... 
and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) \ No newline at end of file diff --git a/pandas/io/data.py b/pandas/io/data.py index 310b165101bdf..ac6f14e846bec 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -1024,7 +1024,7 @@ def _validate_expiry(self, expiry): if expiry in expiry_dates: return expiry else: - index = DatetimeIndex(expiry_dates).order() + index = DatetimeIndex(expiry_dates).sort_values() return index[index.date >= expiry][0].date() def get_forward_data(self, months, call=True, put=False, near=False, diff --git a/pandas/io/excel.py b/pandas/io/excel.py index a7a844cdfcb40..ffd2768c78824 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -708,7 +708,12 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) - xcell.value = _conv_value(cell.val) + if (isinstance(cell.val, compat.string_types) + and xcell.data_type_for_value(cell.val) + != xcell.TYPE_STRING): + xcell.set_value_explicit(cell.val) + else: + xcell.value = _conv_value(cell.val) style = None if cell.style: style = self._convert_to_style(cell.style) @@ -1240,7 +1245,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): start_row=startrow + cell.row + 1, start_column=startcol + cell.col + 1, end_column=startcol + cell.mergeend + 1, - end_row=startrow + cell.mergeend + 1 + end_row=startrow + cell.mergestart + 1 ) # When cells are merged only the top-left cell is preserved diff --git a/pandas/io/ga.py b/pandas/io/ga.py index b6b4081e3650f..a6f9c9ed9467f 100644 --- a/pandas/io/ga.py +++ b/pandas/io/ga.py @@ -20,6 +20,12 @@ from oauth2client.client import AccessTokenRefreshError from pandas.compat import zip, u +# GH11038 +import warnings +warnings.warn("The pandas.io.ga module is deprecated and will be " + "removed in a future version.", + FutureWarning, stacklevel=2) + TYPE_MAP = {u('INTEGER'): int, u('FLOAT'): float, u('TIME'): int} NO_CALLBACK = auth.OOB_CALLBACK_URN diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index e7241036b94c4..fff36a82529e3 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -511,7 +511,8 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, connector.delete_and_recreate_table(dataset_id, table_id, table_schema, verbose) elif if_exists == 'append': if not connector.verify_schema(dataset_id, table_id, table_schema): - raise InvalidSchema("The schema of the destination table does not match") + raise InvalidSchema("Please verify that the column order, structure and data types in the DataFrame " + "match the schema of the destination table.") else: table.create(table_id, table_schema) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8ac1aed9d9af7..a9c7c1587ff43 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -17,7 +17,8 @@ from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser -from pandas.io.common import get_filepath_or_buffer, _validate_header_arg +from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, + _get_handle, UnicodeReader, UTF8Recoder) from pandas.tseries import tools from pandas.util.decorators import Appender @@ -865,17 +866,20 @@ def _extract_multi_indexer_columns(self, header, index_names, 
col_names, # extract the columns field_count = len(header[0]) - + def extract(r): return tuple([r[i] for i in range(field_count) if i not in sic]) columns = lzip(*[extract(r) for r in header]) names = ic + columns + def tostr(x): + return str(x) if not isinstance(x, compat.string_types) else x + # if we find 'Unnamed' all of a single level, then our header was too # long for n in range(len(columns[0])): - if all(['Unnamed' in c[n] for c in columns]): + if all(['Unnamed' in tostr(c[n]) for c in columns]): raise _parser.CParserError( "Passed header=[%s] are too many rows for this " "multi_index of columns" @@ -1084,7 +1088,7 @@ def __init__(self, src, **kwds): if 'utf-16' in (kwds.get('encoding') or ''): if isinstance(src, compat.string_types): src = open(src, 'rb') - src = com.UTF8Recoder(src, kwds['encoding']) + src = UTF8Recoder(src, kwds['encoding']) kwds['encoding'] = 'utf-8' # #2442 @@ -1420,7 +1424,7 @@ def __init__(self, f, **kwds): self._comment_lines = [] if isinstance(f, compat.string_types): - f = com._get_handle(f, 'r', encoding=self.encoding, + f = _get_handle(f, 'r', encoding=self.encoding, compression=self.compression) elif self.compression: f = _wrap_compressed(f, self.compression, self.encoding) @@ -1540,17 +1544,17 @@ class MyDialect(csv.Dialect): dia.delimiter = sniffed.delimiter if self.encoding is not None: self.buf.extend(list( - com.UnicodeReader(StringIO(line), - dialect=dia, - encoding=self.encoding))) + UnicodeReader(StringIO(line), + dialect=dia, + encoding=self.encoding))) else: self.buf.extend(list(csv.reader(StringIO(line), dialect=dia))) if self.encoding is not None: - reader = com.UnicodeReader(f, dialect=dia, - encoding=self.encoding, - strict=True) + reader = UnicodeReader(f, dialect=dia, + encoding=self.encoding, + strict=True) else: reader = csv.reader(f, dialect=dia, strict=True) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4de641bb67926..4e25b546bddf2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1839,7 +1839,9 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself, don't disturb the blocks by # trying to downcast - block = block.fillna(nan_rep, downcast=False)[0] + block = block.fillna(nan_rep, downcast=False) + if isinstance(block, list): + block = block[0] data = block.values # see if we have a valid string type diff --git a/pandas/io/tests/data/testmultiindex.xls b/pandas/io/tests/data/testmultiindex.xls index 3664c5c8dedcc..51ef0f6c04cba 100644 Binary files a/pandas/io/tests/data/testmultiindex.xls and b/pandas/io/tests/data/testmultiindex.xls differ diff --git a/pandas/io/tests/data/testmultiindex.xlsm b/pandas/io/tests/data/testmultiindex.xlsm index 8f359782b57bb..28c92a5f0be38 100644 Binary files a/pandas/io/tests/data/testmultiindex.xlsm and b/pandas/io/tests/data/testmultiindex.xlsm differ diff --git a/pandas/io/tests/data/testmultiindex.xlsx b/pandas/io/tests/data/testmultiindex.xlsx index a70110caf1ec7..815f3b07342ca 100644 Binary files a/pandas/io/tests/data/testmultiindex.xlsx and b/pandas/io/tests/data/testmultiindex.xlsx differ diff --git a/pandas/io/tests/data/testskiprows.xls b/pandas/io/tests/data/testskiprows.xls new file mode 100644 index 0000000000000..21ccd30ec62da Binary files /dev/null and b/pandas/io/tests/data/testskiprows.xls differ diff --git a/pandas/io/tests/data/testskiprows.xlsm b/pandas/io/tests/data/testskiprows.xlsm new file mode 100644 index 0000000000000..f5889ded4637a Binary files /dev/null and 
b/pandas/io/tests/data/testskiprows.xlsm differ diff --git a/pandas/io/tests/data/testskiprows.xlsx b/pandas/io/tests/data/testskiprows.xlsx new file mode 100644 index 0000000000000..2d7ce943a7214 Binary files /dev/null and b/pandas/io/tests/data/testskiprows.xlsx differ diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 03d1e4fb1f365..73cae1130c740 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -5,10 +5,20 @@ import os from os.path import isabs +import nose import pandas.util.testing as tm from pandas.io import common +try: + from pathlib import Path +except ImportError: + pass + +try: + from py.path import local as LocalPath +except ImportError: + pass class TestCommonIOCapabilities(tm.TestCase): @@ -27,6 +37,22 @@ def test_expand_user_normal_path(self): self.assertEqual(expanded_name, filename) self.assertEqual(os.path.expanduser(filename), expanded_name) + def test_stringify_path_pathlib(self): + tm._skip_if_no_pathlib() + + rel_path = common._stringify_path(Path('.')) + self.assertEqual(rel_path, '.') + redundant_path = common._stringify_path(Path('foo//bar')) + self.assertEqual(redundant_path, os.path.join('foo', 'bar')) + + def test_stringify_path_localpath(self): + tm._skip_if_no_localpath() + + path = os.path.join('foo', 'bar') + abs_path = os.path.abspath(path) + lpath = LocalPath(path) + self.assertEqual(common._stringify_path(lpath), abs_path) + def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index ad0e05f91d184..afc61dc42f569 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -9,12 +9,15 @@ import numpy as np import pandas as pd from pandas import DataFrame, Timestamp -from pandas.io import data as web -from pandas.io.data import DataReader, SymbolWarning, RemoteDataError, _yahoo_codes from pandas.util.testing import (assert_series_equal, assert_produces_warning, network, assert_frame_equal) import pandas.util.testing as tm +with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + from pandas.io import data as web + +from pandas.io.data import DataReader, SymbolWarning, RemoteDataError, _yahoo_codes + if compat.PY3: from urllib.error import HTTPError else: @@ -103,13 +106,15 @@ def test_get_multi1(self): @network def test_get_multi_invalid(self): sl = ['AAPL', 'AMZN', 'INVALID'] - pan = web.get_data_google(sl, '2012') - self.assertIn('INVALID', pan.minor_axis) + with tm.assert_produces_warning(SymbolWarning): + pan = web.get_data_google(sl, '2012') + self.assertIn('INVALID', pan.minor_axis) @network def test_get_multi_all_invalid(self): sl = ['INVALID', 'INVALID2', 'INVALID3'] - self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') + with tm.assert_produces_warning(SymbolWarning): + self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') @network def test_get_multi2(self): @@ -291,6 +296,7 @@ def test_get_date_ret_index(self): class TestYahooOptions(tm.TestCase): + @classmethod def setUpClass(cls): super(TestYahooOptions, cls).setUpClass() diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 40cbd97ea539f..4cb62edf71b1c 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -6,6 +6,7 @@ import os from distutils.version import LooseVersion +import warnings import operator import functools import nose @@ -557,6 +558,12 @@ 
def test_read_excel_multiindex(self): actual = read_excel(mi_file, 'mi_column_name', header=[0,1], index_col=0) tm.assert_frame_equal(actual, expected) + # Issue #11317 + expected.columns = mi.set_levels([1,2],level=1).set_names(['c1', 'c2']) + actual = read_excel(mi_file, 'name_with_int', index_col=0, header=[0,1]) + tm.assert_frame_equal(actual, expected) + + expected.columns = mi.set_names(['c1', 'c2']) expected.index = mi.set_names(['ilvl1', 'ilvl2']) actual = read_excel(mi_file, 'both_name', index_col=[0,1], header=[0,1]) tm.assert_frame_equal(actual, expected) @@ -660,6 +667,21 @@ def test_read_excel_chunksize(self): pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), chunksize=100) + def test_read_excel_skiprows_list(self): + #GH 4903 + actual = pd.read_excel(os.path.join(self.dirpath, 'testskiprows' + self.ext), + 'skiprows_list', skiprows=[0,2]) + expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], + [2, 3.5, pd.Timestamp('2015-01-02'), False], + [3, 4.5, pd.Timestamp('2015-01-03'), False], + [4, 5.5, pd.Timestamp('2015-01-04'), True]], + columns = ['a','b','c','d']) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel(os.path.join(self.dirpath, 'testskiprows' + self.ext), + 'skiprows_list', skiprows=np.array([0,2])) + tm.assert_frame_equal(actual, expected) + class XlsReaderTests(XlrdTests, tm.TestCase): ext = '.xls' engine_name = 'xlrd' @@ -1067,7 +1089,38 @@ def test_to_excel_multiindex(self): df = read_excel(reader, 'test1', index_col=[0, 1], parse_dates=False) tm.assert_frame_equal(frame, df) - self.assertEqual(frame.index.names, df.index.names) + + # Test for Issue 11328. If column indices are integers, make + # sure they are handled correctly for either setting of + # merge_cells + def test_to_excel_multiindex_cols(self): + _skip_if_no_xlrd() + + frame = self.frame + arrays = np.arange(len(frame.index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, + names=['first', 'second']) + frame.index = new_index + + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), + (50, 1), (50, 2)]) + frame.columns = new_cols_index + header = [0, 1] + if not self.merge_cells: + header = 0 + + with ensure_clean(self.ext) as path: + # round trip + frame.to_excel(path, 'test1', merge_cells=self.merge_cells) + reader = ExcelFile(path) + df = read_excel(reader, 'test1', header=header, + index_col=[0, 1], + parse_dates=False) + if not self.merge_cells: + fm = frame.columns.format(sparsify=False, + adjoin=False, names=False) + frame.columns = [ ".".join(map(str, q)) for q in zip(*fm) ] + tm.assert_frame_equal(frame, df) def test_to_excel_multiindex_dates(self): _skip_if_no_xlrd() @@ -1814,7 +1867,6 @@ def test_column_format(self): # Applicable to xlsxwriter only. _skip_if_no_xlsxwriter() - import warnings with warnings.catch_warnings(): # Ignore the openpyxl lxml warning. 
warnings.simplefilter("ignore") diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py index 13d31b43ac39a..965b3441d7405 100644 --- a/pandas/io/tests/test_ga.py +++ b/pandas/io/tests/test_ga.py @@ -1,6 +1,7 @@ import os from datetime import datetime +import warnings import nose import pandas as pd from pandas import compat @@ -13,7 +14,12 @@ try: import httplib2 - import pandas.io.ga as ga + import apiclient + + # deprecated + with warnings.catch_warnings(record=True): + import pandas.io.ga as ga + from pandas.io.ga import GAnalytics, read_ga from pandas.io.auth import AuthenticationConfigError, reset_default_token_store from pandas.io import auth diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 894b699281c80..3434afc4129c4 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -461,20 +461,21 @@ def test_sparse_frame(self): def test_sparse_panel(self): - items = ['x', 'y', 'z'] - p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) - sp = p.to_sparse() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + items = ['x', 'y', 'z'] + p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) + sp = p.to_sparse() - self._check_roundtrip(sp, tm.assert_panel_equal, - check_panel_type=True) + self._check_roundtrip(sp, tm.assert_panel_equal, + check_panel_type=True) - sp2 = p.to_sparse(kind='integer') - self._check_roundtrip(sp2, tm.assert_panel_equal, - check_panel_type=True) + sp2 = p.to_sparse(kind='integer') + self._check_roundtrip(sp2, tm.assert_panel_equal, + check_panel_type=True) - sp3 = p.to_sparse(fill_value=0) - self._check_roundtrip(sp3, tm.assert_panel_equal, - check_panel_type=True) + sp3 = p.to_sparse(fill_value=0) + self._check_roundtrip(sp3, tm.assert_panel_equal, + check_panel_type=True) class TestCompression(TestPackers): diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py index 51d6ac02f0f20..ef72ad4964ff2 100644 --- a/pandas/io/tests/test_wb.py +++ b/pandas/io/tests/test_wb.py @@ -5,9 +5,11 @@ from pandas.util.testing import network from pandas.util.testing import assert_frame_equal from numpy.testing.decorators import slow -from pandas.io.wb import search, download, get_countries import pandas.util.testing as tm +# deprecated +with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + from pandas.io.wb import search, download, get_countries class TestWB(tm.TestCase): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 2b4974155d44c..74842d9a165fe 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -269,6 +269,18 @@ cpdef checknull_old(object val): else: return util._checknull(val) +cpdef isposinf_scalar(object val): + if util.is_float_object(val) and val == INF: + return True + else: + return False + +cpdef isneginf_scalar(object val): + if util.is_float_object(val) and val == NEGINF: + return True + else: + return False + def isscalar(object val): """ Return True if given value is scalar. 
diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py index a2e6d08d07b58..4b579e9263742 100644 --- a/pandas/rpy/tests/test_common.py +++ b/pandas/rpy/tests/test_common.py @@ -6,6 +6,7 @@ import numpy as np import unittest import nose +import warnings import pandas.util.testing as tm try: diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index b765fdb8d67be..f275a34ca90db 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -283,7 +283,15 @@ def __getitem__(self, key): if com.is_integer(key): return self._get_val_at(key) else: - data_slice = self.values[key] + if isinstance(key, SparseArray): + key = np.asarray(key) + if hasattr(key,'__len__') and len(self) != len(key): + indices = self.sp_index + if hasattr(indices,'to_int_index'): + indices = indices.to_int_index() + data_slice = self.values.take(indices.indices)[key] + else: + data_slice = self.values[key] return self._constructor(data_slice) def __getslice__(self, i, j): @@ -513,7 +521,12 @@ def make_sparse(arr, kind='block', fill_value=nan): else: mask = arr != fill_value - indices = np.arange(length, dtype=np.int32)[mask] + length = len(arr) + if length != mask.size: + # the arr is a SparseArray + indices = mask.sp_index.indices + else: + indices = np.arange(length, dtype=np.int32)[mask] if kind == 'block': locs, lens = splib.get_blocks(indices) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index a86942718091c..9ce08c550dd0d 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -39,10 +39,6 @@ from pandas.sparse.tests.test_array import assert_sp_array_equal -import warnings -warnings.filterwarnings(action='ignore', category=FutureWarning) - - def _test_data1(): # nan-based arr = np.arange(20, dtype=float) @@ -503,15 +499,6 @@ def check(a, b): result = self.bseries + self.bseries.to_dense() assert_sp_series_equal(result, self.bseries + self.bseries) - # @dec.knownfailureif(True, 'Known NumPy failer as of 1.5.1') - def test_operators_corner2(self): - raise nose.SkipTest('known failer on numpy 1.5.1') - - # NumPy circumvents __r*__ operations - val = np.float64(3.0) - result = val - self.zbseries - assert_sp_series_equal(result, 3 - self.zbseries) - def test_binary_operators(self): # skipping for now ##### @@ -1778,20 +1765,23 @@ def setUp(self): 'ItemC': panel_data3(), 'ItemD': panel_data1(), } - self.panel = SparsePanel(self.data_dict) + with tm.assert_produces_warning(FutureWarning): + self.panel = SparsePanel(self.data_dict) @staticmethod def _test_op(panel, op): # arithmetic tests - result = op(panel, 1) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = op(panel, 1) assert_sp_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) def test_constructor(self): - self.assertRaises(ValueError, SparsePanel, self.data_dict, - items=['Item0', 'ItemA', 'ItemB']) - with tm.assertRaisesRegexp(TypeError, - "input must be a dict, a 'list' was passed"): - SparsePanel(['a', 'b', 'c']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertRaises(ValueError, SparsePanel, self.data_dict, + items=['Item0', 'ItemA', 'ItemB']) + with tm.assertRaisesRegexp(TypeError, + "input must be a dict, a 'list' was passed"): + SparsePanel(['a', 'b', 'c']) # deprecation GH11157 def test_deprecation(self): @@ -1800,13 +1790,15 @@ def test_deprecation(self): # GH 9272 def test_constructor_empty(self): - sp = SparsePanel() + with tm.assert_produces_warning(FutureWarning): + sp = 
SparsePanel() self.assertEqual(len(sp.items), 0) self.assertEqual(len(sp.major_axis), 0) self.assertEqual(len(sp.minor_axis), 0) def test_from_dict(self): - fd = SparsePanel.from_dict(self.data_dict) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + fd = SparsePanel.from_dict(self.data_dict) assert_sp_panel_equal(fd, self.panel) def test_pickle(self): @@ -1830,21 +1822,25 @@ def test_to_dense(self): assert_panel_equal(dwp, dwp2) def test_to_frame(self): - def _compare_with_dense(panel): - slp = panel.to_frame() - dlp = panel.to_dense().to_frame() - self.assert_numpy_array_equal(slp.values, dlp.values) - self.assertTrue(slp.index.equals(dlp.index)) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + def _compare_with_dense(panel): + slp = panel.to_frame() + dlp = panel.to_dense().to_frame() - _compare_with_dense(self.panel) - _compare_with_dense(self.panel.reindex(items=['ItemA'])) + self.assert_numpy_array_equal(slp.values, dlp.values) + self.assertTrue(slp.index.equals(dlp.index)) - zero_panel = SparsePanel(self.data_dict, default_fill_value=0) - self.assertRaises(Exception, zero_panel.to_frame) + _compare_with_dense(self.panel) + _compare_with_dense(self.panel.reindex(items=['ItemA'])) - self.assertRaises(Exception, self.panel.to_frame, - filter_observations=False) + with tm.assert_produces_warning(FutureWarning): + zero_panel = SparsePanel(self.data_dict, default_fill_value=0) + self.assertRaises(Exception, zero_panel.to_frame) + + self.assertRaises(Exception, self.panel.to_frame, + filter_observations=False) def test_long_to_wide_sparse(self): pass @@ -1885,47 +1881,53 @@ def test_delitem_pop(self): self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') def test_copy(self): - cop = self.panel.copy() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + cop = self.panel.copy() assert_sp_panel_equal(cop, self.panel) def test_reindex(self): - def _compare_with_dense(swp, items, major, minor): - swp_re = swp.reindex(items=items, major=major, - minor=minor) - dwp_re = swp.to_dense().reindex(items=items, major=major, - minor=minor) - assert_panel_equal(swp_re.to_dense(), dwp_re) - - _compare_with_dense(self.panel, self.panel.items[:2], - self.panel.major_axis[::2], - self.panel.minor_axis[::2]) - _compare_with_dense(self.panel, None, - self.panel.major_axis[::2], - self.panel.minor_axis[::2]) - - self.assertRaises(ValueError, self.panel.reindex) - - # TODO: do something about this later... - self.assertRaises(Exception, self.panel.reindex, - items=['item0', 'ItemA', 'ItemB']) - - # test copying - cp = self.panel.reindex(self.panel.major_axis, copy=True) - cp['ItemA']['E'] = cp['ItemA']['A'] - self.assertNotIn('E', self.panel['ItemA']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + def _compare_with_dense(swp, items, major, minor): + swp_re = swp.reindex(items=items, major=major, + minor=minor) + dwp_re = swp.to_dense().reindex(items=items, major=major, + minor=minor) + assert_panel_equal(swp_re.to_dense(), dwp_re) + + _compare_with_dense(self.panel, self.panel.items[:2], + self.panel.major_axis[::2], + self.panel.minor_axis[::2]) + _compare_with_dense(self.panel, None, + self.panel.major_axis[::2], + self.panel.minor_axis[::2]) + + self.assertRaises(ValueError, self.panel.reindex) + + # TODO: do something about this later... 
+ self.assertRaises(Exception, self.panel.reindex, + items=['item0', 'ItemA', 'ItemB']) + + # test copying + cp = self.panel.reindex(self.panel.major_axis, copy=True) + cp['ItemA']['E'] = cp['ItemA']['A'] + self.assertNotIn('E', self.panel['ItemA']) def test_operators(self): def _check_ops(panel): + def _dense_comp(op): - dense = panel.to_dense() - sparse_result = op(panel) - dense_result = op(dense) - assert_panel_equal(sparse_result.to_dense(), dense_result) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + dense = panel.to_dense() + sparse_result = op(panel) + dense_result = op(dense) + assert_panel_equal(sparse_result.to_dense(), dense_result) def _mixed_comp(op): - result = op(panel, panel.to_dense()) - expected = op(panel.to_dense(), panel.to_dense()) - assert_panel_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = op(panel, panel.to_dense()) + expected = op(panel.to_dense(), panel.to_dense()) + assert_panel_equal(result, expected) op1 = lambda x: x + 2 diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd index 0896965162698..f2f764c785894 100644 --- a/pandas/src/datetime.pxd +++ b/pandas/src/datetime.pxd @@ -95,14 +95,14 @@ cdef extern from "datetime/np_datetime.h": int apply_tzinfo) npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, - pandas_datetimestruct *d) + pandas_datetimestruct *d) nogil void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, - pandas_datetimestruct *result) + pandas_datetimestruct *result) nogil int days_per_month_table[2][12] - int dayofweek(int y, int m, int d) - int is_leapyear(int64_t year) + int dayofweek(int y, int m, int d) nogil + int is_leapyear(int64_t year) nogil PANDAS_DATETIMEUNIT get_datetime64_unit(object o) cdef extern from "datetime/np_datetime_strings.h": diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 2a7c2135f8045..b431bb58bc991 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -76,11 +76,11 @@ cdef extern from "period_helper.h": int64_t get_period_ordinal(int year, int month, int day, int hour, int minute, int second, int microseconds, int picoseconds, - int freq) except INT32_MIN + int freq) nogil except INT32_MIN int64_t get_python_ordinal(int64_t period_ordinal, int freq) except INT32_MIN - int get_date_info(int64_t ordinal, int freq, date_info *dinfo) except INT32_MIN + int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil except INT32_MIN double getAbsTime(int, int64_t, int64_t) int pyear(int64_t ordinal, int freq) except INT32_MIN @@ -139,13 +139,14 @@ def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): out = np.empty(l, dtype='i8') if tz is None: - for i in range(l): - if dtarr[i] == iNaT: - out[i] = iNaT - continue - pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) - out[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + with nogil: + for i in range(l): + if dtarr[i] == NPY_NAT: + out[i] = NPY_NAT + continue + pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) + out[i] = get_period_ordinal(dts.year, dts.month, dts.day, + dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) else: out = localize_dt64arr_to_period(dtarr, freq, tz) return out @@ -163,11 +164,12 @@ def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): out = np.empty(l, dtype='i8') - for i in range(l): - if periodarr[i] == iNaT: - out[i] = iNaT - continue - out[i] = 
period_ordinal_to_dt64(periodarr[i], freq) + with nogil: + for i in range(l): + if periodarr[i] == NPY_NAT: + out[i] = NPY_NAT + continue + out[i] = period_ordinal_to_dt64(periodarr[i], freq) return out @@ -245,13 +247,13 @@ def period_ordinal(int y, int m, int d, int h, int min, int s, int us, int ps, i return get_period_ordinal(y, m, d, h, min, s, us, ps, freq) -cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq): +cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: cdef: pandas_datetimestruct dts date_info dinfo float subsecond_fraction - if ordinal == iNaT: + if ordinal == NPY_NAT: return NPY_NAT get_date_info(ordinal, freq, &dinfo) diff --git a/pandas/src/period_helper.c b/pandas/src/period_helper.c index 032bc44de6355..e056b1fa9a522 100644 --- a/pandas/src/period_helper.c +++ b/pandas/src/period_helper.c @@ -113,7 +113,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int yearoffset; /* Range check */ - Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), + Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), PyExc_ValueError, "year out of range: %i", year); @@ -136,7 +136,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, day); yearoffset = dInfoCalc_YearOffset(year, calendar); - if (PyErr_Occurred()) goto onError; + if (yearoffset == INT_ERR_CODE) goto onError; absdate = day + month_offset[leap][month - 1] + yearoffset; @@ -155,7 +155,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, /* Calculate the absolute time */ { - Py_AssertWithArg(hour >= 0 && hour <= 23, + Py_AssertWithArg(hour >= 0 && hour <= 23, PyExc_ValueError, "hour out of range (0-23): %i", hour); @@ -212,8 +212,7 @@ int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, while (1) { /* Calculate the year offset */ yearoffset = dInfoCalc_YearOffset(year, calendar); - if (PyErr_Occurred()) - goto onError; + if (yearoffset == INT_ERR_CODE) goto onError; /* Backward correction: absdate must be greater than the yearoffset */ @@ -310,7 +309,7 @@ static int calc_conversion_factors_matrix_size() { } matrix_size = max_value(matrix_size, period_value); } - return matrix_size + 1; + return matrix_size + 1; } static void alloc_conversion_factors_matrix(int matrix_size) { diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 3615cc3dc8ad8..e2ed27156d2b5 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -43,7 +43,6 @@ class TestMoments(Base): def setUp(self): self._create_data() - warnings.simplefilter("ignore", category=FutureWarning) def test_centered_axis_validation(self): # ok @@ -887,7 +886,6 @@ def _create_data(self): def setUp(self): self._create_data() - warnings.simplefilter("ignore", category=FutureWarning) def _test_moments_consistency(self, min_periods, @@ -1513,9 +1511,6 @@ def test_rolling_functions_window_non_shrinkage(self): functions = [lambda x: mom.rolling_cov(x, x, pairwise=True, window=10, min_periods=5), lambda x: mom.rolling_corr(x, x, pairwise=True, window=10, min_periods=5), - # rolling_corr_pairwise is depracated, so the following line should be deleted - # when rolling_corr_pairwise is removed. 
- lambda x: mom.rolling_corr_pairwise(x, x, window=10, min_periods=5), ] for f in functions: df_result_panel = f(df) @@ -1582,9 +1577,6 @@ def test_moment_functions_zero_length(self): lambda x: mom.expanding_corr(x, x, pairwise=True, min_periods=5), lambda x: mom.rolling_cov(x, x, pairwise=True, window=10, min_periods=5), lambda x: mom.rolling_corr(x, x, pairwise=True, window=10, min_periods=5), - # rolling_corr_pairwise is depracated, so the following line should be deleted - # when rolling_corr_pairwise is removed. - lambda x: mom.rolling_corr_pairwise(x, x, window=10, min_periods=5), ] for f in functions: df1_result_panel = f(df1) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index bf2cfc6216a60..140b54225b8e8 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -2952,6 +2952,12 @@ def test_to_csv_date_format(self): self.assertEqual(df_day.to_csv(), expected_default_day) self.assertEqual(df_day.to_csv(date_format='%Y-%m-%d'), expected_default_day) + # deprecation GH11274 + def test_to_csv_engine_kw_deprecation(self): + with tm.assert_produces_warning(FutureWarning): + df = DataFrame({'col1' : [1], 'col2' : ['a'], 'col3' : [10.1] }) + df.to_csv(engine='python') + def test_round_dataframe(self): # GH 2665 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 8a9afcb7d1291..dc0e0e2670565 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -381,15 +381,11 @@ def test_getitem_boolean(self): assert_frame_equal(subframe_obj, subframe) # test that Series indexers reindex - import warnings - warnings.filterwarnings(action='ignore', category=UserWarning) - - indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1]) - - subframe_obj = self.tsframe[indexer_obj] - assert_frame_equal(subframe_obj, subframe) + with tm.assert_produces_warning(UserWarning): + indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1]) - warnings.filterwarnings(action='default', category=UserWarning) + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) # test df[df > 0] for df in [ self.tsframe, self.mixed_frame, self.mixed_float, self.mixed_int ]: @@ -488,6 +484,18 @@ def test_getitem_ix_mixed_integer(self): expected = df.ix[Index([1, 10], dtype=object)] assert_frame_equal(result, expected) + # 11320 + df = pd.DataFrame({ "rna": (1.5,2.2,3.2,4.5), + -1000: [11,21,36,40], + 0: [10,22,43,34], + 1000:[0, 10, 20, 30] },columns=['rna',-1000,0,1000]) + result = df[[1000]] + expected = df.iloc[:,[3]] + assert_frame_equal(result, expected) + result = df[[-1000]] + expected = df.iloc[:,[1]] + assert_frame_equal(result, expected) + def test_getitem_setitem_ix_negative_integers(self): result = self.frame.ix[:, -1] assert_series_equal(result, self.frame['D']) @@ -4716,6 +4724,58 @@ def test_to_dict(self): for k2, v2 in compat.iteritems(v): self.assertEqual(v2, recons_data[k2][k]) + def test_to_dict_timestamp(self): + + # GH11247 + # split/records producing np.datetime64 rather than Timestamps + # on datetime64[ns] dtypes only + + tsmp = Timestamp('20130101') + test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]}) + test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]}) + + expected_records = [{'A': tsmp, 'B': tsmp}, + {'A': tsmp, 'B': tsmp}] + expected_records_mixed = [{'A': tsmp, 'B': 1}, + {'A': tsmp, 'B': 2}] + + tm.assert_almost_equal(test_data.to_dict( + orient='records'), expected_records) + tm.assert_almost_equal(test_data_mixed.to_dict( + orient='records'), 
expected_records_mixed) + + expected_series = { + 'A': Series([tsmp, tsmp]), + 'B': Series([tsmp, tsmp]), + } + expected_series_mixed = { + 'A': Series([tsmp, tsmp]), + 'B': Series([1, 2]), + } + + tm.assert_almost_equal(test_data.to_dict( + orient='series'), expected_series) + tm.assert_almost_equal(test_data_mixed.to_dict( + orient='series'), expected_series_mixed) + + expected_split = { + 'index': [0, 1], + 'data': [[tsmp, tsmp], + [tsmp, tsmp]], + 'columns': ['A', 'B'] + } + expected_split_mixed = { + 'index': [0, 1], + 'data': [[tsmp, 1], + [tsmp, 2]], + 'columns': ['A', 'B'] + } + + tm.assert_almost_equal(test_data.to_dict( + orient='split'), expected_split) + tm.assert_almost_equal(test_data_mixed.to_dict( + orient='split'), expected_split_mixed) + def test_to_dict_invalid_orient(self): df = DataFrame({'A':[0, 1]}) self.assertRaises(ValueError, df.to_dict, orient='xinvalid') @@ -5779,7 +5839,7 @@ def check(df): def f(): df.loc[:,np.nan] - self.assertRaises(ValueError, f) + self.assertRaises(TypeError, f) df = DataFrame([[1,2,3],[4,5,6]], index=[1,np.nan]) @@ -6618,31 +6678,25 @@ def test_to_csv_cols_reordering(self): # GH3454 import pandas as pd - def _check_df(df,cols=None): - with ensure_clean() as path: - df.to_csv(path,columns = cols,engine='python') - rs_p = pd.read_csv(path,index_col=0) - df.to_csv(path,columns = cols,chunksize=chunksize) - rs_c = pd.read_csv(path,index_col=0) - - if cols: - df = df[cols] - assert (rs_c.columns==rs_p.columns).all() - assert_frame_equal(df,rs_c,check_names=False) - chunksize=5 N = int(chunksize*2.5) df= mkdf(N, 3) cs = df.columns cols = [cs[2],cs[0]] - _check_df(df,cols) + + with ensure_clean() as path: + df.to_csv(path,columns = cols,chunksize=chunksize) + rs_c = pd.read_csv(path,index_col=0) + + assert_frame_equal(df[cols],rs_c,check_names=False) def test_to_csv_legacy_raises_on_dupe_cols(self): df= mkdf(10, 3) df.columns = ['a','a','b'] with ensure_clean() as path: - self.assertRaises(NotImplementedError,df.to_csv,path,engine='python') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertRaises(NotImplementedError,df.to_csv,path,engine='python') def test_to_csv_new_dupe_cols(self): import pandas as pd @@ -7163,6 +7217,7 @@ def test_to_csv_chunking(self): rs = read_csv(filename,index_col=0) assert_frame_equal(rs, aa) + @slow def test_to_csv_wide_frame_formatting(self): # Issue #8621 df = DataFrame(np.random.randn(1, 100010), columns=None, index=None) @@ -9400,18 +9455,20 @@ def test_regex_replace_dict_nested(self): def test_regex_replace_dict_nested_gh4115(self): df = pd.DataFrame({'Type':['Q','T','Q','Q','T'], 'tmp':2}) expected = DataFrame({'Type': [0,1,0,0,1], 'tmp': 2}) - assert_frame_equal(df.replace({'Type': {'Q':0,'T':1}}), expected) + result = df.replace({'Type': {'Q':0,'T':1}}) + assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self): mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} df = DataFrame(mix) + expec = DataFrame({'a': mix['a'], 'b': np.array([nan] * 4), + 'c': [nan, nan, nan, 'd']}) + res = df.replace([r'\s*\.\s*', 'a|b'], nan, regex=True) res2 = df.copy() res3 = df.copy() res2.replace([r'\s*\.\s*', 'a|b'], nan, regex=True, inplace=True) res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=nan, inplace=True) - expec = DataFrame({'a': mix['a'], 'b': np.array([nan] * 4), - 'c': [nan, nan, nan, 'd']}) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) @@ -9465,8 +9522,8 @@ def 
test_regex_replace_series_of_regexes(self): def test_regex_replace_numeric_to_object_conversion(self): mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} df = DataFrame(mix) - res = df.replace(0, 'a') expec = DataFrame({'a': ['a', 1, 2, 3], 'b': mix['b'], 'c': mix['c']}) + res = df.replace(0, 'a') assert_frame_equal(res, expec) self.assertEqual(res.a.dtype, np.object_) @@ -9895,6 +9952,56 @@ def test_replace_datetime(self): result = df.replace(d) tm.assert_frame_equal(result, expected) + def test_replace_datetimetz(self): + + # GH 11326 + # behaving poorly when presented with a datetime64[ns, tz] + df = DataFrame({'A' : date_range('20130101',periods=3,tz='US/Eastern'), + 'B' : [0, np.nan, 2]}) + result = df.replace(np.nan,1) + expected = DataFrame({'A' : date_range('20130101',periods=3,tz='US/Eastern'), + 'B' : Series([0, 1, 2],dtype='float64')}) + assert_frame_equal(result, expected) + + result = df.fillna(1) + assert_frame_equal(result, expected) + + result = df.replace(0,np.nan) + expected = DataFrame({'A' : date_range('20130101',periods=3,tz='US/Eastern'), + 'B' : [np.nan, np.nan, 2]}) + assert_frame_equal(result, expected) + + result = df.replace(Timestamp('20130102',tz='US/Eastern'),Timestamp('20130104',tz='US/Eastern')) + expected = DataFrame({'A' : [Timestamp('20130101',tz='US/Eastern'), + Timestamp('20130104',tz='US/Eastern'), + Timestamp('20130103',tz='US/Eastern')], + 'B' : [0, np.nan, 2]}) + assert_frame_equal(result, expected) + + result = df.copy() + result.iloc[1,0] = np.nan + result = result.replace({'A' : pd.NaT }, Timestamp('20130104',tz='US/Eastern')) + assert_frame_equal(result, expected) + + # coerce to object + result = df.copy() + result.iloc[1,0] = np.nan + result = result.replace({'A' : pd.NaT }, Timestamp('20130104',tz='US/Pacific')) + expected = DataFrame({'A' : [Timestamp('20130101',tz='US/Eastern'), + Timestamp('20130104',tz='US/Pacific'), + Timestamp('20130103',tz='US/Eastern')], + 'B' : [0, np.nan, 2]}) + assert_frame_equal(result, expected) + + result = df.copy() + result.iloc[1,0] = np.nan + result = result.replace({'A' : np.nan }, Timestamp('20130104')) + expected = DataFrame({'A' : [Timestamp('20130101',tz='US/Eastern'), + Timestamp('20130104'), + Timestamp('20130103',tz='US/Eastern')], + 'B' : [0, np.nan, 2]}) + assert_frame_equal(result, expected) + def test_combine_multiple_frames_dtypes(self): # GH 2759 @@ -15198,10 +15305,14 @@ def test_to_csv_date_format(self): pname = '__tmp_to_csv_date_format__' with ensure_clean(pname) as path: for engine in [None, 'python']: + w = FutureWarning if engine == 'python' else None + dt_index = self.tsframe.index datetime_frame = DataFrame({'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) - datetime_frame.to_csv(path, date_format='%Y%m%d', engine=engine) + with tm.assert_produces_warning(w, check_stacklevel=False): + datetime_frame.to_csv(path, date_format='%Y%m%d', engine=engine) + # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -15210,7 +15321,9 @@ def test_to_csv_date_format(self): assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + with tm.assert_produces_warning(w, check_stacklevel=False): + datetime_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_str = datetime_frame.applymap(lambda x: x.strftime('%Y-%m-%d')) @@ -15221,7 +15334,8 @@ def test_to_csv_date_format(self): 
# Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format='%Y%m%d', engine=engine) + with tm.assert_produces_warning(w, check_stacklevel=False): + datetime_frame_columns.to_csv(path, date_format='%Y%m%d', engine=engine) test = read_csv(path, index_col=0) @@ -15235,7 +15349,8 @@ def test_to_csv_date_format(self): nat_index = to_datetime(['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) nat_frame = DataFrame({'A': nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + with tm.assert_produces_warning(w, check_stacklevel=False): + nat_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) test = read_csv(path, parse_dates=[0, 1], index_col=0) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 061382e0e16de..d29673e96ecdd 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -39,8 +39,7 @@ class Generic(object): _multiprocess_can_split_ = True def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) + pass @property def _ndim(self): diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index b2d8ff8ba0b00..b85f4628ae013 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -2689,6 +2689,18 @@ def test_line_colors(self): self._check_colors(ax.get_lines(), linecolors=['red'] * 5) tm.close() + # GH 10299 + custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF'] + ax = df.plot(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + tm.close() + + with tm.assertRaises(ValueError): + # Color contains shorthand hex value results in ValueError + custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] + # Forced show plot + _check_plot_works(df.plot, color=custom_colors) + @slow def test_line_colors_and_styles_subplots(self): # GH 9894 @@ -2725,6 +2737,20 @@ def test_line_colors_and_styles_subplots(self): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() + # GH 10299 + custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF'] + axes = df.plot(color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + with tm.assertRaises(ValueError): + # Color contains shorthand hex value results in ValueError + custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] + # Forced show plot + _check_plot_works(df.plot, color=custom_colors, subplots=True, + filterwarnings='ignore') + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) for cmap in ['jet', cm.jet]: axes = df.plot(colormap=cmap, subplots=True) @@ -3143,6 +3169,7 @@ def test_pie_df_nan(self): ax.get_legend().get_texts()], base_expected[:i] + base_expected[i+1:]) + @slow def test_errorbar_plot(self): d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} df = DataFrame(d) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8eb641ce8f494..46026a4c887a6 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1655,6 +1655,7 @@ def check_nunique(df, keys): check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) + @slow def test_series_groupby_value_counts(self): from itertools import product diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 90f85b3f4576d..a2d789aaf8b70 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ 
-25,6 +25,7 @@ import pandas.util.testing as tm from pandas import date_range +from numpy.testing.decorators import slow _verbose = False @@ -1689,74 +1690,71 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): _ = df.loc[(0,)] + @slow def test_multiindex_get_loc(self): # GH7724, GH2646 - # ignore the warning here - warnings.simplefilter('ignore', PerformanceWarning) + with warnings.catch_warnings(record=True): - # test indexing into a multi-index before & past the lexsort depth - from numpy.random import randint, choice, randn - cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] + # test indexing into a multi-index before & past the lexsort depth + from numpy.random import randint, choice, randn + cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] - def validate(mi, df, key): - mask = np.ones(len(df)).astype('bool') + def validate(mi, df, key): + mask = np.ones(len(df)).astype('bool') - # test for all partials of this key - for i, k in enumerate(key): - mask &= df.iloc[:, i] == k + # test for all partials of this key + for i, k in enumerate(key): + mask &= df.iloc[:, i] == k - if not mask.any(): - self.assertNotIn(key[:i+1], mi.index) - continue - - self.assertIn(key[:i+1], mi.index) - right = df[mask].copy() + if not mask.any(): + self.assertNotIn(key[:i+1], mi.index) + continue - if i + 1 != len(key): # partial key - right.drop(cols[:i+1], axis=1, inplace=True) - right.set_index(cols[i+1:-1], inplace=True) - assert_frame_equal(mi.loc[key[:i+1]], right) + self.assertIn(key[:i+1], mi.index) + right = df[mask].copy() - else: # full key - right.set_index(cols[:-1], inplace=True) - if len(right) == 1: # single hit - right = Series(right['jolia'].values, - name=right.index[0], index=['jolia']) - assert_series_equal(mi.loc[key[:i+1]], right) - else: # multi hit + if i + 1 != len(key): # partial key + right.drop(cols[:i+1], axis=1, inplace=True) + right.set_index(cols[i+1:-1], inplace=True) assert_frame_equal(mi.loc[key[:i+1]], right) - def loop(mi, df, keys): - for key in keys: - validate(mi, df, key) - - n, m = 1000, 50 - - vals = [randint(0, 10, n), choice(list('abcdefghij'), n), - choice(pd.date_range('20141009', periods=10).tolist(), n), - choice(list('ZYXWVUTSRQ'), n), randn(n)] - vals = list(map(tuple, zip(*vals))) - - # bunch of keys for testing - keys = [randint(0, 11, m), choice(list('abcdefghijk'), m), - choice(pd.date_range('20141009', periods=11).tolist(), m), - choice(list('ZYXWVUTSRQP'), m)] - keys = list(map(tuple, zip(*keys))) - keys += list(map(lambda t: t[:-1], vals[::n//m])) - - # covers both unique index and non-unique index - df = pd.DataFrame(vals, columns=cols) - a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) - - for frame in a, b: - for i in range(5): # lexsort depth - df = frame.copy() if i == 0 else frame.sort_values(by=cols[:i]) - mi = df.set_index(cols[:-1]) - assert not mi.index.lexsort_depth < i - loop(mi, df, keys) - - # restore - warnings.simplefilter('always', PerformanceWarning) + else: # full key + right.set_index(cols[:-1], inplace=True) + if len(right) == 1: # single hit + right = Series(right['jolia'].values, + name=right.index[0], index=['jolia']) + assert_series_equal(mi.loc[key[:i+1]], right) + else: # multi hit + assert_frame_equal(mi.loc[key[:i+1]], right) + + def loop(mi, df, keys): + for key in keys: + validate(mi, df, key) + + n, m = 1000, 50 + + vals = [randint(0, 10, n), choice(list('abcdefghij'), n), + choice(pd.date_range('20141009', periods=10).tolist(), n), + choice(list('ZYXWVUTSRQ'), 
n), randn(n)] + vals = list(map(tuple, zip(*vals))) + + # bunch of keys for testing + keys = [randint(0, 11, m), choice(list('abcdefghijk'), m), + choice(pd.date_range('20141009', periods=11).tolist(), m), + choice(list('ZYXWVUTSRQP'), m)] + keys = list(map(tuple, zip(*keys))) + keys += list(map(lambda t: t[:-1], vals[::n//m])) + + # covers both unique index and non-unique index + df = pd.DataFrame(vals, columns=cols) + a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) + + for frame in a, b: + for i in range(5): # lexsort depth + df = frame.copy() if i == 0 else frame.sort_values(by=cols[:i]) + mi = df.set_index(cols[:-1]) + assert not mi.index.lexsort_depth < i + loop(mi, df, keys) def test_series_getitem_multiindex(self): @@ -4653,6 +4651,7 @@ def test_indexing_dtypes_on_empty(self): assert_series_equal(df2.loc[:,'a'], df2.iloc[:,0]) assert_series_equal(df2.loc[:,'a'], df2.ix[:,0]) + @slow def test_large_dataframe_indexing(self): #GH10692 result = DataFrame({'x': range(10**6)},dtype='int64') @@ -4660,6 +4659,7 @@ def test_large_dataframe_indexing(self): expected = DataFrame({'x': range(10**6 + 1)},dtype='int64') assert_frame_equal(result, expected) + @slow def test_large_mi_dataframe_indexing(self): #GH10645 result = MultiIndex.from_arrays([range(10**6), range(10**6)]) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 00553102e172f..fbab0d2a92203 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -306,7 +306,7 @@ def test_try_coerce_arg(self): block = create_block('datetime', [0]) # coerce None - none_coerced = block._try_coerce_args(block.values, None)[1] + none_coerced = block._try_coerce_args(block.values, None)[2] self.assertTrue(pd.Timestamp(none_coerced) is pd.NaT) # coerce different types of date bojects @@ -314,7 +314,7 @@ def test_try_coerce_arg(self): datetime(2010, 10, 10), date(2010, 10, 10)) for val in vals: - coerced = block._try_coerce_args(block.values, val)[1] + coerced = block._try_coerce_args(block.values, val)[2] self.assertEqual(np.int64, type(coerced)) self.assertEqual(pd.Timestamp('2010-10-10'), pd.Timestamp(coerced)) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index cfc98f5c20360..a24f71482c404 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -161,6 +161,19 @@ def test_maybe_indices_to_slice_middle(self): self.assert_numpy_array_equal(maybe_slice, indices) self.assert_numpy_array_equal(target[indices], target[maybe_slice]) + def test_isinf_scalar(self): + #GH 11352 + self.assertTrue(lib.isposinf_scalar(float('inf'))) + self.assertTrue(lib.isposinf_scalar(np.inf)) + self.assertFalse(lib.isposinf_scalar(-np.inf)) + self.assertFalse(lib.isposinf_scalar(1)) + self.assertFalse(lib.isposinf_scalar('a')) + + self.assertTrue(lib.isneginf_scalar(float('-inf'))) + self.assertTrue(lib.isneginf_scalar(-np.inf)) + self.assertFalse(lib.isneginf_scalar(np.inf)) + self.assertFalse(lib.isneginf_scalar(1)) + self.assertFalse(lib.isneginf_scalar('a')) class Testisscalar(tm.TestCase): @@ -232,4 +245,4 @@ def test_lisscalar_pandas_containers(self): import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) \ No newline at end of file + exit=False) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index df61387734cb3..5b00ea163d85f 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -28,8 +28,6 @@ class TestMultiLevel(tm.TestCase): _multiprocess_can_split_ = True 
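A recurring cleanup in this patch swaps process-global warning filters (warnings.simplefilter / warnings.filterwarnings, which required a manual "restore" step and leaked suppression into unrelated tests) for scoped warnings.catch_warnings blocks whose filter state is undone automatically on exit, as in the setUp change just below. A minimal standalone sketch of the pattern (the warning message and helper function are illustrative, not taken from the patch):

    import warnings

    def noisy():
        warnings.warn("indexing past lexsort depth", UserWarning)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")   # scoped: reverted when the block exits
        noisy()                           # captured in `caught`, not printed
    assert issubclass(caught[-1].category, UserWarning)

    noisy()  # outside the block, the original filter state applies again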
def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 6d6c289a6dfa6..b9db95fe06a43 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -3,6 +3,7 @@ from functools import partial +import warnings import numpy as np from pandas import Series from pandas.core.common import isnull, is_integer_dtype @@ -135,7 +136,7 @@ def _coerce_tds(targ, res): return targ, res try: - if axis != 0 and hasattr(targ, 'shape') and targ.ndim: + if axis != 0 and hasattr(targ, 'shape') and targ.ndim and targ.shape != res.shape: res = np.split(res, [targ.shape[0]], axis=0)[0] except: targ, res = _coerce_tds(targ, res) @@ -364,10 +365,11 @@ def test_returned_dtype(self): "return dtype expected from %s is %s, got %s instead" % (method, dtype, result.dtype)) def test_nanmedian(self): - self.check_funs(nanops.nanmedian, np.median, - allow_complex=False, allow_str=False, allow_date=False, - allow_tdelta=True, - allow_obj='convert') + with warnings.catch_warnings(record=True): + self.check_funs(nanops.nanmedian, np.median, + allow_complex=False, allow_str=False, allow_date=False, + allow_tdelta=True, + allow_obj='convert') def test_nanvar(self): self.check_funs_ddof(nanops.nanvar, np.var, diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 0dad55a9133b6..1f8bcf8c9879f 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -5,6 +5,7 @@ from inspect import getargspec import operator import nose +from functools import wraps import numpy as np import pandas as pd @@ -17,6 +18,7 @@ import pandas.core.common as com from pandas import compat from pandas.compat import range, lrange, StringIO, OrderedDict +from pandas import SparsePanel from pandas.util.testing import (assert_panel_equal, assert_frame_equal, @@ -31,6 +33,22 @@ import pandas.core.panel as panelm import pandas.util.testing as tm +def ignore_sparse_panel_future_warning(func): + """ + decorator to ignore FutureWarning if we have a SparsePanel + + can be removed when SparsePanel is fully removed + """ + @wraps(func) + def wrapper(self, *args, **kwargs): + + if isinstance(self.panel, SparsePanel): + with assert_produces_warning(FutureWarning, check_stacklevel=False): + return func(self, *args, **kwargs) + else: + return func(self, *args, **kwargs) + + return wrapper class PanelTests(object): panel = None @@ -56,6 +74,7 @@ class SafeForLongAndSparse(object): def test_repr(self): foo = repr(self.panel) + @ignore_sparse_panel_future_warning def test_copy_names(self): for attr in ('major_axis', 'minor_axis'): getattr(self.panel, attr).name = None @@ -233,6 +252,7 @@ def test_get_plane_axes(self): index, columns = self.panel._get_plane_axes('minor_axis') index, columns = self.panel._get_plane_axes(0) + @ignore_sparse_panel_future_warning def test_truncate(self): dates = self.panel.major_axis start, end = dates[1], dates[5] @@ -293,6 +313,7 @@ def test_iteritems(self): self.assertEqual(len(list(compat.iteritems(self.panel))), len(self.panel.items)) + @ignore_sparse_panel_future_warning def test_combineFrame(self): def check_op(op, name): # items @@ -321,7 +342,7 @@ def check_op(op, name): assert_frame_equal(result.minor_xs(idx), op(self.panel.minor_xs(idx), xs)) - from pandas import SparsePanel + ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] if not compat.PY3: ops.append('div') @@ -348,16 +369,18 
@@ def check_op(op, name): com.pprint_thing("Failing operation: %r" % name) raise + @ignore_sparse_panel_future_warning def test_combinePanel(self): result = self.panel.add(self.panel) self.assert_panel_equal(result, self.panel * 2) + @ignore_sparse_panel_future_warning def test_neg(self): self.assert_panel_equal(-self.panel, self.panel * -1) # issue 7692 def test_raise_when_not_implemented(self): - p = Panel(np.arange(3*4*5).reshape(3,4,5), items=['ItemA','ItemB','ItemC'], + p = Panel(np.arange(3*4*5).reshape(3,4,5), items=['ItemA','ItemB','ItemC'], major_axis=pd.date_range('20130101',periods=4),minor_axis=list('ABCDE')) d = p.sum(axis=1).ix[0] ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'div', 'mod', 'pow'] @@ -365,6 +388,7 @@ def test_raise_when_not_implemented(self): with self.assertRaises(NotImplementedError): getattr(p,op)(d, axis=0) + @ignore_sparse_panel_future_warning def test_select(self): p = self.panel @@ -396,7 +420,9 @@ def test_get_value(self): expected = self.panel[item][mnr][mjr] assert_almost_equal(result, expected) + @ignore_sparse_panel_future_warning def test_abs(self): + result = self.panel.abs() result2 = abs(self.panel) expected = np.abs(self.panel) @@ -872,9 +898,6 @@ def assert_panel_equal(cls, x, y): assert_panel_equal(x, y) def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) - self.panel = _panel.copy() self.panel.major_axis.name = None self.panel.minor_axis.name = None @@ -1534,6 +1557,7 @@ def test_transpose_copy(self): panel.values[0, 1, 1] = np.nan self.assertTrue(notnull(result.values[1, 0, 1])) + @ignore_sparse_panel_future_warning def test_to_frame(self): # filtered filtered = self.panel.to_frame() @@ -2313,6 +2337,7 @@ def test_to_string(self): buf = StringIO() self.panel.to_string(buf) + @ignore_sparse_panel_future_warning def test_truncate(self): dates = self.panel.index.levels[0] start, end = dates[1], dates[5] diff --git a/pandas/tests/test_rplot.py b/pandas/tests/test_rplot.py index e79acfcbc58d8..4342417db193b 100644 --- a/pandas/tests/test_rplot.py +++ b/pandas/tests/test_rplot.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- from pandas.compat import range -import pandas.tools.rplot as rplot import pandas.util.testing as tm from pandas import read_csv import os - import nose +with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + import pandas.tools.rplot as rplot def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 9c86c3f894c67..5ce25f5d93800 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4032,6 +4032,21 @@ def test_datetime64_tz_fillna(self): Timestamp('2011-01-04 10:00', tz=tz)]) self.assert_series_equal(expected, result) + # filling with a naive/other zone, coerce to object + result = s.fillna(Timestamp('20130101')) + expected = Series([Timestamp('2011-01-01 10:00', tz=tz), + Timestamp('2013-01-01'), + Timestamp('2011-01-03 10:00', tz=tz), + Timestamp('2013-01-01')]) + self.assert_series_equal(expected, result) + + result = s.fillna(Timestamp('20130101',tz='US/Pacific')) + expected = Series([Timestamp('2011-01-01 10:00', tz=tz), + Timestamp('2013-01-01',tz='US/Pacific'), + Timestamp('2011-01-03 10:00', tz=tz), + Timestamp('2013-01-01',tz='US/Pacific')]) + self.assert_series_equal(expected, result) + def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) s.fillna(method='ffill', inplace=True) @@ -4269,6 +4284,43 @@ def 
test_object_comparisons(self): expected = -(s == 'a') assert_series_equal(result, expected) + def test_comparison_tuples(self): + # GH11339 + # comparisons vs tuple + s = Series([(1,1),(1,2)]) + + result = s == (1,2) + expected = Series([False,True]) + assert_series_equal(result, expected) + + result = s != (1,2) + expected = Series([True, False]) + assert_series_equal(result, expected) + + result = s == (0,0) + expected = Series([False, False]) + assert_series_equal(result, expected) + + result = s != (0,0) + expected = Series([True, True]) + assert_series_equal(result, expected) + + s = Series([(1,1),(1,1)]) + + result = s == (1,1) + expected = Series([True, True]) + assert_series_equal(result, expected) + + result = s != (1,1) + expected = Series([False, False]) + assert_series_equal(result, expected) + + s = Series([frozenset([1]),frozenset([1,2])]) + + result = s == frozenset([1]) + expected = Series([True, False]) + assert_series_equal(result, expected) + def test_comparison_operators_with_nas(self): s = Series(bdate_range('1/1/2000', periods=10), dtype=object) s[::2] = np.nan @@ -5117,7 +5169,6 @@ def test_dropna_empty(self): # invalid axis self.assertRaises(ValueError, s.dropna, axis=1) - def test_datetime64_tz_dropna(self): # DatetimeBlock s = Series([Timestamp('2011-01-01 10:00'), pd.NaT, @@ -5140,6 +5191,18 @@ def test_datetime64_tz_dropna(self): self.assertEqual(result.dtype, 'datetime64[ns, Asia/Tokyo]') self.assert_series_equal(result, expected) + def test_dropna_no_nan(self): + for s in [Series([1, 2, 3], name='x'), + Series([False, True, False], name='x')]: + + result = s.dropna() + self.assert_series_equal(result, s) + self.assertFalse(result is s) + + s2 = s.copy() + s2.dropna(inplace=True) + self.assert_series_equal(s2, s) + def test_axis_alias(self): s = Series([1, 2, np.nan]) assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index')) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 89fe9463282b6..de7a5f5a73f3d 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -189,7 +189,13 @@ def _add_margins(table, data, values, rows, cols, aggfunc): margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names - result = result.append(margin_dummy) + try: + result = result.append(margin_dummy) + except TypeError: + + # we cannot reshape, so coerce the axis + result.index = result.index._to_safe_for_reshape() + result = result.append(margin_dummy) result.index.names = row_names return result @@ -218,6 +224,7 @@ def _compute_grand_margin(data, values, aggfunc): def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin): + if len(cols) > 0: # need to "interleave" the margins table_pieces = [] @@ -235,7 +242,13 @@ def _all_key(key): # we are going to mutate this, so need to copy! 
piece = piece.copy() - piece[all_key] = margin[key] + try: + piece[all_key] = margin[key] + except TypeError: + + # we cannot reshape, so coerce the axis + piece.set_axis(cat_axis, piece._get_axis(cat_axis)._to_safe_for_reshape()) + piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 929a72cfd4adc..b555a7dc2b3a1 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -20,6 +20,7 @@ from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv import pandas.algos as algos import pandas.util.testing as tm +from numpy.testing.decorators import slow a_ = np.array @@ -1410,6 +1411,7 @@ def test_merge_na_keys(self): tm.assert_frame_equal(result, expected) + @slow def test_int64_overflow_issues(self): from itertools import product from collections import defaultdict diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 50ae574c03067..f0052774d66a2 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -719,6 +719,26 @@ def test_crosstab_dropna(self): ('two', 'dull'), ('two', 'shiny')]) assert_equal(res.columns.values, m.values) + def test_categorical_margins(self): + # GH 10989 + df = pd.DataFrame({'x': np.arange(8), + 'y': np.arange(8) // 4, + 'z': np.arange(8) % 2}) + + expected = pd.DataFrame([[1.0, 2.0, 1.5],[5, 6, 5.5],[3, 4, 3.5]]) + expected.index = Index([0,1,'All'],name='y') + expected.columns = Index([0,1,'All'],name='z') + + data = df.copy() + table = data.pivot_table('x', 'y', 'z', margins=True) + tm.assert_frame_equal(table, expected) + + data = df.copy() + data.y = data.y.astype('category') + data.z = data.z.astype('category') + table = data.pivot_table('x', 'y', 'z', margins=True) + tm.assert_frame_equal(table, expected) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 814a9ccc45582..868057c675594 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -756,6 +756,8 @@ def astype(self, dtype): return self.asi8.copy() elif dtype == _NS_DTYPE and self.tz is not None: return self.tz_convert('UTC').tz_localize(None) + elif dtype == str: + return self._shallow_copy(values=self.format(), infer=True) else: # pragma: no cover raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 24edc54582ec1..4d353eccba972 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -45,6 +45,32 @@ def test_ops_properties_basic(self): self.assertEqual(s.day,10) self.assertRaises(AttributeError, lambda : s.weekday) + def test_astype_str(self): + # test astype string - #10442 + result = date_range('2012-01-01', periods=4, name='test_name').astype(str) + expected = Index(['2012-01-01', '2012-01-02', '2012-01-03','2012-01-04'], + name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with tz and name + result = date_range('2012-01-01', periods=3, name='test_name', tz='US/Eastern').astype(str) + expected = Index(['2012-01-01 00:00:00-05:00', '2012-01-02 00:00:00-05:00', + '2012-01-03 00:00:00-05:00'], name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and name + result = 
date_range('1/1/2011', periods=3, freq='H', name='test_name').astype(str) + expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', '2011-01-01 02:00:00'], + name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and timezone + result = date_range('3/6/2012 00:00', periods=2, freq='H', + tz='Europe/London', name='test_name').astype(str) + expected = Index(['2012-03-06 00:00:00+00:00', '2012-03-06 01:00:00+00:00'], + dtype=object, name='test_name') + tm.assert_index_equal(result, expected) + def test_asobject_tolist(self): idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') expected_list = [pd.Timestamp('2013-01-31'), pd.Timestamp('2013-02-28'), @@ -503,7 +529,6 @@ def test_infer_freq(self): tm.assert_index_equal(idx, result) self.assertEqual(result.freq, freq) - class TestTimedeltaIndexOps(Ops): def setUp(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index a80bdf970cccb..230016f00374f 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2223,6 +2223,7 @@ def test_append_join_nondatetimeindex(self): # it works rng.join(idx, how='outer') + def test_astype(self): rng = date_range('1/1/2000', periods=10) @@ -2235,6 +2236,17 @@ def test_astype(self): expected = date_range('1/1/2000', periods=10, tz='US/Eastern').tz_convert('UTC').tz_localize(None) tm.assert_index_equal(result, expected) + # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex + result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) + expected = pd.Series(['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(pd.date_range('2012-01-01', periods=3, tz='US/Eastern')).astype(str) + expected = Series(['2012-01-01 00:00:00-05:00', '2012-01-02 00:00:00-05:00', '2012-01-03 00:00:00-05:00'], + dtype=object) + tm.assert_series_equal(result, expected) + + def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) self.assertRaises(ValueError, idx.to_period) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 398c5f0232de1..8e6d4019c69a3 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3849,6 +3849,7 @@ def get_time_micros(ndarray[int64_t] dtindex): @cython.wraparound(False) +@cython.boundscheck(False) def get_date_field(ndarray[int64_t] dtindex, object field): ''' Given a int64-based datetime index, extract the year, month, etc., @@ -3872,130 +3873,142 @@ def get_date_field(ndarray[int64_t] dtindex, object field): out = np.empty(count, dtype='i4') if field == 'Y': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.year + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.year return out elif field == 'M': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.month + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.month return out elif field == 'D': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if 
dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.day + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.day return out elif field == 'h': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.hour + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.hour return out elif field == 'm': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.min + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.min return out elif field == 's': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.sec + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.sec return out elif field == 'us': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.us + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.us return out elif field == 'ns': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.ps / 1000 + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.ps / 1000 return out elif field == 'doy': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - isleap = is_leapyear(dts.year) - out[i] = _month_offset[isleap, dts.month-1] + dts.day + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + isleap = is_leapyear(dts.year) + out[i] = _month_offset[isleap, dts.month-1] + dts.day return out elif field == 'dow': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - ts = convert_to_tsobject(dtindex[i], None, None) - out[i] = ts_dayofweek(ts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dayofweek(dts.year, dts.month, dts.day) return out elif field == 'woy': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue - - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None) - isleap = is_leapyear(dts.year) - isleap_prev = is_leapyear(dts.year - 1) - mo_off = _month_offset[isleap, dts.month - 1] - doy = mo_off + dts.day - dow = ts_dayofweek(ts) - - #estimate - woy = (doy - 1) - dow + 3 - if woy >= 0: - woy = woy / 7 + 1 - - # verify - if woy < 0: - if (woy > -2) or (woy == -2 and isleap_prev): - woy = 53 - else: - woy 
= 52 - elif woy == 53: - if 31 - dts.day + dow < 3: - woy = 1 + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + isleap = is_leapyear(dts.year) + isleap_prev = is_leapyear(dts.year - 1) + mo_off = _month_offset[isleap, dts.month - 1] + doy = mo_off + dts.day + dow = dayofweek(dts.year, dts.month, dts.day) + + #estimate + woy = (doy - 1) - dow + 3 + if woy >= 0: + woy = woy / 7 + 1 + + # verify + if woy < 0: + if (woy > -2) or (woy == -2 and isleap_prev): + woy = 53 + else: + woy = 52 + elif woy == 53: + if 31 - dts.day + dow < 3: + woy = 1 - out[i] = woy + out[i] = woy return out elif field == 'q': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = dts.month - out[i] = ((out[i] - 1) / 3) + 1 + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.month + out[i] = ((out[i] - 1) / 3) + 1 return out elif field == 'dim': - for i in range(count): - if dtindex[i] == NPY_NAT: out[i] = -1; continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - out[i] = monthrange(dts.year, dts.month)[1] + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = days_in_month(dts) return out raise ValueError("Field %s not supported" % field) @@ -4239,12 +4252,13 @@ def date_normalize(ndarray[int64_t] stamps, tz=None): tz = maybe_get_tz(tz) result = _normalize_local(stamps, tz) else: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) - result[i] = _normalized_stamp(&dts) + with nogil: + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + result[i] = _normalized_stamp(&dts) return result @@ -4256,12 +4270,13 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): pandas_datetimestruct dts if _is_utc(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) - result[i] = _normalized_stamp(&dts) + with nogil: + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + result[i] = _normalized_stamp(&dts) elif _is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: @@ -4304,7 +4319,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): return result -cdef inline int64_t _normalized_stamp(pandas_datetimestruct *dts): +cdef inline int64_t _normalized_stamp(pandas_datetimestruct *dts) nogil: dts.hour = 0 dts.min = 0 dts.sec = 0 @@ -4369,6 +4384,8 @@ def monthrange(int64_t year, int64_t month): cdef inline int64_t ts_dayofweek(_TSObject ts): return dayofweek(ts.dts.year, ts.dts.month, ts.dts.day) +cdef inline int days_in_month(pandas_datetimestruct dts) nogil: + return days_per_month_table[is_leapyear(dts.year)][dts.month-1] cpdef normalize_date(object dt): ''' @@ -4388,17 +4405,18 @@ cpdef normalize_date(object dt): cdef inline int _year_add_months(pandas_datetimestruct dts, - int months): + int months) nogil: '''new year number after shifting pandas_datetimestruct number of 
months''' return dts.year + (dts.month + months - 1) / 12 cdef inline int _month_add_months(pandas_datetimestruct dts, - int months): + int months) nogil: '''new month number after shifting pandas_datetimestruct number of months''' cdef int new_month = (dts.month + months) % 12 return 12 if new_month == 0 else new_month @cython.wraparound(False) +@cython.boundscheck(False) def shift_months(int64_t[:] dtindex, int months, object day=None): ''' Given an int64-based datetime index, shift all elements @@ -4411,24 +4429,26 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): ''' cdef: Py_ssize_t i - int days_in_month pandas_datetimestruct dts int count = len(dtindex) + cdef int days_in_current_month int64_t[:] out = np.empty(count, dtype='int64') - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - else: - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - - if day is None: + if day is None: + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) dts.year = _year_add_months(dts, months) dts.month = _month_add_months(dts, months) - #prevent day from wrapping around month end - days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1] - dts.day = min(dts.day, days_in_month) - elif day == 'start': + + dts.day = min(dts.day, days_in_month(dts)) + out[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + elif day == 'start': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) dts.year = _year_add_months(dts, months) dts.month = _month_add_months(dts, months) @@ -4439,21 +4459,28 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): dts.month = _month_add_months(dts, -1) else: dts.day = 1 - elif day == 'end': - days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1] + out[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + elif day == 'end': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + days_in_current_month = days_in_month(dts) + dts.year = _year_add_months(dts, months) dts.month = _month_add_months(dts, months) # similar semantics - when adding shift forward by one # month if already at an end of month - if months >= 0 and dts.day == days_in_month: + if months >= 0 and dts.day == days_in_current_month: dts.year = _year_add_months(dts, 1) dts.month = _month_add_months(dts, 1) - days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1] - dts.day = days_in_month + dts.day = days_in_month(dts) + out[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + else: + raise ValueError("day must be None, 'start' or 'end'") - out[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) return np.asarray(out) #---------------------------------------------------------------------- diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 362351c7c31c2..a278c4d0f9045 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -59,7 +59,6 @@ def reset_testing_mode(): if 'deprecate' in testing_mode: warnings.simplefilter('ignore', DeprecationWarning) - set_testing_mode() class TestCase(unittest.TestCase): @@ -255,6 +254,23 @@ def _skip_if_python26(): import nose raise nose.SkipTest("skipping on python2.6") + +def 
_skip_if_no_pathlib(): + try: + from pathlib import Path + except ImportError: + import nose + raise nose.SkipTest("pathlib not available") + + +def _skip_if_no_localpath(): + try: + from py.path import local as LocalPath + except ImportError: + import nose + raise nose.SkipTest("py.path not installed") + + def _incompat_bottleneck_version(method): """ skip if we have bottleneck installed and its >= 1.0 @@ -1958,7 +1974,6 @@ def handle_success(self, exc_type, exc_value, traceback): raise_with_traceback(e, traceback) return True - @contextmanager def assert_produces_warning(expected_warning=Warning, filter_level="always", clear=None, check_stacklevel=True): @@ -2005,6 +2020,7 @@ def assert_produces_warning(expected_warning=Warning, filter_level="always", warnings.simplefilter(filter_level) yield w extra_warnings = [] + for actual_warning in w: if (expected_warning and issubclass(actual_warning.category, expected_warning)):
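The two skip helpers above gate tests for the new path-object support in the pd.read_* readers: when the optional dependency is absent, the test is skipped via nose rather than failed. A hedged sketch of how such a gated test might look (the test name, file name, and body are illustrative, not part of the patch):

    import pandas as pd
    import pandas.util.testing as tm

    def test_read_csv_accepts_pathlib_path():
        tm._skip_if_no_pathlib()
        from pathlib import Path

        df = pd.DataFrame({'a': [1, 2, 3]})
        with tm.ensure_clean('pathlib.csv') as path:
            df.to_csv(path, index=False)
            # a pathlib.Path is now accepted anywhere a string path was
            result = pd.read_csv(Path(path))
            tm.assert_frame_equal(result, df)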