diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6f91eba1ad239..c7d731249f9cf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -10,6 +10,6 @@ #### Output of ``pd.show_versions()``
-# Paste the output here
+# Paste the output of ``pd.show_versions()`` here
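For context, the output the template asks for comes from a single call; a minimal sketch:

    >>> import pandas as pd
    >>> pd.show_versions()  # prints pandas and dependency versions to paste above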
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index 0f9689dadcbb0..2e394ed4268f3 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -148,12 +148,12 @@ class to_numeric(object):
     N = 500000

     data_dict = {
-        'string-int': (['1'] * (N / 2)) + ([2] * (N / 2)),
-        'string-nint': (['-1'] * (N / 2)) + ([2] * (N / 2)),
+        'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)),
+        'string-nint': (['-1'] * (N // 2)) + ([2] * (N // 2)),
         'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
                                          dtype='datetime64[D]'), N),
-        'string-float': (['1.1'] * (N / 2)) + ([2] * (N / 2)),
-        'int-list': ([1] * (N / 2)) + ([2] * (N / 2)),
+        'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)),
+        'int-list': ([1] * (N // 2)) + ([2] * (N // 2)),
         'int32': np.repeat(np.int32(1), N)
     }
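The ``//`` change above is what makes these benchmarks run under Python 3, where ``/`` is true division and multiplying a list by a float raises. A minimal illustration (plain Python, nothing pandas-specific):

    N = 500000

    # Python 2: N / 2 is an int, so list repetition works there.
    # Python 3: N / 2 is 250000.0, and sequence repetition raises
    #     TypeError: can't multiply sequence by non-int of type 'float'
    # ['1'] * (N / 2)

    ['1'] * (N // 2)  # floor division yields an int on both versions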
diff --git a/ci/lint.sh b/ci/lint.sh
index a866b04445f96..115a2cdaf7899 100755
--- a/ci/lint.sh
+++ b/ci/lint.sh
@@ -7,16 +7,10 @@ source activate pandas
 RET=0

 if [ "$LINT" ]; then
-    echo "Linting"
-    for path in 'api' 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util'
-    do
-        echo "linting -> pandas/$path"
-        flake8 pandas/$path --filename '*.py'
-        if [ $? -ne "0" ]; then
-            RET=1
-        fi
-
-    done
+    # pandas/rpy is deprecated and will be removed.
+    # pandas/src is C code, so no need to search there.
+    echo "Linting *.py"
+    flake8 pandas --filename '*.py' --exclude pandas/rpy,pandas/src
     echo "Linting *.py DONE"

     echo "Linting *.pyx"
diff --git a/ci/requirements-3.4.build b/ci/requirements-3.4.build
index 4a4bd9d433428..e6e59dcba63fe 100644
--- a/ci/requirements-3.4.build
+++ b/ci/requirements-3.4.build
@@ -1,3 +1,3 @@
 numpy=1.8.1
-cython
+cython=0.24.1
 libgfortran=1.0
diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh
index 719db67f384e0..e404ca73a405e 100755
--- a/ci/travis_encrypt_gbq.sh
+++ b/ci/travis_encrypt_gbq.sh
@@ -1,11 +1,10 @@
 #!/bin/bash

 GBQ_JSON_FILE=$1
-GBQ_PROJECT_ID=$2

-if [[ $# -ne 2 ]]; then
+if [[ $# -ne 1 ]]; then
     echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\
-        "<gbq-json-credentials-file> <gbq-project-id>"
+        "<gbq-json-credentials-file>"
     exit 1
 fi

@@ -23,9 +22,9 @@ echo "Encrypting $GBQ_JSON_FILE..."
 read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \
 travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key");

-echo "Adding your secure key and project id to travis_gbq_config.txt ..."
-echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY\n"\
-"GBQ_PROJECT_ID='$GBQ_PROJECT_ID'" > travis_gbq_config.txt
+echo "Adding your secure key to travis_gbq_config.txt ..."
+echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\
+> travis_gbq_config.txt

 echo "Done. Removing file $GBQ_JSON_FILE"
 rm $GBQ_JSON_FILE
diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt
index 3b68d62f177cc..0b28cdedbd0d7 100644
--- a/ci/travis_gbq_config.txt
+++ b/ci/travis_gbq_config.txt
@@ -1,3 +1,2 @@
 TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv
 TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key
-GBQ_PROJECT_ID='pandas-travis'
diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh
index 7ff4c08f78e37..9967d40e49f0a 100755
--- a/ci/travis_process_gbq_encryption.sh
+++ b/ci/travis_process_gbq_encryption.sh
@@ -2,10 +2,12 @@

 source ci/travis_gbq_config.txt

-if [[ -n ${!TRAVIS_IV_ENV} ]]; then
+if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then
+    echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json;
+elif [[ -n ${!TRAVIS_IV_ENV} ]]; then
     openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \
         -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d;
-    export GBQ_PROJECT_ID=$GBQ_PROJECT_ID;
+    export GBQ_PROJECT_ID='pandas-travis';
     echo 'Successfully decrypted gbq credentials'
 fi
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index a8a47a9d979c0..44ee6223d5ee1 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -643,20 +643,22 @@ on Travis-CI and are only accessible from the pandas-dev/pandas repository. The
 credentials won't be available on forks of pandas. Here are the steps to run
 gbq integration tests on a forked repository:

-#. First, complete all the steps in the `Encrypting Files Prerequisites
-   <https://docs.travis-ci.com/user/encrypting-files/>`__ section.
-#. Sign into `Travis <https://travis-ci.org/>`__ using your GitHub account.
-#. Enable your forked repository of pandas for testing in `Travis
-   <https://travis-ci.org/>`__.
-#. Run the following command from terminal where the current working directory
-   is the ``ci`` folder::
-
-   ./travis_encrypt_gbq.sh <gbq-json-credentials-file> <gbq-project-id>
-
-#. Create a new branch from the branch used in your pull request. Commit the
-   encrypted file called ``travis_gbq.json.enc`` as well as the file
-   ``travis_gbq_config.txt``, in an otherwise empty commit. DO NOT commit the
-   ``*.json`` file which contains your unencrypted private key.
+#. Go to `Travis CI <https://travis-ci.org/>`__ and sign in with your GitHub
+   account.
+#. Click on the ``+`` icon next to the ``My Repositories`` list and enable
+   Travis builds for your fork.
+#. Click on the gear icon to edit your Travis build, and add two environment
+   variables:
+
+   - ``GBQ_PROJECT_ID`` with the value being the ID of your BigQuery project.
+
+   - ``SERVICE_ACCOUNT_KEY`` with the value being the contents of the JSON key
+     that you downloaded for your service account. Use single quotes around
+     your JSON key to ensure that it is treated as a string.
+
+   For both environment variables, keep the "Display value in build log" option
+   DISABLED. These variables contain sensitive data and you do not want their
+   contents being exposed in build logs.
 #. Your branch should be tested automatically once it is pushed. You can check
    the status by visiting your Travis branches page which exists at the following
    location: https://travis-ci.org/your-user-name/pandas/branches .
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index a37b1e89c7cc3..087b265ee83f2 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -143,7 +143,7 @@ both "column wise min/max and global min/max coloring."
 API
 -----

-`pandas-datareader `__
+`pandas-datareader `__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 ``pandas-datareader`` is a remote data access library for pandas.
 ``pandas.io`` from pandas < 0.17.0 has been refactored / split off into, and is
 importable from, ``pandas_datareader`` (PyPI: ``pandas-datareader``). Many/most
 of the supported APIs have at least a documentation paragraph in the
 `pandas-datareader docs <https://pandas-datareader.readthedocs.io/en/latest/>`_:
diff --git a/doc/source/io.rst b/doc/source/io.rst
index ae71587c8b46b..ba1bd328d2991 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2789,7 +2789,7 @@ both on the writing (serialization), and reading (deserialization).
    | 0.17 / Python 3      | >=0.18 / any Python    |
    +----------------------+------------------------+
    | 0.18                 | >= 0.18                |
-   +======================+========================+
+   +----------------------+------------------------+

 Reading (files packed by older versions) is backward-compatible, except for
 files packed with 0.17 in Python 2, in which case they can only be unpacked
 in Python 2.
diff --git a/doc/source/release.rst b/doc/source/release.rst
index d210065f04459..622e9a53ff8f0 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -37,6 +37,50 @@ analysis / manipulation tool available in any language.
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org

+
+pandas 0.19.1
+-------------
+
+**Release date:** November 3, 2016
+
+This is a minor bug-fix release from 0.19.0 and includes some small regression fixes,
+bug fixes and performance improvements.
+
+See the :ref:`v0.19.1 Whatsnew <whatsnew_0191>` page for an overview of all
+bugs that have been fixed in 0.19.1.
+
+Thanks
+~~~~~~
+
+- Adam Chainz
+- Anthonios Partheniou
+- Arash Rouhani
+- Ben Kandel
+- Brandon M. Burroughs
+- Chris
+- chris-b1
+- Chris Warth
+- David Krych
+- dubourg
+- gfyoung
+- Iván Vallés Pérez
+- Jeff Reback
+- Joe Jevnik
+- Jon M. Mease
+- Joris Van den Bossche
+- Josh Owen
+- Keshav Ramaswamy
+- Larry Ren
+- mattrijk
+- Michael Felt
+- paul-mannino
+- Piotr Chromiec
+- Robert Bradshaw
+- Sinhrks
+- Thiago Serafim
+- Tom Bird
+
+
 pandas 0.19.0
 -------------
diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt
index 0944d849cfafd..6ecd4b487c798 100644
--- a/doc/source/whatsnew/v0.13.0.txt
+++ b/doc/source/whatsnew/v0.13.0.txt
@@ -600,7 +600,7 @@ Enhancements
   .. ipython:: python

      t = Timestamp('20130101 09:01:02')
-     t + pd.datetools.Nano(123)
+     t + pd.tseries.offsets.Nano(123)

 - A new method, ``isin`` for DataFrames, which plays nicely with boolean indexing. The argument to ``isin``, what we're comparing the DataFrame to, can be a DataFrame, Series, dict, or array of values. See :ref:`the docs` for more.
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index 3ee4cc1dde92d..5022d9c12dd90 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -1,15 +1,12 @@
 .. _whatsnew_0191:

-v0.19.1 (????, 2016)
---------------------
+v0.19.1 (November 3, 2016)
+--------------------------

-This is a minor bug-fix release from 0.19.0 and includes a large number of
-bug fixes along with several new features, enhancements, and performance improvements.
+This is a minor bug-fix release from 0.19.0 and includes some small regression fixes,
+bug fixes and performance improvements.
 We recommend that all users upgrade to this version.

-Highlights include:
-
-
 .. contents:: What's new in v0.19.1
     :local:
     :backlinks: none

@@ -21,11 +18,10 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~

 - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
+- Fixed performance regression in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
+- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
 - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
-- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461)
-- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461)
-
-
+- Improved performance in certain types of ``loc`` indexing with a MultiIndex (:issue:`14551`).

 .. _whatsnew_0191.bug_fixes:

@@ -33,35 +29,27 @@ Performance Improvements
 Bug Fixes
 ~~~~~~~~~

-
-
-
+- Source installs from PyPI will now again work without ``cython`` installed, as in previous versions (:issue:`14204`)
+- Compat with Cython 0.25 for building (:issue:`14496`)
+- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`).
+- Fixed regression in ``DataFrame.quantile`` when missing values were present in some columns (:issue:`14357`).
+- Fixed regression in ``Index.difference`` where the ``freq`` of a ``DatetimeIndex`` was incorrectly set (:issue:`14323`)
+- Added back ``pandas.core.common.array_equivalent`` with a deprecation warning (:issue:`14555`).
+- Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`)
+- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
+- Fixed regression in ``Index.append`` when categorical indices were appended (:issue:`14545`).
+- Fixed regression in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`)
+- Fixed regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`).
 - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`)
 - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`)
-
-
-
 - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`)
 - Correctly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`)
-
-
 - Bug in ``RangeIndex.intersection`` when result is an empty set (:issue:`14364`).
-- Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`)
-
 - Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`)
-
-
 - Bug in ``Series.__setitem__`` which allowed mutating read-only arrays (:issue:`14359`).
-
-
-- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`)
 - Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`)
-
 - ``pd.merge()`` will raise ``ValueError`` when passed non-boolean values for boolean-type arguments (:issue:`14434`)
-
-
 - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`)
-
 - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
 - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`)
 - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`)
@@ -69,12 +57,6 @@ Bug Fixes
 - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
 - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
 - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
-
-
-
-
-
-
-
-
-- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`)
+- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
+  is not scalar and ``values`` is not specified (:issue:`14380`)
+- Enhancement in ``pandas.io.json.json_normalize``: added ``errors={'raise', 'ignore'}`` for keys not found in ``meta`` (:issue:`14505`)
\ No newline at end of file
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
index d4d8b7e4e9747..49aa31c375e25 100644
--- a/pandas/api/tests/test_api.py
+++ b/pandas/api/tests/test_api.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-

+import numpy as np
+
 import pandas as pd
 from pandas.core import common as com
 from pandas import api
@@ -184,6 +186,11 @@ def test_deprecation_core_common(self):
         for t in self.allowed:
             self.check_deprecation(getattr(com, t), getattr(types, t))

+    def test_deprecation_core_common_array_equivalent(self):
+
+        with tm.assert_produces_warning(DeprecationWarning):
+            com.array_equivalent(np.array([1, 2]), np.array([1, 2]))
+
     def test_deprecation_core_common_moved(self):

         # these are in pandas.types.common
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 341bd3b4cc845..295947bbc1166 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -64,6 +64,15 @@ def wrapper(*args, **kwargs):
     setattr(m, t, outer(t))


+# deprecate array_equivalent
+
+def array_equivalent(*args, **kwargs):
+    warnings.warn("'pandas.core.common.array_equivalent' is deprecated and "
+                  "is no longer public API", DeprecationWarning, stacklevel=2)
+    from pandas.types import missing
+    return missing.array_equivalent(*args, **kwargs)
+
+
 class PandasError(Exception):
     pass
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 037ab900e6150..8e18b65e80385 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4008,6 +4008,8 @@ def asfreq(self, freq, method=None, how=None, normalize=False):
         -------
         converted : type of caller

+        Notes
+        -----
         To learn more about the frequency strings, please see `this link
         <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
""" diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2a7f896e1b871..afddb86988970 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -6,7 +6,7 @@ import warnings import copy -from pandas.compat import( +from pandas.compat import ( zip, range, long, lzip, callable, map ) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11721a5bdac29..43beefffd448e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -6,7 +6,6 @@ from collections import defaultdict import numpy as np -from numpy import percentile as _quantile from pandas.core.base import PandasObject @@ -1147,8 +1146,9 @@ def get_result(other): def handle_error(): if raise_on_error: + # The 'detail' variable is defined in outer scope. raise TypeError('Could not operate %s with block values %s' % - (repr(other), str(detail))) + (repr(other), str(detail))) # noqa else: # return the values result = np.empty(values.shape, dtype='O') @@ -1315,16 +1315,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) - mask = isnull(self.values) - if not lib.isscalar(mask) and mask.any(): - # even though this could be a 2-d mask it appears - # as a 1-d result - mask = mask.reshape(values.shape) - result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) - values = _block_shape(values[~mask], ndim=self.ndim) - if self.ndim > 1: - values = values.reshape(result_shape) + def _nanpercentile1D(values, mask, q, **kw): + values = values[~mask] + + if len(values) == 0: + if is_scalar(q): + return self._na_value + else: + return np.array([self._na_value] * len(q), + dtype=values.dtype) + + return np.percentile(values, q, **kw) + + def _nanpercentile(values, q, axis, **kw): + + mask = isnull(self.values) + if not is_scalar(mask) and mask.any(): + if self.ndim == 1: + return _nanpercentile1D(values, mask, q, **kw) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return np.percentile(values, q, axis=axis, **kw) from pandas import Float64Index is_empty = values.shape[axis] == 0 @@ -1343,13 +1365,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: try: - result = _quantile(values, np.array(qs) * 100, - axis=axis, **kw) + result = _nanpercentile(values, np.array(qs) * 100, + axis=axis, **kw) except ValueError: # older numpies don't handle an array for q - result = [_quantile(values, q * 100, - axis=axis, **kw) for q in qs] + result = [_nanpercentile(values, q * 100, + axis=axis, **kw) for q in qs] result = np.array(result, copy=False) if self.ndim > 1: @@ -1368,7 +1390,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: result = np.array([self._na_value] * len(self)) else: - result = _quantile(values, qs * 100, axis=axis, **kw) + result = _nanpercentile(values, qs * 100, axis=axis, **kw) ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c6b13885dd01..188204d83d985 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2915,8 +2915,8 @@ def create_from_value(value, index, dtype): return subarr - # scalar like - if 
subarr.ndim == 0: + # scalar like, GH + if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 4d2dcd259e623..54eaf86315a88 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1464,13 +1464,13 @@ def append(self, other): names = set([obj.name for obj in to_concat]) name = None if len(names) > 1 else self.name - typs = _concat.get_dtype_kinds(to_concat) - - if 'category' in typs: - # if any of the to_concat is category + if self.is_categorical(): + # if calling index is category, don't check dtype of others from pandas.indexes.category import CategoricalIndex return CategoricalIndex._append_same_dtype(self, to_concat, name) + typs = _concat.get_dtype_kinds(to_concat) + if len(typs) == 1: return self._append_same_dtype(to_concat, name=name) return _concat._concat_index_asobject(to_concat, name=name) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index a9f452db69659..f9576d92d8a49 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1907,6 +1907,13 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): return np.array(labels == loc, dtype=bool) else: # sorted, so can return slice object -> view + try: + loc = labels.dtype.type(loc) + except TypeError: + # this occurs when loc is a slice (partial string indexing) + # but the TypeError raised by searchsorted in this case + # is catched in Index._has_valid_type() + pass i = labels.searchsorted(loc, side='left') j = labels.searchsorted(loc, side='right') return slice(i, j) diff --git a/pandas/io/json.py b/pandas/io/json.py index 1e258101a5d86..3b0930a36e199 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -22,10 +22,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False): - if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -53,7 +52,6 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', class Writer(object): - def __init__(self, obj, orient, date_format, double_precision, ensure_ascii, date_unit, default_handler=None): self.obj = obj @@ -123,32 +121,38 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, file. For file URLs, a host is expected. For instance, a local file could be ``file://localhost/path/to/table.json`` - orient + orient : string, + Indication of expected JSON string format. + Compatible JSON strings can be produced by ``to_json()`` with a + corresponding orient value. + The set of possible orients is: + + - ``'split'`` : dict like + ``{index -> [index], columns -> [columns], data -> [values]}`` + - ``'records'`` : list like + ``[{column -> value}, ... , {column -> value}]`` + - ``'index'`` : dict like ``{index -> {column -> value}}`` + - ``'columns'`` : dict like ``{column -> {index -> value}}`` + - ``'values'`` : just the values array + + The allowed and default values depend on the value + of the `typ` parameter. - * `Series` + * when ``typ == 'series'``, + - allowed orients are ``{'split','records','index'}`` - default is ``'index'`` - - allowed values are: ``{'split','records','index'}`` - The Series index must be unique for orient ``'index'``. 
- * `DataFrame` + * when ``typ == 'frame'``, + - allowed orients are ``{'split','records','index', + 'columns','values'}`` - default is ``'columns'`` - - allowed values are: {'split','records','index','columns','values'} - - The DataFrame index must be unique for orients 'index' and - 'columns'. - - The DataFrame columns must be unique for orients 'index', - 'columns', and 'records'. - - * The format of the JSON string - - - split : dict like - ``{index -> [index], columns -> [columns], data -> [values]}`` - - records : list like - ``[{column -> value}, ... , {column -> value}]`` - - index : dict like ``{index -> {column -> value}}`` - - columns : dict like ``{column -> {index -> value}}`` - - values : just the values array + - The DataFrame index must be unique for orients ``'index'`` and + ``'columns'``. + - The DataFrame columns must be unique for orients ``'index'``, + ``'columns'``, and ``'records'``. typ : type of object to recover (series or frame), default 'frame' dtype : boolean or dict, default True @@ -197,7 +201,48 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, Returns ------- - result : Series or DataFrame + result : Series or DataFrame, depending on the value of `typ`. + + See Also + -------- + DataFrame.to_json + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + + Encoding/decoding a Dataframe using ``'split'`` formatted JSON: + + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + >>> pd.read_json(_, orient='split') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + >>> pd.read_json(_, orient='index') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. 
+ + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + >>> pd.read_json(_, orient='records') + col 1 col 2 + 0 a b + 1 c d """ filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, @@ -244,7 +289,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class Parser(object): - _STAMP_UNITS = ('s', 'ms', 'us', 'ns') _MIN_STAMPS = { 's': long(31536000), @@ -445,8 +489,8 @@ def _parse_no_numpy(self): if orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + json, + precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: @@ -520,8 +564,8 @@ def _parse_no_numpy(self): elif orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + json, + precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": @@ -548,7 +592,6 @@ def _process_converter(self, f, filt=None): new_obj[i] = c if needs_new_obj: - # possibly handle dup columns new_obj = DataFrame(new_obj, index=self.obj.index) new_obj.columns = self.obj.columns @@ -581,9 +624,9 @@ def is_ok(col): col_lower = col.lower() if (col_lower.endswith('_at') or col_lower.endswith('_time') or - col_lower == 'modified' or - col_lower == 'date' or - col_lower == 'datetime' or + col_lower == 'modified' or + col_lower == 'date' or + col_lower == 'datetime' or col_lower.startswith('timestamp')): return True return False @@ -593,6 +636,7 @@ def is_ok(col): lambda col, c: ((self.keep_default_dates and is_ok(col)) or col in convert_dates)) + # --------------------------------------------------------------------- # JSON normalization routines @@ -676,7 +720,7 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, - record_prefix=None): + record_prefix=None, errors='raise'): """ "Normalize" semi-structured JSON data into a flat table @@ -693,6 +737,8 @@ def json_normalize(data, record_path=None, meta=None, If True, prefix records with dotted (?) path, e.g. 
        foo.bar.field if path to records is ['foo', 'bar']
    meta_prefix : string, default None
+   errors : {'raise', 'ignore'}, default 'raise'
+       * 'ignore' : will ignore ``KeyError`` if keys listed in ``meta`` are not always present

    Returns
    -------
@@ -728,6 +774,7 @@ def json_normalize(data, record_path=None, meta=None,
     4   Cuyahoga        1337  John Kasich    Ohio   OH

     """
+
     def _pull_field(js, spec):
         result = js
         if isinstance(spec, list):
@@ -792,7 +839,13 @@ def _recursive_extract(data, path, seen_meta, level=0):
                     if level + 1 > len(val):
                         meta_val = seen_meta[key]
                     else:
-                        meta_val = _pull_field(obj, val[level:])
+                        try:
+                            meta_val = _pull_field(obj, val[level:])
+                        except KeyError as e:
+                            if errors == 'ignore':
+                                meta_val = np.nan
+                            else:
+                                raise KeyError("Try running with errors='ignore' "
+                                               "as the following key may not "
+                                               "always be present: " + str(e))
                 meta_vals[key].append(meta_val)

             records.extend(recs)
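A short usage sketch of the ``errors='ignore'`` behaviour added above; the records below are illustrative only, and the import path matches the pandas 0.19.x layout:

    from pandas.io.json import json_normalize

    data = [
        {'name': 'A', 'info': {'created': '2016'},
         'items': [{'x': 1}, {'x': 2}]},
        {'name': 'B',                       # note: no 'info' key here
         'items': [{'x': 3}]},
    ]

    # With the default errors='raise', record 'B' would raise KeyError for
    # the missing 'info.created' meta field; errors='ignore' yields NaN.
    result = json_normalize(data, record_path='items',
                            meta=['name', ['info', 'created']],
                            errors='ignore')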
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index f8cf04e08ab03..090a21632cddb 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1456,6 +1456,8 @@ def __init__(self, src, **kwds):
     def close(self):
         for f in self.handles:
             f.close()
+
+        # close additional handles opened by C parser (for compression)
         try:
             self._reader.close()
         except:
@@ -1759,6 +1761,9 @@ def __init__(self, f, **kwds):
         self.delimiter = kwds['delimiter']

         self.quotechar = kwds['quotechar']
+        if isinstance(self.quotechar, compat.text_type):
+            self.quotechar = str(self.quotechar)
+
         self.escapechar = kwds['escapechar']
         self.doublequote = kwds['doublequote']
         self.skipinitialspace = kwds['skipinitialspace']
@@ -2191,16 +2196,16 @@ def _handle_usecols(self, columns, usecols_key):
         usecols_key is used if there are string usecols.
         """
         if self.usecols is not None:
-            if any([isinstance(u, string_types) for u in self.usecols]):
+            if any([isinstance(col, string_types) for col in self.usecols]):
                 if len(columns) > 1:
                     raise ValueError("If using multiple headers, usecols must "
                                      "be integers.")
                 col_indices = []
-                for u in self.usecols:
-                    if isinstance(u, string_types):
-                        col_indices.append(usecols_key.index(u))
+                for col in self.usecols:
+                    if isinstance(col, string_types):
+                        col_indices.append(usecols_key.index(col))
                     else:
-                        col_indices.append(u)
+                        col_indices.append(col)
             else:
                 col_indices = self.usecols
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 47642c2e2bc28..c9f8d32e1b504 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -507,10 +507,11 @@ def _engine_builder(con):
     if isinstance(con, string_types):
         try:
             import sqlalchemy
-            con = sqlalchemy.create_engine(con)
-            return con
         except ImportError:
             _SQLALCHEMY_INSTALLED = False
+        else:
+            con = sqlalchemy.create_engine(con)
+            return con

     return con
diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py
index 4848db97194d9..4877728d9ec52 100644
--- a/pandas/io/tests/json/test_json_norm.py
+++ b/pandas/io/tests/json/test_json_norm.py
@@ -225,6 +225,59 @@ def test_nested_flattens(self):

         self.assertEqual(result, expected)

+    def test_json_normalise_fix(self):
+        # issue 14505
+        j = {
+            "Trades": [{
+                "general": {
+                    "tradeid": 100,
+                    "trade_version": 1,
+                    "stocks": [{
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+                    }]
+                }
+            }, {
+                "general": {
+                    "tradeid": 100,
+                    "stocks": [{
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+                    }]
+                }
+            }]
+        }
+        j = json_normalize(data=j['Trades'],
+                           record_path=[['general', 'stocks']],
+                           meta=[['general', 'tradeid'],
+                                 ['general', 'trade_version']],
+                           errors='ignore')
+        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
+                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
+                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
+                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
+                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
+
+        self.assertEqual(j.fillna('').to_dict(), expected)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',
                          '-s'], exit=False)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 0219e16391be8..3be02c55ea10a 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -17,8 +17,8 @@
 import pandas.util.testing as tm
 from pandas import DataFrame, Series, Index, MultiIndex
 from pandas import compat
-from pandas.compat import(StringIO, BytesIO, PY3,
-                          range, lrange, u)
+from pandas.compat import (StringIO, BytesIO, PY3,
+                           range, lrange, u)
 from pandas.io.common import DtypeWarning, EmptyDataError, URLError
 from pandas.io.parsers import TextFileReader, TextParser

@@ -1602,3 +1602,26 @@ def test_internal_eof_byte(self):
         expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b'])
         result = self.read_csv(StringIO(data))
         tm.assert_frame_equal(result, expected)
+
+    def test_file_handles(self):
+        # GH 14418 - don't close user provided file handles
+
+        fh = StringIO('a,b\n1,2')
+        self.read_csv(fh)
+        self.assertFalse(fh.closed)
+
+        with open(self.csv1, 'r') as f:
+            self.read_csv(f)
+            self.assertFalse(f.closed)
+
+        # mmap not working with python engine
+        if self.engine != 'python':
+
+            import mmap
+            with open(self.csv1, 'r') as f:
+                m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+                self.read_csv(m)
+                # closed attribute new in python 3.2
+                if PY3:
+                    self.assertFalse(m.closed)
+                m.close()
diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py
index d0f1493be0621..765cec8243a0a 100644
--- a/pandas/io/tests/parser/quoting.py
+++ b/pandas/io/tests/parser/quoting.py
@@ -9,7 +9,7 @@
 import pandas.util.testing as tm

 from pandas import DataFrame
-from pandas.compat import StringIO
+from pandas.compat import PY3, StringIO, u

 class QuotingTests(object):

@@ -138,3 +138,16 @@ def test_double_quote(self):
         result = self.read_csv(StringIO(data), quotechar='"',
                                doublequote=False)
         tm.assert_frame_equal(result, expected)
+
+    def test_quotechar_unicode(self):
+        # See gh-14477
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        result = self.read_csv(StringIO(data), quotechar=u('"'))
+        tm.assert_frame_equal(result, expected)
+
+        # Compared to Python 3.x, Python 2.x does not handle unicode well.
+ if PY3: + result = self.read_csv(StringIO(data), quotechar=u('\u0394')) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index c9f50dec6c01e..9f01adb6fabcb 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -190,3 +190,11 @@ def test_skiprows_lineterminator(self): skiprows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) + + def test_skiprows_infield_quote(self): + # see gh-14459 + data = 'a"\nb"\na\n1' + expected = DataFrame({'a': [1]}) + + df = self.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index af8989baabbc0..e9d19bbd8be66 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -944,7 +944,7 @@ def test_sqlalchemy_type_mapping(self): self.assertTrue(isinstance( table.table.c['time'].type, sqltypes.DateTime)) - def test_to_sql_read_sql_with_database_uri(self): + def test_database_uri_string(self): # Test read_sql and .to_sql method with a database URI (GH10654) test_frame1 = self.test_frame1 @@ -963,6 +963,12 @@ def test_to_sql_read_sql_with_database_uri(self): tm.assert_frame_equal(test_frame1, test_frame3) tm.assert_frame_equal(test_frame1, test_frame4) + # using driver that will not be installed on Travis to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with tm.assertRaisesRegexp(ImportError, "pg8000"): + sql.read_sql("select * from table", db_uri) + def _make_iris_table_metadata(self): sa = sqlalchemy metadata = sa.MetaData() diff --git a/pandas/msgpack/__init__.py b/pandas/msgpack/__init__.py index 0c2370df936a4..33d60a12ef0a3 100644 --- a/pandas/msgpack/__init__.py +++ b/pandas/msgpack/__init__.py @@ -1,11 +1,10 @@ # coding: utf-8 -# flake8: noqa - -from pandas.msgpack._version import version -from pandas.msgpack.exceptions import * from collections import namedtuple +from pandas.msgpack.exceptions import * # noqa +from pandas.msgpack._version import version # noqa + class ExtType(namedtuple('ExtType', 'code data')): """ExtType represents ext type in msgpack.""" @@ -18,11 +17,10 @@ def __new__(cls, code, data): raise ValueError("code must be 0~127") return super(ExtType, cls).__new__(cls, code, data) +import os # noqa -import os -from pandas.msgpack._packer import Packer -from pandas.msgpack._unpacker import unpack, unpackb, Unpacker - +from pandas.msgpack._packer import Packer # noqa +from pandas.msgpack._unpacker import unpack, unpackb, Unpacker # noqa def pack(o, stream, **kwargs): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 12525c7a9c587..93a494c176b99 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -272,7 +272,7 @@ cdef class TextReader: parser_t *parser object file_handle, na_fvalues object true_values, false_values - object dsource + object handle bint na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks @@ -554,9 +554,9 @@ cdef class TextReader: def close(self): # we need to properly close an open derived # filehandle here, e.g. 
and UTFRecoder - if self.dsource is not None: + if self.handle is not None: try: - self.dsource.close() + self.handle.close() except: pass @@ -570,7 +570,8 @@ cdef class TextReader: if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: raise TypeError('bad "quoting" value') - if not isinstance(quote_char, (str, bytes)) and quote_char is not None: + if not isinstance(quote_char, (str, compat.text_type, + bytes)) and quote_char is not None: dtype = type(quote_char).__name__ raise TypeError('"quotechar" must be string, ' 'not {dtype}'.format(dtype=dtype)) @@ -640,6 +641,7 @@ cdef class TextReader: else: raise ValueError('Unrecognized compression type: %s' % self.compression) + self.handle = source if isinstance(source, basestring): if not isinstance(source, bytes): @@ -683,8 +685,6 @@ cdef class TextReader: raise IOError('Expected file path name or file-like object,' ' got %s type' % type(source)) - self.dsource = source - cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index af85b7b894d26..748edc7fcacc5 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -478,9 +478,10 @@ static int end_line(parser_t *self) { } } - if (self->state == SKIP_LINE || \ - self->state == QUOTE_IN_SKIP_LINE || \ - self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE + if (self->state == START_FIELD_IN_SKIP_LINE || \ + self->state == IN_FIELD_IN_SKIP_LINE || \ + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \ + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE ) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count @@ -761,38 +762,54 @@ int tokenize_bytes(parser_t *self, size_t line_limit) switch(self->state) { - case SKIP_LINE: - TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state)); + case START_FIELD_IN_SKIP_LINE: if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } else if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. 
+ } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; + + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } break; - case QUOTE_IN_SKIP_LINE: + case IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { if (self->doublequote) { - self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE; + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; } } break; - case QUOTE_IN_QUOTE_IN_SKIP_LINE: + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; } break; @@ -846,9 +863,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit) // start of record if (skip_this_line(self, self->file_lines)) { if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; if (IS_TERMINATOR(c)) { END_LINE(); diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 8f7ae436bb7b7..487c1265d9358 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -123,9 +123,10 @@ typedef enum { EAT_COMMENT, EAT_LINE_COMMENT, WHITESPACE_LINE, - SKIP_LINE, - QUOTE_IN_SKIP_LINE, - QUOTE_IN_QUOTE_IN_SKIP_LINE, + START_FIELD_IN_SKIP_LINE, + IN_FIELD_IN_SKIP_LINE, + IN_QUOTED_FIELD_IN_SKIP_LINE, + QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, FINISHED } ParserState; diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d21db5ba52a45..e55ba3e161ed9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -259,6 +259,14 @@ def test_constructor_dict(self): frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) self.assert_index_equal(frame.index, Index([], dtype=np.int64)) + # GH 14381 + # Dict with None value + frame_none = DataFrame(dict(a=None), index=[0]) + frame_none_list = DataFrame(dict(a=[None]), index=[0]) + tm.assert_equal(frame_none.get_value(0, 'a'), None) + tm.assert_equal(frame_none_list.get_value(0, 'a'), None) + tm.assert_frame_equal(frame_none, frame_none_list) + # GH10856 # dict with scalar values should raise error, even if columns passed with tm.assertRaises(ValueError): diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 52e8697abe850..22414a6ba8a53 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -262,6 +262,11 @@ def test_quantile_datetime(self): index=[0.5], columns=[0, 1]) assert_frame_equal(result, expected) + # empty when numeric_only=True + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # result = df[['a', 'c']].quantile(.5) + # result = df[['a', 'c']].quantile([.5]) + def test_quantile_invalid(self): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: @@ -340,3 +345,95 @@ def test_quantile_box(self): pd.Timedelta('2 days')]], index=[0.5], columns=list('AaBbCc')) tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 
14357 - float block where some cols have missing values + df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + # full-nan column + df['b'] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, + index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.NaT], index=['a'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame({'a': [pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03')], + 'b': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], + name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], + columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty(self): + + # floats + df = DataFrame(columns=['a', 'b'], dtype='float64') + + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5, axis=1) + # res = df.quantile([0.5], axis=1) + + # ints + df = DataFrame(columns=['a', 'b'], dtype='int64') + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5) + + # datetimes + df = DataFrame(columns=['a', 'b'], dtype='datetime64') + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 5e5e9abda1200..12cd62f8b4cc0 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -405,3 +405,11 @@ def memory_usage(f): # high upper bound self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000) + + def test_info_categorical(self): + # GH14298 + idx = pd.CategoricalIndex(['a', 'b']) + df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 21471b1883209..b839ed6331457 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1576,11 +1576,10 @@ def test_string_index_repr(self): # py3/py2 repr can differ because of "u" prefix # which also 
affects to displayed element size - # suppress flake8 warnings if PY3: coerce = lambda x: x else: - coerce = unicode + coerce = unicode # noqa # short idx = pd.Index(['a', 'bb', 'ccc']) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 9f8405bcc2e1e..c76f5ff22c534 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -278,6 +278,11 @@ def test_append(self): # invalid objects self.assertRaises(TypeError, lambda: ci.append(Index(['a', 'd']))) + # GH14298 - if base object is not categorical -> coerce to object + result = Index(['c', 'a']).append(ci) + expected = Index(list('caaabbca')) + tm.assert_index_equal(result, expected, exact=True) + def test_insert(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index b04e840ffc849..68db163be6fde 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -732,30 +732,21 @@ def test_fillna_datetime64(self): dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. - # Difference of DatetimeIndex does not preserve frequency, - # so a differencing operation should not retain the freq field of the - # original index. - i = pd.date_range("20160920", "20160925", freq="D") - - a = pd.date_range("20160921", "20160924", freq="D") - expected = pd.DatetimeIndex(["20160920", "20160925"], freq=None) - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.date_range("20160922", "20160925", freq="D") - b_diff = i.difference(b) - expected = pd.DatetimeIndex(["20160920", "20160921"], freq=None) - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_diff = a_diff.union(b_diff) - expected = pd.DatetimeIndex(["20160920", "20160921", "20160925"], - freq=None) - tm.assert_index_equal(union_of_diff, expected) - tm.assert_attr_equal('freq', union_of_diff, expected) + def test_difference_freq(self): + # GH14323: difference of DatetimeIndex should not preserve frequency + + index = date_range("20160920", "20160925", freq="D") + other = date_range("20160921", "20160924", freq="D") + expected = DatetimeIndex(["20160920", "20160925"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = date_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = DatetimeIndex(["20160920", "20160921"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) class TestPeriodIndex(DatetimeLike, tm.TestCase): @@ -963,29 +954,23 @@ def test_no_millisecond_field(self): with self.assertRaises(AttributeError): DatetimeIndex([]).millisecond - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. 
- # Difference of Period MUST preserve frequency, but the ability - # to union results must be preserved - i = pd.period_range("20160920", "20160925", freq="D") - - a = pd.period_range("20160921", "20160924", freq="D") - expected = pd.PeriodIndex(["20160920", "20160925"], freq='D') - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.period_range("20160922", "20160925", freq="D") - b_diff = i.difference(b) - expected = pd.PeriodIndex(["20160920", "20160921"], freq='D') - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_diff = a_diff.union(b_diff) - expected = pd.PeriodIndex(["20160920", "20160921", "20160925"], - freq='D') - tm.assert_index_equal(union_of_diff, expected) - tm.assert_attr_equal('freq', union_of_diff, expected) + def test_difference_freq(self): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq='D') + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = PeriodIndex(["20160920", "20160921"], freq='D') + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) class TestTimedeltaIndex(DatetimeLike, tm.TestCase): @@ -1199,27 +1184,19 @@ def test_fillna_timedelta(self): [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. - # Difference of TimedeltaIndex does not preserve frequency, - # so a differencing operation should not retain the freq field of the - # original index. 
- i = pd.timedelta_range("0 days", "5 days", freq="D") - - a = pd.timedelta_range("1 days", "4 days", freq="D") - expected = pd.TimedeltaIndex(["0 days", "5 days"], freq=None) - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.timedelta_range("2 days", "5 days", freq="D") - b_diff = i.difference(b) - expected = pd.TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_difference = a_diff.union(b_diff) - expected = pd.TimedeltaIndex(["0 days", "1 days", "5 days"], - freq=None) - tm.assert_index_equal(union_of_difference, expected) - tm.assert_attr_equal('freq', union_of_difference, expected) + def test_difference_freq(self): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 26d50aa55431f..38e715fce2720 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -29,12 +29,7 @@ def setUp(self): def create_index(self): return RangeIndex(5) - def test_binops(self): - ops = [operator.add, operator.sub, operator.mul, operator.floordiv, - operator.truediv, pow] - scalars = [-1, 1, 2] - idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), - RangeIndex(-10, 10, 2), RangeIndex(5, -5, -1)] + def check_binop(self, ops, scalars, idxs): for op in ops: for a, b in combinations(idxs, 2): result = op(a, b) @@ -46,6 +41,23 @@ def test_binops(self): expected = op(Int64Index(idx), scalar) tm.assert_index_equal(result, expected) + def test_binops(self): + ops = [operator.add, operator.sub, operator.mul, operator.floordiv, + operator.truediv] + scalars = [-1, 1, 2] + idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), RangeIndex(5, -5, -1)] + self.check_binop(ops, scalars, idxs) + + def test_binops_pow(self): + # later versions of numpy don't allow powers of negative integers + # so test separately + # https://github.com/numpy/numpy/pull/8127 + ops = [pow] + scalars = [1, 2] + idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)] + self.check_binop(ops, scalars, idxs) + def test_too_many_names(self): def testit(): self.index.names = ["roger", "harold"] diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 7d2517987e526..76db6c90a685f 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -184,3 +184,35 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) + + def test_quantile_empty(self): + + # floats + s = Series([], dtype='float64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # int + s = Series([], dtype='int64') + + res = 
s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # datetime + s = Series([], dtype='datetime64[ns]') + + res = s.quantile(0.5) + self.assertTrue(res is pd.NaT) + + res = s.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5]) + tm.assert_series_equal(res, exp) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 96213a4aec34d..4645ae24684ff 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -100,7 +100,7 @@ def round(self, freq, *args, **kwargs): def floor(self, freq): return self._round(freq, np.floor) - @Appender(_round_doc % "floor") + @Appender(_round_doc % "ceil") def ceil(self, freq): return self._round(freq, np.ceil) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f68750e242f1f..70e2d2c121773 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1453,8 +1453,9 @@ def _maybe_cast_slice_bound(self, label, side, kind): # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) # because label may be passed to searchsorted - # the bounds need swapped if index is reverse sorted - if self.is_monotonic_decreasing: + # the bounds need swapped if index is reverse sorted and has a + # length (is_monotonic_decreasing gives True for empty index) + if self.is_monotonic_decreasing and len(self): return upper if side == 'left' else lower return lower if side == 'left' else upper else: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c13805d383e5d..aa8a5d10cd9d3 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3911,6 +3911,18 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: ts.ix[::0]) + def test_slice_bounds_empty(self): + # GH 14354 + empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') + + right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') + exp = Timestamp('2015-01-02 23:59:59.999999999') + self.assertEqual(right, exp) + + left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') + exp = Timestamp('2015-01-02 00:00:00') + self.assertEqual(left, exp) + class TestDatetime64(tm.TestCase): """ diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 57bb01e5e0406..05517bf6cf53a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -35,7 +35,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat -from pandas.compat import( +from pandas.compat import ( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, raise_with_traceback, httplib, is_platform_windows, is_platform_32bit, PY3 diff --git a/setup.py b/setup.py index 3f8667cd6fe42..a17dd502d7706 100755 --- a/setup.py +++ b/setup.py @@ -85,7 +85,11 @@ def is_platform_mac(): try: if not _CYTHON_INSTALLED: raise ImportError('No supported version of Cython installed.') - from Cython.Distutils import build_ext as _build_ext + try: + from Cython.Distutils.old_build_ext import old_build_ext as _build_ext + except ImportError: + # Pre 0.25 + from Cython.Distutils import build_ext as _build_ext cython = True except ImportError: cython = False
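The setup.py fallback above pins pandas to Cython's pre-0.25 ``build_ext`` behaviour. A hedged sketch of how such a fallback-selected command class is typically wired into ``setup()`` — the package metadata here is illustrative only, not pandas' actual setup call:

    from distutils.core import setup

    try:
        # Cython >= 0.25 moved the classic behaviour to old_build_ext
        from Cython.Distutils.old_build_ext import old_build_ext as build_ext
    except ImportError:
        # older Cython still exposes it as Cython.Distutils.build_ext
        from Cython.Distutils import build_ext

    setup(
        name='example-ext-package',            # illustrative name
        cmdclass={'build_ext': build_ext},     # route build_ext through Cython
    )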