From ff5ef13bd700576388c70a174f4fc5474a171687 Mon Sep 17 00:00:00 2001
From: Sahil Dua
Date: Wed, 13 Jul 2016 01:38:14 +0200
Subject: [PATCH 01/50] BUG: Add check for array lengths in from_arrays method
 (GH13599)

---
 pandas/indexes/multi.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
index 05b2045a4850f..e43c993005161 100644
--- a/pandas/indexes/multi.py
+++ b/pandas/indexes/multi.py
@@ -840,6 +840,12 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
             name = None if names is None else names[0]
             return Index(arrays[0], name=name)
 
+        # Check if lengths of all arrays are equal or not,
+        # raise ValueError, if not
+        for i in range(1, len(arrays)):
+            if len(arrays[i]) != len(arrays[i-1]):
+                raise ValueError('all arrays must be same length')
+
         cats = [Categorical.from_array(arr, ordered=True) for arr in arrays]
         levels = [c.categories for c in cats]
         labels = [c.codes for c in cats]

From db98e32756700bbe9cb82cce9200bbf17227aac9 Mon Sep 17 00:00:00 2001
From: Sahil Dua
Date: Wed, 13 Jul 2016 02:28:22 +0200
Subject: [PATCH 02/50] BUG: Add test for array length mismatch

---
 pandas/tests/indexes/test_multi.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index e6a8aafc32be4..acacfcea69d1b 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -632,6 +632,13 @@ def test_from_arrays_index_series_period(self):
 
         tm.assert_index_equal(result, result2)
 
+    def test_from_arrays_different_lengths(self):
+        # GH13599
+        idx1 = [1, 2, 3]
+        idx2 = ['a', 'b']
+        assertRaisesRegexp(ValueError, '^all arrays must be same length$',
+                           pd.MultiIndex.from_arrays, [idx1, idx2])
+
     def test_from_product(self):
 
         first = ['foo', 'bar', 'buz']

From 5e7bd921c9e80c46d36b1baeff0810281d9b693a Mon Sep 17 00:00:00 2001
From: Sahil Dua
Date: Wed, 13 Jul 2016 02:33:14 +0200
Subject: [PATCH 03/50] BUG: Fix minor issue with new test for from_arrays

---
 pandas/tests/indexes/test_multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index acacfcea69d1b..7904c2e3b4208 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -637,7 +637,7 @@ def test_from_arrays_different_lengths(self):
         idx1 = [1, 2, 3]
         idx2 = ['a', 'b']
         assertRaisesRegexp(ValueError, '^all arrays must be same length$',
-                           pd.MultiIndex.from_arrays, [idx1, idx2])
+                           MultiIndex.from_arrays, [idx1, idx2])
 
     def test_from_product(self):
 

From 27d29158780bc7127bd944fc41eed3b74f38870b Mon Sep 17 00:00:00 2001
From: yui-knk
Date: Tue, 12 Jul 2016 22:14:09 -0400
Subject: [PATCH 04/50] CLN: Fix compile time warnings

This commit suppresses these warnings

warning: comparison of constant -1 with expression\
of type 'PANDAS_DATETIMEUNIT' is always true\
[-Wtautological-constant-out-of-range-compare]

Author: yui-knk

Closes #13607 from yui-knk/fix_c_warning and squashes the following commits:

e9eee1d [yui-knk] CLN: Fix compile time warnings
---
 doc/source/whatsnew/v0.19.0.txt           |  1 +
 pandas/src/datetime/np_datetime_strings.c | 28 ++++-------------------
 pandas/src/ujson/python/objToJSON.c       |  2 +-
 3 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index f457b8d4bd1f6..fb09f99f2a7fe 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -536,6 +536,7 @@ Bug Fixes
 - Bug in ``Peirod``
and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) +- Clean some compile time warnings in datetime parsing (:issue:`13607`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index 3a1d37f86cc28..b633d6cde0820 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -460,7 +460,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -503,7 +503,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -975,7 +975,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -1005,11 +1005,6 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { int len = 0; - /* If no unit is provided, return the maximum length */ - if (base == -1) { - return PANDAS_DATETIME_MAX_ISO8601_STRLEN; - } - switch (base) { /* Generic units can only be used to represent NaT */ /*case PANDAS_FR_GENERIC:*/ @@ -1146,28 +1141,13 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, local = 0; } - /* Automatically detect a good unit */ - if (base == -1) { - base = lossless_unit_from_datetimestruct(dts); - /* - * If there's a timezone, use at least minutes precision, - * and never split up hours and minutes by default - */ - if ((base < PANDAS_FR_m && local) || base == PANDAS_FR_h) { - base = PANDAS_FR_m; - } - /* Don't split up dates by default */ - else if (base < PANDAS_FR_D) { - base = PANDAS_FR_D; - } - } /* * Print weeks with the same precision as days. * * TODO: Could print weeks with YYYY-Www format if the week * epoch is a Monday. 
*/ - else if (base == PANDAS_FR_W) { + if (base == PANDAS_FR_W) { base = PANDAS_FR_D; } diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 925c18cd23d8f..1080e9548ba56 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -450,7 +450,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) { - int base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; + PANDAS_DATETIMEUNIT base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; if (((PyObjectEncoder*) tc->encoder)->datetimeIso) { From 06103dd7735335e51fcd77a36b2e8a714286a059 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Jul 2016 12:31:44 +0200 Subject: [PATCH 05/50] Pin IPython for doc build to 4.x (see #13639) --- ci/requirements-2.7_DOC_BUILD.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index b87a41df4191d..a07721c75cf34 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,4 +1,4 @@ -ipython +ipython=4 ipykernel sphinx nbconvert From 7dd4091458d9117e57d2ad9ce3126855bd00108c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 13 Jul 2016 07:51:59 -0400 Subject: [PATCH 06/50] CLN: reorg type inference & introspection closes #12503 Author: Jeff Reback Closes #13147 from jreback/types and squashes the following commits: 244649a [Jeff Reback] CLN: reorg type inference & introspection --- ci/lint.sh | 2 +- doc/source/whatsnew/v0.19.0.txt | 22 +- pandas/__init__.py | 2 +- pandas/api/__init__.py | 1 + pandas/api/tests/__init__.py | 0 pandas/api/tests/test_api.py | 213 +++ pandas/api/types/__init__.py | 4 + pandas/compat/numpy/function.py | 3 +- pandas/computation/ops.py | 8 +- pandas/computation/pytables.py | 4 +- pandas/computation/tests/test_eval.py | 19 +- pandas/core/algorithms.py | 145 +- pandas/core/api.py | 2 +- pandas/core/base.py | 31 +- pandas/core/categorical.py | 33 +- pandas/core/common.py | 1656 +------------------ pandas/core/config_init.py | 2 +- pandas/core/convert.py | 127 -- pandas/core/frame.py | 113 +- pandas/core/generic.py | 106 +- pandas/core/groupby.py | 94 +- pandas/core/indexing.py | 49 +- pandas/core/internals.py | 129 +- pandas/core/missing.py | 73 +- pandas/core/nanops.py | 27 +- pandas/core/ops.py | 36 +- pandas/core/panel.py | 26 +- pandas/core/reshape.py | 10 +- pandas/core/series.py | 69 +- pandas/core/strings.py | 25 +- pandas/core/window.py | 41 +- pandas/formats/format.py | 43 +- pandas/formats/printing.py | 4 +- pandas/formats/style.py | 7 +- pandas/indexes/base.py | 79 +- pandas/indexes/category.py | 38 +- pandas/indexes/multi.py | 34 +- pandas/indexes/numeric.py | 22 +- pandas/indexes/range.py | 18 +- pandas/io/common.py | 4 +- pandas/io/data.py | 4 +- pandas/io/excel.py | 26 +- pandas/io/html.py | 4 +- pandas/io/packers.py | 8 +- pandas/io/parsers.py | 44 +- pandas/io/pickle.py | 6 +- pandas/io/pytables.py | 37 +- pandas/io/sql.py | 16 +- pandas/io/stata.py | 16 +- pandas/io/tests/test_sql.py | 15 +- pandas/io/tests/test_stata.py | 2 +- pandas/sparse/array.py | 46 +- pandas/sparse/frame.py | 10 +- pandas/sparse/list.py | 4 +- pandas/sparse/panel.py | 6 +- pandas/sparse/series.py | 8 +- pandas/src/testing.pyx | 12 +- pandas/stats/moments.py | 4 +- pandas/stats/ols.py | 2 +- pandas/tests/frame/test_apply.py | 6 +- 
pandas/tests/frame/test_constructors.py | 3 +- pandas/tests/frame/test_dtypes.py | 8 +- pandas/tests/frame/test_indexing.py | 16 +- pandas/tests/indexing/test_indexing.py | 17 +- pandas/tests/series/test_constructors.py | 13 +- pandas/tests/series/test_datetime_values.py | 7 +- pandas/tests/series/test_indexing.py | 24 +- pandas/tests/series/test_quantile.py | 6 +- pandas/tests/test_base.py | 8 +- pandas/tests/test_categorical.py | 61 +- pandas/tests/test_common.py | 658 +------- pandas/tests/test_generic.py | 4 +- pandas/tests/test_graphics.py | 6 +- pandas/tests/test_groupby.py | 12 +- pandas/tests/test_infer_and_convert.py | 653 -------- pandas/tests/test_lib.py | 1 + pandas/tests/test_multilevel.py | 5 +- pandas/tests/test_nanops.py | 4 +- pandas/tests/test_panel.py | 5 +- pandas/tests/test_panel4d.py | 4 +- pandas/tests/test_strings.py | 7 +- pandas/tests/types/test_cast.py | 193 +++ pandas/tests/types/test_common.py | 22 + pandas/tests/types/test_dtypes.py | 19 +- pandas/tests/types/test_generic.py | 36 +- pandas/tests/types/test_inference.py | 820 +++++++++ pandas/tests/types/test_io.py | 116 ++ pandas/tests/types/test_missing.py | 243 +++ pandas/tests/types/test_types.py | 40 - pandas/tools/merge.py | 46 +- pandas/tools/pivot.py | 6 +- pandas/tools/plotting.py | 67 +- pandas/tools/tile.py | 14 +- pandas/tools/util.py | 19 +- pandas/tseries/base.py | 44 +- pandas/tseries/common.py | 16 +- pandas/tseries/converter.py | 28 +- pandas/tseries/frequencies.py | 23 +- pandas/tseries/index.py | 43 +- pandas/tseries/offsets.py | 4 +- pandas/tseries/period.py | 62 +- pandas/tseries/tdi.py | 33 +- pandas/tseries/tests/test_bin_groupby.py | 6 +- pandas/tseries/tests/test_period.py | 4 +- pandas/tseries/tests/test_resample.py | 5 +- pandas/tseries/tests/test_timeseries.py | 3 +- pandas/tseries/tests/test_timezones.py | 2 +- pandas/tseries/timedeltas.py | 8 +- pandas/tseries/tools.py | 35 +- pandas/tseries/util.py | 4 +- pandas/types/api.py | 121 +- pandas/types/cast.py | 860 ++++++++++ pandas/types/common.py | 448 +++++ pandas/types/concat.py | 47 +- pandas/types/inference.py | 104 ++ pandas/types/missing.py | 394 +++++ pandas/util/testing.py | 20 +- pandas/util/validators.py | 4 +- 118 files changed, 4944 insertions(+), 4134 deletions(-) create mode 100644 pandas/api/__init__.py create mode 100644 pandas/api/tests/__init__.py create mode 100644 pandas/api/tests/test_api.py create mode 100644 pandas/api/types/__init__.py delete mode 100644 pandas/core/convert.py delete mode 100644 pandas/tests/test_infer_and_convert.py create mode 100644 pandas/tests/types/test_cast.py create mode 100644 pandas/tests/types/test_common.py create mode 100644 pandas/tests/types/test_inference.py create mode 100644 pandas/tests/types/test_io.py create mode 100644 pandas/tests/types/test_missing.py delete mode 100644 pandas/tests/types/test_types.py create mode 100644 pandas/types/cast.py create mode 100644 pandas/types/common.py create mode 100644 pandas/types/inference.py create mode 100644 pandas/types/missing.py diff --git a/ci/lint.sh b/ci/lint.sh index a4c960084040f..9f582f72fcdd7 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -8,7 +8,7 @@ RET=0 if [ "$LINT" ]; then echo "Linting" - for path in 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' + for path in 'api' 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' do echo "linting -> pandas/$path" flake8 pandas/$path --filename '*.py' diff 
--git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index fb09f99f2a7fe..bef02a06135de 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -10,6 +10,7 @@ users upgrade to this version.
 Highlights include:
 
 - :func:`merge_asof` for asof-style time-series joining, see :ref:`here <whatsnew_0190.enhancements.asof_merge>`
+- pandas development api, see :ref:`here <whatsnew_0190.dev_api>`
 
 .. contents:: What's new in v0.18.2
     :local:
@@ -20,6 +21,25 @@ Highlights include:
 New features
 ~~~~~~~~~~~~
 
+.. _whatsnew_0190.dev_api:
+
+pandas development API
+^^^^^^^^^^^^^^^^^^^^^^
+
+As part of making the pandas API more uniform and accessible in the future, we have created a standard
+sub-package of pandas, ``pandas.api`` to hold public API's. We are starting by exposing type
+introspection functions in ``pandas.api.types``. More sub-packages and officially sanctioned API's
+will be published in future versions of pandas.
+
+The following are now part of this API:
+
+.. ipython:: python
+
+   import pprint
+   from pandas.api import types
+   funcs = [ f for f in dir(types) if not f.startswith('_') ]
+   pprint.pprint(funcs)
+
 .. _whatsnew_0190.enhancements.asof_merge:
 
 :func:`merge_asof` for asof-style time-series joining
@@ -227,7 +247,7 @@ Other enhancements
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
-- A top-level function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`)
+- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`)
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 
 ..
_whatsnew_0190.api: diff --git a/pandas/__init__.py b/pandas/__init__.py index 350898c9925e7..2d91c97144e3c 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -16,7 +16,7 @@ if missing_dependencies: raise ImportError("Missing required dependencies {0}".format(missing_dependencies)) - +del hard_dependencies, dependency, missing_dependencies # numpy compat from pandas.compat.numpy import * diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py new file mode 100644 index 0000000000000..fcbf42f6dabc4 --- /dev/null +++ b/pandas/api/__init__.py @@ -0,0 +1 @@ +""" public toolkit API """ diff --git a/pandas/api/tests/__init__.py b/pandas/api/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py new file mode 100644 index 0000000000000..3f6c97441d659 --- /dev/null +++ b/pandas/api/tests/test_api.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- + +import pandas as pd +from pandas.core import common as com +from pandas import api +from pandas.api import types +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +class Base(object): + + def check(self, namespace, expected, ignored=None): + # see which names are in the namespace, minus optional + # ignored ones + # compare vs the expected + + result = sorted([f for f in dir(namespace) if not f.startswith('_')]) + if ignored is not None: + result = sorted(list(set(result) - set(ignored))) + + expected = sorted(expected) + tm.assert_almost_equal(result, expected) + + +class TestPDApi(Base, tm.TestCase): + + # these are optionally imported based on testing + # & need to be ignored + ignored = ['tests', 'rpy', 'sandbox', 'locale'] + + # top-level sub-packages + lib = ['api', 'compat', 'computation', 'core', + 'indexes', 'formats', 'pandas', + 'test', 'tools', 'tseries', + 'types', 'util', 'options', 'io'] + + # top-level packages that are c-imports, should rename to _* + # to avoid naming conflicts + lib_to_rename = ['algos', 'hashtable', 'tslib', 'msgpack', 'sparse', + 'json', 'lib', 'index', 'parser'] + + # these are already deprecated; awaiting removal + deprecated_modules = ['ols', 'stats'] + + # misc + misc = ['IndexSlice', 'NaT'] + + # top-level classes + classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset', + 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index', + 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', + 'Period', 'PeriodIndex', 'RangeIndex', + 'Series', 'SparseArray', 'SparseDataFrame', + 'SparseSeries', 'TimeGrouper', 'Timedelta', + 'TimedeltaIndex', 'Timestamp'] + + # these are already deprecated; awaiting removal + deprecated_classes = ['SparsePanel', 'TimeSeries', 'WidePanel', + 'SparseTimeSeries'] + + # these should be deperecated in the future + deprecated_classes_in_future = ['Panel', 'Panel4D', + 'SparseList', 'Term'] + + # these should be removed from top-level namespace + remove_classes_from_top_level_namespace = ['Expr'] + + # external modules exposed in pandas namespace + modules = ['np', 'datetime', 'datetools'] + + # top-level functions + funcs = ['bdate_range', 'concat', 'crosstab', 'cut', + 'date_range', 'eval', + 'factorize', 'get_dummies', 'get_store', + 'infer_freq', 'isnull', 'lreshape', + 'match', 'melt', 'notnull', 'offsets', + 'merge', 'merge_ordered', 'merge_asof', + 'period_range', + 'pivot', 'pivot_table', 'plot_params', 'qcut', + 'scatter_matrix', + 'show_versions', 'timedelta_range', 'unique', + 'value_counts', 'wide_to_long'] + + # top-level 
option funcs + funcs_option = ['reset_option', 'describe_option', 'get_option', + 'option_context', 'set_option', + 'set_eng_float_format'] + + # top-level read_* funcs + funcs_read = ['read_clipboard', 'read_csv', 'read_excel', 'read_fwf', + 'read_gbq', 'read_hdf', 'read_html', 'read_json', + 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', + 'read_sql_query', 'read_sql_table', 'read_stata', + 'read_table'] + + # top-level to_* funcs + funcs_to = ['to_datetime', 'to_msgpack', + 'to_numeric', 'to_pickle', 'to_timedelta'] + + # these should be deperecated in the future + deprecated_funcs_in_future = ['pnow', 'groupby', 'info'] + + # these are already deprecated; awaiting removal + deprecated_funcs = ['ewma', 'ewmcorr', 'ewmcov', 'ewmstd', 'ewmvar', + 'ewmvol', 'expanding_apply', 'expanding_corr', + 'expanding_count', 'expanding_cov', 'expanding_kurt', + 'expanding_max', 'expanding_mean', 'expanding_median', + 'expanding_min', 'expanding_quantile', + 'expanding_skew', 'expanding_std', 'expanding_sum', + 'expanding_var', 'fama_macbeth', 'rolling_apply', + 'rolling_corr', 'rolling_count', 'rolling_cov', + 'rolling_kurt', 'rolling_max', 'rolling_mean', + 'rolling_median', 'rolling_min', 'rolling_quantile', + 'rolling_skew', 'rolling_std', 'rolling_sum', + 'rolling_var', 'rolling_window', 'ordered_merge'] + + def test_api(self): + + self.check(pd, + self.lib + self.lib_to_rename + self.misc + + self.modules + self.deprecated_modules + + self.classes + self.deprecated_classes + + self.deprecated_classes_in_future + + self.remove_classes_from_top_level_namespace + + self.funcs + self.funcs_option + + self.funcs_read + self.funcs_to + + self.deprecated_funcs + + self.deprecated_funcs_in_future, + self.ignored) + + +class TestApi(Base, tm.TestCase): + + allowed = ['tests', 'types'] + + def test_api(self): + + self.check(api, self.allowed) + + +class TestTypes(Base, tm.TestCase): + + allowed = ['is_any_int_dtype', 'is_bool', 'is_bool_dtype', + 'is_categorical', 'is_categorical_dtype', 'is_complex', + 'is_complex_dtype', 'is_datetime64_any_dtype', + 'is_datetime64_dtype', 'is_datetime64_ns_dtype', + 'is_datetime64tz_dtype', 'is_datetimetz', 'is_dtype_equal', + 'is_extension_type', 'is_float', 'is_float_dtype', + 'is_floating_dtype', 'is_int64_dtype', 'is_integer', + 'is_integer_dtype', 'is_number', 'is_numeric_dtype', + 'is_object_dtype', 'is_scalar', 'is_sparse', + 'is_string_dtype', 'is_timedelta64_dtype', + 'is_timedelta64_ns_dtype', + 'is_re', 'is_re_compilable', + 'is_dict_like', 'is_iterator', + 'is_list_like', 'is_hashable', + 'is_named_tuple', 'is_sequence', + 'pandas_dtype'] + + def test_types(self): + + self.check(types, self.allowed) + + def check_deprecation(self, fold, fnew): + with tm.assert_produces_warning(FutureWarning): + try: + result = fold('foo') + expected = fnew('foo') + self.assertEqual(result, expected) + except TypeError: + self.assertRaises(TypeError, + lambda: fnew('foo')) + except AttributeError: + self.assertRaises(AttributeError, + lambda: fnew('foo')) + + def test_deprecation_core_common(self): + + # test that we are in fact deprecating + # the pandas.core.common introspectors + for t in self.allowed: + self.check_deprecation(getattr(com, t), getattr(types, t)) + + def test_deprecation_core_common_moved(self): + + # these are in pandas.types.common + l = ['is_datetime_arraylike', + 'is_datetime_or_timedelta_dtype', + 'is_datetimelike', + 'is_datetimelike_v_numeric', + 'is_datetimelike_v_object', + 'is_datetimetz', + 'is_int_or_datetime_dtype', + 
'is_period_arraylike', + 'is_string_like', + 'is_string_like_dtype'] + + from pandas.types import common as c + for t in l: + self.check_deprecation(getattr(com, t), getattr(c, t)) + + def test_removed_from_core_common(self): + + for t in ['is_null_datelike_scalar', + 'ensure_float']: + self.assertRaises(AttributeError, lambda: getattr(com, t)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py new file mode 100644 index 0000000000000..ee217543f0420 --- /dev/null +++ b/pandas/api/types/__init__.py @@ -0,0 +1,4 @@ +""" public toolkit API """ + +from pandas.types.api import * # noqa +del np # noqa diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 15bf6d31b7109..adc17c7514832 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,8 @@ from numpy import ndarray from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) -from pandas.core.common import is_bool, is_integer, UnsupportedFunctionCall +from pandas.core.common import UnsupportedFunctionCall +from pandas.types.common import is_integer, is_bool from pandas.compat import OrderedDict diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 7a0743f6b2778..96a04cff9372e 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -7,11 +7,11 @@ import numpy as np +from pandas.types.common import is_list_like, is_scalar import pandas as pd from pandas.compat import PY3, string_types, text_type import pandas.core.common as com from pandas.formats.printing import pprint_thing, pprint_thing_encoded -import pandas.lib as lib from pandas.core.base import StringMixin from pandas.computation.common import _ensure_decoded, _result_type_many from pandas.computation.scope import _DEFAULT_GLOBALS @@ -100,7 +100,7 @@ def update(self, value): @property def isscalar(self): - return lib.isscalar(self._value) + return is_scalar(self._value) @property def type(self): @@ -229,7 +229,7 @@ def _in(x, y): try: return x.isin(y) except AttributeError: - if com.is_list_like(x): + if is_list_like(x): try: return y.isin(x) except AttributeError: @@ -244,7 +244,7 @@ def _not_in(x, y): try: return ~x.isin(y) except AttributeError: - if com.is_list_like(x): + if is_list_like(x): try: return ~y.isin(x) except AttributeError: diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index d6d55d15fec30..e375716b0d606 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,6 +7,8 @@ from datetime import datetime, timedelta import numpy as np import pandas as pd + +from pandas.types.common import is_list_like import pandas.core.common as com from pandas.compat import u, string_types, DeepChainMap from pandas.core.base import StringMixin @@ -127,7 +129,7 @@ def pr(left, right): def conform(self, rhs): """ inplace conform rhs """ - if not com.is_list_like(rhs): + if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): rhs = rhs.ravel() diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 5019dd392a567..066df0521fef6 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -13,6 +13,7 @@ from numpy.random import randn, rand, randint import numpy as np +from pandas.types.common import is_list_like, is_scalar import pandas as pd from 
pandas.core import common as com from pandas import DataFrame, Series, Panel, date_range @@ -200,7 +201,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - scalar_with_in_notin = (lib.isscalar(rhs) and (cmp1 in skip_these or + scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or cmp2 in skip_these)) if scalar_with_in_notin: with tm.assertRaises(TypeError): @@ -253,7 +254,7 @@ def check_operands(left, right, cmp_op): def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) - if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): + if cmp1 in ('in', 'not in') and not is_list_like(rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) @@ -331,7 +332,7 @@ def check_pow(self, lhs, arith1, rhs): expected = self.get_expected_pow_result(lhs, rhs) result = pd.eval(ex, engine=self.engine, parser=self.parser) - if (lib.isscalar(lhs) and lib.isscalar(rhs) and + if (is_scalar(lhs) and is_scalar(rhs) and _is_py3_complex_incompat(result, expected)): self.assertRaises(AssertionError, tm.assert_numpy_array_equal, result, expected) @@ -364,16 +365,16 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) - if lib.isscalar(rhs) and cmp1 in skip_these: + if is_scalar(rhs) and cmp1 in skip_these: self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) else: # compound - if lib.isscalar(lhs) and lib.isscalar(rhs): + if is_scalar(lhs) and is_scalar(rhs): lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) - if lib.isscalar(expected): + if is_scalar(expected): expected = not expected else: expected = ~expected @@ -643,17 +644,17 @@ def test_identical(self): x = 1 result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, 1) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = 1.5 result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, 1.5) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = False result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, False) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = np.array([1]) result = pd.eval('x', engine=self.engine, parser=self.parser) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4b40bce79cbb5..c3ba734353a8d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -7,10 +7,31 @@ import numpy as np from pandas import compat, lib, tslib, _np_version_under1p8 +from pandas.types.cast import _maybe_promote +from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex +from pandas.types.common import (is_integer_dtype, + is_int64_dtype, + is_categorical_dtype, + is_extension_type, + is_datetimetz, + is_period_arraylike, + is_datetime_or_timedelta_dtype, + is_float_dtype, + needs_i8_conversion, + is_categorical, + is_datetime64_dtype, + is_timedelta64_dtype, + is_scalar, + _ensure_platform_int, + _ensure_object, + _ensure_float64, + _ensure_int64, + is_list_like) +from pandas.types.missing import isnull + import pandas.core.common as com import pandas.algos as algos import pandas.hashtable as htable -from pandas.types import api as gt from pandas.compat 
import string_types from pandas.tslib import iNaT @@ -105,12 +126,12 @@ def isin(comps, values): boolean array same length as comps """ - if not com.is_list_like(comps): + if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) - if not com.is_list_like(values): + if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) @@ -126,15 +147,15 @@ def isin(comps, values): f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing - if com.is_datetime64_dtype(comps): + if is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') - elif com.is_timedelta64_dtype(comps): + elif is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') - elif com.is_int64_dtype(comps): + elif is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) @@ -171,20 +192,20 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): vals = np.asarray(values) # localize to UTC - is_datetimetz = com.is_datetimetz(values) - if is_datetimetz: + is_datetimetz_type = is_datetimetz(values) + if is_datetimetz_type: values = DatetimeIndex(values) vals = values.tz_localize(None) - is_datetime = com.is_datetime64_dtype(vals) - is_timedelta = com.is_timedelta64_dtype(vals) + is_datetime = is_datetime64_dtype(vals) + is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel, True) - labels = com._ensure_platform_int(labels) + labels = _ensure_platform_int(labels) uniques = uniques.to_array() @@ -194,7 +215,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) - t.map_locations(com._ensure_object(uniques)) + t.map_locations(_ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ @@ -202,8 +223,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): dtype=object)) for f in [lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types)]]) - sorter = com._ensure_platform_int(t.lookup( - com._ensure_object(ordered))) + sorter = _ensure_platform_int(t.lookup( + _ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -214,7 +235,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.take(sorter) - if is_datetimetz: + if is_datetimetz_type: # reset tz uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( @@ -267,7 +288,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.codes - if com.is_extension_type(values) and not com.is_datetimetz(values): + if is_extension_type(values) and not is_datetimetz(values): # handle Categorical and sparse, # datetime tz can be handeled in ndarray path result = Series(values).values.value_counts(dropna=dropna) @@ -298,9 +319,9 @@ def 
value_counts(values, sort=True, ascending=False, normalize=False, def _value_counts_arraylike(values, dropna=True): - is_datetimetz = com.is_datetimetz(values) - is_period = (isinstance(values, gt.ABCPeriodIndex) or - com.is_period_arraylike(values)) + is_datetimetz_type = is_datetimetz(values) + is_period = (isinstance(values, ABCPeriodIndex) or + is_period_arraylike(values)) orig = values @@ -308,7 +329,7 @@ def _value_counts_arraylike(values, dropna=True): values = Series(values).values dtype = values.dtype - if com.is_datetime_or_timedelta_dtype(dtype) or is_period: + if is_datetime_or_timedelta_dtype(dtype) or is_period: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex @@ -327,8 +348,8 @@ def _value_counts_arraylike(values, dropna=True): keys = keys.astype(dtype) # dtype handling - if is_datetimetz: - if isinstance(orig, gt.ABCDatetimeIndex): + if is_datetimetz_type: + if isinstance(orig, ABCDatetimeIndex): tz = orig.tz else: tz = orig.dt.tz @@ -336,15 +357,15 @@ def _value_counts_arraylike(values, dropna=True): if is_period: keys = PeriodIndex._simple_new(keys, freq=freq) - elif com.is_integer_dtype(dtype): - values = com._ensure_int64(values) + elif is_integer_dtype(dtype): + values = _ensure_int64(values) keys, counts = htable.value_count_scalar64(values, dropna) - elif com.is_float_dtype(dtype): - values = com._ensure_float64(values) + elif is_float_dtype(dtype): + values = _ensure_float64(values) keys, counts = htable.value_count_scalar64(values, dropna) else: - values = com._ensure_object(values) - mask = com.isnull(values) + values = _ensure_object(values) + mask = isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) @@ -366,8 +387,8 @@ def mode(values): constructor = Series dtype = values.dtype - if com.is_integer_dtype(values): - values = com._ensure_int64(values) + if is_integer_dtype(values): + values = _ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): @@ -375,11 +396,11 @@ def mode(values): values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) - elif com.is_categorical_dtype(values): + elif is_categorical_dtype(values): result = constructor(values.mode()) else: - mask = com.isnull(values) - values = com._ensure_object(values) + mask = isnull(values) + values = _ensure_object(values) res = htable.mode_object(values, mask) try: res = sorted(res) @@ -459,7 +480,7 @@ def quantile(x, q, interpolation_method='fraction'): """ x = np.asarray(x) - mask = com.isnull(x) + mask = isnull(x) x = x[~mask] @@ -486,7 +507,7 @@ def _get_score(at): return score - if lib.isscalar(q): + if is_scalar(q): return _get_score(q) else: q = np.asarray(q, np.float64) @@ -593,18 +614,18 @@ def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ - if com.is_float_dtype(dtype): - return f(htable.Float64HashTable, com._ensure_float64) - elif com.is_integer_dtype(dtype): - return f(htable.Int64HashTable, com._ensure_int64) - elif com.is_datetime64_dtype(dtype): + if is_float_dtype(dtype): + return f(htable.Float64HashTable, _ensure_float64) + elif is_integer_dtype(dtype): + return f(htable.Int64HashTable, _ensure_int64) + elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' - return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) - elif 
com.is_timedelta64_dtype(dtype): + return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) + elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' - return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) + return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) else: - return f(htable.PyObjectHashTable, com._ensure_object) + return f(htable.PyObjectHashTable, _ensure_object) _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), @@ -614,20 +635,20 @@ def _hashtable_algo(f, dtype, return_dtype=None): def _get_data_algo(values, func_map): - if com.is_float_dtype(values): + if is_float_dtype(values): f = func_map['float64'] - values = com._ensure_float64(values) + values = _ensure_float64(values) - elif com.needs_i8_conversion(values): + elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') - elif com.is_integer_dtype(values): + elif is_integer_dtype(values): f = func_map['int64'] - values = com._ensure_int64(values) + values = _ensure_int64(values) else: f = func_map['generic'] - values = com._ensure_object(values) + values = _ensure_object(values) return f, values @@ -689,7 +710,7 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): if arr.dtype != out.dtype: arr = arr.astype(out.dtype) if arr.shape[axis] > 0: - arr.take(com._ensure_platform_int(indexer), axis=axis, out=out) + arr.take(_ensure_platform_int(indexer), axis=axis, out=out) if needs_masking: outindexer = [slice(None)] * arr.ndim outindexer[axis] = mask @@ -830,7 +851,7 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): return func def func(arr, indexer, out, fill_value=np.nan): - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) _take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info) @@ -854,7 +875,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, out : ndarray or None, default None Optional output array, must be appropriate type to hold input and fill_value together, if indexer has any -1 value entries; call - common._maybe_promote to determine this type for any fill_value + _maybe_promote to determine this type for any fill_value fill_value : any, default np.nan Fill value to replace -1 values with mask_info : tuple of (ndarray, boolean) @@ -868,24 +889,24 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ # dispatch to internal type takes - if com.is_categorical(arr): + if is_categorical(arr): return arr.take_nd(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif com.is_datetimetz(arr): + elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = com._maybe_promote(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: @@ -931,7 +952,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, 
func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: @@ -957,11 +978,11 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if row_idx is None: row_idx = np.arange(arr.shape[0], dtype=np.int64) else: - row_idx = com._ensure_int64(row_idx) + row_idx = _ensure_int64(row_idx) if col_idx is None: col_idx = np.arange(arr.shape[1], dtype=np.int64) else: - col_idx = com._ensure_int64(col_idx) + col_idx = _ensure_int64(col_idx) indexer = row_idx, col_idx if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() @@ -969,7 +990,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = com._maybe_promote(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: @@ -1032,7 +1053,7 @@ def diff(arr, n, axis=0): na = np.nan dtype = arr.dtype is_timedelta = False - if com.needs_i8_conversion(arr): + if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view('i8') na = tslib.iNaT diff --git a/pandas/core/api.py b/pandas/core/api.py index 0a6992bfebd70..579f21eb4ada8 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,7 +5,7 @@ import numpy as np from pandas.core.algorithms import factorize, match, unique, value_counts -from pandas.core.common import isnull, notnull +from pandas.types.missing import isnull, notnull from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.formats.format import set_eng_float_format diff --git a/pandas/core/base.py b/pandas/core/base.py index 13a6b4b7b4ce0..a0dfebdfde356 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,6 +4,12 @@ from pandas import compat from pandas.compat import builtins import numpy as np + +from pandas.types.missing import isnull +from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndex +from pandas.types.common import (_ensure_object, is_object_dtype, + is_list_like, is_scalar) + from pandas.core import common as com import pandas.core.nanops as nanops import pandas.lib as lib @@ -11,7 +17,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError -from pandas.types import api as gt from pandas.formats.printing import pprint_thing _shared_docs = dict() @@ -121,7 +126,7 @@ def __sizeof__(self): """ if hasattr(self, 'memory_usage'): mem = self.memory_usage(deep=True) - if not lib.isscalar(mem): + if not is_scalar(mem): mem = mem.sum() return int(mem) @@ -293,15 +298,15 @@ def name(self): @property def _selection_list(self): - if not isinstance(self._selection, (list, tuple, gt.ABCSeries, - gt.ABCIndex, np.ndarray)): + if not isinstance(self._selection, (list, tuple, ABCSeries, + ABCIndex, np.ndarray)): return [self._selection] return self._selection @cache_readonly def _selected_obj(self): - if self._selection is None or isinstance(self.obj, gt.ABCSeries): + if self._selection is None or isinstance(self.obj, ABCSeries): return self.obj else: return self.obj[self._selection] @@ -313,7 +318,7 @@ def ndim(self): @cache_readonly def _obj_with_exclusions(self): if 
self._selection is not None and isinstance(self.obj, - gt.ABCDataFrame): + ABCDataFrame): return self.obj.reindex(columns=self._selection_list) if len(self.exclusions) > 0: @@ -325,7 +330,7 @@ def __getitem__(self, key): if self._selection is not None: raise Exception('Column(s) %s already selected' % self._selection) - if isinstance(key, (list, tuple, gt.ABCSeries, gt.ABCIndex, + if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) @@ -553,7 +558,7 @@ def _agg(arg, func): if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], - gt.ABCDataFrame): + ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame @@ -682,7 +687,7 @@ def _gotitem(self, key, ndim, subset=None): **kwargs) self._reset_cache() if subset.ndim == 2: - if lib.isscalar(key) and key in subset or com.is_list_like(key): + if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self @@ -903,7 +908,7 @@ def argmin(self, axis=None): @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ - return com.isnull(self).any() + return isnull(self).any() def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): @@ -980,7 +985,7 @@ def nunique(self, dropna=True): """ uniqs = self.unique() n = len(uniqs) - if dropna and com.isnull(uniqs).any(): + if dropna and isnull(uniqs).any(): n -= 1 return n @@ -1053,7 +1058,7 @@ def memory_usage(self, deep=False): return self.values.memory_usage(deep=deep) v = self.values.nbytes - if deep and com.is_object_dtype(self): + if deep and is_object_dtype(self): v += lib.memory_usage_of_objects(self.values) return v @@ -1195,7 +1200,7 @@ def drop_duplicates(self, keep='first', inplace=False): False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) def duplicated(self, keep='first'): - keys = com._values_from_object(com._ensure_object(self.values)) + keys = com._values_from_object(_ensure_object(self.values)) duplicated = lib.duplicated(keys, keep=keep) try: return self._constructor(duplicated, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f4aeaf9184d09..79d8bfbf57f12 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -7,6 +7,22 @@ from pandas import compat, lib from pandas.compat import u +from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex +from pandas.types.missing import isnull, notnull +from pandas.types.cast import (_possibly_infer_to_datetimelike, + _coerce_indexer_dtype) +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (_ensure_int64, + _ensure_object, + _ensure_platform_int, + is_dtype_equal, + is_datetimelike, + is_categorical_dtype, + is_integer_dtype, is_bool, + is_list_like, is_sequence, + is_scalar) +from pandas.core.common import is_null_slice + from pandas.core.algorithms import factorize, take_1d from pandas.core.base import (PandasObject, PandasDelegate, NoNewAttributesMixin, _shared_docs) @@ -16,13 +32,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) -from pandas.core.common import ( - ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull, - is_dtype_equal, is_categorical_dtype, is_integer_dtype, - _possibly_infer_to_datetimelike, 
is_list_like, - is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64, - _coerce_indexer_dtype) -from pandas.types.api import CategoricalDtype from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option @@ -64,7 +73,7 @@ def f(self, other): # With cat[0], for example, being ``np.int64(1)`` by the time it gets # into this function would become ``np.array(1)``. other = lib.item_from_zerodim(other) - if lib.isscalar(other): + if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) return getattr(self._codes, op)(i) @@ -968,7 +977,7 @@ def shift(self, periods): if codes.ndim > 1: raise NotImplementedError("Categorical with ndim > 1.") if np.prod(codes.shape) and (periods != 0): - codes = np.roll(codes, com._ensure_platform_int(periods), axis=0) + codes = np.roll(codes, _ensure_platform_int(periods), axis=0) if periods > 0: codes[:periods] = -1 else: @@ -1148,7 +1157,7 @@ def value_counts(self, dropna=True): counts : Series """ from numpy import bincount - from pandas.core.common import isnull + from pandas.types.missing import isnull from pandas.core.series import Series from pandas.core.index import CategoricalIndex @@ -1182,7 +1191,7 @@ def get_values(self): Index if datetime / periods """ # if we are a datetime and period index, return Index to keep metadata - if com.is_datetimelike(self.categories): + if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) return np.array(self) @@ -1933,7 +1942,7 @@ def _convert_to_list_like(list_like): if (is_sequence(list_like) or isinstance(list_like, tuple) or isinstance(list_like, types.GeneratorType)): return list(list_like) - elif lib.isscalar(list_like): + elif is_scalar(list_like): return [list_like] else: # is this reached? diff --git a/pandas/core/common.py b/pandas/core/common.py index 28bae362a3411..99dd2e9f5b8a9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2,23 +2,66 @@ Misc tools for implementing data structures """ -import re -import collections -import numbers +import sys +import warnings from datetime import datetime, timedelta from functools import partial import numpy as np -import pandas as pd -import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import (long, zip, map, string_types, - iteritems) -from pandas.types import api as gt -from pandas.types.api import * # noqa +from pandas.compat import long, zip, iteritems from pandas.core.config import get_option +from pandas.types.generic import ABCSeries +from pandas.types.common import _NS_DTYPE, is_integer +from pandas.types.inference import _iterable_not_string +from pandas.types.missing import isnull +from pandas.api import types +from pandas.types import common + +# back-compat of public API +# deprecate these functions +m = sys.modules['pandas.core.common'] +for t in [t for t in dir(types) if not t.startswith('_')]: + + def outer(t=t): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.core.common.{t} is deprecated. 
" + "import from the public API: " + "pandas.api.types.{t} instead".format(t=t), + FutureWarning, stacklevel=2) + return getattr(types, t)(*args, **kwargs) + return wrapper + + setattr(m, t, outer(t)) + +# back-compat for non-public functions +# deprecate these functions +for t in ['is_datetime_arraylike', + 'is_datetime_or_timedelta_dtype', + 'is_datetimelike', + 'is_datetimelike_v_numeric', + 'is_datetimelike_v_object', + 'is_datetimetz', + 'is_int_or_datetime_dtype', + 'is_period_arraylike', + 'is_string_like', + 'is_string_like_dtype']: + + def outer(t=t): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.core.common.{t} is deprecated. " + "These are not longer public API functions, " + "but can be imported from " + "pandas.types.common.{t} instead".format(t=t), + FutureWarning, stacklevel=2) + return getattr(common, t)(*args, **kwargs) + return wrapper + + setattr(m, t, outer(t)) class PandasError(Exception): @@ -58,322 +101,6 @@ def __str__(self): self.class_instance.__class__.__name__) -_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name - for t in ['O', 'int8', 'uint8', 'int16', 'uint16', - 'int32', 'uint32', 'int64', 'uint64']]) - -_NS_DTYPE = np.dtype('M8[ns]') -_TD_DTYPE = np.dtype('m8[ns]') -_INT64_DTYPE = np.dtype(np.int64) -_DATELIKE_DTYPES = set([np.dtype(t) - for t in ['M8[ns]', 'M8[ns]', - 'm8[ns]', 'm8[ns]']]) -_int8_max = np.iinfo(np.int8).max -_int16_max = np.iinfo(np.int16).max -_int32_max = np.iinfo(np.int32).max -_int64_max = np.iinfo(np.int64).max - - -def isnull(obj): - """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) - - Parameters - ---------- - arr : ndarray or object value - Object to check for null-ness - - Returns - ------- - isnulled : array-like of bool or bool - Array or bool indicating whether an object is null or if an array is - given which of the element is null. - - See also - -------- - pandas.notnull: boolean inverse of pandas.isnull - """ - return _isnull(obj) - - -def _isnull_new(obj): - if lib.isscalar(obj): - return lib.checknull(obj) - # hack (for now) because MI registers as ndarray - elif isinstance(obj, pd.MultiIndex): - raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (gt.ABCSeries, np.ndarray, pd.Index)): - return _isnull_ndarraylike(obj) - elif isinstance(obj, gt.ABCGeneric): - return obj._constructor(obj._data.isnull(func=isnull)) - elif isinstance(obj, list) or hasattr(obj, '__array__'): - return _isnull_ndarraylike(np.asarray(obj)) - else: - return obj is None - - -def _isnull_old(obj): - """Detect missing values. Treat None, NaN, INF, -INF as null. - - Parameters - ---------- - arr: ndarray or object value - - Returns - ------- - boolean ndarray or boolean - """ - if lib.isscalar(obj): - return lib.checknull_old(obj) - # hack (for now) because MI registers as ndarray - elif isinstance(obj, pd.MultiIndex): - raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (gt.ABCSeries, np.ndarray, pd.Index)): - return _isnull_ndarraylike_old(obj) - elif isinstance(obj, gt.ABCGeneric): - return obj._constructor(obj._data.isnull(func=_isnull_old)) - elif isinstance(obj, list) or hasattr(obj, '__array__'): - return _isnull_ndarraylike_old(np.asarray(obj)) - else: - return obj is None - - -_isnull = _isnull_new - - -def _use_inf_as_null(key): - """Option change callback for null/inf behaviour - Choose which replacement for numpy.isnan / ~numpy.isfinite is used. 
- - Parameters - ---------- - flag: bool - True means treat None, NaN, INF, -INF as null (old way), - False means None and NaN are null, but INF, -INF are not null - (new way). - - Notes - ----- - This approach to setting global module values is discussed and - approved here: - - * http://stackoverflow.com/questions/4859217/ - programmatically-creating-variables-in-python/4859312#4859312 - """ - flag = get_option(key) - if flag: - globals()['_isnull'] = _isnull_old - else: - globals()['_isnull'] = _isnull_new - - -def _isnull_ndarraylike(obj): - - values = getattr(obj, 'values', obj) - dtype = values.dtype - - if is_string_dtype(dtype): - if is_categorical_dtype(values): - from pandas import Categorical - if not isinstance(values, Categorical): - values = values.values - result = values.isnull() - else: - - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj(values.ravel()) - result[...] = vec.reshape(shape) - - elif is_datetimelike(obj): - # this is the NaT pattern - result = values.view('i8') == tslib.iNaT - else: - result = np.isnan(values) - - # box - if isinstance(obj, gt.ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) - - return result - - -def _isnull_ndarraylike_old(obj): - values = getattr(obj, 'values', obj) - dtype = values.dtype - - if is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj_old(values.ravel()) - result[:] = vec.reshape(shape) - - elif dtype in _DATELIKE_DTYPES: - # this is the NaT pattern - result = values.view('i8') == tslib.iNaT - else: - result = ~np.isfinite(values) - - # box - if isinstance(obj, gt.ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) - - return result - - -def notnull(obj): - """Replacement for numpy.isfinite / ~numpy.isnan which is suitable for use - on object arrays. - - Parameters - ---------- - arr : ndarray or object value - Object to check for *not*-null-ness - - Returns - ------- - isnulled : array-like of bool or bool - Array or bool indicating whether an object is *not* null or if an array - is given which of the element is *not* null. - - See also - -------- - pandas.isnull : boolean inverse of pandas.notnull - """ - res = isnull(obj) - if lib.isscalar(res): - return not res - return ~res - - -def is_null_datelike_scalar(other): - """ test whether the object is a null datelike, e.g. Nat - but guard against passing a non-scalar """ - if other is pd.NaT or other is None: - return True - elif lib.isscalar(other): - - # a timedelta - if hasattr(other, 'dtype'): - return other.view('i8') == tslib.iNaT - elif is_integer(other) and other == tslib.iNaT: - return True - return isnull(other) - return False - - -def array_equivalent(left, right, strict_nan=False): - """ - True if two arrays, left and right, have equal non-NaN elements, and NaNs - in corresponding locations. False otherwise. It is assumed that left and - right are NumPy arrays of the same dtype. The behavior of this function - (particularly with respect to NaNs) is not defined if the dtypes are - different. 
- - Parameters - ---------- - left, right : ndarrays - strict_nan : bool, default False - If True, consider NaN and None to be different. - - Returns - ------- - b : bool - Returns True if the arrays are equivalent. - - Examples - -------- - >>> array_equivalent( - ... np.array([1, 2, np.nan]), - ... np.array([1, 2, np.nan])) - True - >>> array_equivalent( - ... np.array([1, np.nan, 2]), - ... np.array([1, 2, np.nan])) - False - """ - - left, right = np.asarray(left), np.asarray(right) - - # shape compat - if left.shape != right.shape: - return False - - # Object arrays can contain None, NaN and NaT. - # string dtypes must be come to this path for NumPy 1.7.1 compat - if is_string_dtype(left) or is_string_dtype(right): - - if not strict_nan: - # pd.isnull considers NaN and None to be equivalent. - return lib.array_equivalent_object(_ensure_object(left.ravel()), - _ensure_object(right.ravel())) - - for left_value, right_value in zip(left, right): - if left_value is tslib.NaT and right_value is not tslib.NaT: - return False - - elif isinstance(left_value, float) and np.isnan(left_value): - if (not isinstance(right_value, float) or - not np.isnan(right_value)): - return False - else: - if left_value != right_value: - return False - return True - - # NaNs can occur in float and complex arrays. - if is_float_dtype(left) or is_complex_dtype(left): - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - - # numpy will will not allow this type of datetimelike vs integer comparison - elif is_datetimelike_v_numeric(left, right): - return False - - # M8/m8 - elif needs_i8_conversion(left) and needs_i8_conversion(right): - if not is_dtype_equal(left.dtype, right.dtype): - return False - - left = left.view('i8') - right = right.view('i8') - - # NaNs cannot occur otherwise. - try: - return np.array_equal(left, right) - except AttributeError: - # see gh-13388 - # - # NumPy v1.7.1 has a bug in its array_equal - # function that prevents it from correctly - # comparing two arrays with complex dtypes. - # This bug is corrected in v1.8.0, so remove - # this try-except block as soon as we stop - # supporting NumPy versions < 1.8.0 - if not is_dtype_equal(left.dtype, right.dtype): - return False - - left = left.tolist() - right = right.tolist() - - return left == right - - -def _iterable_not_string(x): - return (isinstance(x, collections.Iterable) and - not isinstance(x, compat.string_types)) - - def flatten(l): """Flatten an arbitrarily nested sequence. @@ -398,510 +125,6 @@ def flatten(l): yield el -def _coerce_indexer_dtype(indexer, categories): - """ coerce the indexer input array to the smallest dtype possible """ - l = len(categories) - if l < _int8_max: - return _ensure_int8(indexer) - elif l < _int16_max: - return _ensure_int16(indexer) - elif l < _int32_max: - return _ensure_int32(indexer) - return _ensure_int64(indexer) - - -def _coerce_to_dtypes(result, dtypes): - """ given a dtypes and a result set, coerce the result elements to the - dtypes - """ - if len(result) != len(dtypes): - raise AssertionError("_coerce_to_dtypes requires equal len arrays") - - from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type - - def conv(r, dtype): - try: - if isnull(r): - pass - elif dtype == _NS_DTYPE: - r = lib.Timestamp(r) - elif dtype == _TD_DTYPE: - r = _coerce_scalar_to_timedelta_type(r) - elif dtype == np.bool_: - # messy. non 0/1 integers do not get converted. 
- if is_integer(r) and r not in [0, 1]: - return int(r) - r = bool(r) - elif dtype.kind == 'f': - r = float(r) - elif dtype.kind == 'i': - r = int(r) - except: - pass - - return r - - return [conv(r, dtype) for r, dtype in zip(result, dtypes)] - - -def _infer_fill_value(val): - """ - infer the fill value for the nan/NaT from the provided - scalar/ndarray/list-like if we are a NaT, return the correct dtyped - element to provide proper block construction - """ - - if not is_list_like(val): - val = [val] - val = np.array(val, copy=False) - if is_datetimelike(val): - return np.array('NaT', dtype=val.dtype) - elif is_object_dtype(val.dtype): - dtype = lib.infer_dtype(_ensure_object(val)) - if dtype in ['datetime', 'datetime64']: - return np.array('NaT', dtype=_NS_DTYPE) - elif dtype in ['timedelta', 'timedelta64']: - return np.array('NaT', dtype=_TD_DTYPE) - return np.nan - - -def _infer_dtype_from_scalar(val): - """ interpret the dtype from a scalar """ - - dtype = np.object_ - - # a 1-element ndarray - if isinstance(val, np.ndarray): - if val.ndim != 0: - raise ValueError( - "invalid ndarray passed to _infer_dtype_from_scalar") - - dtype = val.dtype - val = val.item() - - elif isinstance(val, compat.string_types): - - # If we create an empty array using a string to infer - # the dtype, NumPy will only allocate one character per entry - # so this is kind of bad. Alternately we could use np.repeat - # instead of np.empty (but then you still don't want things - # coming out as np.str_! - - dtype = np.object_ - - elif isinstance(val, (np.datetime64, - datetime)) and getattr(val, 'tzinfo', None) is None: - val = lib.Timestamp(val).value - dtype = np.dtype('M8[ns]') - - elif isinstance(val, (np.timedelta64, timedelta)): - val = lib.Timedelta(val).value - dtype = np.dtype('m8[ns]') - - elif is_bool(val): - dtype = np.bool_ - - elif is_integer(val): - if isinstance(val, np.integer): - dtype = type(val) - else: - dtype = np.int64 - - elif is_float(val): - if isinstance(val, np.floating): - dtype = type(val) - else: - dtype = np.float64 - - elif is_complex(val): - dtype = np.complex_ - - return dtype, val - - -def _is_na_compat(arr, fill_value=np.nan): - """ - Parameters - ---------- - arr: a numpy array - fill_value: fill value, default to np.nan - - Returns - ------- - True if we can fill using this fill_value - """ - dtype = arr.dtype - if isnull(fill_value): - return not (is_bool_dtype(dtype) or - is_integer_dtype(dtype)) - return True - - -def _maybe_fill(arr, fill_value=np.nan): - """ - if we have a compatiable fill_value and arr dtype, then fill - """ - if _is_na_compat(arr, fill_value): - arr.fill(fill_value) - return arr - - -def _maybe_promote(dtype, fill_value=np.nan): - - # if we passed an array here, determine the fill value by dtype - if isinstance(fill_value, np.ndarray): - if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): - fill_value = tslib.iNaT - else: - - # we need to change to object type as our - # fill_value is of object type - if fill_value.dtype == np.object_: - dtype = np.dtype(np.object_) - fill_value = np.nan - - # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, (np.datetime64, np.timedelta64)): - # for now: refuse to upcast datetime64 - # (this is because datetime64 will not implicitly upconvert - # to object correctly as of numpy 1.6.1) - if isnull(fill_value): - fill_value = tslib.iNaT - else: - if issubclass(dtype.type, np.datetime64): - try: - fill_value = lib.Timestamp(fill_value).value - except: - # the proper thing to do here 
would probably be to upcast - # to object (but numpy 1.6.1 doesn't do this properly) - fill_value = tslib.iNaT - elif issubclass(dtype.type, np.timedelta64): - try: - fill_value = lib.Timedelta(fill_value).value - except: - # as for datetimes, cannot upcast to object - fill_value = tslib.iNaT - else: - fill_value = tslib.iNaT - elif is_datetimetz(dtype): - if isnull(fill_value): - fill_value = tslib.iNaT - elif is_float(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - dtype = np.float64 - elif is_bool(fill_value): - if not issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif is_integer(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - # upcast to prevent overflow - arr = np.asarray(fill_value) - if arr != arr.astype(dtype): - dtype = arr.dtype - elif is_complex(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, (np.integer, np.floating)): - dtype = np.complex128 - elif fill_value is None: - if is_float_dtype(dtype) or is_complex_dtype(dtype): - fill_value = np.nan - elif is_integer_dtype(dtype): - dtype = np.float64 - fill_value = np.nan - elif is_datetime_or_timedelta_dtype(dtype): - fill_value = tslib.iNaT - else: - dtype = np.object_ - else: - dtype = np.object_ - - # in case we have a string that looked like a number - if is_categorical_dtype(dtype): - pass - elif is_datetimetz(dtype): - pass - elif issubclass(np.dtype(dtype).type, compat.string_types): - dtype = np.object_ - - return dtype, fill_value - - -def _maybe_upcast_putmask(result, mask, other): - """ - A safe version of putmask that potentially upcasts the result - - Parameters - ---------- - result : ndarray - The destination array. This will be mutated in-place if no upcasting is - necessary. - mask : boolean ndarray - other : ndarray or scalar - The source array or value - - Returns - ------- - result : ndarray - changed : boolean - Set to true if the result array was upcasted - """ - - if mask.any(): - # Two conversions for date-like dtypes that can't be done automatically - # in np.place: - # NaN -> NaT - # integer or integer array -> date-like array - if result.dtype in _DATELIKE_DTYPES: - if lib.isscalar(other): - if isnull(other): - other = result.dtype.type('nat') - elif is_integer(other): - other = np.array(other, dtype=result.dtype) - elif is_integer_dtype(other): - other = np.array(other, dtype=result.dtype) - - def changeit(): - - # try to directly set by expanding our array to full - # length of the boolean - try: - om = other[mask] - om_at = om.astype(result.dtype) - if (om == om_at).all(): - new_result = result.values.copy() - new_result[mask] = om_at - result[:] = new_result - return result, False - except: - pass - - # we are forced to change the dtype of the result as the input - # isn't compatible - r, _ = _maybe_upcast(result, fill_value=other, copy=True) - np.place(r, mask, other) - - return r, True - - # we want to decide whether place will work - # if we have nans in the False portion of our mask then we need to - # upcast (possibly), otherwise we DON't want to upcast (e.g. 
if we - # have values, say integers, in the success portion then it's ok to not - # upcast) - new_dtype, _ = _maybe_promote(result.dtype, other) - if new_dtype != result.dtype: - - # we have a scalar or len 0 ndarray - # and its nan and we are changing some values - if (lib.isscalar(other) or - (isinstance(other, np.ndarray) and other.ndim < 1)): - if isnull(other): - return changeit() - - # we have an ndarray and the masking has nans in it - else: - - if isnull(other[mask]).any(): - return changeit() - - try: - np.place(result, mask, other) - except: - return changeit() - - return result, False - - -def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): - """ provide explict type promotion and coercion - - Parameters - ---------- - values : the ndarray that we want to maybe upcast - fill_value : what we want to fill with - dtype : if None, then use the dtype of the values, else coerce to this type - copy : if True always make a copy even if no upcast is required - """ - - if is_extension_type(values): - if copy: - values = values.copy() - else: - if dtype is None: - dtype = values.dtype - new_dtype, fill_value = _maybe_promote(dtype, fill_value) - if new_dtype != values.dtype: - values = values.astype(new_dtype) - elif copy: - values = values.copy() - - return values, fill_value - - -def _possibly_cast_item(obj, item, dtype): - chunk = obj[item] - - if chunk.values.dtype != dtype: - if dtype in (np.object_, np.bool_): - obj[item] = chunk.astype(np.object_) - elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: %s" % dtype) - - -def _possibly_downcast_to_dtype(result, dtype): - """ try to cast to the specified dtype (e.g. convert back to bool/int - or could be an astype of float64->float32 - """ - - if lib.isscalar(result): - return result - - def trans(x): - return x - - if isinstance(dtype, compat.string_types): - if dtype == 'infer': - inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) - if inferred_type == 'boolean': - dtype = 'bool' - elif inferred_type == 'integer': - dtype = 'int64' - elif inferred_type == 'datetime64': - dtype = 'datetime64[ns]' - elif inferred_type == 'timedelta64': - dtype = 'timedelta64[ns]' - - # try to upcast here - elif inferred_type == 'floating': - dtype = 'int64' - if issubclass(result.dtype.type, np.number): - - def trans(x): # noqa - return x.round() - else: - dtype = 'object' - - if isinstance(dtype, compat.string_types): - dtype = np.dtype(dtype) - - try: - - # don't allow upcasts here (except if empty) - if dtype.kind == result.dtype.kind: - if (result.dtype.itemsize <= dtype.itemsize and - np.prod(result.shape)): - return result - - if issubclass(dtype.type, np.floating): - return result.astype(dtype) - elif dtype == np.bool_ or issubclass(dtype.type, np.integer): - - # if we don't have any elements, just astype it - if not np.prod(result.shape): - return trans(result).astype(dtype) - - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - # if we have any nulls, then we are done - if isnull(arr).any() or not np.allclose(arr, - trans(arr).astype(dtype)): - return result - - # a comparable, e.g. 
a Decimal may slip in here - elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, - float, bool)): - return result - - if (issubclass(result.dtype.type, (np.object_, np.number)) and - notnull(result).all()): - new_result = trans(result).astype(dtype) - try: - if np.allclose(new_result, result): - return new_result - except: - - # comparison of an object dtype with a number type could - # hit here - if (new_result == result).all(): - return new_result - - # a datetimelike - elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: - try: - result = result.astype(dtype) - except: - if dtype.tz: - # convert to datetime and change timezone - result = pd.to_datetime(result).tz_localize(dtype.tz) - - except: - pass - - return result - - -def _maybe_convert_string_to_object(values): - """ - - Convert string-like and string-like array to convert object dtype. - This is to avoid numpy to handle the array as str dtype. - """ - if isinstance(values, string_types): - values = np.array([values], dtype=object) - elif (isinstance(values, np.ndarray) and - issubclass(values.dtype.type, (np.string_, np.unicode_))): - values = values.astype(object) - return values - - -def _maybe_convert_scalar(values): - """ - Convert a python scalar to the appropriate numpy dtype if possible - This avoids numpy directly converting according to platform preferences - """ - if lib.isscalar(values): - dtype, values = _infer_dtype_from_scalar(values) - try: - values = dtype(values) - except TypeError: - pass - return values - - -def _lcd_dtypes(a_dtype, b_dtype): - """ return the lcd dtype to hold these types """ - - if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): - return _NS_DTYPE - elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): - return _TD_DTYPE - elif is_complex_dtype(a_dtype): - if is_complex_dtype(b_dtype): - return a_dtype - return np.float64 - elif is_integer_dtype(a_dtype): - if is_integer_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - return np.int64 - return np.float64 - elif is_float_dtype(a_dtype): - if is_float_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - else: - return np.float64 - elif is_integer(b_dtype): - return np.float64 - return np.object - - def _consensus_name_attr(objs): name = objs[0].name for obj in objs[1:]: @@ -909,66 +132,20 @@ def _consensus_name_attr(objs): return None return name -# ---------------------------------------------------------------------- -# Lots of little utilities - - -def _validate_date_like_dtype(dtype): - try: - typ = np.datetime_data(dtype)[0] - except ValueError as e: - raise TypeError('%s' % e) - if typ != 'generic' and typ != 'ns': - raise ValueError('%r is too specific of a frequency, try passing %r' % - (dtype.name, dtype.type.__name__)) - - -def _invalidate_string_dtypes(dtype_set): - """Change string like dtypes to object for - ``DataFrame.select_dtypes()``. - """ - non_string_dtypes = dtype_set - _string_dtypes - if non_string_dtypes != dtype_set: - raise TypeError("string dtypes are not allowed, use 'object' instead") - - -def _get_dtype_from_object(dtype): - """Get a numpy dtype.type-style object. This handles the datetime64[ns] - and datetime64[ns, TZ] compat - - Notes - ----- - If nothing can be found, returns ``object``. 
- """ - # type object from a dtype - if isinstance(dtype, type) and issubclass(dtype, np.generic): - return dtype - elif is_categorical(dtype): - return gt.CategoricalDtype().type - elif is_datetimetz(dtype): - return gt.DatetimeTZDtype(dtype).type - elif isinstance(dtype, np.dtype): # dtype object - try: - _validate_date_like_dtype(dtype) - except TypeError: - # should still pass if we don't have a datelike - pass - return dtype.type - elif isinstance(dtype, compat.string_types): - if dtype == 'datetime' or dtype == 'timedelta': - dtype += '64' - - try: - return _get_dtype_from_object(getattr(np, dtype)) - except (AttributeError, TypeError): - # handles cases like _get_dtype(int) - # i.e., python objects that are valid dtypes (unlike user-defined - # types, in general) - # TypeError handles the float16 typecode of 'e' - # further handle internal types - pass - return _get_dtype_from_object(np.dtype(dtype)) +def _maybe_match_name(a, b): + a_has = hasattr(a, 'name') + b_has = hasattr(b, 'name') + if a_has and b_has: + if a.name == b.name: + return a.name + else: + return None + elif a_has: + return a.name + elif b_has: + return b.name + return None def _get_info_slice(obj, indexer): @@ -1005,225 +182,8 @@ def _maybe_box_datetimelike(value): _values_from_object = lib.values_from_object -def _possibly_castable(arr): - # return False to force a non-fastpath - - # check datetime64[ns]/timedelta64[ns] are valid - # otherwise try to coerce - kind = arr.dtype.kind - if kind == 'M' or kind == 'm': - return arr.dtype in _DATELIKE_DTYPES - - return arr.dtype.name not in _POSSIBLY_CAST_DTYPES - - -def _possibly_convert_platform(values): - """ try to do platform conversion, allow ndarray or list here """ - - if isinstance(values, (list, tuple)): - values = lib.list_to_object_array(values) - if getattr(values, 'dtype', None) == np.object_: - if hasattr(values, '_values'): - values = values._values - values = lib.maybe_convert_objects(values) - - return values - - -def _possibly_cast_to_datetime(value, dtype, errors='raise'): - """ try to cast the array/value to a datetimelike dtype, converting float - nan to iNaT - """ - from pandas.tseries.timedeltas import to_timedelta - from pandas.tseries.tools import to_datetime - - if dtype is not None: - if isinstance(dtype, compat.string_types): - dtype = np.dtype(dtype) - - is_datetime64 = is_datetime64_dtype(dtype) - is_datetime64tz = is_datetime64tz_dtype(dtype) - is_timedelta64 = is_timedelta64_dtype(dtype) - - if is_datetime64 or is_datetime64tz or is_timedelta64: - - # force the dtype if needed - if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): - if dtype.name == 'datetime64[ns]': - dtype = _NS_DTYPE - else: - raise TypeError("cannot convert datetimelike to " - "dtype [%s]" % dtype) - elif is_datetime64tz: - - # our NaT doesn't support tz's - # this will coerce to DatetimeIndex with - # a matching dtype below - if lib.isscalar(value) and isnull(value): - value = [value] - - elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): - if dtype.name == 'timedelta64[ns]': - dtype = _TD_DTYPE - else: - raise TypeError("cannot convert timedeltalike to " - "dtype [%s]" % dtype) - - if lib.isscalar(value): - if value == tslib.iNaT or isnull(value): - value = tslib.iNaT - else: - value = np.array(value, copy=False) - - # have a scalar array-like (e.g. 
NaT) - if value.ndim == 0: - value = tslib.iNaT - - # we have an array of datetime or timedeltas & nulls - elif np.prod(value.shape) or not is_dtype_equal(value.dtype, - dtype): - try: - if is_datetime64: - value = to_datetime(value, errors=errors)._values - elif is_datetime64tz: - # input has to be UTC at this point, so just - # localize - value = to_datetime( - value, - errors=errors).tz_localize(dtype.tz) - elif is_timedelta64: - value = to_timedelta(value, errors=errors)._values - except (AttributeError, ValueError, TypeError): - pass - - # coerce datetimelike to object - elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): - if is_object_dtype(dtype): - ints = np.asarray(value).view('i8') - return tslib.ints_to_pydatetime(ints) - - # we have a non-castable dtype that was passed - raise TypeError('Cannot cast datetime64 to %s' % dtype) - - else: - - is_array = isinstance(value, np.ndarray) - - # catch a datetime/timedelta that is not of ns variety - # and no coercion specified - if is_array and value.dtype.kind in ['M', 'm']: - dtype = value.dtype - - if dtype.kind == 'M' and dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) - - elif dtype.kind == 'm' and dtype != _TD_DTYPE: - value = to_timedelta(value) - - # only do this if we have an array and the dtype of the array is not - # setup already we are not an integer/object, so don't bother with this - # conversion - elif not (is_array and not (issubclass(value.dtype.type, np.integer) or - value.dtype == np.object_)): - value = _possibly_infer_to_datetimelike(value) - - return value - - -def _possibly_infer_to_datetimelike(value, convert_dates=False): - """ - we might have an array (or single object) that is datetime like, - and no dtype is passed don't change the value unless we find a - datetime/timedelta set - - this is pretty strict in that a datetime/timedelta is REQUIRED - in addition to possible nulls/string likes - - ONLY strings are NOT datetimelike - - Parameters - ---------- - value : np.array / Series / Index / list-like - convert_dates : boolean, default False - if True try really hard to convert dates (such as datetime.date), other - leave inferred dtype 'date' alone - - """ - - if isinstance(value, (gt.ABCDatetimeIndex, gt.ABCPeriodIndex)): - return value - elif isinstance(value, gt.ABCSeries): - if isinstance(value._values, gt.ABCDatetimeIndex): - return value._values - - v = value - if not is_list_like(v): - v = [v] - v = np.array(v, copy=False) - shape = v.shape - if not v.ndim == 1: - v = v.ravel() - - if len(v): - - def _try_datetime(v): - # safe coerce to datetime64 - try: - v = tslib.array_to_datetime(v, errors='raise') - except ValueError: - - # we might have a sequence of the same-datetimes with tz's - # if so coerce to a DatetimeIndex; if they are not the same, - # then these stay as object dtype - try: - from pandas import to_datetime - return to_datetime(v) - except: - pass - - except: - pass - - return v.reshape(shape) - - def _try_timedelta(v): - # safe coerce to timedelta64 - - # will try first with a string & object conversion - from pandas.tseries.timedeltas import to_timedelta - try: - return to_timedelta(v)._values.reshape(shape) - except: - return v - - # do a quick inference for perf - sample = v[:min(3, len(v))] - inferred_type = lib.infer_dtype(sample) - - if (inferred_type in ['datetime', 'datetime64'] or - (convert_dates and inferred_type in ['date'])): - value = _try_datetime(v) - elif inferred_type in ['timedelta', 'timedelta64']: - value = _try_timedelta(v) - - # It's possible 
to have nulls intermixed within the datetime or - # timedelta. These will in general have an inferred_type of 'mixed', - # so have to try both datetime and timedelta. - - # try timedelta first to avoid spurious datetime conversions - # e.g. '00:00:01' is a timedelta but technically is also a datetime - elif inferred_type in ['mixed']: - - if lib.is_possible_datetimelike_array(_ensure_object(v)): - value = _try_timedelta(v) - if lib.infer_dtype(value) in ['mixed']: - value = _try_datetime(v) - - return value - - def is_bool_indexer(key): - if isinstance(key, (gt.ABCSeries, np.ndarray)): + if isinstance(key, (ABCSeries, np.ndarray)): if key.dtype == np.object_: key = np.asarray(_values_from_object(key)) @@ -1250,12 +210,6 @@ def _default_index(n): return RangeIndex(0, n, name=None) -def ensure_float(arr): - if issubclass(arr.dtype.type, (np.integer, np.bool_)): - arr = arr.astype(float) - return arr - - def _mut_exclusive(**kwargs): item1, item2 = kwargs.items() label1, val1 = item1 @@ -1287,6 +241,10 @@ def _all_not_none(*args): return True +def _count_not_none(*args): + return sum(x is not None for x in args) + + def _try_sort(iterable): listed = list(iterable) try: @@ -1295,10 +253,6 @@ def _try_sort(iterable): return listed -def _count_not_none(*args): - return sum(x is not None for x in args) - - def iterpairs(seq): """ Parameters @@ -1451,349 +405,6 @@ def _maybe_make_list(obj): return [obj] return obj -# TYPE TESTING - -is_bool = lib.is_bool - -is_integer = lib.is_integer - -is_float = lib.is_float - -is_complex = lib.is_complex - - -def is_string_like(obj): - return isinstance(obj, (compat.text_type, compat.string_types)) - - -def is_iterator(obj): - # python 3 generators have __next__ instead of next - return hasattr(obj, 'next') or hasattr(obj, '__next__') - - -def is_number(obj): - return isinstance(obj, (numbers.Number, np.number)) - - -def is_period_arraylike(arr): - """ return if we are period arraylike / PeriodIndex """ - if isinstance(arr, pd.PeriodIndex): - return True - elif isinstance(arr, (np.ndarray, gt.ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'period' - return getattr(arr, 'inferred_type', None) == 'period' - - -def is_datetime_arraylike(arr): - """ return if we are datetime arraylike / DatetimeIndex """ - if isinstance(arr, gt.ABCDatetimeIndex): - return True - elif isinstance(arr, (np.ndarray, gt.ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' - return getattr(arr, 'inferred_type', None) == 'datetime' - - -def is_datetimelike(arr): - return (arr.dtype in _DATELIKE_DTYPES or - isinstance(arr, gt.ABCPeriodIndex) or - is_datetimetz(arr)) - - -def _coerce_to_dtype(dtype): - """ coerce a string / np.dtype to a dtype """ - if is_categorical_dtype(dtype): - dtype = gt.CategoricalDtype() - elif is_datetime64tz_dtype(dtype): - dtype = gt.DatetimeTZDtype(dtype) - else: - dtype = np.dtype(dtype) - return dtype - - -def _get_dtype(arr_or_dtype): - if isinstance(arr_or_dtype, np.dtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype) - elif isinstance(arr_or_dtype, gt.CategoricalDtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, gt.DatetimeTZDtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, compat.string_types): - if is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtype.construct_from_string(arr_or_dtype) - elif is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtype.construct_from_string(arr_or_dtype) - - if hasattr(arr_or_dtype, 
'dtype'): - arr_or_dtype = arr_or_dtype.dtype - return np.dtype(arr_or_dtype) - - -def _get_dtype_type(arr_or_dtype): - if isinstance(arr_or_dtype, np.dtype): - return arr_or_dtype.type - elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype).type - elif isinstance(arr_or_dtype, gt.CategoricalDtype): - return gt.CategoricalDtypeType - elif isinstance(arr_or_dtype, gt.DatetimeTZDtype): - return gt.DatetimeTZDtypeType - elif isinstance(arr_or_dtype, compat.string_types): - if is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtypeType - elif is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtypeType - return _get_dtype_type(np.dtype(arr_or_dtype)) - try: - return arr_or_dtype.dtype.type - except AttributeError: - return type(None) - - -def is_dtype_equal(source, target): - """ return a boolean if the dtypes are equal """ - try: - source = _get_dtype(source) - target = _get_dtype(target) - return source == target - except (TypeError, AttributeError): - - # invalid comparison - # object == category will hit this - return False - - -def is_any_int_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.integer) - - -def is_integer_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_int64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.int64) - - -def is_int_or_datetime_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) or - issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_datetime64_dtype(arr_or_dtype): - try: - tipo = _get_dtype_type(arr_or_dtype) - except TypeError: - return False - return issubclass(tipo, np.datetime64) - - -def is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtype.is_dtype(arr_or_dtype) - - -def is_datetime64_any_dtype(arr_or_dtype): - return (is_datetime64_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) - - -def is_datetime64_ns_dtype(arr_or_dtype): - try: - tipo = _get_dtype(arr_or_dtype) - except TypeError: - return False - return tipo == _NS_DTYPE - - -def is_timedelta64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.timedelta64) - - -def is_timedelta64_ns_dtype(arr_or_dtype): - tipo = _get_dtype(arr_or_dtype) - return tipo == _TD_DTYPE - - -def is_datetime_or_timedelta_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, (np.datetime64, np.timedelta64)) - - -def is_numeric_v_string_like(a, b): - """ - numpy doesn't like to compare numeric arrays vs scalar string-likes - - return a boolean result if this is the case for a,b or b,a - - """ - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) - - is_a_numeric_array = is_a_array and is_numeric_dtype(a) - is_b_numeric_array = is_b_array and is_numeric_dtype(b) - is_a_string_array = is_a_array and is_string_like_dtype(a) - is_b_string_array = is_b_array and is_string_like_dtype(b) - - is_a_scalar_string_like = not is_a_array and is_string_like(a) - is_b_scalar_string_like = not is_b_array and is_string_like(b) - - return ((is_a_numeric_array and is_b_scalar_string_like) or - (is_b_numeric_array and is_a_scalar_string_like) or - (is_a_numeric_array and is_b_string_array) or - (is_b_numeric_array and is_a_string_array)) - - -def is_datetimelike_v_numeric(a, b): - # return if we have an i8 convertible and numeric 
comparison - if not hasattr(a, 'dtype'): - a = np.asarray(a) - if not hasattr(b, 'dtype'): - b = np.asarray(b) - - def is_numeric(x): - return is_integer_dtype(x) or is_float_dtype(x) - - is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_numeric(b)) or - (is_datetimelike(b) and is_numeric(a))) - - -def is_datetimelike_v_object(a, b): - # return if we have an i8 convertible and object comparsion - if not hasattr(a, 'dtype'): - a = np.asarray(a) - if not hasattr(b, 'dtype'): - b = np.asarray(b) - - def f(x): - return is_object_dtype(x) - - def is_object(x): - return is_integer_dtype(x) or is_float_dtype(x) - - is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_object(b)) or - (is_datetimelike(b) and is_object(a))) - - -def needs_i8_conversion(arr_or_dtype): - return (is_datetime_or_timedelta_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) - - -def is_numeric_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, (np.number, np.bool_)) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_string_dtype(arr_or_dtype): - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('O', 'S', 'U') - - -def is_string_like_dtype(arr_or_dtype): - # exclude object as its a mixed dtype - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('S', 'U') - - -def is_float_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.floating) - - -def is_floating_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return isinstance(tipo, np.floating) - - -def is_bool_dtype(arr_or_dtype): - try: - tipo = _get_dtype_type(arr_or_dtype) - except ValueError: - # this isn't even a dtype - return False - return issubclass(tipo, np.bool_) - - -def is_sparse(array): - """ return if we are a sparse array """ - return isinstance(array, (gt.ABCSparseArray, gt.ABCSparseSeries)) - - -def is_datetimetz(array): - """ return if we are a datetime with tz array """ - return ((isinstance(array, gt.ABCDatetimeIndex) and - getattr(array, 'tz', None) is not None) or - is_datetime64tz_dtype(array)) - - -def is_extension_type(value): - """ - if we are a klass that is preserved by the internals - these are internal klasses that we represent (and don't use a np.array) - """ - if is_categorical(value): - return True - elif is_sparse(value): - return True - elif is_datetimetz(value): - return True - return False - - -def is_categorical(array): - """ return if we are a categorical possibility """ - return isinstance(array, gt.ABCCategorical) or is_categorical_dtype(array) - - -def is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtype.is_dtype(arr_or_dtype) - - -def is_complex_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.complexfloating) - - -def is_object_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.object_) - - -def is_re(obj): - return isinstance(obj, re._pattern_type) - - -def is_re_compilable(obj): - try: - re.compile(obj) - except TypeError: - return False - else: - return True - - -def is_list_like(arg): - return (hasattr(arg, '__iter__') and - not isinstance(arg, compat.string_and_binary_types)) - - -def is_dict_like(arg): - return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') - - -def is_named_tuple(arg): - return isinstance(arg, tuple) and hasattr(arg, '_fields') - def is_null_slice(obj): """ we have a null slice """ @@ -1807,47 +418,6 @@ def is_full_slice(obj, l): obj.step 
is None) -def is_hashable(arg): - """Return True if hash(arg) will succeed, False otherwise. - - Some types will pass a test against collections.Hashable but fail when they - are actually hashed with hash(). - - Distinguish between these and other types by trying the call to hash() and - seeing if they raise TypeError. - - Examples - -------- - >>> a = ([],) - >>> isinstance(a, collections.Hashable) - True - >>> is_hashable(a) - False - """ - # unfortunately, we can't use isinstance(arg, collections.Hashable), which - # can be faster than calling hash, because numpy scalars on Python 3 fail - # this test - - # reconsider this decision once this numpy bug is fixed: - # https://github.com/numpy/numpy/issues/5562 - - try: - hash(arg) - except TypeError: - return False - else: - return True - - -def is_sequence(x): - try: - iter(x) - len(x) # it has a length - return not isinstance(x, compat.string_and_binary_types) - except (TypeError, AttributeError): - return False - - def _get_callable_name(obj): # typical case has name if hasattr(obj, '__name__'): @@ -1875,74 +445,6 @@ def _apply_if_callable(maybe_callable, obj, **kwargs): return maybe_callable -_string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, - compat.text_type))) - -_ensure_float64 = algos.ensure_float64 -_ensure_float32 = algos.ensure_float32 -_ensure_int64 = algos.ensure_int64 -_ensure_int32 = algos.ensure_int32 -_ensure_int16 = algos.ensure_int16 -_ensure_int8 = algos.ensure_int8 -_ensure_platform_int = algos.ensure_platform_int -_ensure_object = algos.ensure_object - - -def _astype_nansafe(arr, dtype, copy=True): - """ return a view if copy is False, but - need to be very careful as the result shape could change! """ - if not isinstance(dtype, np.dtype): - dtype = _coerce_to_dtype(dtype) - - if issubclass(dtype.type, compat.text_type): - # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) - elif issubclass(dtype.type, compat.string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) - elif is_datetime64_dtype(arr): - if dtype == object: - return tslib.ints_to_pydatetime(arr.view(np.int64)) - elif dtype == np.int64: - return arr.view(dtype) - elif dtype != _NS_DTYPE: - raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % - (arr.dtype, dtype)) - return arr.astype(_NS_DTYPE) - elif is_timedelta64_dtype(arr): - if dtype == np.int64: - return arr.view(dtype) - elif dtype == object: - return tslib.ints_to_pytimedelta(arr.view(np.int64)) - - # in py3, timedelta64[ns] are int64 - elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or - (not compat.PY3 and dtype != _TD_DTYPE)): - - # allow frequency conversions - if dtype.kind == 'm': - mask = isnull(arr) - result = arr.astype(dtype).astype(np.float64) - result[mask] = np.nan - return result - - raise TypeError("cannot astype a timedelta from [%s] to [%s]" % - (arr.dtype, dtype)) - - return arr.astype(_TD_DTYPE) - elif (np.issubdtype(arr.dtype, np.floating) and - np.issubdtype(dtype, np.integer)): - - if np.isnan(arr).any(): - raise ValueError('Cannot convert NA to integer') - elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): - # work around NumPy brokenness, #1987 - return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) - - if copy: - return arr.astype(dtype) - return arr.view(dtype) - - def _all_none(*args): for arg in args: if arg is not None: @@ -1988,6 +490,9 @@ class Sentinel(object): return Sentinel() +# 
---------------------------------------------------------------------- +# Detect our environment + def in_interactive_session(): """ check if we're running in an interactive shell @@ -2055,21 +560,6 @@ def in_ipython_frontend(): return False -def _maybe_match_name(a, b): - a_has = hasattr(a, 'name') - b_has = hasattr(b, 'name') - if a_has and b_has: - if a.name == b.name: - return a.name - else: - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - - def _random_state(state=None): """ Helper function for processing random_state arguments. diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3ca2c6cd014bc..5cbc968f06fa7 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -366,7 +366,7 @@ def mpl_style_cb(key): def use_inf_as_null_cb(key): - from pandas.core.common import _use_inf_as_null + from pandas.types.missing import _use_inf_as_null _use_inf_as_null(key) with cf.config_prefix('mode'): diff --git a/pandas/core/convert.py b/pandas/core/convert.py deleted file mode 100644 index 7f4fe73c688f8..0000000000000 --- a/pandas/core/convert.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Functions for converting object to other types -""" - -import numpy as np - -import pandas as pd -from pandas.core.common import (_possibly_cast_to_datetime, is_object_dtype, - isnull) -import pandas.lib as lib - - -# TODO: Remove in 0.18 or 2017, which ever is sooner -def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, - convert_timedeltas=True, copy=True): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - # if we have passed in a list or scalar - if isinstance(values, (list, tuple)): - values = np.array(values, dtype=np.object_) - if not hasattr(values, 'dtype'): - values = np.array([values], dtype=np.object_) - - # convert dates - if convert_dates and values.dtype == np.object_: - - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == 'coerce': - new_values = _possibly_cast_to_datetime(values, 'M8[ns]', - errors='coerce') - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects(values, - convert_datetime=convert_dates) - - # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: - - if convert_timedeltas == 'coerce': - from pandas.tseries.timedeltas import to_timedelta - new_values = to_timedelta(values, coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas) - - # convert to numeric - if values.dtype == np.object_: - if convert_numeric: - try: - new_values = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - except: - pass - else: - # soft-conversion - values = lib.maybe_convert_objects(values) - - values = values.copy() if copy else values - - return values - - -def _soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, - coerce=False, copy=True): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - conversion_count = sum((datetime, numeric, timedelta)) - if conversion_count == 0: - raise ValueError('At least one of datetime, numeric or timedelta must ' - 'be True.') - elif conversion_count > 1 and coerce: - raise 
ValueError("Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True.") - - if isinstance(values, (list, tuple)): - # List or scalar - values = np.array(values, dtype=np.object_) - elif not hasattr(values, 'dtype'): - values = np.array([values], dtype=np.object_) - elif not is_object_dtype(values.dtype): - # If not object, do not attempt conversion - values = values.copy() if copy else values - return values - - # If 1 flag is coerce, ensure 2 others are False - if coerce: - # Immediate return if coerce - if datetime: - return pd.to_datetime(values, errors='coerce', box=False) - elif timedelta: - return pd.to_timedelta(values, errors='coerce', box=False) - elif numeric: - return pd.to_numeric(values, errors='coerce') - - # Soft conversions - if datetime: - values = lib.maybe_convert_objects(values, convert_datetime=datetime) - - if timedelta and is_object_dtype(values.dtype): - # Object check to ensure only run if previous did not convert - values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) - - if numeric and is_object_dtype(values.dtype): - try: - converted = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) - # If all NaNs, then do not-alter - values = converted if not isnull(converted).all() else values - values = values.copy() if copy else values - except: - pass - - return values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e01fc6dca6be3..334526b424be5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,12 +23,43 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import ( - isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, - is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like, - _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype, - is_extension_type, is_datetimetz, _possibly_infer_to_datetimelike, - _dict_compat) +from pandas.types.cast import (_maybe_upcast, + _infer_dtype_from_scalar, + _possibly_cast_to_datetime, + _possibly_infer_to_datetimelike, + _possibly_convert_platform, + _possibly_downcast_to_dtype, + _invalidate_string_dtypes, + _coerce_to_dtypes, + _maybe_upcast_putmask) +from pandas.types.common import (is_categorical_dtype, + is_object_dtype, + is_extension_type, + is_datetimetz, + is_datetime64_dtype, + is_bool_dtype, + is_integer_dtype, + is_float_dtype, + is_integer, + is_scalar, + needs_i8_conversion, + _get_dtype_from_object, + _lcd_dtypes, + _ensure_float, + _ensure_float64, + _ensure_int64, + _ensure_platform_int, + is_list_like, + is_iterator, + is_sequence, + is_named_tuple) +from pandas.types.missing import isnull, notnull + +from pandas.core.common import (PandasError, _try_sort, + _default_index, + _values_from_object, + _maybe_box_datetimelike, + _dict_compat) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, @@ -268,7 +299,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: - if com.is_named_tuple(data[0]) and columns is None: + if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields arrays, columns = _to_arrays(data, columns, dtype=dtype) columns = _ensure_index(columns) @@ -940,7 +971,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, if columns is not None: columns = 
_ensure_index(columns) - if com.is_iterator(data): + if is_iterator(data): if nrows == 0: return cls() @@ -1051,7 +1082,7 @@ def to_records(self, index=True, convert_datetime64=True): y : recarray """ if index: - if com.is_datetime64_dtype(self.index) and convert_datetime64: + if is_datetime64_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: if isinstance(self.index, MultiIndex): @@ -1920,7 +1951,7 @@ def _ixs(self, i, axis=0): copy = True else: new_values = self._data.fast_xs(i) - if lib.isscalar(new_values): + if is_scalar(new_values): return new_values # if we are a copy, mark as such @@ -2072,7 +2103,7 @@ def _getitem_multilevel(self, key): return self._get_item_cache(key) def _getitem_frame(self, key): - if key.values.size and not com.is_bool_dtype(key.values): + if key.values.size and not is_bool_dtype(key.values): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) @@ -2289,7 +2320,7 @@ def select_dtypes(self, include=None, exclude=None): 5 False """ include, exclude = include or (), exclude or () - if not (com.is_list_like(include) and com.is_list_like(exclude)): + if not (is_list_like(include) and is_list_like(exclude)): raise TypeError('include and exclude must both be non-string' ' sequences') selection = tuple(map(frozenset, (include, exclude))) @@ -2300,9 +2331,9 @@ def select_dtypes(self, include=None, exclude=None): # convert the myriad valid dtypes object to a single representation include, exclude = map( - lambda x: frozenset(map(com._get_dtype_from_object, x)), selection) + lambda x: frozenset(map(_get_dtype_from_object, x)), selection) for dtypes in (include, exclude): - com._invalidate_string_dtypes(dtypes) + _invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): @@ -2392,7 +2423,7 @@ def _setitem_array(self, key, value): def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. 
# df[df > df2] = 0 - if key.values.size and not com.is_bool_dtype(key.values): + if key.values.size and not is_bool_dtype(key.values): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) @@ -2586,7 +2617,7 @@ def reindexer(value): value = _sanitize_index(value, self.index, copy=False) if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: - value = com._possibly_convert_platform(value) + value = _possibly_convert_platform(value) else: value = com._asarray_tuplesafe(value) elif value.ndim == 2: @@ -2602,7 +2633,7 @@ def reindexer(value): # upcast the scalar dtype, value = _infer_dtype_from_scalar(value) value = np.repeat(value, len(self.index)).astype(dtype) - value = com._possibly_cast_to_datetime(value, dtype) + value = _possibly_cast_to_datetime(value, dtype) # return internal types directly if is_extension_type(value): @@ -2916,8 +2947,8 @@ def _maybe_casted_values(index, labels=None): mask = labels == -1 values = values.take(labels) if mask.any(): - values, changed = com._maybe_upcast_putmask(values, mask, - np.nan) + values, changed = _maybe_upcast_putmask(values, mask, + np.nan) return values new_index = _default_index(len(new_obj)) @@ -3131,14 +3162,14 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, raise ValueError('When sorting by column, axis must be 0 (rows)') if not isinstance(by, list): by = [by] - if com.is_sequence(ascending) and len(by) != len(ascending): + if is_sequence(ascending) and len(by) != len(ascending): raise ValueError('Length of ascending (%d) != length of by (%d)' % (len(ascending), len(by))) if len(by) > 1: from pandas.core.groupby import _lexsort_indexer def trans(v): - if com.needs_i8_conversion(v): + if needs_i8_conversion(v): return v.view('i8') return v @@ -3151,7 +3182,7 @@ def trans(v): keys.append(trans(k)) indexer = _lexsort_indexer(keys, orders=ascending, na_position=na_position) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) else: from pandas.core.groupby import _nargsort @@ -3320,7 +3351,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, inplace=inplace, sort_remaining=sort_remaining) def _nsorted(self, columns, n, method, keep): - if not com.is_list_like(columns): + if not is_list_like(columns): columns = [columns] columns = list(columns) ser = getattr(self[columns[0]], method)(n, keep=keep) @@ -3658,28 +3689,28 @@ def combine(self, other, func, fill_value=None, overwrite=True): # if we have different dtypes, possibily promote new_dtype = this_dtype if this_dtype != other_dtype: - new_dtype = com._lcd_dtypes(this_dtype, other_dtype) + new_dtype = _lcd_dtypes(this_dtype, other_dtype) series = series.astype(new_dtype) otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype - needs_i8_conversion = com.needs_i8_conversion(new_dtype) - if needs_i8_conversion: + needs_i8_conversion_i = needs_i8_conversion(new_dtype) + if needs_i8_conversion_i: this_dtype = new_dtype arr = func(series, otherSeries, True) else: arr = func(series, otherSeries) if do_fill: - arr = com.ensure_float(arr) + arr = _ensure_float(arr) arr[this_mask & other_mask] = NA # try to downcast back to the original dtype - if needs_i8_conversion: - arr = com._possibly_cast_to_datetime(arr, this_dtype) + if needs_i8_conversion_i: + arr = _possibly_cast_to_datetime(arr, this_dtype) else: - arr = com._possibly_downcast_to_dtype(arr, this_dtype) + arr = 
_possibly_downcast_to_dtype(arr, this_dtype) result[col] = arr @@ -4581,7 +4612,7 @@ def _dict_round(df, decimals): yield vals def _series_round(s, decimals): - if com.is_integer_dtype(s) or com.is_float_dtype(s): + if is_integer_dtype(s) or is_float_dtype(s): return s.round(decimals) return s @@ -4592,7 +4623,7 @@ def _series_round(s, decimals): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") new_cols = [col for col in _dict_round(self, decimals)] - elif com.is_integer(decimals): + elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] @@ -4634,14 +4665,14 @@ def corr(self, method='pearson', min_periods=1): mat = numeric_df.values if method == 'pearson': - correl = _algos.nancorr(com._ensure_float64(mat), minp=min_periods) + correl = _algos.nancorr(_ensure_float64(mat), minp=min_periods) elif method == 'spearman': - correl = _algos.nancorr_spearman(com._ensure_float64(mat), + correl = _algos.nancorr_spearman(_ensure_float64(mat), minp=min_periods) else: if min_periods is None: min_periods = 1 - mat = com._ensure_float64(mat).T + mat = _ensure_float64(mat).T corrf = nanops.get_corr_func(method) K = len(cols) correl = np.empty((K, K), dtype=float) @@ -4696,7 +4727,7 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = _algos.nancorr(com._ensure_float64(mat), cov=True, + baseCov = _algos.nancorr(_ensure_float64(mat), cov=True, minp=min_periods) return self._constructor(baseCov, index=cols, columns=cols) @@ -4825,7 +4856,7 @@ def _count_level(self, level, axis=0, numeric_only=False): level = count_axis._get_level_number(level) level_index = count_axis.levels[level] - labels = com._ensure_int64(count_axis.labels[level]) + labels = _ensure_int64(count_axis.labels[level]) counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) @@ -4906,7 +4937,7 @@ def f(x): # try to coerce to the original dtypes item by item if we can if axis == 0: - result = com._coerce_to_dtypes(result, self.dtypes) + result = _coerce_to_dtypes(result, self.dtypes) return Series(result, index=labels) @@ -5376,13 +5407,13 @@ def _prep_ndarray(values, copy=True): return np.empty((0, 0), dtype=object) def convert(v): - return com._possibly_convert_platform(v) + return _possibly_convert_platform(v) # we could have a 1-dim or 2-dim list here # this is equiv of np.asarray, but does object conversion # and platform dtype preservation try: - if com.is_list_like(values[0]) or hasattr(values[0], 'len'): + if is_list_like(values[0]) or hasattr(values[0], 'len'): values = np.array([convert(v) for v in values]) else: values = convert(values) @@ -5570,7 +5601,7 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): def convert(arr): if dtype != object and dtype != np.object: arr = lib.maybe_convert_objects(arr, try_float=coerce_float) - arr = com._possibly_cast_to_datetime(arr, dtype) + arr = _possibly_cast_to_datetime(arr, dtype) return arr arrays = [convert(arr) for arr in content] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b4bcae47cbbdf..d6e6f571be53a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,6 +8,29 @@ import pandas.lib as lib import pandas as pd + + +from pandas.types.common import (_coerce_to_dtype, + _ensure_int64, + needs_i8_conversion, + is_scalar, + is_integer, is_bool, + is_bool_dtype, + 
is_numeric_dtype, + is_datetime64_dtype, + is_timedelta64_dtype, + is_list_like, + is_dict_like, + is_re_compilable) +from pandas.types.cast import _maybe_promote, _maybe_upcast_putmask +from pandas.types.missing import isnull, notnull +from pandas.types.generic import ABCSeries, ABCPanel + +from pandas.core.common import (_values_from_object, + _maybe_box_datetimelike, + SettingWithCopyError, SettingWithCopyWarning, + AbstractMethodError) + from pandas.core.base import PandasObject from pandas.core.index import (Index, MultiIndex, _ensure_index, InvalidIndexError) @@ -25,11 +48,6 @@ from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, isidentifier, set_function_name) -from pandas.core.common import (isnull, notnull, is_list_like, - _values_from_object, _maybe_promote, - _maybe_box_datetimelike, ABCSeries, - SettingWithCopyError, SettingWithCopyWarning, - AbstractMethodError) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config @@ -46,10 +64,6 @@ Name or list of names which refer to the axis items.""") -def is_dictlike(x): - return isinstance(x, (dict, com.ABCSeries)) - - def _single_replace(self, to_replace, method, inplace, limit): if self.ndim != 1: raise TypeError('cannot replace {0} with method {1} on a {2}' @@ -116,7 +130,7 @@ def _validate_dtype(self, dtype): """ validate the passed dtype """ if dtype is not None: - dtype = com._coerce_to_dtype(dtype) + dtype = _coerce_to_dtype(dtype) # a compound dtype if dtype.kind == 'V': @@ -310,7 +324,7 @@ def _from_axes(cls, data, axes, **kwargs): def _get_axis_number(self, axis): axis = self._AXIS_ALIASES.get(axis, axis) - if com.is_integer(axis): + if is_integer(axis): if axis in self._AXIS_NAMES: return axis else: @@ -717,8 +731,8 @@ def rename_axis(self, mapper, axis=0, copy=True, inplace=False): 1 2 5 2 3 6 """ - non_mapper = lib.isscalar(mapper) or (com.is_list_like(mapper) and not - com.is_dict_like(mapper)) + non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not + is_dict_like(mapper)) if non_mapper: return self._set_axis_name(mapper, axis=axis) else: @@ -912,7 +926,7 @@ def bool(self): v = self.squeeze() if isinstance(v, (bool, np.bool_)): return bool(v) - elif lib.isscalar(v): + elif is_scalar(v): raise ValueError("bool cannot act on a non-boolean single element " "{0}".format(self.__class__.__name__)) @@ -1764,10 +1778,10 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): else: return self.take(loc, axis=axis, convert=True) - if not lib.isscalar(loc): + if not is_scalar(loc): new_index = self.index[loc] - if lib.isscalar(loc): + if is_scalar(loc): new_values = self._data.fast_xs(loc) # may need to box a datelike-scalar @@ -2340,7 +2354,7 @@ def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, index = _ensure_index(index) if indexer is not None: - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) # TODO: speed up on homogeneous DataFrame objects new_data = new_data.reindex_indexer(index, indexer, axis=baxis, @@ -3202,10 +3216,10 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return self if self.ndim == 1: - if isinstance(value, (dict, com.ABCSeries)): + if isinstance(value, (dict, ABCSeries)): from pandas import Series value = Series(value) - elif not com.is_list_like(value): + elif not is_list_like(value): pass else: raise ValueError("invalid fill value with a %s" % @@ -3215,7 +3229,7 @@ def 
fillna(self, value=None, method=None, axis=None, inplace=False, inplace=inplace, downcast=downcast) - elif isinstance(value, (dict, com.ABCSeries)): + elif isinstance(value, (dict, ABCSeries)): if axis == 1: raise NotImplementedError('Currently only can fill ' 'with dict/Series column ' @@ -3228,7 +3242,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, obj = result[k] obj.fillna(v, limit=limit, inplace=True) return result - elif not com.is_list_like(value): + elif not is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, downcast=downcast) @@ -3354,7 +3368,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, and play with this method to gain intuition about how it works. """ - if not com.is_bool(regex) and to_replace is not None: + if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is " "not a bool") if axis is not None: @@ -3367,15 +3381,15 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat - if not is_dictlike(to_replace) and not is_dictlike(regex): + if not is_dict_like(to_replace) and not is_dict_like(regex): to_replace = [to_replace] if isinstance(to_replace, (tuple, list)): return _single_replace(self, to_replace, method, inplace, limit) - if not is_dictlike(to_replace): - if not is_dictlike(regex): + if not is_dict_like(to_replace): + if not is_dict_like(regex): raise TypeError('If "to_replace" and "value" are both None' ' and "to_replace" is not a list, then ' 'regex must be a mapping') @@ -3385,7 +3399,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, items = list(compat.iteritems(to_replace)) keys, values = zip(*items) - are_mappings = [is_dictlike(v) for v in values] + are_mappings = [is_dict_like(v) for v in values] if any(are_mappings): if not all(are_mappings): @@ -3418,8 +3432,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self new_data = self._data - if is_dictlike(to_replace): - if is_dictlike(value): # {'A' : NA} -> {'A' : 0} + if is_dict_like(to_replace): + if is_dict_like(value): # {'A' : NA} -> {'A' : 0} res = self if inplace else self.copy() for c, src in compat.iteritems(to_replace): if c in value and c in self: @@ -3429,7 +3443,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return None if inplace else res # {'A': NA} -> 0 - elif not com.is_list_like(value): + elif not is_list_like(value): for k, src in compat.iteritems(to_replace): if k in self: new_data = new_data.replace(to_replace=src, @@ -3441,8 +3455,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, raise TypeError('value argument must be scalar, dict, or ' 'Series') - elif com.is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] - if com.is_list_like(value): + elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if is_list_like(value): if len(to_replace) != len(value): raise ValueError('Replacement lists must match ' 'in length. 
Expecting %d got %d ' % @@ -3458,8 +3472,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, value=value, inplace=inplace, regex=regex) elif to_replace is None: - if not (com.is_re_compilable(regex) or - com.is_list_like(regex) or is_dictlike(regex)): + if not (is_re_compilable(regex) or + is_list_like(regex) or is_dict_like(regex)): raise TypeError("'regex' must be a string or a compiled " "regular expression or a list or dict of " "strings or regular expressions, you " @@ -3470,7 +3484,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: # dest iterable dict-like - if is_dictlike(value): # NA -> {'A' : 0, 'B' : -1} + if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} new_data = self._data for k, v in compat.iteritems(value): @@ -3480,7 +3494,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, inplace=inplace, regex=regex) - elif not com.is_list_like(value): # NA -> 0 + elif not is_list_like(value): # NA -> 0 new_data = self._data.replace(to_replace=to_replace, value=value, inplace=inplace, regex=regex) @@ -3792,14 +3806,14 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 3 0.230930 0.000000 4 1.100000 0.570967 """ - if isinstance(self, com.ABCPanel): + if isinstance(self, ABCPanel): raise NotImplementedError("clip is not supported yet for panels") axis = nv.validate_clip_with_axis(axis, args, kwargs) # GH 2747 (arguments were reversed) if lower is not None and upper is not None: - if lib.isscalar(lower) and lib.isscalar(upper): + if is_scalar(lower) and is_scalar(upper): lower, upper = min(lower, upper), max(lower, upper) result = self @@ -4485,10 +4499,12 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, new_other = np.array(other, dtype=self.dtype) except ValueError: new_other = np.array(other) + except TypeError: + new_other = other # we can end up comparing integers and m8[ns] # which is a numpy no no - is_i8 = com.needs_i8_conversion(self.dtype) + is_i8 = needs_i8_conversion(self.dtype) if is_i8: matches = False else: @@ -4497,7 +4513,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if matches is False or not matches.all(): # coerce other to a common dtype if we can - if com.needs_i8_conversion(self.dtype): + if needs_i8_conversion(self.dtype): try: other = np.array(other, dtype=self.dtype) except: @@ -4550,7 +4566,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, dtype, fill_value = _maybe_promote(other.dtype) new_other = np.empty(len(icond), dtype=dtype) new_other.fill(fill_value) - com._maybe_upcast_putmask(new_other, icond, other) + _maybe_upcast_putmask(new_other, icond, other) other = new_other else: @@ -5058,7 +5074,7 @@ def describe_categorical_1d(data): if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - if com.is_datetime64_dtype(data): + if is_datetime64_dtype(data): asint = data.dropna().values.view('i8') names += ['top', 'freq', 'first', 'last'] result += [lib.Timestamp(top), freq, @@ -5071,11 +5087,11 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name) def describe_1d(data): - if com.is_bool_dtype(data): + if is_bool_dtype(data): return describe_categorical_1d(data) - elif com.is_numeric_dtype(data): + elif is_numeric_dtype(data): return describe_numeric_1d(data) - elif com.is_timedelta64_dtype(data): + elif is_timedelta64_dtype(data): return describe_numeric_1d(data) else: return describe_categorical_1d(data) @@ 
-5162,7 +5178,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1) if freq is None: - mask = com.isnull(_values_from_object(self)) + mask = isnull(_values_from_object(self)) np.putmask(rs.values, mask, np.nan) return rs diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 077acc1e81444..6179857978b7b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -13,6 +13,25 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat.numpy import _np_version_under1p8 + +from pandas.types.common import (_DATELIKE_DTYPES, + is_numeric_dtype, + is_timedelta64_dtype, is_datetime64_dtype, + is_categorical_dtype, + is_datetime_or_timedelta_dtype, + is_bool, is_integer_dtype, + is_complex_dtype, + is_bool_dtype, + is_scalar, + _ensure_float64, + _ensure_platform_int, + _ensure_int64, + _ensure_object, + _ensure_float) +from pandas.types.cast import _possibly_downcast_to_dtype +from pandas.types.missing import isnull, notnull, _maybe_fill + +from pandas.core.common import _values_from_object, AbstractMethodError from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) from pandas.core.categorical import Categorical @@ -30,14 +49,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com -from pandas.core.common import(_possibly_downcast_to_dtype, isnull, - notnull, _DATELIKE_DTYPES, is_numeric_dtype, - is_timedelta64_dtype, is_datetime64_dtype, - is_categorical_dtype, _values_from_object, - is_datetime_or_timedelta_dtype, is_bool, - is_bool_dtype, AbstractMethodError, - _maybe_fill) -from pandas.core.config import option_context, is_callable +from pandas.core.config import option_context import pandas.lib as lib from pandas.lib import Timestamp import pandas.tslib as tslib @@ -662,7 +674,7 @@ def apply(self, func, *args, **kwargs): # resolve functions to their callable functions prior, this # wouldn't be needed if args or kwargs: - if is_callable(func): + if callable(func): @wraps(func) def f(g): @@ -752,7 +764,7 @@ def _try_cast(self, result, obj): else: dtype = obj.dtype - if not lib.isscalar(result): + if not is_scalar(result): result = _possibly_downcast_to_dtype(result, dtype) return result @@ -817,7 +829,7 @@ def _python_agg_general(self, func, *args, **kwargs): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): - values = com.ensure_float(values) + values = _ensure_float(values) output[name] = self._try_cast(values[mask], result) @@ -1595,7 +1607,7 @@ def size(self): """ ids, _, ngroup = self.group_info - ids = com._ensure_platform_int(ids) + ids = _ensure_platform_int(ids) out = np.bincount(ids[ids != -1], minlength=ngroup or None) return Series(out, index=self.result_index, dtype='int64') @@ -1631,7 +1643,7 @@ def group_info(self): comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) - comp_ids = com._ensure_int64(comp_ids) + comp_ids = _ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups def _get_compressed_labels(self): @@ -1671,7 +1683,7 @@ def get_group_levels(self): name_list = [] for ping, labels in zip(self.groupings, self.recons_labels): - labels = com._ensure_platform_int(labels) + labels = _ensure_platform_int(labels) levels = ping.group_index.take(labels) name_list.append(levels) @@ -1780,11 +1792,11 @@ def _cython_operation(self, kind, values, how, 
axis): values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): - values = _algos.ensure_float64(values) - elif com.is_integer_dtype(values): + values = _ensure_float64(values) + elif is_integer_dtype(values): values = values.astype('int64', copy=False) - elif is_numeric and not com.is_complex_dtype(values): - values = _algos.ensure_float64(values) + elif is_numeric and not is_complex_dtype(values): + values = _ensure_float64(values) else: values = values.astype(object) @@ -1793,7 +1805,7 @@ def _cython_operation(self, kind, values, how, axis): kind, how, values, is_numeric) except NotImplementedError: if is_numeric: - values = _algos.ensure_float64(values) + values = _ensure_float64(values) func, dtype_str = self._get_cython_function( kind, how, values, is_numeric) else: @@ -1821,7 +1833,7 @@ def _cython_operation(self, kind, values, how, axis): result = self._transform( result, accum, values, labels, func, is_numeric) - if com.is_integer_dtype(result): + if is_integer_dtype(result): if len(result[result == tslib.iNaT]) > 0: result = result.astype('float64') result[result == tslib.iNaT] = np.nan @@ -1834,7 +1846,7 @@ def _cython_operation(self, kind, values, how, axis): result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( - com._ensure_object(result), + _ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] @@ -1996,7 +2008,7 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): def __init__(self, bins, binlabels, filter_empty=False, mutated=False): - self.bins = com._ensure_int64(bins) + self.bins = _ensure_int64(bins) self.binlabels = _ensure_index(binlabels) self._filter_empty_groups = filter_empty self.mutated = mutated @@ -2061,7 +2073,7 @@ def group_info(self): obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) - rep = com._ensure_platform_int(rep) + rep = _ensure_platform_int(rep) if ngroups == len(self.bins): comp_ids = np.repeat(np.arange(ngroups), rep) else: @@ -2449,7 +2461,7 @@ def is_in_obj(gpr): def _is_label_like(val): return (isinstance(val, compat.string_types) or - (val is not None and lib.isscalar(val))) + (val is not None and is_scalar(val))) def _convert_grouper(axis, grouper): @@ -2671,7 +2683,7 @@ def _aggregate_multiple_funcs(self, arg, _level): results[name] = obj.aggregate(func) if isinstance(list(compat.itervalues(results))[0], - com.ABCDataFrame): + DataFrame): # let higher level handle if _level: @@ -2870,9 +2882,9 @@ def nunique(self, dropna=True): 'val.dtype must be object, got %s' % val.dtype val, _ = algos.factorize(val, sort=False) sorter = np.lexsort((val, ids)) - isnull = lambda a: a == -1 + _isnull = lambda a: a == -1 else: - isnull = com.isnull + _isnull = isnull ids, val = ids[sorter], val[sorter] @@ -2882,7 +2894,7 @@ def nunique(self, dropna=True): inc = np.r_[1, val[1:] != val[:-1]] # 1st item of each group is a new unique observation - mask = isnull(val) + mask = _isnull(val) if dropna: inc[idx] = 1 inc[mask] = 0 @@ -2998,8 +3010,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) - if com.is_integer_dtype(out): - out = com._ensure_int64(out) + if is_integer_dtype(out): + out = _ensure_int64(out) return Series(out, index=mi, name=self.name) # for compat. 
with algos.value_counts need to ensure every @@ -3029,8 +3041,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) - if com.is_integer_dtype(out): - out = com._ensure_int64(out) + if is_integer_dtype(out): + out = _ensure_int64(out) return Series(out, index=mi, name=self.name) def count(self): @@ -3039,7 +3051,7 @@ def count(self): val = self.obj.get_values() mask = (ids != -1) & ~isnull(val) - ids = com._ensure_platform_int(ids) + ids = _ensure_platform_int(ids) out = np.bincount(ids[mask], minlength=ngroups or None) return Series(out, @@ -3616,7 +3628,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa pass # interpret the result of the filter - if is_bool(res) or (lib.isscalar(res) and isnull(res)): + if is_bool(res) or (is_scalar(res) and isnull(res)): if res and notnull(res): indices.append(self._get_index(name)) else: @@ -3813,7 +3825,7 @@ def count(self): """ Compute count of group, excluding missing values """ from functools import partial from pandas.lib import count_level_2d - from pandas.core.common import _isnull_ndarraylike as isnull + from pandas.types.missing import _isnull_ndarraylike as isnull data, _ = self._get_data_to_aggregate() ids, _, ngroups = self.grouper.group_info @@ -3934,7 +3946,7 @@ class DataSplitter(object): def __init__(self, data, labels, ngroups, axis=0): self.data = data - self.labels = com._ensure_int64(labels) + self.labels = _ensure_int64(labels) self.ngroups = ngroups self.axis = axis @@ -4115,7 +4127,7 @@ def loop(labels, shape): def maybe_lift(lab, size): # pormote nan values return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - labels = map(com._ensure_int64, labels) + labels = map(_ensure_int64, labels) if not xnull: labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) @@ -4331,9 +4343,9 @@ def _get_group_index_sorter(group_index, ngroups): alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters if alpha + beta * ngroups < count * np.log(count): - sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), + sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), ngroups) - return com._ensure_platform_int(sorter) + return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort') @@ -4348,7 +4360,7 @@ def _compress_group_index(group_index, sort=True): size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) table = _hash.Int64HashTable(size_hint) - group_index = com._ensure_int64(group_index) + group_index = _ensure_int64(group_index) # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) @@ -4390,7 +4402,7 @@ def _groupby_indices(values): _, counts = _hash.value_count_scalar64(codes, False) else: reverse, codes, counts = _algos.group_labels( - _values_from_object(com._ensure_object(values))) + _values_from_object(_ensure_object(values))) return _algos.groupby_indices(reverse, codes, counts) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9485f50ed07f1..0cba8308c1c53 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,17 +1,24 @@ # pylint: disable=W0223 -from pandas.core.index import Index, MultiIndex +import numpy as np from pandas.compat import range, zip import pandas.compat as compat +from pandas.types.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.types.common import (is_integer_dtype, + 
is_integer, is_float, + is_categorical_dtype, + is_list_like, + is_sequence, + is_scalar, + _ensure_platform_int) +from pandas.types.missing import isnull, _infer_fill_value + +from pandas.core.index import Index, MultiIndex + import pandas.core.common as com -import pandas.lib as lib -from pandas.core.common import (is_bool_indexer, is_integer_dtype, - _asarray_tuplesafe, is_list_like, isnull, - is_null_slice, is_full_slice, ABCSeries, - ABCDataFrame, ABCPanel, is_float, - _values_from_object, _infer_fill_value, - is_integer) -import numpy as np +from pandas.core.common import (is_bool_indexer, _asarray_tuplesafe, + is_null_slice, is_full_slice, + _values_from_object) # the supported indexers @@ -67,7 +74,7 @@ def __getitem__(self, key): key = tuple(com._apply_if_callable(x, self.obj) for x in key) try: values = self.obj.get_value(*key) - if lib.isscalar(values): + if is_scalar(values): return values except Exception: pass @@ -625,7 +632,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if (sum_aligners == self.ndim and - all([com.is_sequence(_) for _ in indexer])): + all([is_sequence(_) for _ in indexer])): ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values # single indexer @@ -639,7 +646,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): ax = obj.axes[i] # multiple aligners (or null slices) - if com.is_sequence(idx) or isinstance(idx, slice): + if is_sequence(idx) or isinstance(idx, slice): if single_aligner and is_null_slice(idx): continue new_ix = ax[idx] @@ -685,7 +692,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): return ser - elif lib.isscalar(indexer): + elif is_scalar(indexer): ax = self.obj._get_axis(1) if ser.index.equals(ax): @@ -710,7 +717,7 @@ def _align_frame(self, indexer, df): sindexers = [] for i, ix in enumerate(indexer): ax = self.obj.axes[i] - if com.is_sequence(ix) or isinstance(ix, slice): + if is_sequence(ix) or isinstance(ix, slice): if idx is None: idx = ax[ix].ravel() elif cols is None: @@ -761,7 +768,7 @@ def _align_frame(self, indexer, df): val = df.reindex(index=ax)._values return val - elif lib.isscalar(indexer) and is_panel: + elif is_scalar(indexer) and is_panel: idx = self.obj.axes[1] cols = self.obj.axes[2] @@ -857,7 +864,7 @@ def _convert_for_reindex(self, key, axis=0): keyarr = _asarray_tuplesafe(key) if is_integer_dtype(keyarr) and not labels.is_integer(): - keyarr = com._ensure_platform_int(keyarr) + keyarr = _ensure_platform_int(keyarr) return labels.take(keyarr) return keyarr @@ -968,7 +975,7 @@ def _getitem_nested_tuple(self, tup): axis += 1 # if we have a scalar, we are done - if lib.isscalar(obj) or not hasattr(obj, 'ndim'): + if is_scalar(obj) or not hasattr(obj, 'ndim'): break # has the dim of the obj changed? 
@@ -1038,7 +1045,7 @@ def _getitem_iterable(self, key, axis=0): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if com.is_categorical_dtype(labels): + if is_categorical_dtype(labels): keyarr = labels._shallow_copy(keyarr) # have the index handle the indexer and possibly return @@ -1799,7 +1806,7 @@ def check_bool_indexer(ax, key): result = key if isinstance(key, ABCSeries) and not key.index.equals(ax): result = result.reindex(ax) - mask = com.isnull(result._values) + mask = isnull(result._values) if mask.any(): raise IndexingError('Unalignable boolean Series key provided') @@ -1941,9 +1948,9 @@ def _non_reducing_slice(slice_): def pred(part): # true when slice does *not* reduce - return isinstance(part, slice) or com.is_list_like(part) + return isinstance(part, slice) or is_list_like(part) - if not com.is_list_like(slice_): + if not is_list_like(slice_): if not isinstance(slice_, slice): # a 1-d slice, like df.loc[1] slice_ = [[slice_]] diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1ea567f15cb7f..363ac8249eb06 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -10,29 +10,48 @@ from pandas.core.base import PandasObject -from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, - _TD_DTYPE, ABCSeries, is_list_like, - _infer_dtype_from_scalar, is_null_slice, - is_dtype_equal, is_null_datelike_scalar, - _maybe_promote, is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, is_sparse, - array_equivalent, _is_na_compat, - _maybe_convert_string_to_object, - _maybe_convert_scalar, - is_categorical, is_datetimelike_v_numeric, - is_numeric_v_string_like, is_extension_type) +from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype +from pandas.types.common import (_TD_DTYPE, _NS_DTYPE, + _ensure_int64, _ensure_platform_int, + is_integer, + is_dtype_equal, + is_timedelta64_dtype, + is_datetime64_dtype, is_datetimetz, is_sparse, + is_categorical, is_categorical_dtype, + is_integer_dtype, + is_datetime64tz_dtype, + is_object_dtype, + is_datetimelike_v_numeric, + is_numeric_v_string_like, is_extension_type, + is_list_like, + is_re, + is_re_compilable, + is_scalar, + _get_dtype) +from pandas.types.cast import (_possibly_downcast_to_dtype, + _maybe_convert_string_to_object, + _maybe_upcast, + _maybe_convert_scalar, _maybe_promote, + _infer_dtype_from_scalar, + _soft_convert_objects, + _possibly_convert_objects, + _astype_nansafe) +from pandas.types.missing import (isnull, array_equivalent, + _is_na_compat, + is_null_datelike_scalar) +import pandas.types.concat as _concat + +from pandas.types.generic import ABCSeries +from pandas.core.common import is_null_slice import pandas.core.algorithms as algos -from pandas.types.api import DatetimeTZDtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer from pandas.core.categorical import Categorical, maybe_to_categorical from pandas.tseries.index import DatetimeIndex from pandas.formats.printing import pprint_thing -import pandas.core.common as com -import pandas.types.concat as _concat + import pandas.core.missing as missing -import pandas.core.convert as convert from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib @@ -112,8 +131,8 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if 
com.is_categorical_dtype(dtype): - if dtype == com.CategoricalDtype(): + if is_categorical_dtype(dtype): + if dtype == CategoricalDtype(): return True # this is a pd.Categorical, but is not @@ -137,7 +156,7 @@ def get_values(self, dtype=None): return an internal format, currently just the ndarray this is often overriden to handle to_dense like operations """ - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): return self.values.astype(object) return self.values @@ -481,7 +500,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = com._astype_nansafe(values.ravel(), dtype, copy=True) + values = _astype_nansafe(values.ravel(), dtype, copy=True) values = values.reshape(self.shape) newb = make_block(values, placement=self.mgr_locs, dtype=dtype, @@ -651,7 +670,7 @@ def setitem(self, indexer, value, mgr=None): # cast the values to a type that can hold nan (if necessary) if not self._can_hold_element(value): - dtype, _ = com._maybe_promote(arr_value.dtype) + dtype, _ = _maybe_promote(arr_value.dtype) values = values.astype(dtype) transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) @@ -684,7 +703,7 @@ def _is_scalar_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return all([lib.isscalar(idx) for idx in indexer]) + return all([is_scalar(idx) for idx in indexer]) return False def _is_empty_indexer(indexer): @@ -724,7 +743,7 @@ def _is_empty_indexer(indexer): if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, value.dtype): dtype = value.dtype - elif lib.isscalar(value): + elif is_scalar(value): dtype, _ = _infer_dtype_from_scalar(value) else: dtype = 'infer' @@ -838,7 +857,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, n = np.array(new) # type of the new block - dtype, _ = com._maybe_promote(n.dtype) + dtype, _ = _maybe_promote(n.dtype) # we need to explicitly astype here to make a copy n = n.astype(dtype) @@ -1027,7 +1046,7 @@ def shift(self, periods, axis=0, mgr=None): # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(self.values) + new_values, fill_value = _maybe_upcast(self.values) # make sure array sent to np.roll is c_contiguous f_ordered = new_values.flags.f_contiguous @@ -1036,7 +1055,7 @@ def shift(self, periods, axis=0, mgr=None): axis = new_values.ndim - axis - 1 if np.prod(new_values.shape): - new_values = np.roll(new_values, com._ensure_platform_int(periods), + new_values = np.roll(new_values, _ensure_platform_int(periods), axis=axis) axis_indexer = [slice(None)] * self.ndim @@ -1306,7 +1325,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): from pandas import Float64Index is_empty = values.shape[axis] == 0 - if com.is_list_like(qs): + if is_list_like(qs): ax = Float64Index(qs) if is_empty: @@ -1350,7 +1369,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) - if lib.isscalar(result): + if is_scalar(result): return ax, self.make_block_scalar(result) return ax, make_block(result, placement=np.arange(len(result)), @@ -1591,7 +1610,7 @@ def _can_hold_element(self, element): tipo = element.dtype.type return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) - return com.is_integer(element) + return is_integer(element) def _try_cast(self, element): try: @@ -1600,7 +1619,7 @@ def _try_cast(self, element): return element def should_store(self, value): - return com.is_integer_dtype(value) and value.dtype == self.dtype + return is_integer_dtype(value) and value.dtype == self.dtype class DatetimeLikeBlockMixin(object): @@ -1621,7 +1640,7 @@ def get_values(self, dtype=None): """ return object dtype as boxed values, such as Timestamps/Timedelta """ - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): return lib.map_infer(self.values.ravel(), self._box_func).reshape(self.values.shape) return self.values @@ -1641,7 +1660,7 @@ def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as seconds - if not isinstance(value, np.timedelta64) and com.is_integer(value): + if not isinstance(value, np.timedelta64) and is_integer(value): value = Timedelta(value, unit='s') return super(TimeDeltaBlock, self).fillna(value, **kwargs) @@ -1795,10 +1814,10 @@ def convert(self, *args, **kwargs): new_style |= kw in kwargs if new_style: - fn = convert._soft_convert_objects + fn = _soft_convert_objects fn_inputs = new_inputs else: - fn = convert._possibly_convert_objects + fn = _possibly_convert_objects fn_inputs = ['convert_dates', 'convert_numeric', 'convert_timedeltas'] fn_inputs += ['copy'] @@ -1884,15 +1903,15 @@ def should_store(self, value): def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, mgr=None): - to_rep_is_list = com.is_list_like(to_replace) - value_is_list = com.is_list_like(value) + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list either_list = to_rep_is_list or value_is_list result_blocks = [] blocks = [self] - if not either_list and com.is_re(to_replace): + if not either_list and is_re(to_replace): return self._replace_single(to_replace, value, inplace=inplace, filter=filter, regex=True, convert=convert, mgr=mgr) @@ -1930,10 +1949,10 @@ def replace(self, to_replace, value, inplace=False, filter=None, def _replace_single(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, 
mgr=None): # to_replace is regex compilable - to_rep_re = regex and com.is_re_compilable(to_replace) + to_rep_re = regex and is_re_compilable(to_replace) # regex is regex compilable - regex_re = com.is_re_compilable(regex) + regex_re = is_re_compilable(regex) # only one will survive if to_rep_re and regex_re: @@ -2046,7 +2065,7 @@ def _try_coerce_result(self, result): # GH12564: CategoricalBlock is 1-dim only # while returned results could be any dim - if ((not com.is_categorical_dtype(result)) and + if ((not is_categorical_dtype(result)) and isinstance(result, np.ndarray)): result = _block_shape(result, ndim=self.ndim) @@ -2151,7 +2170,7 @@ def _astype(self, dtype, mgr=None, **kwargs): """ # if we are passed a datetime64[ns, tz] - if com.is_datetime64tz_dtype(dtype): + if is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) values = self.values @@ -2167,7 +2186,7 @@ def _can_hold_element(self, element): if is_list_like(element): element = np.array(element) return element.dtype == _NS_DTYPE or element.dtype == np.int64 - return (com.is_integer(element) or isinstance(element, datetime) or + return (is_integer(element) or isinstance(element, datetime) or isnull(element)) def _try_cast(self, element): @@ -2209,7 +2228,7 @@ def _try_coerce_args(self, values, other): "naive Block") other_mask = isnull(other) other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and com.is_integer_dtype(other): + elif hasattr(other, 'dtype') and is_integer_dtype(other): other = other.view('i8') else: try: @@ -2315,7 +2334,7 @@ def external_values(self): def get_values(self, dtype=None): # return object dtype as Timestamps with the zones - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) return lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) @@ -2561,7 +2580,7 @@ def shift(self, periods, axis=0, mgr=None): new_values = self.values.to_dense().take(indexer) # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(new_values) + new_values, fill_value = _maybe_upcast(new_values) if periods > 0: new_values[:periods] = fill_value else: @@ -3491,7 +3510,7 @@ def get(self, item, fastpath=True): indexer = np.arange(len(self.items))[isnull(self.items)] # allow a single nan location indexer - if not lib.isscalar(indexer): + if not is_scalar(indexer): if len(indexer) == 1: loc = indexer.item() else: @@ -3823,7 +3842,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_tuple[0] is None: - _, fill_value = com._maybe_promote(blk.dtype) + _, fill_value = _maybe_promote(blk.dtype) fill_tuple = (fill_value, ) return [blk.take_nd(slobj, axis=0, @@ -3881,7 +3900,7 @@ def _make_na_block(self, placement, fill_value=None): block_shape = list(self.shape) block_shape[0] = len(placement) - dtype, fill_value = com._infer_dtype_from_scalar(fill_value) + dtype, fill_value = _infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) return make_block(block_values, placement=placement) @@ -4560,7 +4579,7 @@ def _possibly_compare(a, b, op): else: result = op(a, b) - if lib.isscalar(result) and (is_a_array or is_b_array): + if is_scalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] if is_a_array: @@ -4611,7 +4630,7 @@ def _factor_indexer(shape, labels): expanded label indexer """ mult = np.array(shape)[::-1].cumprod()[::-1] - return com._ensure_platform_int( + return _ensure_platform_int( np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) @@ -4631,7 +4650,7 @@ def _get_blkno_placements(blknos, blk_count, group=True): """ - blknos = com._ensure_int64(blknos) + blknos = _ensure_int64(blknos) # FIXME: blk_count is unused, but it may avoid the use of dicts in cython for blkno, indexer in lib.get_blkno_indexers(blknos, group): @@ -4721,7 +4740,7 @@ def _putmask_smart(v, m, n): pass # change the dtype - dtype, _ = com._maybe_promote(n.dtype) + dtype, _ = _maybe_promote(n.dtype) nv = v.astype(dtype) try: nv[m] = n[m] @@ -4787,9 +4806,9 @@ def get_empty_dtype_and_na(join_units): if dtype is None: continue - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): upcast_cls = 'category' - elif com.is_datetimetz(dtype): + elif is_datetimetz(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' @@ -5062,8 +5081,8 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return com._get_dtype(com._maybe_promote(self.block.dtype, - self.block.fill_value)[0]) + return _get_dtype(_maybe_promote(self.block.dtype, + self.block.fill_value)[0]) return self._dtype diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 911fcaf529f98..b847415f274db 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -5,10 +5,15 @@ import numpy as np from distutils.version import LooseVersion -import pandas.core.common as com import pandas.algos as algos import pandas.lib as lib from pandas.compat import range, string_types +from pandas.types.common import (is_numeric_v_string_like, + is_float_dtype, is_datetime64_dtype, + is_integer_dtype, _ensure_float64, + is_scalar, + _DATELIKE_DTYPES) +from pandas.types.missing import isnull def mask_missing(arr, values_to_mask): @@ -24,7 +29,7 @@ def mask_missing(arr, 
values_to_mask): except Exception: values_to_mask = np.array(values_to_mask, dtype=object) - na_mask = com.isnull(values_to_mask) + na_mask = isnull(values_to_mask) nonna = values_to_mask[~na_mask] mask = None @@ -32,28 +37,28 @@ def mask_missing(arr, values_to_mask): if mask is None: # numpy elementwise comparison warning - if com.is_numeric_v_string_like(arr, x): + if is_numeric_v_string_like(arr, x): mask = False else: mask = arr == x # if x is a string and arr is not, then we get False and we must # expand the mask to size arr.shape - if lib.isscalar(mask): + if is_scalar(mask): mask = np.zeros(arr.shape, dtype=bool) else: # numpy elementwise comparison warning - if com.is_numeric_v_string_like(arr, x): + if is_numeric_v_string_like(arr, x): mask |= False else: mask |= arr == x if na_mask.any(): if mask is None: - mask = com.isnull(arr) + mask = isnull(arr) else: - mask |= com.isnull(arr) + mask |= isnull(arr) return mask @@ -110,7 +115,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, """ # Treat the original, non-scipy methods first. - invalid = com.isnull(yvalues) + invalid = isnull(yvalues) valid = ~invalid if not valid.any(): @@ -442,12 +447,12 @@ def pad_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_1d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object @@ -456,7 +461,7 @@ def pad_1d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values @@ -467,12 +472,12 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_1d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object @@ -481,7 +486,7 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) @@ -493,12 +498,12 @@ def pad_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_2d_datetime - elif 
com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.pad_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_2d_inplace_object @@ -507,7 +512,7 @@ def pad_2d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for pad_2d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): @@ -523,12 +528,12 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_2d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object @@ -537,7 +542,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): @@ -570,22 +575,22 @@ def fill_zeros(result, x, y, name, fill): mask the nan's from x """ - if fill is None or com.is_float_dtype(result): + if fill is None or is_float_dtype(result): return result if name.startswith(('r', '__r')): x, y = y, x - is_typed_variable = (hasattr(y, 'dtype') or hasattr(y, 'type')) - is_scalar = lib.isscalar(y) + is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) + is_scalar_type = is_scalar(y) - if not is_typed_variable and not is_scalar: + if not is_variable_type and not is_scalar_type: return result - if is_scalar: + if is_scalar_type: y = np.array(y) - if com.is_integer_dtype(y): + if is_integer_dtype(y): if (y == 0).any(): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f390e3f04a6c3..7b89373dda7ba 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,16 +11,19 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib -from pandas.core.common import (isnull, notnull, _values_from_object, - _maybe_upcast_putmask, _ensure_float64, - _ensure_int64, _ensure_object, is_float, - is_integer, is_complex, is_float_dtype, - is_complex_dtype, is_integer_dtype, - is_bool_dtype, is_object_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, _get_dtype, - is_int_or_datetime_dtype, is_any_int_dtype, - _int64_max) +from pandas.types.common import (_ensure_int64, _ensure_object, + _ensure_float64, _get_dtype, + is_float, is_scalar, + is_integer, is_complex, is_float_dtype, + is_complex_dtype, is_integer_dtype, + is_bool_dtype, is_object_dtype, + is_datetime64_dtype, is_timedelta64_dtype, + is_datetime_or_timedelta_dtype, + is_int_or_datetime_dtype, is_any_int_dtype) +from pandas.types.cast import _int64_max, _maybe_upcast_putmask +from pandas.types.missing import isnull, notnull + +from pandas.core.common import _values_from_object class disallow(object): @@ -351,7 +354,7 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): d = count - dtype.type(ddof) # always return NaN, never inf - if lib.isscalar(count): + if 
is_scalar(count): if count <= ddof: count = np.nan d = np.nan @@ -623,7 +626,7 @@ def _get_counts(mask, axis, dtype=float): return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) - if lib.isscalar(count): + if is_scalar(count): return dtype.type(count) try: return count.astype(dtype) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 3aaca1eea486e..d76f011df3dd8 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -13,21 +13,25 @@ from pandas import compat, lib, tslib import pandas.index as _index from pandas.util.decorators import Appender -import pandas.core.common as com import pandas.computation.expressions as expressions from pandas.lib import isscalar from pandas.tslib import iNaT from pandas.compat import bind_method import pandas.core.missing as missing import pandas.algos as _algos -from pandas.core.common import (is_list_like, notnull, isnull, - _values_from_object, _maybe_match_name, - needs_i8_conversion, is_datetimelike_v_numeric, - is_integer_dtype, is_categorical_dtype, - is_object_dtype, is_timedelta64_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, PerformanceWarning, - ABCSeries, ABCIndex) +from pandas.core.common import (_values_from_object, _maybe_match_name, + PerformanceWarning) +from pandas.types.missing import notnull, isnull +from pandas.types.common import (needs_i8_conversion, + is_datetimelike_v_numeric, + is_integer_dtype, is_categorical_dtype, + is_object_dtype, is_timedelta64_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_bool_dtype, is_datetimetz, + is_list_like, + _ensure_object) +from pandas.types.cast import _maybe_upcast_putmask +from pandas.types.generic import ABCSeries, ABCIndex # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -446,7 +450,7 @@ def _convert_to_array(self, values, name=None, other=None): supplied_dtype = values.dtype inferred_type = supplied_dtype or lib.infer_dtype(values) if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or - com.is_datetimetz(inferred_type)): + is_datetimetz(inferred_type)): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (supplied_dtype is None and other is not None and @@ -463,7 +467,7 @@ def _convert_to_array(self, values, name=None, other=None): hasattr(ovalues, 'tz')): values = pd.DatetimeIndex(values) # datetime array with tz - elif com.is_datetimetz(values): + elif is_datetimetz(values): if isinstance(values, ABCSeries): values = values._values elif not (isinstance(values, (np.ndarray, ABCSeries)) and @@ -625,7 +629,7 @@ def na_op(x, y): "{op}".format(typ=type(x).__name__, op=str_rep)) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = missing.fill_zeros(result, x, y, name, fill_zeros) return result @@ -820,8 +824,8 @@ def na_op(x, y): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? 
else: - x = com._ensure_object(x) - y = com._ensure_object(y) + x = _ensure_object(x) + y = _ensure_object(y) result = lib.vec_binop(x, y, op) else: try: @@ -1095,7 +1099,7 @@ def na_op(x, y): "objects of type {x} and {y}".format( op=name, x=type(x), y=type(y))) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) result = missing.fill_zeros(result, x, y, name, fill_zeros) @@ -1220,7 +1224,7 @@ def na_op(x, y): result = np.empty(len(x), dtype=x.dtype) mask = notnull(x) result[mask] = op(x[mask], y) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = missing.fill_zeros(result, x, y, name, fill_zeros) return result diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 7d0bedcc2b381..4d61563cccce5 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -8,17 +8,21 @@ import numpy as np +from pandas.types.cast import (_infer_dtype_from_scalar, + _possibly_cast_item) +from pandas.types.common import (is_integer, is_list_like, + is_string_like, is_scalar) +from pandas.types.missing import notnull + import pandas.computation.expressions as expressions import pandas.core.common as com import pandas.core.ops as ops import pandas.core.missing as missing from pandas import compat -from pandas import lib from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict) from pandas.compat.numpy import function as nv from pandas.core.categorical import Categorical -from pandas.core.common import (PandasError, _try_sort, _default_index, - _infer_dtype_from_scalar, is_list_like) +from pandas.core.common import PandasError, _try_sort, _default_index from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, _ensure_index, @@ -168,7 +172,7 @@ def _init_data(self, data, copy, dtype, **kwargs): mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy) copy = False dtype = None - elif lib.isscalar(data) and all(x is not None for x in passed_axes): + elif is_scalar(data) and all(x is not None for x in passed_axes): if dtype is None: dtype, data = _infer_dtype_from_scalar(data) values = np.empty([len(x) for x in passed_axes], dtype=dtype) @@ -552,7 +556,7 @@ def set_value(self, *args, **kwargs): made_bigger = not np.array_equal(axes[0], self._info_axis) # how to make this logic simpler? 
if made_bigger: - com._possibly_cast_item(result, args[0], likely_dtype) + _possibly_cast_item(result, args[0], likely_dtype) return result.set_value(*args) @@ -582,7 +586,7 @@ def __setitem__(self, key, value): 'object was {1}'.format( shape[1:], tuple(map(int, value.shape)))) mat = np.asarray(value) - elif lib.isscalar(value): + elif is_scalar(value): dtype, value = _infer_dtype_from_scalar(value) mat = np.empty(shape[1:], dtype=dtype) mat.fill(value) @@ -653,7 +657,7 @@ def round(self, decimals=0, *args, **kwargs): """ nv.validate_round(args, kwargs) - if com.is_integer(decimals): + if is_integer(decimals): result = np.apply_along_axis(np.round, 0, self.values) return self._wrap_result(result, axis=0) raise TypeError("decimals must be an integer") @@ -687,7 +691,7 @@ def dropna(self, axis=0, how='any', inplace=False): axis = self._get_axis_number(axis) values = self.values - mask = com.notnull(values) + mask = notnull(values) for ax in reversed(sorted(set(range(self._AXIS_LEN)) - set([axis]))): mask = mask.sum(ax) @@ -711,7 +715,7 @@ def _combine(self, other, func, axis=0): return self._combine_panel(other, func) elif isinstance(other, DataFrame): return self._combine_frame(other, func, axis=axis) - elif lib.isscalar(other): + elif is_scalar(other): return self._combine_const(other, func) else: raise NotImplementedError("%s is not supported in combine " @@ -924,7 +928,7 @@ def to_frame(self, filter_observations=True): if filter_observations: # shaped like the return DataFrame - mask = com.notnull(self.values).all(axis=0) + mask = notnull(self.values).all(axis=0) # size = mask.sum() selector = mask.ravel() else: @@ -1218,7 +1222,7 @@ def transpose(self, *args, **kwargs): # check if a list of axes was passed in instead as a # single *args element if (len(args) == 1 and hasattr(args[0], '__iter__') and - not com.is_string_like(args[0])): + not is_string_like(args[0])): axes = args[0] else: axes = args diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 8d237016d1b33..4f601a2d377a6 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -6,6 +6,11 @@ import numpy as np +from pandas.types.common import _ensure_platform_int, is_list_like +from pandas.types.cast import _maybe_promote +from pandas.types.missing import notnull +import pandas.types.concat as _concat + from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -14,11 +19,8 @@ from pandas._sparse import IntIndex from pandas.core.categorical import Categorical -from pandas.core.common import notnull, _ensure_platform_int, _maybe_promote from pandas.core.groupby import get_group_index, _compress_group_index -import pandas.core.common as com -import pandas.types.concat as _concat import pandas.core.algorithms as algos import pandas.algos as _algos @@ -1063,7 +1065,7 @@ def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") - if com.is_list_like(item): + if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode))) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8015670212181..2c7f298dde2ec 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,18 +13,33 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, is_bool_indexer, - _default_index, _maybe_upcast, - _asarray_tuplesafe, _infer_dtype_from_scalar, - is_list_like, _values_from_object, - is_categorical_dtype, 
- _possibly_cast_to_datetime, - _possibly_castable, _possibly_convert_platform, - _try_sort, is_extension_type, is_datetimetz, - _maybe_match_name, ABCSparseArray, - _coerce_to_dtype, SettingWithCopyError, - _maybe_box_datetimelike, ABCDataFrame, - _dict_compat, is_integer) +from pandas.types.common import (_coerce_to_dtype, is_categorical_dtype, + is_integer, is_integer_dtype, + is_float_dtype, + is_extension_type, is_datetimetz, + is_datetimelike, + is_timedelta64_dtype, + is_list_like, + is_hashable, + is_iterator, + is_dict_like, + is_scalar, + _ensure_platform_int) +from pandas.types.generic import ABCSparseArray, ABCDataFrame +from pandas.types.cast import (_maybe_upcast, _infer_dtype_from_scalar, + _possibly_convert_platform, + _possibly_cast_to_datetime, _possibly_castable) +from pandas.types.missing import isnull, notnull + +from pandas.core.common import (is_bool_indexer, + _default_index, + _asarray_tuplesafe, + _values_from_object, + _try_sort, + _maybe_match_name, + SettingWithCopyError, + _maybe_box_datetimelike, + _dict_compat) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, Float64Index, _ensure_index) from pandas.core.indexing import check_bool_indexer, maybe_convert_indices @@ -303,7 +318,7 @@ def name(self): @name.setter def name(self, value): - if value is not None and not com.is_hashable(value): + if value is not None and not is_hashable(value): raise TypeError('Series.name must be a hashable type') object.__setattr__(self, '_name', value) @@ -580,7 +595,7 @@ def __getitem__(self, key): try: result = self.index.get_value(self, key) - if not lib.isscalar(result): + if not is_scalar(result): if is_list_like(result) and not isinstance(result, Series): # we need to box if we have a non-unique index here @@ -613,10 +628,10 @@ def __getitem__(self, key): except Exception: raise - if com.is_iterator(key): + if is_iterator(key): key = list(key) - if is_bool_indexer(key): + if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) return self._get_with(key) @@ -710,9 +725,9 @@ def setitem(key, value): elif key is Ellipsis: self[:] = value return - elif is_bool_indexer(key): + elif com.is_bool_indexer(key): pass - elif com.is_timedelta64_dtype(self.dtype): + elif is_timedelta64_dtype(self.dtype): # reassign a null value to iNaT if isnull(value): value = tslib.iNaT @@ -736,7 +751,7 @@ def setitem(key, value): if 'unorderable' in str(e): # pragma: no cover raise IndexError(key) - if is_bool_indexer(key): + if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) try: self._where(~key, value, inplace=True) @@ -1060,7 +1075,7 @@ def _get_repr(self, name=False, header=True, index=True, length=True, def __iter__(self): """ provide iteration over the values of the Series box values if necessary """ - if com.is_datetimelike(self): + if is_datetimelike(self): return (_maybe_box_datetimelike(x) for x in self._values) else: return iter(self._values) @@ -1349,7 +1364,7 @@ def quantile(self, q=0.5, interpolation='linear'): result = self._data.quantile(qs=q, interpolation=interpolation) - if com.is_list_like(q): + if is_list_like(q): return self._constructor(result, index=Float64Index(q), name=self.name) @@ -1481,7 +1496,7 @@ def dot(self, other): @Appender(base._shared_docs['searchsorted']) def searchsorted(self, v, side='left', sorter=None): if sorter is not None: - sorter = com._ensure_platform_int(sorter) + sorter = _ensure_platform_int(sorter) return self._values.searchsorted(Series(v)._values, side=side, sorter=sorter) @@ -1727,7 +1742,7 
@@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.groupby import _lexsort_indexer indexer = _lexsort_indexer(index.labels, orders=ascending) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_index = index.take(indexer) else: new_index, indexer = index.sort_values(return_indexer=True, @@ -2265,8 +2280,8 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, @Appender(generic._shared_docs['rename'] % _shared_doc_kwargs) def rename(self, index=None, **kwargs): - non_mapping = lib.isscalar(index) or (com.is_list_like(index) and - not com.is_dict_like(index)) + non_mapping = is_scalar(index) or (is_list_like(index) and + not is_dict_like(index)) if non_mapping: return self._set_name(index, inplace=kwargs.get('inplace')) return super(Series, self).rename(index=index, **kwargs) @@ -2345,7 +2360,7 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): if convert: indices = maybe_convert_indices(indices, len(self._get_axis(axis))) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) new_index = self.index.take(indices) new_values = self._values.take(indices) return self._constructor(new_values, @@ -2771,7 +2786,7 @@ def _try_cast(arr, take_fast_path): subarr = np.array(data, copy=False) # possibility of nan -> garbage - if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): + if is_float_dtype(data.dtype) and is_integer_dtype(dtype): if not isnull(data).any(): subarr = _try_cast(data, True) elif copy: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a3f687b7fd73c..6ec28f9735850 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,14 +1,19 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import (isnull, notnull, _values_from_object, - is_bool_dtype, - is_list_like, is_categorical_dtype, - is_object_dtype, is_string_like) +from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.missing import isnull, notnull +from pandas.types.common import (is_bool_dtype, + is_categorical_dtype, + is_object_dtype, + is_string_like, + is_list_like, + is_scalar) +from pandas.core.common import _values_from_object + from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin -from pandas.types import api as gt from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib @@ -152,7 +157,7 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) - if isinstance(arr, gt.ABCSeries): + if isinstance(arr, ABCSeries): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -343,7 +348,7 @@ def str_repeat(arr, repeats): ------- repeated : Series/Index of objects """ - if lib.isscalar(repeats): + if is_scalar(repeats): def rep(x): try: @@ -696,7 +701,7 @@ def str_extractall(arr, pat, flags=0): if regex.groups == 0: raise ValueError("pattern contains no capture groups") - if isinstance(arr, gt.ABCIndex): + if isinstance(arr, ABCIndex): arr = arr.to_series().reset_index(drop=True) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) @@ -1538,7 +1543,7 @@ def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) def zfill(self, width): - """" + """ Filling left side of strings in the 
Series/Index with 0. Equivalent to :meth:`str.zfill`. @@ -1820,7 +1825,7 @@ class StringAccessorMixin(object): def _make_str_accessor(self): from pandas.core.index import Index - if (isinstance(self, gt.ABCSeries) and + if (isinstance(self, ABCSeries) and not ((is_categorical_dtype(self.dtype) and is_object_dtype(self.values.categories)) or (is_object_dtype(self.dtype)))): diff --git a/pandas/core/window.py b/pandas/core/window.py index 1e34d18fe3e54..bc4d34529287b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -11,6 +11,15 @@ import numpy as np from collections import defaultdict +from pandas.types.generic import ABCSeries, ABCDataFrame +from pandas.types.common import (is_integer, + is_bool, + is_float_dtype, + is_integer_dtype, + needs_i8_conversion, + is_timedelta64_dtype, + is_list_like, + _ensure_float64) import pandas as pd from pandas.lib import isscalar from pandas.core.base import (PandasObject, SelectionMixin, @@ -64,10 +73,10 @@ def _constructor(self): return Window def validate(self): - if self.center is not None and not com.is_bool(self.center): + if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") if self.min_periods is not None and not \ - com.is_integer(self.min_periods): + is_integer(self.min_periods): raise ValueError("min_periods must be an integer") def _convert_freq(self, how=None): @@ -75,7 +84,7 @@ def _convert_freq(self, how=None): obj = self._selected_obj if (self.freq is not None and - isinstance(obj, (com.ABCSeries, com.ABCDataFrame))): + isinstance(obj, (ABCSeries, ABCDataFrame))): if how is not None: warnings.warn("The how kw argument is deprecated and removed " "in a future version. You can resample prior " @@ -111,7 +120,7 @@ def _gotitem(self, key, ndim, subset=None): self = self._shallow_copy(subset) self._reset_cache() if subset.ndim == 2: - if isscalar(key) and key in subset or com.is_list_like(key): + if isscalar(key) and key in subset or is_list_like(key): self._selection = key return self @@ -150,11 +159,11 @@ def _prep_values(self, values=None, kill_inf=True, how=None): # GH #12373 : rolling functions error on float32 data # make sure the data is coerced to float64 - if com.is_float_dtype(values.dtype): - values = com._ensure_float64(values) - elif com.is_integer_dtype(values.dtype): - values = com._ensure_float64(values) - elif com.needs_i8_conversion(values.dtype): + if is_float_dtype(values.dtype): + values = _ensure_float64(values) + elif is_integer_dtype(values.dtype): + values = _ensure_float64(values) + elif needs_i8_conversion(values.dtype): raise NotImplementedError("ops for {action} for this " "dtype {dtype} are not " "implemented".format( @@ -162,7 +171,7 @@ def _prep_values(self, values=None, kill_inf=True, how=None): dtype=values.dtype)) else: try: - values = com._ensure_float64(values) + values = _ensure_float64(values) except (ValueError, TypeError): raise TypeError("cannot handle this type -> {0}" "".format(values.dtype)) @@ -184,7 +193,7 @@ def _wrap_result(self, result, block=None, obj=None): # coerce if necessary if block is not None: - if com.is_timedelta64_dtype(block.values.dtype): + if is_timedelta64_dtype(block.values.dtype): result = pd.to_timedelta( result.ravel(), unit='ns').values.reshape(result.shape) @@ -345,7 +354,7 @@ def _prep_window(self, **kwargs): window = self._get_window() if isinstance(window, (list, tuple, np.ndarray)): return com._asarray_tuplesafe(window).astype(float) - elif com.is_integer(window): + elif is_integer(window): import 
scipy.signal as sig # the below may pop from kwargs @@ -543,7 +552,7 @@ def _apply(self, func, name=None, window=None, center=None, def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) # GH #12373: rolling functions error on float32 data - return cfunc(com._ensure_float64(arg), + return cfunc(_ensure_float64(arg), window, minp, **kwargs) # calculation function @@ -586,7 +595,7 @@ def count(self): results = [] for b in blocks: - if com.needs_i8_conversion(b.values): + if needs_i8_conversion(b.values): result = b.notnull().astype(int) else: try: @@ -850,7 +859,7 @@ class Rolling(_Rolling_and_Expanding): def validate(self): super(Rolling, self).validate() - if not com.is_integer(self.window): + if not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: raise ValueError("window must be non-negative") @@ -1484,7 +1493,7 @@ def _get_center_of_mass(com, span, halflife, alpha): def _offset(window, center): - if not com.is_integer(window): + if not is_integer(window): window = len(window) offset = (window - 1) / 2. if center else 0 try: diff --git a/pandas/formats/format.py b/pandas/formats/format.py index cc46ed57aeff0..436a9d5d5d4c8 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -10,8 +10,19 @@ import sys +from pandas.types.missing import isnull, notnull +from pandas.types.common import (is_categorical_dtype, + is_float_dtype, + is_period_arraylike, + is_integer_dtype, + is_datetimetz, + is_integer, + is_float, + is_numeric_dtype, + is_datetime64_dtype, + is_timedelta64_dtype) + from pandas.core.base import PandasObject -from pandas.core.common import isnull, notnull, is_numeric_dtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u, @@ -194,7 +205,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done # for Categoricals - if com.is_categorical_dtype(self.tr_series.dtype): + if is_categorical_dtype(self.tr_series.dtype): level_info = self.tr_series._values._repr_categories_info() if footer: footer += "\n" @@ -316,12 +327,12 @@ def should_show_dimensions(self): def _get_formatter(self, i): if isinstance(self.formatters, (list, tuple)): - if com.is_integer(i): + if is_integer(i): return self.formatters[i] else: return None else: - if com.is_integer(i) and i not in self.columns: + if is_integer(i) and i not in self.columns: i = self.columns[i] return self.formatters.get(i, None) @@ -1646,7 +1657,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, def _format_value(self, val): if lib.checknull(val): val = self.na_rep - elif com.is_float(val): + elif is_float(val): if lib.isposinf_scalar(val): val = self.inf_rep elif lib.isneginf_scalar(val): @@ -1867,19 +1878,19 @@ def get_formatted_cells(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.'): - if com.is_categorical_dtype(values): + if is_categorical_dtype(values): fmt_klass = CategoricalArrayFormatter - elif com.is_float_dtype(values.dtype): + elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter - elif com.is_period_arraylike(values): + elif is_period_arraylike(values): fmt_klass = PeriodArrayFormatter - elif com.is_integer_dtype(values.dtype): + elif is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter - elif com.is_datetimetz(values): + elif is_datetimetz(values): fmt_klass = 
Datetime64TZFormatter - elif com.is_datetime64_dtype(values.dtype): + elif is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter - elif com.is_timedelta64_dtype(values.dtype): + elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter @@ -1949,14 +1960,14 @@ def _format(x): if isinstance(vals, Index): vals = vals._values - is_float = lib.map_infer(vals, com.is_float) & notnull(vals) - leading_space = is_float.any() + is_float_type = lib.map_infer(vals, is_float) & notnull(vals) + leading_space = is_float_type.any() fmt_values = [] for i, v in enumerate(vals): - if not is_float[i] and leading_space: + if not is_float_type[i] and leading_space: fmt_values.append(' %s' % _format(v)) - elif is_float[i]: + elif is_float_type[i]: fmt_values.append(float_format(v)) else: fmt_values.append(' %s' % _format(v)) diff --git a/pandas/formats/printing.py b/pandas/formats/printing.py index a4eaec8d5334b..37bd4b63d6f7a 100644 --- a/pandas/formats/printing.py +++ b/pandas/formats/printing.py @@ -2,9 +2,9 @@ printing tools """ +from pandas.types.inference import is_sequence from pandas import compat from pandas.compat import u -import pandas.core.common as com from pandas.core.config import get_option @@ -213,7 +213,7 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items) - elif (com.is_sequence(thing) and + elif (is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, quote_strings=quote_strings, diff --git a/pandas/formats/style.py b/pandas/formats/style.py index 477ecccc03f4f..472fd958d35eb 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -17,10 +17,11 @@ "or `pip install Jinja2`" raise ImportError(msg) +from pandas.types.common import is_float, is_string_like + import numpy as np import pandas as pd from pandas.compat import lzip, range -import pandas.core.common as com from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice try: import matplotlib.pyplot as plt @@ -153,7 +154,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if com.is_float(x): + if is_float(x): return '{:>.{precision}g}'.format(x, precision=self.precision) else: return x @@ -893,7 +894,7 @@ def _highlight_extrema(data, color='yellow', max_=True): def _maybe_wrap_formatter(formatter): - if com.is_string_like(formatter): + if is_string_like(formatter): return lambda x: formatter.format(x) elif callable(formatter): return formatter diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 0bb80be013275..5c9938c932da2 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -12,6 +12,28 @@ from pandas.compat import range, u from pandas.compat.numpy import function as nv from pandas import compat + + +from pandas.types.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex +from pandas.types.missing import isnull, array_equivalent +from pandas.types.common import (_ensure_int64, _ensure_object, + _ensure_platform_int, + is_datetimetz, + is_integer, + is_float, + is_dtype_equal, + is_object_dtype, + is_categorical_dtype, + is_bool_dtype, + is_integer_dtype, is_float_dtype, + needs_i8_conversion, + is_iterator, is_list_like, + is_scalar) +from pandas.types.cast import 
_coerce_indexer_dtype +from pandas.core.common import (is_bool_indexer, + _values_from_object, + _asarray_tuplesafe) + from pandas.core.base import (PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin) import pandas.core.base as base @@ -22,15 +44,6 @@ import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing -from pandas.core.common import (isnull, array_equivalent, - is_object_dtype, is_datetimetz, ABCSeries, - ABCPeriodIndex, ABCMultiIndex, - _values_from_object, is_float, is_integer, - is_iterator, is_categorical_dtype, - _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, - is_integer_dtype, is_float_dtype, - needs_i8_conversion) from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin @@ -223,7 +236,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') else: - subarr = com._asarray_tuplesafe(data, dtype=object) + subarr = _asarray_tuplesafe(data, dtype=object) # _asarray_tuplesafe does not always copy underlying data, # so need to make sure that this happens @@ -264,7 +277,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) - elif data is None or lib.isscalar(data): + elif data is None or is_scalar(data): cls._scalar_data_error(data) else: if (tupleize_cols and isinstance(data, list) and data and @@ -284,7 +297,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # python2 - MultiIndex fails on mixed types pass # other iterable of some kind - subarr = com._asarray_tuplesafe(data, dtype=object) + subarr = _asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) """ @@ -539,7 +552,7 @@ def _coerce_to_ndarray(cls, data): """ if not isinstance(data, (np.ndarray, Index)): - if data is None or lib.isscalar(data): + if data is None or is_scalar(data): cls._scalar_data_error(data) # other iterable of some kind @@ -841,7 +854,7 @@ def to_datetime(self, dayfirst=False): return DatetimeIndex(self.values) def _assert_can_do_setop(self, other): - if not com.is_list_like(other): + if not is_list_like(other): raise TypeError('Input must be Index or array-like') return True @@ -1325,7 +1338,7 @@ def __getitem__(self, key): getitem = self._data.__getitem__ promote = self._shallow_copy - if lib.isscalar(key): + if is_scalar(key): return getitem(key) if isinstance(key, slice): @@ -1338,7 +1351,7 @@ def __getitem__(self, key): key = _values_from_object(key) result = getitem(key) - if not lib.isscalar(result): + if not is_scalar(result): return promote(result) else: return result @@ -1426,7 +1439,7 @@ def _ensure_compat_concat(indexes): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) if self._can_hold_na: taken = self._assert_take_fillable(self.values, indices, allow_fill=allow_fill, @@ -1442,7 +1455,7 @@ def take(self, indices, axis=0, allow_fill=True, def _assert_take_fillable(self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan): """ Internal method to handle NA filling of take """ - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) # only fill if we are 
passing a non-None fill_value if allow_fill and fill_value is not None: @@ -1491,7 +1504,7 @@ def _convert_for_op(self, value): def _assert_can_do_op(self, value): """ Check value is valid for scalar op """ - if not lib.isscalar(value): + if not is_scalar(value): msg = "'value' must be a scalar, passed: {0}" raise TypeError(msg.format(type(value).__name__)) @@ -1706,7 +1719,7 @@ def argsort(self, *args, **kwargs): return result.argsort(*args, **kwargs) def __add__(self, other): - if com.is_list_like(other): + if is_list_like(other): warnings.warn("using '+' to provide set union with Indexes is " "deprecated, use '|' or .union()", FutureWarning, stacklevel=2) @@ -1783,7 +1796,7 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) @@ -1866,7 +1879,7 @@ def intersection(self, other): if self.equals(other): return self._get_consensus_name(other) - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other) @@ -2028,7 +2041,7 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, Index) and lib.isscalar(key): + if isinstance(s, Index) and is_scalar(key): try: return s[key] except (IndexError, ValueError): @@ -2061,7 +2074,7 @@ def get_value(self, series, key): raise e1 except TypeError: # python 3 - if lib.isscalar(key): # pragma: no cover + if is_scalar(key): # pragma: no cover raise IndexError(key) raise InvalidIndexError(key) @@ -2137,7 +2150,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return pself.get_indexer(ptarget, method=method, limit=limit, tolerance=tolerance) - if not com.is_dtype_equal(self.dtype, target.dtype): + if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer(target, method=method, limit=limit, @@ -2161,7 +2174,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): indexer = self._engine.get_indexer(target._values) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def _convert_tolerance(self, tolerance): # override this method on subclasses @@ -2443,7 +2456,7 @@ def _reindex_non_unique(self, target): if len(missing): l = np.arange(len(indexer)) - missing = com._ensure_platform_int(missing) + missing = _ensure_platform_int(missing) missing_labels = target.take(missing) missing_indexer = _ensure_int64(l[~check]) cur_labels = self.take(indexer[check])._values @@ -2541,7 +2554,7 @@ def join(self, other, how='left', level=None, return_indexers=False): result = x, z, y return result - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.join(other, how=how, return_indexers=return_indexers) @@ -2637,8 +2650,8 @@ def _join_non_unique(self, other, how='left', return_indexers=False): [other._values], how=how, sort=True) - left_idx = com._ensure_platform_int(left_idx) - right_idx = com._ensure_platform_int(right_idx) + left_idx = _ensure_platform_int(left_idx) + right_idx = _ensure_platform_int(right_idx) join_index = self.values.take(left_idx) mask = left_idx == -1 @@ 
-2850,9 +2863,9 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): kind=kind) # return a slice - if not lib.isscalar(start_slice): + if not is_scalar(start_slice): raise AssertionError("Start slice bound is non-scalar") - if not lib.isscalar(end_slice): + if not is_scalar(end_slice): raise AssertionError("End slice bound is non-scalar") return slice(start_slice, end_slice, step) @@ -3483,7 +3496,7 @@ def _get_na_value(dtype): def _ensure_frozen(array_like, categories, copy=False): - array_like = com._coerce_indexer_dtype(array_like, categories) + array_like = _coerce_indexer_dtype(array_like, categories) array_like = array_like.view(FrozenNDArray) if copy: array_like = array_like.copy() diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 84b8926f4177f..f1d4fe2f26bdd 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -1,15 +1,21 @@ import numpy as np -import pandas.lib as lib import pandas.index as _index from pandas import compat from pandas.compat.numpy import function as nv +from pandas.types.generic import ABCCategorical, ABCSeries +from pandas.types.common import (is_categorical_dtype, + _ensure_platform_int, + is_list_like, + is_scalar) +from pandas.types.missing import array_equivalent + + from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg) from pandas.core.config import get_option from pandas.indexes.base import Index, _index_shared_docs import pandas.core.base as base -import pandas.core.common as com import pandas.core.missing as missing import pandas.indexes.base as ibase @@ -49,7 +55,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, com.ABCCategorical): + if isinstance(data, ABCCategorical): data = cls._create_categorical(cls, data, categories, ordered) elif isinstance(data, CategoricalIndex): data = data._data @@ -58,7 +64,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, # don't allow scalars # if data is None, then categories must be provided - if lib.isscalar(data): + if is_scalar(data): if data is not None or categories is None: cls._scalar_data_error(data) data = [] @@ -116,7 +122,7 @@ def _create_categorical(self, data, categories=None, ordered=None): ------- Categorical """ - if not isinstance(data, com.ABCCategorical): + if not isinstance(data, ABCCategorical): from pandas.core.categorical import Categorical data = Categorical(data, categories=categories, ordered=ordered) else: @@ -164,7 +170,7 @@ def _is_dtype_compat(self, other): ------ TypeError if the dtypes are not compatible """ - if com.is_categorical_dtype(other): + if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): @@ -172,7 +178,7 @@ def _is_dtype_compat(self, other): "when appending") else: values = other - if not com.is_list_like(values): + if not is_list_like(values): values = [values] other = CategoricalIndex(self._create_categorical( self, other, categories=self.categories, ordered=self.ordered)) @@ -191,7 +197,7 @@ def equals(self, other): try: other = self._is_dtype_compat(other) - return com.array_equivalent(self._data, other) + return array_equivalent(self._data, other) except (TypeError, ValueError): pass @@ -360,7 +366,7 @@ def reindex(self, target, method=None, level=None, limit=None, target = ibase._ensure_index(target) - if not com.is_categorical_dtype(target) and not target.is_unique: + if not 
is_categorical_dtype(target) and not target.is_unique: raise ValueError("cannot reindex with a non-unique indexer") indexer, missing = self.get_indexer_non_unique(np.array(target)) @@ -388,7 +394,7 @@ def reindex(self, target, method=None, level=None, limit=None, # unless we had an inital Categorical to begin with # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) - if com.is_categorical_dtype(target): + if is_categorical_dtype(target): new_target = target._shallow_copy(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -460,7 +466,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def get_indexer_non_unique(self, target): """ this is the same for a CategoricalIndex for get_indexer; the API @@ -491,7 +497,7 @@ def _convert_list_indexer(self, keyarr, kind=None): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.codes, indices, allow_fill=allow_fill, fill_value=fill_value, @@ -591,12 +597,12 @@ def _evaluate_compare(self, other): self, other._values, categories=self.categories, ordered=self.ordered) - if isinstance(other, (com.ABCCategorical, np.ndarray, - com.ABCSeries)): + if isinstance(other, (ABCCategorical, np.ndarray, + ABCSeries)): if len(self.values) != len(other): raise ValueError("Lengths must match to compare") - if isinstance(other, com.ABCCategorical): + if isinstance(other, ABCCategorical): if not self.values.is_dtype_equal(other): raise TypeError("categorical index comparisions must " "have the same categories and ordered " @@ -619,7 +625,7 @@ def _delegate_method(self, name, *args, **kwargs): if 'inplace' in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if lib.isscalar(res): + if is_scalar(res): return res return CategoricalIndex(res, name=self.name) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 05b2045a4850f..365a971f82a3b 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -13,6 +13,21 @@ from pandas.compat import range, zip, lrange, lzip, map from pandas.compat.numpy import function as nv from pandas import compat + + +from pandas.types.common import (_ensure_int64, + _ensure_platform_int, + is_object_dtype, + is_iterator, + is_list_like, + is_scalar) +from pandas.types.missing import isnull, array_equivalent +from pandas.core.common import (_values_from_object, + is_bool_indexer, + is_null_slice, + PerformanceWarning) + + from pandas.core.base import FrozenList import pandas.core.base as base from pandas.util.decorators import (Appender, cache_readonly, @@ -21,13 +36,6 @@ import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing -from pandas.core.common import (isnull, array_equivalent, - is_object_dtype, - _values_from_object, - is_iterator, - _ensure_int64, is_bool_indexer, - is_list_like, is_null_slice, - PerformanceWarning) from pandas.core.config import get_option @@ -798,7 +806,7 @@ def lexsort_depth(self): else: return 0 - int64_labels = [com._ensure_int64(lab) for lab in self.labels] + int64_labels = [_ensure_int64(lab) for lab in 
self.labels] for k in range(self.nlevels, 0, -1): if lib.is_lexsorted(int64_labels[:k]): return k @@ -984,7 +992,7 @@ def __setstate__(self, state): self._reset_identity() def __getitem__(self, key): - if lib.isscalar(key): + if is_scalar(key): retval = [] for lev, lab in zip(self.levels, self.labels): if lab[key] == -1: @@ -1011,7 +1019,7 @@ def __getitem__(self, key): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.labels, indices, allow_fill=allow_fill, fill_value=fill_value, @@ -1313,7 +1321,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): if not ascending: indexer = indexer[::-1] - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_labels = [lab.take(indexer) for lab in self.labels] new_index = MultiIndex(labels=new_labels, levels=self.levels, @@ -1377,7 +1385,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): else: indexer = self_index._engine.get_indexer(target._values) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def reindex(self, target, method=None, level=None, limit=None, tolerance=None): @@ -1759,7 +1767,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # selected from pandas import Series mapper = Series(indexer) - indexer = labels.take(com._ensure_platform_int(indexer)) + indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) m = result.map(mapper)._values diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 89fc05fdcc5f5..86d22e141f781 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -3,13 +3,15 @@ import pandas.algos as _algos import pandas.index as _index +from pandas.types.common import (is_dtype_equal, pandas_dtype, + is_float_dtype, is_object_dtype, + is_integer_dtype, is_scalar) +from pandas.types.missing import array_equivalent, isnull +from pandas.core.common import _values_from_object + from pandas import compat from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly -import pandas.core.common as com -from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype, - is_float_dtype, is_object_dtype, - is_integer_dtype) import pandas.indexes.base as ibase @@ -164,8 +166,8 @@ def equals(self, other): if self.is_(other): return True - return com.array_equivalent(com._values_from_object(self), - com._values_from_object(other)) + return array_equivalent(_values_from_object(self), + _values_from_object(other)) def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None @@ -287,17 +289,17 @@ def _format_native_types(self, na_rep='', float_format=None, decimal='.', def get_value(self, series, key): """ we always want to get an index value, never a value """ - if not lib.isscalar(key): + if not is_scalar(key): raise InvalidIndexError from pandas.core.indexing import maybe_droplevels from pandas.core.series import Series - k = com._values_from_object(key) + k = _values_from_object(key) loc = self.get_loc(k) - new_values = com._values_from_object(series)[loc] + new_values = _values_from_object(series)[loc] - if lib.isscalar(new_values) or new_values is None: + if is_scalar(new_values) or new_values is None: return 
new_values new_index = self[loc] diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 168143fdea047..f680d2da0161e 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -4,14 +4,16 @@ import numpy as np import pandas.index as _index +from pandas.types.common import (is_integer, + is_scalar, + is_int64_dtype) + from pandas import compat from pandas.compat import lrange, range from pandas.compat.numpy import function as nv from pandas.indexes.base import Index, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly -import pandas.core.common as com import pandas.indexes.base as ibase -import pandas.lib as lib from pandas.indexes.numeric import Int64Index @@ -120,7 +122,7 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result = object.__new__(cls) # handle passed None, non-integers - if start is None or not com.is_integer(start): + if start is None or not is_integer(start): try: return RangeIndex(start, stop, step, name=name, **kwargs) except TypeError: @@ -139,7 +141,7 @@ def _simple_new(cls, start, stop=None, step=None, name=None, @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ - if not (dtype is None or com.is_int64_dtype(dtype)): + if not (dtype is None or is_int64_dtype(dtype)): raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex') @cache_readonly @@ -448,7 +450,7 @@ def __getitem__(self, key): """ super_getitem = super(RangeIndex, self).__getitem__ - if lib.isscalar(key): + if is_scalar(key): n = int(key) if n != key: return super_getitem(key) @@ -510,7 +512,7 @@ def __getitem__(self, key): return super_getitem(key) def __floordiv__(self, other): - if com.is_integer(other): + if is_integer(other): if (len(self) == 0 or self._start % other == 0 and self._step % other == 0): @@ -560,7 +562,7 @@ def _evaluate_numeric_binop(self, other): # we don't have a representable op # so return a base index - if not com.is_integer(rstep) or not rstep: + if not is_integer(rstep) or not rstep: raise ValueError else: @@ -577,7 +579,7 @@ def _evaluate_numeric_binop(self, other): # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return # as a Float64Index if we have float-like descriptors - if not all([com.is_integer(x) for x in + if not all([is_integer(x) for x in [rstart, rstop, rstep]]): result = result.astype('float64') diff --git a/pandas/io/common.py b/pandas/io/common.py index 76395928eb011..6f9bddd0fdf9b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -11,8 +11,8 @@ from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat from pandas.formats.printing import pprint_thing -from pandas.core.common import is_number, AbstractMethodError - +from pandas.core.common import AbstractMethodError +from pandas.types.common import is_number try: import pathlib diff --git a/pandas/io/data.py b/pandas/io/data.py index 5fa440e7bb1ff..68151fbb091fa 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -19,7 +19,9 @@ ) import pandas.compat as compat from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime, DatetimeIndex, DateOffset -from pandas.core.common import is_list_like, PandasError + +from pandas.types.common import is_list_like +from pandas.core.common import PandasError from pandas.io.common import urlopen, ZipFile, urlencode from pandas.tseries.offsets import MonthEnd from pandas.util.testing import _network_error_classes diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 
775465ea9372d..703cdbeaa7a8f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -10,6 +10,9 @@ import abc import numpy as np +from pandas.types.common import (is_integer, is_float, + is_bool, is_list_like) + from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, @@ -22,7 +25,6 @@ from pandas.formats.printing import pprint_thing import pandas.compat as compat import pandas.compat.openpyxl_compat as openpyxl_compat -import pandas.core.common as com from warnings import warn from distutils.version import LooseVersion @@ -423,17 +425,17 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = DataFrame() continue - if com.is_list_like(header) and len(header) == 1: + if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None: - if com.is_list_like(header): + if is_list_like(header): header_names = [] control_row = [True for x in data[0]] for row in header: - if com.is_integer(skiprows): + if is_integer(skiprows): row += skiprows data[row], control_row = _fill_mi_header( @@ -444,9 +446,9 @@ def _parse_cell(cell_contents, cell_typ): else: data[header] = _trim_excel_header(data[header]) - if com.is_list_like(index_col): + if is_list_like(index_col): # forward fill values for MultiIndex index - if not com.is_list_like(header): + if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) @@ -459,7 +461,7 @@ def _parse_cell(cell_contents, cell_typ): else: last = data[row][col] - if com.is_list_like(header) and len(header) > 1: + if is_list_like(header) and len(header) > 1: has_index_names = True # GH 12292 : error when read one empty column from excel file @@ -556,21 +558,21 @@ def _pop_header_name(row, index_col): return none_fill(row[0]), row[1:] else: # pop out header name and fill w/ blank - i = index_col if not com.is_list_like(index_col) else max(index_col) + i = index_col if not is_list_like(index_col) else max(index_col) return none_fill(row[i]), row[:i] + [''] + row[i + 1:] def _conv_value(val): # Convert numpy types to Python types for the Excel writers. 
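# --- Illustrative sketch (editorial, not part of the patch) ----------------
# The hunk below only swaps the `com.is_*` helpers for the flat `is_*` imports;
# the behaviour of `_conv_value` is unchanged. A minimal, self-contained
# approximation of what the Excel writers need (numpy scalars coerced to plain
# Python types) is sketched here; `to_excel_value` is a hypothetical name used
# only for this example and does not exist in pandas.
import numpy as np

def to_excel_value(val):
    """Coerce numpy scalars to builtin Python types for an Excel writer."""
    if isinstance(val, (np.integer, int)) and not isinstance(val, bool):
        return int(val)
    if isinstance(val, (np.floating, float)):
        return float(val)
    if isinstance(val, (np.bool_, bool)):
        return bool(val)
    if isinstance(val, (list, tuple, np.ndarray)):
        return str(val)  # list-likes are stringified, as in the function below
    return val

# e.g. to_excel_value(np.int64(3)) -> 3, to_excel_value(np.float32(1.5)) -> 1.5
# ----------------------------------------------------------------------------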
- if com.is_integer(val): + if is_integer(val): val = int(val) - elif com.is_float(val): + elif is_float(val): val = float(val) - elif com.is_bool(val): + elif is_bool(val): val = bool(val) elif isinstance(val, Period): val = "%s" % val - elif com.is_list_like(val): + elif is_list_like(val): val = str(val) return val diff --git a/pandas/io/html.py b/pandas/io/html.py index 609642e248eda..e0d84a9617ae4 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -12,12 +12,12 @@ import numpy as np +from pandas.types.common import is_list_like from pandas.io.common import (EmptyDataError, _is_url, urlopen, parse_url, _validate_header_arg) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) -from pandas.core import common as com from pandas import Series from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing @@ -107,7 +107,7 @@ def _get_skiprows(skiprows): """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) - elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows): + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return skiprows elif skiprows is None: return 0 diff --git a/pandas/io/packers.py b/pandas/io/packers.py index ff06a5f212f8b..14e2c9b371296 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -47,6 +47,10 @@ import numpy as np from pandas import compat from pandas.compat import u, u_safe + +from pandas.types.common import (is_categorical_dtype, is_object_dtype, + needs_i8_conversion, pandas_dtype) + from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, @@ -55,9 +59,7 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame -from pandas.core.common import (PerformanceWarning, - is_categorical_dtype, is_object_dtype, - needs_i8_conversion, pandas_dtype) +from pandas.core.common import PerformanceWarning from pandas.io.common import get_filepath_or_buffer from pandas.core.internals import BlockManager, make_block import pandas.core.internals as internals diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dc9455289b757..84ea2a92b8026 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,20 +2,22 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import (range, lrange, StringIO, lzip, zip, - string_types, map, OrderedDict) -from pandas import compat from collections import defaultdict import re import csv import warnings +import datetime import numpy as np +from pandas import compat +from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas.types.common import (is_integer, _ensure_object, + is_list_like, is_integer_dtype, + is_float, + is_scalar) from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame -import datetime -import pandas.core.common as com from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser @@ -326,11 +328,11 @@ def _validate_nrows(nrows): msg = "'nrows' must be an integer" if nrows is not None: - if com.is_float(nrows): + if is_float(nrows): if 
int(nrows) != nrows: raise ValueError(msg) nrows = int(nrows) - elif not com.is_integer(nrows): + elif not is_integer(nrows): raise ValueError(msg) return nrows @@ -869,7 +871,7 @@ def _clean_options(self, options, engine): # handle skiprows; this is internally handled by the # c-engine, so only need for python parsers if engine != 'c': - if com.is_integer(skiprows): + if is_integer(skiprows): skiprows = lrange(skiprows) skiprows = set() if skiprows is None else set(skiprows) @@ -961,7 +963,7 @@ def _validate_parse_dates_arg(parse_dates): "for the 'parse_dates' parameter") if parse_dates is not None: - if lib.isscalar(parse_dates): + if is_scalar(parse_dates): if not lib.is_bool(parse_dates): raise TypeError(msg) @@ -1021,8 +1023,8 @@ def __init__(self, kwds): is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) if not (is_sequence and - all(map(com.is_integer, self.index_col)) or - com.is_integer(self.index_col)): + all(map(is_integer, self.index_col)) or + is_integer(self.index_col)): raise ValueError("index_col must only contain row numbers " "when specifying a multi-index header") @@ -1047,7 +1049,7 @@ def _should_parse_dates(self, i): name = self.index_names[i] j = self.index_col[i] - if lib.isscalar(self.parse_dates): + if is_scalar(self.parse_dates): return (j == self.parse_dates) or (name == self.parse_dates) else: return (j in self.parse_dates) or (name in self.parse_dates) @@ -1281,7 +1283,7 @@ def _convert_types(self, values, na_values, try_num_bool=True): mask = lib.ismember(values, na_values) na_count = mask.sum() if na_count > 0: - if com.is_integer_dtype(values): + if is_integer_dtype(values): values = values.astype(np.float64) np.putmask(values, mask, np.nan) return values, na_count @@ -1407,10 +1409,10 @@ def _set_noconvert_columns(self): usecols = self.usecols def _set(x): - if usecols and com.is_integer(x): + if usecols and is_integer(x): x = list(usecols)[x] - if not com.is_integer(x): + if not is_integer(x): x = names.index(x) self._reader.set_noconvert(x) @@ -1790,7 +1792,7 @@ def _set_no_thousands_columns(self): noconvert_columns = set() def _set(x): - if com.is_integer(x): + if is_integer(x): noconvert_columns.add(x) else: noconvert_columns.add(self.columns.index(x)) @@ -1954,7 +1956,7 @@ def _convert_data(self, data): def _to_recarray(self, data, columns): dtypes = [] - o = OrderedDict() + o = compat.OrderedDict() # use the columns to "order" the keys # in the unordered 'data' dictionary @@ -2439,7 +2441,7 @@ def converter(*date_cols): try: return tools._to_datetime( - com._ensure_object(strs), + _ensure_object(strs), utc=None, box=False, dayfirst=dayfirst, @@ -2492,7 +2494,7 @@ def _isindex(colspec): if isinstance(parse_spec, list): # list of column lists for colspec in parse_spec: - if lib.isscalar(colspec): + if is_scalar(colspec): if isinstance(colspec, int) and colspec not in data_dict: colspec = orig_names[colspec] if _isindex(colspec): @@ -2569,7 +2571,7 @@ def _clean_na_values(na_values, keep_default_na=True): (k, _floatify_na_values(v)) for k, v in na_values.items() # noqa ]) else: - if not com.is_list_like(na_values): + if not is_list_like(na_values): na_values = [na_values] na_values = _stringify_na_values(na_values) if keep_default_na: @@ -2622,7 +2624,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): if not isinstance(dtype, dict): dtype = defaultdict(lambda: dtype) # Convert column indexes to column names. 
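# --- Illustrative sketch (editorial, not part of the patch) ----------------
# The change below is again a pure import rename (com.is_integer -> is_integer).
# What the surrounding code in _get_empty_meta does: a user-supplied `dtype`
# mapping may be keyed by column *position* or column *name*, and is normalised
# to names. A stand-alone version, with a made-up helper name and example data:
def normalise_dtype_keys(dtype, columns):
    """Map integer keys in `dtype` to the corresponding column names."""
    return {columns[k] if isinstance(k, int) else k: v
            for k, v in dtype.items()}

# normalise_dtype_keys({0: "float64", "b": "int64"}, ["a", "b"])
# -> {"a": "float64", "b": "int64"}
# ----------------------------------------------------------------------------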
- dtype = dict((columns[k] if com.is_integer(k) else k, v) + dtype = dict((columns[k] if is_integer(k) else k, v) for k, v in compat.iteritems(dtype)) if index_col is None or index_col is False: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index c19dae7f3545e..2358c296f782e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ import numpy as np from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 -import pandas.core.common as com +from pandas.types.common import is_datetime64_dtype, _NS_DTYPE def to_pickle(obj, path): @@ -86,7 +86,7 @@ def _unpickle_array(bytes): # All datetimes should be stored as M8[ns]. When unpickling with # numpy1.6, it will read these as M8[us]. So this ensures all # datetime64 types are read as MS[ns] - if com.is_datetime64_dtype(arr): - arr = arr.view(com._NS_DTYPE) + if is_datetime64_dtype(arr): + arr = arr.view(_NS_DTYPE) return arr diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d4ca717ddbc4e..038ca7ac7775b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -12,11 +12,21 @@ import warnings import os +from pandas.types.common import (is_list_like, + is_categorical_dtype, + is_timedelta64_dtype, + is_datetime64tz_dtype, + is_datetime64_dtype, + _ensure_object, + _ensure_int64, + _ensure_platform_int) +from pandas.types.missing import array_equivalent + import numpy as np import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, - MultiIndex, Int64Index) + MultiIndex, Int64Index, isnull) from pandas.core import config from pandas.io.common import _stringify_path from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -32,7 +42,6 @@ _block2d_to_blocknd, _factor_indexer, _block_shape) from pandas.core.index import _ensure_index -import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter @@ -1677,7 +1686,7 @@ def validate_metadata(self, handler): new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) if new_metadata is not None and cur_metadata is not None \ - and not com.array_equivalent(new_metadata, cur_metadata): + and not array_equivalent(new_metadata, cur_metadata): raise ValueError("cannot append a categorical with " "different categories to the existing") @@ -2566,7 +2575,7 @@ def write_array(self, key, value, items=None): empty_array = self._is_empty_array(value.shape) transposed = False - if com.is_categorical_dtype(value): + if is_categorical_dtype(value): raise NotImplementedError('Cannot store a category dtype in ' 'a HDF5 dataset that uses format=' '"fixed". 
Use format="table".') @@ -2621,12 +2630,12 @@ def write_array(self, key, value, items=None): if empty_array: self.write_array_empty(key, value) else: - if com.is_datetime64_dtype(value.dtype): + if is_datetime64_dtype(value.dtype): self._handle.create_array( self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'datetime64' - elif com.is_datetime64tz_dtype(value.dtype): + elif is_datetime64tz_dtype(value.dtype): # store as UTC # with a zone self._handle.create_array(self.group, key, @@ -2635,7 +2644,7 @@ def write_array(self, key, value, items=None): node = getattr(self.group, key) node._v_attrs.tz = _get_tz(value.tz) node._v_attrs.value_type = 'datetime64' - elif com.is_timedelta64_dtype(value.dtype): + elif is_timedelta64_dtype(value.dtype): self._handle.create_array( self.group, key, value.view('i8')) getattr( @@ -3756,8 +3765,8 @@ def read(self, where=None, columns=None, **kwargs): if len(unique(key)) == len(key): sorter, _ = algos.groupsort_indexer( - com._ensure_int64(key), np.prod(N)) - sorter = com._ensure_platform_int(sorter) + _ensure_int64(key), np.prod(N)) + sorter = _ensure_platform_int(sorter) # create the objs for c in self.values_axes: @@ -3802,7 +3811,7 @@ def read(self, where=None, columns=None, **kwargs): unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_index = long_index.take(indexer) new_values = lp.values.take(indexer, axis=0) @@ -3903,7 +3912,7 @@ def write_data(self, chunksize, dropna=False): # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask - mask = com.isnull(a.data).all(axis=0) + mask = isnull(a.data).all(axis=0) if isinstance(mask, np.ndarray): masks.append(mask.astype('u1', copy=False)) @@ -4522,7 +4531,7 @@ def _convert_string_array(data, encoding, itemsize=None): # create the sized dtype if itemsize is None: - itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + itemsize = lib.max_len_string_array(_ensure_object(data.ravel())) data = np.asarray(data, dtype="S%d" % itemsize) return data @@ -4551,7 +4560,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - itemsize = lib.max_len_string_array(com._ensure_object(data)) + itemsize = lib.max_len_string_array(_ensure_object(data)) if compat.PY3: dtype = "U{0}".format(itemsize) else: @@ -4619,7 +4628,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.terms = None self.coordinates = None - if com.is_list_like(where): + if is_list_like(where): # see if we have a passed coordinate like try: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 324988360c9fe..8485a3f13f047 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -13,13 +13,15 @@ import numpy as np import pandas.lib as lib -import pandas.core.common as com +from pandas.types.missing import isnull +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.common import (is_list_like, + is_datetime64tz_dtype) + from pandas.compat import (lzip, map, zip, raise_with_traceback, string_types, text_type) from pandas.core.api import DataFrame, Series -from pandas.core.common import isnull from pandas.core.base import PandasObject -from pandas.types.api import DatetimeTZDtype from pandas.tseries.tools import to_datetime from contextlib import contextmanager @@ -90,7 +92,7 @@ def 
_handle_date_column(col, format=None): # parse dates as timestamp format = 's' if format is None else format return to_datetime(col, errors='coerce', unit=format, utc=True) - elif com.is_datetime64tz_dtype(col): + elif is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 return (to_datetime(col, errors='coerce') @@ -123,7 +125,7 @@ def _parse_date_columns(data_frame, parse_dates): # we could in theory do a 'nice' conversion from a FixedOffset tz # GH11216 for col_name, df_col in data_frame.iteritems(): - if com.is_datetime64tz_dtype(df_col): + if is_datetime64tz_dtype(df_col): data_frame[col_name] = _handle_date_column(df_col) return data_frame @@ -876,7 +878,7 @@ def _create_table_setup(self): for name, typ, is_index in column_names_and_types] if self.keys is not None: - if not com.is_list_like(self.keys): + if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys @@ -1465,7 +1467,7 @@ def _create_table_setup(self): for cname, ctype, _ in column_names_and_types] if self.keys is not None and len(self.keys): - if not com.is_list_like(self.keys): + if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c7390cf240f8a..bd19102c7f18c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -14,6 +14,10 @@ import sys import struct from dateutil.relativedelta import relativedelta + +from pandas.types.common import (is_categorical_dtype, is_datetime64_dtype, + _ensure_object) + from pandas.core.base import StringMixin from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame @@ -24,7 +28,7 @@ zip, BytesIO from pandas.util.decorators import Appender import pandas as pd -import pandas.core.common as com + from pandas.io.common import get_filepath_or_buffer, BaseIterator from pandas.lib import max_len_string_array, infer_dtype from pandas.tslib import NaT, Timestamp @@ -358,7 +362,7 @@ def _datetime_to_stata_elapsed_vec(dates, fmt): def parse_dates_safe(dates, delta=False, year=False, days=False): d = {} - if com.is_datetime64_dtype(dates.values): + if is_datetime64_dtype(dates.values): if delta: delta = dates - stata_epoch d['delta'] = delta.values.astype( @@ -396,7 +400,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): index = dates.index if bad_loc.any(): dates = Series(dates) - if com.is_datetime64_dtype(dates): + if is_datetime64_dtype(dates): dates[bad_loc] = to_datetime(stata_epoch) else: dates[bad_loc] = stata_epoch @@ -1746,7 +1750,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? 
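# --- Illustrative sketch (editorial, not part of the patch) ----------------
# `max_len_string_array` (pandas.lib) returns the length of the longest string
# in an object array; the Stata writer uses that width to pick a string type.
# A plain-Python equivalent, shown only to make the intent of the hunk below
# explicit (the hunk itself merely drops the `com.` prefix from _ensure_object):
def widest_string(values):
    """Length of the longest str in an iterable that may hold non-strings."""
    return max((len(v) for v in values if isinstance(v, str)), default=0)

# widest_string(["a", "abc", 3]) -> 3, so a 3-byte Stata string type is used.
# ----------------------------------------------------------------------------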
- itemsize = max_len_string_array(com._ensure_object(column.values)) + itemsize = max_len_string_array(_ensure_object(column.values)) return chr(max(itemsize, 1)) elif dtype == np.float64: return chr(255) @@ -1784,7 +1788,7 @@ def _dtype_to_default_stata_fmt(dtype, column): if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Writing general object arrays is not supported') - itemsize = max_len_string_array(com._ensure_object(column.values)) + itemsize = max_len_string_array(_ensure_object(column.values)) if itemsize > 244: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" @@ -1880,7 +1884,7 @@ def _prepare_categoricals(self, data): """Check for categorical columns, retain categorical information for Stata file and convert categorical data to int""" - is_cat = [com.is_categorical_dtype(data[col]) for col in data] + is_cat = [is_categorical_dtype(data[col]) for col in data] self._is_col_cat = is_cat self._value_labels = [] if not any(is_cat): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 9a995c17f0445..e5a49c5213a48 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -31,11 +31,12 @@ from datetime import datetime, date, time +from pandas.types.common import (is_object_dtype, is_datetime64_dtype, + is_datetime64tz_dtype) from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types -from pandas.core import common as com from pandas.core.datetools import format as date_format import pandas.io.sql as sql @@ -1275,7 +1276,7 @@ def test_datetime_with_timezone(self): def check(col): # check that a column is either datetime64[ns] # or datetime64[ns, UTC] - if com.is_datetime64_dtype(col.dtype): + if is_datetime64_dtype(col.dtype): # "2000-01-01 00:00:00-08:00" should convert to # "2000-01-01 08:00:00" @@ -1285,7 +1286,7 @@ def check(col): # "2000-06-01 07:00:00" self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00')) - elif com.is_datetime64tz_dtype(col.dtype): + elif is_datetime64tz_dtype(col.dtype): self.assertTrue(str(col.dt.tz) == 'UTC') # "2000-01-01 00:00:00-08:00" should convert to @@ -1311,9 +1312,9 @@ def check(col): # even with the same versions of psycopg2 & sqlalchemy, possibly a # Postgrsql server version difference col = df.DateColWithTz - self.assertTrue(com.is_object_dtype(col.dtype) or - com.is_datetime64_dtype(col.dtype) or - com.is_datetime64tz_dtype(col.dtype), + self.assertTrue(is_object_dtype(col.dtype) or + is_datetime64_dtype(col.dtype) or + is_datetime64tz_dtype(col.dtype), "DateCol loaded with incorrect type -> {0}" .format(col.dtype)) @@ -1327,7 +1328,7 @@ def check(col): self.conn, chunksize=1)), ignore_index=True) col = df.DateColWithTz - self.assertTrue(com.is_datetime64tz_dtype(col.dtype), + self.assertTrue(is_datetime64tz_dtype(col.dtype), "DateCol loaded with incorrect type -> {0}" .format(col.dtype)) self.assertTrue(str(col.dt.tz) == 'UTC') diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 830c68d62efad..5f45d1b547e62 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -15,7 +15,7 @@ import pandas as pd from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series -from pandas.core.common import is_categorical_dtype +from pandas.types.common import is_categorical_dtype 
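# --- Illustrative sketch (editorial, not part of the patch) ----------------
# The whole commit follows the one pattern visible in the test import above:
# type-introspection helpers (is_categorical_dtype, is_integer, is_scalar, ...)
# move from pandas.core.common to the new pandas.types.* modules. Downstream
# code that had to straddle both layouts could use a small shim like this
# (hypothetical snippet; current pandas exposes these helpers publicly under
# pandas.api.types instead):
try:
    from pandas.types.common import is_list_like   # layout introduced here (0.19)
except ImportError:
    from pandas.core.common import is_list_like    # pre-0.19 layout
# ----------------------------------------------------------------------------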
from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 0312fb023f7fd..35233d1b6ba94 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -15,6 +15,14 @@ from pandas.compat import range from pandas.compat.numpy import function as nv +from pandas.types.generic import ABCSparseArray, ABCSparseSeries +from pandas.types.common import (is_float, is_integer, + is_integer_dtype, _ensure_platform_int, + is_list_like, + is_scalar) +from pandas.types.cast import _possibly_convert_platform +from pandas.types.missing import isnull, notnull + from pandas._sparse import SparseIndex, BlockIndex, IntIndex import pandas._sparse as splib import pandas.index as _index @@ -40,13 +48,13 @@ def wrapper(self, other): if len(self) != len(other): raise AssertionError("length mismatch: %d vs. %d" % (len(self), len(other))) - if not isinstance(other, com.ABCSparseArray): + if not isinstance(other, ABCSparseArray): other = SparseArray(other, fill_value=self.fill_value) if name[0] == 'r': return _sparse_array_op(other, self, op, name[1:]) else: return _sparse_array_op(self, other, op, name) - elif lib.isscalar(other): + elif is_scalar(other): new_fill_value = op(np.float64(self.fill_value), np.float64(other)) return _wrap_result(name, op(self.sp_values, other), @@ -120,7 +128,7 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', if index is not None: if data is None: data = np.nan - if not lib.isscalar(data): + if not is_scalar(data): raise Exception("must only pass scalars with an index ") values = np.empty(len(index), dtype='float64') values.fill(data) @@ -177,7 +185,7 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', @classmethod def _simple_new(cls, data, sp_index, fill_value): - if (com.is_integer_dtype(data) and com.is_float(fill_value) and + if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float @@ -288,7 +296,7 @@ def __getitem__(self, key): """ """ - if com.is_integer(key): + if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): data_slice = self.values[key] @@ -340,11 +348,11 @@ def take(self, indices, axis=0, allow_fill=True, if axis: raise ValueError("axis must be 0, input was {0}".format(axis)) - if com.is_integer(indices): + if is_integer(indices): # return scalar return self[indices] - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) n = len(self) if allow_fill and fill_value is not None: # allow -1 to indicate self.fill_value, @@ -380,7 +388,7 @@ def take(self, indices, axis=0, allow_fill=True, return self._simple_new(new_values, sp_index, self.fill_value) def __setitem__(self, key, value): - # if com.is_integer(key): + # if is_integer(key): # self.values[key] = value # else: # raise Exception("SparseArray does not support seting non-scalars @@ -395,7 +403,7 @@ def __setslice__(self, i, j, value): j = 0 slobj = slice(i, j) # noqa - # if not lib.isscalar(value): + # if not is_scalar(value): # raise Exception("SparseArray does not support seting non-scalars # via slices") @@ -445,12 +453,12 @@ def count(self): @property def _null_fill_value(self): - return com.isnull(self.fill_value) + return isnull(self.fill_value) @property def _valid_sp_values(self): sp_vals = self.sp_values - mask = com.notnull(sp_vals) + mask = 
notnull(sp_vals) return sp_vals[mask] @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) @@ -466,7 +474,7 @@ def fillna(self, value, downcast=None): fill_value=value) else: new_values = self.sp_values.copy() - new_values[com.isnull(new_values)] = value + new_values[isnull(new_values)] = value return self._simple_new(new_values, self.sp_index, fill_value=self.fill_value) @@ -498,7 +506,7 @@ def cumsum(self, axis=0, *args, **kwargs): nv.validate_cumsum(args, kwargs) # TODO: gh-12855 - return a SparseArray here - if com.notnull(self.fill_value): + if notnull(self.fill_value): return self.to_dense().cumsum() # TODO: what if sp_values contains NaN?? @@ -569,7 +577,7 @@ def _maybe_to_dense(obj): def _maybe_to_sparse(array): - if isinstance(array, com.ABCSparseSeries): + if isinstance(array, ABCSparseSeries): array = SparseArray(array.values, sparse_index=array.sp_index, fill_value=array.fill_value, copy=True) if not isinstance(array, SparseArray): @@ -588,15 +596,15 @@ def _sanitize_values(arr): else: # scalar - if lib.isscalar(arr): + if is_scalar(arr): arr = [arr] # ndarray if isinstance(arr, np.ndarray): pass - elif com.is_list_like(arr) and len(arr) > 0: - arr = com._possibly_convert_platform(arr) + elif is_list_like(arr) and len(arr) > 0: + arr = _possibly_convert_platform(arr) else: arr = np.asarray(arr) @@ -624,8 +632,8 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") - if com.isnull(fill_value): - mask = com.notnull(arr) + if isnull(fill_value): + mask = notnull(arr) else: mask = arr != fill_value diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 52a6e6edf0896..811d8019c7fee 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -10,13 +10,15 @@ from pandas import compat import numpy as np +from pandas.types.missing import isnull, notnull +from pandas.types.common import _ensure_platform_int + +from pandas.core.common import _try_sort from pandas.compat.numpy import function as nv -from pandas.core.common import isnull, _try_sort from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) -import pandas.core.common as com import pandas.core.algorithms as algos from pandas.core.internals import (BlockManager, create_block_manager_from_arrays) @@ -520,7 +522,7 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return SparseDataFrame(index=index, columns=self.columns) indexer = self.index.get_indexer(index, method, limit=limit) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) mask = indexer == -1 need_mask = mask.any() @@ -546,7 +548,7 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, if level is not None: raise TypeError('Reindex by level not supported for sparse') - if com.notnull(fill_value): + if notnull(fill_value): raise NotImplementedError("'fill_value' argument is not supported") if limit: diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py index bc10b73a47723..666dae8071053 100644 --- a/pandas/sparse/list.py +++ b/pandas/sparse/list.py @@ -2,9 +2,9 @@ from pandas.core.base import PandasObject from pandas.formats.printing import pprint_thing +from pandas.types.common import is_scalar from pandas.sparse.array import SparseArray import pandas._sparse as splib -import pandas.lib as lib class SparseList(PandasObject): @@ -121,7 +121,7 @@ def append(self, 
value): ---------- value: scalar or array-like """ - if lib.isscalar(value): + if is_scalar(value): value = [value] sparr = SparseArray(value, fill_value=self.fill_value) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 88f396d20a91e..0996cd3bd826a 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -10,6 +10,7 @@ from pandas import compat import numpy as np +from pandas.types.common import is_list_like, is_scalar from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.frame import DataFrame from pandas.core.panel import Panel @@ -18,7 +19,6 @@ import pandas.core.common as com import pandas.core.ops as ops -import pandas.lib as lib class SparsePanelAxis(object): @@ -186,7 +186,7 @@ def _ixs(self, i, axis=0): key = self._get_axis(axis)[i] # xs cannot handle a non-scalar key, so just reindex here - if com.is_list_like(key): + if is_list_like(key): return self.reindex(**{self._get_axis_name(axis): key}) return self.xs(key, axis=axis) @@ -393,7 +393,7 @@ def _combine(self, other, func, axis=0): return self._combineFrame(other, func, axis=axis) elif isinstance(other, Panel): return self._combinePanel(other, func) - elif lib.isscalar(other): + elif is_scalar(other): new_frames = dict((k, func(v, other)) for k, v in self.iteritems()) return self._new_like(new_frames) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 5c7762c56ec6d..951c2ae0c0d5a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -8,8 +8,11 @@ import numpy as np import warnings +from pandas.types.missing import isnull +from pandas.types.common import is_scalar +from pandas.core.common import _values_from_object, _maybe_match_name + from pandas.compat.numpy import function as nv -from pandas.core.common import isnull, _values_from_object, _maybe_match_name from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -18,7 +21,6 @@ import pandas.core.common as com import pandas.core.ops as ops import pandas.index as _index -import pandas.lib as lib from pandas.util.decorators import Appender from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray, @@ -54,7 +56,7 @@ def wrapper(self, other): return _sparse_series_op(self, other, op, name) elif isinstance(other, DataFrame): return NotImplemented - elif lib.isscalar(other): + elif is_scalar(other): if isnull(other) or isnull(self.fill_value): new_fill_value = np.nan else: diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 6780cf311c244..e9563d9168206 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -1,7 +1,8 @@ import numpy as np from pandas import compat -from pandas.core.common import isnull, array_equivalent, is_dtype_equal +from pandas.types.missing import isnull, array_equivalent +from pandas.types.common import is_dtype_equal cdef NUMERIC_TYPES = ( bool, @@ -145,8 +146,15 @@ cpdef assert_almost_equal(a, b, if na != nb: from pandas.util.testing import raise_assert_detail + + # if we have a small diff set, print it + if abs(na-nb) < 10: + r = list(set(a) ^ set(b)) + else: + r = None + raise_assert_detail(obj, '{0} length are different'.format(obj), - na, nb) + na, nb, r) for i in xrange(len(a)): try: diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 46d30ab7fe313..bb475e47206c2 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -6,7 +6,7 @@ import warnings import numpy as np -from pandas import lib +from 
pandas.types.common import is_scalar from pandas.core.api import DataFrame, Series from pandas.util.decorators import Substitution, Appender @@ -226,7 +226,7 @@ def ensure_compat(dispatch, name, arg, func_kw=None, *args, **kwargs): aargs += ',' def f(a, b): - if lib.isscalar(b): + if is_scalar(b): return "{a}={b}".format(a=a, b=b) return "{a}=<{b}>".format(a=a, b=type(b).__name__) aargs = ','.join([f(a, b) for a, b in kwds.items() if b is not None]) diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py index 678689f2d2b30..b533d255bd196 100644 --- a/pandas/stats/ols.py +++ b/pandas/stats/ols.py @@ -13,7 +13,7 @@ from pandas.core.api import DataFrame, Series, isnull from pandas.core.base import StringMixin -from pandas.core.common import _ensure_float64 +from pandas.types.common import _ensure_float64 from pandas.core.index import MultiIndex from pandas.core.panel import Panel from pandas.util.decorators import cache_readonly diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 2b619b84a5994..020b7f1f1ab9d 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -10,7 +10,7 @@ from pandas import (notnull, DataFrame, Series, MultiIndex, date_range, Timestamp, compat) import pandas as pd -import pandas.core.common as com +from pandas.types.dtypes import CategoricalDtype from pandas.util.testing import (assert_series_equal, assert_frame_equal) import pandas.util.testing as tm @@ -45,8 +45,8 @@ def test_apply(self): 'c1': ['C', 'C', 'D', 'D']}) df = df.apply(lambda ts: ts.astype('category')) self.assertEqual(df.shape, (4, 2)) - self.assertTrue(isinstance(df['c0'].dtype, com.CategoricalDtype)) - self.assertTrue(isinstance(df['c1'].dtype, com.CategoricalDtype)) + self.assertTrue(isinstance(df['c0'].dtype, CategoricalDtype)) + self.assertTrue(isinstance(df['c1'].dtype, CategoricalDtype)) def test_apply_mixed_datetimelike(self): # mixed datetimelike diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b42aef9447373..d21db5ba52a45 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,6 +14,7 @@ import numpy.ma as ma import numpy.ma.mrecords as mrecords +from pandas.types.common import is_integer_dtype from pandas.compat import (lmap, long, zip, range, lrange, lzip, OrderedDict, is_platform_little_endian) from pandas import compat @@ -809,7 +810,7 @@ def test_constructor_list_of_lists(self): # GH #484 l = [[1, 'a'], [2, 'b']] df = DataFrame(data=l, columns=["num", "str"]) - self.assertTrue(com.is_integer_dtype(df['num'])) + self.assertTrue(is_integer_dtype(df['num'])) self.assertEqual(df['str'].dtype, np.object_) # GH 4851 diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 5f95ff6b6b601..c650436eefaf3 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,15 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import print_function - from datetime import timedelta import numpy as np - from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, option_context) from pandas.compat import u -from pandas.core import common as com +from pandas.types.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -84,8 +82,8 @@ def test_datetime_with_tz_dtypes(self): tzframe.iloc[1, 2] = pd.NaT result = tzframe.dtypes.sort_index() expected = 
Series([np.dtype('datetime64[ns]'), - com.DatetimeTZDtype('datetime64[ns, US/Eastern]'), - com.DatetimeTZDtype('datetime64[ns, CET]')], + DatetimeTZDtype('datetime64[ns, US/Eastern]'), + DatetimeTZDtype('datetime64[ns, CET]')], ['A', 'B', 'C']) assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index d7fed8131a4f4..578df5ba9101e 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -17,6 +17,9 @@ date_range) import pandas as pd +from pandas.types.common import (is_float_dtype, + is_integer, + is_scalar) from pandas.util.testing import (assert_almost_equal, assert_numpy_array_equal, assert_series_equal, @@ -26,7 +29,6 @@ from pandas.core.indexing import IndexingError import pandas.util.testing as tm -import pandas.lib as lib from pandas.tests.frame.common import TestData @@ -1419,15 +1421,15 @@ def test_setitem_single_column_mixed_datetime(self): # set an allowable datetime64 type from pandas import tslib df.ix['b', 'timestamp'] = tslib.iNaT - self.assertTrue(com.isnull(df.ix['b', 'timestamp'])) + self.assertTrue(isnull(df.ix['b', 'timestamp'])) # allow this syntax df.ix['c', 'timestamp'] = nan - self.assertTrue(com.isnull(df.ix['c', 'timestamp'])) + self.assertTrue(isnull(df.ix['c', 'timestamp'])) # allow this syntax df.ix['d', :] = nan - self.assertTrue(com.isnull(df.ix['c', :]).all() == False) # noqa + self.assertTrue(isnull(df.ix['c', :]).all() == False) # noqa # as of GH 3216 this will now work! # try to set with a list like item @@ -1619,7 +1621,7 @@ def test_set_value_resize(self): res = self.frame.copy() res3 = res.set_value('foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['baz'])) + self.assertTrue(is_float_dtype(res3['baz'])) self.assertTrue(isnull(res3['baz'].drop(['foobar'])).all()) self.assertRaises(ValueError, res3.set_value, 'foobar', 'baz', 'sam') @@ -1662,7 +1664,7 @@ def test_single_element_ix_dont_upcast(self): (int, np.integer))) result = self.frame.ix[self.frame.index[5], 'E'] - self.assertTrue(com.is_integer(result)) + self.assertTrue(is_integer(result)) def test_irow(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2)) @@ -2268,7 +2270,7 @@ def _check_align(df, cond, other, check_dtypes=True): d = df[k].values c = cond[k].reindex(df[k].index).fillna(False).values - if lib.isscalar(other): + if is_scalar(other): o = other else: if isinstance(other, np.ndarray): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a6246790f83cb..44c7f2277293d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -6,6 +6,9 @@ import warnings from datetime import datetime +from pandas.types.common import (is_integer_dtype, + is_float_dtype, + is_scalar) from pandas.compat import range, lrange, lzip, StringIO, lmap, map from pandas.tslib import NaT from numpy import nan @@ -22,7 +25,7 @@ assert_frame_equal, assert_panel_equal, assert_attr_equal, slow) from pandas.formats.printing import pprint_thing -from pandas import concat, lib +from pandas import concat from pandas.core.common import PerformanceWarning import pandas.util.testing as tm @@ -200,7 +203,7 @@ def _print(result, error=None): return try: - if lib.isscalar(rs) and lib.isscalar(xp): + if is_scalar(rs) and is_scalar(xp): self.assertEqual(rs, xp) elif xp.ndim == 1: assert_series_equal(rs, xp) @@ -775,7 +778,7 @@ def test_ix_loc_consistency(self): # this is not an exhaustive case def compare(result, 
expected): - if lib.isscalar(expected): + if is_scalar(expected): self.assertEqual(result, expected) else: self.assertTrue(expected.equals(result)) @@ -2888,8 +2891,8 @@ def test_setitem_dtype_upcast(self): columns=['foo', 'bar', 'baz']) assert_frame_equal(left, right) - self.assertTrue(com.is_integer_dtype(left['foo'])) - self.assertTrue(com.is_integer_dtype(left['baz'])) + self.assertTrue(is_integer_dtype(left['foo'])) + self.assertTrue(is_integer_dtype(left['baz'])) left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, index=list('ab'), @@ -2900,8 +2903,8 @@ def test_setitem_dtype_upcast(self): columns=['foo', 'bar', 'baz']) assert_frame_equal(left, right) - self.assertTrue(com.is_float_dtype(left['foo'])) - self.assertTrue(com.is_float_dtype(left['baz'])) + self.assertTrue(is_float_dtype(left['foo'])) + self.assertTrue(is_float_dtype(left['baz'])) def test_setitem_iloc(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 2a7e8a957977f..b7ec4d570f18b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -8,10 +8,11 @@ import numpy.ma as ma import pandas as pd +from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype from pandas import Index, Series, isnull, date_range, period_range from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp, DatetimeIndex -import pandas.core.common as com + import pandas.lib as lib from pandas.compat import lrange, range, zip, OrderedDict, long @@ -144,11 +145,11 @@ def test_constructor_categorical(self): ValueError, lambda: Series(pd.Categorical([1, 2, 3]), dtype='int64')) cat = Series(pd.Categorical([1, 2, 3]), dtype='category') - self.assertTrue(com.is_categorical_dtype(cat)) - self.assertTrue(com.is_categorical_dtype(cat.dtype)) + self.assertTrue(is_categorical_dtype(cat)) + self.assertTrue(is_categorical_dtype(cat.dtype)) s = Series([1, 2, 3], dtype='category') - self.assertTrue(com.is_categorical_dtype(s)) - self.assertTrue(com.is_categorical_dtype(s.dtype)) + self.assertTrue(is_categorical_dtype(s)) + self.assertTrue(is_categorical_dtype(s.dtype)) def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) @@ -429,7 +430,7 @@ def test_constructor_with_datetime_tz(self): s = Series(dr) self.assertTrue(s.dtype.name == 'datetime64[ns, US/Eastern]') self.assertTrue(s.dtype == 'datetime64[ns, US/Eastern]') - self.assertTrue(com.is_datetime64tz_dtype(s.dtype)) + self.assertTrue(is_datetime64tz_dtype(s.dtype)) self.assertTrue('datetime64[ns, US/Eastern]' in str(s)) # export diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 6e82f81f901a9..c25895548dcb9 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.types.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range) from pandas.tseries.period import PeriodIndex @@ -49,16 +50,16 @@ def test_dt_namespace_accessor(self): def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): - if com.is_integer_dtype(result): + if is_integer_dtype(result): result = result.astype('int64') - elif not com.is_list_like(result): + elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name) def 
compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) - if not (com.is_list_like(a) and com.is_list_like(b)): + if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 15ca238ee32a0..64ebaa63cc10f 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -7,16 +7,14 @@ import numpy as np import pandas as pd +from pandas.types.common import is_integer, is_scalar from pandas import Index, Series, DataFrame, isnull, date_range from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta -import pandas.core.common as com import pandas.core.datetools as datetools -import pandas.lib as lib - from pandas.compat import lrange, range from pandas import compat from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -375,7 +373,7 @@ def test_getitem_ambiguous_keyerror(self): def test_getitem_unordered_dup(self): obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) - self.assertTrue(lib.isscalar(obj['c'])) + self.assertTrue(is_scalar(obj['c'])) self.assertEqual(obj['c'], 0) def test_getitem_dups_with_missing(self): @@ -1174,23 +1172,23 @@ def test_where_numeric_with_string(self): s = pd.Series([1, 2, 3]) w = s.where(s > 1, 'X') - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') w = s.where(s > 1, ['X', 'Y', 'Z']) - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') w = s.where(s > 1, np.array(['X', 'Y', 'Z'])) - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index e0bff7fbd39e4..7d2517987e526 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -7,7 +7,7 @@ from pandas import (Index, Series, _np_version_under1p9) from pandas.tseries.index import Timestamp -import pandas.core.common as com +from pandas.types.common import is_integer import pandas.util.testing as tm from .common import TestData @@ -96,11 +96,11 @@ def test_quantile_interpolation_dtype(self): # interpolation = linear (default case) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) - self.assertTrue(com.is_integer(q)) + self.assertTrue(is_integer(q)) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) - self.assertTrue(com.is_integer(q)) + self.assertTrue(is_integer(q)) def test_quantile_interpolation_np_lt_1p9(self): # GH #10174 diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 
77ae3ca20d123..2721d8d0e5e69 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -9,7 +9,7 @@ import pandas as pd import pandas.compat as compat -import pandas.core.common as com +from pandas.types.common import is_object_dtype, is_datetimetz import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta) @@ -517,7 +517,7 @@ def test_value_counts_unique_nunique(self): continue # special assign to the numpy array - if com.is_datetimetz(o): + if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = pd.tslib.iNaT @@ -982,8 +982,8 @@ def test_memory_usage(self): res = o.memory_usage() res_deep = o.memory_usage(deep=True) - if (com.is_object_dtype(o) or (isinstance(o, Series) and - com.is_object_dtype(o.index))): + if (is_object_dtype(o) or (isinstance(o, Series) and + is_object_dtype(o.index))): # if there are objects, only deep will pick them up self.assertTrue(res_deep > res) else: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 90876a4541da6..2ca1fc71df20a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -8,12 +8,17 @@ import numpy as np +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (is_categorical_dtype, + is_object_dtype, + is_float_dtype, + is_integer_dtype) + import pandas as pd import pandas.compat as compat -import pandas.core.common as com import pandas.util.testing as tm from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, - Timestamp, CategoricalIndex) + Timestamp, CategoricalIndex, isnull) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -195,18 +200,18 @@ def f(): # This should result in integer categories, not float! cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.categories)) + self.assertTrue(is_integer_dtype(cat.categories)) # https://github.com/pydata/pandas/issues/3678 cat = pd.Categorical([np.nan, 1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.categories)) + self.assertTrue(is_integer_dtype(cat.categories)) # this should result in floats cat = pd.Categorical([np.nan, 1, 2., 3]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) cat = pd.Categorical([np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) # Deprecating NaNs in categoires (GH #10748) # preserve int as far as possible by converting to object if NaN is in @@ -214,23 +219,23 @@ def f(): with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) - self.assertTrue(com.is_object_dtype(cat.categories)) + self.assertTrue(is_object_dtype(cat.categories)) # This doesn't work -> this would probably need some kind of "remember # the original type" feature to try to cast the array interface result # to... 
# vals = np.asarray(cat[cat.notnull()]) - # self.assertTrue(com.is_integer_dtype(vals)) + # self.assertTrue(is_integer_dtype(vals)) with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) - self.assertTrue(com.is_object_dtype(cat.categories)) + self.assertTrue(is_object_dtype(cat.categories)) # but don't do it for floats with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) # corner cases cat = pd.Categorical([1]) @@ -552,7 +557,7 @@ def test_na_flags_int_categories(self): cat = Categorical(labels, categories, fastpath=True) repr(cat) - self.assert_numpy_array_equal(com.isnull(cat), labels == -1) + self.assert_numpy_array_equal(isnull(cat), labels == -1) def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', @@ -2076,15 +2081,15 @@ def test_assignment_to_dataframe(self): result = df.dtypes expected = Series( - [np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D']) + [np.dtype('int32'), CategoricalDtype()], index=['value', 'D']) tm.assert_series_equal(result, expected) df['E'] = s str(df) result = df.dtypes - expected = Series([np.dtype('int32'), com.CategoricalDtype(), - com.CategoricalDtype()], + expected = Series([np.dtype('int32'), CategoricalDtype(), + CategoricalDtype()], index=['value', 'D', 'E']) tm.assert_series_equal(result, expected) @@ -3234,7 +3239,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.iloc[2:4, :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.iloc[2, :] @@ -3244,7 +3249,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.iloc[2, 0] @@ -3254,7 +3259,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.loc["j", :] @@ -3264,7 +3269,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.loc["j", "cats"] @@ -3275,7 +3280,7 @@ def test_slicing_and_getting_ops(self): # res_df = df.ix["j":"k",[0,1]] # doesn't work? 
res_df = df.ix["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.ix["j", :] @@ -3285,7 +3290,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.ix[:, "cats"] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.ix["j", 0] @@ -3318,23 +3323,23 @@ def test_slicing_and_getting_ops(self): res_df = df.iloc[slice(2, 4)] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_df = df.iloc[[2, 3]] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) res_df = df.iloc[:, slice(0, 2)] tm.assert_frame_equal(res_df, df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_df = df.iloc[:, [0, 1]] tm.assert_frame_equal(res_df, df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) def test_slicing_doc_examples(self): @@ -4114,7 +4119,7 @@ def test_astype_to_other(self): s = self.cat['value_group'] expected = s tm.assert_series_equal(s.astype('category'), expected) - tm.assert_series_equal(s.astype(com.CategoricalDtype()), expected) + tm.assert_series_equal(s.astype(CategoricalDtype()), expected) self.assertRaises(ValueError, lambda: s.astype('float64')) cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) @@ -4139,10 +4144,10 @@ def cmp(a, b): # valid conversion for valid in [lambda x: x.astype('category'), - lambda x: x.astype(com.CategoricalDtype()), + lambda x: x.astype(CategoricalDtype()), lambda x: x.astype('object').astype('category'), lambda x: x.astype('object').astype( - com.CategoricalDtype()) + CategoricalDtype()) ]: result = valid(s) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 56b1b542d547e..09dd3f7ab517c 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,21 +1,12 @@ # -*- coding: utf-8 -*- -import collections -from datetime import datetime, timedelta -import re import nose import numpy as np -import pandas as pd -from pandas.tslib import iNaT, NaT -from pandas import (Series, DataFrame, date_range, DatetimeIndex, - TimedeltaIndex, Timestamp, Float64Index) -from pandas import compat -from pandas.compat import range, lrange, lmap, u -from pandas.core.common import notnull, isnull, array_equivalent + +from pandas import Series, Timestamp +from pandas.compat import range, lmap import pandas.core.common as com -import pandas.core.convert as convert import pandas.util.testing as tm -import pandas.core.config as cf _multiprocess_can_split_ = True @@ -28,22 +19,6 @@ def test_mut_exclusive(): assert com._mut_exclusive(major=None, major_axis=None) is None -def test_is_sequence(): - is_seq = com.is_sequence - assert (is_seq((1, 2))) - assert (is_seq([1, 2])) - assert (not is_seq("abcd")) - assert (not is_seq(u("abcd"))) - assert (not is_seq(np.int64)) - - class A(object): - - def __getitem__(self): - return 1 - - assert (not is_seq(A())) - - def test_get_callable_name(): 
from functools import partial getname = com._get_callable_name @@ -68,407 +43,6 @@ def __call__(self): assert getname(1) is None -class TestInferDtype(tm.TestCase): - - def test_infer_dtype_from_scalar(self): - # Test that _infer_dtype_from_scalar is returning correct dtype for int - # and float. - - for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, - np.int32, np.uint64, np.int64]: - data = dtypec(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, type(data)) - - data = 12 - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.int64) - - for dtypec in [np.float16, np.float32, np.float64]: - data = dtypec(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, dtypec) - - data = np.float(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.float64) - - for data in [True, False]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.bool_) - - for data in [np.complex64(1), np.complex128(1)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.complex_) - - import datetime - for data in [np.datetime64(1, 'ns'), pd.Timestamp(1), - datetime.datetime(2000, 1, 1, 0, 0)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'M8[ns]') - - for data in [np.timedelta64(1, 'ns'), pd.Timedelta(1), - datetime.timedelta(1)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'm8[ns]') - - for data in [datetime.date(2000, 1, 1), - pd.Timestamp(1, tz='US/Eastern'), 'foo']: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.object_) - - -def test_notnull(): - assert notnull(1.) - assert not notnull(None) - assert not notnull(np.NaN) - - with cf.option_context("mode.use_inf_as_null", False): - assert notnull(np.inf) - assert notnull(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notnull(arr) - assert result.all() - - with cf.option_context("mode.use_inf_as_null", True): - assert not notnull(np.inf) - assert not notnull(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notnull(arr) - assert result.sum() == 2 - - with cf.option_context("mode.use_inf_as_null", False): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - -def test_isnull(): - assert not isnull(1.) 
- assert isnull(None) - assert isnull(np.NaN) - assert not isnull(np.inf) - assert not isnull(-np.inf) - - # series - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - # frame - for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), - tm.makeMixedDataFrame()]: - result = isnull(df) - expected = df.apply(isnull) - tm.assert_frame_equal(result, expected) - - # panel - for p in [tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) - ]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel_equal(result, expected) - - # panel 4d - for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel4d_equal(result, expected) - - -def test_isnull_lists(): - result = isnull([[False]]) - exp = np.array([[False]]) - assert (np.array_equal(result, exp)) - - result = isnull([[1], [2]]) - exp = np.array([[False], [False]]) - assert (np.array_equal(result, exp)) - - # list of strings / unicode - result = isnull(['foo', 'bar']) - assert (not result.any()) - - result = isnull([u('foo'), u('bar')]) - assert (not result.any()) - - -def test_isnull_nat(): - result = isnull([NaT]) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - result = isnull(np.array([NaT], dtype=object)) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - -def test_isnull_numpy_nat(): - arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), - np.datetime64('NaT', 's')]) - result = isnull(arr) - expected = np.array([True] * 4) - tm.assert_numpy_array_equal(result, expected) - - -def test_isnull_datetime(): - assert (not isnull(datetime.now())) - assert notnull(datetime.now()) - - idx = date_range('1/1/1990', periods=20) - assert (notnull(idx).all()) - - idx = np.asarray(idx) - idx[0] = iNaT - idx = DatetimeIndex(idx) - mask = isnull(idx) - assert (mask[0]) - assert (not mask[1:].any()) - - # GH 9129 - pidx = idx.to_period(freq='M') - mask = isnull(pidx) - assert (mask[0]) - assert (not mask[1:].any()) - - mask = isnull(pidx[1:]) - assert (not mask.any()) - - -class TestIsNull(tm.TestCase): - - def test_0d_array(self): - self.assertTrue(isnull(np.array(np.nan))) - self.assertFalse(isnull(np.array(0.0))) - self.assertFalse(isnull(np.array(0))) - # test object dtype - self.assertTrue(isnull(np.array(np.nan, dtype=object))) - self.assertFalse(isnull(np.array(0.0, dtype=object))) - self.assertFalse(isnull(np.array(0, dtype=object))) - - -class TestNumberScalar(tm.TestCase): - - def test_is_number(self): - - self.assertTrue(com.is_number(True)) - self.assertTrue(com.is_number(1)) - self.assertTrue(com.is_number(1.1)) - self.assertTrue(com.is_number(1 + 3j)) - self.assertTrue(com.is_number(np.bool(False))) - self.assertTrue(com.is_number(np.int64(1))) - self.assertTrue(com.is_number(np.float64(1.1))) - self.assertTrue(com.is_number(np.complex128(1 + 3j))) - self.assertTrue(com.is_number(np.nan)) - - self.assertFalse(com.is_number(None)) - self.assertFalse(com.is_number('x')) - self.assertFalse(com.is_number(datetime(2011, 1, 1))) - self.assertFalse(com.is_number(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_number(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_number(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_number(timedelta(1000))) - self.assertFalse(com.is_number(pd.Timedelta('1 days'))) - - # questionable - 
self.assertFalse(com.is_number(np.bool_(False))) - self.assertTrue(com.is_number(np.timedelta64(1, 'D'))) - - def test_is_bool(self): - self.assertTrue(com.is_bool(True)) - self.assertTrue(com.is_bool(np.bool(False))) - self.assertTrue(com.is_bool(np.bool_(False))) - - self.assertFalse(com.is_bool(1)) - self.assertFalse(com.is_bool(1.1)) - self.assertFalse(com.is_bool(1 + 3j)) - self.assertFalse(com.is_bool(np.int64(1))) - self.assertFalse(com.is_bool(np.float64(1.1))) - self.assertFalse(com.is_bool(np.complex128(1 + 3j))) - self.assertFalse(com.is_bool(np.nan)) - self.assertFalse(com.is_bool(None)) - self.assertFalse(com.is_bool('x')) - self.assertFalse(com.is_bool(datetime(2011, 1, 1))) - self.assertFalse(com.is_bool(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_bool(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_bool(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_bool(timedelta(1000))) - self.assertFalse(com.is_bool(np.timedelta64(1, 'D'))) - self.assertFalse(com.is_bool(pd.Timedelta('1 days'))) - - def test_is_integer(self): - self.assertTrue(com.is_integer(1)) - self.assertTrue(com.is_integer(np.int64(1))) - - self.assertFalse(com.is_integer(True)) - self.assertFalse(com.is_integer(1.1)) - self.assertFalse(com.is_integer(1 + 3j)) - self.assertFalse(com.is_integer(np.bool(False))) - self.assertFalse(com.is_integer(np.bool_(False))) - self.assertFalse(com.is_integer(np.float64(1.1))) - self.assertFalse(com.is_integer(np.complex128(1 + 3j))) - self.assertFalse(com.is_integer(np.nan)) - self.assertFalse(com.is_integer(None)) - self.assertFalse(com.is_integer('x')) - self.assertFalse(com.is_integer(datetime(2011, 1, 1))) - self.assertFalse(com.is_integer(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_integer(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_integer(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_integer(timedelta(1000))) - self.assertFalse(com.is_integer(pd.Timedelta('1 days'))) - - # questionable - self.assertTrue(com.is_integer(np.timedelta64(1, 'D'))) - - def test_is_float(self): - self.assertTrue(com.is_float(1.1)) - self.assertTrue(com.is_float(np.float64(1.1))) - self.assertTrue(com.is_float(np.nan)) - - self.assertFalse(com.is_float(True)) - self.assertFalse(com.is_float(1)) - self.assertFalse(com.is_float(1 + 3j)) - self.assertFalse(com.is_float(np.bool(False))) - self.assertFalse(com.is_float(np.bool_(False))) - self.assertFalse(com.is_float(np.int64(1))) - self.assertFalse(com.is_float(np.complex128(1 + 3j))) - self.assertFalse(com.is_float(None)) - self.assertFalse(com.is_float('x')) - self.assertFalse(com.is_float(datetime(2011, 1, 1))) - self.assertFalse(com.is_float(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_float(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_float(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_float(timedelta(1000))) - self.assertFalse(com.is_float(np.timedelta64(1, 'D'))) - self.assertFalse(com.is_float(pd.Timedelta('1 days'))) - - -def test_downcast_conv(): - # test downcasting - - arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) - result = com._possibly_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) - - arr = np.array([8., 8., 8., 8., 8.9999999999995]) - result = com._possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - arr = np.array([8., 8., 8., 8., 9.0000000000005]) - result = 
com._possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - # conversions - - expected = np.array([1, 2]) - for dtype in [np.float64, object, np.int64]: - arr = np.array([1.0, 2.0], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected, check_dtype=False) - - for dtype in [np.float64, object]: - expected = np.array([1.0, 2.0, np.nan], dtype=dtype) - arr = np.array([1.0, 2.0, np.nan], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected) - - # empties - for dtype in [np.int32, np.float64, np.float32, np.bool_, - np.int64, object]: - arr = np.array([], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'int64') - tm.assert_almost_equal(result, np.array([], dtype=np.int64)) - assert result.dtype == np.int64 - - -def test_array_equivalent(): - assert array_equivalent(np.array([np.nan, np.nan]), - np.array([np.nan, np.nan])) - assert array_equivalent(np.array([np.nan, 1, np.nan]), - np.array([np.nan, 1, np.nan])) - assert array_equivalent(np.array([np.nan, None], dtype='object'), - np.array([np.nan, None], dtype='object')) - assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), - np.array([np.nan, 1 + 1j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1 + 1j], dtype='complex'), np.array( - [np.nan, 1 + 2j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) - assert not array_equivalent( - np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - assert array_equivalent(Float64Index([0, np.nan]), - Float64Index([0, np.nan])) - assert not array_equivalent( - Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), - DatetimeIndex([0, np.nan])) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - assert array_equivalent(TimedeltaIndex([0, np.nan]), - TimedeltaIndex([0, np.nan])) - assert not array_equivalent( - TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), - DatetimeIndex([0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( - [1, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) - - -def test_array_equivalent_str(): - for dtype in ['O', 'S', 'U']: - assert array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'B'], dtype=dtype)) - assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'X'], dtype=dtype)) - - -def test_datetimeindex_from_empty_datetime64_array(): - for unit in ['ms', 'us', 'ns']: - idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) - assert (len(idx) == 0) - - -def test_nan_to_nat_conversions(): - - df = DataFrame(dict({ - 'A': np.asarray( - lrange(10), dtype='float64'), - 'B': Timestamp('20010101') - })) - df.iloc[3:6, :] = np.nan - result = df.loc[4, 'B'].value - assert (result == iNaT) - - s = df['B'].copy() - s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert (isnull(s[8])) - 
- # numpy < 1.7.0 is wrong - from distutils.version import LooseVersion - if LooseVersion(np.__version__) >= '1.7.0': - assert (s[8].value == np.datetime64('NaT').astype(np.int64)) - - def test_any_none(): assert (com._any_none(1, 2, 3, None)) assert (not com._any_none(1, 2, 3, 4)) @@ -567,122 +141,6 @@ def test_groupby(): assert v == expected[k] -def test_is_list_like(): - passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), - Series([]), Series(['a']).str) - fails = (1, '2', object()) - - for p in passes: - assert com.is_list_like(p) - - for f in fails: - assert not com.is_list_like(f) - - -def test_is_dict_like(): - passes = [{}, {'A': 1}, pd.Series([1])] - fails = ['1', 1, [1, 2], (1, 2), range(2), pd.Index([1])] - - for p in passes: - assert com.is_dict_like(p) - - for f in fails: - assert not com.is_dict_like(f) - - -def test_is_named_tuple(): - passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) - fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) - - for p in passes: - assert com.is_named_tuple(p) - - for f in fails: - assert not com.is_named_tuple(f) - - -def test_is_hashable(): - - # all new-style classes are hashable by default - class HashableClass(object): - pass - - class UnhashableClass1(object): - __hash__ = None - - class UnhashableClass2(object): - - def __hash__(self): - raise TypeError("Not hashable") - - hashable = (1, - 3.14, - np.float64(3.14), - 'a', - tuple(), - (1, ), - HashableClass(), ) - not_hashable = ([], UnhashableClass1(), ) - abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) - - for i in hashable: - assert com.is_hashable(i) - for i in not_hashable: - assert not com.is_hashable(i) - for i in abc_hashable_not_really_hashable: - assert not com.is_hashable(i) - - # numpy.array is no longer collections.Hashable as of - # https://github.com/numpy/numpy/pull/5326, just test - # pandas.common.is_hashable() - assert not com.is_hashable(np.array([])) - - # old-style classes in Python 2 don't appear hashable to - # collections.Hashable but also seem to support hash() by default - if compat.PY2: - - class OldStyleClass(): - pass - - c = OldStyleClass() - assert not isinstance(c, collections.Hashable) - assert com.is_hashable(c) - hash(c) # this will not raise - - -def test_ensure_int32(): - values = np.arange(10, dtype=np.int32) - result = com._ensure_int32(values) - assert (result.dtype == np.int32) - - values = np.arange(10, dtype=np.int64) - result = com._ensure_int32(values) - assert (result.dtype == np.int32) - - -def test_is_re(): - passes = re.compile('ad'), - fails = 'x', 2, 3, object() - - for p in passes: - assert com.is_re(p) - - for f in fails: - assert not com.is_re(f) - - -def test_is_recompilable(): - passes = (r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'), - re.compile(r'')) - fails = 1, [], object() - - for p in passes: - assert com.is_re_compilable(p) - - for f in fails: - assert not com.is_re_compilable(f) - - def test_random_state(): import numpy.random as npr # Check with seed @@ -730,83 +188,6 @@ def test_maybe_match_name(): assert (matched == 'y') -class TestMaybe(tm.TestCase): - - def test_maybe_convert_string_to_array(self): - result = com._maybe_convert_string_to_object('x') - tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) - self.assertTrue(result.dtype == object) - - result = com._maybe_convert_string_to_object(1) - self.assertEqual(result, 1) - - arr = np.array(['x', 'y'], dtype=str) - result = com._maybe_convert_string_to_object(arr) - 
tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) - self.assertTrue(result.dtype == object) - - # unicode - arr = np.array(['x', 'y']).astype('U') - result = com._maybe_convert_string_to_object(arr) - tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) - self.assertTrue(result.dtype == object) - - # object - arr = np.array(['x', 2], dtype=object) - result = com._maybe_convert_string_to_object(arr) - tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) - self.assertTrue(result.dtype == object) - - def test_maybe_convert_scalar(self): - - # pass thru - result = com._maybe_convert_scalar('x') - self.assertEqual(result, 'x') - result = com._maybe_convert_scalar(np.array([1])) - self.assertEqual(result, np.array([1])) - - # leave scalar dtype - result = com._maybe_convert_scalar(np.int64(1)) - self.assertEqual(result, np.int64(1)) - result = com._maybe_convert_scalar(np.int32(1)) - self.assertEqual(result, np.int32(1)) - result = com._maybe_convert_scalar(np.float32(1)) - self.assertEqual(result, np.float32(1)) - result = com._maybe_convert_scalar(np.int64(1)) - self.assertEqual(result, np.float64(1)) - - # coerce - result = com._maybe_convert_scalar(1) - self.assertEqual(result, np.int64(1)) - result = com._maybe_convert_scalar(1.0) - self.assertEqual(result, np.float64(1)) - result = com._maybe_convert_scalar(pd.Timestamp('20130101')) - self.assertEqual(result, pd.Timestamp('20130101').value) - result = com._maybe_convert_scalar(datetime(2013, 1, 1)) - self.assertEqual(result, pd.Timestamp('20130101').value) - result = com._maybe_convert_scalar(pd.Timedelta('1 day 1 min')) - self.assertEqual(result, pd.Timedelta('1 day 1 min').value) - - -class TestConvert(tm.TestCase): - - def test_possibly_convert_objects_copy(self): - values = np.array([1, 2]) - - out = convert._possibly_convert_objects(values, copy=False) - self.assertTrue(values is out) - - out = convert._possibly_convert_objects(values, copy=True) - self.assertTrue(values is not out) - - values = np.array(['apply', 'banana']) - out = convert._possibly_convert_objects(values, copy=False) - self.assertTrue(values is out) - - out = convert._possibly_convert_objects(values, copy=True) - self.assertTrue(values is not out) - - def test_dict_compat(): data_datetime64 = {np.datetime64('1990-03-15'): 1, np.datetime64('2015-03-15'): 2} @@ -817,39 +198,6 @@ def test_dict_compat(): assert (com._dict_compat(data_unchanged) == data_unchanged) -def test_is_timedelta(): - assert (com.is_timedelta64_dtype('timedelta64')) - assert (com.is_timedelta64_dtype('timedelta64[ns]')) - assert (not com.is_timedelta64_ns_dtype('timedelta64')) - assert (com.is_timedelta64_ns_dtype('timedelta64[ns]')) - - tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') - assert (com.is_timedelta64_dtype(tdi)) - assert (com.is_timedelta64_ns_dtype(tdi)) - assert (com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) - # Conversion to Int64Index: - assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) - assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) - - -def test_array_equivalent_compat(): - # see gh-13388 - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - assert (com.array_equivalent(m, n, strict_nan=True)) - assert (com.array_equivalent(m, n, strict_nan=False)) - - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', 
float)]) - assert (not com.array_equivalent(m, n, strict_nan=True)) - assert (not com.array_equivalent(m, n, strict_nan=False)) - - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) - assert (not com.array_equivalent(m, n, strict_nan=True)) - assert (not com.array_equivalent(m, n, strict_nan=False)) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 2f4c2b414cc30..a53e79439b017 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -7,12 +7,12 @@ from numpy import nan import pandas as pd +from pandas.types.common import is_scalar from pandas import (Index, Series, DataFrame, Panel, isnull, date_range, period_range, Panel4D) from pandas.core.index import MultiIndex import pandas.formats.printing as printing -import pandas.lib as lib from pandas.compat import range, zip, PY3 from pandas import compat @@ -53,7 +53,7 @@ def _construct(self, shape, value=None, dtype=None, **kwargs): if isinstance(shape, int): shape = tuple([shape] * self._ndim) if value is not None: - if lib.isscalar(value): + if is_scalar(value): if value == 'empty': arr = None diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index bd19a83ce2b64..3a5b0117948b7 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -9,6 +9,7 @@ from datetime import datetime, date +from pandas.types.common import is_list_like import pandas as pd from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range, bdate_range) @@ -16,7 +17,6 @@ iteritems, OrderedDict, PY3) from pandas.util.decorators import cache_readonly from pandas.formats.printing import pprint_thing -import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_is_valid_plot_return_object, slow) @@ -157,7 +157,7 @@ def _check_visible(self, collections, visible=True): """ from matplotlib.collections import Collection if not isinstance(collections, - Collection) and not com.is_list_like(collections): + Collection) and not is_list_like(collections): collections = [collections] for patch in collections: @@ -242,7 +242,7 @@ def _check_text_labels(self, texts, expected): expected : str or list-like which has the same length as texts expected text label, or its list """ - if not com.is_list_like(texts): + if not is_list_like(texts): self.assertEqual(texts.get_text(), expected) else: labels = [t.get_text() for t in texts] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a52f22fe2032a..57d43f22757ea 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -5,7 +5,8 @@ from datetime import datetime from numpy import nan -from pandas import date_range, bdate_range, Timestamp +from pandas.types.common import _ensure_platform_int +from pandas import date_range, bdate_range, Timestamp, isnull from pandas.core.index import Index, MultiIndex, CategoricalIndex from pandas.core.api import Categorical, DataFrame from pandas.core.common import UnsupportedFunctionCall @@ -163,9 +164,9 @@ def test_first_last_nth(self): grouped['B'].nth(0) self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan - self.assertTrue(com.isnull(grouped['B'].first()['foo'])) - self.assertTrue(com.isnull(grouped['B'].last()['foo'])) - self.assertTrue(com.isnull(grouped['B'].nth(0)['foo'])) + 
self.assertTrue(isnull(grouped['B'].first()['foo'])) + self.assertTrue(isnull(grouped['B'].last()['foo'])) + self.assertTrue(isnull(grouped['B'].nth(0)['foo'])) # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) @@ -1079,8 +1080,9 @@ def test_transform_fast(self): grp = df.groupby('id')['val'] values = np.repeat(grp.mean().values, - com._ensure_platform_int(grp.count().values)) + _ensure_platform_int(grp.count().values)) expected = pd.Series(values, index=df.index, name='val') + result = grp.transform(np.mean) assert_series_equal(result, expected) diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py deleted file mode 100644 index 5f016322f101f..0000000000000 --- a/pandas/tests/test_infer_and_convert.py +++ /dev/null @@ -1,653 +0,0 @@ -# -*- coding: utf-8 -*- - -from datetime import datetime, timedelta, date, time - -import numpy as np -import pandas as pd -import pandas.lib as lib -import pandas.util.testing as tm -from pandas import Index - -from pandas.compat import long, u, PY2 - - -class TestInference(tm.TestCase): - - def test_infer_dtype_bytes(self): - compare = 'string' if PY2 else 'bytes' - - # string array of bytes - arr = np.array(list('abc'), dtype='S1') - self.assertEqual(pd.lib.infer_dtype(arr), compare) - - # object array of bytes - arr = arr.astype(object) - self.assertEqual(pd.lib.infer_dtype(arr), compare) - - def test_isinf_scalar(self): - # GH 11352 - self.assertTrue(lib.isposinf_scalar(float('inf'))) - self.assertTrue(lib.isposinf_scalar(np.inf)) - self.assertFalse(lib.isposinf_scalar(-np.inf)) - self.assertFalse(lib.isposinf_scalar(1)) - self.assertFalse(lib.isposinf_scalar('a')) - - self.assertTrue(lib.isneginf_scalar(float('-inf'))) - self.assertTrue(lib.isneginf_scalar(-np.inf)) - self.assertFalse(lib.isneginf_scalar(np.inf)) - self.assertFalse(lib.isneginf_scalar(1)) - self.assertFalse(lib.isneginf_scalar('a')) - - def test_maybe_convert_numeric_infinities(self): - # see gh-13274 - infinities = ['inf', 'inF', 'iNf', 'Inf', - 'iNF', 'InF', 'INf', 'INF'] - na_values = set(['', 'NULL', 'nan']) - - pos = np.array(['inf'], dtype=np.float64) - neg = np.array(['-inf'], dtype=np.float64) - - msg = "Unable to parse string" - - for infinity in infinities: - for maybe_int in (True, False): - out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['-' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, neg) - - out = lib.maybe_convert_numeric( - np.array([u(infinity)], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['+' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - # too many characters - with tm.assertRaisesRegexp(ValueError, msg): - lib.maybe_convert_numeric( - np.array(['foo_' + infinity], dtype=object), - na_values, maybe_int) - - def test_maybe_convert_numeric_post_floatify_nan(self): - # see gh-13314 - data = np.array(['1.200', '-999.000', '4.500'], dtype=object) - expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) - nan_values = set([-999, -999.0]) - - for coerce_type in (True, False): - out = lib.maybe_convert_numeric(data, nan_values, coerce_type) - tm.assert_numpy_array_equal(out, expected) - - def test_convert_infs(self): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') - result = 
lib.maybe_convert_numeric(arr, set(), False) - self.assertTrue(result.dtype == np.float64) - - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - self.assertTrue(result.dtype == np.float64) - - def test_scientific_no_exponent(self): - # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False, True) - self.assertTrue(np.all(np.isnan(result))) - - def test_convert_non_hashable(self): - # GH13324 - # make sure that we are handing non-hashables - arr = np.array([[10.0, 2], 1.0, 'apple']) - result = lib.maybe_convert_numeric(arr, set(), False, True) - tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) - - -class TestTypeInference(tm.TestCase): - _multiprocess_can_split_ = True - - def test_length_zero(self): - result = lib.infer_dtype(np.array([], dtype='i4')) - self.assertEqual(result, 'integer') - - result = lib.infer_dtype([]) - self.assertEqual(result, 'empty') - - def test_integers(self): - arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - def test_string(self): - pass - - def test_unicode(self): - pass - - def test_datetime(self): - - dates = [datetime(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'datetime64') - - def test_infer_dtype_datetime(self): - - arr = np.array([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([np.datetime64('2011-01-01'), - np.datetime64('2011-01-01')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # starts with nan - for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([n, np.datetime64('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([n, datetime(2011, 1, 1)]) - 
self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([n, pd.Timestamp('2011-01-02'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([n, np.datetime64('2011-01-02'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([n, datetime(2011, 1, 1), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # different type of nat - arr = np.array([np.timedelta64('nat'), - np.datetime64('2011-01-02')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.datetime64('2011-01-02'), - np.timedelta64('nat')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - # mixed datetime - arr = np.array([datetime(2011, 1, 1), - pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # should be datetime? - arr = np.array([np.datetime64('2011-01-01'), - pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([pd.Timestamp('2011-01-02'), - np.datetime64('2011-01-01')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed-integer') - - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - def test_infer_dtype_timedelta(self): - - arr = np.array([pd.Timedelta('1 days'), - pd.Timedelta('2 days')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([np.timedelta64(1, 'D'), - np.timedelta64(2, 'D')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([timedelta(1), timedelta(2)]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - # starts with nan - for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Timedelta('1 days')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, np.timedelta64(1, 'D')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, timedelta(1)]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, pd.Timedelta('1 days'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, np.timedelta64(1, 'D'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, timedelta(1), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - # different type of nat - arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], - dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], - dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - def test_infer_dtype_all_nan_nat_like(self): - arr = np.array([np.nan, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'floating') - - # nan and None mix are result in mixed - arr = np.array([np.nan, np.nan, None]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([None, np.nan, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - # pd.NaT - arr = np.array([pd.NaT]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([pd.NaT, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([np.nan, pd.NaT]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = 
np.array([np.nan, pd.NaT, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([None, pd.NaT, None]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # np.datetime64(nat) - arr = np.array([np.datetime64('nat')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.datetime64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([np.timedelta64('nat')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.timedelta64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - # datetime / timedelta mixed - arr = np.array([pd.NaT, np.datetime64('nat'), - np.timedelta64('nat'), np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], - dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - def test_is_datetimelike_array_all_nan_nat_like(self): - arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) - self.assertTrue(pd.lib.is_datetime_array(arr)) - self.assertTrue(pd.lib.is_datetime64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_array(arr)) - self.assertFalse(pd.lib.is_timedelta64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) - self.assertFalse(pd.lib.is_datetime_array(arr)) - self.assertFalse(pd.lib.is_datetime64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_array(arr)) - self.assertTrue(pd.lib.is_timedelta64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), - np.timedelta64('nat')]) - self.assertFalse(pd.lib.is_datetime_array(arr)) - self.assertFalse(pd.lib.is_datetime64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_array(arr)) - self.assertFalse(pd.lib.is_timedelta64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, pd.NaT]) - self.assertTrue(pd.lib.is_datetime_array(arr)) - self.assertTrue(pd.lib.is_datetime64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_array(arr)) - self.assertTrue(pd.lib.is_timedelta64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, np.nan], dtype=object) - self.assertFalse(pd.lib.is_datetime_array(arr)) - self.assertFalse(pd.lib.is_datetime64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_array(arr)) - self.assertFalse(pd.lib.is_timedelta64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - def test_date(self): - - dates = [date(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'date') - - def test_to_object_array_tuples(self): - r = (5, 6) - values = [r] - result = lib.to_object_array_tuples(values) - - try: - # make sure record array works - from collections import namedtuple - record = namedtuple('record', 'x y') - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass - - def test_to_object_array_width(self): - # see gh-13320 - rows = [[1, 2, 3], [4, 5, 6]] - - expected = 
np.array(rows, dtype=object) - out = lib.to_object_array(rows) - tm.assert_numpy_array_equal(out, expected) - - expected = np.array(rows, dtype=object) - out = lib.to_object_array(rows, min_width=1) - tm.assert_numpy_array_equal(out, expected) - - expected = np.array([[1, 2, 3, None, None], - [4, 5, 6, None, None]], dtype=object) - out = lib.to_object_array(rows, min_width=5) - tm.assert_numpy_array_equal(out, expected) - - def test_object(self): - - # GH 7431 - # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - def test_categorical(self): - - # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list('abc')) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - def test_is_period(self): - self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) - self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) - self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) - self.assertFalse(lib.is_period(1)) - self.assertFalse(lib.is_period(np.nan)) - - -class TestConvert(tm.TestCase): - - def test_convert_objects(self): - arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O') - result = lib.maybe_convert_objects(arr) - self.assertTrue(result.dtype == np.object_) - - def test_convert_objects_ints(self): - # test that we can detect many kinds of integers - dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - - for dtype_str in dtypes: - arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') - self.assertTrue(arr[0].dtype == np.dtype(dtype_str)) - result = lib.maybe_convert_objects(arr) - self.assertTrue(issubclass(result.dtype.type, np.integer)) - - def test_convert_objects_complex_number(self): - for dtype in np.sctypes['complex']: - arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') - self.assertTrue(arr[0].dtype == np.dtype(dtype)) - result = lib.maybe_convert_objects(arr) - self.assertTrue(issubclass(result.dtype.type, np.complexfloating)) - - -class Testisscalar(tm.TestCase): - - def test_isscalar_builtin_scalars(self): - self.assertTrue(lib.isscalar(None)) - self.assertTrue(lib.isscalar(True)) - self.assertTrue(lib.isscalar(False)) - self.assertTrue(lib.isscalar(0.)) - self.assertTrue(lib.isscalar(np.nan)) - self.assertTrue(lib.isscalar('foobar')) - self.assertTrue(lib.isscalar(b'foobar')) - self.assertTrue(lib.isscalar(u('efoobar'))) - self.assertTrue(lib.isscalar(datetime(2014, 1, 1))) - self.assertTrue(lib.isscalar(date(2014, 1, 1))) - self.assertTrue(lib.isscalar(time(12, 0))) - self.assertTrue(lib.isscalar(timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.NaT)) - - def test_isscalar_builtin_nonscalars(self): - self.assertFalse(lib.isscalar({})) - self.assertFalse(lib.isscalar([])) - self.assertFalse(lib.isscalar([1])) - self.assertFalse(lib.isscalar(())) - self.assertFalse(lib.isscalar((1, ))) - self.assertFalse(lib.isscalar(slice(None))) - self.assertFalse(lib.isscalar(Ellipsis)) - - def test_isscalar_numpy_array_scalars(self): - self.assertTrue(lib.isscalar(np.int64(1))) - self.assertTrue(lib.isscalar(np.float64(1.))) - self.assertTrue(lib.isscalar(np.int32(1))) 
- self.assertTrue(lib.isscalar(np.object_('foobar'))) - self.assertTrue(lib.isscalar(np.str_('foobar'))) - self.assertTrue(lib.isscalar(np.unicode_(u('foobar')))) - self.assertTrue(lib.isscalar(np.bytes_(b'foobar'))) - self.assertTrue(lib.isscalar(np.datetime64('2014-01-01'))) - self.assertTrue(lib.isscalar(np.timedelta64(1, 'h'))) - - def test_isscalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: - self.assertFalse(lib.isscalar(zerodim)) - self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim))) - - def test_isscalar_numpy_arrays(self): - self.assertFalse(lib.isscalar(np.array([]))) - self.assertFalse(lib.isscalar(np.array([[]]))) - self.assertFalse(lib.isscalar(np.matrix('1; 2'))) - - def test_isscalar_pandas_scalars(self): - self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01'))) - self.assertTrue(lib.isscalar(pd.Timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.Period('2014-01-01'))) - - def test_lisscalar_pandas_containers(self): - self.assertFalse(lib.isscalar(pd.Series())) - self.assertFalse(lib.isscalar(pd.Series([1]))) - self.assertFalse(lib.isscalar(pd.DataFrame())) - self.assertFalse(lib.isscalar(pd.DataFrame([[1]]))) - self.assertFalse(lib.isscalar(pd.Panel())) - self.assertFalse(lib.isscalar(pd.Panel([[[1]]]))) - self.assertFalse(lib.isscalar(pd.Index([]))) - self.assertFalse(lib.isscalar(pd.Index([1]))) - - -class TestParseSQL(tm.TestCase): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - self.assert_numpy_array_equal(result, expected) - self.assert_numpy_array_equal(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - self.assert_numpy_array_equal(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) 
- self.assert_numpy_array_equal(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_downcast_int64(self): - from pandas.parser import na_values - - arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) - expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) - - # default argument - result = lib.downcast_int64(arr, na_values) - self.assert_numpy_array_equal(result, expected) - - result = lib.downcast_int64(arr, na_values, use_unsigned=False) - self.assert_numpy_array_equal(result, expected) - - expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) - result = lib.downcast_int64(arr, na_values, use_unsigned=True) - self.assert_numpy_array_equal(result, expected) - - # still cast to int8 despite use_unsigned=True - # because of the negative number as an element - arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) - expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) - result = lib.downcast_int64(arr, na_values, use_unsigned=True) - self.assert_numpy_array_equal(result, expected) - - arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) - expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) - result = lib.downcast_int64(arr, na_values) - self.assert_numpy_array_equal(result, expected) - - int8_na = na_values[np.int8] - int64_na = na_values[np.int64] - arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) - expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) - result = lib.downcast_int64(arr, na_values) - self.assert_numpy_array_equal(result, expected) - - -if __name__ == '__main__': - import nose - - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 10a6bb5c75b01..84d7226f1b2f5 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -222,6 +222,7 @@ def test_duplicated_with_nas(): expected = trues + trues assert (np.array_equal(result, expected)) + if __name__ == '__main__': import nose diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1b1db90ea713d..f3b0becccf596 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -10,6 +10,7 @@ from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp +from pandas.types.common import is_float_dtype, is_integer_dtype from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assertRaisesRegexp) import pandas.core.common as com @@ -787,8 +788,8 @@ def test_delevel_infer_dtype(self): df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'], index=index) deleveled = df.reset_index() - self.assertTrue(com.is_integer_dtype(deleveled['prm1'])) - self.assertTrue(com.is_float_dtype(deleveled['prm2'])) + self.assertTrue(is_integer_dtype(deleveled['prm1'])) + self.assertTrue(is_float_dtype(deleveled['prm2'])) def test_reset_index_with_drop(self): deleveled = self.ymd.reset_index(drop=True) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 
904bedde03312..eeeddc278c714 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,8 +5,8 @@ import warnings import numpy as np -from pandas import Series -from pandas.core.common import isnull, is_integer_dtype +from pandas import Series, isnull +from pandas.types.common import is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index b1f09ad2685e3..f2e13867d3bf0 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -10,12 +10,13 @@ import numpy as np import pandas as pd +from pandas.types.common import is_float_dtype from pandas import Series, DataFrame, Index, isnull, notnull, pivot, MultiIndex from pandas.core.datetools import bday from pandas.core.nanops import nanall, nanany from pandas.core.panel import Panel from pandas.core.series import remove_na -import pandas.core.common as com + from pandas.formats.printing import pprint_thing from pandas import compat from pandas.compat import range, lrange, StringIO, OrderedDict, signature @@ -903,7 +904,7 @@ def test_set_value(self): self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['ItemE'].values)) + self.assertTrue(is_float_dtype(res3['ItemE'].values)) with tm.assertRaisesRegexp(TypeError, "There must be an argument for each axis" " plus the value provided"): diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 607048df29faa..16a55c7ec4aeb 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -6,12 +6,12 @@ import numpy as np +from pandas.types.common import is_float_dtype from pandas import Series, Index, isnull, notnull from pandas.core.datetools import bday from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D from pandas.core.series import remove_na -import pandas.core.common as com from pandas.util.testing import (assert_panel_equal, assert_panel4d_equal, @@ -595,7 +595,7 @@ def test_set_value(self): self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['l4'].values)) + self.assertTrue(is_float_dtype(res3['l4'].values)) class TestPanel4d(tm.TestCase, CheckIndexing, SafeForSparse, diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 67d171bb8efda..4d23bed620265 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -12,8 +12,7 @@ from pandas.compat import range, u import pandas.compat as compat -from pandas import (Index, Series, DataFrame, isnull, MultiIndex) -import pandas.core.common as com +from pandas import (Index, Series, DataFrame, isnull, MultiIndex, notnull) from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -1350,7 +1349,7 @@ def test_len(self): values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) result = values.str.len() - exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + exp = values.map(lambda x: len(x) if notnull(x) else NA) tm.assert_series_equal(result, exp) # mixed @@ -1368,7 +1367,7 @@ def test_len(self): 'fooooooo')]) result = values.str.len() - exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + exp = values.map(lambda x: len(x) if notnull(x) else NA) tm.assert_series_equal(result, exp) def test_findall(self): diff --git 
a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py new file mode 100644 index 0000000000000..dd3f07ea8157f --- /dev/null +++ b/pandas/tests/types/test_cast.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +""" +These test the private routines in types/cast.py + +""" + + +import nose +from datetime import datetime +import numpy as np + +from pandas import Timedelta, Timestamp +from pandas.types.cast import (_possibly_downcast_to_dtype, + _possibly_convert_objects, + _infer_dtype_from_scalar, + _maybe_convert_string_to_object, + _maybe_convert_scalar) +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +def test_downcast_conv(): + # test downcasting + + arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + assert (np.array_equal(result, arr)) + + arr = np.array([8., 8., 8., 8., 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + arr = np.array([8., 8., 8., 8., 9.0000000000005]) + result = _possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + # conversions + + expected = np.array([1, 2]) + for dtype in [np.float64, object, np.int64]: + arr = np.array([1.0, 2.0], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected, check_dtype=False) + + for dtype in [np.float64, object]: + expected = np.array([1.0, 2.0, np.nan], dtype=dtype) + arr = np.array([1.0, 2.0, np.nan], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected) + + # empties + for dtype in [np.int32, np.float64, np.float32, np.bool_, + np.int64, object]: + arr = np.array([], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'int64') + tm.assert_almost_equal(result, np.array([], dtype=np.int64)) + assert result.dtype == np.int64 + + +class TestInferDtype(tm.TestCase): + + def test_infer_dtype_from_scalar(self): + # Test that _infer_dtype_from_scalar is returning correct dtype for int + # and float. 
+ + for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, + np.int32, np.uint64, np.int64]: + data = dtypec(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, type(data)) + + data = 12 + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.int64) + + for dtypec in [np.float16, np.float32, np.float64]: + data = dtypec(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, dtypec) + + data = np.float(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.float64) + + for data in [True, False]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.bool_) + + for data in [np.complex64(1), np.complex128(1)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.complex_) + + import datetime + for data in [np.datetime64(1, 'ns'), Timestamp(1), + datetime.datetime(2000, 1, 1, 0, 0)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, 'M8[ns]') + + for data in [np.timedelta64(1, 'ns'), Timedelta(1), + datetime.timedelta(1)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, 'm8[ns]') + + for data in [datetime.date(2000, 1, 1), + Timestamp(1, tz='US/Eastern'), 'foo']: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.object_) + + +class TestMaybe(tm.TestCase): + + def test_maybe_convert_string_to_array(self): + result = _maybe_convert_string_to_object('x') + tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) + self.assertTrue(result.dtype == object) + + result = _maybe_convert_string_to_object(1) + self.assertEqual(result, 1) + + arr = np.array(['x', 'y'], dtype=str) + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # unicode + arr = np.array(['x', 'y']).astype('U') + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # object + arr = np.array(['x', 2], dtype=object) + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) + self.assertTrue(result.dtype == object) + + def test_maybe_convert_scalar(self): + + # pass thru + result = _maybe_convert_scalar('x') + self.assertEqual(result, 'x') + result = _maybe_convert_scalar(np.array([1])) + self.assertEqual(result, np.array([1])) + + # leave scalar dtype + result = _maybe_convert_scalar(np.int64(1)) + self.assertEqual(result, np.int64(1)) + result = _maybe_convert_scalar(np.int32(1)) + self.assertEqual(result, np.int32(1)) + result = _maybe_convert_scalar(np.float32(1)) + self.assertEqual(result, np.float32(1)) + result = _maybe_convert_scalar(np.int64(1)) + self.assertEqual(result, np.float64(1)) + + # coerce + result = _maybe_convert_scalar(1) + self.assertEqual(result, np.int64(1)) + result = _maybe_convert_scalar(1.0) + self.assertEqual(result, np.float64(1)) + result = _maybe_convert_scalar(Timestamp('20130101')) + self.assertEqual(result, Timestamp('20130101').value) + result = _maybe_convert_scalar(datetime(2013, 1, 1)) + self.assertEqual(result, Timestamp('20130101').value) + result = _maybe_convert_scalar(Timedelta('1 day 1 min')) + self.assertEqual(result, Timedelta('1 day 1 min').value) + + +class TestConvert(tm.TestCase): + + def test_possibly_convert_objects_copy(self): + values = np.array([1, 2]) + + out = 
_possibly_convert_objects(values, copy=False) + self.assertTrue(values is out) + + out = _possibly_convert_objects(values, copy=True) + self.assertTrue(values is not out) + + values = np.array(['apply', 'banana']) + out = _possibly_convert_objects(values, copy=False) + self.assertTrue(values is out) + + out = _possibly_convert_objects(values, copy=True) + self.assertTrue(values is not out) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py new file mode 100644 index 0000000000000..0a586410ad5a0 --- /dev/null +++ b/pandas/tests/types/test_common.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np + +from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype +from pandas.types.common import pandas_dtype + +_multiprocess_can_split_ = True + + +def test_pandas_dtype(): + + assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( + 'datetime64[ns, US/Eastern]') + assert pandas_dtype('category') == CategoricalDtype() + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + assert pandas_dtype(dtype) == np.dtype(dtype) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index d48b9baf64777..1743e80ae01a9 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -4,13 +4,14 @@ import nose import numpy as np from pandas import Series, Categorical, date_range -import pandas.core.common as com -from pandas.types.api import CategoricalDtype -from pandas.core.common import (is_categorical_dtype, - is_categorical, DatetimeTZDtype, - is_datetime64tz_dtype, is_datetimetz, - is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype) + +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (is_categorical_dtype, + is_categorical, DatetimeTZDtype, + is_datetime64tz_dtype, is_datetimetz, + is_dtype_equal, is_datetime64_ns_dtype, + is_datetime64_dtype, + _coerce_to_dtype) import pandas.util.testing as tm _multiprocess_can_split_ = True @@ -124,9 +125,9 @@ def test_subclass(self): self.assertTrue(issubclass(type(a), type(b))) def test_coerce_to_dtype(self): - self.assertEqual(com._coerce_to_dtype('datetime64[ns, US/Eastern]'), + self.assertEqual(_coerce_to_dtype('datetime64[ns, US/Eastern]'), DatetimeTZDtype('ns', 'US/Eastern')) - self.assertEqual(com._coerce_to_dtype('datetime64[ns, Asia/Tokyo]'), + self.assertEqual(_coerce_to_dtype('datetime64[ns, Asia/Tokyo]'), DatetimeTZDtype('ns', 'Asia/Tokyo')) def test_compat(self): diff --git a/pandas/tests/types/test_generic.py b/pandas/tests/types/test_generic.py index 5549a3a376992..89913de6f6069 100644 --- a/pandas/tests/types/test_generic.py +++ b/pandas/tests/types/test_generic.py @@ -3,8 +3,8 @@ import nose import numpy as np import pandas as pd -import pandas.core.common as com import pandas.util.testing as tm +from pandas.types import generic as gt _multiprocess_can_split_ = True @@ -22,24 +22,24 @@ class TestABCClasses(tm.TestCase): sparse_array = pd.SparseArray(np.random.randn(10)) def test_abc_types(self): - self.assertIsInstance(pd.Index(['a', 'b', 'c']), com.ABCIndex) - self.assertIsInstance(pd.Int64Index([1, 2, 3]), com.ABCInt64Index) - self.assertIsInstance(pd.Float64Index([1, 2, 3]), com.ABCFloat64Index) - 
self.assertIsInstance(self.multi_index, com.ABCMultiIndex) - self.assertIsInstance(self.datetime_index, com.ABCDatetimeIndex) - self.assertIsInstance(self.timedelta_index, com.ABCTimedeltaIndex) - self.assertIsInstance(self.period_index, com.ABCPeriodIndex) + self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex) + self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index) + self.assertIsInstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index) + self.assertIsInstance(self.multi_index, gt.ABCMultiIndex) + self.assertIsInstance(self.datetime_index, gt.ABCDatetimeIndex) + self.assertIsInstance(self.timedelta_index, gt.ABCTimedeltaIndex) + self.assertIsInstance(self.period_index, gt.ABCPeriodIndex) self.assertIsInstance(self.categorical_df.index, - com.ABCCategoricalIndex) - self.assertIsInstance(pd.Index(['a', 'b', 'c']), com.ABCIndexClass) - self.assertIsInstance(pd.Int64Index([1, 2, 3]), com.ABCIndexClass) - self.assertIsInstance(pd.Series([1, 2, 3]), com.ABCSeries) - self.assertIsInstance(self.df, com.ABCDataFrame) - self.assertIsInstance(self.df.to_panel(), com.ABCPanel) - self.assertIsInstance(self.sparse_series, com.ABCSparseSeries) - self.assertIsInstance(self.sparse_array, com.ABCSparseArray) - self.assertIsInstance(self.categorical, com.ABCCategorical) - self.assertIsInstance(pd.Period('2012', freq='A-DEC'), com.ABCPeriod) + gt.ABCCategoricalIndex) + self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndexClass) + self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass) + self.assertIsInstance(pd.Series([1, 2, 3]), gt.ABCSeries) + self.assertIsInstance(self.df, gt.ABCDataFrame) + self.assertIsInstance(self.df.to_panel(), gt.ABCPanel) + self.assertIsInstance(self.sparse_series, gt.ABCSparseSeries) + self.assertIsInstance(self.sparse_array, gt.ABCSparseArray) + self.assertIsInstance(self.categorical, gt.ABCCategorical) + self.assertIsInstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod) if __name__ == '__main__': diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py new file mode 100644 index 0000000000000..34d10ee9dfa42 --- /dev/null +++ b/pandas/tests/types/test_inference.py @@ -0,0 +1,820 @@ +# -*- coding: utf-8 -*- + +""" +These the test the public routines exposed in types/common.py +related to inference and not otherwise tested in types/test_common.py + +""" + +import nose +import collections +import re +from datetime import datetime, date, timedelta, time +import numpy as np + +import pandas as pd +from pandas import lib, tslib +from pandas import (Series, Index, DataFrame, Timedelta, + DatetimeIndex, TimedeltaIndex, Timestamp, + Panel, Period) +from pandas.compat import u, PY2, lrange +from pandas.types import inference +from pandas.types.common import (is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_number, + is_integer, + is_float, + is_bool, + is_scalar, + _ensure_int32) +from pandas.types.missing import isnull +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +def test_is_sequence(): + is_seq = inference.is_sequence + assert (is_seq((1, 2))) + assert (is_seq([1, 2])) + assert (not is_seq("abcd")) + assert (not is_seq(u("abcd"))) + assert (not is_seq(np.int64)) + + class A(object): + + def __getitem__(self): + return 1 + + assert (not is_seq(A())) + + +def test_is_list_like(): + passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), + Series([]), Series(['a']).str) + fails = (1, '2', object()) + + for p in passes: + assert inference.is_list_like(p) + + for f in 
fails: + assert not inference.is_list_like(f) + + +def test_is_dict_like(): + passes = [{}, {'A': 1}, Series([1])] + fails = ['1', 1, [1, 2], (1, 2), range(2), Index([1])] + + for p in passes: + assert inference.is_dict_like(p) + + for f in fails: + assert not inference.is_dict_like(f) + + +def test_is_named_tuple(): + passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) + fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) + + for p in passes: + assert inference.is_named_tuple(p) + + for f in fails: + assert not inference.is_named_tuple(f) + + +def test_is_hashable(): + + # all new-style classes are hashable by default + class HashableClass(object): + pass + + class UnhashableClass1(object): + __hash__ = None + + class UnhashableClass2(object): + + def __hash__(self): + raise TypeError("Not hashable") + + hashable = (1, + 3.14, + np.float64(3.14), + 'a', + tuple(), + (1, ), + HashableClass(), ) + not_hashable = ([], UnhashableClass1(), ) + abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) + + for i in hashable: + assert inference.is_hashable(i) + for i in not_hashable: + assert not inference.is_hashable(i) + for i in abc_hashable_not_really_hashable: + assert not inference.is_hashable(i) + + # numpy.array is no longer collections.Hashable as of + # https://github.com/numpy/numpy/pull/5326, just test + # is_hashable() + assert not inference.is_hashable(np.array([])) + + # old-style classes in Python 2 don't appear hashable to + # collections.Hashable but also seem to support hash() by default + if PY2: + + class OldStyleClass(): + pass + + c = OldStyleClass() + assert not isinstance(c, collections.Hashable) + assert inference.is_hashable(c) + hash(c) # this will not raise + + +def test_is_re(): + passes = re.compile('ad'), + fails = 'x', 2, 3, object() + + for p in passes: + assert inference.is_re(p) + + for f in fails: + assert not inference.is_re(f) + + +def test_is_recompilable(): + passes = (r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'), + re.compile(r'')) + fails = 1, [], object() + + for p in passes: + assert inference.is_re_compilable(p) + + for f in fails: + assert not inference.is_re_compilable(f) + + +class TestInference(tm.TestCase): + + def test_infer_dtype_bytes(self): + compare = 'string' if PY2 else 'bytes' + + # string array of bytes + arr = np.array(list('abc'), dtype='S1') + self.assertEqual(lib.infer_dtype(arr), compare) + + # object array of bytes + arr = arr.astype(object) + self.assertEqual(lib.infer_dtype(arr), compare) + + def test_isinf_scalar(self): + # GH 11352 + self.assertTrue(lib.isposinf_scalar(float('inf'))) + self.assertTrue(lib.isposinf_scalar(np.inf)) + self.assertFalse(lib.isposinf_scalar(-np.inf)) + self.assertFalse(lib.isposinf_scalar(1)) + self.assertFalse(lib.isposinf_scalar('a')) + + self.assertTrue(lib.isneginf_scalar(float('-inf'))) + self.assertTrue(lib.isneginf_scalar(-np.inf)) + self.assertFalse(lib.isneginf_scalar(np.inf)) + self.assertFalse(lib.isneginf_scalar(1)) + self.assertFalse(lib.isneginf_scalar('a')) + + def test_maybe_convert_numeric_infinities(self): + # see gh-13274 + infinities = ['inf', 'inF', 'iNf', 'Inf', + 'iNF', 'InF', 'INf', 'INF'] + na_values = set(['', 'NULL', 'nan']) + + pos = np.array(['inf'], dtype=np.float64) + neg = np.array(['-inf'], dtype=np.float64) + + msg = "Unable to parse string" + + for infinity in infinities: + for maybe_int in (True, False): + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), + na_values, maybe_int) + 
tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['-' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([u(infinity)], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['+' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric( + np.array(['foo_' + infinity], dtype=object), + na_values, maybe_int) + + def test_maybe_convert_numeric_post_floatify_nan(self): + # see gh-13314 + data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) + nan_values = set([-999, -999.0]) + + for coerce_type in (True, False): + out = lib.maybe_convert_numeric(data, nan_values, coerce_type) + tm.assert_numpy_array_equal(out, expected) + + def test_convert_infs(self): + arr = np.array(['inf', 'inf', 'inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + def test_scientific_no_exponent(self): + # See PR 12215 + arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False, True) + self.assertTrue(np.all(np.isnan(result))) + + def test_convert_non_hashable(self): + # GH13324 + # make sure that we are handing non-hashables + arr = np.array([[10.0, 2], 1.0, 'apple']) + result = lib.maybe_convert_numeric(arr, set(), False, True) + tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + + +class TestTypeInference(tm.TestCase): + _multiprocess_can_split_ = True + + def test_length_zero(self): + result = lib.infer_dtype(np.array([], dtype='i4')) + self.assertEqual(result, 'integer') + + result = lib.infer_dtype([]) + self.assertEqual(result, 'empty') + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='i4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([True, False, True, 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + def test_floats(self): + arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='f4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 
'floating') + + arr = np.array([1, 2, 3, 4, 5], dtype='f8') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + def test_string(self): + pass + + def test_unicode(self): + pass + + def test_datetime(self): + + dates = [datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'datetime64') + + def test_infer_dtype_datetime(self): + + arr = np.array([Timestamp('2011-01-01'), + Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.datetime64('2011-01-01'), + np.datetime64('2011-01-01')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1)]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, pd.Timestamp('2011-01-02'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # different type of nat + arr = np.array([np.timedelta64('nat'), + np.datetime64('2011-01-02')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.datetime64('2011-01-02'), + np.timedelta64('nat')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + # mixed datetime + arr = np.array([datetime(2011, 1, 1), + pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # should be datetime? 
+ arr = np.array([np.datetime64('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Timestamp('2011-01-02'), + np.datetime64('2011-01-01')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) + self.assertEqual(lib.infer_dtype(arr), 'mixed-integer') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_timedelta(self): + + arr = np.array([pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([np.timedelta64(1, 'D'), + np.timedelta64(2, 'D')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([timedelta(1), timedelta(2)]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, Timedelta('1 days')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1)]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, pd.Timedelta('1 days'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # different type of nat + arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'floating') + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([None, np.nan, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + # pd.NaT + arr = np.array([pd.NaT]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([pd.NaT, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([None, pd.NaT, None]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # np.datetime64(nat) + arr = np.array([np.datetime64('nat')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([np.timedelta64('nat')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # datetime / timedelta mixed 
+ arr = np.array([pd.NaT, np.datetime64('nat'), + np.timedelta64('nat'), np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + self.assertTrue(lib.is_datetime_array(arr)) + self.assertTrue(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertTrue(lib.is_timedelta_array(arr)) + self.assertTrue(lib.is_timedelta64_array(arr)) + self.assertTrue(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), + np.timedelta64('nat')]) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT]) + self.assertTrue(lib.is_datetime_array(arr)) + self.assertTrue(lib.is_datetime64_array(arr)) + self.assertTrue(lib.is_timedelta_array(arr)) + self.assertTrue(lib.is_timedelta64_array(arr)) + self.assertTrue(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, np.nan], dtype=object) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + def test_date(self): + + dates = [date(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'date') + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + + try: + # make sure record array works + from collections import namedtuple + record = namedtuple('record', 'x y') + r = record(5, 6) + values = [r] + result = lib.to_object_array_tuples(values) # noqa + except ImportError: + pass + + def test_object(self): + + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + def test_to_object_array_width(self): + # see gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array([[1, 2, 3, None, None], + [4, 5, 6, None, None]], dtype=object) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + + def test_is_period(self): + self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) + self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) + self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) + self.assertFalse(lib.is_period(1)) + self.assertFalse(lib.is_period(np.nan)) + + def test_categorical(self): + + # GH 8974 + from pandas import Categorical, Series + arr = Categorical(list('abc')) + result 
= lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + +class TestNumberScalar(tm.TestCase): + + def test_is_number(self): + + self.assertTrue(is_number(True)) + self.assertTrue(is_number(1)) + self.assertTrue(is_number(1.1)) + self.assertTrue(is_number(1 + 3j)) + self.assertTrue(is_number(np.bool(False))) + self.assertTrue(is_number(np.int64(1))) + self.assertTrue(is_number(np.float64(1.1))) + self.assertTrue(is_number(np.complex128(1 + 3j))) + self.assertTrue(is_number(np.nan)) + + self.assertFalse(is_number(None)) + self.assertFalse(is_number('x')) + self.assertFalse(is_number(datetime(2011, 1, 1))) + self.assertFalse(is_number(np.datetime64('2011-01-01'))) + self.assertFalse(is_number(Timestamp('2011-01-01'))) + self.assertFalse(is_number(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_number(timedelta(1000))) + self.assertFalse(is_number(Timedelta('1 days'))) + + # questionable + self.assertFalse(is_number(np.bool_(False))) + self.assertTrue(is_number(np.timedelta64(1, 'D'))) + + def test_is_bool(self): + self.assertTrue(is_bool(True)) + self.assertTrue(is_bool(np.bool(False))) + self.assertTrue(is_bool(np.bool_(False))) + + self.assertFalse(is_bool(1)) + self.assertFalse(is_bool(1.1)) + self.assertFalse(is_bool(1 + 3j)) + self.assertFalse(is_bool(np.int64(1))) + self.assertFalse(is_bool(np.float64(1.1))) + self.assertFalse(is_bool(np.complex128(1 + 3j))) + self.assertFalse(is_bool(np.nan)) + self.assertFalse(is_bool(None)) + self.assertFalse(is_bool('x')) + self.assertFalse(is_bool(datetime(2011, 1, 1))) + self.assertFalse(is_bool(np.datetime64('2011-01-01'))) + self.assertFalse(is_bool(Timestamp('2011-01-01'))) + self.assertFalse(is_bool(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_bool(timedelta(1000))) + self.assertFalse(is_bool(np.timedelta64(1, 'D'))) + self.assertFalse(is_bool(Timedelta('1 days'))) + + def test_is_integer(self): + self.assertTrue(is_integer(1)) + self.assertTrue(is_integer(np.int64(1))) + + self.assertFalse(is_integer(True)) + self.assertFalse(is_integer(1.1)) + self.assertFalse(is_integer(1 + 3j)) + self.assertFalse(is_integer(np.bool(False))) + self.assertFalse(is_integer(np.bool_(False))) + self.assertFalse(is_integer(np.float64(1.1))) + self.assertFalse(is_integer(np.complex128(1 + 3j))) + self.assertFalse(is_integer(np.nan)) + self.assertFalse(is_integer(None)) + self.assertFalse(is_integer('x')) + self.assertFalse(is_integer(datetime(2011, 1, 1))) + self.assertFalse(is_integer(np.datetime64('2011-01-01'))) + self.assertFalse(is_integer(Timestamp('2011-01-01'))) + self.assertFalse(is_integer(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_integer(timedelta(1000))) + self.assertFalse(is_integer(Timedelta('1 days'))) + + # questionable + self.assertTrue(is_integer(np.timedelta64(1, 'D'))) + + def test_is_float(self): + self.assertTrue(is_float(1.1)) + self.assertTrue(is_float(np.float64(1.1))) + self.assertTrue(is_float(np.nan)) + + self.assertFalse(is_float(True)) + self.assertFalse(is_float(1)) + self.assertFalse(is_float(1 + 3j)) + self.assertFalse(is_float(np.bool(False))) + self.assertFalse(is_float(np.bool_(False))) + self.assertFalse(is_float(np.int64(1))) + 
self.assertFalse(is_float(np.complex128(1 + 3j))) + self.assertFalse(is_float(None)) + self.assertFalse(is_float('x')) + self.assertFalse(is_float(datetime(2011, 1, 1))) + self.assertFalse(is_float(np.datetime64('2011-01-01'))) + self.assertFalse(is_float(Timestamp('2011-01-01'))) + self.assertFalse(is_float(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_float(timedelta(1000))) + self.assertFalse(is_float(np.timedelta64(1, 'D'))) + self.assertFalse(is_float(Timedelta('1 days'))) + + def test_is_timedelta(self): + self.assertTrue(is_timedelta64_dtype('timedelta64')) + self.assertTrue(is_timedelta64_dtype('timedelta64[ns]')) + self.assertFalse(is_timedelta64_ns_dtype('timedelta64')) + self.assertTrue(is_timedelta64_ns_dtype('timedelta64[ns]')) + + tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') + self.assertTrue(is_timedelta64_dtype(tdi)) + self.assertTrue(is_timedelta64_ns_dtype(tdi)) + self.assertTrue(is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) + + # Conversion to Int64Index: + self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) + self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) + + +class Testisscalar(tm.TestCase): + + def test_isscalar_builtin_scalars(self): + self.assertTrue(is_scalar(None)) + self.assertTrue(is_scalar(True)) + self.assertTrue(is_scalar(False)) + self.assertTrue(is_scalar(0.)) + self.assertTrue(is_scalar(np.nan)) + self.assertTrue(is_scalar('foobar')) + self.assertTrue(is_scalar(b'foobar')) + self.assertTrue(is_scalar(u('efoobar'))) + self.assertTrue(is_scalar(datetime(2014, 1, 1))) + self.assertTrue(is_scalar(date(2014, 1, 1))) + self.assertTrue(is_scalar(time(12, 0))) + self.assertTrue(is_scalar(timedelta(hours=1))) + self.assertTrue(is_scalar(pd.NaT)) + + def test_isscalar_builtin_nonscalars(self): + self.assertFalse(is_scalar({})) + self.assertFalse(is_scalar([])) + self.assertFalse(is_scalar([1])) + self.assertFalse(is_scalar(())) + self.assertFalse(is_scalar((1, ))) + self.assertFalse(is_scalar(slice(None))) + self.assertFalse(is_scalar(Ellipsis)) + + def test_isscalar_numpy_array_scalars(self): + self.assertTrue(is_scalar(np.int64(1))) + self.assertTrue(is_scalar(np.float64(1.))) + self.assertTrue(is_scalar(np.int32(1))) + self.assertTrue(is_scalar(np.object_('foobar'))) + self.assertTrue(is_scalar(np.str_('foobar'))) + self.assertTrue(is_scalar(np.unicode_(u('foobar')))) + self.assertTrue(is_scalar(np.bytes_(b'foobar'))) + self.assertTrue(is_scalar(np.datetime64('2014-01-01'))) + self.assertTrue(is_scalar(np.timedelta64(1, 'h'))) + + def test_isscalar_numpy_zerodim_arrays(self): + for zerodim in [np.array(1), np.array('foobar'), + np.array(np.datetime64('2014-01-01')), + np.array(np.timedelta64(1, 'h')), + np.array(np.datetime64('NaT'))]: + self.assertFalse(is_scalar(zerodim)) + self.assertTrue(is_scalar(lib.item_from_zerodim(zerodim))) + + def test_isscalar_numpy_arrays(self): + self.assertFalse(is_scalar(np.array([]))) + self.assertFalse(is_scalar(np.array([[]]))) + self.assertFalse(is_scalar(np.matrix('1; 2'))) + + def test_isscalar_pandas_scalars(self): + self.assertTrue(is_scalar(Timestamp('2014-01-01'))) + self.assertTrue(is_scalar(Timedelta(hours=1))) + self.assertTrue(is_scalar(Period('2014-01-01'))) + + def test_lisscalar_pandas_containers(self): + self.assertFalse(is_scalar(Series())) + self.assertFalse(is_scalar(Series([1]))) + self.assertFalse(is_scalar(DataFrame())) + self.assertFalse(is_scalar(DataFrame([[1]]))) + self.assertFalse(is_scalar(Panel())) + 
self.assertFalse(is_scalar(Panel([[[1]]]))) + self.assertFalse(is_scalar(Index([]))) + self.assertFalse(is_scalar(Index([1]))) + + +def test_datetimeindex_from_empty_datetime64_array(): + for unit in ['ms', 'us', 'ns']: + idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) + assert (len(idx) == 0) + + +def test_nan_to_nat_conversions(): + + df = DataFrame(dict({ + 'A': np.asarray( + lrange(10), dtype='float64'), + 'B': Timestamp('20010101') + })) + df.iloc[3:6, :] = np.nan + result = df.loc[4, 'B'].value + assert (result == tslib.iNaT) + + s = df['B'].copy() + s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) + assert (isnull(s[8])) + + # numpy < 1.7.0 is wrong + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.7.0': + assert (s[8].value == np.datetime64('NaT').astype(np.int64)) + + +def test_ensure_int32(): + values = np.arange(10, dtype=np.int32) + result = _ensure_int32(values) + assert (result.dtype == np.int32) + + values = np.arange(10, dtype=np.int64) + result = _ensure_int32(values) + assert (result.dtype == np.int32) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_io.py b/pandas/tests/types/test_io.py new file mode 100644 index 0000000000000..545edf8f1386c --- /dev/null +++ b/pandas/tests/types/test_io.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pandas.lib as lib +import pandas.util.testing as tm + +from pandas.compat import long, u + + +class TestParseSQL(tm.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u('1.5'), None, u('3'), u('4.2')], + dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], + dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + self.assert_numpy_array_equal(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + + arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = 
lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_downcast_int64(self): + from pandas.parser import na_values + + arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) + + # default argument + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + result = lib.downcast_int64(arr, na_values, use_unsigned=False) + self.assert_numpy_array_equal(result, expected) + + expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + # still cast to int8 despite use_unsigned=True + # because of the negative number as an element + arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + int8_na = na_values[np.int8] + int64_na = na_values[np.int64] + arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) + expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py new file mode 100644 index 0000000000000..edcb69de7bfad --- /dev/null +++ b/pandas/tests/types/test_missing.py @@ -0,0 +1,243 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np +from datetime import datetime +from pandas.util import testing as tm + +from pandas.core import config as cf +from pandas.compat import u +from pandas.tslib import iNaT +from pandas import (NaT, Float64Index, Series, + DatetimeIndex, TimedeltaIndex, date_range) +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.missing import (array_equivalent, isnull, notnull, + na_value_for_dtype) + +_multiprocess_can_split_ = True + + +def test_notnull(): + assert notnull(1.) 
+ assert not notnull(None) + assert not notnull(np.NaN) + + with cf.option_context("mode.use_inf_as_null", False): + assert notnull(np.inf) + assert notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.all() + + with cf.option_context("mode.use_inf_as_null", True): + assert not notnull(np.inf) + assert not notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.sum() == 2 + + with cf.option_context("mode.use_inf_as_null", False): + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + assert (isinstance(isnull(s), Series)) + + +def test_isnull(): + assert not isnull(1.) + assert isnull(None) + assert isnull(np.NaN) + assert not isnull(np.inf) + assert not isnull(-np.inf) + + # series + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + assert (isinstance(isnull(s), Series)) + + # frame + for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), + tm.makeMixedDataFrame()]: + result = isnull(df) + expected = df.apply(isnull) + tm.assert_frame_equal(result, expected) + + # panel + for p in [tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) + ]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel_equal(result, expected) + + # panel 4d + for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel4d_equal(result, expected) + + +def test_isnull_lists(): + result = isnull([[False]]) + exp = np.array([[False]]) + assert (np.array_equal(result, exp)) + + result = isnull([[1], [2]]) + exp = np.array([[False], [False]]) + assert (np.array_equal(result, exp)) + + # list of strings / unicode + result = isnull(['foo', 'bar']) + assert (not result.any()) + + result = isnull([u('foo'), u('bar')]) + assert (not result.any()) + + +def test_isnull_nat(): + result = isnull([NaT]) + exp = np.array([True]) + assert (np.array_equal(result, exp)) + + result = isnull(np.array([NaT], dtype=object)) + exp = np.array([True]) + assert (np.array_equal(result, exp)) + + +def test_isnull_numpy_nat(): + arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), + np.datetime64('NaT', 's')]) + result = isnull(arr) + expected = np.array([True] * 4) + tm.assert_numpy_array_equal(result, expected) + + +def test_isnull_datetime(): + assert (not isnull(datetime.now())) + assert notnull(datetime.now()) + + idx = date_range('1/1/1990', periods=20) + assert (notnull(idx).all()) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isnull(idx) + assert (mask[0]) + assert (not mask[1:].any()) + + # GH 9129 + pidx = idx.to_period(freq='M') + mask = isnull(pidx) + assert (mask[0]) + assert (not mask[1:].any()) + + mask = isnull(pidx[1:]) + assert (not mask.any()) + + +class TestIsNull(tm.TestCase): + + def test_0d_array(self): + self.assertTrue(isnull(np.array(np.nan))) + self.assertFalse(isnull(np.array(0.0))) + self.assertFalse(isnull(np.array(0))) + # test object dtype + self.assertTrue(isnull(np.array(np.nan, dtype=object))) + self.assertFalse(isnull(np.array(0.0, dtype=object))) + self.assertFalse(isnull(np.array(0, dtype=object))) + + +def test_array_equivalent(): + assert array_equivalent(np.array([np.nan, np.nan]), + np.array([np.nan, np.nan])) + assert array_equivalent(np.array([np.nan, 1, np.nan]), + np.array([np.nan, 1, np.nan])) + assert 
array_equivalent(np.array([np.nan, None], dtype='object'), + np.array([np.nan, None], dtype='object')) + assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), + np.array([np.nan, 1 + 1j], dtype='complex')) + assert not array_equivalent( + np.array([np.nan, 1 + 1j], dtype='complex'), np.array( + [np.nan, 1 + 2j], dtype='complex')) + assert not array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) + assert not array_equivalent( + np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) + assert array_equivalent(Float64Index([0, np.nan]), + Float64Index([0, np.nan])) + assert not array_equivalent( + Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), + DatetimeIndex([0, np.nan])) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + assert array_equivalent(TimedeltaIndex([0, np.nan]), + TimedeltaIndex([0, np.nan])) + assert not array_equivalent( + TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), + DatetimeIndex([0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( + [1, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex( + [0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( + [0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + + +def test_array_equivalent_compat(): + # see gh-13388 + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + assert (array_equivalent(m, n, strict_nan=True)) + assert (array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)]) + assert (not array_equivalent(m, n, strict_nan=True)) + assert (not array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) + assert (not array_equivalent(m, n, strict_nan=True)) + assert (not array_equivalent(m, n, strict_nan=False)) + + +def test_array_equivalent_str(): + for dtype in ['O', 'S', 'U']: + assert array_equivalent(np.array(['A', 'B'], dtype=dtype), + np.array(['A', 'B'], dtype=dtype)) + assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), + np.array(['A', 'X'], dtype=dtype)) + + +def test_na_value_for_dtype(): + for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), + DatetimeTZDtype('datetime64[ns, US/Eastern]')]: + assert na_value_for_dtype(dtype) is NaT + + for dtype in ['u1', 'u2', 'u4', 'u8', + 'i1', 'i2', 'i4', 'i8']: + assert na_value_for_dtype(np.dtype(dtype)) == 0 + + for dtype in ['bool']: + assert na_value_for_dtype(np.dtype(dtype)) is False + + for dtype in ['f2', 'f4', 'f8']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + for dtype in ['O']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_types.py b/pandas/tests/types/test_types.py deleted file mode 100644 index b9f6006cab731..0000000000000 --- a/pandas/tests/types/test_types.py +++ /dev/null @@ -1,40 +0,0 @@ 
-# -*- coding: utf-8 -*- -import nose -import numpy as np - -from pandas import NaT -from pandas.types.api import (DatetimeTZDtype, CategoricalDtype, - na_value_for_dtype, pandas_dtype) - - -def test_pandas_dtype(): - - assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( - 'datetime64[ns, US/Eastern]') - assert pandas_dtype('category') == CategoricalDtype() - for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: - assert pandas_dtype(dtype) == np.dtype(dtype) - - -def test_na_value_for_dtype(): - for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), - DatetimeTZDtype('datetime64[ns, US/Eastern]')]: - assert na_value_for_dtype(dtype) is NaT - - for dtype in ['u1', 'u2', 'u4', 'u8', - 'i1', 'i2', 'i4', 'i8']: - assert na_value_for_dtype(np.dtype(dtype)) == 0 - - for dtype in ['bool']: - assert na_value_for_dtype(np.dtype(dtype)) is False - - for dtype in ['f2', 'f4', 'f8']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - for dtype in ['O']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 075dff9cf6c38..5b66e55eb60b6 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -12,6 +12,21 @@ from pandas import (Categorical, DataFrame, Series, Index, MultiIndex, Timedelta) from pandas.core.frame import _merge_doc +from pandas.types.generic import ABCSeries +from pandas.types.common import (is_datetime64tz_dtype, + is_datetime64_dtype, + needs_i8_conversion, + is_int64_dtype, + is_integer, + is_int_or_datetime_dtype, + is_dtype_equal, + is_bool, + is_list_like, + _ensure_int64, + _ensure_platform_int, + _ensure_object) +from pandas.types.missing import na_value_for_dtype + from pandas.core.generic import NDFrame from pandas.core.index import (_get_combined_index, _ensure_index, _get_consensus_names, @@ -19,18 +34,10 @@ from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util.decorators import Appender, Substitution -from pandas.core.common import (ABCSeries, is_dtype_equal, - is_datetime64_dtype, - is_int64_dtype, - is_integer, - is_bool, - is_list_like, - needs_i8_conversion) import pandas.core.algorithms as algos import pandas.core.common as com import pandas.types.concat as _concat -from pandas.types.api import na_value_for_dtype import pandas.algos as _algos import pandas.hashtable as _hash @@ -436,7 +443,7 @@ def _merger(x, y): # if we DO have duplicates, then # we cannot guarantee order - sorter = com._ensure_platform_int( + sorter = _ensure_platform_int( np.concatenate([groupby.indices[g] for g, _ in groupby])) if len(result) != len(sorter): if check_duplicates: @@ -1111,8 +1118,8 @@ def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = _algos.left_outer_join( - com._ensure_int64(left_key), - com._ensure_int64(right_key), + _ensure_int64(left_key), + _ensure_int64(right_key), count, sort=sort) return left_indexer, right_indexer @@ -1158,18 +1165,17 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): - if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk): + if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values - - if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk): + if is_int_or_datetime_dtype(lk) and 
is_int_or_datetime_dtype(rk): klass = _hash.Int64Factorizer - lk = com._ensure_int64(com._values_from_object(lk)) - rk = com._ensure_int64(com._values_from_object(rk)) + lk = _ensure_int64(com._values_from_object(lk)) + rk = _ensure_int64(com._values_from_object(rk)) else: klass = _hash.Factorizer - lk = com._ensure_object(lk) - rk = com._ensure_object(rk) + lk = _ensure_object(lk) + rk = _ensure_object(rk) rizer = klass(max(len(lk), len(rk))) @@ -1208,10 +1214,10 @@ def _sort_labels(uniques, left, right): reverse_indexer = np.empty(len(sorter), dtype=np.int64) reverse_indexer.put(sorter, np.arange(len(sorter))) - new_left = reverse_indexer.take(com._ensure_platform_int(left)) + new_left = reverse_indexer.take(_ensure_platform_int(left)) np.putmask(new_left, left == -1, -1) - new_right = reverse_indexer.take(com._ensure_platform_int(right)) + new_right = reverse_indexer.take(_ensure_platform_int(right)) np.putmask(new_right, right == -1, -1) return new_left, new_right diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index e1405bc9e6add..3e2b7c3af460e 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -1,6 +1,7 @@ # pylint: disable=E1103 +from pandas.types.common import is_list_like, is_scalar from pandas import Series, DataFrame from pandas.core.index import MultiIndex, Index from pandas.core.groupby import Grouper @@ -9,7 +10,6 @@ from pandas.compat import range, lrange, zip from pandas import compat import pandas.core.common as com -import pandas.lib as lib import numpy as np @@ -95,7 +95,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', values_passed = values is not None if values_passed: - if com.is_list_like(values): + if is_list_like(values): values_multi = True values = list(values) else: @@ -361,7 +361,7 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (lib.isscalar(by) or + elif (is_scalar(by) or isinstance(by, (np.ndarray, Index, Series, Grouper)) or hasattr(by, '__call__')): by = [by] diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index b6c1926c1e7fc..4cf3364a03056 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -11,10 +11,17 @@ import numpy as np +from pandas.types.common import (is_list_like, + is_integer, + is_number, + is_hashable, + is_iterator) +from pandas.types.missing import isnull, notnull + from pandas.util.decorators import cache_readonly, deprecate_kwarg from pandas.core.base import PandasObject -import pandas.core.common as com -from pandas.core.common import AbstractMethodError + +from pandas.core.common import AbstractMethodError, _try_sort from pandas.core.generic import _shared_docs, _shared_doc_kwargs from pandas.core.index import Index, MultiIndex from pandas.core.series import Series, remove_na @@ -161,7 +168,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', if colormap is not None: warnings.warn("'color' and 'colormap' cannot be used " "simultaneously. Using 'color'") - colors = list(color) if com.is_list_like(color) else color + colors = list(color) if is_list_like(color) else color else: if color_type == 'default': # need to call list() on the result to copy so we don't @@ -336,7 +343,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) - mask = com.notnull(df) + mask = notnull(df) marker = _get_marker_compat(marker) @@ -980,7 +987,7 @@ def _validate_color_args(self): "simultaneously. 
Using 'color'") if 'color' in self.kwds and self.style is not None: - if com.is_list_like(self.style): + if is_list_like(self.style): styles = self.style else: styles = [self.style] @@ -1001,7 +1008,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): # TODO: unused? # if self.sort_columns: - # columns = com._try_sort(data.columns) + # columns = _try_sort(data.columns) # else: # columns = data.columns @@ -1099,13 +1106,13 @@ def result(self): Return result axes """ if self.subplots: - if self.layout is not None and not com.is_list_like(self.ax): + if self.layout is not None and not is_list_like(self.ax): return self.axes.reshape(*self.layout) else: return self.axes else: sec_true = isinstance(self.secondary_y, bool) and self.secondary_y - all_sec = (com.is_list_like(self.secondary_y) and + all_sec = (is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries) if (sec_true or all_sec): # if all data is plotted on secondary, return right axes @@ -1322,7 +1329,7 @@ def _get_xticks(self, convert_period=False): @classmethod def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): - mask = com.isnull(y) + mask = isnull(y) if mask.any(): y = np.ma.array(y) y = np.ma.masked_where(mask, y) @@ -1463,8 +1470,8 @@ def match_labels(data, e): err = np.atleast_2d(evalues) err = np.tile(err, (self.nseries, 1)) - elif com.is_list_like(err): - if com.is_iterator(err): + elif is_list_like(err): + if is_iterator(err): err = np.atleast_2d(list(err)) else: # raw error values @@ -1486,7 +1493,7 @@ def match_labels(data, e): if len(err) == 1: err = np.tile(err, (self.nseries, 1)) - elif com.is_number(err): + elif is_number(err): err = np.tile([err], (self.nseries, len(self.data))) else: @@ -1543,9 +1550,9 @@ def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: raise ValueError(self._kind + ' requires and x and y column') - if com.is_integer(x) and not self.data.columns.holds_integer(): + if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] - if com.is_integer(y) and not self.data.columns.holds_integer(): + if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] self.x = x self.y = y @@ -1569,7 +1576,7 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs): # the handling of this argument later s = 20 super(ScatterPlot, self).__init__(data, x, y, s=s, **kwargs) - if com.is_integer(c) and not self.data.columns.holds_integer(): + if is_integer(c) and not self.data.columns.holds_integer(): c = self.data.columns[c] self.c = c @@ -1577,7 +1584,7 @@ def _make_plot(self): x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] - c_is_column = com.is_hashable(c) and c in self.data.columns + c_is_column = is_hashable(c) and c in self.data.columns # plot a colorbar only if a colormap is provided or necessary cb = self.kwds.pop('colorbar', self.colormap or c_is_column) @@ -1629,7 +1636,7 @@ class HexBinPlot(PlanePlot): def __init__(self, data, x, y, C=None, **kwargs): super(HexBinPlot, self).__init__(data, x, y, **kwargs) - if com.is_integer(C) and not self.data.columns.holds_integer(): + if is_integer(C) and not self.data.columns.holds_integer(): C = self.data.columns[C] self.C = C @@ -1912,9 +1919,9 @@ def __init__(self, data, **kwargs): self.ax_pos = self.tick_pos - self.tickoffset def _args_adjust(self): - if com.is_list_like(self.bottom): + if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - if com.is_list_like(self.left): + if 
is_list_like(self.left): self.left = np.array(self.left) @classmethod @@ -2027,18 +2034,18 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - if com.is_integer(self.bins): + if is_integer(self.bins): # create common bin edge values = (self.data._convert(datetime=True)._get_numeric_data()) values = np.ravel(values) - values = values[~com.isnull(values)] + values = values[~isnull(values)] hist, self.bins = np.histogram( values, bins=self.bins, range=self.kwds.get('range', None), weights=self.kwds.get('weights', None)) - if com.is_list_like(self.bottom): + if is_list_like(self.bottom): self.bottom = np.array(self.bottom) @classmethod @@ -2046,7 +2053,7 @@ def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) - y = y[~com.isnull(y)] + y = y[~isnull(y)] base = np.zeros(len(bins) - 1) bottom = bottom + \ @@ -2411,7 +2418,7 @@ def _plot(data, x=None, y=None, subplots=False, msg = "{0} requires either y column or 'subplots=True'" raise ValueError(msg.format(kind)) elif y is not None: - if com.is_integer(y) and not data.columns.holds_integer(): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] # converted to series actually. copy to not modify data = data[y].copy() @@ -2420,12 +2427,12 @@ def _plot(data, x=None, y=None, subplots=False, else: if isinstance(data, DataFrame): if x is not None: - if com.is_integer(x) and not data.columns.holds_integer(): + if is_integer(x) and not data.columns.holds_integer(): x = data.columns[x] data = data.set_index(x) if y is not None: - if com.is_integer(y) and not data.columns.holds_integer(): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] label = kwds['label'] if 'label' in kwds else y series = data[y].copy() # Don't modify @@ -2434,7 +2441,7 @@ def _plot(data, x=None, y=None, subplots=False, for kw in ['xerr', 'yerr']: if (kw in kwds) and \ (isinstance(kwds[kw], string_types) or - com.is_integer(kwds[kw])): + is_integer(kwds[kw])): try: kwds[kw] = data[kwds[kw]] except (IndexError, KeyError, TypeError): @@ -2897,7 +2904,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, layout=layout) _axes = _flatten(axes) - for i, col in enumerate(com._try_sort(data.columns)): + for i, col in enumerate(_try_sort(data.columns)): ax = _axes[i] ax.hist(data[col].dropna().values, bins=bins, **kwds) ax.set_title(col) @@ -3345,7 +3352,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if ax is None: fig = plt.figure(**fig_kw) else: - if com.is_list_like(ax): + if is_list_like(ax): ax = _flatten(ax) if layout is not None: warnings.warn("When passing multiple axes, layout keyword is " @@ -3487,7 +3494,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): def _flatten(axes): - if not com.is_list_like(axes): + if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, Index)): return axes.ravel() diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index b0bbf8ba70354..62bbfc2f630a5 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -2,12 +2,14 @@ Quantilization functions and related stuff """ +from pandas.types.missing import isnull +from pandas.types.common import (is_float, is_integer, + is_scalar) + from pandas.core.api import Series from pandas.core.categorical import Categorical import pandas.core.algorithms as algos 
-import pandas.core.common as com import pandas.core.nanops as nanops -import pandas.lib as lib from pandas.compat import zip import numpy as np @@ -80,7 +82,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 if not np.iterable(bins): - if lib.isscalar(bins) and bins < 1: + if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") try: # for array-like sz = x.size @@ -164,7 +166,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ - if com.is_integer(q): + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q @@ -194,7 +196,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, if include_lowest: ids[x == bins[0]] = 1 - na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0) + na_mask = isnull(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -264,7 +266,7 @@ def _format_label(x, precision=3): fmt_str = '%%.%dg' % precision if np.isinf(x): return str(x) - elif com.is_float(x): + elif is_float(x): frac, whole = np.modf(x) sgn = '-' if x < 0 else '' whole = abs(whole) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index d70904e1bf286..b8b28663387cc 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,6 +1,12 @@ import numpy as np import pandas.lib as lib +from pandas.types.common import (is_number, + is_numeric_dtype, + is_datetime_or_timedelta_dtype, + _ensure_object) +from pandas.types.cast import _possibly_downcast_to_dtype + import pandas as pd from pandas.compat import reduce from pandas.core.index import Index @@ -141,7 +147,7 @@ def to_numeric(arg, errors='raise', downcast=None): elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif np.isscalar(arg): - if com.is_number(arg): + if is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') @@ -151,14 +157,13 @@ def to_numeric(arg, errors='raise', downcast=None): values = arg try: - if com.is_numeric_dtype(values): + if is_numeric_dtype(values): pass - elif com.is_datetime_or_timedelta_dtype(values): + elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: - values = com._ensure_object(values) + values = _ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True - values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) @@ -168,7 +173,7 @@ def to_numeric(arg, errors='raise', downcast=None): # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified - if downcast is not None and com.is_numeric_dtype(values): + if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): @@ -189,7 +194,7 @@ def to_numeric(arg, errors='raise', downcast=None): # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize < values.dtype.itemsize: - values = com._possibly_downcast_to_dtype( + values = _possibly_downcast_to_dtype( values, dtype) # successful conversion diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 4bafac873ea09..fe0440170383b 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -9,10 +9,16 @@ from pandas.compat.numpy import function as nv import numpy as np - +from pandas.types.common import (is_integer, is_float, + 
is_bool_dtype, _ensure_int64, + is_scalar, + is_list_like) +from pandas.types.generic import (ABCIndex, ABCSeries, + ABCPeriodIndex, ABCIndexClass) +from pandas.types.missing import isnull from pandas.core import common as com, algorithms -from pandas.core.common import (is_integer, is_float, is_bool_dtype, - AbstractMethodError) +from pandas.core.common import AbstractMethodError + import pandas.formats.printing as printing import pandas.tslib as tslib import pandas._period as prlib @@ -111,9 +117,9 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True): @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, com.ABCIndex, com.ABCSeries)): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries)): left = left.view('i8') - if isinstance(right, (np.ndarray, com.ABCIndex, com.ABCSeries)): + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries)): right = right.view('i8') results = joinf(left, right) if with_indexers: @@ -133,10 +139,10 @@ def _evaluate_compare(self, other, op): # coerce to a similar object if not isinstance(other, type(self)): - if not com.is_list_like(other): + if not is_list_like(other): # scalar other = [other] - elif lib.isscalar(lib.item_from_zerodim(other)): + elif is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) @@ -174,7 +180,7 @@ def _ensure_localized(self, result): # reconvert to local tz if getattr(self, 'tz', None) is not None: - if not isinstance(result, com.ABCIndexClass): + if not isinstance(result, ABCIndexClass): result = self._simple_new(result) result = result.tz_localize(self.tz) return result @@ -202,7 +208,7 @@ def _format_with_header(self, header, **kwargs): def __contains__(self, key): try: res = self.get_loc(key) - return lib.isscalar(res) or type(res) == slice or np.any(res) + return is_scalar(res) or type(res) == slice or np.any(res) except (KeyError, TypeError, ValueError): return False @@ -213,7 +219,7 @@ def __getitem__(self, key): """ is_int = is_integer(key) - if lib.isscalar(key) and not is_int: + if is_scalar(key) and not is_int: raise ValueError getitem = self._data.__getitem__ @@ -282,7 +288,7 @@ def _nat_new(self, box=True): return result attribs = self._get_attributes_dict() - if not isinstance(self, com.ABCPeriodIndex): + if not isinstance(self, ABCPeriodIndex): attribs['freq'] = None return self._simple_new(result, **attribs) @@ -312,7 +318,7 @@ def sort_values(self, return_indexer=False, ascending=True): attribs = self._get_attributes_dict() freq = attribs['freq'] - if freq is not None and not isinstance(self, com.ABCPeriodIndex): + if freq is not None and not isinstance(self, ABCPeriodIndex): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: @@ -328,7 +334,7 @@ def sort_values(self, return_indexer=False, ascending=True): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_int64(indices) + indices = _ensure_int64(indices) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) if isinstance(maybe_slice, slice): @@ -340,7 +346,7 @@ def take(self, indices, axis=0, allow_fill=True, na_value=tslib.iNaT) # keep freq in PeriodIndex, reset otherwise - freq = self.freq if isinstance(self, com.ABCPeriodIndex) else None + freq = self.freq if isinstance(self, ABCPeriodIndex) else None return self._shallow_copy(taken, freq=freq) def get_duplicates(self): @@ -545,7 +551,7 @@ def _convert_scalar_indexer(self, key, kind=None): # we don't allow 
integer/float indexing for loc # we don't allow float indexing for ix/getitem - if lib.isscalar(key): + if is_scalar(key): is_int = is_integer(key) is_flt = is_float(key) if kind in ['loc'] and (is_int or is_flt): @@ -591,7 +597,7 @@ def __add__(self, other): elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(other) - elif com.is_integer(other): + elif is_integer(other): return self.shift(other) elif isinstance(other, (tslib.Timestamp, datetime)): return self._add_datelike(other) @@ -619,7 +625,7 @@ def __sub__(self, other): elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(-other) - elif com.is_integer(other): + elif is_integer(other): return self.shift(-other) elif isinstance(other, (tslib.Timestamp, datetime)): return self._sub_datelike(other) @@ -791,9 +797,9 @@ def summary(self, name=None): def _ensure_datetimelike_to_i8(other): """ helper for coercing an input scalar or array to i8 """ - if lib.isscalar(other) and com.isnull(other): + if lib.isscalar(other) and isnull(other): other = tslib.iNaT - elif isinstance(other, com.ABCIndexClass): + elif isinstance(other, ABCIndexClass): # convert tz if needed if getattr(other, 'tz', None) is not None: diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 8937e83c7009a..46e8bd43e8ff8 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -3,19 +3,21 @@ """ import numpy as np + +from pandas.types.common import (_NS_DTYPE, _TD_DTYPE, + is_period_arraylike, + is_datetime_arraylike, is_integer_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, is_categorical_dtype, + is_list_like) + from pandas.core.base import PandasDelegate, NoNewAttributesMixin -from pandas.core import common as com from pandas.tseries.index import DatetimeIndex from pandas._period import IncompatibleFrequency # flake8: noqa from pandas.tseries.period import PeriodIndex from pandas.tseries.tdi import TimedeltaIndex from pandas import tslib from pandas.core.algorithms import take_1d -from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, - is_datetime_arraylike, is_integer_dtype, - is_list_like, - is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_categorical_dtype) def is_datetimelike(data): @@ -129,7 +131,7 @@ def _delegate_method(self, name, *args, **kwargs): method = getattr(self.values, name) result = method(*args, **kwargs) - if not com.is_list_like(result): + if not is_list_like(result): return result result = Series(result, index=self.index, name=self.name) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 78b185ae8cf31..fc23f4f99449b 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -10,6 +10,14 @@ from matplotlib.ticker import Formatter, AutoLocator, Locator from matplotlib.transforms import nonsingular + +from pandas.types.common import (is_float, is_integer, + is_integer_dtype, + is_float_dtype, + is_datetime64_ns_dtype, + is_period_arraylike, + ) + from pandas.compat import lrange import pandas.compat as compat import pandas.lib as lib @@ -73,8 +81,8 @@ class TimeConverter(units.ConversionInterface): @staticmethod def convert(value, unit, axis): valid_types = (str, pydt.time) - if (isinstance(value, valid_types) or com.is_integer(value) or - com.is_float(value)): + if (isinstance(value, valid_types) or is_integer(value) or + is_float(value)): return time2num(value) if isinstance(value, Index): return 
value.map(time2num) @@ -129,14 +137,14 @@ def convert(values, units, axis): raise TypeError('Axis must have `freq` set to convert to Periods') valid_types = (compat.string_types, datetime, Period, pydt.date, pydt.time) - if (isinstance(values, valid_types) or com.is_integer(values) or - com.is_float(values)): + if (isinstance(values, valid_types) or is_integer(values) or + is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): return values.asfreq(axis.freq).values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - if com.is_period_arraylike(values): + if is_period_arraylike(values): return PeriodIndex(values, freq=axis.freq).values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] @@ -149,7 +157,7 @@ def get_datevalue(date, freq): elif isinstance(date, (compat.string_types, datetime, pydt.date, pydt.time)): return Period(date, freq).ordinal - elif (com.is_integer(date) or com.is_float(date) or + elif (is_integer(date) or is_float(date) or (isinstance(date, (np.ndarray, Index)) and (date.size == 1))): return date elif date is None: @@ -163,8 +171,8 @@ def _dt_to_float_ordinal(dt): preserving hours, minutes, seconds and microseconds. Return value is a :func:`float`. """ - if (isinstance(dt, (np.ndarray, Index, Series)) and - com.is_datetime64_ns_dtype(dt)): + if (isinstance(dt, (np.ndarray, Index, Series) + ) and is_datetime64_ns_dtype(dt)): base = dates.epoch2num(dt.asi8 / 1.0E9) else: base = dates.date2num(dt) @@ -188,7 +196,7 @@ def try_parse(values): return _dt_to_float_ordinal(lib.Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) - elif (com.is_integer(values) or com.is_float(values)): + elif (is_integer(values) or is_float(values)): return values elif isinstance(values, compat.string_types): return try_parse(values) @@ -198,7 +206,7 @@ def try_parse(values): if not isinstance(values, np.ndarray): values = com._asarray_tuplesafe(values) - if com.is_integer_dtype(values) or com.is_float_dtype(values): + if is_integer_dtype(values) or is_float_dtype(values): return values try: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3f1d0c6d969a6..e2132deb97d64 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -6,12 +6,17 @@ import numpy as np +from pandas.types.generic import ABCSeries +from pandas.types.common import (is_integer, + is_period_arraylike, + is_timedelta64_dtype, + is_datetime64_dtype) + import pandas.core.algorithms as algos from pandas.core.algorithms import unique from pandas.tseries.offsets import DateOffset from pandas.util.decorators import cache_readonly import pandas.tseries.offsets as offsets -import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib from pandas.tslib import Timedelta @@ -255,8 +260,8 @@ def get_freq_code(freqstr): freqstr = (freqstr.rule_code, freqstr.n) if isinstance(freqstr, tuple): - if (com.is_integer(freqstr[0]) and - com.is_integer(freqstr[1])): + if (is_integer(freqstr[0]) and + is_integer(freqstr[1])): # e.g., freqstr = (2000, 1) return freqstr else: @@ -265,13 +270,13 @@ def get_freq_code(freqstr): code = _period_str_to_code(freqstr[0]) stride = freqstr[1] except: - if com.is_integer(freqstr[1]): + if is_integer(freqstr[1]): raise code = _period_str_to_code(freqstr[1]) stride = freqstr[0] return code, stride - if com.is_integer(freqstr): + if is_integer(freqstr): return (freqstr, 1) 
base, stride = _base_and_stride(freqstr) @@ -843,16 +848,16 @@ def infer_freq(index, warn=True): """ import pandas as pd - if isinstance(index, com.ABCSeries): + if isinstance(index, ABCSeries): values = index._values - if not (com.is_datetime64_dtype(values) or - com.is_timedelta64_dtype(values) or + if not (is_datetime64_dtype(values) or + is_timedelta64_dtype(values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible " "dtype on a Series of {0}".format(index.dtype)) index = values - if com.is_period_arraylike(index): + if is_period_arraylike(index): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif isinstance(index, pd.TimedeltaIndex): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 9b36bc5907066..47bb69b8d7ad6 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,13 +6,25 @@ from datetime import timedelta import numpy as np from pandas.core.base import _shared_docs -from pandas.core.common import (_INT64_DTYPE, _NS_DTYPE, _maybe_box, - _values_from_object, ABCSeries, - DatetimeTZDtype, PerformanceWarning, - is_datetimetz, is_datetime64_dtype, - is_datetime64_ns_dtype, is_dtype_equal, - is_float, is_integer, is_integer_dtype, - is_object_dtype, is_string_dtype) + +from pandas.types.common import (_NS_DTYPE, _INT64_DTYPE, + is_object_dtype, is_datetime64_dtype, + is_datetimetz, is_dtype_equal, + is_integer, is_float, + is_integer_dtype, + is_datetime64_ns_dtype, + is_bool_dtype, + is_string_dtype, + is_list_like, + is_scalar, + _ensure_int64) +from pandas.types.generic import ABCSeries +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.missing import isnull + +import pandas.types.concat as _concat +from pandas.core.common import (_values_from_object, _maybe_box, + PerformanceWarning) from pandas.core.index import Index, Int64Index, Float64Index from pandas.indexes.base import _index_shared_docs @@ -27,7 +39,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) import pandas.core.common as com -import pandas.types.concat as _concat import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -87,7 +98,7 @@ def wrapper(self, other): isinstance(other, compat.string_types)): other = _to_m8(other, tz=self.tz) result = func(other) - if com.isnull(other): + if isnull(other): result.fill(nat_result) else: if isinstance(other, list): @@ -109,7 +120,7 @@ def wrapper(self, other): result[self._isnan] = nat_result # support of bool dtype indexers - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return Index(result) @@ -277,7 +288,7 @@ def __new__(cls, data=None, ambiguous=ambiguous) if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if lib.isscalar(data): + if is_scalar(data): raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -537,7 +548,7 @@ def _generate(cls, start, end, periods, name, offset, index = _generate_regular_range(start, end, periods, offset) if tz is not None and getattr(index, 'tz', None) is None: - index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz, + index = tslib.tz_localize_to_utc(_ensure_int64(index), tz, ambiguous=ambiguous) index = index.view(_NS_DTYPE) @@ -601,7 +612,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, return cls(values, name=name, freq=freq, tz=tz, dtype=dtype, **kwargs).values elif not is_datetime64_dtype(values): - values = 
com._ensure_int64(values).view(_NS_DTYPE) + values = _ensure_int64(values).view(_NS_DTYPE) result = object.__new__(cls) result._data = values @@ -1683,7 +1694,7 @@ def inferred_type(self): def dtype(self): if self.tz is None: return _NS_DTYPE - return com.DatetimeTZDtype('ns', self.tz) + return DatetimeTZDtype('ns', self.tz) @property def is_all_dates(self): @@ -1787,9 +1798,9 @@ def delete(self, loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: - if com.is_list_like(loc): + if is_list_like(loc): loc = lib.maybe_indices_to_slice( - com._ensure_int64(np.array(loc)), len(self)) + _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index d0b1fd746d0d5..f12ba8083f545 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -3,9 +3,9 @@ from pandas import compat import numpy as np +from pandas.types.generic import ABCSeries, ABCDatetimeIndex, ABCPeriod from pandas.tseries.tools import to_datetime, normalize_date -from pandas.core.common import (ABCSeries, ABCDatetimeIndex, ABCPeriod, - AbstractMethodError) +from pandas.core.common import AbstractMethodError # import after tools, dateutil check from dateutil.relativedelta import relativedelta, weekday diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 750e7a5553ef6..45f634050a5d8 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,6 +1,24 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta import numpy as np + + +from pandas.core import common as com +from pandas.types.common import (is_integer, + is_float, + is_object_dtype, + is_integer_dtype, + is_float_dtype, + is_scalar, + is_timedelta64_dtype, + is_bool_dtype, + _ensure_int64, + _ensure_object) + +from pandas.types.generic import ABCSeries +from pandas.types.missing import isnull + + import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries.index import DatetimeIndex, Int64Index, Index @@ -17,15 +35,10 @@ from pandas.core.base import _shared_docs from pandas.indexes.base import _index_shared_docs -import pandas.core.common as com -from pandas.core.common import ( - _maybe_box, _values_from_object, ABCSeries, is_float, is_integer, - is_integer_dtype, is_object_dtype, isnull) from pandas import compat from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution from pandas.lib import Timedelta -import pandas.lib as lib import pandas.tslib as tslib import pandas.core.missing as missing from pandas.compat import zip, u @@ -209,7 +222,7 @@ def _generate_range(cls, start, end, periods, freq, fields): def _from_arraylike(cls, data, freq, tz): if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): - if lib.isscalar(data) or isinstance(data, Period): + if is_scalar(data) or isinstance(data, Period): raise ValueError('PeriodIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -219,13 +232,13 @@ def _from_arraylike(cls, data, freq, tz): data = list(data) try: - data = com._ensure_int64(data) + data = _ensure_int64(data) if freq is None: raise ValueError('freq not specified') data = np.array([Period(x, freq=freq).ordinal for x in data], dtype=np.int64) except (TypeError, ValueError): - data = com._ensure_object(data) 
+ data = _ensure_object(data) if freq is None: freq = period.extract_freq(data) @@ -242,7 +255,7 @@ def _from_arraylike(cls, data, freq, tz): base1, base2, 1) else: - if freq is None and com.is_object_dtype(data): + if freq is None and is_object_dtype(data): # must contain Period instance and thus extract ordinals freq = period.extract_freq(data) data = period.extract_ordinals(data, freq) @@ -256,9 +269,9 @@ def _from_arraylike(cls, data, freq, tz): data = dt64arr_to_periodarr(data, freq, tz) else: try: - data = com._ensure_int64(data) + data = _ensure_int64(data) except (TypeError, ValueError): - data = com._ensure_object(data) + data = _ensure_object(data) data = period.extract_ordinals(data, freq) return data, freq @@ -266,9 +279,9 @@ def _from_arraylike(cls, data, freq, tz): @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): - if not com.is_integer_dtype(values): + if not is_integer_dtype(values): values = np.array(values, copy=False) - if (len(values) > 0 and com.is_float_dtype(values)): + if (len(values) > 0 and is_float_dtype(values)): raise TypeError("PeriodIndex can't take floats") else: return PeriodIndex(values, name=name, freq=freq, **kwargs) @@ -339,7 +352,7 @@ def __array_wrap__(self, result, context=None): # from here because numpy catches. raise ValueError(msg.format(func.__name__)) - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return PeriodIndex(result, freq=self.freq, name=self.name) @@ -580,9 +593,9 @@ def _maybe_convert_timedelta(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): - if com.is_integer_dtype(other): + if is_integer_dtype(other): return other - elif com.is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -657,10 +670,11 @@ def get_value(self, series, key): Fast lookup of value from 1-dimensional ndarray. 
Only use this if you know what you're doing """ - s = _values_from_object(series) + s = com._values_from_object(series) try: - return _maybe_box(self, super(PeriodIndex, self).get_value(s, key), - series, key) + return com._maybe_box(self, + super(PeriodIndex, self).get_value(s, key), + series, key) except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) @@ -683,16 +697,16 @@ def get_value(self, series, key): return series[key] elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal - return _maybe_box(self, self._engine.get_value(s, key), - series, key) + return com._maybe_box(self, self._engine.get_value(s, key), + series, key) else: raise KeyError(key) except TypeError: pass key = Period(key, self.freq).ordinal - return _maybe_box(self, self._engine.get_value(s, key), - series, key) + return com._maybe_box(self, self._engine.get_value(s, key), + series, key) def get_indexer(self, target, method=None, limit=None, tolerance=None): if hasattr(target, 'freq') and target.freq != self.freq: @@ -849,7 +863,7 @@ def _apply_meta(self, rawarr): def __getitem__(self, key): getitem = self._data.__getitem__ - if lib.isscalar(key): + if is_scalar(key): val = getitem(key) return Period(ordinal=val, freq=self.freq) else: diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index dbc0078b67ae7..f9fb51ebf710c 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -2,11 +2,20 @@ from datetime import timedelta import numpy as np -from pandas.core.common import (ABCSeries, _TD_DTYPE, _maybe_box, - _values_from_object, isnull, - is_integer, is_float, is_integer_dtype, - is_object_dtype, is_timedelta64_dtype, - is_timedelta64_ns_dtype) +from pandas.types.common import (_TD_DTYPE, + is_integer, is_float, + is_bool_dtype, + is_list_like, + is_scalar, + is_integer_dtype, + is_object_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + _ensure_int64) +from pandas.types.missing import isnull +from pandas.types.generic import ABCSeries +from pandas.core.common import _maybe_box, _values_from_object + from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u @@ -44,10 +53,10 @@ def wrapper(self, other): # failed to parse as timedelta raise TypeError(msg.format(type(other))) result = func(other) - if com.isnull(other): + if isnull(other): result.fill(nat_result) else: - if not com.is_list_like(other): + if not is_list_like(other): raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values @@ -66,7 +75,7 @@ def wrapper(self, other): result[self._isnan] = nat_result # support of bool dtype indexers - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return Index(result) @@ -175,7 +184,7 @@ def __new__(cls, data=None, unit=None, data = to_timedelta(data, unit=unit, box=False) if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if lib.isscalar(data): + if is_scalar(data): raise ValueError('TimedeltaIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -261,7 +270,7 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): if values.dtype == np.object_: values = tslib.array_to_timedelta64(values) if values.dtype != _TD_DTYPE: - values = com._ensure_int64(values).view(_TD_DTYPE) + values = _ensure_int64(values).view(_TD_DTYPE) result = object.__new__(cls) result._data = values @@ -905,9 +914,9 @@ def delete(self, loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: - if com.is_list_like(loc): + if 
is_list_like(loc): loc = lib.maybe_indices_to_slice( - com._ensure_int64(np.array(loc)), len(self)) + _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tseries/tests/test_bin_groupby.py index 6b6c468b7c391..08c0833be0cd6 100644 --- a/pandas/tseries/tests/test_bin_groupby.py +++ b/pandas/tseries/tests/test_bin_groupby.py @@ -3,12 +3,12 @@ from numpy import nan import numpy as np +from pandas.types.common import _ensure_int64 from pandas import Index, isnull from pandas.util.testing import assert_almost_equal import pandas.util.testing as tm import pandas.lib as lib import pandas.algos as algos -from pandas.core import common as com def test_series_grouper(): @@ -90,8 +90,8 @@ def _check(dtype): bins = np.array([6, 12, 20]) out = np.zeros((3, 4), dtype) counts = np.zeros(len(out), dtype=np.int64) - labels = com._ensure_int64(np.repeat(np.arange(3), - np.diff(np.r_[0, bins]))) + labels = _ensure_int64(np.repeat(np.arange(3), + np.diff(np.r_[0, bins]))) func = getattr(algos, 'group_ohlc_%s' % dtype) func(out, counts, obj[:, None], labels) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 807fb86b1b4da..591fa19aad585 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -4326,10 +4326,10 @@ def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], dtype='period[D]') val = series[3] - self.assertTrue(com.isnull(val)) + self.assertTrue(isnull(val)) series[2] = val - self.assertTrue(com.isnull(series[2])) + self.assertTrue(isnull(series[2])) def test_NaT_cast(self): result = Series([np.nan]).astype('period[D]') diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 2236d20975eee..518f69485004c 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -11,10 +11,11 @@ import pandas.util.testing as tm from pandas import (Series, DataFrame, Panel, Index, isnull, notnull, Timestamp) + +from pandas.types.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict from pandas.core.base import SpecificationError -from pandas.core.common import (ABCSeries, ABCDataFrame, - UnsupportedFunctionCall) +from pandas.core.common import UnsupportedFunctionCall from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS from pandas.tseries.frequencies import to_offset diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index e594d31e57296..299ec374567e7 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -12,6 +12,7 @@ import pandas.lib as lib import pandas.tslib as tslib +from pandas.types.common import is_datetime64_ns_dtype import pandas as pd import pandas.compat as compat import pandas.core.common as com @@ -2282,7 +2283,7 @@ def test_to_datetime_tz_psycopg2(self): i = pd.DatetimeIndex([ '2000-01-01 08:00:00+00:00' ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) - self.assertFalse(com.is_datetime64_ns_dtype(i)) + self.assertFalse(is_datetime64_ns_dtype(i)) # tz coerceion result = pd.to_datetime(i, errors='coerce') diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 71a041d5139a2..470aafafec547 100644 --- 
a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -5,6 +5,7 @@ import numpy as np import pytz +from pandas.types.dtypes import DatetimeTZDtype from pandas import (Index, Series, DataFrame, isnull, Timestamp) from pandas import DatetimeIndex, to_datetime, NaT @@ -17,7 +18,6 @@ from pytz import NonExistentTimeError import pandas.util.testing as tm -from pandas.types.api import DatetimeTZDtype from pandas.util.testing import assert_frame_equal, set_timezone from pandas.compat import lrange, zip diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 5a28218500858..7f28ec86ec40d 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -4,9 +4,11 @@ import numpy as np import pandas.tslib as tslib -from pandas.core.common import (ABCSeries, is_integer_dtype, - is_timedelta64_dtype, is_list_like, - _ensure_object, ABCIndexClass) +from pandas.types.common import (_ensure_object, + is_integer_dtype, + is_timedelta64_dtype, + is_list_like) +from pandas.types.generic import ABCSeries, ABCIndexClass from pandas.util.decorators import deprecate_kwarg diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index efb8590dfccf4..067e8ec19f644 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -4,8 +4,17 @@ import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.common as com -from pandas.core.common import ABCIndexClass, ABCSeries, ABCDataFrame + +from pandas.types.common import (_ensure_object, + is_datetime64_ns_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_list_like) +from pandas.types.generic import (ABCIndexClass, ABCSeries, + ABCDataFrame) +from pandas.types.missing import notnull + import pandas.compat as compat from pandas.util.decorators import deprecate_kwarg @@ -161,7 +170,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element - non_nan_elements = com.notnull(arr).nonzero()[0] + non_nan_elements = notnull(arr).nonzero()[0] if len(non_nan_elements): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) @@ -307,7 +316,7 @@ def _convert_listlike(arg, box, format, name=None): arg = np.array(arg, dtype='O') # these are shortcutable - if com.is_datetime64_ns_dtype(arg): + if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, @@ -317,7 +326,7 @@ def _convert_listlike(arg, box, format, name=None): return arg - elif com.is_datetime64tz_dtype(arg): + elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz='utc' if utc else None) if utc: @@ -342,7 +351,7 @@ def _convert_listlike(arg, box, format, name=None): raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') - arg = com._ensure_object(arg) + arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: @@ -399,7 +408,7 @@ def _convert_listlike(arg, box, format, name=None): require_iso8601=require_iso8601 ) - if com.is_datetime64_dtype(result) and box: + if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) @@ -424,7 +433,7 @@ def _convert_listlike(arg, box, format, name=None): return _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, 
name=arg.name) - elif com.is_list_like(arg): + elif is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0] @@ -511,7 +520,7 @@ def coerce(values): values = to_numeric(values, errors=errors) # prevent overflow in case of int8 or int16 - if com.is_integer_dtype(values): + if is_integer_dtype(values): values = values.astype('int64', copy=False) return values @@ -574,7 +583,7 @@ def calc_with_mask(carg, mask): # a float with actual np.nan try: carg = arg.astype(np.float64) - return calc_with_mask(carg, com.notnull(carg)) + return calc_with_mask(carg, notnull(carg)) except: pass @@ -654,7 +663,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): def _guess_time_format_for_array(arr): # Try to guess the format based on the first non-NaN element - non_nan_elements = com.notnull(arr).nonzero()[0] + non_nan_elements = notnull(arr).nonzero()[0] if len(non_nan_elements): element = arr[non_nan_elements[0]] for time_format in _time_formats: @@ -705,7 +714,7 @@ def _convert_listlike(arg, format): raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') - arg = com._ensure_object(arg) + arg = _ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) @@ -762,7 +771,7 @@ def _convert_listlike(arg, format): return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) - elif com.is_list_like(arg): + elif is_list_like(arg): return _convert_listlike(arg, format) return _convert_listlike(np.array([arg]), format)[0] diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 7e314657cb25c..98a93d22b09a6 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -1,6 +1,6 @@ from pandas.compat import lrange import numpy as np -import pandas.core.common as com +from pandas.types.common import _ensure_platform_int from pandas.core.frame import DataFrame import pandas.core.nanops as nanops @@ -69,7 +69,7 @@ def pivot_annual(series, freq=None): raise NotImplementedError(freq) flat_index = (year - years.min()) * width + offset - flat_index = com._ensure_platform_int(flat_index) + flat_index = _ensure_platform_int(flat_index) values = np.empty((len(years), width)) values.fill(np.nan) diff --git a/pandas/types/api.py b/pandas/types/api.py index 721d8d29bba8b..2d68e041f632e 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -1,75 +1,54 @@ # flake8: noqa import numpy as np -from pandas.compat import string_types -from .dtypes import (CategoricalDtype, CategoricalDtypeType, - DatetimeTZDtype, DatetimeTZDtypeType) -from .generic import (ABCIndex, ABCInt64Index, ABCRangeIndex, - ABCFloat64Index, ABCMultiIndex, - ABCDatetimeIndex, - ABCTimedeltaIndex, ABCPeriodIndex, - ABCCategoricalIndex, - ABCIndexClass, - ABCSeries, ABCDataFrame, ABCPanel, - ABCSparseSeries, ABCSparseArray, - ABCCategorical, ABCPeriod, - ABCGeneric) - -def pandas_dtype(dtype): - """ - Converts input into a pandas only dtype object or a numpy dtype object. 
- - Parameters - ---------- - dtype : object to be converted - - Returns - ------- - np.dtype or a pandas dtype - """ - if isinstance(dtype, DatetimeTZDtype): - return dtype - elif isinstance(dtype, CategoricalDtype): - return dtype - elif isinstance(dtype, string_types): - try: - return DatetimeTZDtype.construct_from_string(dtype) - except TypeError: - pass - - try: - return CategoricalDtype.construct_from_string(dtype) - except TypeError: - pass - - return np.dtype(dtype) - -def na_value_for_dtype(dtype): - """ - Return a dtype compat na value - - Parameters - ---------- - dtype : string / dtype - - Returns - ------- - dtype compat na value - """ - - from pandas.core import common as com - from pandas import NaT - dtype = pandas_dtype(dtype) - - if (com.is_datetime64_dtype(dtype) or - com.is_datetime64tz_dtype(dtype) or - com.is_timedelta64_dtype(dtype)): - return NaT - elif com.is_float_dtype(dtype): - return np.nan - elif com.is_integer_dtype(dtype): - return 0 - elif com.is_bool_dtype(dtype): - return False - return np.nan +from .common import (pandas_dtype, + is_dtype_equal, + is_extension_type, + + # categorical + is_categorical, + is_categorical_dtype, + + # datetimelike + is_datetimetz, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime64_any_dtype, + is_datetime64_ns_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + + # string-like + is_string_dtype, + is_object_dtype, + + # sparse + is_sparse, + + # numeric types + is_scalar, + is_sparse, + is_bool, + is_integer, + is_float, + is_complex, + is_number, + is_any_int_dtype, + is_integer_dtype, + is_int64_dtype, + is_numeric_dtype, + is_float_dtype, + is_floating_dtype, + is_bool_dtype, + is_complex_dtype, + + # like + is_re, + is_re_compilable, + is_dict_like, + is_iterator, + is_list_like, + is_hashable, + is_named_tuple, + is_sequence) diff --git a/pandas/types/cast.py b/pandas/types/cast.py new file mode 100644 index 0000000000000..e55cb91d36430 --- /dev/null +++ b/pandas/types/cast.py @@ -0,0 +1,860 @@ +""" routings for casting """ + +from datetime import datetime, timedelta +import numpy as np +from pandas import lib, tslib +from pandas.tslib import iNaT +from pandas.compat import string_types, text_type, PY3 +from .common import (_ensure_object, is_bool, is_integer, is_float, + is_complex, is_datetimetz, is_categorical_dtype, + is_extension_type, is_object_dtype, + is_datetime64tz_dtype, is_datetime64_dtype, + is_timedelta64_dtype, is_dtype_equal, + is_float_dtype, is_complex_dtype, + is_integer_dtype, is_datetime_or_timedelta_dtype, + is_scalar, + _string_dtypes, + _coerce_to_dtype, + _ensure_int8, _ensure_int16, + _ensure_int32, _ensure_int64, + _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, + _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) +from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries +from .missing import isnull, notnull +from .inference import is_list_like + +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max +_int64_max = np.iinfo(np.int64).max + + +def _possibly_convert_platform(values): + """ try to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list, tuple)): + values = lib.list_to_object_array(values) + if getattr(values, 'dtype', None) == np.object_: + if hasattr(values, '_values'): + values = values._values + values = lib.maybe_convert_objects(values) + + return values + + +def _possibly_downcast_to_dtype(result, dtype): + """ try to cast to the specified dtype (e.g. 
convert back to bool/int + or could be an astype of float64->float32 + """ + + if is_scalar(result): + return result + + def trans(x): + return x + + if isinstance(dtype, string_types): + if dtype == 'infer': + inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) + if inferred_type == 'boolean': + dtype = 'bool' + elif inferred_type == 'integer': + dtype = 'int64' + elif inferred_type == 'datetime64': + dtype = 'datetime64[ns]' + elif inferred_type == 'timedelta64': + dtype = 'timedelta64[ns]' + + # try to upcast here + elif inferred_type == 'floating': + dtype = 'int64' + if issubclass(result.dtype.type, np.number): + + def trans(x): # noqa + return x.round() + else: + dtype = 'object' + + if isinstance(dtype, string_types): + dtype = np.dtype(dtype) + + try: + + # don't allow upcasts here (except if empty) + if dtype.kind == result.dtype.kind: + if (result.dtype.itemsize <= dtype.itemsize and + np.prod(result.shape)): + return result + + if issubclass(dtype.type, np.floating): + return result.astype(dtype) + elif dtype == np.bool_ or issubclass(dtype.type, np.integer): + + # if we don't have any elements, just astype it + if not np.prod(result.shape): + return trans(result).astype(dtype) + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([r[0]]) + + # if we have any nulls, then we are done + if isnull(arr).any() or not np.allclose(arr, + trans(arr).astype(dtype)): + return result + + # a comparable, e.g. a Decimal may slip in here + elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, + float, bool)): + return result + + if (issubclass(result.dtype.type, (np.object_, np.number)) and + notnull(result).all()): + new_result = trans(result).astype(dtype) + try: + if np.allclose(new_result, result): + return new_result + except: + + # comparison of an object dtype with a number type could + # hit here + if (new_result == result).all(): + return new_result + + # a datetimelike + elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: + try: + result = result.astype(dtype) + except: + if dtype.tz: + # convert to datetime and change timezone + from pandas import to_datetime + result = to_datetime(result).tz_localize(dtype.tz) + + except: + pass + + return result + + +def _maybe_upcast_putmask(result, mask, other): + """ + A safe version of putmask that potentially upcasts the result + + Parameters + ---------- + result : ndarray + The destination array. This will be mutated in-place if no upcasting is + necessary. 
+ mask : boolean ndarray + other : ndarray or scalar + The source array or value + + Returns + ------- + result : ndarray + changed : boolean + Set to true if the result array was upcasted + """ + + if mask.any(): + # Two conversions for date-like dtypes that can't be done automatically + # in np.place: + # NaN -> NaT + # integer or integer array -> date-like array + if result.dtype in _DATELIKE_DTYPES: + if is_scalar(other): + if isnull(other): + other = result.dtype.type('nat') + elif is_integer(other): + other = np.array(other, dtype=result.dtype) + elif is_integer_dtype(other): + other = np.array(other, dtype=result.dtype) + + def changeit(): + + # try to directly set by expanding our array to full + # length of the boolean + try: + om = other[mask] + om_at = om.astype(result.dtype) + if (om == om_at).all(): + new_result = result.values.copy() + new_result[mask] = om_at + result[:] = new_result + return result, False + except: + pass + + # we are forced to change the dtype of the result as the input + # isn't compatible + r, _ = _maybe_upcast(result, fill_value=other, copy=True) + np.place(r, mask, other) + + return r, True + + # we want to decide whether place will work + # if we have nans in the False portion of our mask then we need to + # upcast (possibly), otherwise we DON't want to upcast (e.g. if we + # have values, say integers, in the success portion then it's ok to not + # upcast) + new_dtype, _ = _maybe_promote(result.dtype, other) + if new_dtype != result.dtype: + + # we have a scalar or len 0 ndarray + # and its nan and we are changing some values + if (is_scalar(other) or + (isinstance(other, np.ndarray) and other.ndim < 1)): + if isnull(other): + return changeit() + + # we have an ndarray and the masking has nans in it + else: + + if isnull(other[mask]).any(): + return changeit() + + try: + np.place(result, mask, other) + except: + return changeit() + + return result, False + + +def _maybe_promote(dtype, fill_value=np.nan): + + # if we passed an array here, determine the fill value by dtype + if isinstance(fill_value, np.ndarray): + if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): + fill_value = iNaT + else: + + # we need to change to object type as our + # fill_value is of object type + if fill_value.dtype == np.object_: + dtype = np.dtype(np.object_) + fill_value = np.nan + + # returns tuple of (dtype, fill_value) + if issubclass(dtype.type, (np.datetime64, np.timedelta64)): + # for now: refuse to upcast datetime64 + # (this is because datetime64 will not implicitly upconvert + # to object correctly as of numpy 1.6.1) + if isnull(fill_value): + fill_value = iNaT + else: + if issubclass(dtype.type, np.datetime64): + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast + # to object (but numpy 1.6.1 doesn't do this properly) + fill_value = iNaT + elif issubclass(dtype.type, np.timedelta64): + try: + fill_value = lib.Timedelta(fill_value).value + except: + # as for datetimes, cannot upcast to object + fill_value = iNaT + else: + fill_value = iNaT + elif is_datetimetz(dtype): + if isnull(fill_value): + fill_value = iNaT + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, np.integer): + dtype = np.float64 + elif is_bool(fill_value): + if not issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, 
np.integer): + # upcast to prevent overflow + arr = np.asarray(fill_value) + if arr != arr.astype(dtype): + dtype = arr.dtype + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, (np.integer, np.floating)): + dtype = np.complex128 + elif fill_value is None: + if is_float_dtype(dtype) or is_complex_dtype(dtype): + fill_value = np.nan + elif is_integer_dtype(dtype): + dtype = np.float64 + fill_value = np.nan + elif is_datetime_or_timedelta_dtype(dtype): + fill_value = iNaT + else: + dtype = np.object_ + else: + dtype = np.object_ + + # in case we have a string that looked like a number + if is_categorical_dtype(dtype): + pass + elif is_datetimetz(dtype): + pass + elif issubclass(np.dtype(dtype).type, string_types): + dtype = np.object_ + + return dtype, fill_value + + +def _infer_dtype_from_scalar(val): + """ interpret the dtype from a scalar """ + + dtype = np.object_ + + # a 1-element ndarray + if isinstance(val, np.ndarray): + if val.ndim != 0: + raise ValueError( + "invalid ndarray passed to _infer_dtype_from_scalar") + + dtype = val.dtype + val = val.item() + + elif isinstance(val, string_types): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + + dtype = np.object_ + + elif isinstance(val, (np.datetime64, + datetime)) and getattr(val, 'tzinfo', None) is None: + val = lib.Timestamp(val).value + dtype = np.dtype('M8[ns]') + + elif isinstance(val, (np.timedelta64, timedelta)): + val = lib.Timedelta(val).value + dtype = np.dtype('m8[ns]') + + elif is_bool(val): + dtype = np.bool_ + + elif is_integer(val): + if isinstance(val, np.integer): + dtype = type(val) + else: + dtype = np.int64 + + elif is_float(val): + if isinstance(val, np.floating): + dtype = type(val) + else: + dtype = np.float64 + + elif is_complex(val): + dtype = np.complex_ + + return dtype, val + + +def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): + """ provide explict type promotion and coercion + + Parameters + ---------- + values : the ndarray that we want to maybe upcast + fill_value : what we want to fill with + dtype : if None, then use the dtype of the values, else coerce to this type + copy : if True always make a copy even if no upcast is required + """ + + if is_extension_type(values): + if copy: + values = values.copy() + else: + if dtype is None: + dtype = values.dtype + new_dtype, fill_value = _maybe_promote(dtype, fill_value) + if new_dtype != values.dtype: + values = values.astype(new_dtype) + elif copy: + values = values.copy() + + return values, fill_value + + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) + + +def _invalidate_string_dtypes(dtype_set): + """Change string like dtypes to object for + ``DataFrame.select_dtypes()``. + """ + non_string_dtypes = dtype_set - _string_dtypes + if non_string_dtypes != dtype_set: + raise TypeError("string dtypes are not allowed, use 'object' instead") + + +def _maybe_convert_string_to_object(values): + """ + + Convert string-like and string-like array to convert object dtype. 
+ This is to avoid numpy to handle the array as str dtype. + """ + if isinstance(values, string_types): + values = np.array([values], dtype=object) + elif (isinstance(values, np.ndarray) and + issubclass(values.dtype.type, (np.string_, np.unicode_))): + values = values.astype(object) + return values + + +def _maybe_convert_scalar(values): + """ + Convert a python scalar to the appropriate numpy dtype if possible + This avoids numpy directly converting according to platform preferences + """ + if is_scalar(values): + dtype, values = _infer_dtype_from_scalar(values) + try: + values = dtype(values) + except TypeError: + pass + return values + + +def _coerce_indexer_dtype(indexer, categories): + """ coerce the indexer input array to the smallest dtype possible """ + l = len(categories) + if l < _int8_max: + return _ensure_int8(indexer) + elif l < _int16_max: + return _ensure_int16(indexer) + elif l < _int32_max: + return _ensure_int32(indexer) + return _ensure_int64(indexer) + + +def _coerce_to_dtypes(result, dtypes): + """ + given a dtypes and a result set, coerce the result elements to the + dtypes + """ + if len(result) != len(dtypes): + raise AssertionError("_coerce_to_dtypes requires equal len arrays") + + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + def conv(r, dtype): + try: + if isnull(r): + pass + elif dtype == _NS_DTYPE: + r = lib.Timestamp(r) + elif dtype == _TD_DTYPE: + r = _coerce_scalar_to_timedelta_type(r) + elif dtype == np.bool_: + # messy. non 0/1 integers do not get converted. + if is_integer(r) and r not in [0, 1]: + return int(r) + r = bool(r) + elif dtype.kind == 'f': + r = float(r) + elif dtype.kind == 'i': + r = int(r) + except: + pass + + return r + + return [conv(r, dtype) for r, dtype in zip(result, dtypes)] + + +def _astype_nansafe(arr, dtype, copy=True): + """ return a view if copy is False, but + need to be very careful as the result shape could change! 
""" + if not isinstance(dtype, np.dtype): + dtype = _coerce_to_dtype(dtype) + + if issubclass(dtype.type, text_type): + # in Py3 that's str, in Py2 that's unicode + return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + elif issubclass(dtype.type, string_types): + return lib.astype_str(arr.ravel()).reshape(arr.shape) + elif is_datetime64_dtype(arr): + if dtype == object: + return tslib.ints_to_pydatetime(arr.view(np.int64)) + elif dtype == np.int64: + return arr.view(dtype) + elif dtype != _NS_DTYPE: + raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % + (arr.dtype, dtype)) + return arr.astype(_NS_DTYPE) + elif is_timedelta64_dtype(arr): + if dtype == np.int64: + return arr.view(dtype) + elif dtype == object: + return tslib.ints_to_pytimedelta(arr.view(np.int64)) + + # in py3, timedelta64[ns] are int64 + elif ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or + (not PY3 and dtype != _TD_DTYPE)): + + # allow frequency conversions + if dtype.kind == 'm': + mask = isnull(arr) + result = arr.astype(dtype).astype(np.float64) + result[mask] = np.nan + return result + + raise TypeError("cannot astype a timedelta from [%s] to [%s]" % + (arr.dtype, dtype)) + + return arr.astype(_TD_DTYPE) + elif (np.issubdtype(arr.dtype, np.floating) and + np.issubdtype(dtype, np.integer)): + + if np.isnan(arr).any(): + raise ValueError('Cannot convert NA to integer') + elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): + # work around NumPy brokenness, #1987 + return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) + + if copy: + return arr.astype(dtype) + return arr.view(dtype) + + +def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, + convert_timedeltas=True, copy=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + # if we have passed in a list or scalar + if isinstance(values, (list, tuple)): + values = np.array(values, dtype=np.object_) + if not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + + # convert dates + if convert_dates and values.dtype == np.object_: + + # we take an aggressive stance and convert to datetime64[ns] + if convert_dates == 'coerce': + new_values = _possibly_cast_to_datetime(values, 'M8[ns]', + errors='coerce') + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects(values, + convert_datetime=convert_dates) + + # convert timedeltas + if convert_timedeltas and values.dtype == np.object_: + + if convert_timedeltas == 'coerce': + from pandas.tseries.timedeltas import to_timedelta + new_values = to_timedelta(values, coerce=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects( + values, convert_timedelta=convert_timedeltas) + + # convert to numeric + if values.dtype == np.object_: + if convert_numeric: + try: + new_values = lib.maybe_convert_numeric(values, set(), + coerce_numeric=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + except: + pass + else: + # soft-conversion + values = lib.maybe_convert_objects(values) + + values = values.copy() if copy else values + + return values + + +def _soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, + coerce=False, copy=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + conversion_count = sum((datetime, 
numeric, timedelta)) + if conversion_count == 0: + raise ValueError('At least one of datetime, numeric or timedelta must ' + 'be True.') + elif conversion_count > 1 and coerce: + raise ValueError("Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True.") + + if isinstance(values, (list, tuple)): + # List or scalar + values = np.array(values, dtype=np.object_) + elif not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + elif not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + values = values.copy() if copy else values + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + # Immediate return if coerce + if datetime: + from pandas import to_datetime + return to_datetime(values, errors='coerce', box=False) + elif timedelta: + from pandas import to_timedelta + return to_timedelta(values, errors='coerce', box=False) + elif numeric: + from pandas import to_numeric + return to_numeric(values, errors='coerce') + + # Soft conversions + if datetime: + values = lib.maybe_convert_objects(values, convert_datetime=datetime) + + if timedelta and is_object_dtype(values.dtype): + # Object check to ensure only run if previous did not convert + values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) + + if numeric and is_object_dtype(values.dtype): + try: + converted = lib.maybe_convert_numeric(values, set(), + coerce_numeric=True) + # If all NaNs, then do not-alter + values = converted if not isnull(converted).all() else values + values = values.copy() if copy else values + except: + pass + + return values + + +def _possibly_castable(arr): + # return False to force a non-fastpath + + # check datetime64[ns]/timedelta64[ns] are valid + # otherwise try to coerce + kind = arr.dtype.kind + if kind == 'M' or kind == 'm': + return arr.dtype in _DATELIKE_DTYPES + + return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + + +def _possibly_infer_to_datetimelike(value, convert_dates=False): + """ + we might have a array (or single object) that is datetime like, + and no dtype is passed don't change the value unless we find a + datetime/timedelta set + + this is pretty strict in that a datetime/timedelta is REQUIRED + in addition to possible nulls/string likes + + ONLY strings are NOT datetimelike + + Parameters + ---------- + value : np.array / Series / Index / list-like + convert_dates : boolean, default False + if True try really hard to convert dates (such as datetime.date), other + leave inferred dtype 'date' alone + + """ + + if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex)): + return value + elif isinstance(value, ABCSeries): + if isinstance(value._values, ABCDatetimeIndex): + return value._values + + v = value + + if not is_list_like(v): + v = [v] + v = np.array(v, copy=False) + shape = v.shape + if not v.ndim == 1: + v = v.ravel() + + if len(v): + + def _try_datetime(v): + # safe coerce to datetime64 + try: + v = tslib.array_to_datetime(v, errors='raise') + except ValueError: + + # we might have a sequence of the same-datetimes with tz's + # if so coerce to a DatetimeIndex; if they are not the same, + # then these stay as object dtype + try: + from pandas import to_datetime + return to_datetime(v) + except: + pass + + except: + pass + + return v.reshape(shape) + + def _try_timedelta(v): + # safe coerce to timedelta64 + + # will try first with a string & object conversion + from pandas import to_timedelta + try: + return to_timedelta(v)._values.reshape(shape) + except: + return 
v + + # do a quick inference for perf + sample = v[:min(3, len(v))] + inferred_type = lib.infer_dtype(sample) + + if (inferred_type in ['datetime', 'datetime64'] or + (convert_dates and inferred_type in ['date'])): + value = _try_datetime(v) + elif inferred_type in ['timedelta', 'timedelta64']: + value = _try_timedelta(v) + + # It's possible to have nulls intermixed within the datetime or + # timedelta. These will in general have an inferred_type of 'mixed', + # so have to try both datetime and timedelta. + + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but technically is also a datetime + elif inferred_type in ['mixed']: + + if lib.is_possible_datetimelike_array(_ensure_object(v)): + value = _try_timedelta(v) + if lib.infer_dtype(value) in ['mixed']: + value = _try_datetime(v) + + return value + + +def _possibly_cast_to_datetime(value, dtype, errors='raise'): + """ try to cast the array/value to a datetimelike dtype, converting float + nan to iNaT + """ + from pandas.tseries.timedeltas import to_timedelta + from pandas.tseries.tools import to_datetime + + if dtype is not None: + if isinstance(dtype, string_types): + dtype = np.dtype(dtype) + + is_datetime64 = is_datetime64_dtype(dtype) + is_datetime64tz = is_datetime64tz_dtype(dtype) + is_timedelta64 = is_timedelta64_dtype(dtype) + + if is_datetime64 or is_datetime64tz or is_timedelta64: + + # force the dtype if needed + if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): + if dtype.name == 'datetime64[ns]': + dtype = _NS_DTYPE + else: + raise TypeError("cannot convert datetimelike to " + "dtype [%s]" % dtype) + elif is_datetime64tz: + + # our NaT doesn't support tz's + # this will coerce to DatetimeIndex with + # a matching dtype below + if is_scalar(value) and isnull(value): + value = [value] + + elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): + if dtype.name == 'timedelta64[ns]': + dtype = _TD_DTYPE + else: + raise TypeError("cannot convert timedeltalike to " + "dtype [%s]" % dtype) + + if is_scalar(value): + if value == tslib.iNaT or isnull(value): + value = tslib.iNaT + else: + value = np.array(value, copy=False) + + # have a scalar array-like (e.g. 
NaT) + if value.ndim == 0: + value = tslib.iNaT + + # we have an array of datetime or timedeltas & nulls + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, + dtype): + try: + if is_datetime64: + value = to_datetime(value, errors=errors)._values + elif is_datetime64tz: + # input has to be UTC at this point, so just + # localize + value = to_datetime( + value, + errors=errors).tz_localize(dtype.tz) + elif is_timedelta64: + value = to_timedelta(value, errors=errors)._values + except (AttributeError, ValueError, TypeError): + pass + + # coerce datetimelike to object + elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): + if is_object_dtype(dtype): + ints = np.asarray(value).view('i8') + return tslib.ints_to_pydatetime(ints) + + # we have a non-castable dtype that was passed + raise TypeError('Cannot cast datetime64 to %s' % dtype) + + else: + + is_array = isinstance(value, np.ndarray) + + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified + if is_array and value.dtype.kind in ['M', 'm']: + dtype = value.dtype + + if dtype.kind == 'M' and dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) + + elif dtype.kind == 'm' and dtype != _TD_DTYPE: + value = to_timedelta(value) + + # only do this if we have an array and the dtype of the array is not + # setup already we are not an integer/object, so don't bother with this + # conversion + elif not (is_array and not (issubclass(value.dtype.type, np.integer) or + value.dtype == np.object_)): + value = _possibly_infer_to_datetimelike(value) + + return value diff --git a/pandas/types/common.py b/pandas/types/common.py new file mode 100644 index 0000000000000..9d0ccaac843ef --- /dev/null +++ b/pandas/types/common.py @@ -0,0 +1,448 @@ +""" common type operations """ + +import numpy as np +from pandas.compat import string_types, text_type, binary_type +from pandas import lib, algos +from .dtypes import (CategoricalDtype, CategoricalDtypeType, + DatetimeTZDtype, DatetimeTZDtypeType, + ExtensionDtype) +from .generic import (ABCCategorical, ABCPeriodIndex, + ABCDatetimeIndex, ABCSeries, + ABCSparseArray, ABCSparseSeries) +from .inference import is_integer, is_string_like +from .inference import * # noqa + + +_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name + for t in ['O', 'int8', 'uint8', 'int16', 'uint16', + 'int32', 'uint32', 'int64', 'uint64']]) + +_NS_DTYPE = np.dtype('M8[ns]') +_TD_DTYPE = np.dtype('m8[ns]') +_INT64_DTYPE = np.dtype(np.int64) +_DATELIKE_DTYPES = set([np.dtype(t) + for t in ['M8[ns]', 'M8[ns]', + 'm8[ns]', 'm8[ns]']]) + +_ensure_float64 = algos.ensure_float64 +_ensure_float32 = algos.ensure_float32 + + +def _ensure_float(arr): + if issubclass(arr.dtype.type, (np.integer, np.bool_)): + arr = arr.astype(float) + return arr + +_ensure_int64 = algos.ensure_int64 +_ensure_int32 = algos.ensure_int32 +_ensure_int16 = algos.ensure_int16 +_ensure_int8 = algos.ensure_int8 +_ensure_platform_int = algos.ensure_platform_int +_ensure_object = algos.ensure_object + + +def is_object_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.object_) + + +def is_sparse(array): + """ return if we are a sparse array """ + return isinstance(array, (ABCSparseArray, ABCSparseSeries)) + + +def is_categorical(array): + """ return if we are a categorical possibility """ + return isinstance(array, ABCCategorical) or is_categorical_dtype(array) + + +def is_datetimetz(array): + """ return if we are a datetime with tz array """ + return ((isinstance(array, ABCDatetimeIndex) and + 
getattr(array, 'tz', None) is not None) or + is_datetime64tz_dtype(array)) + + +def is_datetime64_dtype(arr_or_dtype): + try: + tipo = _get_dtype_type(arr_or_dtype) + except TypeError: + return False + return issubclass(tipo, np.datetime64) + + +def is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtype.is_dtype(arr_or_dtype) + + +def is_timedelta64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.timedelta64) + + +def is_categorical_dtype(arr_or_dtype): + return CategoricalDtype.is_dtype(arr_or_dtype) + + +def is_string_dtype(arr_or_dtype): + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('O', 'S', 'U') + + +def is_period_arraylike(arr): + """ return if we are period arraylike / PeriodIndex """ + if isinstance(arr, ABCPeriodIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'period' + return getattr(arr, 'inferred_type', None) == 'period' + + +def is_datetime_arraylike(arr): + """ return if we are datetime arraylike / DatetimeIndex """ + if isinstance(arr, ABCDatetimeIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' + return getattr(arr, 'inferred_type', None) == 'datetime' + + +def is_datetimelike(arr): + return (arr.dtype in _DATELIKE_DTYPES or + isinstance(arr, ABCPeriodIndex) or + is_datetimetz(arr)) + + +def is_dtype_equal(source, target): + """ return a boolean if the dtypes are equal """ + try: + source = _get_dtype(source) + target = _get_dtype(target) + return source == target + except (TypeError, AttributeError): + + # invalid comparison + # object == category will hit this + return False + + +def is_any_int_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.integer) + + +def is_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + +def is_int_or_datetime_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) or + issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_datetime64_any_dtype(arr_or_dtype): + return (is_datetime64_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) + + +def is_datetime64_ns_dtype(arr_or_dtype): + try: + tipo = _get_dtype(arr_or_dtype) + except TypeError: + return False + return tipo == _NS_DTYPE + + +def is_timedelta64_ns_dtype(arr_or_dtype): + tipo = _get_dtype(arr_or_dtype) + return tipo == _TD_DTYPE + + +def is_datetime_or_timedelta_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, (np.datetime64, np.timedelta64)) + + +def is_numeric_v_string_like(a, b): + """ + numpy doesn't like to compare numeric arrays vs scalar string-likes + + return a boolean result if this is the case for a,b or b,a + + """ + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + is_a_numeric_array = is_a_array and is_numeric_dtype(a) + is_b_numeric_array = is_b_array and is_numeric_dtype(b) + is_a_string_array = is_a_array and is_string_like_dtype(a) + is_b_string_array = is_b_array and is_string_like_dtype(b) + + is_a_scalar_string_like = not is_a_array and is_string_like(a) + is_b_scalar_string_like = not is_b_array and is_string_like(b) + + return 
((is_a_numeric_array and is_b_scalar_string_like) or + (is_b_numeric_array and is_a_scalar_string_like) or + (is_a_numeric_array and is_b_string_array) or + (is_b_numeric_array and is_a_string_array)) + + +def is_datetimelike_v_numeric(a, b): + # return if we have an i8 convertible and numeric comparison + if not hasattr(a, 'dtype'): + a = np.asarray(a) + if not hasattr(b, 'dtype'): + b = np.asarray(b) + + def is_numeric(x): + return is_integer_dtype(x) or is_float_dtype(x) + + is_datetimelike = needs_i8_conversion + return ((is_datetimelike(a) and is_numeric(b)) or + (is_datetimelike(b) and is_numeric(a))) + + +def is_datetimelike_v_object(a, b): + # return if we have an i8 convertible and object comparsion + if not hasattr(a, 'dtype'): + a = np.asarray(a) + if not hasattr(b, 'dtype'): + b = np.asarray(b) + + def f(x): + return is_object_dtype(x) + + def is_object(x): + return is_integer_dtype(x) or is_float_dtype(x) + + is_datetimelike = needs_i8_conversion + return ((is_datetimelike(a) and is_object(b)) or + (is_datetimelike(b) and is_object(a))) + + +def needs_i8_conversion(arr_or_dtype): + return (is_datetime_or_timedelta_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) + + +def is_numeric_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, (np.number, np.bool_)) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_string_like_dtype(arr_or_dtype): + # exclude object as its a mixed dtype + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('S', 'U') + + +def is_float_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.floating) + + +def is_floating_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return isinstance(tipo, np.floating) + + +def is_bool_dtype(arr_or_dtype): + try: + tipo = _get_dtype_type(arr_or_dtype) + except ValueError: + # this isn't even a dtype + return False + return issubclass(tipo, np.bool_) + + +def is_extension_type(value): + """ + if we are a klass that is preserved by the internals + these are internal klasses that we represent (and don't use a np.array) + """ + if is_categorical(value): + return True + elif is_sparse(value): + return True + elif is_datetimetz(value): + return True + return False + + +def is_complex_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.complexfloating) + + +def _coerce_to_dtype(dtype): + """ coerce a string / np.dtype to a dtype """ + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + elif is_datetime64tz_dtype(dtype): + dtype = DatetimeTZDtype(dtype) + else: + dtype = np.dtype(dtype) + return dtype + + +def _get_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype) + elif isinstance(arr_or_dtype, CategoricalDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, DatetimeTZDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtype.construct_from_string(arr_or_dtype) + elif is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtype.construct_from_string(arr_or_dtype) + + if hasattr(arr_or_dtype, 'dtype'): + arr_or_dtype = arr_or_dtype.dtype + return np.dtype(arr_or_dtype) + + +def _get_dtype_type(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype.type + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype).type + elif 
isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtypeType + elif isinstance(arr_or_dtype, DatetimeTZDtype): + return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtypeType + elif is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtypeType + return _get_dtype_type(np.dtype(arr_or_dtype)) + try: + return arr_or_dtype.dtype.type + except AttributeError: + return type(None) + + +def _get_dtype_from_object(dtype): + """Get a numpy dtype.type-style object. This handles the datetime64[ns] + and datetime64[ns, TZ] compat + + Notes + ----- + If nothing can be found, returns ``object``. + """ + + # type object from a dtype + if isinstance(dtype, type) and issubclass(dtype, np.generic): + return dtype + elif is_categorical(dtype): + return CategoricalDtype().type + elif is_datetimetz(dtype): + return DatetimeTZDtype(dtype).type + elif isinstance(dtype, np.dtype): # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # should still pass if we don't have a datelike + pass + return dtype.type + elif isinstance(dtype, string_types): + if dtype == 'datetime' or dtype == 'timedelta': + dtype += '64' + + try: + return _get_dtype_from_object(getattr(np, dtype)) + except (AttributeError, TypeError): + # handles cases like _get_dtype(int) + # i.e., python objects that are valid dtypes (unlike user-defined + # types, in general) + # TypeError handles the float16 typecode of 'e' + # further handle internal types + pass + + return _get_dtype_from_object(np.dtype(dtype)) + + +def _validate_date_like_dtype(dtype): + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError('%s' % e) + if typ != 'generic' and typ != 'ns': + raise ValueError('%r is too specific of a frequency, try passing %r' % + (dtype.name, dtype.type.__name__)) + + +def _lcd_dtypes(a_dtype, b_dtype): + """ return the lcd dtype to hold these types """ + + if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): + return _NS_DTYPE + elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): + return _TD_DTYPE + elif is_complex_dtype(a_dtype): + if is_complex_dtype(b_dtype): + return a_dtype + return np.float64 + elif is_integer_dtype(a_dtype): + if is_integer_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + return np.int64 + return np.float64 + elif is_float_dtype(a_dtype): + if is_float_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + else: + return np.float64 + elif is_integer(b_dtype): + return np.float64 + return np.object + +_string_dtypes = frozenset(map(_get_dtype_from_object, (binary_type, + text_type))) + + +def pandas_dtype(dtype): + """ + Converts input into a pandas only dtype object or a numpy dtype object. 
+ + Parameters + ---------- + dtype : object to be converted + + Returns + ------- + np.dtype or a pandas dtype + """ + if isinstance(dtype, DatetimeTZDtype): + return dtype + elif isinstance(dtype, CategoricalDtype): + return dtype + elif isinstance(dtype, string_types): + try: + return DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + pass + + try: + return CategoricalDtype.construct_from_string(dtype) + except TypeError: + pass + elif isinstance(dtype, ExtensionDtype): + return dtype + + return np.dtype(dtype) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 44338f26eb2e8..3b30531fb30ac 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -3,10 +3,19 @@ """ import numpy as np -import pandas.core.common as com import pandas.tslib as tslib from pandas import compat from pandas.compat import map +from .common import (is_categorical_dtype, + is_sparse, + is_datetimetz, + is_datetime64_dtype, + is_timedelta64_dtype, + is_object_dtype, + is_bool_dtype, + is_dtype_equal, + _NS_DTYPE, + _TD_DTYPE) def get_dtype_kinds(l): @@ -24,19 +33,19 @@ def get_dtype_kinds(l): for arr in l: dtype = arr.dtype - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): typ = 'category' - elif com.is_sparse(arr): + elif is_sparse(arr): typ = 'sparse' - elif com.is_datetimetz(arr): + elif is_datetimetz(arr): typ = 'datetimetz' - elif com.is_datetime64_dtype(dtype): + elif is_datetime64_dtype(dtype): typ = 'datetime' - elif com.is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(dtype): typ = 'timedelta' - elif com.is_object_dtype(dtype): + elif is_object_dtype(dtype): typ = 'object' - elif com.is_bool_dtype(dtype): + elif is_bool_dtype(dtype): typ = 'bool' else: typ = dtype.kind @@ -51,14 +60,14 @@ def _get_series_result_type(result): """ if isinstance(result, dict): # concat Series with axis 1 - if all(com.is_sparse(c) for c in compat.itervalues(result)): + if all(is_sparse(c) for c in compat.itervalues(result)): from pandas.sparse.api import SparseDataFrame return SparseDataFrame else: from pandas.core.frame import DataFrame return DataFrame - elif com.is_sparse(result): + elif is_sparse(result): # concat Series with axis 1 from pandas.sparse.api import SparseSeries return SparseSeries @@ -165,7 +174,7 @@ def _concat_categorical(to_concat, axis=0): def convert_categorical(x): # coerce to object dtype - if com.is_categorical_dtype(x.dtype): + if is_categorical_dtype(x.dtype): return x.get_values() return x.ravel() @@ -177,7 +186,7 @@ def convert_categorical(x): # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything # else its a non-compat categorical - categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)] + categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] # validate the categories categories = categoricals[0] @@ -235,7 +244,7 @@ def union_categoricals(to_union): if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") - if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) + if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") @@ -272,7 +281,7 @@ def convert_to_pydatetime(x, axis): # coerce to an object dtype # if dtype is of datetimetz or timezone - if x.dtype.kind == com._NS_DTYPE.kind: + if x.dtype.kind == _NS_DTYPE.kind: if getattr(x, 'tz', None) is not None: x = x.asobject.values else: @@ 
-280,7 +289,7 @@ def convert_to_pydatetime(x, axis): x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) x = x.reshape(shape) - elif x.dtype == com._TD_DTYPE: + elif x.dtype == _TD_DTYPE: shape = x.shape x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) x = x.reshape(shape) @@ -310,12 +319,12 @@ def convert_to_pydatetime(x, axis): elif 'datetime' in typs: new_values = np.concatenate([x.view(np.int64) for x in to_concat], axis=axis) - return new_values.view(com._NS_DTYPE) + return new_values.view(_NS_DTYPE) elif 'timedelta' in typs: new_values = np.concatenate([x.view(np.int64) for x in to_concat], axis=axis) - return new_values.view(com._TD_DTYPE) + return new_values.view(_TD_DTYPE) # need to coerce to object to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] @@ -350,7 +359,7 @@ def convert_sparse(x, axis): return x if typs is None: - typs = com.get_dtype_kinds(to_concat) + typs = get_dtype_kinds(to_concat) if len(typs) == 1: # concat input as it is if all inputs are sparse @@ -374,7 +383,7 @@ def convert_sparse(x, axis): # input may be sparse / dense mixed and may have different fill_value # input must contain sparse at least 1 - sparses = [c for c in to_concat if com.is_sparse(c)] + sparses = [c for c in to_concat if is_sparse(c)] fill_values = [c.fill_value for c in sparses] sp_indexes = [c.sp_index for c in sparses] diff --git a/pandas/types/inference.py b/pandas/types/inference.py new file mode 100644 index 0000000000000..35a2dc2fb831b --- /dev/null +++ b/pandas/types/inference.py @@ -0,0 +1,104 @@ +""" basic inference routines """ + +import collections +import re +import numpy as np +from numbers import Number +from pandas.compat import (string_types, text_type, + string_and_binary_types) +from pandas import lib + +is_bool = lib.is_bool + +is_integer = lib.is_integer + +is_float = lib.is_float + +is_complex = lib.is_complex + +is_scalar = lib.isscalar + + +def is_number(obj): + return isinstance(obj, (Number, np.number)) + + +def is_string_like(obj): + return isinstance(obj, (text_type, string_types)) + + +def _iterable_not_string(x): + return (isinstance(x, collections.Iterable) and + not isinstance(x, string_types)) + + +def is_iterator(obj): + # python 3 generators have __next__ instead of next + return hasattr(obj, 'next') or hasattr(obj, '__next__') + + +def is_re(obj): + return isinstance(obj, re._pattern_type) + + +def is_re_compilable(obj): + try: + re.compile(obj) + except TypeError: + return False + else: + return True + + +def is_list_like(arg): + return (hasattr(arg, '__iter__') and + not isinstance(arg, string_and_binary_types)) + + +def is_dict_like(arg): + return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') + + +def is_named_tuple(arg): + return isinstance(arg, tuple) and hasattr(arg, '_fields') + + +def is_hashable(arg): + """Return True if hash(arg) will succeed, False otherwise. + + Some types will pass a test against collections.Hashable but fail when they + are actually hashed with hash(). + + Distinguish between these and other types by trying the call to hash() and + seeing if they raise TypeError. 
+ + Examples + -------- + >>> a = ([],) + >>> isinstance(a, collections.Hashable) + True + >>> is_hashable(a) + False + """ + # unfortunately, we can't use isinstance(arg, collections.Hashable), which + # can be faster than calling hash, because numpy scalars on Python 3 fail + # this test + + # reconsider this decision once this numpy bug is fixed: + # https://github.com/numpy/numpy/issues/5562 + + try: + hash(arg) + except TypeError: + return False + else: + return True + + +def is_sequence(x): + try: + iter(x) + len(x) # it has a length + return not isinstance(x, string_and_binary_types) + except (TypeError, AttributeError): + return False diff --git a/pandas/types/missing.py b/pandas/types/missing.py new file mode 100644 index 0000000000000..8b4193d02beb7 --- /dev/null +++ b/pandas/types/missing.py @@ -0,0 +1,394 @@ +""" +missing types & inference +""" +import numpy as np +from pandas import lib +from pandas.tslib import NaT, iNaT +from .generic import (ABCMultiIndex, ABCSeries, + ABCIndexClass, ABCGeneric) +from .common import (is_string_dtype, is_datetimelike, + is_datetimelike_v_numeric, is_float_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, + is_complex_dtype, is_categorical_dtype, + is_string_like_dtype, is_bool_dtype, + is_integer_dtype, is_dtype_equal, + needs_i8_conversion, _ensure_object, + pandas_dtype, + is_scalar, + is_object_dtype, + is_integer, + _TD_DTYPE, + _NS_DTYPE, + _DATELIKE_DTYPES) +from .inference import is_list_like + + +def isnull(obj): + """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) + + Parameters + ---------- + arr : ndarray or object value + Object to check for null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is null or if an array is + given which of the element is null. + + See also + -------- + pandas.notnull: boolean inverse of pandas.isnull + """ + return _isnull(obj) + + +def _isnull_new(obj): + if is_scalar(obj): + return lib.checknull(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + return _isnull_ndarraylike(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=isnull)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike(np.asarray(obj)) + else: + return obj is None + + +def _isnull_old(obj): + """Detect missing values. Treat None, NaN, INF, -INF as null. + + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + """ + if is_scalar(obj): + return lib.checknull_old(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + return _isnull_ndarraylike_old(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=_isnull_old)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike_old(np.asarray(obj)) + else: + return obj is None + + +_isnull = _isnull_new + + +def _use_inf_as_null(key): + """Option change callback for null/inf behaviour + Choose which replacement for numpy.isnan / -numpy.isfinite is used. 
+ + Parameters + ---------- + flag: bool + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). + + Notes + ----- + This approach to setting global module values is discussed and + approved here: + + * http://stackoverflow.com/questions/4859217/ + programmatically-creating-variables-in-python/4859312#4859312 + """ + from pandas.core.config import get_option + flag = get_option(key) + if flag: + globals()['_isnull'] = _isnull_old + else: + globals()['_isnull'] = _isnull_new + + +def _isnull_ndarraylike(obj): + + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if is_string_dtype(dtype): + if is_categorical_dtype(values): + from pandas import Categorical + if not isinstance(values, Categorical): + values = values.values + result = values.isnull() + else: + + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] = vec.reshape(shape) + + elif is_datetimelike(obj): + # this is the NaT pattern + result = values.view('i8') == iNaT + else: + result = np.isnan(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def _isnull_ndarraylike_old(obj): + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj_old(values.ravel()) + result[:] = vec.reshape(shape) + + elif dtype in _DATELIKE_DTYPES: + # this is the NaT pattern + result = values.view('i8') == iNaT + else: + result = ~np.isfinite(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def notnull(obj): + """Replacement for numpy.isfinite / -numpy.isnan which is suitable for use + on object arrays. + + Parameters + ---------- + arr : ndarray or object value + Object to check for *not*-null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is *not* null or if an array + is given which of the element is *not* null. + + See also + -------- + pandas.isnull : boolean inverse of pandas.notnull + """ + res = isnull(obj) + if is_scalar(res): + return not res + return ~res + + +def is_null_datelike_scalar(other): + """ test whether the object is a null datelike, e.g. 
Nat + but guard against passing a non-scalar """ + if other is NaT or other is None: + return True + elif is_scalar(other): + + # a timedelta + if hasattr(other, 'dtype'): + return other.view('i8') == iNaT + elif is_integer(other) and other == iNaT: + return True + return isnull(other) + return False + + +def _is_na_compat(arr, fill_value=np.nan): + """ + Parameters + ---------- + arr: a numpy array + fill_value: fill value, default to np.nan + + Returns + ------- + True if we can fill using this fill_value + """ + dtype = arr.dtype + if isnull(fill_value): + return not (is_bool_dtype(dtype) or + is_integer_dtype(dtype)) + return True + + +def array_equivalent(left, right, strict_nan=False): + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs + in corresponding locations. False otherwise. It is assumed that left and + right are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : ndarrays + strict_nan : bool, default False + If True, consider NaN and None to be different. + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent( + ... np.array([1, 2, np.nan]), + ... np.array([1, 2, np.nan])) + True + >>> array_equivalent( + ... np.array([1, np.nan, 2]), + ... np.array([1, 2, np.nan])) + False + """ + + left, right = np.asarray(left), np.asarray(right) + + # shape compat + if left.shape != right.shape: + return False + + # Object arrays can contain None, NaN and NaT. + # string dtypes must be come to this path for NumPy 1.7.1 compat + if is_string_dtype(left) or is_string_dtype(right): + + if not strict_nan: + # isnull considers NaN and None to be equivalent. + return lib.array_equivalent_object( + _ensure_object(left.ravel()), _ensure_object(right.ravel())) + + for left_value, right_value in zip(left, right): + if left_value is NaT and right_value is not NaT: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if (not isinstance(right_value, float) or + not np.isnan(right_value)): + return False + else: + if left_value != right_value: + return False + return True + + # NaNs can occur in float and complex arrays. + if is_float_dtype(left) or is_complex_dtype(left): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + # numpy will will not allow this type of datetimelike vs integer comparison + elif is_datetimelike_v_numeric(left, right): + return False + + # M8/m8 + elif needs_i8_conversion(left) and needs_i8_conversion(right): + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.view('i8') + right = right.view('i8') + + # NaNs cannot occur otherwise. + try: + return np.array_equal(left, right) + except AttributeError: + # see gh-13388 + # + # NumPy v1.7.1 has a bug in its array_equal + # function that prevents it from correctly + # comparing two arrays with complex dtypes. 
+ # This bug is corrected in v1.8.0, so remove + # this try-except block as soon as we stop + # supporting NumPy versions < 1.8.0 + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.tolist() + right = right.tolist() + + return left == right + + +def _infer_fill_value(val): + """ + infer the fill value for the nan/NaT from the provided + scalar/ndarray/list-like if we are a NaT, return the correct dtyped + element to provide proper block construction + """ + + if not is_list_like(val): + val = [val] + val = np.array(val, copy=False) + if is_datetimelike(val): + return np.array('NaT', dtype=val.dtype) + elif is_object_dtype(val.dtype): + dtype = lib.infer_dtype(_ensure_object(val)) + if dtype in ['datetime', 'datetime64']: + return np.array('NaT', dtype=_NS_DTYPE) + elif dtype in ['timedelta', 'timedelta64']: + return np.array('NaT', dtype=_TD_DTYPE) + return np.nan + + +def _maybe_fill(arr, fill_value=np.nan): + """ + if we have a compatiable fill_value and arr dtype, then fill + """ + if _is_na_compat(arr, fill_value): + arr.fill(fill_value) + return arr + + +def na_value_for_dtype(dtype): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + + Returns + ------- + np.dtype or a pandas dtype + """ + dtype = pandas_dtype(dtype) + + if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or + is_timedelta64_dtype(dtype)): + return NaT + elif is_float_dtype(dtype): + return np.nan + elif is_integer_dtype(dtype): + return 0 + elif is_bool_dtype(dtype): + return False + return np.nan diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2961b2fb2241f..4442eed898b60 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -23,11 +23,14 @@ import numpy as np import pandas as pd -from pandas.core.common import (is_sequence, array_equivalent, - is_list_like, is_datetimelike_v_numeric, - is_datetimelike_v_object, - is_number, is_bool, - needs_i8_conversion, is_categorical_dtype) +from pandas.types.missing import array_equivalent +from pandas.types.common import (is_datetimelike_v_numeric, + is_datetimelike_v_object, + is_number, is_bool, + needs_i8_conversion, + is_categorical_dtype, + is_sequence, + is_list_like) from pandas.formats.printing import pprint_thing from pandas.core.algorithms import take_1d @@ -1001,17 +1004,20 @@ def assert_categorical_equal(left, right, check_dtype=True, assert_attr_equal('ordered', left, right, obj=obj) -def raise_assert_detail(obj, message, left, right): +def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) if isinstance(right, np.ndarray): right = pprint_thing(right) + if diff is not None: + diff = "\n[diff]: {diff}".format(diff=diff) + msg = """{0} are different {1} [left]: {2} -[right]: {3}""".format(obj, message, left, right) +[right]: {3}{4}""".format(obj, message, left, right, diff) raise AssertionError(msg) diff --git a/pandas/util/validators.py b/pandas/util/validators.py index bbfd24df9c13e..964fa9d9b38d5 100644 --- a/pandas/util/validators.py +++ b/pandas/util/validators.py @@ -3,6 +3,8 @@ for validating data or function arguments """ +from pandas.types.common import is_bool + def _check_arg_length(fname, args, max_fname_arg_count, compat_args): """ @@ -35,8 +37,6 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): checked that arg_val_dict.keys() is a subset of compat_args """ - from pandas.core.common import is_bool - for key in arg_val_dict: # try checking equality 
directly with '=' operator, # as comparison may have been overriden for the left From 20de2661c8eff66e465248cbe28062eae0e0e3bb Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 13 Jul 2016 10:38:09 -0400 Subject: [PATCH 07/50] BLD: included pandas.api.* in setup.py (#13640) --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 8f8865ecc3b7a..650357588570a 100755 --- a/setup.py +++ b/setup.py @@ -547,6 +547,9 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=['pandas', + 'pandas.api', + 'pandas.api.tests', + 'pandas.api.types', 'pandas.compat', 'pandas.compat.numpy', 'pandas.computation', From 4a9e66e8947c09117baf6edcd5e0d33658490fd6 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 21:27:01 +0200 Subject: [PATCH 08/50] Minor fix for linter --- pandas/indexes/multi.py | 2 +- pandas/tests/indexes/test_multi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index e43c993005161..a65ff80c1fafa 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -843,7 +843,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i-1]): + if len(arrays[i]) != len(arrays[i - 1]): raise ValueError('all arrays must be same length') cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 7904c2e3b4208..82c69d5a675e6 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -637,7 +637,7 @@ def test_from_arrays_different_lengths(self): idx1 = [1, 2, 3] idx2 = ['a', 'b'] assertRaisesRegexp(ValueError, '^all arrays must be same length$', - MultiIndex.from_arrays, [idx1, idx2]) + MultiIndex.from_arrays, [idx1, idx2]) def test_from_product(self): From 94b829d58804f03cda9594a804b1fc52ab25f555 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 21:40:20 +0200 Subject: [PATCH 09/50] Update whatsnew entry --- doc/source/whatsnew/v0.19.0.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f457b8d4bd1f6..023a13f13e9f2 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -550,3 +550,5 @@ Bug Fixes - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) + +-Bug in ``MultiIndex.from_arrays`` doesn't check for arrays lengths (:issue:`13599`) From 44f3229709d40241917267f4cfa7b28f9a92678b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Jul 2016 09:12:52 +0200 Subject: [PATCH 10/50] DOC/BLD: pin IPython version to 4.2.0 (#13639) (#13647) --- ci/requirements-2.7_DOC_BUILD.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index a07721c75cf34..cde0719aa027e 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,4 +1,4 @@ -ipython=4 +ipython=4.2.0 ipykernel sphinx nbconvert From 6f0a020e0929d53b2341f58f970806c85facef91 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: 
Thu, 14 Jul 2016 17:15:23 +0900 Subject: [PATCH 11/50] TST: reorganize tools.tests (#13619) --- pandas/tools/tests/test_concat.py | 432 +++++++++----- pandas/tools/tests/test_join.py | 787 ++++++++++++++++++++++++++ pandas/tools/tests/test_merge.py | 900 +----------------------------- 3 files changed, 1082 insertions(+), 1037 deletions(-) create mode 100644 pandas/tools/tests/test_join.py diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index a8c86657a48cc..568cf63c02e30 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -17,7 +17,7 @@ assert_almost_equal) -class TestConcatenate(tm.TestCase): +class ConcatenateBase(tm.TestCase): _multiprocess_can_split_ = True @@ -26,6 +26,9 @@ def setUp(self): self.mixed_frame = self.frame.copy() self.mixed_frame['foo'] = 'bar' + +class TestAppend(ConcatenateBase): + def test_append(self): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -142,42 +145,32 @@ def test_append_preserve_index_name(self): result = df1.append(df2) self.assertEqual(result.index.name, 'A') - def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] - - joined = df_list[0].join(df_list[1:]) - tm.assert_frame_equal(joined, df) - - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] - - def _check_diff_index(df_list, result, exp_index): - reindexed = [x.reindex(exp_index) for x in df_list] - expected = reindexed[0].join(reindexed[1:]) - tm.assert_frame_equal(result, expected) - - # different join types - joined = df_list[0].join(df_list[1:], how='outer') - _check_diff_index(df_list, joined, df.index) - - joined = df_list[0].join(df_list[1:]) - _check_diff_index(df_list, joined, df_list[0].index) - - joined = df_list[0].join(df_list[1:], how='inner') - _check_diff_index(df_list, joined, df.index[2:8]) - - self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') - - def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 'D']] - df3 = df.ix[:, ['key']] - - result = df1.join([df2, df3]) - assert_frame_equal(result, df) + def test_append_dtype_coerce(self): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + import datetime as dt + from pandas import NaT + + df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0)], + columns=['start_time']) + df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), + dt.datetime(2013, 1, 4, 7, 10)]], + columns=['start_time', 'end_time']) + + expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10)], + name='end_time'), + Series([dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0)], + name='start_time')], axis=1) + result = df1.append(df2, ignore_index=True) + assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self): df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) @@ -188,6 +181,9 @@ def test_append_missing_column_proper_upcast(self): self.assertEqual(appended['A'].dtype, 'f8') self.assertEqual(appended['B'].dtype, 'O') + +class TestConcatenate(ConcatenateBase): + def test_concat_copy(self): df = 
DataFrame(np.random.randn(4, 3)) @@ -524,35 +520,6 @@ def test_with_mixed_tuples(self): # it works concat([df1, df2]) - def test_join_dups(self): - - # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') - result.columns = expected.columns - assert_frame_equal(result, expected) - - # GH 4975, invalid join on dups - w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - - dta = x.merge(y, left_index=True, right_index=True).merge( - z, left_index=True, right_index=True, how="outer") - dta = dta.merge(w, left_index=True, right_index=True) - expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] - assert_frame_equal(dta, expected) - def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) @@ -649,86 +616,40 @@ def test_concat_mixed_objs(self): panel = tm.makePanel() self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) - def test_panel_join(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[:2, :10, :3] - p2 = panel.ix[2:, 5:, 2:] - - # left join - result = p1.join(p2) - expected = p1.copy() - expected['ItemC'] = p2['ItemC'] - tm.assert_panel_equal(result, expected) - - # right join - result = p1.join(p2, how='right') - expected = p2.copy() - expected['ItemA'] = p1['ItemA'] - expected['ItemB'] = p1['ItemB'] - expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) - tm.assert_panel_equal(result, expected) - - # inner join - result = p1.join(p2, how='inner') - expected = panel.ix[:, 5:10, 2:3] - tm.assert_panel_equal(result, expected) - - # outer join - result = p1.join(p2, how='outer') - expected = p1.reindex(major=panel.major_axis, - minor=panel.minor_axis) - expected = expected.join(p2.reindex(major=panel.major_axis, - minor=panel.minor_axis)) - tm.assert_panel_equal(result, expected) - - def test_panel_join_overlap(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] - p2 = panel.ix[['ItemB', 'ItemC']] - - # Expected index is - # - # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 - joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') - p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') - p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') - no_overlap = panel.ix[['ItemA']] - expected = no_overlap.join(p1_suf.join(p2_suf)) - tm.assert_panel_equal(joined, expected) - - def test_panel_join_many(self): - tm.K = 10 - panel = tm.makePanel() - tm.K = 4 + def test_empty_dtype_coerce(self): - panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + # xref to #12411 + # xref to #12045 + # xref to #11594 + # see below - joined = panels[0].join(panels[1:]) - tm.assert_panel_equal(joined, panel) + # 10571 + df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) + df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) + result = concat([df1, df2]) + expected = df1.dtypes + tm.assert_series_equal(result.dtypes, expected) - panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + def test_dtype_coerceion(self): - data_dict = {} - for p in panels: - data_dict.update(p.iteritems()) + # 12411 + df 
= DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) - joined = panels[0].join(panels[1:], how='inner') - expected = Panel.from_dict(data_dict, intersect=True) - tm.assert_panel_equal(joined, expected) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) - joined = panels[0].join(panels[1:], how='outer') - expected = Panel.from_dict(data_dict, intersect=False) - tm.assert_panel_equal(joined, expected) + # 12045 + import datetime + df = DataFrame({'date': [datetime.datetime(2012, 1, 1), + datetime.datetime(1012, 1, 2)]}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) - # edge cases - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='outer', lsuffix='foo', rsuffix='bar') - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='right') + # 11594 + df = DataFrame({'text': ['some words'] + [None] * 9}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) def test_panel_concat_other_axes(self): panel = tm.makePanel() @@ -1080,6 +1001,239 @@ def test_concat_invalid_first_argument(self): expected = read_csv(StringIO(data)) assert_frame_equal(result, expected) + def test_concat_NaT_series(self): + # GH 11693 + # test for merging NaT series with datetime series. + x = Series(date_range('20151124 08:00', '20151124 09:00', + freq='1h', tz='US/Eastern')) + y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT with tz + expected = Series(pd.NaT, index=range(4), + dtype='datetime64[ns, US/Eastern]') + result = pd.concat([y, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # without tz + x = pd.Series(pd.date_range('20151124 08:00', + '20151124 09:00', freq='1h')) + y = pd.Series(pd.date_range('20151124 10:00', + '20151124 11:00', freq='1h')) + y[:] = pd.NaT + expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT without tz + x[:] = pd.NaT + expected = pd.Series(pd.NaT, index=range(4), + dtype='datetime64[ns]') + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_tz_frame(self): + df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), + B=pd.Timestamp('20130603', tz='CET')), + index=range(5)) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + assert_frame_equal(df2, df3) + + def test_concat_tz_series(self): + # GH 11755 + # tz and no tz + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(date_range('2012-01-01', '2012-01-02')) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # GH 11887 + # concat tz and object + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(['a', 'b']) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # 12217 + # 12306 fixed I think + + # Concat'ing two UTC times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('UTC') + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + 
second[0] = second[0].dt.tz_localize('UTC') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, UTC]') + + # Concat'ing two London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + # Concat'ing 2+1 London times + first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + # Concat'ing 1+2 London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + def test_concat_tz_series_with_datetimelike(self): + # GH 12620 + # tz and timedelta + x = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-02-01', tz='US/Eastern')] + y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + + # tz and period + y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + + def test_concat_tz_series_tzlocal(self): + # GH 13583 + tm._skip_if_no_dateutil() + import dateutil + x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] + y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y)) + self.assertEqual(result.dtype, 'datetime64[ns, tzlocal()]') + + def test_concat_period_series(self): + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + # different freq + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + # non-period + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = 
Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(['A', 'B']) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + def test_concat_empty_series(self): + # GH 11082 + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(res, exp) + + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = pd.Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with no name + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name=None) + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=['x', 0]) + tm.assert_frame_equal(res, exp) + + def test_default_index(self): + # is_series and ignore_index + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series([4, 5, 6], name='y') + res = pd.concat([s1, s2], axis=1, ignore_index=True) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + # use check_index_type=True to check the result have + # RangeIndex (default index) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_series and all inputs have no names + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + res = pd.concat([s1, s2], axis=1, ignore_index=False) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + exp.columns = pd.RangeIndex(2) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_dataframe and ignore_index + df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) + df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) + + res = pd.concat([df1, df2], axis=0, ignore_index=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], + columns=['A', 'B']) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + res = pd.concat([df1, df2], axis=1, ignore_index=True) + exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py new file mode 100644 index 0000000000000..86aee0b4a01c9 --- /dev/null +++ b/pandas/tools/tests/test_join.py @@ -0,0 +1,787 @@ +# pylint: disable=E1103 + +import nose + +from numpy.random import randn +import numpy as np + +import pandas as pd +from pandas.compat import lrange +import pandas.compat as compat +from pandas.tools.merge import merge, concat +from pandas.util.testing import assert_frame_equal +from pandas import DataFrame, MultiIndex, Series + +import pandas.algos as algos +import pandas.util.testing as tm +from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS + + +a_ = np.array + + +class TestJoin(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + # aggregate multiple columns + self.df = 
DataFrame({'key1': get_test_data(), + 'key2': get_test_data(), + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + + # exclude a couple keys for fun + self.df = self.df[self.df['key2'] > 1] + + self.df2 = DataFrame({'key1': get_test_data(n=N // 5), + 'key2': get_test_data(ngroups=NGROUPS // 2, + n=N // 5), + 'value': np.random.randn(N // 5)}) + + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, + index=data['C']) + + def test_cython_left_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = algos.left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5, -1, -1]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_right_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = algos.left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + # 0 1 1 1 + exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, + # 2 2 4 + 6, 7, 8, 6, 7, 8, -1]) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_inner_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = algos.inner_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_left_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + + joined_both = merge(self.df, self.df2) + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='left') + + def test_right_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='right') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + + joined_both = merge(self.df, self.df2, how='right') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='right') + + def test_full_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='outer') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + + joined_both = merge(self.df, self.df2, 
how='outer') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='outer') + + def test_inner_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='inner') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + + joined_both = merge(self.df, self.df2, how='inner') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='inner') + + def test_handle_overlap(self): + joined = merge(self.df, self.df2, on='key2', + suffixes=['.foo', '.bar']) + + self.assertIn('key1.foo', joined) + self.assertIn('key1.bar', joined) + + def test_handle_overlap_arbitrary_key(self): + joined = merge(self.df, self.df2, + left_on='key2', right_on='key1', + suffixes=['.foo', '.bar']) + self.assertIn('key1.foo', joined) + self.assertIn('key2.bar', joined) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + self.assert_series_equal(merged['MergedA'], target['A'], + check_names=False) + self.assert_series_equal(merged['MergedD'], target['D'], + check_names=False) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], + 'value': [0, 0, 1, 1, 2]}) + assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assertTrue(np.isnan(joined['two']['c'])) + self.assertTrue(np.isnan(joined['three']['c'])) + + # merge column not p resent + self.assertRaises(KeyError, target.join, source, on='E') + + # overlap + source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(ValueError, target.join, source_copy, on='A') + + def test_join_on_fails_with_different_right_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, left_on='a', right_index=True) + + def test_join_on_fails_with_different_left_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}, + index=tm.makeCustomIndex(10, 2)) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}) + merge(df, df2, right_on='b', left_index=True) + + def test_join_on_fails_with_different_column_counts(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, right_on='a', left_on=['a', 'b']) + + def test_join_on_fails_with_wrong_object_type(self): + # GH12081 + wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] + df = DataFrame({'a': [1, 1]}) + + for obj in wrongly_typed: + with tm.assertRaisesRegexp(ValueError, str(type(obj))): + merge(obj, df, left_on='a', right_on='a') + with tm.assertRaisesRegexp(ValueError, str(type(obj))): + merge(df, obj, left_on='a', 
right_on='a') + + def test_join_on_pass_vector(self): + expected = self.target.join(self.source, on='C') + del expected['C'] + + join_col = self.target.pop('C') + result = self.target.join(self.source, on=join_col) + assert_frame_equal(result, expected) + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assertIn(col, merged) + self.assertTrue(merged[col].isnull().all()) + + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assert_index_equal(merged2.columns, merged.columns) + self.assertEqual(len(merged2), 0) + + def test_join_on_inner(self): + df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected = expected[expected['value'].notnull()] + self.assert_series_equal(joined['key'], expected['key'], + check_dtype=False) + self.assert_series_equal(joined['value'], expected['value'], + check_dtype=False) + self.assert_index_equal(joined.index, expected.index) + + def test_join_on_singlekey_list(self): + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + + def test_join_on_series_buglet(self): + # GH #638 + df = DataFrame({'a': [1, 1]}) + ds = Series([2], index=[1], name='b') + result = df.join(ds, on='a') + expected = DataFrame({'a': [1, 1], + 'b': [2, 2]}, index=df.index) + tm.assert_frame_equal(result, expected) + + def test_join_index_mixed(self): + df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assertEqual(df1['B'].dtype, np.int64) + self.assertEqual(df1['D'].dtype, np.bool_) + + df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. + + for kind in ['inner', 'outer', 'left', 'right']: + + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + + def test_join_unconsolidated(self): + # GH #331 + a = DataFrame(randn(30, 2), columns=['a', 'b']) + c = Series(randn(30)) + a['c'] = c + d = DataFrame(randn(30, 1), columns=['q']) + + # it works! 
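+        # (smoke test for the unconsolidated-frame case noted in GH #331
+        # above; we only check that the joins below do not raise)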
+ a.join(d) + d.join(a) + + def test_join_multiindex(self): + index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + df1 = DataFrame(data=np.random.randn(6), index=index1, + columns=['var X']) + df2 = DataFrame(data=np.random.randn(6), index=index2, + columns=['var Y']) + + df1 = df1.sortlevel(0) + df2 = df2.sortlevel(0) + + joined = df1.join(df2, how='outer') + ex_index = index1._tuple_index.union(index2._tuple_index) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + df1 = df1.sortlevel(1) + df2 = df2.sortlevel(1) + + joined = df1.join(df2, how='outer').sortlevel(0) + ex_index = index1._tuple_index.union(index2._tuple_index) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + def test_join_inner_multiindex(self): + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + joined = data.join(to_join, on=['key1', 'key2'], how='inner') + expected = merge(data, to_join.reset_index(), + left_on=['key1', 'key2'], + right_on=['first', 'second'], how='inner', + sort=False) + + expected2 = merge(to_join, data, + right_on=['key1', 'key2'], left_index=True, + how='inner', sort=False) + assert_frame_equal(joined, expected2.reindex_like(joined)) + + expected2 = merge(to_join, data, right_on=['key1', 'key2'], + left_index=True, how='inner', sort=False) + + expected = expected.drop(['first', 'second'], axis=1) + expected.index = joined.index + + self.assertTrue(joined.index.is_monotonic) + assert_frame_equal(joined, expected) + + # _assert_same_contents(expected, expected2.ix[:, expected.columns]) + + def test_join_hierarchical_mixed(self): + # GH 2024 + df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) + new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) + other_df = DataFrame( + [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) + other_df.set_index('a', inplace=True) + # GH 9455, 12219 + with tm.assert_produces_warning(UserWarning): + result = merge(new_df, other_df, left_index=True, right_index=True) + self.assertTrue(('b', 'mean') in result) + self.assertTrue('b' in result) + + def test_join_float64_float32(self): + + a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) + b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) + joined = a.join(b) + self.assertEqual(joined.dtypes['a'], 'float64') + self.assertEqual(joined.dtypes['b'], 'float64') + self.assertEqual(joined.dtypes['c'], 'float32') + + a = np.random.randint(0, 5, 100).astype('int64') + b = np.random.random(100).astype('float64') + c = np.random.random(100).astype('float32') + df = DataFrame({'a': a, 'b': b, 'c': c}) + xpdf = 
DataFrame({'a': a, 'b': b, 'c': c}) + s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) + rs = df.merge(s, left_on='a', right_index=True) + self.assertEqual(rs.dtypes['a'], 'int64') + self.assertEqual(rs.dtypes['b'], 'float64') + self.assertEqual(rs.dtypes['c'], 'float32') + self.assertEqual(rs.dtypes['md'], 'float32') + + xp = xpdf.merge(s, left_on='a', right_index=True) + assert_frame_equal(rs, xp) + + def test_join_many_non_unique_index(self): + df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) + df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) + df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + + result = idf1.join([idf2, idf3], how='outer') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + + result = result.reset_index() + expected = expected[result.columns] + expected['a'] = expected.a.astype('int64') + expected['b'] = expected.b.astype('int64') + assert_frame_equal(result, expected) + + df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) + df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) + df3 = DataFrame( + {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + result = idf1.join([idf2, idf3], how='inner') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + + result = result.reset_index() + + assert_frame_equal(result, expected.ix[:, result.columns]) + + # GH 11519 + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + s = Series(np.repeat(np.arange(8), 2), + index=np.repeat(np.arange(8), 2), name='TEST') + inner = df.join(s, how='inner') + outer = df.join(s, how='outer') + left = df.join(s, how='left') + right = df.join(s, how='right') + assert_frame_equal(inner, outer) + assert_frame_equal(inner, left) + assert_frame_equal(inner, right) + + def test_join_sort(self): + left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], + 'value': [1, 2, 3, 4]}) + right = DataFrame({'value2': ['a', 'b', 'c']}, + index=['bar', 'baz', 'foo']) + + joined = left.join(right, on='key', sort=True) + expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], + 'value': [2, 3, 1, 4], + 'value2': ['a', 'b', 'c', 'c']}, + index=[1, 2, 0, 3]) + assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on='key', sort=False) + self.assert_index_equal(joined.index, pd.Index(lrange(4))) + + def test_mixed_type_join_with_suffix(self): + # GH #916 + df = DataFrame(np.random.randn(20, 6), + columns=['a', 'b', 'c', 'd', 'e', 'f']) + df.insert(0, 'id', 0) + df.insert(5, 'dt', 'foo') + + grouped = df.groupby('id') + mn = grouped.mean() + cn = grouped.count() + + # it works! 
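+        # (smoke test for GH #916 above: the mean and count frames share
+        # column names, so the join relies on rsuffix to avoid a clash)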
+ mn.join(cn, rsuffix='_right') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_join_dups(self): + + # joining dups + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + expected = concat([df, df], axis=1) + result = df.join(df, rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge( + z, left_index=True, right_index=True, how="outer") + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x, y, z, w], axis=1) + expected.columns = ['x_x', 'y_x', 'x_y', + 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + assert_frame_equal(dta, expected) + + def test_panel_join(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[:2, :10, :3] + p2 = panel.ix[2:, 5:, 2:] + + # left join + result = p1.join(p2) + expected = p1.copy() + expected['ItemC'] = p2['ItemC'] + tm.assert_panel_equal(result, expected) + + # right join + result = p1.join(p2, how='right') + expected = p2.copy() + expected['ItemA'] = p1['ItemA'] + expected['ItemB'] = p1['ItemB'] + expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) + tm.assert_panel_equal(result, expected) + + # inner join + result = p1.join(p2, how='inner') + expected = panel.ix[:, 5:10, 2:3] + tm.assert_panel_equal(result, expected) + + # outer join + result = p1.join(p2, how='outer') + expected = p1.reindex(major=panel.major_axis, + minor=panel.minor_axis) + expected = expected.join(p2.reindex(major=panel.major_axis, + minor=panel.minor_axis)) + tm.assert_panel_equal(result, expected) + + def test_panel_join_overlap(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.ix[['ItemB', 'ItemC']] + + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 + joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') + p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap 
= panel.ix[['ItemA']] + expected = no_overlap.join(p1_suf.join(p2_suf)) + tm.assert_panel_equal(joined, expected) + + def test_panel_join_many(self): + tm.K = 10 + panel = tm.makePanel() + tm.K = 4 + + panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + + joined = panels[0].join(panels[1:]) + tm.assert_panel_equal(joined, panel) + + panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + + data_dict = {} + for p in panels: + data_dict.update(p.iteritems()) + + joined = panels[0].join(panels[1:], how='inner') + expected = pd.Panel.from_dict(data_dict, intersect=True) + tm.assert_panel_equal(joined, expected) + + joined = panels[0].join(panels[1:], how='outer') + expected = pd.Panel.from_dict(data_dict, intersect=False) + tm.assert_panel_equal(joined, expected) + + # edge cases + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='outer', lsuffix='foo', rsuffix='bar') + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='right') + + +def _check_join(left, right, result, join_col, how='left', + lsuffix='_x', rsuffix='_y'): + + # some smoke tests + for c in join_col: + assert(result[c].notnull().all()) + + left_grouped = left.groupby(join_col) + right_grouped = right.groupby(join_col) + + for group_key, group in result.groupby(join_col): + l_joined = _restrict_to_columns(group, left.columns, lsuffix) + r_joined = _restrict_to_columns(group, right.columns, rsuffix) + + try: + lgroup = left_grouped.get_group(group_key) + except KeyError: + if how in ('left', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(l_joined, left.columns, join_col) + else: + _assert_same_contents(l_joined, lgroup) + + try: + rgroup = right_grouped.get_group(group_key) + except KeyError: + if how in ('right', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(r_joined, right.columns, join_col) + else: + _assert_same_contents(r_joined, rgroup) + + +def _restrict_to_columns(group, columns, suffix): + found = [c for c in group.columns + if c in columns or c.replace(suffix, '') in columns] + + # filter + group = group.ix[:, found] + + # get rid of suffixes, if any + group = group.rename(columns=lambda x: x.replace(suffix, '')) + + # put in the right order... + group = group.ix[:, columns] + + return group + + +def _assert_same_contents(join_chunk, source): + NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
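+    # (NaN entries would keep otherwise-identical rows from comparing
+    # equal, so both frames are filled with the sentinel before the
+    # duplicate rows are dropped and the row sets compared)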
+ + jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values + svalues = source.fillna(NA_SENTINEL).drop_duplicates().values + + rows = set(tuple(row) for row in jvalues) + assert(len(rows) == len(source)) + assert(all(tuple(row) in rows for row in svalues)) + + +def _assert_all_na(join_chunk, source_columns, join_col): + for c in source_columns: + if c in join_col: + continue + assert(join_chunk[c].isnull().all()) + + +def _join_by_hand(a, b, how='left'): + join_index = a.index.join(b.index, how=how) + + a_re = a.reindex(join_index) + b_re = b.reindex(join_index) + + result_columns = a.columns.append(b.columns) + + for col, s in compat.iteritems(b_re): + a_re[col] = s + return a_re.reindex(columns=result_columns) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6c448de741e0c..396b095fabbd6 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -9,23 +9,17 @@ import random import pandas as pd -from pandas.compat import range, lrange, lzip +from pandas.compat import lrange, lzip from pandas.tools.merge import merge, concat, MergeError from pandas.util.testing import (assert_frame_equal, assert_series_equal, slow) -from pandas import (DataFrame, Index, MultiIndex, - Series, date_range, Categorical, - compat) -import pandas.algos as algos +from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm -a_ = np.array - N = 50 NGROUPS = 8 -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] def get_test_data(ngroups=NGROUPS, n=N): @@ -58,496 +52,16 @@ def setUp(self): n=N // 5), 'value': np.random.randn(N // 5)}) - index, data = tm.getMixedTypeDict() - self.target = DataFrame(data, index=index) - - # Join on string value - self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, - index=data['C']) - self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) self.right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) - def test_cython_left_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - ls, rs = algos.left_outer_join(left, right, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5, -1, -1]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_right_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - rs, ls = algos.left_outer_join(right, left, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - # 0 1 1 1 - exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, - # 2 2 4 - 6, 7, 8, 6, 7, 8, -1]) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, - 4, 4, 4, 5, 5, 5, 6]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls, 
check_dtype=False) - self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_inner_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) - max_group = 5 - - ls, rs = algos.inner_join(left, right, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_left_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') - - joined_both = merge(self.df, self.df2) - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='left') - - def test_right_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='right') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') - - joined_both = merge(self.df, self.df2, how='right') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='right') - - def test_full_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='outer') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') - - joined_both = merge(self.df, self.df2, how='outer') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='outer') - - def test_inner_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='inner') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') - - joined_both = merge(self.df, self.df2, how='inner') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='inner') - - def test_handle_overlap(self): - joined = merge(self.df, self.df2, on='key2', - suffixes=['.foo', '.bar']) - - self.assertIn('key1.foo', joined) - self.assertIn('key1.bar', joined) - - def test_handle_overlap_arbitrary_key(self): - joined = merge(self.df, self.df2, - left_on='key2', right_on='key1', - suffixes=['.foo', '.bar']) - self.assertIn('key1.foo', joined) - self.assertIn('key2.bar', joined) - def test_merge_common(self): joined = merge(self.df, self.df2) exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) - def test_join_on(self): - target = self.target - source = self.source - - merged = target.join(source, on='C') - self.assert_series_equal(merged['MergedA'], target['A'], - check_names=False) - self.assert_series_equal(merged['MergedD'], target['D'], - check_names=False) - - # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], - 'value': [0, 0, 1, 1, 2]}) - assert_frame_equal(joined, expected) - - # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - self.assertTrue(np.isnan(joined['two']['c'])) - 
self.assertTrue(np.isnan(joined['three']['c'])) - - # merge column not p resent - self.assertRaises(KeyError, target.join, source, on='E') - - # overlap - source_copy = source.copy() - source_copy['A'] = 0 - self.assertRaises(ValueError, target.join, source_copy, on='A') - - def test_join_on_fails_with_different_right_index(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - merge(df, df2, left_on='a', right_index=True) - - def test_join_on_fails_with_different_left_index(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}, - index=tm.makeCustomIndex(10, 2)) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}) - merge(df, df2, right_on='b', left_index=True) - - def test_join_on_fails_with_different_column_counts(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - merge(df, df2, right_on='a', left_on=['a', 'b']) - - def test_join_on_fails_with_wrong_object_type(self): - # GH12081 - wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] - df = DataFrame({'a': [1, 1]}) - - for obj in wrongly_typed: - with tm.assertRaisesRegexp(ValueError, str(type(obj))): - merge(obj, df, left_on='a', right_on='a') - with tm.assertRaisesRegexp(ValueError, str(type(obj))): - merge(df, obj, left_on='a', right_on='a') - - def test_join_on_pass_vector(self): - expected = self.target.join(self.source, on='C') - del expected['C'] - - join_col = self.target.pop('C') - result = self.target.join(self.source, on=join_col) - assert_frame_equal(result, expected) - - def test_join_with_len0(self): - # nothing to merge - merged = self.target.join(self.source.reindex([]), on='C') - for col in self.source: - self.assertIn(col, merged) - self.assertTrue(merged[col].isnull().all()) - - merged2 = self.target.join(self.source.reindex([]), on='C', - how='inner') - self.assert_index_equal(merged2.columns, merged.columns) - self.assertEqual(len(merged2), 0) - - def test_join_on_inner(self): - df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) - - joined = df.join(df2, on='key', how='inner') - - expected = df.join(df2, on='key') - expected = expected[expected['value'].notnull()] - self.assert_series_equal(joined['key'], expected['key'], - check_dtype=False) - self.assert_series_equal(joined['value'], expected['value'], - check_dtype=False) - self.assert_index_equal(joined.index, expected.index) - - def test_join_on_singlekey_list(self): - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - - # corner cases - joined = df.join(df2, on=['key']) - expected = df.join(df2, on='key') - - assert_frame_equal(joined, expected) - - def test_join_on_series(self): - result = self.target.join(self.source['MergedA'], on='C') - expected = self.target.join(self.source[['MergedA']], on='C') - assert_frame_equal(result, expected) - - def test_join_on_series_buglet(self): - # GH #638 - df = DataFrame({'a': [1, 1]}) - ds = Series([2], index=[1], name='b') - result = df.join(ds, on='a') - 
expected = DataFrame({'a': [1, 1], - 'b': [2, 2]}, index=df.index) - tm.assert_frame_equal(result, expected) - - def test_join_index_mixed(self): - df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - self.assertEqual(df1['B'].dtype, np.int64) - self.assertEqual(df1['D'].dtype, np.bool_) - - df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) - - # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] - df1.columns = expected_columns[:4] - df2.columns = expected_columns[4:] - expected = _join_by_hand(df1, df2) - assert_frame_equal(joined, expected) - - # no overlapping blocks - df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' - - df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. - - for kind in JOIN_TYPES: - - joined = df1.join(df2, how=kind) - expected = _join_by_hand(df1, df2, how=kind) - assert_frame_equal(joined, expected) - - joined = df2.join(df1, how=kind) - expected = _join_by_hand(df2, df1, how=kind) - assert_frame_equal(joined, expected) - - def test_join_empty_bug(self): - # generated an exception in 0.4.3 - x = DataFrame() - x.join(DataFrame([3], index=[0], columns=['A']), how='outer') - - def test_join_unconsolidated(self): - # GH #331 - a = DataFrame(randn(30, 2), columns=['a', 'b']) - c = Series(randn(30)) - a['c'] = c - d = DataFrame(randn(30, 1), columns=['q']) - - # it works! - a.join(d) - d.join(a) - - def test_join_multiindex(self): - index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) - - index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) - - df1 = DataFrame(data=np.random.randn(6), index=index1, - columns=['var X']) - df2 = DataFrame(data=np.random.randn(6), index=index2, - columns=['var Y']) - - df1 = df1.sortlevel(0) - df2 = df2.sortlevel(0) - - joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index.union(index2._tuple_index) - expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) - expected.index.names = index1.names - assert_frame_equal(joined, expected) - self.assertEqual(joined.index.names, index1.names) - - df1 = df1.sortlevel(1) - df2 = df2.sortlevel(1) - - joined = df1.join(df2, how='outer').sortlevel(0) - ex_index = index1._tuple_index.union(index2._tuple_index) - expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) - expected.index.names = index1.names - - assert_frame_equal(joined, expected) - self.assertEqual(joined.index.names, index1.names) - - def test_join_inner_multiindex(self): - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] - - data = np.random.randn(len(key1)) - data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - joined = data.join(to_join, on=['key1', 'key2'], how='inner') - expected = merge(data, to_join.reset_index(), - left_on=['key1', 'key2'], - right_on=['first', 
'second'], how='inner', - sort=False) - - expected2 = merge(to_join, data, - right_on=['key1', 'key2'], left_index=True, - how='inner', sort=False) - assert_frame_equal(joined, expected2.reindex_like(joined)) - - expected2 = merge(to_join, data, right_on=['key1', 'key2'], - left_index=True, how='inner', sort=False) - - expected = expected.drop(['first', 'second'], axis=1) - expected.index = joined.index - - self.assertTrue(joined.index.is_monotonic) - assert_frame_equal(joined, expected) - - # _assert_same_contents(expected, expected2.ix[:, expected.columns]) - - def test_join_hierarchical_mixed(self): - # GH 2024 - df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) - new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) - other_df = DataFrame( - [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) - other_df.set_index('a', inplace=True) - # GH 9455, 12219 - with tm.assert_produces_warning(UserWarning): - result = merge(new_df, other_df, left_index=True, right_index=True) - self.assertTrue(('b', 'mean') in result) - self.assertTrue('b' in result) - - def test_join_float64_float32(self): - - a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) - b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) - joined = a.join(b) - self.assertEqual(joined.dtypes['a'], 'float64') - self.assertEqual(joined.dtypes['b'], 'float64') - self.assertEqual(joined.dtypes['c'], 'float32') - - a = np.random.randint(0, 5, 100).astype('int64') - b = np.random.random(100).astype('float64') - c = np.random.random(100).astype('float32') - df = DataFrame({'a': a, 'b': b, 'c': c}) - xpdf = DataFrame({'a': a, 'b': b, 'c': c}) - s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) - rs = df.merge(s, left_on='a', right_index=True) - self.assertEqual(rs.dtypes['a'], 'int64') - self.assertEqual(rs.dtypes['b'], 'float64') - self.assertEqual(rs.dtypes['c'], 'float32') - self.assertEqual(rs.dtypes['md'], 'float32') - - xp = xpdf.merge(s, left_on='a', right_index=True) - assert_frame_equal(rs, xp) - - def test_join_many_non_unique_index(self): - df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) - df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) - df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) - idf1 = df1.set_index(["a", "b"]) - idf2 = df2.set_index(["a", "b"]) - idf3 = df3.set_index(["a", "b"]) - - result = idf1.join([idf2, idf3], how='outer') - - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') - - result = result.reset_index() - expected = expected[result.columns] - expected['a'] = expected.a.astype('int64') - expected['b'] = expected.b.astype('int64') - assert_frame_equal(result, expected) - - df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) - df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) - df3 = DataFrame( - {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) - idf1 = df1.set_index(["a", "b"]) - idf2 = df2.set_index(["a", "b"]) - idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='inner') - - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') - - result = result.reset_index() - - assert_frame_equal(result, expected.ix[:, result.columns]) - - # GH 11519 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 
'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - s = Series(np.repeat(np.arange(8), 2), - index=np.repeat(np.arange(8), 2), name='TEST') - inner = df.join(s, how='inner') - outer = df.join(s, how='outer') - left = df.join(s, how='left') - right = df.join(s, how='right') - assert_frame_equal(inner, outer) - assert_frame_equal(inner, left) - assert_frame_equal(inner, right) - def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) @@ -651,23 +165,6 @@ def test_merge_nocopy(self): merged['d'] = 'peekaboo' self.assertTrue((right['d'] == 'peekaboo').all()) - def test_join_sort(self): - left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'value2': ['a', 'b', 'c']}, - index=['bar', 'baz', 'foo']) - - joined = left.join(right, on='key', sort=True) - expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], - 'value': [2, 3, 1, 4], - 'value2': ['a', 'b', 'c', 'c']}, - index=[1, 2, 0, 3]) - assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on='key', sort=False) - self.assert_index_equal(joined.index, pd.Index(lrange(4))) - def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -737,20 +234,6 @@ def test_handle_join_key_pass_array(self): merged = merge(left, right, left_index=True, right_on=key, how='outer') self.assert_series_equal(merged['key_0'], Series(key, name='key_0')) - def test_mixed_type_join_with_suffix(self): - # GH #916 - df = DataFrame(np.random.randn(20, 6), - columns=['a', 'b', 'c', 'd', 'e', 'f']) - df.insert(0, 'id', 0) - df.insert(5, 'dt', 'foo') - - grouped = df.groupby('id') - mn = grouped.mean() - cn = grouped.count() - - # it works! 
- mn.join(cn, rsuffix='_right') - def test_no_overlap_more_informative_error(self): dt = datetime.now() df1 = DataFrame({'x': ['a']}, index=[dt]) @@ -963,68 +446,6 @@ def _constructor(self): tm.assertIsInstance(result, NotADataFrame) - def test_empty_dtype_coerce(self): - - # xref to #12411 - # xref to #12045 - # xref to #11594 - # see below - - # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) - df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) - result = concat([df1, df2]) - expected = df1.dtypes - assert_series_equal(result.dtypes, expected) - - def test_dtype_coerceion(self): - - # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) - - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - # 12045 - import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - def test_append_dtype_coerce(self): - - # GH 4993 - # appending with datetime will incorrectly convert datetime64 - import datetime as dt - from pandas import NaT - - df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0)], - columns=['start_time']) - df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), - dt.datetime(2013, 1, 4, 7, 10)]], - columns=['start_time', 'end_time']) - - expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10)], - name='end_time'), - Series([dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) - assert_frame_equal(result, expected) - def test_join_append_timedeltas(self): import datetime as dt @@ -1140,239 +561,6 @@ def test_merge_on_periods(self): self.assertEqual(result['value_x'].dtype, 'object') self.assertEqual(result['value_y'].dtype, 'object') - def test_concat_NaT_series(self): - # GH 11693 - # test for merging NaT series with datetime series. 
- x = Series(date_range('20151124 08:00', '20151124 09:00', - freq='1h', tz='US/Eastern')) - y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') - expected = Series([x[0], x[1], pd.NaT, pd.NaT]) - - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT with tz - expected = Series(pd.NaT, index=range(4), - dtype='datetime64[ns, US/Eastern]') - result = pd.concat([y, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # without tz - x = pd.Series(pd.date_range('20151124 08:00', - '20151124 09:00', freq='1h')) - y = pd.Series(pd.date_range('20151124 10:00', - '20151124 11:00', freq='1h')) - y[:] = pd.NaT - expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT without tz - x[:] = pd.NaT - expected = pd.Series(pd.NaT, index=range(4), - dtype='datetime64[ns]') - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - def test_concat_tz_frame(self): - df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), - B=pd.Timestamp('20130603', tz='CET')), - index=range(5)) - - # concat - df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) - assert_frame_equal(df2, df3) - - def test_concat_tz_series(self): - # GH 11755 - # tz and no tz - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(date_range('2012-01-01', '2012-01-02')) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # GH 11887 - # concat tz and object - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(['a', 'b']) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # 12217 - # 12306 fixed I think - - # Concat'ing two UTC times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('UTC') - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('UTC') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, UTC]') - - # Concat'ing two London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - # Concat'ing 2+1 London times - first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - # Concat'ing 1+2 London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - def test_concat_tz_series_with_datetimelike(self): - # GH 12620 - # tz and timedelta - x = 
[pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-02-01', tz='US/Eastern')] - y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) - - # tz and period - y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) - - def test_concat_tz_series_tzlocal(self): - # GH 13583 - tm._skip_if_no_dateutil() - import dateutil - x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] - y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y)) - self.assertEqual(result.dtype, 'datetime64[ns, tzlocal()]') - - def test_concat_period_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - # different freq - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - # non-period - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(['A', 'B']) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - def test_concat_empty_series(self): - # GH 11082 - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(res, exp) - - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') - res = pd.concat([s1, s2], axis=0) - # name will be reset - exp = pd.Series([1, 2, 3]) - tm.assert_series_equal(res, exp) - - # empty Series with no name - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name=None) - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0]) - tm.assert_frame_equal(res, exp) - - def test_default_index(self): - # is_series and ignore_index - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series([4, 5, 6], name='y') 
- res = pd.concat([s1, s2], axis=1, ignore_index=True) - self.assertIsInstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - # use check_index_type=True to check the result have - # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - # is_series and all inputs have no names - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - res = pd.concat([s1, s2], axis=1, ignore_index=False) - self.assertIsInstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - # is_dataframe and ignore_index - df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) - df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) - - res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], - columns=['A', 'B']) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - res = pd.concat([df1, df2], axis=1, ignore_index=True) - exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - def test_indicator(self): # PR #10054. xref #7412 and closes #8790. df1 = DataFrame({'col1': [0, 1], 'col_left': [ @@ -2134,90 +1322,6 @@ def f(): self.assertRaises(NotImplementedError, f) -def _check_join(left, right, result, join_col, how='left', - lsuffix='_x', rsuffix='_y'): - - # some smoke tests - for c in join_col: - assert(result[c].notnull().all()) - - left_grouped = left.groupby(join_col) - right_grouped = right.groupby(join_col) - - for group_key, group in result.groupby(join_col): - l_joined = _restrict_to_columns(group, left.columns, lsuffix) - r_joined = _restrict_to_columns(group, right.columns, rsuffix) - - try: - lgroup = left_grouped.get_group(group_key) - except KeyError: - if how in ('left', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) - - _assert_all_na(l_joined, left.columns, join_col) - else: - _assert_same_contents(l_joined, lgroup) - - try: - rgroup = right_grouped.get_group(group_key) - except KeyError: - if how in ('right', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) - - _assert_all_na(r_joined, right.columns, join_col) - else: - _assert_same_contents(r_joined, rgroup) - - -def _restrict_to_columns(group, columns, suffix): - found = [c for c in group.columns - if c in columns or c.replace(suffix, '') in columns] - - # filter - group = group.ix[:, found] - - # get rid of suffixes, if any - group = group.rename(columns=lambda x: x.replace(suffix, '')) - - # put in the right order... - group = group.ix[:, columns] - - return group - - -def _assert_same_contents(join_chunk, source): - NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
- - jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values - svalues = source.fillna(NA_SENTINEL).drop_duplicates().values - - rows = set(tuple(row) for row in jvalues) - assert(len(rows) == len(source)) - assert(all(tuple(row) in rows for row in svalues)) - - -def _assert_all_na(join_chunk, source_columns, join_col): - for c in source_columns: - if c in join_col: - continue - assert(join_chunk[c].isnull().all()) - - -def _join_by_hand(a, b, how='left'): - join_index = a.index.join(b.index, how=how) - - a_re = a.reindex(join_index) - b_re = b.reindex(join_index) - - result_columns = a.columns.append(b.columns) - - for col, s in compat.iteritems(b_re): - a_re[col] = s - return a_re.reindex(columns=result_columns) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From a711b4251c765c0c4b9d1c8deb985162dfaf09ae Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 14 Jul 2016 04:44:18 -0400 Subject: [PATCH 12/50] BF(TST): allow AttributeError being raised (in addition to TypeError) from mpl (#13641) Closes #13570 --- pandas/tests/test_graphics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 3a5b0117948b7..5493eb37c358b 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -1330,7 +1330,8 @@ def test_plot(self): self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({'x': [1, 2], 'y': [3, 4]}) - with tm.assertRaises(TypeError): + # mpl >= 1.5.2 (or slightly below) throw AttributError + with tm.assertRaises((TypeError, AttributeError)): df.plot.line(blarg=True) df = DataFrame(np.random.rand(10, 3), From 084ceaee135627680f4dd00115c3d6c7d930a22d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 14 Jul 2016 06:20:50 -0400 Subject: [PATCH 13/50] API, DEPR: Raise and Deprecate Reshape for Pandas Objects Author: gfyoung Closes #13012 from gfyoung/categorical-reshape-validate and squashes the following commits: 3ad161d [gfyoung] API: Prevent invalid arguments to Categorical.reshape --- doc/source/whatsnew/v0.19.0.txt | 3 ++ pandas/core/categorical.py | 23 +++++++-- pandas/core/internals.py | 26 +++++++++- pandas/core/series.py | 14 ++++-- pandas/indexes/base.py | 10 ++++ pandas/io/packers.py | 7 +-- pandas/tests/indexes/test_base.py | 6 +++ pandas/tests/series/test_analytics.py | 68 ++++++++++++++++----------- pandas/tests/test_categorical.py | 37 +++++++++++++-- 9 files changed, 151 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index bef02a06135de..688f3b7ff6ada 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -256,6 +256,7 @@ API changes ~~~~~~~~~~~ +- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue: `12882`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. 
(:issue:`12388`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) @@ -449,6 +450,8 @@ Furthermore: Deprecations ^^^^^^^^^^^^ +- ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 79d8bfbf57f12..1d1a9f990e61a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -383,11 +383,28 @@ def itemsize(self): def reshape(self, new_shape, *args, **kwargs): """ - An ndarray-compatible method that returns - `self` because categorical instances cannot - actually be reshaped. + DEPRECATED: calling this method will raise an error in a + future release. + + An ndarray-compatible method that returns `self` because + `Categorical` instances cannot actually be reshaped. + + Parameters + ---------- + new_shape : int or tuple of ints + A 1-D array of integers that correspond to the new + shape of the `Categorical`. For more information on + the parameter, please refer to `np.reshape`. """ + warn("reshape is deprecated and will raise " + "in a subsequent release", FutureWarning, stacklevel=2) + nv.validate_reshape(args, kwargs) + + # while the 'new_shape' parameter has no effect, + # we should still enforce valid shape parameters + np.reshape(self.codes, new_shape) + return self @property diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 363ac8249eb06..ff12cfddbe9cd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1839,7 +1839,7 @@ def convert(self, *args, **kwargs): try: values = values.reshape(shape) values = _block_shape(values, ndim=self.ndim) - except AttributeError: + except (AttributeError, NotImplementedError): pass newb = make_block(values, ndim=self.ndim, placement=[rl]) blocks.append(newb) @@ -3616,7 +3616,7 @@ def value_getitem(placement): return value else: if value.ndim == self.ndim - 1: - value = value.reshape((1,) + value.shape) + value = _safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -4686,6 +4686,28 @@ def rrenamer(x): _transform_index(right, rrenamer)) +def _safe_reshape(arr, new_shape): + """ + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a Categorical or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. + + Parameters + ---------- + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape + """ + if isinstance(arr, ABCSeries): + arr = arr._values + if not isinstance(arr, Categorical): + arr = arr.reshape(new_shape) + return arr + + def _transform_index(index, func): """ Apply function to all values found in index. diff --git a/pandas/core/series.py b/pandas/core/series.py index 2c7f298dde2ec..b933f68cfad62 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -843,14 +843,22 @@ def repeat(self, reps, *args, **kwargs): def reshape(self, *args, **kwargs): """ - Return the values attribute of `self` with shape `args`. 
- However, if the specified shape matches exactly the current - shape, `self` is returned for compatibility reasons. + DEPRECATED: calling this method will raise an error in a + future release. Please call ``.values.reshape(...)`` instead. + + return an ndarray with the values shape + if the specified shape matches exactly the current shape, then + return self (for compat) See also -------- numpy.ndarray.reshape """ + warnings.warn("reshape is deprecated and will raise " + "in a subsequent release. Please use " + ".values.reshape(...) instead", FutureWarning, + stacklevel=2) + if len(args) == 1 and hasattr(args[0], '__iter__'): shape = args[0] else: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 5c9938c932da2..b013d6ccb0b8e 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -957,6 +957,16 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + def reshape(self, *args, **kwargs): + """ + NOT IMPLEMENTED: do not call this method, as reshaping is not + supported for Index objects and will raise an error. + + Reshape an Index. + """ + raise NotImplementedError("reshaping is not supported " + "for Index objects") + @property def _has_complex_internals(self): # to disable groupby tricks in MultiIndex diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 14e2c9b371296..94f390955dddd 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -61,7 +61,7 @@ from pandas.core.generic import NDFrame from pandas.core.common import PerformanceWarning from pandas.io.common import get_filepath_or_buffer -from pandas.core.internals import BlockManager, make_block +from pandas.core.internals import BlockManager, make_block, _safe_reshape import pandas.core.internals as internals from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType @@ -622,8 +622,9 @@ def decode(obj): axes = obj[u'axes'] def create_block(b): - values = unconvert(b[u'values'], dtype_for(b[u'dtype']), - b[u'compress']).reshape(b[u'shape']) + values = _safe_reshape(unconvert( + b[u'values'], dtype_for(b[u'dtype']), + b[u'compress']), b[u'shape']) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 67869901b068e..06662e52e3a6f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1413,6 +1413,12 @@ def test_take_fill_value(self): with tm.assertRaises(IndexError): idx.take(np.array([1, -5])) + def test_reshape_raise(self): + msg = "reshaping is not supported" + idx = pd.Index([0, 1, 2]) + tm.assertRaisesRegexp(NotImplementedError, msg, + idx.reshape, idx.shape) + def test_reindex_preserves_name_if_target_is_list_or_ndarray(self): # GH6552 idx = pd.Index([0, 1, 2]) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d9e2d8096c8d7..34cfb2f0c1529 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1554,49 +1554,63 @@ def test_shift_categorical(self): assert_index_equal(s.values.categories, sp1.values.categories) assert_index_equal(s.values.categories, sn2.values.categories) - def test_reshape_non_2d(self): - # GH 4554 - x = Series(np.random.random(201), name='x') - self.assertTrue(x.reshape(x.shape, ) is x) + def test_reshape_deprecate(self): + x = Series(np.random.random(10), name='x') + tm.assert_produces_warning(FutureWarning, x.reshape, x.shape) - # GH 2719 - a = Series([1, 2, 3, 
4]) - result = a.reshape(2, 2) - expected = a.values.reshape(2, 2) - tm.assert_numpy_array_equal(result, expected) - self.assertIsInstance(result, type(expected)) + def test_reshape_non_2d(self): + # see gh-4554 + with tm.assert_produces_warning(FutureWarning): + x = Series(np.random.random(201), name='x') + self.assertTrue(x.reshape(x.shape, ) is x) + + # see gh-2719 + with tm.assert_produces_warning(FutureWarning): + a = Series([1, 2, 3, 4]) + result = a.reshape(2, 2) + expected = a.values.reshape(2, 2) + tm.assert_numpy_array_equal(result, expected) + self.assertIsInstance(result, type(expected)) def test_reshape_2d_return_array(self): x = Series(np.random.random(201), name='x') - result = x.reshape((-1, 1)) - self.assertNotIsInstance(result, Series) - result2 = np.reshape(x, (-1, 1)) - self.assertNotIsInstance(result2, Series) + with tm.assert_produces_warning(FutureWarning): + result = x.reshape((-1, 1)) + self.assertNotIsInstance(result, Series) - result = x[:, None] - expected = x.reshape((-1, 1)) - assert_almost_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result2 = np.reshape(x, (-1, 1)) + self.assertNotIsInstance(result2, Series) + + with tm.assert_produces_warning(FutureWarning): + result = x[:, None] + expected = x.reshape((-1, 1)) + assert_almost_equal(result, expected) def test_reshape_bad_kwarg(self): a = Series([1, 2, 3, 4]) - msg = "'foo' is an invalid keyword argument for this function" - tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "'foo' is an invalid keyword argument for this function" + tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) - msg = "reshape\(\) got an unexpected keyword argument 'foo'" - tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "reshape\(\) got an unexpected keyword argument 'foo'" + tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) def test_numpy_reshape(self): a = Series([1, 2, 3, 4]) - result = np.reshape(a, (2, 2)) - expected = a.values.reshape(2, 2) - tm.assert_numpy_array_equal(result, expected) - self.assertIsInstance(result, type(expected)) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = np.reshape(a, (2, 2)) + expected = a.values.reshape(2, 2) + tm.assert_numpy_array_equal(result, expected) + self.assertIsInstance(result, type(expected)) - result = np.reshape(a, a.shape) - tm.assert_series_equal(result, a) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = np.reshape(a, a.shape) + tm.assert_series_equal(result, a) def test_unstack(self): from numpy import nan diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2ca1fc71df20a..dd39861ac3114 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4058,13 +4058,40 @@ def test_numpy_repeat(self): msg = "the 'axis' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.repeat, cat, 2, axis=1) + def test_reshape(self): + cat = pd.Categorical([], categories=["a", "b"]) + tm.assert_produces_warning(FutureWarning, cat.reshape, 0) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(0), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([], 
categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape((5, -1)), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(cat.shape), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(cat.size), cat) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "can only specify one unknown dimension" + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + tm.assertRaisesRegexp(ValueError, msg, cat.reshape, (-2, -1)) + def test_numpy_reshape(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - self.assert_categorical_equal(np.reshape(cat, cat.shape), cat) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(np.reshape(cat, cat.shape), cat) - msg = "the 'order' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.reshape, - cat, cat.shape, order='F') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "the 'order' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.reshape, + cat, cat.shape, order='F') def test_na_actions(self): From 3f6d4bdd63d9a1ae27e587bd033e507f7a5e1109 Mon Sep 17 00:00:00 2001 From: yui-knk Date: Thu, 14 Jul 2016 06:47:32 -0400 Subject: [PATCH 14/50] CLN: Fix compile time warnings Author: yui-knk Closes #13643 from yui-knk/warning2 and squashes the following commits: ee3a4fb [yui-knk] CLN: Fix compile time warnings --- pandas/src/datetime/np_datetime.c | 2 +- pandas/src/ujson/python/objToJSON.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c index c30b404d2b8b2..80703c8b08de6 100644 --- a/pandas/src/datetime/np_datetime.c +++ b/pandas/src/datetime/np_datetime.c @@ -576,7 +576,7 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, } PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { - return ((PyDatetimeScalarObject *) obj)->obmeta.base; + return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *) obj)->obmeta.base; } diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 1080e9548ba56..75de63acbd7d6 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -493,7 +493,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outV PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) _obj; PRINTMARK(); - pandas_datetime_to_datetimestruct(obj->obval, obj->obmeta.base, &dts); + pandas_datetime_to_datetimestruct(obj->obval, (PANDAS_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } From c9a27ede0925ddbaa8d3ec9efd3c332a636505cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Jul 2016 16:26:07 +0200 Subject: [PATCH 15/50] CLN: fix some issues in asv benchmark suite (#13630) * CLN: fix params list * Fix issue in asv.conf.json for win32+other environment Fix mistaken exclusion of virtualenv or existing:same on win32 in the config. 
Credits: @pv * CLN: remove DataMatrix * ASV: fix exlusion of tables package for non-conda environments --- asv_bench/asv.conf.json | 6 +++--- asv_bench/benchmarks/indexing.py | 20 -------------------- asv_bench/benchmarks/inference.py | 10 +++++----- asv_bench/benchmarks/join_merge.py | 16 ---------------- 4 files changed, 8 insertions(+), 44 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 7b9fe353df2e3..f5fa849464881 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -77,11 +77,11 @@ // On conda install pytables, otherwise tables {"environment_type": "conda", "tables": ""}, {"environment_type": "conda", "pytables": null}, - {"environment_type": "virtualenv", "tables": null}, - {"environment_type": "virtualenv", "pytables": ""}, + {"environment_type": "(?!conda).*", "tables": null}, + {"environment_type": "(?!conda).*", "pytables": ""}, // On conda&win32, install libpython {"sys_platform": "(?!win32).*", "libpython": ""}, - {"sys_platform": "win32", "libpython": null}, + {"environment_type": "conda", "sys_platform": "win32", "libpython": null}, {"environment_type": "(?!conda).*", "libpython": ""} ], "include": [], diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 53d37a8161f43..094ae23a92fad 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -19,24 +19,6 @@ def time_dataframe_getitem_scalar(self): self.df[self.col][self.idx] -class datamatrix_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - try: - self.klass = DataMatrix - except: - self.klass = DataFrame - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - def time_datamatrix_getitem_scalar(self): - self.df[self.col][self.idx] - - class series_get_value(object): goal_time = 0.2 @@ -498,5 +480,3 @@ def setup(self): def time_float_loc(self): self.ind.get_loc(0) - - diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 6809c351beade..ee9d3104be4b1 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -143,12 +143,12 @@ class to_numeric(object): param_names = ['data', 'downcast'] params = [ - [(['1'] * N / 2) + ([2] * N / 2), - (['-1'] * N / 2) + ([2] * N / 2), - np.repeat(np.array('1970-01-01', '1970-01-02', + [(['1'] * (N / 2)) + ([2] * (N / 2)), + (['-1'] * (N / 2)) + ([2] * (N / 2)), + np.repeat(np.array(['1970-01-01', '1970-01-02'], dtype='datetime64[D]'), N), - (['1.1'] * N / 2) + ([2] * N / 2), - ([1] * N / 2) + ([2] * N / 2), + (['1.1'] * (N / 2)) + ([2] * (N / 2)), + ([1] * (N / 2)) + ([2] * (N / 2)), np.repeat(np.int32(1), N)], [None, 'integer', 'signed', 'unsigned', 'float'], ] diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 39ebd9cb1cb73..dcd07911f2ff0 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -179,10 +179,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) 
self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -210,10 +206,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -241,10 +233,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -272,10 +260,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) From 05b976c9339bad84f488c8d6813ed19232c9255c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 14 Jul 2016 20:06:52 -0400 Subject: [PATCH 16/50] TST: add tests for Timestamp.toordinal/fromordinal follow-up for #13593 Author: sinhrks Closes #13610 from sinhrks/depr_timestamp_offset2 and squashes the following commits: 28f8d41 [sinhrks] TST: add tests for Timestamp.toordinal --- pandas/tseries/tests/test_tslib.py | 27 +++++++++++++++++++++++++++ pandas/tslib.pyx | 21 ++++++++++++++++++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index ce88edcf4249b..31d6393c1c26e 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -255,6 +255,18 @@ def test_constructor_keyword(self): hour=1, minute=2, second=3, microsecond=999999)), repr(Timestamp('2015-11-12 01:02:03.999999'))) + def test_constructor_fromordinal(self): + base = datetime.datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal(), freq='D') + self.assertEqual(base, ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') + self.assertEqual(pd.Timestamp('2000-01-01', tz='US/Eastern'), ts) + self.assertEqual(base.toordinal(), ts.toordinal()) + def test_constructor_offset_depr(self): # GH 12160 with tm.assert_produces_warning(FutureWarning, @@ -270,6 +282,21 @@ def test_constructor_offset_depr(self): with tm.assertRaisesRegexp(TypeError, msg): Timestamp('2011-01-01', offset='D', freq='D') + def test_constructor_offset_depr_fromordinal(self): + # GH 12160 + base = 
datetime.datetime(2000, 1, 1) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp.fromordinal(base.toordinal(), offset='D') + self.assertEqual(pd.Timestamp('2000-01-01'), ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 650b4c7979d8d..2af08f2713262 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -235,12 +235,14 @@ class Timestamp(_Timestamp): ---------- ts_input : datetime-like, str, int, float Value to be converted to Timestamp - offset : str, DateOffset + freq : str, DateOffset Offset which Timestamp will have tz : string, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : string numpy unit used for conversion, if ts_input is int or float + offset : str, DateOffset + Deprecated, use freq The other two forms mimic the parameters from ``datetime.datetime``. They can be passed by either position or keyword, but not both mixed together. @@ -262,8 +264,21 @@ class Timestamp(_Timestamp): @classmethod def fromordinal(cls, ordinal, freq=None, tz=None, offset=None): - """ passed an ordinal, translate and convert to a ts - note: by definition there cannot be any tz info on the ordinal itself """ + """ + passed an ordinal, translate and convert to a ts + note: by definition there cannot be any tz info on the ordinal itself + + Parameters + ---------- + ordinal : int + date corresponding to a proleptic Gregorian ordinal + freq : str, DateOffset + Offset which Timestamp will have + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will have. + offset : str, DateOffset + Deprecated, use freq + """ return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz, offset=offset) @classmethod From 71a06752a7040a75402f3e30a82b96e10816b492 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 14 Jul 2016 20:12:33 -0400 Subject: [PATCH 17/50] CLN: Initialization coincides with mapping, hence with uniqueness check - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` Rebased version of https://github.com/pydata/pandas/pull/10229 which was [actually not](h ttps://github.com/pydata/pandas/pull/10229#issuecomment-131470116) fixed by https://github.com/pydata/pandas/pull/10199. Nothing particular relevant, just wanted to delete this branch locally and noticed it still applies: you'll judge what to do of it. 
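
The cleanup in this patch rests on one observation: populating the engine's value-to-position mapping already determines uniqueness, because the mapping ends up smaller than the values exactly when duplicates exist, so a separate "unique_check" flag adds nothing once the engine is initialized. The plain-Python analogue below is only an illustrative sketch under that assumption; it is not the Cython IndexEngine code touched here, and the name SimpleEngine is invented for the example.

    # Hypothetical, simplified analogue of the idea: building the hash-table
    # mapping doubles as the uniqueness check, so no extra flag is needed
    # once the engine has been initialized.
    class SimpleEngine(object):
        def __init__(self, values):
            self.values = list(values)
            self.initialized = False
            self.unique = False
            self.mapping = None

        def initialize(self):
            # Populate value -> position; duplicate keys collapse onto one entry.
            self.mapping = {}
            for i, v in enumerate(self.values):
                self.mapping[v] = i
            # Uniqueness falls out of the mapping size, no second pass needed.
            self.unique = len(self.mapping) == len(self.values)
            self.initialized = True

        @property
        def is_unique(self):
            # Initialization coincides with the uniqueness check.
            if not self.initialized:
                self.initialize()
            return self.unique

    print(SimpleEngine([1, 2, 3]).is_unique)   # True
    print(SimpleEngine([1, 2, 2]).is_unique)   # False
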
Author: Pietro Battiston Closes #13594 from toobaz/fix_checkunique and squashes the following commits: a63bd12 [Pietro Battiston] CLN: Initialization coincides with mapping, hence with uniqueness check --- pandas/index.pyx | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 71717dd2d771b..bc985100692fc 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -80,7 +80,7 @@ cdef class IndexEngine: cdef: bint unique, monotonic_inc, monotonic_dec - bint initialized, monotonic_check, unique_check + bint initialized, monotonic_check def __init__(self, vgetter, n): self.vgetter = vgetter @@ -91,7 +91,6 @@ cdef class IndexEngine: self.monotonic_check = 0 self.unique = 0 - self.unique_check = 0 self.monotonic_inc = 0 self.monotonic_dec = 0 @@ -211,8 +210,8 @@ cdef class IndexEngine: property is_unique: def __get__(self): - if not self.unique_check: - self._do_unique_check() + if not self.initialized: + self.initialize() return self.unique == 1 @@ -246,9 +245,6 @@ cdef class IndexEngine: cdef _get_index_values(self): return self.vgetter() - cdef inline _do_unique_check(self): - self._ensure_mapping_populated() - def _call_monotonic(self, values): raise NotImplementedError @@ -270,7 +266,6 @@ cdef class IndexEngine: if len(self.mapping) == len(values): self.unique = 1 - self.unique_check = 1 self.initialized = 1 From 0a70b5fef3ae2363fea040ea47dd52247811c8c8 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 14 Jul 2016 20:26:01 -0400 Subject: [PATCH 18/50] API: Change Period('NAT') to return NaT closes #12759 closes #13582 Author: sinhrks Closes #13609 from sinhrks/period_nat and squashes the following commits: 9305c36 [sinhrks] COMPAT: Period(NaT) now returns pd.NaT --- doc/source/whatsnew/v0.19.0.txt | 39 +++ pandas/src/period.pyx | 269 ++++++++++--------- pandas/tests/indexes/test_datetimelike.py | 9 +- pandas/tseries/period.py | 49 ++-- pandas/tseries/tests/test_base.py | 26 +- pandas/tseries/tests/test_period.py | 305 +++++++++++++--------- pandas/tseries/tests/test_tslib.py | 7 + pandas/tslib.pyx | 5 +- 8 files changed, 407 insertions(+), 302 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 688f3b7ff6ada..c9f501c682a18 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -446,6 +446,45 @@ Furthermore: - Passing duplicated ``percentiles`` will now raise a ``ValueError``. - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) +.. _whatsnew_0190.api.periodnat: + +``Period('NaT')`` now returns ``pd.NaT`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, ``Period`` has its own ``Period('NaT')`` representation different from ``pd.NaT``. Now ``Period('NaT')`` has been changed to return ``pd.NaT``. (:issue:`12759`, :issue:`13582`) + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.Period('NaT', freq='D') + Out[5]: Period('NaT', 'D') + +New Behavior: + +.. ipython:: python + + pd.Period('NaT') + + +To be compat with ``Period`` addition and subtraction, ``pd.NaT`` now supports addition and subtraction with ``int``. Previously it raises ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.NaT + 1 + ... + ValueError: Cannot add integral value to Timestamp without freq. + +New Behavior: + +.. ipython:: python + + pd.NaT + 1 + pd.NaT - 1 + + .. 
_whatsnew_0190.deprecations: Deprecations diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index af2e295ae0cfc..37f265ede07e7 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -472,7 +472,11 @@ def extract_ordinals(ndarray[object] values, freq): except AttributeError: p = Period(p, freq=freq) - ordinals[i] = p.ordinal + if p is tslib.NaT: + # input may contain NaT-like string + ordinals[i] = tslib.iNaT + else: + ordinals[i] = p.ordinal return ordinals @@ -665,24 +669,8 @@ class IncompatibleFrequency(ValueError): pass -cdef class Period(object): - """ - Represents an period of time +cdef class _Period(object): - Parameters - ---------- - value : Period or compat.string_types, default None - The time period represented (e.g., '4Q2005') - freq : str, default None - One of pandas period strings or corresponding objects - year : int, default None - month : int, default 1 - quarter : int, default None - day : int, default 1 - hour : int, default 0 - minute : int, default 0 - second : int, default 0 - """ cdef public: int64_t ordinal object freq @@ -711,97 +699,22 @@ cdef class Period(object): @classmethod def _from_ordinal(cls, ordinal, freq): """ fast creation from an ordinal and freq that are already validated! """ - self = Period.__new__(cls) - self.ordinal = ordinal - self.freq = cls._maybe_convert_freq(freq) - return self - - def __init__(self, value=None, freq=None, ordinal=None, - year=None, month=1, quarter=None, day=1, - hour=0, minute=0, second=0): - # freq points to a tuple (base, mult); base is one of the defined - # periods such as A, Q, etc. Every five minutes would be, e.g., - # ('T', 5) but may be passed in as a string like '5T' - - # ordinal is the period offset from the gregorian proleptic epoch - - if ordinal is not None and value is not None: - raise ValueError(("Only value or ordinal but not both should be " - "given but not both")) - elif ordinal is not None: - if not lib.is_integer(ordinal): - raise ValueError("Ordinal must be an integer") - if freq is None: - raise ValueError('Must supply freq for ordinal value') - - elif value is None: - if freq is None: - raise ValueError("If value is None, freq cannot be None") - ordinal = _ordinal_from_fields(year, month, quarter, day, - hour, minute, second, freq) - - elif isinstance(value, Period): - other = value - if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): - ordinal = other.ordinal - freq = other.freq - else: - converted = other.asfreq(freq) - ordinal = converted.ordinal - - elif is_null_datetimelike(value) or value in tslib._nat_strings: - ordinal = tslib.iNaT - if freq is None: - raise ValueError("If value is NaT, freq cannot be None " - "because it cannot be inferred") - - elif isinstance(value, compat.string_types) or lib.is_integer(value): - if lib.is_integer(value): - value = str(value) - value = value.upper() - dt, _, reso = parse_time_string(value, freq) - - if freq is None: - try: - freq = frequencies.Resolution.get_freq(reso) - except KeyError: - raise ValueError("Invalid frequency or could not infer: %s" % reso) - - elif isinstance(value, datetime): - dt = value - if freq is None: - raise ValueError('Must supply freq for datetime value') - elif isinstance(value, np.datetime64): - dt = Timestamp(value) - if freq is None: - raise ValueError('Must supply freq for datetime value') - elif isinstance(value, date): - dt = datetime(year=value.year, month=value.month, day=value.day) - if freq is None: - raise ValueError('Must supply freq for datetime 
value') - else: - msg = "Value must be Period, string, integer, or datetime" - raise ValueError(msg) - - base, mult = frequencies.get_freq_code(freq) - - if ordinal is None: - self.ordinal = get_period_ordinal(dt.year, dt.month, dt.day, - dt.hour, dt.minute, dt.second, - dt.microsecond, 0, base) + if ordinal == tslib.iNaT: + return tslib.NaT else: + self = _Period.__new__(cls) self.ordinal = ordinal - - self.freq = self._maybe_convert_freq(freq) + self.freq = cls._maybe_convert_freq(freq) + return self def __richcmp__(self, other, op): if isinstance(other, Period): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: - return _nat_scalar_rules[op] return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) + elif other is tslib.NaT: + return _nat_scalar_rules[op] # index/series like elif hasattr(other, '_typ'): return NotImplemented @@ -824,10 +737,7 @@ cdef class Period(object): offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + (nanos // offset_nanos) + ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) msg = 'Input cannot be converted to Period(freq={0})' raise IncompatibleFrequency(msg.format(self.freqstr)) @@ -835,10 +745,7 @@ cdef class Period(object): freqstr = frequencies.get_standard_freq(other) base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + other.n + ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) @@ -853,10 +760,7 @@ cdef class Period(object): elif other is tslib.NaT: return tslib.NaT elif lib.is_integer(other): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + other * self.freq.n + ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) else: # pragma: no cover return NotImplemented @@ -872,17 +776,12 @@ cdef class Period(object): neg_other = -other return self + neg_other elif lib.is_integer(other): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal - other * self.freq.n + ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif isinstance(other, Period): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: - return Period(ordinal=tslib.iNaT, freq=self.freq) return self.ordinal - other.ordinal elif getattr(other, '_typ', None) == 'periodindex': return -other.__sub__(self) @@ -914,16 +813,13 @@ cdef class Period(object): base1, mult1 = frequencies.get_freq_code(self.freq) base2, mult2 = frequencies.get_freq_code(freq) - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal + # mult1 can't be negative or 0 + end = how == 'E' + if end: + ordinal = self.ordinal + mult1 - 1 else: - # mult1 can't be negative or 0 - end = how == 'E' - if end: - ordinal = self.ordinal + mult1 - 1 - else: - ordinal = self.ordinal - ordinal = period_asfreq(ordinal, base1, base2, end) + ordinal = self.ordinal + ordinal = period_asfreq(ordinal, base1, base2, end) return 
Period(ordinal=ordinal, freq=freq) @@ -933,12 +829,9 @@ cdef class Period(object): @property def end_time(self): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - # freq.n can't be negative or 0 - # ordinal = (self + self.freq.n).start_time.value - 1 - ordinal = (self + 1).start_time.value - 1 + # freq.n can't be negative or 0 + # ordinal = (self + self.freq.n).start_time.value - 1 + ordinal = (self + 1).start_time.value - 1 return Timestamp(ordinal) def to_timestamp(self, freq=None, how='start', tz=None): @@ -1199,8 +1092,114 @@ cdef class Period(object): return period_format(self.ordinal, base, fmt) -def _ordinal_from_fields(year, month, quarter, day, hour, minute, - second, freq): +class Period(_Period): + """ + Represents an period of time + + Parameters + ---------- + value : Period or compat.string_types, default None + The time period represented (e.g., '4Q2005') + freq : str, default None + One of pandas period strings or corresponding objects + year : int, default None + month : int, default 1 + quarter : int, default None + day : int, default 1 + hour : int, default 0 + minute : int, default 0 + second : int, default 0 + """ + + def __new__(cls, value=None, freq=None, ordinal=None, + year=None, month=None, quarter=None, day=None, + hour=None, minute=None, second=None): + # freq points to a tuple (base, mult); base is one of the defined + # periods such as A, Q, etc. Every five minutes would be, e.g., + # ('T', 5) but may be passed in as a string like '5T' + + # ordinal is the period offset from the gregorian proleptic epoch + + cdef _Period self + + if ordinal is not None and value is not None: + raise ValueError(("Only value or ordinal but not both should be " + "given but not both")) + elif ordinal is not None: + if not lib.is_integer(ordinal): + raise ValueError("Ordinal must be an integer") + if freq is None: + raise ValueError('Must supply freq for ordinal value') + + elif value is None: + if (year is None and month is None and quarter is None and + day is None and hour is None and minute is None and second is None): + ordinal = tslib.iNaT + else: + if freq is None: + raise ValueError("If value is None, freq cannot be None") + + # set defaults + month = 1 if month is None else month + day = 1 if day is None else day + hour = 0 if hour is None else hour + minute = 0 if minute is None else minute + second = 0 if second is None else second + + ordinal = _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq) + + elif isinstance(value, Period): + other = value + if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): + ordinal = other.ordinal + freq = other.freq + else: + converted = other.asfreq(freq) + ordinal = converted.ordinal + + elif is_null_datetimelike(value) or value in tslib._nat_strings: + ordinal = tslib.iNaT + + elif isinstance(value, compat.string_types) or lib.is_integer(value): + if lib.is_integer(value): + value = str(value) + value = value.upper() + dt, _, reso = parse_time_string(value, freq) + + if freq is None: + try: + freq = frequencies.Resolution.get_freq(reso) + except KeyError: + raise ValueError("Invalid frequency or could not infer: %s" % reso) + + elif isinstance(value, datetime): + dt = value + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, np.datetime64): + dt = Timestamp(value) + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, 
month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') + else: + msg = "Value must be Period, string, integer, or datetime" + raise ValueError(msg) + + if ordinal is None: + base, mult = frequencies.get_freq_code(freq) + ordinal = get_period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + dt.microsecond, 0, base) + + return cls._from_ordinal(ordinal, freq) + + +def _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq): base, mult = frequencies.get_freq_code(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 9eba481a66685..5c21f71d64660 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -741,14 +741,7 @@ def test_astype(self): result = idx.astype(object) expected = Index([Period('2016-05-16', freq='D')] + [Period(NaT, freq='D')] * 3, dtype='object') - # Hack because of lack of support for Period null checking (GH12759) - tm.assert_index_equal(result[:1], expected[:1]) - result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64) - expected_arr = np.asarray([p.ordinal for p in expected], - dtype=np.int64) - tm.assert_numpy_array_equal(result_arr, expected_arr) - # TODO: When GH12759 is resolved, change the above hack to: - # tm.assert_index_equal(result, expected) # now, it raises. + tm.assert_index_equal(result, expected) result = idx.astype(int) expected = Int64Index([16937] + [-9223372036854775808] * 3, diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 45f634050a5d8..dffb71cff526a 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -92,13 +92,14 @@ def wrapper(self, other): result[mask] = nat_result return result + elif other is tslib.NaT: + result = np.empty(len(self.values), dtype=bool) + result.fill(nat_result) else: other = Period(other, freq=self.freq) func = getattr(self.values, opname) result = func(other.ordinal) - if other.ordinal == tslib.iNaT: - result.fill(nat_result) mask = self.values == tslib.iNaT if mask.any(): result[mask] = nat_result @@ -235,7 +236,7 @@ def _from_arraylike(cls, data, freq, tz): data = _ensure_int64(data) if freq is None: raise ValueError('freq not specified') - data = np.array([Period(x, freq=freq).ordinal for x in data], + data = np.array([Period(x, freq=freq) for x in data], dtype=np.int64) except (TypeError, ValueError): data = _ensure_object(data) @@ -322,15 +323,18 @@ def _na_value(self): return self._box_func(tslib.iNaT) def __contains__(self, key): - if not isinstance(key, Period) or key.freq != self.freq: - if isinstance(key, compat.string_types): - try: - self.get_loc(key) - return True - except Exception: - return False + if isinstance(key, Period): + if key.freq != self.freq: + return False + else: + return key.ordinal in self._engine + else: + try: + self.get_loc(key) + return True + except Exception: + return False return False - return key.ordinal in self._engine def __array_wrap__(self, result, context=None): """ @@ -622,17 +626,13 @@ def _sub_period(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if other.ordinal == tslib.iNaT: - new_data = np.empty(len(self)) - new_data.fill(np.nan) - else: - asi8 = self.asi8 - new_data = asi8 - other.ordinal + asi8 = self.asi8 + new_data = asi8 - other.ordinal - if self.hasnans: - mask = asi8 == 
tslib.iNaT - new_data = new_data.astype(np.float64) - new_data[mask] = np.nan + if self.hasnans: + mask = asi8 == tslib.iNaT + new_data = new_data.astype(np.float64) + new_data[mask] = np.nan # result must be Int64Index or Float64Index return Index(new_data, name=self.name) @@ -740,8 +740,10 @@ def get_loc(self, key, method=None, tolerance=None): # we cannot construct the Period # as we have an invalid type raise KeyError(key) + try: - return Index.get_loc(self, key.ordinal, method, tolerance) + ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal + return Index.get_loc(self, ordinal, method, tolerance) except KeyError: raise KeyError(key) @@ -1044,8 +1046,7 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): if is_start_per and is_end_per and start.freq != end.freq: raise ValueError('Start and end must have same freq') - if ((is_start_per and start.ordinal == tslib.iNaT) or - (is_end_per and end.ordinal == tslib.iNaT)): + if (start is tslib.NaT or end is tslib.NaT): raise ValueError('Start and end must not be NaT') if freq is None: diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 68cea17ba3fc9..958a10c329a46 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1587,17 +1587,16 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) + tm.assert_index_equal(result, expected) for i in [0, 1, 3]: - self.assertTrue(result[i], expected[i]) - self.assertTrue(result[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result[2].freq, 'D') + self.assertEqual(result[i], expected[i]) + self.assertIs(result[2], pd.NaT) self.assertEqual(result.name, expected.name) result_list = idx.tolist() for i in [0, 1, 3]: - self.assertTrue(result_list[i], expected_list[i]) - self.assertTrue(result_list[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result_list[2].freq, 'D') + self.assertEqual(result_list[i], expected_list[i]) + self.assertIs(result_list[2], pd.NaT) def test_minmax(self): @@ -1623,18 +1622,15 @@ def test_minmax(self): # Return NaT obj = PeriodIndex([], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) obj = PeriodIndex([pd.NaT], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) obj = PeriodIndex([pd.NaT, pd.NaT, pd.NaT], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) def test_numpy_minmax(self): pr = pd.period_range(start='2016-01-15', end='2016-01-20') @@ -1735,9 +1731,9 @@ def test_representation_to_series(self): 2 2013 dtype: object""" - exp6 = """0 2011-01-01 09:00 -1 2012-02-01 10:00 -2 NaT + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT dtype: object""" exp7 = """0 2013Q1 diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 591fa19aad585..8d217ff0753a6 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -36,14 +36,17 @@ def test_quarterly_negative_ordinals(self): p = Period(ordinal=-1, freq='Q-DEC') self.assertEqual(p.year, 1969) self.assertEqual(p.quarter, 4) + self.assertIsInstance(p, Period) p = Period(ordinal=-2, freq='Q-DEC') self.assertEqual(p.year, 1969) self.assertEqual(p.quarter, 3) + self.assertIsInstance(p, 
Period) p = Period(ordinal=-2, freq='M') self.assertEqual(p.year, 1969) self.assertEqual(p.month, 11) + self.assertIsInstance(p, Period) def test_period_cons_quarterly(self): # bugs in scikits.timeseries @@ -67,6 +70,7 @@ def test_period_cons_annual(self): stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) p = Period(stamp, freq=freq) self.assertEqual(p, exp + 1) + self.assertIsInstance(p, Period) def test_period_cons_weekly(self): for num in range(10, 17): @@ -77,34 +81,46 @@ def test_period_cons_weekly(self): result = Period(daystr, freq=freq) expected = Period(daystr, freq='D').asfreq(freq) self.assertEqual(result, expected) + self.assertIsInstance(result, Period) + + def test_period_from_ordinal(self): + p = pd.Period('2011-01', freq='M') + res = pd.Period._from_ordinal(p.ordinal, freq='M') + self.assertEqual(p, res) + self.assertIsInstance(res, Period) def test_period_cons_nat(self): p = Period('NaT', freq='M') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'M') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period('nat', freq='W-SUN') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'W-SUN') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period(tslib.iNaT, freq='D') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'D') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period(tslib.iNaT, freq='3D') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, offsets.Day(3)) - self.assertEqual(p.freqstr, '3D') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) + + p = Period('NaT') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT) + self.assertIs(p, pd.NaT) + + def test_cons_null_like(self): + # check Timestamp compat + self.assertIs(Timestamp('NaT'), pd.NaT) + self.assertIs(Period('NaT'), pd.NaT) + + self.assertIs(Timestamp(None), pd.NaT) + self.assertIs(Period(None), pd.NaT) - self.assertRaises(ValueError, Period, 'NaT') + self.assertIs(Timestamp(float('nan')), pd.NaT) + self.assertIs(Period(float('nan')), pd.NaT) + + self.assertIs(Timestamp(np.nan), pd.NaT) + self.assertIs(Period(np.nan), pd.NaT) def test_period_cons_mult(self): p1 = Period('2011-01', freq='3M') @@ -197,13 +213,6 @@ def test_timestamp_tz_arg_dateutil_from_string(self): freq='M').to_timestamp(tz='dateutil/Europe/Brussels') self.assertEqual(p.tz, gettz('Europe/Brussels')) - def test_timestamp_nat_tz(self): - t = Period('NaT', freq='M').to_timestamp() - self.assertTrue(t is tslib.NaT) - - t = Period('NaT', freq='M').to_timestamp(tz='Asia/Tokyo') - self.assertTrue(t is tslib.NaT) - def test_timestamp_mult(self): p = pd.Period('2011-01', freq='M') self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) @@ -213,12 +222,6 @@ def test_timestamp_mult(self): self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-03-31')) - def test_timestamp_nat_mult(self): - for freq in ['M', '3M']: - p = pd.Period('NaT', freq=freq) - self.assertTrue(p.to_timestamp(how='S') is pd.NaT) - self.assertTrue(p.to_timestamp(how='E') is pd.NaT) - def test_period_constructor(self): i1 = Period('1/1/2005', freq='M') i2 = Period('Jan 2005') @@ -552,9 +555,6 @@ def _ex(p): 
result = p.to_timestamp('5S', how='start') self.assertEqual(result, expected) - p = Period('NaT', freq='W') - self.assertTrue(p.to_timestamp() is tslib.NaT) - def test_start_time(self): freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] xp = datetime(2012, 1, 1) @@ -566,9 +566,6 @@ def test_start_time(self): self.assertEqual(Period('2012', freq='W').start_time, datetime(2011, 12, 26)) - p = Period('NaT', freq='W') - self.assertTrue(p.start_time is tslib.NaT) - def test_end_time(self): p = Period('2012', freq='A') @@ -607,9 +604,6 @@ def _ex(*args): xp = _ex(2012, 1, 16) self.assertEqual(xp, p.end_time) - p = Period('NaT', freq='W') - self.assertTrue(p.end_time is tslib.NaT) - def test_anchor_week_end_time(self): def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) @@ -758,15 +752,14 @@ def test_properties_secondly(self): def test_properties_nat(self): p_nat = Period('NaT', freq='M') t_nat = pd.Timestamp('NaT') + self.assertIs(p_nat, t_nat) + # confirm Period('NaT') work identical with Timestamp('NaT') for f in ['year', 'month', 'day', 'hour', 'minute', 'second', 'week', 'dayofyear', 'quarter', 'days_in_month']: self.assertTrue(np.isnan(getattr(p_nat, f))) self.assertTrue(np.isnan(getattr(t_nat, f))) - for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']: - self.assertTrue(np.isnan(getattr(p_nat, f))) - def test_pnow(self): dt = datetime.now() @@ -789,7 +782,7 @@ def test_constructor_corner(self): self.assertRaises(ValueError, Period, 1.6, freq='D') self.assertRaises(ValueError, Period, ordinal=1.6, freq='D') self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D') - self.assertRaises(ValueError, Period) + self.assertIs(Period(None), pd.NaT) self.assertRaises(ValueError, Period, month=1) p = Period('2007-01-01', freq='D') @@ -1526,12 +1519,6 @@ def test_conv_secondly(self): self.assertEqual(ival_S.asfreq('S'), ival_S) - def test_asfreq_nat(self): - p = Period('NaT', freq='A') - result = p.asfreq('M') - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') - def test_asfreq_mult(self): # normal freq to mult freq p = Period(freq='A', year=2007) @@ -1603,21 +1590,6 @@ def test_asfreq_mult(self): self.assertEqual(result.ordinal, expected.ordinal) self.assertEqual(result.freq, expected.freq) - def test_asfreq_mult_nat(self): - # normal freq to mult freq - for p in [Period('NaT', freq='A'), Period('NaT', freq='3A'), - Period('NaT', freq='2M'), Period('NaT', freq='3D')]: - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq) - expected = Period('NaT', freq='3A') - self.assertEqual(result.ordinal, pd.tslib.iNaT) - self.assertEqual(result.freq, expected.freq) - - result = p.asfreq(freq, how='S') - expected = Period('NaT', freq='3A') - self.assertEqual(result.ordinal, pd.tslib.iNaT) - self.assertEqual(result.freq, expected.freq) - class TestPeriodIndex(tm.TestCase): def setUp(self): @@ -1995,6 +1967,19 @@ def test_getitem_datetime(self): rs = ts[dt1:dt4] tm.assert_series_equal(rs, ts) + def test_getitem_nat(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) + self.assertIs(idx[1], tslib.NaT) + + s = pd.Series([0, 1, 2], index=idx) + self.assertEqual(s[pd.NaT], 1) + + s = pd.Series(idx, index=idx) + self.assertEqual(s[pd.Period('2011-01', freq='M')], + pd.Period('2011-01', freq='M')) + self.assertIs(s[pd.NaT], tslib.NaT) + def test_slice_with_negative_step(self): ts = Series(np.arange(20), period_range('2014-01', periods=20, freq='M')) @@ -2038,6 +2023,20 
@@ def test_contains(self): self.assertFalse(Period('2007-01', freq='D') in rng) self.assertFalse(Period('2007-01', freq='2M') in rng) + def test_contains_nat(self): + # GH13582 + idx = period_range('2007-01', freq='M', periods=10) + self.assertFalse(pd.NaT in idx) + self.assertFalse(None in idx) + self.assertFalse(float('nan') in idx) + self.assertFalse(np.nan in idx) + + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertTrue(pd.NaT in idx) + self.assertTrue(None in idx) + self.assertTrue(float('nan') in idx) + self.assertTrue(np.nan in idx) + def test_sub(self): rng = period_range('2007-01', periods=50) @@ -3292,6 +3291,17 @@ def test_get_loc_msg(self): except KeyError as inst: self.assertEqual(inst.args[0], bad_period) + def test_get_loc_nat(self): + didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) + pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + + # check DatetimeIndex compat + for idx in [didx, pidx]: + self.assertEqual(idx.get_loc(pd.NaT), 1) + self.assertEqual(idx.get_loc(None), 1) + self.assertEqual(idx.get_loc(float('nan')), 1) + self.assertEqual(idx.get_loc(np.nan), 1) + def test_append_concat(self): # #1815 d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') @@ -3576,95 +3586,87 @@ def test_add_offset_nat(self): for freq in ['A', '2A', '3A']: p = Period('NaT', freq=freq) for o in [offsets.YearEnd(2)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) + # freq is Tick for freq in ['D', '2D', '3D']: p = Period('NaT', freq=freq) for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), np.timedelta64(3600 * 24, 's'), timedelta(-2), timedelta(hours=48)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) for freq in 
['H', '2H', '3H']: p = Period('NaT', freq=freq) for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), np.timedelta64(3600, 's'), timedelta(minutes=120), timedelta(days=4, minutes=180)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if not isinstance(o, np.timedelta64): - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) def test_sub_pdnat(self): # GH 13071 @@ -3749,24 +3751,22 @@ def test_sub_offset_nat(self): for freq in ['A', '2A', '3A']: p = Period('NaT', freq=freq) for o in [offsets.YearEnd(2)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) # freq is Tick for freq in ['D', '2D', '3D']: @@ -3774,37 +3774,33 @@ def test_sub_offset_nat(self): for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), np.timedelta64(3600 * 24, 's'), timedelta(-2), timedelta(hours=48)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) for freq in ['H', '2H', '3H']: p = Period('NaT', freq=freq) for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), np.timedelta64(3600, 's'), timedelta(minutes=120), timedelta(days=4, minutes=180)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) def test_nat_ops(self): for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) - self.assertEqual((p - 1).ordinal, tslib.iNaT) - self.assertEqual((p - Period('2011-01', freq=freq)).ordinal, - tslib.iNaT) - self.assertEqual((Period('2011-01', freq=freq) - p).ordinal, - tslib.iNaT) + self.assertIs(p + 1, tslib.NaT) + self.assertIs(1 + p, tslib.NaT) + self.assertIs(p - 1, tslib.NaT) + self.assertIs(p - Period('2011-01', freq=freq), tslib.NaT) + self.assertIs(Period('2011-01', freq=freq) - p, tslib.NaT) def test_period_ops_offset(self): p = Period('2011-04-01', freq='D') @@ -3830,18 +3826,17 @@ class TestPeriodIndexSeriesMethods(tm.TestCase): def _check(self, values, func, expected): idx = 
pd.PeriodIndex(values) result = func(idx) - tm.assert_index_equal(result, pd.PeriodIndex(expected)) + if isinstance(expected, pd.Index): + tm.assert_index_equal(result, expected) + else: + # comp op results in bool + tm.assert_numpy_array_equal(result, expected) s = pd.Series(values) result = func(s) - exp = pd.Series(expected) - # Period(NaT) != Period(NaT) - - lmask = result.map(lambda x: x.ordinal != tslib.iNaT) - rmask = exp.map(lambda x: x.ordinal != tslib.iNaT) - tm.assert_series_equal(lmask, rmask) - tm.assert_series_equal(result[lmask], exp[rmask]) + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) def test_pi_ops(self): idx = PeriodIndex(['2011-01', '2011-02', '2011-03', @@ -3962,7 +3957,7 @@ def test_pi_sub_period(self): exp = pd.Index([12, 11, 10, 9], name='idx') tm.assert_index_equal(result, exp) - exp = pd.Index([np.nan, np.nan, np.nan, np.nan], name='idx') + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) @@ -3987,10 +3982,82 @@ def test_pi_sub_period_nat(self): exp = pd.Index([12, np.nan, 10, 9], name='idx') tm.assert_index_equal(result, exp) - exp = pd.Index([np.nan, np.nan, np.nan, np.nan], name='idx') + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + def test_pi_comp_period(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > pd.Period('2011-03', freq='M') + exp = np.array([False, False, False, True], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + def test_pi_comp_period_nat(self): + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x == tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: x != tslib.NaT + exp = np.array([True, True, True, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, False, True, False], dtype=np.bool) + self._check(idx, f, exp) 
+ + f = lambda x: x < pd.Period('2011-03', freq='M') + exp = np.array([True, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: tslib.NaT >= x + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + class TestPeriodRepresentation(tm.TestCase): """ diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 31d6393c1c26e..6696c03a070f7 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1224,6 +1224,13 @@ def test_nat_arithmetic(self): self.assertIs(left - right, pd.NaT) self.assertIs(right - left, pd.NaT) + # int addition / subtraction + for (left, right) in [(pd.NaT, 2), (pd.NaT, 0), (pd.NaT, -3)]: + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(left - right, pd.NaT) + self.assertIs(right - left, pd.NaT) + def test_nat_arithmetic_index(self): # GH 11718 diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 2af08f2713262..c681cebd84836 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1097,7 +1097,10 @@ cdef class _Timestamp(datetime): return Timestamp(self.value + other_int, tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): - if self.freq is None: + if self is NaT: + # to be compat with Period + return NaT + elif self.freq is None: raise ValueError("Cannot add integral value to Timestamp " "without freq.") return Timestamp((self.freq * other).apply(self), freq=self.freq) From 1bee56ed9aa96ffe99aa62d5e8c0212d6dc947ee Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 15 Jul 2016 06:20:39 -0400 Subject: [PATCH 19/50] BUG: construction of Series with integers on windows not default to int64 closes #13646 Author: Jeff Reback Closes #13661 from jreback/foo and squashes the following commits: e26f9bf [Jeff Reback] BUG: construction of Series with integers on windows not defaulting to int64 --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/series.py | 2 +- pandas/tests/frame/test_operators.py | 2 +- pandas/tests/series/test_constructors.py | 11 +++++++++++ pandas/types/cast.py | 2 +- 5 files changed, 15 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c9f501c682a18..747fc70f858b4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -534,7 +534,7 @@ Bug Fixes - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - +- Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) diff --git a/pandas/core/series.py b/pandas/core/series.py index b933f68cfad62..3c1f834c3d479 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2820,7 +2820,7 @@ def _try_cast(arr, take_fast_path): subarr = data.copy() return subarr - elif isinstance(data, list) and len(data) > 0: + elif isinstance(data, 
(list, tuple)) and len(data) > 0: if dtype is not None: try: subarr = _try_cast(data, False) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index e2e0f568e4098..c91585a28d867 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1196,7 +1196,7 @@ def test_alignment_non_pandas(self): align = pd.core.ops._align_method_FRAME - for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3])]: + for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.intp)]: tm.assert_series_equal(align(df, val, 'index'), Series([1, 2, 3], index=df.index)) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index b7ec4d570f18b..c8e04f1ffd75f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -109,6 +109,17 @@ def test_constructor_iterator(self): result = Series(range(10), dtype='int64') assert_series_equal(result, expected) + def test_constructor_list_like(self): + + # make sure that we are coercing different + # list-likes to standard dtypes and not + # platform specific + expected = Series([1, 2, 3], dtype='int64') + for obj in [[1, 2, 3], (1, 2, 3), + np.array([1, 2, 3], dtype='int64')]: + result = Series(obj, index=[0, 1, 2]) + assert_series_equal(result, expected) + def test_constructor_generator(self): gen = (i for i in range(10)) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index e55cb91d36430..ca23d8d26a426 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -33,7 +33,7 @@ def _possibly_convert_platform(values): """ try to do platform conversion, allow ndarray or list here """ if isinstance(values, (list, tuple)): - values = lib.list_to_object_array(values) + values = lib.list_to_object_array(list(values)) if getattr(values, 'dtype', None) == np.object_: if hasattr(values, '_values'): values = values._values From d7c028d4965932160fa3b69f56c716b1454c42a5 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 15 Jul 2016 06:25:54 -0400 Subject: [PATCH 20/50] CLN: Removed levels attribute from Categorical Deprecated back in `0.15.0` and therefore long overdue. Closes #8376. Author: gfyoung Closes #13612 from gfyoung/categorical-levels-remove and squashes the following commits: f1254df [gfyoung] MAINT: Relocated backwards compat categorical pickle tests f3321cb [gfyoung] CLN: Removed levels attribute from Categorical --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 30 +---------- .../tests/data/categorical_0_14_1.pickle | 0 .../tests/data/categorical_0_15_2.pickle | Bin pandas/io/tests/test_pickle.py | 38 +++++++++++++ pandas/tests/test_categorical.py | 50 ------------------ setup.py | 4 +- 7 files changed, 43 insertions(+), 80 deletions(-) rename pandas/{ => io}/tests/data/categorical_0_14_1.pickle (100%) rename pandas/{ => io}/tests/data/categorical_0_15_2.pickle (100%) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 747fc70f858b4..0b9695125c0a9 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -506,6 +506,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) +- ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) .. 
_whatsnew_0190.performance: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1d1a9f990e61a..a26cc5125db78 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -228,8 +228,8 @@ class Categorical(PandasObject): __array_priority__ = 1000 _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, name=None, - fastpath=False, levels=None): + def __init__(self, values, categories=None, ordered=False, + name=None, fastpath=False): if fastpath: # fast path @@ -245,17 +245,6 @@ def __init__(self, values, categories=None, ordered=False, name=None, "name=\"something\")'") warn(msg, UserWarning, stacklevel=2) - # TODO: Remove after deprecation period in 2017/ after 0.18 - if levels is not None: - warn("Creating a 'Categorical' with 'levels' is deprecated, use " - "'categories' instead", FutureWarning, stacklevel=2) - if categories is None: - categories = levels - else: - raise ValueError("Cannot pass in both 'categories' and " - "(deprecated) 'levels', use only " - "'categories'", stacklevel=2) - # sanitize input if is_categorical_dtype(values): @@ -580,21 +569,6 @@ def _get_categories(self): categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) - def _set_levels(self, levels): - """ set new levels (deprecated, use "categories") """ - warn("Assigning to 'levels' is deprecated, use 'categories'", - FutureWarning, stacklevel=2) - self.categories = levels - - def _get_levels(self): - """ Gets the levels (deprecated, use "categories") """ - warn("Accessing 'levels' is deprecated, use 'categories'", - FutureWarning, stacklevel=2) - return self.categories - - # TODO: Remove after deprecation period in 2017/ after 0.18 - levels = property(fget=_get_levels, fset=_set_levels) - _ordered = None def _set_ordered(self, value): diff --git a/pandas/tests/data/categorical_0_14_1.pickle b/pandas/io/tests/data/categorical_0_14_1.pickle similarity index 100% rename from pandas/tests/data/categorical_0_14_1.pickle rename to pandas/io/tests/data/categorical_0_14_1.pickle diff --git a/pandas/tests/data/categorical_0_15_2.pickle b/pandas/io/tests/data/categorical_0_15_2.pickle similarity index 100% rename from pandas/tests/data/categorical_0_15_2.pickle rename to pandas/io/tests/data/categorical_0_15_2.pickle diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 55c14fee9e3ed..6019144d59698 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -231,6 +231,44 @@ def python_unpickler(path): result = python_unpickler(path) self.compare_element(result, expected, typ) + def test_pickle_v0_14_1(self): + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + def test_pickle_v0_15_2(self): + # ordered -> _ordered + # GH 9347 + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = 
os.path.join(tm.get_data_path(), + 'categorical_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index dd39861ac3114..1edd9443fe356 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1559,18 +1559,6 @@ def test_deprecated_labels(self): res = cat.labels self.assert_numpy_array_equal(res, exp) - def test_deprecated_levels(self): - # TODO: levels is deprecated and should be removed in 0.18 or 2017, - # whatever is earlier - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - exp = cat.categories - with tm.assert_produces_warning(FutureWarning): - res = cat.levels - self.assert_index_equal(res, exp) - with tm.assert_produces_warning(FutureWarning): - res = pd.Categorical([1, 2, 3, np.nan], levels=[1, 2, 3]) - self.assert_index_equal(res.categories, exp) - def test_removed_names_produces_warning(self): # 10482 @@ -4431,44 +4419,6 @@ def test_dt_accessor_api_for_categorical(self): invalid.dt self.assertFalse(hasattr(invalid, 'str')) - def test_pickle_v0_14_1(self): - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') - # This code was executed once on v0.14.1 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - - def test_pickle_v0_15_2(self): - # ordered -> _ordered - # GH 9347 - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') - # This code was executed once on v0.15.2 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - def test_concat_categorical(self): # See GH 10177 df1 = pd.DataFrame( diff --git a/setup.py b/setup.py index 650357588570a..c77ca4d9e60fe 100755 --- a/setup.py +++ b/setup.py @@ -589,6 +589,7 @@ def pxd(name): 'tests/data/legacy_msgpack/*/*.msgpack', 'tests/data/*.csv*', 'tests/data/*.dta', + 'tests/data/*.pickle', 'tests/data/*.txt', 'tests/data/*.xls', 'tests/data/*.xlsx', @@ -605,8 +606,7 @@ def pxd(name): 'tests/data/html_encoding/*.html', 'tests/json/data/*.json'], 'pandas.tools': ['tests/data/*.csv'], - 'pandas.tests': ['data/*.pickle', - 'data/*.csv'], + 'pandas.tests': ['data/*.csv'], 'pandas.tests.formats': ['data/*.csv'], 'pandas.tests.indexes': ['data/*.pickle'], 'pandas.tseries.tests': ['data/*.pickle', From 91691dedf1beb37427d1740682098a2a22488b48 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Sun, 17 Jul 2016 01:41:02 +0200 Subject: [PATCH 
21/50] Fix minor typo --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tests/indexes/test_multi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 023a13f13e9f2..ddb7fc58e58e6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -551,4 +551,4 @@ Bug Fixes - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) --Bug in ``MultiIndex.from_arrays`` doesn't check for arrays lengths (:issue:`13599`) +-Bug in ``MultiIndex.from_arrays`` didn't check for input array lengths (:issue:`13599`) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 82c69d5a675e6..e3816271d410f 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -637,7 +637,7 @@ def test_from_arrays_different_lengths(self): idx1 = [1, 2, 3] idx2 = ['a', 'b'] assertRaisesRegexp(ValueError, '^all arrays must be same length$', - MultiIndex.from_arrays, [idx1, idx2]) + MultiIndex.from_arrays, [idx1, idx2]) def test_from_product(self): From 043879fbb7de71605eed87991eb037c1917bace1 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Sun, 17 Jul 2016 05:40:27 -0700 Subject: [PATCH 22/50] DOC: Add reference of DataFrame.rename_axis and Series.rename_axis to api.rst (#13678) --- doc/source/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index 0dde341d820e3..e8fe26e8a525d 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -380,6 +380,7 @@ Reindexing / Selection / Label manipulation Series.reindex Series.reindex_like Series.rename + Series.rename_axis Series.reset_index Series.sample Series.select @@ -889,6 +890,7 @@ Reindexing / Selection / Label manipulation DataFrame.reindex_axis DataFrame.reindex_like DataFrame.rename + DataFrame.rename_axis DataFrame.reset_index DataFrame.sample DataFrame.select From 76d7e779e82c12f73c08704ea44c3b802e914ce7 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Sun, 17 Jul 2016 11:30:19 -0700 Subject: [PATCH 23/50] DOC: correct template for .cum* descriptions (#13683) Closes #13682 --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d6e6f571be53a..6c1676fbdd7f4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5504,7 +5504,7 @@ def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, mask_a, mask_b): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) - @Appender("Return cumulative {0} over requested axis.".format(name) + + @Appender("Return {0} over requested axis.".format(desc) + _cnum_doc) def cum_func(self, axis=None, skipna=True, *args, **kwargs): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) From ada6bf350f7fd4daaf2b80188ca165fb9543a252 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Mon, 18 Jul 2016 03:32:25 +0900 Subject: [PATCH 24/50] DOC: fix a keyword coerce in array_to_timedelta64 (#13686) --- pandas/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c681cebd84836..5624b84523705 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3028,7 +3028,7 @@ cdef inline bint is_timedelta(object o): def array_to_timedelta64(ndarray[object] values, unit='ns', 
errors='raise'): """ convert an ndarray to an array of ints that are timedeltas - force conversion if coerce = True, + force conversion if errors = 'coerce', else will raise if cannot convert """ cdef: Py_ssize_t i, n From 6b9cd15f6a655b1ade2c571e32e142bf56dde769 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Tue, 19 Jul 2016 07:08:26 +0900 Subject: [PATCH 25/50] TST: assert message shows unnecessary diff (#13676) --- pandas/util/testing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4442eed898b60..402613d3f1728 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1010,14 +1010,15 @@ def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(right, np.ndarray): right = pprint_thing(right) - if diff is not None: - diff = "\n[diff]: {diff}".format(diff=diff) - msg = """{0} are different {1} [left]: {2} -[right]: {3}{4}""".format(obj, message, left, right, diff) +[right]: {3}""".format(obj, message, left, right) + + if diff is not None: + msg = msg + "\n[diff]: {diff}".format(diff=diff) + raise AssertionError(msg) From 694fe61f931e1c0f034f93f3e0f1084a8974a1f3 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 18 Jul 2016 21:03:58 -0400 Subject: [PATCH 26/50] ENH: Series.append now has ignore_index kw Author: sinhrks Closes #13677 from sinhrks/append_series and squashes the following commits: 4bc7b54 [sinhrks] ENH: Series.append now has ignore_index kw --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/series.py | 20 ++++++++++++++++++-- pandas/tests/series/test_combine_concat.py | 21 +++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0b9695125c0a9..a69617bfbec55 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -249,6 +249,7 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) +- ``Series.append`` now supports ``ignore_index`` option (:issue:`13677`) .. _whatsnew_0190.api: diff --git a/pandas/core/series.py b/pandas/core/series.py index 3c1f834c3d479..c3f5b1b8e641c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1511,13 +1511,18 @@ def searchsorted(self, v, side='left', sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False): """ Concatenate two or more Series. Parameters ---------- to_append : Series or list/tuple of Series + ignore_index : boolean, default False + If True, do not use the index labels. + + .. 
versionadded: 0.19.0 + verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates @@ -1548,6 +1553,17 @@ def append(self, to_append, verify_integrity=False): 5 6 dtype: int64 + With `ignore_index` set to True: + + >>> s1.append(s2, ignore_index=True) + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + dtype: int64 + With `verify_integrity` set to True: >>> s1.append(s2, verify_integrity=True) @@ -1561,7 +1577,7 @@ def append(self, to_append, verify_integrity=False): to_concat = [self] + to_append else: to_concat = [self, to_append] - return concat(to_concat, ignore_index=False, + return concat(to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity) def _binop(self, other, func, level=None, fill_value=None): diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index eb560d4a17055..fd6fd90cd631f 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -39,6 +39,27 @@ def test_append_many(self): result = pieces[0].append(pieces[1:]) assert_series_equal(result, self.ts) + def test_append_duplicates(self): + # GH 13677 + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(s1.append(s2), exp) + tm.assert_series_equal(pd.concat([s1, s2]), exp) + + # the result must have RangeIndex + exp = pd.Series([1, 2, 3, 4, 5, 6]) + tm.assert_series_equal(s1.append(s2, ignore_index=True), + exp, check_index_type=True) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), + exp, check_index_type=True) + + msg = 'Indexes have overlapping values:' + with tm.assertRaisesRegexp(ValueError, msg): + s1.append(s2, verify_integrity=True) + with tm.assertRaisesRegexp(ValueError, msg): + pd.concat([s1, s2], verify_integrity=True) + def test_combine_first(self): values = tm.makeIntIndex(20).values.astype(float) series = Series(values, index=tm.makeIntIndex(20)) From 5a521713f3892539b648bc2735d3cc502feb2b48 Mon Sep 17 00:00:00 2001 From: wcwagner Date: Mon, 18 Jul 2016 21:06:40 -0400 Subject: [PATCH 27/50] BUG: Add type check for width parameter in str.pad method GH13598 closes #13598 Author: wcwagner Closes #13690 from wcwagner/bug/13598 and squashes the following commits: 9669f3f [wcwagner] BUG: "Replaced isinstance with is_integer, and changed test_pad_width to use getattr" 40a3188 [wcwagner] BUG: "Switched to single test method asserting functions that use pad raise correctly." 06795db [wcwagner] BUG: "Added tests for width parameter on center, ljust, rjust, zfill." 
468df3a [wcwagner] BUG: Add type check for width parameter in str.pad method GH13598 --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/strings.py | 7 ++++++- pandas/tests/test_strings.py | 9 +++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a69617bfbec55..99396f6cfbc89 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -583,7 +583,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - +- Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6ec28f9735850..3150fc5d0143a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,7 +8,8 @@ is_object_dtype, is_string_like, is_list_like, - is_scalar) + is_scalar, + is_integer) from pandas.core.common import _values_from_object from pandas.core.algorithms import take_1d @@ -914,6 +915,10 @@ def str_pad(arr, width, side='left', fillchar=' '): if len(fillchar) != 1: raise TypeError('fillchar must be a character, not str') + if not is_integer(width): + msg = 'width must be of integer type, not {0}' + raise TypeError(msg.format(type(width).__name__)) + if side == 'left': f = lambda x: x.rjust(width, fillchar) elif side == 'right': diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4d23bed620265..fcdbec8fbc5c4 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1603,6 +1603,15 @@ def test_pad_fillchar(self): "fillchar must be a character, not int"): result = values.str.pad(5, fillchar=5) + def test_pad_width(self): + # GH 13598 + s = Series(['1', '22', 'a', 'bb']) + + for f in ['center', 'ljust', 'rjust', 'zfill', 'pad']: + with tm.assertRaisesRegexp(TypeError, + "width must be of integer type, not*"): + getattr(s.str, f)('f') + def test_translate(self): def _check(result, expected): From 9f635cd74316d26110809bf1bb2a5525ac4d23fe Mon Sep 17 00:00:00 2001 From: yui-knk Date: Mon, 18 Jul 2016 21:12:07 -0400 Subject: [PATCH 28/50] BUG: Cast a key to NaT before get loc from Index closes #13603 Author: yui-knk Closes #13687 from yui-knk/fix_13603 and squashes the following commits: 0960395 [yui-knk] BUG: Cast a key to NaT before get loc from Index --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/tdi.py | 6 +++++- pandas/tseries/tests/test_timedeltas.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 99396f6cfbc89..fd58eb1b00171 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -584,6 +584,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list 
(:issue:`13354`) - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) +- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index f9fb51ebf710c..78ab333be8ea5 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -697,6 +697,10 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ + + if isnull(key): + key = tslib.NaT + if tolerance is not None: # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below @@ -754,7 +758,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - if is_integer(key) or is_float(key): + if is_integer(key) or is_float(key) or key is tslib.NaT: self._invalid_indexer('slice', key) loc = self._partial_td_slice(key, freq, use_lhs=use_lhs, use_rhs=use_rhs) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 4f985998d5e20..36ae479c3dfcc 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -30,6 +30,25 @@ class TestTimedeltas(tm.TestCase): def setUp(self): pass + def test_get_loc_nat(self): + tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) + + self.assertEqual(tidx.get_loc(pd.NaT), 1) + self.assertEqual(tidx.get_loc(None), 1) + self.assertEqual(tidx.get_loc(float('nan')), 1) + self.assertEqual(tidx.get_loc(np.nan), 1) + + def test_contains(self): + # Checking for any NaT-like objects + # GH 13603 + td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + for v in [pd.NaT, None, float('nan'), np.nan]: + self.assertFalse((v in td)) + + td = to_timedelta([pd.NaT]) + for v in [pd.NaT, None, float('nan'), np.nan]: + self.assertTrue((v in td)) + def test_construction(self): expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') From b05453631270d4b78f79dc272222d5f3fe499ad7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 18 Jul 2016 21:15:04 -0400 Subject: [PATCH 29/50] BUG: merge_asof not handling allow_exact_matches and tolerance on first entry closes #13695 Author: Jeff Reback Closes #13698 from jreback/merge_asof and squashes the following commits: c46dcfa [Jeff Reback] BUG: merge_asof not handling allow_exact_matches and tolerance on first entry --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/src/join.pyx | 18 ++++++++------- pandas/tools/tests/test_merge_asof.py | 33 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index fd58eb1b00171..e728cb7910134 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -46,7 +46,7 @@ The following are now part of this API: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to -support asof style joining of time-series. (:issue:`1870`). Full documentation is +support asof style joining of time-series. (:issue:`1870`, :issue:`13695`). 
Full documentation is :ref:`here ` The :func:`merge_asof` performs an asof merge, which is similar to a left-join diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index a81ac0aa35d4e..ad3b1d4e4a90e 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -193,11 +193,12 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, diff = left_val - right_val # do we allow exact matches - if allow_exact_matches and diff > tol: - right_indexer[indexer] = -1 - continue + if allow_exact_matches: + if diff > tol: + right_indexer[indexer] = -1 + continue elif not allow_exact_matches: - if diff >= tol: + if diff >= tol or lc == rc: right_indexer[indexer] = -1 continue @@ -220,13 +221,14 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, diff = left_val - right_val # do we allow exact matches - if allow_exact_matches and diff > tol: - right_indexer[indexer] = -1 - continue + if allow_exact_matches: + if diff > tol: + right_indexer[indexer] = -1 + continue # we don't allow exact matches elif not allow_exact_matches: - if diff >= tol or not right_pos: + if diff >= tol or lc == rc: right_indexer[indexer] = -1 else: right_indexer[indexer] = right_pos - 1 diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index 5d78ccf199ed3..bcbb0f0fadb49 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -347,6 +347,39 @@ def test_allow_exact_matches_and_tolerance(self): expected = self.allow_exact_matches_and_tolerance assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance2(self): + # GH 13695 + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob']}) + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.000', + '2016-07-15 13:30:00.030']), + 'version': [1, 2]}) + + result = pd.merge_asof(df1, df2, on='time') + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [2]}) + assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [1]}) + assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, + tolerance=pd.Timedelta('10ms')) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [np.nan]}) + assert_frame_equal(result, expected) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 361a2b4f90e8c536934c6bd652830ef4950b43aa Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 18 Jul 2016 21:17:50 -0400 Subject: [PATCH 30/50] CLN: removed pandas.sandbox xref #9615 Author: gfyoung Closes #13670 from gfyoung/sandbox-removal and squashes the following commits: 2a014aa [gfyoung] CLN: removed pandas.sandbox --- doc/source/ecosystem.rst | 6 ++ doc/source/faq.rst | 78 +---------------- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/api/tests/test_api.py | 2 +- pandas/sandbox/__init__.py | 0 pandas/sandbox/qtpandas.py | 145 -------------------------------- setup.py | 1 - 7 files changed, 11 insertions(+), 222 deletions(-) delete mode 100644 pandas/sandbox/__init__.py delete mode 100644 pandas/sandbox/qtpandas.py diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 
8fafe8ec9eaa2..0d010b47f393a 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -93,6 +93,12 @@ targets the IPython Notebook environment. `Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Pandas-Qt `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spun off from the main pandas library, the `Pandas-Qt `__ +library enables DataFrame visualization and manipulation in PyQt4 and PySide applications. + .. _ecosystem.ide: IDE diff --git a/doc/source/faq.rst b/doc/source/faq.rst index e5d659cc31606..d23e0ca59254d 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -110,78 +110,6 @@ details. Visualizing Data in Qt applications ----------------------------------- -.. warning:: - - The ``qt`` support is **deprecated and will be removed in a future version**. - We refer users to the external package `pandas-qt `_. - -There is experimental support for visualizing DataFrames in PyQt4 and PySide -applications. At the moment you can display and edit the values of the cells -in the DataFrame. Qt will take care of displaying just the portion of the -DataFrame that is currently visible and the edits will be immediately saved to -the underlying DataFrame - -To demonstrate this we will create a simple PySide application that will switch -between two editable DataFrames. For this will use the ``DataFrameModel`` class -that handles the access to the DataFrame, and the ``DataFrameWidget``, which is -just a thin layer around the ``QTableView``. - -.. 
code-block:: python - - import numpy as np - import pandas as pd - from pandas.sandbox.qtpandas import DataFrameModel, DataFrameWidget - from PySide import QtGui, QtCore - - # Or if you use PyQt4: - # from PyQt4 import QtGui, QtCore - - class MainWidget(QtGui.QWidget): - def __init__(self, parent=None): - super(MainWidget, self).__init__(parent) - - # Create two DataFrames - self.df1 = pd.DataFrame(np.arange(9).reshape(3, 3), - columns=['foo', 'bar', 'baz']) - self.df2 = pd.DataFrame({ - 'int': [1, 2, 3], - 'float': [1.5, 2.5, 3.5], - 'string': ['a', 'b', 'c'], - 'nan': [np.nan, np.nan, np.nan] - }, index=['AAA', 'BBB', 'CCC'], - columns=['int', 'float', 'string', 'nan']) - - # Create the widget and set the first DataFrame - self.widget = DataFrameWidget(self.df1) - - # Create the buttons for changing DataFrames - self.button_first = QtGui.QPushButton('First') - self.button_first.clicked.connect(self.on_first_click) - self.button_second = QtGui.QPushButton('Second') - self.button_second.clicked.connect(self.on_second_click) - - # Set the layout - vbox = QtGui.QVBoxLayout() - vbox.addWidget(self.widget) - hbox = QtGui.QHBoxLayout() - hbox.addWidget(self.button_first) - hbox.addWidget(self.button_second) - vbox.addLayout(hbox) - self.setLayout(vbox) - - def on_first_click(self): - '''Sets the first DataFrame''' - self.widget.setDataFrame(self.df1) - - def on_second_click(self): - '''Sets the second DataFrame''' - self.widget.setDataFrame(self.df2) - - if __name__ == '__main__': - import sys - - # Initialize the application - app = QtGui.QApplication(sys.argv) - mw = MainWidget() - mw.show() - app.exec_() +There is no support for such visualization in pandas. However, the external +package `pandas-qt `_ does +provide this functionality. diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e728cb7910134..0107bdea542d6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -505,6 +505,7 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 3f6c97441d659..0aefdbeae0518 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -28,7 +28,7 @@ class TestPDApi(Base, tm.TestCase): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'rpy', 'sandbox', 'locale'] + ignored = ['tests', 'rpy', 'locale'] # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', diff --git a/pandas/sandbox/__init__.py b/pandas/sandbox/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/sandbox/qtpandas.py b/pandas/sandbox/qtpandas.py deleted file mode 100644 index b6af40a0e2156..0000000000000 --- a/pandas/sandbox/qtpandas.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Easy integration of DataFrame into pyqt framework - -@author: Jev Kuznetsov -""" - -# flake8: noqa - -# GH9615 - -import warnings -warnings.warn("The pandas.sandbox.qtpandas module is deprecated and will be " - 
"removed in a future version. We refer users to the external package " - "here: https://github.com/datalyze-solutions/pandas-qt") - -try: - from PyQt4.QtCore import QAbstractTableModel, Qt, QVariant, QModelIndex - from PyQt4.QtGui import ( - QApplication, QDialog, QVBoxLayout, QTableView, QWidget) -except ImportError: - from PySide.QtCore import QAbstractTableModel, Qt, QModelIndex - from PySide.QtGui import ( - QApplication, QDialog, QVBoxLayout, QTableView, QWidget) - QVariant = lambda value=None: value - -from pandas import DataFrame, Index - - -class DataFrameModel(QAbstractTableModel): - """ data model for a DataFrame class """ - def __init__(self): - super(DataFrameModel, self).__init__() - self.df = DataFrame() - - def setDataFrame(self, dataFrame): - self.df = dataFrame - - def signalUpdate(self): - """ tell viewers to update their data (this is full update, not - efficient)""" - self.layoutChanged.emit() - - #------------- table display functions ----------------- - def headerData(self, section, orientation, role=Qt.DisplayRole): - if role != Qt.DisplayRole: - return QVariant() - - if orientation == Qt.Horizontal: - try: - return self.df.columns.tolist()[section] - except (IndexError, ): - return QVariant() - elif orientation == Qt.Vertical: - try: - # return self.df.index.tolist() - return self.df.index.tolist()[section] - except (IndexError, ): - return QVariant() - - def data(self, index, role=Qt.DisplayRole): - if role != Qt.DisplayRole: - return QVariant() - - if not index.isValid(): - return QVariant() - - return QVariant(str(self.df.ix[index.row(), index.column()])) - - def flags(self, index): - flags = super(DataFrameModel, self).flags(index) - flags |= Qt.ItemIsEditable - return flags - - def setData(self, index, value, role): - row = self.df.index[index.row()] - col = self.df.columns[index.column()] - if hasattr(value, 'toPyObject'): - # PyQt4 gets a QVariant - value = value.toPyObject() - else: - # PySide gets an unicode - dtype = self.df[col].dtype - if dtype != object: - value = None if value == '' else dtype.type(value) - self.df.set_value(row, col, value) - return True - - def rowCount(self, index=QModelIndex()): - return self.df.shape[0] - - def columnCount(self, index=QModelIndex()): - return self.df.shape[1] - - -class DataFrameWidget(QWidget): - """ a simple widget for using DataFrames in a gui """ - def __init__(self, dataFrame, parent=None): - super(DataFrameWidget, self).__init__(parent) - - self.dataModel = DataFrameModel() - self.dataTable = QTableView() - self.dataTable.setModel(self.dataModel) - - layout = QVBoxLayout() - layout.addWidget(self.dataTable) - self.setLayout(layout) - # Set DataFrame - self.setDataFrame(dataFrame) - - def setDataFrame(self, dataFrame): - self.dataModel.setDataFrame(dataFrame) - self.dataModel.signalUpdate() - self.dataTable.resizeColumnsToContents() - -#-----------------stand alone test code - - -def testDf(): - """ creates test dataframe """ - data = {'int': [1, 2, 3], 'float': [1.5, 2.5, 3.5], - 'string': ['a', 'b', 'c'], 'nan': [np.nan, np.nan, np.nan]} - return DataFrame(data, index=Index(['AAA', 'BBB', 'CCC']), - columns=['int', 'float', 'string', 'nan']) - - -class Form(QDialog): - def __init__(self, parent=None): - super(Form, self).__init__(parent) - - df = testDf() # make up some data - widget = DataFrameWidget(df) - widget.resizeColumnsToContents() - - layout = QVBoxLayout() - layout.addWidget(widget) - self.setLayout(layout) - -if __name__ == '__main__': - import sys - import numpy as np - - app = 
QApplication(sys.argv) - form = Form() - form.show() - app.exec_() diff --git a/setup.py b/setup.py index c77ca4d9e60fe..0bff49c4976b8 100755 --- a/setup.py +++ b/setup.py @@ -560,7 +560,6 @@ def pxd(name): 'pandas.io.sas', 'pandas.formats', 'pandas.rpy', - 'pandas.sandbox', 'pandas.sparse', 'pandas.sparse.tests', 'pandas.stats', From 1e1e9b348bb3f256c2b4997db090a1d35da9938b Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 18 Jul 2016 21:28:01 -0400 Subject: [PATCH 31/50] DEPR: Remove legacy offsets Follow-up of #10951. Remove legacy offsets deprecated in 0.17.0. Author: sinhrks Closes #13590 from sinhrks/depr_legacy_offset and squashes the following commits: 2593b1f [sinhrks] DEPR: Remove legacy offsets --- doc/source/timeseries.rst | 46 +----- doc/source/whatsnew/v0.19.0.txt | 11 ++ pandas/tseries/frequencies.py | 108 ++------------ pandas/tseries/tests/test_base.py | 16 ++- pandas/tseries/tests/test_frequencies.py | 39 +++--- pandas/tseries/tests/test_offsets.py | 25 ++-- pandas/tseries/tests/test_period.py | 170 ++++++----------------- pandas/tseries/tests/test_tslib.py | 14 +- 8 files changed, 118 insertions(+), 311 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 7e832af14c051..f6a1e169afe9d 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -752,7 +752,7 @@ calculate significantly slower and will raise a ``PerformanceWarning`` rng + BQuarterEnd() -.. _timeseries.alias: +.. _timeseries.custombusinessdays: Custom Business Days (Experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -953,6 +953,8 @@ You can use keyword arguments suported by either ``BusinessHour`` and ``CustomBu # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 +.. _timeseries.alias: + Offset Aliases ~~~~~~~~~~~~~~ @@ -1103,48 +1105,6 @@ it is rolled forward to the next anchor point. pd.Timestamp('2014-01-01') + MonthBegin(n=0) pd.Timestamp('2014-01-31') + MonthEnd(n=0) -.. _timeseries.legacyaliases: - -Legacy Aliases -~~~~~~~~~~~~~~ -Note that prior to v0.8.0, time rules had a slightly different look. These are -deprecated in v0.17.0, and removed in future version. - -.. csv-table:: - :header: "Legacy Time Rule", "Offset Alias" - :widths: 15, 65 - - "WEEKDAY", "B" - "EOM", "BM" - "W\@MON", "W\-MON" - "W\@TUE", "W\-TUE" - "W\@WED", "W\-WED" - "W\@THU", "W\-THU" - "W\@FRI", "W\-FRI" - "W\@SAT", "W\-SAT" - "W\@SUN", "W\-SUN" - "Q\@JAN", "BQ\-JAN" - "Q\@FEB", "BQ\-FEB" - "Q\@MAR", "BQ\-MAR" - "A\@JAN", "BA\-JAN" - "A\@FEB", "BA\-FEB" - "A\@MAR", "BA\-MAR" - "A\@APR", "BA\-APR" - "A\@MAY", "BA\-MAY" - "A\@JUN", "BA\-JUN" - "A\@JUL", "BA\-JUL" - "A\@AUG", "BA\-AUG" - "A\@SEP", "BA\-SEP" - "A\@OCT", "BA\-OCT" - "A\@NOV", "BA\-NOV" - "A\@DEC", "BA\-DEC" - - -As you can see, legacy quarterly and annual frequencies are business quarters -and business year ends. Please also note the legacy time rule for milliseconds -``ms`` versus the new offset alias for month start ``MS``. This means that -offset alias parsing is case sensitive. - .. 
_timeseries.holiday: Holidays / Holiday Calendars diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0107bdea542d6..5a1b5041cb521 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -510,6 +510,17 @@ Removal of prior version deprecations/changes - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) +- Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) + + Previous Behavior: + + .. code-block:: ipython + + In [2]: pd.date_range('2016-07-01', freq='W@MON', periods=3) + pandas/tseries/frequencies.py:465: FutureWarning: Freq "W@MON" is deprecated, use "W-MON" as alternative. + Out[2]: DatetimeIndex(['2016-07-04', '2016-07-11', '2016-07-18'], dtype='datetime64[ns]', freq='W-MON') + + Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` .. _whatsnew_0190.performance: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e2132deb97d64..8b3785d78d260 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1,5 +1,5 @@ from datetime import timedelta -from pandas.compat import range, long, zip +from pandas.compat import long, zip from pandas import compat import re import warnings @@ -356,34 +356,6 @@ def get_period_alias(offset_str): """ alias to closest period strings BQ->Q etc""" return _offset_to_period_map.get(offset_str, None) -_rule_aliases = { - # Legacy rules that will continue to map to their original values - # essentially for the rest of time - 'WEEKDAY': 'B', - 'EOM': 'BM', - 'W@MON': 'W-MON', - 'W@TUE': 'W-TUE', - 'W@WED': 'W-WED', - 'W@THU': 'W-THU', - 'W@FRI': 'W-FRI', - 'W@SAT': 'W-SAT', - 'W@SUN': 'W-SUN', - 'Q@JAN': 'BQ-JAN', - 'Q@FEB': 'BQ-FEB', - 'Q@MAR': 'BQ-MAR', - 'A@JAN': 'BA-JAN', - 'A@FEB': 'BA-FEB', - 'A@MAR': 'BA-MAR', - 'A@APR': 'BA-APR', - 'A@MAY': 'BA-MAY', - 'A@JUN': 'BA-JUN', - 'A@JUL': 'BA-JUL', - 'A@AUG': 'BA-AUG', - 'A@SEP': 'BA-SEP', - 'A@OCT': 'BA-OCT', - 'A@NOV': 'BA-NOV', - 'A@DEC': 'BA-DEC', -} _lite_rule_alias = { 'W': 'W-SUN', @@ -401,17 +373,6 @@ def get_period_alias(offset_str): 'ns': 'N' } -# TODO: Can this be killed? -for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): - for _iweek in range(4): - _name = 'WOM-%d%s' % (_iweek + 1, _weekday) - _rule_aliases[_name.replace('-', '@')] = _name - -# Note that _rule_aliases is not 1:1 (d[BA]==d[A@DEC]), and so traversal -# order matters when constructing an inverse. we pick one. 
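To make the removal of the legacy rule table concrete, a short sketch of the user-facing change (the error text follows the ``_INVALID_FREQ_ERROR`` message this patch introduces; frequencies chosen only for illustration):

.. code-block:: python

    import pandas as pd

    # the current offset aliases continue to work
    pd.date_range('2016-07-01', periods=3, freq='W-MON')

    # the pre-0.8.0 legacy spellings no longer warn -- they raise
    try:
        pd.date_range('2016-07-01', periods=3, freq='W@MON')
    except ValueError as err:
        print(err)  # Invalid frequency: W@MON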
#2331 -# Used in get_legacy_offset_name -_legacy_reverse_map = dict((v, k) for k, v in - reversed(sorted(compat.iteritems(_rule_aliases)))) _name_to_offset_map = {'days': Day(1), 'hours': Hour(1), @@ -422,6 +383,9 @@ def get_period_alias(offset_str): 'nanoseconds': Nano(1)} +_INVALID_FREQ_ERROR = "Invalid frequency: {0}" + + def to_offset(freqstr): """ Return DateOffset object from string representation or @@ -460,7 +424,7 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) else: delta = None @@ -479,10 +443,10 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) if delta is None: - raise ValueError('Unable to understand %s as a frequency' % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) return delta @@ -526,9 +490,6 @@ def get_base_alias(freqstr): _dont_uppercase = set(('MS', 'ms')) -_LEGACY_FREQ_WARNING = 'Freq "{0}" is deprecated, use "{1}" as alternative.' - - def get_offset(name): """ Return DateOffset object associated with rule name @@ -539,27 +500,9 @@ def get_offset(name): """ if name not in _dont_uppercase: name = name.upper() - - if name in _rule_aliases: - new = _rule_aliases[name] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new - elif name.lower() in _rule_aliases: - new = _rule_aliases[name.lower()] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new - name = _lite_rule_alias.get(name, name) name = _lite_rule_alias.get(name.lower(), name) - else: - if name in _rule_aliases: - new = _rule_aliases[name] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new name = _lite_rule_alias.get(name, name) if name not in _offset_map: @@ -571,7 +514,7 @@ def get_offset(name): offset = klass._from_name(*split[1:]) except (ValueError, TypeError, KeyError): # bad prefix or suffix - raise ValueError('Bad rule name requested: %s.' 
% name) + raise ValueError(_INVALID_FREQ_ERROR.format(name)) # cache _offset_map[name] = offset # do not return cache because it's mutable @@ -595,17 +538,6 @@ def get_offset_name(offset): return offset.freqstr -def get_legacy_offset_name(offset): - """ - Return the pre pandas 0.8.0 name for the date offset - """ - - # This only used in test_timeseries_legacy.py - - name = offset.name - return _legacy_reverse_map.get(name, name) - - def get_standard_freq(freq): """ Return the standardized frequency string @@ -796,36 +728,18 @@ def _period_alias_dictionary(): def _period_str_to_code(freqstr): - # hack - if freqstr in _rule_aliases: - new = _rule_aliases[freqstr] - warnings.warn(_LEGACY_FREQ_WARNING.format(freqstr, new), - FutureWarning, stacklevel=3) - freqstr = new freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: lower = freqstr.lower() - if lower in _rule_aliases: - new = _rule_aliases[lower] - warnings.warn(_LEGACY_FREQ_WARNING.format(lower, new), - FutureWarning, stacklevel=3) - freqstr = new freqstr = _lite_rule_alias.get(lower, freqstr) + if freqstr not in _dont_uppercase: + freqstr = freqstr.upper() try: - if freqstr not in _dont_uppercase: - freqstr = freqstr.upper() return _period_code_map[freqstr] except KeyError: - try: - alias = _period_alias_dict[freqstr] - warnings.warn(_LEGACY_FREQ_WARNING.format(freqstr, alias), - FutureWarning, stacklevel=3) - except KeyError: - raise ValueError("Unknown freqstr: %s" % freqstr) - - return _period_code_map[alias] + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) def infer_freq(index, warn=True): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 958a10c329a46..6c996285369b8 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -160,9 +160,11 @@ def test_round(self): tm.assert_index_equal(rng.round(freq='H'), expected_rng) self.assertEqual(elt.round(freq='H'), expected_elt) - msg = "Could not evaluate foo" - tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='foo') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): + rng.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') msg = " is a non-fixed frequency" tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') @@ -847,9 +849,11 @@ def test_round(self): tm.assert_index_equal(td.round(freq='H'), expected_rng) self.assertEqual(elt.round(freq='H'), expected_elt) - msg = "Could not evaluate foo" - tm.assertRaisesRegexp(ValueError, msg, td.round, freq='foo') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + td.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') msg = " is a non-fixed frequency" tm.assertRaisesRegexp(ValueError, msg, td.round, freq='M') diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 1f06b7ad4361b..268933fada7a2 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -245,10 +245,10 @@ def _assert_depr(freq, expected, aliases): assert isinstance(aliases, list) assert (frequencies._period_str_to_code(freq) == expected) + msg = frequencies._INVALID_FREQ_ERROR for alias in aliases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert 
(frequencies._period_str_to_code(alias) == expected) + with tm.assertRaisesRegexp(ValueError, msg): + frequencies._period_str_to_code(alias) _assert_depr("M", 3000, ["MTH", "MONTH", "MONTHLY"]) @@ -699,8 +699,9 @@ def test_series(self): s = Series(period_range('2013', periods=10, freq=freq)) self.assertRaises(TypeError, lambda: frequencies.infer_freq(s)) for freq in ['Y']: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + + msg = frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): s = Series(period_range('2013', periods=10, freq=freq)) self.assertRaises(TypeError, lambda: frequencies.infer_freq(s)) @@ -715,17 +716,23 @@ def test_series(self): self.assertEqual(inferred, 'D') def test_legacy_offset_warnings(self): - for k, v in compat.iteritems(frequencies._rule_aliases): - with tm.assert_produces_warning(FutureWarning): - result = frequencies.get_offset(k) - exp = frequencies.get_offset(v) - self.assertEqual(result, exp) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - idx = date_range('2011-01-01', periods=5, freq=k) - exp = date_range('2011-01-01', periods=5, freq=v) - self.assert_index_equal(idx, exp) + freqs = ['WEEKDAY', 'EOM', 'W@MON', 'W@TUE', 'W@WED', 'W@THU', + 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', + 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN', + 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', + 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', 'WOM@4MON', + 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', 'WOM@4TUE', + 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', 'WOM@4WED', + 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', 'WOM@4THU' + 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', 'WOM@4FRI'] + + msg = frequencies._INVALID_FREQ_ERROR + for freq in freqs: + with tm.assertRaisesRegexp(ValueError, msg): + frequencies.get_offset(freq) + + with tm.assertRaisesRegexp(ValueError, msg): + date_range('2011-01-01', periods=5, freq=freq) MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 5965a661699a6..b31e4d54c551f 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -23,7 +23,7 @@ from pandas.core.series import Series from pandas.tseries.frequencies import (_offset_map, get_freq_code, - _get_freq_str) + _get_freq_str, _INVALID_FREQ_ERROR) from pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache from pandas.tseries.tools import parse_time_string, DateParseError import pandas.tseries.offsets as offsets @@ -4531,8 +4531,11 @@ def test_get_offset_name(self): def test_get_offset(): - assertRaisesRegexp(ValueError, "rule.*GIBBERISH", get_offset, 'gibberish') - assertRaisesRegexp(ValueError, "rule.*QS-JAN-B", get_offset, 'QS-JAN-B') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset('gibberish') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset('QS-JAN-B') + pairs = [ ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), @@ -4558,10 +4561,8 @@ def test_get_offset(): def test_get_offset_legacy(): pairs = [('w@Sat', Week(weekday=5))] for name, expected in pairs: - with tm.assert_produces_warning(FutureWarning): - offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset(name) class TestParseTimeString(tm.TestCase): @@ 
-4595,16 +4596,14 @@ def test_get_standard_freq(): assert fstr == get_standard_freq('1w') assert fstr == get_standard_freq(('W', 1)) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = get_standard_freq('WeEk') - assert fstr == result + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_standard_freq('WeEk') fstr = get_standard_freq('5Q') assert fstr == get_standard_freq('5q') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = get_standard_freq('5QuarTer') - assert fstr == result + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_standard_freq('5QuarTer') assert fstr == get_standard_freq(('q', 5)) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 8d217ff0753a6..c90cbbf80086a 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -451,13 +451,16 @@ def test_period_deprecated_freq(self): "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR for exp, freqs in iteritems(cases): for freq in freqs: + with self.assertRaisesRegexp(ValueError, msg): + Period('2016-03-01 09:00', freq=freq) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = pd.Period('2016-03-01 09:00', freq=freq) - self.assertEqual(res, Period('2016-03-01 09:00', freq=exp)) + # check supported freq-aliases still works + p = Period('2016-03-01 09:00', freq=exp) + tm.assertIsInstance(p, Period) def test_repr(self): p = Period('Jan-2000') @@ -659,19 +662,21 @@ def test_properties_weekly(self): def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. - with tm.assert_produces_warning(FutureWarning): - w_date = Period(freq='WK', year=2007, month=1, day=7) - # + w_date = Period(freq='W', year=2007, month=1, day=7) self.assertEqual(w_date.year, 2007) self.assertEqual(w_date.quarter, 1) self.assertEqual(w_date.month, 1) self.assertEqual(w_date.week, 1) self.assertEqual((w_date - 1).week, 52) self.assertEqual(w_date.days_in_month, 31) - with tm.assert_produces_warning(FutureWarning): - exp = Period(freq='WK', year=2012, month=2, day=1) + + exp = Period(freq='W', year=2012, month=2, day=1) self.assertEqual(exp.days_in_month, 29) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=7) + def test_properties_daily(self): # Test properties on Periods with daily frequency. 
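The same removal applies on the ``Period`` side, mirroring the updated tests in this patch (a sketch):

.. code-block:: python

    import pandas as pd

    pd.Period('2007-01-01', freq='W')       # supported alias, unchanged

    try:
        pd.Period('2007-01-01', freq='WK')   # removed legacy alias
    except ValueError as err:
        print(err)  # Invalid frequency: WK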
b_date = Period(freq='B', year=2007, month=1, day=1) @@ -819,10 +824,11 @@ def test_asfreq_MS(self): self.assertEqual(initial.asfreq(freq="M", how="S"), Period('2013-01', 'M')) - with self.assertRaisesRegexp(ValueError, "Unknown freqstr"): + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): initial.asfreq(freq="MS", how="S") - with tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS"): + with tm.assertRaisesRegexp(ValueError, msg): pd.Period('2013-01', 'MS') self.assertTrue(_period_code_map.get("MS") is None) @@ -1122,123 +1128,28 @@ def test_conv_weekly(self): self.assertEqual(ival_W.asfreq('W'), ival_W) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + ival_W.asfreq('WK') + def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency - - with tm.assert_produces_warning(FutureWarning): - ival_W = Period(freq='WK', year=2007, month=1, day=1) - - with tm.assert_produces_warning(FutureWarning): - ival_WSUN = Period(freq='WK', year=2007, month=1, day=7) - with tm.assert_produces_warning(FutureWarning): - ival_WSAT = Period(freq='WK-SAT', year=2007, month=1, day=6) - with tm.assert_produces_warning(FutureWarning): - ival_WFRI = Period(freq='WK-FRI', year=2007, month=1, day=5) - with tm.assert_produces_warning(FutureWarning): - ival_WTHU = Period(freq='WK-THU', year=2007, month=1, day=4) - with tm.assert_produces_warning(FutureWarning): - ival_WWED = Period(freq='WK-WED', year=2007, month=1, day=3) - with tm.assert_produces_warning(FutureWarning): - ival_WTUE = Period(freq='WK-TUE', year=2007, month=1, day=2) - with tm.assert_produces_warning(FutureWarning): - ival_WMON = Period(freq='WK-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_year = Period(freq='WK', year=2007, month=12, day=31) - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_quarter = Period(freq='WK', year=2007, month=3, - day=31) - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_month = Period(freq='WK', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) - else: - ival_W_to_A_end_of_year = Period(freq='A', year=2008) - - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = 
Period(freq='Q', year=2007, quarter=1) - else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) - - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) - else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) - self.assertEqual(ival_W_end_of_year.asfreq('A'), - ival_W_to_A_end_of_year) - self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) - self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) - self.assertEqual(ival_W_end_of_month.asfreq('M'), - ival_W_to_M_end_of_month) - - self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - with tm.assert_produces_warning(FutureWarning): - self.assertEqual(ival_W.asfreq('WK'), ival_W) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=1) + + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-SAT', year=2007, month=1, day=6) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-FRI', year=2007, month=1, day=5) + with 
self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-THU', year=2007, month=1, day=4) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-WED', year=2007, month=1, day=3) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-TUE', year=2007, month=1, day=2) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-MON', year=2007, month=1, day=1) def test_conv_business(self): # frequency conversion tests: from Business Frequency" @@ -2894,11 +2805,14 @@ def test_to_period_monthish(self): prng = rng.to_period() self.assertEqual(prng.freq, 'M') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rng = date_range('01-Jan-2012', periods=8, freq='EOM') + rng = date_range('01-Jan-2012', periods=8, freq='M') prng = rng.to_period() self.assertEqual(prng.freq, 'M') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + date_range('01-Jan-2012', periods=8, freq='EOM') + def test_multiples(self): result1 = Period('1989', freq='2A') result2 = Period('1989', freq='A') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 6696c03a070f7..f30f01e66cb0b 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1422,16 +1422,14 @@ def _check_round(freq, expected): result = stamp.round(freq=freq) self.assertEqual(result, expected) - for freq, expected in [ - ('D', Timestamp('2000-01-05 00:00:00')), - ('H', Timestamp('2000-01-05 05:00:00')), - ('S', Timestamp('2000-01-05 05:09:15')) - ]: + for freq, expected in [('D', Timestamp('2000-01-05 00:00:00')), + ('H', Timestamp('2000-01-05 05:00:00')), + ('S', Timestamp('2000-01-05 05:09:15'))]: _check_round(freq, expected) - msg = "Could not evaluate" - tm.assertRaisesRegexp(ValueError, msg, - stamp.round, 'foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + stamp.round('foo') class TestTimestampOps(tm.TestCase): From 006bd0b1c2f3ff183c1834a27305a1a3039011d8 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 18 Jul 2016 21:42:39 -0400 Subject: [PATCH 32/50] CLN: removed setter method of categorical's ordered attribute xref #9611 Author: gfyoung Closes #13671 from gfyoung/cat-set-order-removal and squashes the following commits: 58938e7 [gfyoung] CLN: removed setter method of categorical's ordered attribute --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 8 +------- pandas/tests/test_categorical.py | 11 +++++------ 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5a1b5041cb521..053028d896466 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -508,6 +508,7 @@ Removal of prior version deprecations/changes - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) +- ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 
0.8.0) (:issue:`13590`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a26cc5125db78..39e140e962821 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -571,12 +571,6 @@ def _get_categories(self): _ordered = None - def _set_ordered(self, value): - """ Sets the ordered attribute to the boolean value """ - warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", - FutureWarning, stacklevel=2) - self.set_ordered(value, inplace=True) - def set_ordered(self, value, inplace=False): """ Sets the ordered attribute to the boolean value @@ -624,7 +618,7 @@ def _get_ordered(self): """ Gets the ordered attribute """ return self._ordered - ordered = property(fget=_get_ordered, fset=_set_ordered) + ordered = property(fget=_get_ordered) def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 1edd9443fe356..35b1b8c1bf341 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -808,13 +808,12 @@ def test_set_ordered(self): cat2.set_ordered(False, inplace=True) self.assertFalse(cat2.ordered) - # deperecated in v0.16.0 - with tm.assert_produces_warning(FutureWarning): - cat.ordered = False - self.assertFalse(cat.ordered) - with tm.assert_produces_warning(FutureWarning): + # removed in 0.19.0 + msg = "can\'t set attribute" + with tm.assertRaisesRegexp(AttributeError, msg): cat.ordered = True - self.assertTrue(cat.ordered) + with tm.assertRaisesRegexp(AttributeError, msg): + cat.ordered = False def test_set_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) From b225cacb1d2a34e3c4041533a0590133098756fa Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Mon, 18 Jul 2016 21:46:17 -0400 Subject: [PATCH 33/50] BUG/PERF: Sort mixed-int in Py3, fix Index.difference fixes some issues from #13432 closes #12044 closes #12814 Author: Piotr Jucha Closes #13514 from pijucha/setop13432 and squashes the following commits: 3a96089 [Piotr Jucha] BUG/PERF: Sort mixed-int in Py3, fix Index.difference --- asv_bench/benchmarks/index_object.py | 55 ++++++++ doc/source/whatsnew/v0.19.0.txt | 34 ++++- pandas/core/algorithms.py | 125 ++++++++++++++---- pandas/indexes/base.py | 79 +++++++++-- pandas/tests/indexes/common.py | 39 ++++++ pandas/tests/indexes/test_base.py | 188 ++++++++++++++++++++++++--- pandas/tests/indexes/test_multi.py | 9 ++ pandas/tests/test_algos.py | 74 +++++++++++ pandas/tests/test_groupby.py | 12 ++ pandas/tools/merge.py | 14 +- pandas/tools/tests/test_join.py | 17 +++ 11 files changed, 583 insertions(+), 63 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 8c65f09937df4..a0a1b560d36f3 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -63,6 +63,27 @@ def time_index_datetime_union(self): self.rng.union(self.rng2) +class index_datetime_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.A = self.N - 20000 + self.B = self.N + 20000 + self.idx1 = DatetimeIndex(range(self.N)) + self.idx2 = DatetimeIndex(range(self.A, self.B)) + self.idx3 = DatetimeIndex(range(self.N, self.B)) + + def time_index_datetime_difference(self): + self.idx1.difference(self.idx2) + + def time_index_datetime_difference_disjoint(self): + self.idx1.difference(self.idx3) + + def time_index_datetime_symmetric_difference(self): + self.idx1.symmetric_difference(self.idx2) + + class 
index_float64_boolean_indexer(object): goal_time = 0.2 @@ -183,6 +204,40 @@ def time_index_int64_union(self): self.left.union(self.right) +class index_int64_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 500000 + self.options = np.arange(self.N) + self.left = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + self.right = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + + def time_index_int64_difference(self): + self.left.difference(self.right) + + def time_index_int64_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + +class index_str_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.strs = tm.rands_array(10, self.N) + self.left = Index(self.strs[:self.N * 2 // 3]) + self.right = Index(self.strs[self.N // 3:]) + + def time_str_difference(self): + self.left.difference(self.right) + + def time_str_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + class index_str_boolean_indexer(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 053028d896466..8d3fe84ab835e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -396,7 +396,7 @@ resulting dtype will be upcast, which is unchanged from previous. pd.merge(df1, df2, how='outer', on='key') pd.merge(df1, df2, how='outer', on='key').dtypes -.. _whatsnew_0190.describe: +.. _whatsnew_0190.api.describe: ``.describe()`` changes ^^^^^^^^^^^^^^^^^^^^^^^ @@ -485,6 +485,34 @@ New Behavior: pd.NaT + 1 pd.NaT - 1 +.. _whatsnew_0190.api.difference: + +``Index.difference`` and ``.symmetric_difference`` changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. (:issue:`13514`) + +.. ipython:: python + + idx1 = pd.Index([1, 2, 3, np.nan]) + idx2 = pd.Index([0, 1, np.nan]) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: idx1.difference(idx2) + Out[3]: Float64Index([nan, 2.0, 3.0], dtype='float64') + + In [4]: idx1.symmetric_difference(idx2) + Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64') + +New Behavior: + +.. ipython:: python + + idx1.difference(idx2) + idx1.symmetric_difference(idx2) .. _whatsnew_0190.deprecations: @@ -534,7 +562,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - +- Improved performance of ``Index.difference`` (:issue:`12044`) .. 
_whatsnew_0190.bug_fixes: @@ -629,3 +657,5 @@ Bug Fixes - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) +- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) +- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c3ba734353a8d..5cc54e61f6b2a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -163,6 +163,104 @@ def isin(comps, values): return f(comps, values) +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): + """ + Sort ``values`` and reorder corresponding ``labels``. + ``values`` should be unique if ``labels`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``labels`` is not None. + labels : list_like + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``labels`` to mark "not found". + Ignored when ``labels`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``labels`` is None. + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_labels : ndarray + Reordered ``labels``; returned when ``labels`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``labels`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``labels`` is not None and ``values`` contain duplicates. 
+ """ + if not is_list_like(values): + raise TypeError("Only list-like objects are allowed to be passed to" + "safe_sort as values") + values = np.array(values, copy=False) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, string_types) for x in values], + dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return _ensure_object(np.concatenate([nums, strs])) + + sorter = None + if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # labels: + + if labels is None: + return ordered + + if not is_list_like(labels): + raise TypeError("Only list-like objects or None are allowed to be" + "passed to safe_sort as labels") + labels = _ensure_platform_int(np.asarray(labels)) + + from pandas import Index + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if labels is not None") + + if sorter is None: + # mixed types + (hash_klass, _), values = _get_data_algo(values, _hashtables) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = _ensure_platform_int(t.lookup(ordered)) + + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = (labels < -len(values)) | (labels >= len(values)) | \ + (labels == na_sentinel) + + # (Out of bound indices will be masked with `na_sentinel` next, so we may + # deal with them here without performance loss using `mode='wrap'`.) + new_labels = reverse_indexer.take(labels, mode='wrap') + np.putmask(new_labels, mask, na_sentinel) + + return ordered, new_labels + + def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -210,33 +308,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.to_array() if sort and len(uniques) > 0: - try: - sorter = uniques.argsort() - except: - # unorderable in py3 if mixed str/int - t = hash_klass(len(uniques)) - t.map_locations(_ensure_object(uniques)) - - # order ints before strings - ordered = np.concatenate([ - np.sort(np.array([e for i, e in enumerate(uniques) if f(e)], - dtype=object)) for f in - [lambda x: not isinstance(x, string_types), - lambda x: isinstance(x, string_types)]]) - sorter = _ensure_platform_int(t.lookup( - _ensure_object(ordered))) - - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = labels < 0 - labels = reverse_indexer.take(labels) - np.putmask(labels, mask, -1) - - uniques = uniques.take(sorter) + uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, + assume_unique=True) if is_datetimetz_type: - # reset tz uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( values.tz) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b013d6ccb0b8e..71d5fdd17ee5c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1773,7 +1773,7 @@ def _get_consensus_name(self, other): else: name = None if self.name != name: - return other._shallow_copy(name=name) + return self._shallow_copy(name=name) return self def union(self, other): @@ -1920,7 +1920,8 @@ def difference(self, other): Return a new Index with elements from the index that are not 
in `other`. - This is the sorted set difference of two Index objects. + This is the set difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1946,14 +1947,27 @@ def difference(self, other): other, result_name = self._convert_can_do_setop(other) - theDiff = sorted(set(self) - set(other)) - return Index(theDiff, name=result_name) + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + + return this._shallow_copy(the_diff, name=result_name) diff = deprecate('diff', difference) def symmetric_difference(self, other, result_name=None): """ - Compute the sorted symmetric difference of two Index objects. + Compute the symmetric difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1970,9 +1984,6 @@ def symmetric_difference(self, other, result_name=None): ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. - The sorting of a result containing ``NaN`` values is not guaranteed - across Python versions. See GitHub issue #6444. - Examples -------- >>> idx1 = Index([1, 2, 3, 4]) @@ -1990,8 +2001,26 @@ def symmetric_difference(self, other, result_name=None): if result_name is None: result_name = result_name_update - the_diff = sorted(set((self.difference(other)). - union(other.difference(self)))) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) + + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + attribs = self._get_attributes_dict() attribs['name'] = result_name if 'freq' in attribs: @@ -2000,6 +2029,36 @@ def symmetric_difference(self, other, result_name=None): sym_diff = deprecate('sym_diff', symmetric_difference) + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. + + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. 
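For reference, the indexer-based set difference used above can be sketched with public pandas/NumPy calls. This is a simplified illustration of the approach, not the implementation itself: it assumes both sides are already unique and leaves out the safe_sort step that the real method attempts afterwards (falling back to an unsorted result when the values are not comparable).

    import numpy as np
    import pandas as pd

    this = pd.Index([3, 1, 2, 5])
    other = pd.Index([2, 5, 7])

    # positions of `other`'s values inside `this`; -1 marks values not present
    indexer = this.get_indexer(other)
    indexer = indexer.take((indexer != -1).nonzero()[0])

    # positions in `this` that were never matched, i.e. the set difference
    label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
    the_diff = this.values.take(label_diff)

    print(np.sort(the_diff))   # [1 3]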
+ + Returns + ------- + uniques : index + """ + if self.is_unique and not dropna: + return self + + values = self.values + + if not self.is_unique: + values = self.unique() + + if dropna: + try: + if self.hasnans: + values = values[~isnull(values)] + except NotImplementedError: + pass + + return self._shallow_copy(values) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index d6f7493bb25f9..92560363be8fe 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -287,6 +287,45 @@ def test_duplicates(self): self.assertEqual(result.name, 'foo') self.assert_index_equal(result, Index([ind[0]], name='foo')) + def test_get_unique_index(self): + for ind in self.indices.values(): + + # MultiIndex tested separately + if not len(ind) or isinstance(ind, MultiIndex): + continue + + idx = ind[[0] * 5] + idx_unique = ind[[0]] + # We test against `idx_unique`, so first we make sure it's unique + # and doesn't contain nans. + self.assertTrue(idx_unique.is_unique) + try: + self.assertFalse(idx_unique.hasnans) + except NotImplementedError: + pass + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assert_index_equal(result, idx_unique) + + # nans: + + if not ind._can_hold_na: + continue + + vals = ind.values[[0] * 5] + vals[0] = np.nan + vals_unique = vals[:2] + idx_nan = ind._shallow_copy(vals) + idx_unique_nan = ind._shallow_copy(vals_unique) + self.assertTrue(idx_unique_nan.is_unique) + + for dropna, expected in zip([False, True], + [idx_unique_nan, idx_unique]): + for i in [idx_nan, idx_unique_nan]: + result = i._get_unique_index(dropna=dropna) + self.assert_index_equal(result, expected) + def test_sort(self): for ind in self.indices.values(): self.assertRaises(TypeError, ind.sort) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 06662e52e3a6f..cc5dd24292bb8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -640,47 +640,56 @@ def test_union(self): first = Index(list('ab'), name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([], name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index([]) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index([], name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + 
tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([]) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) def test_add(self): @@ -803,17 +812,19 @@ def test_symmetric_difference(self): self.assertTrue(tm.equalContents(result, expected)) # nans: - # GH #6444, sorting of nans. Make sure the number of nans is right - # and the correct non-nan values are there. punt on sorting. - idx1 = Index([1, 2, 3, np.nan]) + # GH 13514 change: {nan} - {nan} == {} + # (GH 6444, sorting of nans, is no longer an issue) + idx1 = Index([1, np.nan, 2, 3]) idx2 = Index([0, 1, np.nan]) + idx3 = Index([0, 1]) + result = idx1.symmetric_difference(idx2) - # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + expected = Index([0.0, 2.0, 3.0]) + tm.assert_index_equal(result, expected) - nans = pd.isnull(result) - self.assertEqual(nans.sum(), 1) - self.assertEqual((~nans).sum(), 3) - [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] + result = idx1.symmetric_difference(idx3) + expected = Index([0.0, 2.0, 3.0, np.nan]) + tm.assert_index_equal(result, expected) # other not an Index: idx1 = Index([1, 2, 3, 4], name='idx1') @@ -1665,6 +1676,149 @@ def test_string_index_repr(self): self.assertEqual(coerce(idx), expected) +class TestMixedIntIndex(Base, tm.TestCase): + # Mostly the tests from common.py for which the results differ + # in py2 and py3 because ints and strings are uncomparable in py3 + # (GH 13514) + + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) + self.setup_indices() + + def create_index(self): + return self.mixedIndex + + def test_order(self): + idx = self.create_index() + # 9816 deprecated + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + with tm.assert_produces_warning(FutureWarning): + idx.order() + else: + with tm.assert_produces_warning(FutureWarning): + idx.order() + + def test_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = idx.argsort() + else: + result = idx.argsort() + expected = np.array(idx).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_numpy_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = np.argsort(idx) + else: + result = np.argsort(idx) + expected = idx.argsort() + tm.assert_numpy_array_equal(result, expected) + + def test_copy_name(self): + # Check that "name" argument passed at initialization is honoured + # GH12309 + idx = self.create_index() + + first = idx.__class__(idx, copy=True, name='mario') + second = first.__class__(first, copy=False) + + # Even though "copy=False", we want a new object. 
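The mixed int/str set-operation tests that follow can be condensed into a short, hypothetical session. The expected values come from the tests themselves; Python 3 plus this patch are assumed (on Python 2 ints and strings are comparable, so the issue does not arise).

    import pandas as pd

    idx = pd.Index([0, 'a', 1, 'b', 2, 'c'])
    first, second = idx[:4], idx[3:]

    # Ints and strings are unorderable in Python 3, so instead of raising,
    # safe_sort orders the ints before the strings.
    print(first.difference(second))            # Index([0, 1, 'a'], dtype='object')
    print(first.symmetric_difference(second))  # Index([0, 1, 2, 'a', 'c'], dtype='object')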
+ self.assertIsNot(first, second) + # Not using tm.assert_index_equal() since names differ: + self.assertTrue(idx.equals(first)) + + self.assertEqual(first.name, 'mario') + self.assertEqual(second.name, 'mario') + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + s3 = s1 * s2 + else: + s3 = s1 * s2 + self.assertEqual(s3.index.name, 'mario') + + def test_union_base(self): + idx = self.create_index() + first = idx[3:] + second = idx[:5] + + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + else: + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + + def test_intersection_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:5] + second = idx[:3] + result = first.intersection(second) + expected = Index([0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + def test_difference_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.difference(second) + expected = Index([0, 1, 'a']) + self.assert_index_equal(result, expected) + + def test_symmetric_difference(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.symmetric_difference(second) + expected = Index([0, 1, 2, 'a', 'c']) + self.assert_index_equal(result, expected) + + def test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def test_get_combined_index(): from pandas.core.index import _get_combined_index result = _get_combined_index([]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e6a8aafc32be4..2734e90a1971b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1877,6 +1877,15 @@ def test_duplicate_meta_data(self): self.assertTrue(idx.has_duplicates) self.assertEqual(idx.drop_duplicates().names, idx.names) + def test_get_unique_index(self): + idx = self.index[[0, 1, 0, 1, 1, 0, 0]] + expected = self.index._shallow_copy(idx[[0, 1]]) + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assertTrue(result.unique) + self.assert_index_equal(result, expected) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cb90110c953c1..f18d869b3843d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -56,6 +56,80 @@ def test_strings(self): 
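The helper itself is exercised directly by the TestSafeSort cases added below. A condensed, hypothetical session, with the expected arrays taken from those tests and the import path assumed to be pandas/core/algorithms.py:

    import numpy as np
    import pandas.core.algorithms as algos   # assumed home of safe_sort

    values = [3, 1, 2, 0, 4]
    labels = [0, 1, 1, 2, 3, 0, -1, 4]

    ordered, new_labels = algos.safe_sort(values, labels)
    # ordered    -> array([0, 1, 2, 3, 4])
    # new_labels -> array([3, 1, 1, 2, 0, 3, -1, 4]); the na_sentinel (-1) survives

    # Mixed int/str input no longer raises under Python 3; ints sort first.
    mixed = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object)
    print(algos.safe_sort(mixed))             # [0 0 1 'a' 'b' 'b']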
tm.assert_series_equal(result, expected) +class TestSafeSort(tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic_sort(self): + values = [3, 1, 2, 0, 4] + result = algos.safe_sort(values) + expected = np.array([0, 1, 2, 3, 4]) + tm.assert_numpy_array_equal(result, expected) + + values = list("baaacb") + result = algos.safe_sort(values) + expected = np.array(list("aaabbc")) + tm.assert_numpy_array_equal(result, expected) + + values = [] + result = algos.safe_sort(values) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected) + + def test_labels(self): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + labels = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # na_sentinel + labels = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels, + na_sentinel=99) + expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # out of bound indices + labels = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + labels = [] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([], dtype=np.int_) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_mixed_integer(self): + values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + result = algos.safe_sort(values) + expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(['b', 1, 0, 'a'], dtype=object) + labels = [0, 1, 2, 3, 0, -1, 1] + result, result_labels = algos.safe_sort(values, labels) + expected = np.array([0, 1, 'a', 'b'], dtype=object) + + def test_exceptions(self): + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects are allowed"): + algos.safe_sort(values=1) + + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects or None"): + algos.safe_sort(values=[0, 1, 2], labels=1) + + with tm.assertRaisesRegexp(ValueError, "values should be unique"): + algos.safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + + class TestFactorize(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 57d43f22757ea..258f36cb1b68f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3210,6 +3210,18 @@ def test_groupby_nonstring_columns(self): expected = df.groupby(df[0]).mean() assert_frame_equal(result, expected) + def test_groupby_mixed_type_columns(self): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) + expected = DataFrame([[1, 2]], columns=['B', 0], + index=Index([0], name='A')) + + result = df.groupby('A').first() + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').sum() + tm.assert_frame_equal(result, expected) + def test_cython_grouper_series_bug_noncontig(self): arr = np.empty((100, 100)) arr.fill(np.nan) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 5b66e55eb60b6..e7d165354ec6c 100644 --- 
a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1209,16 +1209,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - sorter = uniques.argsort() + l = len(left) + labels = np.concatenate([left, right]) - reverse_indexer = np.empty(len(sorter), dtype=np.int64) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - new_left = reverse_indexer.take(_ensure_platform_int(left)) - np.putmask(new_left, left == -1, -1) - - new_right = reverse_indexer.take(_ensure_platform_int(right)) - np.putmask(new_right, right == -1, -1) + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + new_labels = _ensure_int64(new_labels) + new_left, new_right = new_labels[:l], new_labels[l:] return new_left, new_right diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index 86aee0b4a01c9..cb84c1f06653b 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -536,6 +536,23 @@ def test_join_sort(self): joined = left.join(right, on='key', sort=False) self.assert_index_equal(joined.index, pd.Index(lrange(4))) + def test_join_mixed_non_unique_index(self): + # GH 12814, unorderable types in py3 with a non-unique index + df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) + df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + result = df1.join(df2) + expected = DataFrame({'a': [1, 2, 3, 3, 4], + 'b': [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, 'a']) + tm.assert_frame_equal(result, expected) + + df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) + df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + result = df3.join(df4) + expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, + index=[1, 2, 2, 'a']) + tm.assert_frame_equal(result, expected) + def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), From fafef5d91126d6a145f86f2ab4c4725039f3d739 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Mon, 18 Jul 2016 21:57:23 -0400 Subject: [PATCH 34/50] ENH: Add support for writing variable labels to Stata files closes #13536 closes #13535 Author: Kevin Sheppard Closes #13631 from bashtage/stata-data-labels and squashes the following commits: 1e1e1bf [Kevin Sheppard] ENH: Add support for writing variable labels --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/core/frame.py | 18 ++++++++-- pandas/io/stata.py | 51 +++++++++++++++++++------- pandas/io/tests/test_stata.py | 64 +++++++++++++++++++++++++++++---- 4 files changed, 114 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8d3fe84ab835e..df9f60fd499fa 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -249,7 +249,8 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) -- ``Series.append`` now supports ``ignore_index`` option (:issue:`13677`) +- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) +- ``.to_stata()`` and ```StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) .. 
_whatsnew_0190.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 334526b424be5..4fe7b318b3a18 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1467,7 +1467,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None): + data_label=None, variable_labels=None): """ A class for writing Stata binary dta files from array-like objects @@ -1480,11 +1480,24 @@ def to_stata(self, fname, convert_dates=None, write_index=True, format that you want to use for the dates. Options are 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a number or a name. + write_index : bool + Write the index to Stata dataset. encoding : str Default is latin-1. Note that Stata does not support unicode. byteorder : str Can be ">", "<", "little", or "big". The default is None which uses `sys.byteorder` + time_stamp : datetime + A date time to use when writing the file. Can be None, in which + case the current time is used. + dataset_label : str + A label for the data set. Should be 80 characters or smaller. + + .. versionadded:: 0.19.0 + + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. Examples -------- @@ -1500,7 +1513,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True, writer = StataWriter(fname, self, convert_dates=convert_dates, encoding=encoding, byteorder=byteorder, time_stamp=time_stamp, data_label=data_label, - write_index=write_index) + write_index=write_index, + variable_labels=variable_labels) writer.write_file() @Appender(fmt.docstring_to_string, indents=1) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index bd19102c7f18c..d35466e8896ba 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1059,7 +1059,7 @@ def _read_new_header(self, first_char): self.lbllist = self._get_lbllist() self.path_or_buf.seek(self._seek_variable_labels) - self.vlblist = self._get_vlblist() + self._variable_labels = self._get_variable_labels() # Get data type information, works for versions 117-118. 
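Taken together with the writer changes below, the new keyword lets variable labels round-trip through a dta file. A condensed, hypothetical session (the file name is illustrative; the frame and labels mirror the tests added further down):

    import pandas as pd
    from pandas.io.stata import StataReader

    df = pd.DataFrame({'a': [1, 2, 3, 4],
                       'b': [1.0, 3.0, 27.0, 81.0],
                       'c': ['Atlanta', 'Birmingham', 'Cincinnati', 'Detroit']})
    df.index.name = 'index'

    labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'}
    df.to_stata('cities.dta', variable_labels=labels)   # hypothetical path

    with StataReader('cities.dta') as reader:
        print(reader.variable_labels())
    # {'index': '', 'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'}

Labels longer than 80 characters, or labels that cannot be encoded in latin-1, cause the writer to raise ValueError.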
def _get_dtypes(self, seek_vartypes): @@ -1127,7 +1127,7 @@ def _get_lbllist(self): return [self._null_terminate(self.path_or_buf.read(b)) for i in range(self.nvar)] - def _get_vlblist(self): + def _get_variable_labels(self): if self.format_version == 118: vlblist = [self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)] @@ -1242,7 +1242,7 @@ def _read_old_header(self, first_char): self.lbllist = self._get_lbllist() - self.vlblist = self._get_vlblist() + self._variable_labels = self._get_variable_labels() # ignore expansion fields (Format 105 and later) # When reading, read five bytes; the last four bytes now tell you @@ -1306,11 +1306,11 @@ def _read_value_labels(self): while True: if self.format_version >= 117: if self.path_or_buf.read(5) == b' - break # end of variable label table + break # end of value label table slength = self.path_or_buf.read(4) if not slength: - break # end of variable label table (format < 117) + break # end of value label table (format < 117) if self.format_version <= 117: labname = self._null_terminate(self.path_or_buf.read(33)) else: @@ -1666,7 +1666,7 @@ def variable_labels(self): """Returns variable labels as a dict, associating each variable name with corresponding label """ - return dict(zip(self.varlist, self.vlblist)) + return dict(zip(self.varlist, self._variable_labels)) def value_labels(self): """Returns a dict, associating each variable name a dict, associating @@ -1696,7 +1696,7 @@ def _set_endianness(endianness): def _pad_bytes(name, length): """ - Takes a char string and pads it wih null bytes until it's length chars + Takes a char string and pads it with null bytes until it's length chars """ return name + "\x00" * (length - len(name)) @@ -1831,6 +1831,12 @@ class StataWriter(StataParser): dataset_label : str A label for the data set. Should be 80 characters or smaller. + .. versionadded:: 0.19.0 + + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. 
+ Returns ------- writer : StataWriter instance @@ -1853,12 +1859,13 @@ class StataWriter(StataParser): def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None): + data_label=None, variable_labels=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index self._time_stamp = time_stamp self._data_label = data_label + self._variable_labels = variable_labels # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -2135,11 +2142,29 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, else: # Default is empty label self._write(_pad_bytes("", 33)) - def _write_variable_labels(self, labels=None): - nvar = self.nvar - if labels is None: - for i in range(nvar): - self._write(_pad_bytes("", 81)) + def _write_variable_labels(self): + # Missing labels are 80 blank characters plus null termination + blank = _pad_bytes('', 81) + + if self._variable_labels is None: + for i in range(self.nvar): + self._write(blank) + return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError('Variable labels must be 80 characters ' + 'or fewer') + is_latin1 = all(ord(c) < 256 for c in label) + if not is_latin1: + raise ValueError('Variable labels must contain only ' + 'characters that can be encoded in ' + 'Latin-1') + self._write(_pad_bytes(label, 81)) + else: + self._write(blank) def _prepare_data(self): data = self.data diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 5f45d1b547e62..91850e6ffe9b9 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,27 +1,27 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101 -from datetime import datetime import datetime as dt import os -import warnings -import nose import struct import sys +import warnings +from datetime import datetime from distutils.version import LooseVersion +import nose import numpy as np import pandas as pd +import pandas.util.testing as tm +from pandas import compat from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series from pandas.types.common import is_categorical_dtype +from pandas.tslib import NaT from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) -import pandas.util.testing as tm -from pandas.tslib import NaT -from pandas import compat class TestStata(tm.TestCase): @@ -1113,6 +1113,58 @@ def test_read_chunks_columns(self): tm.assert_frame_equal(from_frame, chunk, check_dtype=False) pos += chunksize + def test_write_variable_labels(self): + # GH 13631, add support for writing variable labels + original = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [1.0, 3.0, 27.0, 81.0], + 'c': ['Atlanta', 'Birmingham', + 'Cincinnati', 'Detroit']}) + original.index.name = 'index' + variable_labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'} + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels) + with StataReader(path) as sr: + read_labels = sr.variable_labels() + expected_labels = {'index': '', + 'a': 'City Rank', + 'b': 'City Exponent', + 'c': 'City'} + tm.assert_equal(read_labels, expected_labels) + + variable_labels['index'] = 'The Index' + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels) + with StataReader(path) as sr: + 
read_labels = sr.variable_labels() + tm.assert_equal(read_labels, variable_labels) + + def test_write_variable_label_errors(self): + original = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [1.0, 3.0, 27.0, 81.0], + 'c': ['Atlanta', 'Birmingham', + 'Cincinnati', 'Detroit']}) + values = [u'\u03A1', u'\u0391', + u'\u039D', u'\u0394', + u'\u0391', u'\u03A3'] + + variable_labels_utf8 = {'a': 'City Rank', + 'b': 'City Exponent', + 'c': u''.join(values)} + + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels_utf8) + + variable_labels_long = {'a': 'City Rank', + 'b': 'City Exponent', + 'c': 'A very, very, very long variable label ' + 'that is too long for Stata which means ' + 'that it has more than 80 characters'} + + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels_long) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 506520bd35331aa82db50686c07d96594cac0c10 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 18 Jul 2016 22:06:18 -0400 Subject: [PATCH 35/50] API: Index doesn't results in PeriodIndex if Period contains NaT Author: sinhrks Closes #13664 from sinhrks/period_infer2 and squashes the following commits: b208a9e [sinhrks] API: Index doesn't results in PeriodIndex if Period contains NaT --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/core/ops.py | 11 ++- pandas/indexes/base.py | 31 +++++---- pandas/src/inference.pyx | 34 +++++++-- pandas/tests/indexes/test_datetimelike.py | 84 +++++++++++++++++------ pandas/tests/types/test_inference.py | 27 ++++++++ pandas/tseries/base.py | 7 +- pandas/tseries/tests/test_base.py | 6 +- 8 files changed, 153 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index df9f60fd499fa..f65f7d57d5d08 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -269,6 +269,8 @@ API changes - ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) +- Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) +- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) .. 
_whatsnew_0190.api.tolist: @@ -645,7 +647,6 @@ Bug Fixes - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) - - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index d76f011df3dd8..44e3be32c23df 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -31,7 +31,7 @@ is_list_like, _ensure_object) from pandas.types.cast import _maybe_upcast_putmask -from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -773,6 +773,15 @@ def wrapper(self, other, axis=None): if (not lib.isscalar(lib.item_from_zerodim(other)) and len(self) != len(other)): raise ValueError('Lengths must match to compare') + + if isinstance(other, ABCPeriodIndex): + # temp workaround until fixing GH 13637 + # tested in test_nat_comparisons + # (pandas.tests.series.test_operators.TestSeriesOperators) + return self._constructor(na_op(self.values, + other.asobject.values), + index=self.index) + return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) elif isinstance(other, pd.Categorical): diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 71d5fdd17ee5c..567d2a458dafa 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -224,7 +224,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass # maybe coerce to a sub-class - from pandas.tseries.period import PeriodIndex + from pandas.tseries.period import (PeriodIndex, + IncompatibleFrequency) if isinstance(data, PeriodIndex): return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): @@ -265,13 +266,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - elif (inferred.startswith('timedelta') or - lib.is_timedelta_array(subarr)): + elif inferred.startswith('timedelta'): from pandas.tseries.tdi import TimedeltaIndex return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': - return PeriodIndex(subarr, name=name, **kwargs) + try: + return PeriodIndex(subarr, name=name, **kwargs) + except IncompatibleFrequency: + pass return cls._simple_new(subarr, name) elif hasattr(data, '__array__'): @@ -866,6 +869,16 @@ def _convert_can_do_setop(self, other): result_name = self.name if self.name == other.name else None return other, result_name + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + return value + + def _assert_can_do_op(self, value): + """ Check value is valid for scalar op """ + if not lib.isscalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + @property def nlevels(self): return 1 @@ -1508,16 +1521,6 @@ def hasnans(self): else: return False - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - return value - - def 
_assert_can_do_op(self, value): - """ Check value is valid for scalar op """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - def putmask(self, mask, value): """ return a new Index of the values set with the mask diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 9f96037c97c62..fe4748eb0eba0 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -270,7 +270,7 @@ cdef inline bint is_null_datetimelike(v): cdef inline bint is_null_datetime64(v): - # determine if we have a null for a datetime (or integer versions)x, + # determine if we have a null for a datetime (or integer versions), # excluding np.timedelta64('nat') if util._checknull(v): return True @@ -282,7 +282,7 @@ cdef inline bint is_null_datetime64(v): cdef inline bint is_null_timedelta64(v): - # determine if we have a null for a timedelta (or integer versions)x, + # determine if we have a null for a timedelta (or integer versions), # excluding np.datetime64('nat') if util._checknull(v): return True @@ -293,6 +293,16 @@ cdef inline bint is_null_timedelta64(v): return False +cdef inline bint is_null_period(v): + # determine if we have a null for a Period (or integer versions), + # excluding np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + return False + + cdef inline bint is_datetime(object o): return PyDateTime_Check(o) @@ -531,6 +541,7 @@ def is_timedelta_array(ndarray values): return False return null_count != n + def is_timedelta64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -546,6 +557,7 @@ def is_timedelta64_array(ndarray values): return False return null_count != n + def is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ cdef Py_ssize_t i, null_count = 0, n = len(values) @@ -562,6 +574,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False return null_count != n + def is_date_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -571,6 +584,7 @@ def is_date_array(ndarray[object] values): return False return True + def is_time_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -582,15 +596,21 @@ def is_time_array(ndarray[object] values): def is_period_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - from pandas.tseries.period import Period - + cdef Py_ssize_t i, null_count = 0, n = len(values) + cdef object v if n == 0: return False + + # return False for all nulls for i in range(n): - if not isinstance(values[i], Period): + v = values[i] + if is_null_period(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not is_period(v): return False - return True + return null_count != n cdef extern from "parse_helper.h": diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 5c21f71d64660..af44767ae5be5 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -119,10 +119,10 @@ def test_pickle_compat_construction(self): def test_construction_index_with_mixed_timezones(self): # GH 11488 # no tz results in DatetimeIndex - result = Index( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + result = Index([Timestamp('2011-01-01'), + 
Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNone(result.tz) @@ -295,9 +295,9 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00')], + tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) @@ -338,6 +338,17 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + def test_construction_base_constructor(self): + arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) @@ -699,12 +710,11 @@ def test_fillna_datetime64(self): pd.Timestamp('2011-01-01 11:00')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - idx = pd.DatetimeIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], tz=tz) + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], tz=tz) - exp = pd.DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], tz=tz) + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], tz=tz) self.assert_index_equal( idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) @@ -734,6 +744,26 @@ def setUp(self): def create_index(self): return period_range('20130101', periods=5, freq='D') + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='D')] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.Index(np.array(arr), dtype=object)) + def test_astype(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -874,7 +904,6 @@ def test_repeat(self): self.assertEqual(res.freqstr, 'D') def test_period_index_indexer(self): - # GH4125 idx = pd.period_range('2002-01', '2003-12', freq='M') df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) @@ -886,12 +915,11 @@ def test_period_index_indexer(self): def test_fillna_period(self): # GH 11343 - idx = pd.PeriodIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], freq='H') + idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], freq='H') - exp = pd.PeriodIndex( - 
['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], freq='H') + exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H') self.assert_index_equal( idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) @@ -899,10 +927,11 @@ def test_fillna_period(self): pd.Period('2011-01-01 11:00', freq='H')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - with tm.assertRaisesRegexp( - ValueError, - 'Input has different freq=D from PeriodIndex\\(freq=H\\)'): - idx.fillna(pd.Period('2011-01-01', freq='D')) + exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), + pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna(pd.Period('2011-01-01', freq='D')), + exp) def test_no_millisecond_field(self): with self.assertRaises(AttributeError): @@ -923,6 +952,17 @@ def setUp(self): def create_index(self): return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + def test_construction_base_constructor(self): + arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timedelta('1 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + def test_shift(self): # test shift for TimedeltaIndex # err8083 diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index 34d10ee9dfa42..9a12220f5b41d 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -431,6 +431,33 @@ def test_infer_dtype_timedelta(self): dtype=object) self.assertEqual(lib.infer_dtype(arr), 'mixed') + def test_infer_dtype_period(self): + # GH 13664 + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='M')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Period('2011-01', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([n, pd.Period('2011-01', freq='D'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # different type of nat + arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) self.assertEqual(lib.infer_dtype(arr), 'floating') diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index fe0440170383b..188f538372092 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -800,12 +800,15 @@ def _ensure_datetimelike_to_i8(other): if lib.isscalar(other) and isnull(other): other = tslib.iNaT elif isinstance(other, ABCIndexClass): - # convert tz if needed if getattr(other, 'tz', None) is not None: other = other.tz_localize(None).asi8 else: other = other.asi8 else: - other = np.array(other, copy=False).view('i8') + try: + other = np.array(other, copy=False).view('i8') + except TypeError: + # period array cannot be coerces to int + other = 
Index(other).asi8 return other diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 6c996285369b8..4aa1e2f5d33dd 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1735,9 +1735,9 @@ def test_representation_to_series(self): 2 2013 dtype: object""" - exp6 = """0 2011-01-01 09:00 -1 2012-02-01 10:00 -2 NaT + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT dtype: object""" exp7 = """0 2013Q1 From 31c2e5ffa9c8008e2d84dc5ffa02f2d938a32294 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 19 Jul 2016 08:47:19 -0400 Subject: [PATCH 36/50] PERF: improve DTI string parse closes #11169 closes #11287 Author: sinhrks Closes #13692 from sinhrks/dti_perf and squashes the following commits: 8774772 [sinhrks] PERF: improve DTI string parse --- doc/source/whatsnew/v0.19.0.txt | 4 + pandas/io/parsers.py | 2 +- pandas/tests/indexes/test_datetimelike.py | 31 +++----- pandas/tseries/index.py | 93 +++++------------------ pandas/tseries/resample.py | 7 +- pandas/tseries/tests/test_timeseries.py | 5 +- pandas/tseries/tests/test_tslib.py | 22 +++--- pandas/tseries/tools.py | 33 +++----- pandas/tslib.pyx | 55 +++----------- 9 files changed, 74 insertions(+), 178 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f65f7d57d5d08..69200d7142b9f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -566,6 +566,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index.difference`` (:issue:`12044`) +- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) .. 
_whatsnew_0190.bug_fixes: @@ -631,6 +632,7 @@ Bug Fixes - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) + - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) @@ -654,6 +656,8 @@ Bug Fixes - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) +- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) +- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84ea2a92b8026..f6a84ea9debaa 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2440,7 +2440,7 @@ def converter(*date_cols): strs = _concat_date_cols(date_cols) try: - return tools._to_datetime( + return tools.to_datetime( _ensure_object(strs), utc=None, box=False, diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index af44767ae5be5..378e8c545ec83 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 19:00'), - Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # length = 1 result = Index([Timestamp('2011-01-01')], name='idx') exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') @@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'), - pd.NaT, Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # all NaT result = Index([pd.NaT, pd.NaT], name='idx') exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') @@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self): self.assertTrue(isinstance(result, DatetimeIndex)) # tz mismatch affecting to tz-aware raises TypeError/ValueError + with tm.assertRaises(ValueError): DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], name='idx') - with tm.assertRaises(TypeError): + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): 
DatetimeIndex([Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='Asia/Tokyo', name='idx') @@ -338,6 +318,13 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): + # passing tz should results in DatetimeIndex, then mismatch raises + # TypeError + Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='Asia/Tokyo', name='idx') + def test_construction_base_constructor(self): arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 47bb69b8d7ad6..d448ca9878b99 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -292,55 +292,32 @@ def __new__(cls, data=None, raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) - # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) - data = np.asarray(data, dtype='O') + elif isinstance(data, ABCSeries): + data = data._values - # try a few ways to make it datetime64 - if lib.is_string_array(data): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - else: - data = tools.to_datetime(data, errors='raise') - data.offset = freq - if isinstance(data, DatetimeIndex): - if name is not None: - data.name = name - - if tz is not None: - - # we might already be localized to this tz - # so passing the same tz is ok - # however any other tz is a no-no - if data.tz is None: - return data.tz_localize(tz, ambiguous=ambiguous) - elif str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") - - return data._deepcopy_if_needed(ref_to_data, copy) - - if issubclass(data.dtype.type, compat.string_types): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) + # data must be Index or np.ndarray here + if not (is_datetime64_dtype(data) or is_datetimetz(data) or + is_integer_dtype(data)): + data = tools.to_datetime(data, dayfirst=dayfirst, + yearfirst=yearfirst) if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): - if isinstance(data, ABCSeries): - data = data._values + if isinstance(data, DatetimeIndex): if tz is None: tz = data.tz - + elif data.tz is None: + data = data.tz_localize(tz, ambiguous=ambiguous) else: # the tz's must match if str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") + msg = ('data is already tz-aware {0}, unable to ' + 'set specified tz: {1}') + raise TypeError(msg.format(data.tz, tz)) subarr = data.values @@ -356,35 +333,6 @@ def __new__(cls, data=None, if isinstance(data, Int64Index): raise TypeError('cannot convert Int64Index->DatetimeIndex') subarr = data.view(_NS_DTYPE) - else: - if isinstance(data, (ABCSeries, Index)): - values = data._values - else: - values = data - - if lib.is_string_array(values): - subarr = tslib.parse_str_array_to_datetime( - values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst) - else: - try: - subarr = tools.to_datetime(data, box=False) - - # make sure that we have a index/ndarray like (and not a - # Series) - if isinstance(subarr, ABCSeries): - subarr = subarr._values - if subarr.dtype == np.object_: - subarr = tools._to_datetime(subarr, 
box=False) - - except ValueError: - # tz aware - subarr = tools._to_datetime(data, box=False, utc=True) - - # we may not have been able to convert - if not (is_datetimetz(subarr) or - np.issubdtype(subarr.dtype, np.datetime64)): - raise ValueError('Unable to convert %s to datetime dtype' - % str(data)) if isinstance(subarr, DatetimeIndex): if tz is None: @@ -399,27 +347,21 @@ def __new__(cls, data=None, ints = subarr.view('i8') subarr = tslib.tz_localize_to_utc(ints, tz, ambiguous=ambiguous) - subarr = subarr.view(_NS_DTYPE) subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) - - # if dtype is provided, coerce here if dtype is not None: - if not is_dtype_equal(subarr.dtype, dtype): - + # dtype must be coerced to DatetimeTZDtype above if subarr.tz is not None: raise ValueError("cannot localize from non-UTC data") - dtype = DatetimeTZDtype.construct_from_string(dtype) - subarr = subarr.tz_localize(dtype.tz) if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: inferred = subarr.inferred_freq if inferred != freq.freqstr: - on_freq = cls._generate(subarr[0], None, len( - subarr), None, freq, tz=tz, ambiguous=ambiguous) + on_freq = cls._generate(subarr[0], None, len(subarr), None, + freq, tz=tz, ambiguous=ambiguous) if not np.array_equal(subarr.asi8, on_freq.asi8): raise ValueError('Inferred frequency {0} from passed ' 'dates does not conform to passed ' @@ -563,7 +505,6 @@ def _generate(cls, start, end, periods, name, offset, index = index[1:] if not right_closed and len(index) and index[-1] == end: index = index[:-1] - index = cls._simple_new(index, name=name, freq=offset, tz=tz) return index @@ -669,7 +610,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, xdr = generate_range(offset=offset, start=_CACHE_START, end=_CACHE_END) - arr = tools._to_datetime(list(xdr), box=False) + arr = tools.to_datetime(list(xdr), box=False) cachedRange = DatetimeIndex._simple_new(arr) cachedRange.offset = offset diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 8d6955ab43711..e493e9d936b02 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1046,7 +1046,12 @@ def _get_binner_for_grouping(self, obj): l = [] for key, group in grouper.get_iterator(self.ax): l.extend([key] * len(group)) - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + + if isinstance(self.ax, PeriodIndex): + grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + else: + # resampling causes duplicated values, specifying freq is invalid + grouper = binner.__class__(l, name=binner.name) # since we may have had to sort # may need to reorder groups here diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 299ec374567e7..59fc147ead4eb 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4087,8 +4087,9 @@ def test_dti_set_index_reindex(self): # 11314 # with tz - index = date_range(datetime(2015, 10, 1), datetime( - 2015, 10, 1, 23), freq='H', tz='US/Eastern') + index = date_range(datetime(2015, 10, 1), + datetime(2015, 10, 1, 23), + freq='H', tz='US/Eastern') df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) new_index = date_range(datetime(2015, 10, 2), datetime(2015, 10, 2, 23), diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index f30f01e66cb0b..22bb3bddbc742 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,7 +7,8 
@@ import datetime import pandas as pd -from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime +from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period, + to_datetime) from pandas.tslib import get_timezone from pandas._period import period_asfreq, period_ordinal from pandas.tseries.index import date_range, DatetimeIndex @@ -698,14 +699,19 @@ def test_parsers(self): yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below result4 = to_datetime(np.array([date_str], dtype=object), yearfirst=yearfirst) - result6 = DatetimeIndex([date_str], yearfirst=yearfirst)[0] - self.assertEqual(result1, expected) - self.assertEqual(result2, expected) - self.assertEqual(result3, expected) - self.assertEqual(result4, expected) - self.assertEqual(result6, expected) + result6 = DatetimeIndex([date_str], yearfirst=yearfirst) + # result7 is used below + result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) + result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst) + + for res in [result1, result2]: + self.assertEqual(res, expected) + for res in [result3, result4, result6, result8, result9]: + exp = DatetimeIndex([pd.Timestamp(expected)]) + tm.assert_index_equal(res, exp) # these really need to have yearfist, but we don't support if not yearfirst: @@ -893,9 +899,7 @@ def test_parsers_monthfreq(self): for date_str, expected in compat.iteritems(cases): result1, _, _ = tools.parse_time_string(date_str, freq='M') - result2 = tools._to_datetime(date_str, freq='M') self.assertEqual(result1, expected) - self.assertEqual(result2, expected) def test_parsers_quarterly_with_freq(self): msg = ('Incorrect quarterly string is given, quarter ' diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 067e8ec19f644..93d35ff964e69 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -295,22 +295,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 1 loop, best of 3: 471 ms per loop """ - return _to_datetime(arg, errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, box=box, format=format, exact=exact, - unit=unit, infer_datetime_format=infer_datetime_format) - -def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - utc=None, box=True, format=None, exact=True, - unit=None, freq=None, infer_datetime_format=False): - """ - Same as to_datetime, but accept freq for - DatetimeIndex internal construction - """ from pandas.tseries.index import DatetimeIndex - def _convert_listlike(arg, box, format, name=None): + tz = 'utc' if utc else None + + def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -319,8 +309,7 @@ def _convert_listlike(arg, box, format, name=None): if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: - return DatetimeIndex(arg, tz='utc' if utc else None, - name=name) + return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass @@ -328,7 +317,7 @@ def _convert_listlike(arg, box, format, name=None): elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): - return DatetimeIndex(arg, tz='utc' if utc else None) + return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg @@ -344,8 +333,7 @@ def _convert_listlike(arg, box, format, name=None): from pandas import Index return Index(result) - return 
DatetimeIndex(result, tz='utc' if utc else None, - name=name) + return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' @@ -382,8 +370,8 @@ def _convert_listlike(arg, box, format, name=None): # fallback if result is None: try: - result = tslib.array_strptime( - arg, format, exact=exact, errors=errors) + result = tslib.array_strptime(arg, format, exact=exact, + errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise @@ -404,14 +392,11 @@ def _convert_listlike(arg, box, format, name=None): utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - freq=freq, require_iso8601=require_iso8601 ) if is_datetime64_dtype(result) and box: - result = DatetimeIndex(result, - tz='utc' if utc else None, - name=name) + result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 5624b84523705..016c49ea2b859 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -852,13 +852,6 @@ cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1: return _nat_scalar_rules[op] -cdef _tz_format(object obj, object zone): - try: - return obj.strftime(' %%Z, tz=%s' % zone) - except: - return ', tz=%s' % zone - - cpdef object get_value_box(ndarray arr, object loc): cdef: Py_ssize_t i, sz @@ -1642,14 +1635,6 @@ cdef inline _check_dts_bounds(pandas_datetimestruct *dts): raise OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt) -# elif isinstance(ts, _Timestamp): -# tmp = ts -# obj.value = (<_Timestamp> ts).value -# obj.dtval = -# elif isinstance(ts, object): -# # If all else fails -# obj.value = _dtlike_to_datetime64(ts, &obj.dts) -# obj.dtval = _dts_to_pydatetime(&obj.dts) def datetime_to_datetime64(ndarray[object] values): cdef: @@ -1689,7 +1674,7 @@ def datetime_to_datetime64(ndarray[object] values): cdef: set _not_datelike_strings = set(['a','A','m','M','p','P','t','T']) -cpdef object _does_string_look_like_datetime(object date_string): +cpdef bint _does_string_look_like_datetime(object date_string): if date_string.startswith('0'): # Strings starting with 0 are more consistent with a # date-like string than a number @@ -1827,8 +1812,14 @@ def parse_datetime_string(object date_string, object freq=None, except ValueError: pass - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + try: + dt = parse_date(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + return dt @@ -2214,7 +2205,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): cpdef array_to_datetime(ndarray[object] values, errors='raise', - dayfirst=False, yearfirst=False, freq=None, + dayfirst=False, yearfirst=False, format=None, utc=None, require_iso8601=False): cdef: @@ -2343,7 +2334,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) except Exception: if is_coerce: iresult[i] = NPY_NAT @@ -2423,7 +2414,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) 
_pydatetime_to_dts(oresult[i], &dts) _check_dts_bounds(&dts) except Exception: @@ -2438,28 +2429,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -def parse_str_array_to_datetime(ndarray values, dayfirst=False, - yearfirst=False, object freq=None): - """Shortcut to parse str array for quicker DatetimeIndex construction""" - cdef: - Py_ssize_t i, n = len(values) - object val, py_dt - ndarray[int64_t] iresult - _TSObject _ts - - iresult = np.empty(n, dtype='i8') - - for i in range(n): - val = values[i] - try: - py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) - except Exception: - raise ValueError - _ts = convert_to_tsobject(py_dt, None, None, 0, 0) - iresult[i] = _ts.value - - return iresult # Similar to Timestamp/datetime, this is a construction requirement for timedeltas # we need to do object instantiation in python From 4c9ae94f1ee4d867e2d92735d5755d43daef618d Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 19 Jul 2016 09:11:08 -0400 Subject: [PATCH 37/50] DOC: resample warnings closes #13618 closes #13520 Author: Chris Closes #13675 from chris-b1/resample-warning and squashes the following commits: 2185c1f [Chris] whatsnew note c58c70c [Chris] DOC: resample warnings --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/resample.py | 52 ++++++++++++++++----------- pandas/tseries/tests/test_resample.py | 7 ++++ 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 69200d7142b9f..efa6e5575fa79 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -639,7 +639,7 @@ Bug Fixes - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) - Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - +- Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) - Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) - Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index e493e9d936b02..38c2e009a01f3 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -60,12 +60,15 @@ class Resampler(_GroupBy): 'loffset', 'base', 'kind'] # API compat of allowed attributes - _deprecated_valids = _attributes + ['_ipython_display_', '__doc__', - '_cache', '_attributes', 'binner', - 'grouper', 'groupby', 'keys', - 'sort', 'kind', 'squeeze', - 'group_keys', 'as_index', - 'exclusions', '_groupby'] + _deprecated_valids = _attributes + ['__doc__', '_cache', '_attributes', + 'binner', 'grouper', 'groupby', + 'sort', 'kind', 'squeeze', 'keys', + 'group_keys', 'as_index', 'exclusions', + '_groupby'] + + # don't raise deprecation warning on attributes starting with these + # patterns - prevents warnings caused by IPython introspection + _deprecated_valid_patterns = ['_ipython', '_repr'] # API compat of disallowed attributes _deprecated_invalids = ['iloc', 'loc', 'ix', 'iat', 'at'] @@ -109,9 +112,12 @@ def _typ(self): return 'series' return 'dataframe' - def _deprecated(self): - 
warnings.warn(".resample() is now a deferred operation\n" - "use .resample(...).mean() instead of .resample(...)", + def _deprecated(self, op): + warnings.warn(("\n.resample() is now a deferred operation\n" + "You called {op}(...) on this deferred object " + "which materialized it into a {klass}\nby implicitly " + "taking the mean. Use .resample(...).mean() " + "instead").format(op=op, klass=self._typ), FutureWarning, stacklevel=3) return self.mean() @@ -119,20 +125,20 @@ def _make_deprecated_binop(op): # op is a string def _evaluate_numeric_binop(self, other): - result = self._deprecated() + result = self._deprecated(op) return getattr(result, op)(other) return _evaluate_numeric_binop - def _make_deprecated_unary(op): + def _make_deprecated_unary(op, name): # op is a callable def _evaluate_numeric_unary(self): - result = self._deprecated() + result = self._deprecated(name) return op(result) return _evaluate_numeric_unary def __array__(self): - return self._deprecated().__array__() + return self._deprecated('__array__').__array__() __gt__ = _make_deprecated_binop('__gt__') __ge__ = _make_deprecated_binop('__ge__') @@ -148,10 +154,10 @@ def __array__(self): __truediv__ = __rtruediv__ = _make_deprecated_binop('__truediv__') if not compat.PY3: __div__ = __rdiv__ = _make_deprecated_binop('__div__') - __neg__ = _make_deprecated_unary(lambda x: -x) - __pos__ = _make_deprecated_unary(lambda x: x) - __abs__ = _make_deprecated_unary(lambda x: np.abs(x)) - __inv__ = _make_deprecated_unary(lambda x: -x) + __neg__ = _make_deprecated_unary(lambda x: -x, '__neg__') + __pos__ = _make_deprecated_unary(lambda x: x, '__pos__') + __abs__ = _make_deprecated_unary(lambda x: np.abs(x), '__abs__') + __inv__ = _make_deprecated_unary(lambda x: -x, '__inv__') def __getattr__(self, attr): if attr in self._internal_names_set: @@ -165,8 +171,12 @@ def __getattr__(self, attr): raise ValueError(".resample() is now a deferred operation\n" "\tuse .resample(...).mean() instead of " ".resample(...)") - if attr not in self._deprecated_valids: - self = self._deprecated() + + matches_pattern = any(attr.startswith(x) for x + in self._deprecated_valid_patterns) + if not matches_pattern and attr not in self._deprecated_valids: + self = self._deprecated(attr) + return object.__getattribute__(self, attr) def __setattr__(self, attr, value): @@ -182,7 +192,7 @@ def __getitem__(self, key): # compat for deprecated if isinstance(self.obj, com.ABCSeries): - return self._deprecated()[key] + return self._deprecated('__getitem__')[key] raise @@ -230,7 +240,7 @@ def _assure_grouper(self): def plot(self, *args, **kwargs): # for compat with prior versions, we want to # have the warnings shown here and just have this work - return self._deprecated().plot(*args, **kwargs) + return self._deprecated('plot').plot(*args, **kwargs) def aggregate(self, arg, *args, **kwargs): """ diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 518f69485004c..85d8cd52e1866 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -168,6 +168,13 @@ def f(): check_stacklevel=False): self.assertIsInstance(getattr(r, op)(2), pd.Series) + # IPython introspection shouldn't trigger warning GH 13618 + for op in ['_repr_json', '_repr_latex', + '_ipython_canary_method_should_not_exist_']: + r = self.series.resample('H') + with tm.assert_produces_warning(None): + getattr(r, op, None) + # getitem compat df = self.series.to_frame('foo') From 8acfad343c88760a6d09fea221996dd50393fa8a Mon Sep 17 
00:00:00 2001 From: gfyoung Date: Tue, 19 Jul 2016 16:31:08 -0400 Subject: [PATCH 38/50] CLN: Removed the flavor='mysql' option and deprecate flavor in DataFrame.to_sql (#13611) --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/generic.py | 13 +- pandas/io/sql.py | 165 +++++++------------- pandas/io/tests/test_sql.py | 262 ++++++++++---------------------- 4 files changed, 144 insertions(+), 298 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index efa6e5575fa79..57b0d8895f67b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -524,6 +524,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) @@ -541,6 +542,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) +- ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c1676fbdd7f4..e59bec2dbd7e0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1144,7 +1144,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs) - def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', + def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): """ Write records stored in a DataFrame to a SQL database. @@ -1155,12 +1155,11 @@ def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', Name of SQL table con : SQLAlchemy engine or DBAPI2 connection (legacy mode) Using SQLAlchemy makes it possible to use any DB supported by that - library. - If a DBAPI2 object, only sqlite3 is supported. - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy engine. - 'mysql' is deprecated and will be removed in future versions, but - it will be further supported through SQLAlchemy engines. + library. If a DBAPI2 object, only sqlite3 is supported. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version, + as 'sqlite' is the only supported option if SQLAlchemy is not + installed. schema : string, default None Specify the schema (if database flavor supports this). If None, use default schema. 
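For a concrete sense of the new behaviour, here is a minimal sketch (not part of the patch itself) of how ``to_sql`` treats the deprecated ``flavor`` argument after this change, assuming an in-memory sqlite3 connection:

    import sqlite3
    import warnings

    import pandas as pd

    conn = sqlite3.connect(':memory:')
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

    # preferred usage: omit 'flavor' entirely
    df.to_sql('demo', conn, index=False)

    # flavor='sqlite' still works, but now emits a FutureWarning
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        df.to_sql('demo_legacy', conn, index=False, flavor='sqlite')
        assert any(issubclass(x.category, FutureWarning) for x in w)

    # any other flavor (e.g. 'mysql') now raises ValueError
    try:
        df.to_sql('demo_mysql', conn, index=False, flavor='mysql')
    except ValueError as exc:
        print(exc)  # "database flavor mysql is not supported"

The module-level ``pandas.io.sql.to_sql``, ``has_table`` and ``get_schema`` helpers deprecate ``flavor`` in the same way, as the following diff shows.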
diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8485a3f13f047..b9eaa0e4d657b 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -41,6 +41,24 @@ class DatabaseError(IOError): _SQLALCHEMY_INSTALLED = None +def _validate_flavor_parameter(flavor): + """ + Checks whether a database 'flavor' was specified. + If not None, produces FutureWarning if 'sqlite' and + raises a ValueError if anything else. + """ + if flavor is not None: + if flavor == 'sqlite': + warnings.warn("the 'flavor' parameter is deprecated " + "and will be removed in a future version, " + "as 'sqlite' is the only supported option " + "when SQLAlchemy is not installed.", + FutureWarning, stacklevel=2) + else: + raise ValueError("database flavor {flavor} is not " + "supported".format(flavor=flavor)) + + def _is_sqlalchemy_connectable(con): global _SQLALCHEMY_INSTALLED if _SQLALCHEMY_INSTALLED is None: @@ -517,7 +535,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, chunksize=chunksize) -def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', +def to_sql(frame, name, con, flavor=None, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): """ Write records stored in a DataFrame to a SQL database. @@ -532,10 +550,8 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. - 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy connectables. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). @@ -573,7 +589,7 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', chunksize=chunksize, dtype=dtype) -def has_table(table_name, con, flavor='sqlite', schema=None): +def has_table(table_name, con, flavor=None, schema=None): """ Check if DataBase has named table. @@ -585,10 +601,8 @@ def has_table(table_name, con, flavor='sqlite', schema=None): Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - flavor: {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. - 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy connectables. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). @@ -603,12 +617,6 @@ def has_table(table_name, con, flavor='sqlite', schema=None): table_exists = has_table -_MYSQL_WARNING = ("The 'mysql' flavor with DBAPI connection is deprecated " - "and will be removed in future versions. 
" - "MySQL will be further supported with SQLAlchemy " - "connectables.") - - def _engine_builder(con): """ Returns a SQLAlchemy engine from a URI (if con is a string) @@ -632,15 +640,15 @@ def pandasSQL_builder(con, flavor=None, schema=None, meta=None, Convenience function to return the correct PandasSQL subclass based on the provided parameters """ + _validate_flavor_parameter(flavor) + # When support for DBAPI connections is removed, # is_cursor should not be necessary. con = _engine_builder(con) if _is_sqlalchemy_connectable(con): return SQLDatabase(con, schema=schema, meta=meta) else: - if flavor == 'mysql': - warnings.warn(_MYSQL_WARNING, FutureWarning, stacklevel=3) - return SQLiteDatabase(con, flavor, is_cursor=is_cursor) + return SQLiteDatabase(con, is_cursor=is_cursor) class SQLTable(PandasObject): @@ -1035,11 +1043,11 @@ class PandasSQL(PandasObject): def read_sql(self, *args, **kwargs): raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or connection+sql flavor") + "connectable or sqlite connection") def to_sql(self, *args, **kwargs): raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or connection+sql flavor") + "connectable or sqlite connection") class SQLDatabase(PandasSQL): @@ -1308,38 +1316,16 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): # ---- SQL without SQLAlchemy --- -# Flavour specific sql strings and handler class for access to DBs without -# SQLAlchemy installed -# SQL type convertions for each DB +# sqlite-specific sql strings and handler class +# dictionary used for readability purposes _SQL_TYPES = { - 'string': { - 'mysql': 'VARCHAR (63)', - 'sqlite': 'TEXT', - }, - 'floating': { - 'mysql': 'DOUBLE', - 'sqlite': 'REAL', - }, - 'integer': { - 'mysql': 'BIGINT', - 'sqlite': 'INTEGER', - }, - 'datetime': { - 'mysql': 'DATETIME', - 'sqlite': 'TIMESTAMP', - }, - 'date': { - 'mysql': 'DATE', - 'sqlite': 'DATE', - }, - 'time': { - 'mysql': 'TIME', - 'sqlite': 'TIME', - }, - 'boolean': { - 'mysql': 'BOOLEAN', - 'sqlite': 'INTEGER', - } + 'string': 'TEXT', + 'floating': 'REAL', + 'integer': 'INTEGER', + 'datetime': 'TIMESTAMP', + 'date': 'DATE', + 'time': 'TIME', + 'boolean': 'INTEGER', } @@ -1351,22 +1337,6 @@ def _get_unicode_name(name): return uname -def _get_valid_mysql_name(name): - # Filter for unquoted identifiers - # See http://dev.mysql.com/doc/refman/5.0/en/identifiers.html - uname = _get_unicode_name(name) - if not len(uname): - raise ValueError("Empty table or column name specified") - - basere = r'[0-9,a-z,A-Z$_]' - for c in uname: - if not re.match(basere, c): - if not (0x80 < ord(c) < 0xFFFF): - raise ValueError("Invalid MySQL identifier '%s'" % uname) - - return '`' + uname + '`' - - def _get_valid_sqlite_name(name): # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python @@ -1385,19 +1355,6 @@ def _get_valid_sqlite_name(name): return '"' + uname.replace('"', '""') + '"' -# SQL enquote and wildcard symbols -_SQL_WILDCARD = { - 'mysql': '%s', - 'sqlite': '?' -} - -# Validate and return escaped identifier -_SQL_GET_IDENTIFIER = { - 'mysql': _get_valid_mysql_name, - 'sqlite': _get_valid_sqlite_name, -} - - _SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. 
" "In pandas versions < 0.14, spaces were converted to " "underscores.") @@ -1428,9 +1385,8 @@ def _execute_create(self): def insert_statement(self): names = list(map(text_type, self.frame.columns)) - flv = self.pd_sql.flavor - wld = _SQL_WILDCARD[flv] # wildcard char - escape = _SQL_GET_IDENTIFIER[flv] + wld = '?' # wildcard char + escape = _get_valid_sqlite_name if self.index is not None: [names.insert(0, idx) for idx in self.index[::-1]] @@ -1460,8 +1416,7 @@ def _create_table_setup(self): if any(map(pat.search, column_names)): warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) - flv = self.pd_sql.flavor - escape = _SQL_GET_IDENTIFIER[flv] + escape = _get_valid_sqlite_name create_tbl_stmts = [escape(cname) + ' ' + ctype for cname, ctype, _ in column_names_and_types] @@ -1514,7 +1469,7 @@ def _sql_type_name(self, col): if col_type not in _SQL_TYPES: col_type = "string" - return _SQL_TYPES[col_type][self.pd_sql.flavor] + return _SQL_TYPES[col_type] class SQLiteDatabase(PandasSQL): @@ -1522,25 +1477,17 @@ class SQLiteDatabase(PandasSQL): Version of SQLDatabase to support sqlite connections (fallback without sqlalchemy). This should only be used internally. - For now still supports `flavor` argument to deal with 'mysql' database - for backwards compatibility, but this will be removed in future versions. - Parameters ---------- con : sqlite connection object """ - def __init__(self, con, flavor, is_cursor=False): + def __init__(self, con, flavor=None, is_cursor=False): + _validate_flavor_parameter(flavor) + self.is_cursor = is_cursor self.con = con - if flavor is None: - flavor = 'sqlite' - if flavor not in ['sqlite', 'mysql']: - raise NotImplementedError("flavors other than SQLite and MySQL " - "are not supported") - else: - self.flavor = flavor @contextmanager def run_transaction(self): @@ -1665,15 +1612,12 @@ def to_sql(self, frame, name, if_exists='fail', index=True, def has_table(self, name, schema=None): # TODO(wesm): unused? - # escape = _SQL_GET_IDENTIFIER[self.flavor] + # escape = _get_valid_sqlite_name # esc_name = escape(name) - wld = _SQL_WILDCARD[self.flavor] - flavor_map = { - 'sqlite': ("SELECT name FROM sqlite_master " - "WHERE type='table' AND name=%s;") % wld, - 'mysql': "SHOW TABLES LIKE %s" % wld} - query = flavor_map.get(self.flavor) + wld = '?' + query = ("SELECT name FROM sqlite_master " + "WHERE type='table' AND name=%s;") % wld return len(self.execute(query, [name, ]).fetchall()) > 0 @@ -1681,8 +1625,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - escape = _SQL_GET_IDENTIFIER[self.flavor] - drop_sql = "DROP TABLE %s" % escape(name) + drop_sql = "DROP TABLE %s" % _get_valid_sqlite_name(name) self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): @@ -1691,7 +1634,7 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): return str(table.sql_schema()) -def get_schema(frame, name, flavor='sqlite', keys=None, con=None, dtype=None): +def get_schema(frame, name, flavor=None, keys=None, con=None, dtype=None): """ Get the SQL db table schema for the given frame. @@ -1700,16 +1643,14 @@ def get_schema(frame, name, flavor='sqlite', keys=None, con=None, dtype=None): frame : DataFrame name : string name of SQL table - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. 
- 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy engines. keys : string or sequence, default: None columns to use a primary key con: an open SQL database connection object or a SQLAlchemy connectable Using SQLAlchemy makes it possible to use any DB supported by that library, default: None If a DBAPI2 object, only sqlite3 is supported. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be a SQLAlchemy type, or a string for sqlite3 fallback connection. diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index e5a49c5213a48..41be39f9abaa6 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -13,7 +13,7 @@ common methods, `_TestSQLAlchemyConn` tests the API with a SQLAlchemy Connection object. The different tested flavors (sqlite3, MySQL, PostgreSQL) derive from the base class - - Tests for the fallback mode (`TestSQLiteFallback` and `TestMySQLLegacy`) + - Tests for the fallback mode (`TestSQLiteFallback`) """ @@ -526,30 +526,29 @@ def test_read_sql_view(self): self._check_iris_loaded_frame(iris_frame) def test_to_sql(self): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn, flavor='sqlite') + sql.to_sql(self.test_frame1, 'test_frame1', self.conn) self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='sqlite'), + sql.has_table('test_frame1', self.conn), 'Table not written to DB') def test_to_sql_fail(self): sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') self.assertTrue( - sql.has_table('test_frame2', self.conn, flavor='sqlite'), + sql.has_table('test_frame2', self.conn), 'Table not written to DB') self.assertRaises(ValueError, sql.to_sql, self.test_frame1, - 'test_frame2', self.conn, flavor='sqlite', - if_exists='fail') + 'test_frame2', self.conn, if_exists='fail') def test_to_sql_replace(self): sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') # Add to table again sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, flavor='sqlite', if_exists='replace') + self.conn, if_exists='replace') self.assertTrue( - sql.has_table('test_frame3', self.conn, flavor='sqlite'), + sql.has_table('test_frame3', self.conn), 'Table not written to DB') num_entries = len(self.test_frame1) @@ -560,13 +559,13 @@ def test_to_sql_replace(self): def test_to_sql_append(self): sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') # Add to table again sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, flavor='sqlite', if_exists='append') + self.conn, if_exists='append') self.assertTrue( - sql.has_table('test_frame4', self.conn, flavor='sqlite'), + sql.has_table('test_frame4', self.conn), 'Table not written to DB') num_entries = 2 * len(self.test_frame1) @@ -576,26 +575,25 @@ def test_to_sql_append(self): num_rows, num_entries, "not the same number of rows as entries") def test_to_sql_type_mapping(self): - sql.to_sql(self.test_frame3, 'test_frame5', - self.conn, flavor='sqlite', index=False) + sql.to_sql(self.test_frame3, 'test_frame5', self.conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", self.conn) tm.assert_frame_equal(self.test_frame3, result) def test_to_sql_series(self): s = 
Series(np.arange(5, dtype='int64'), name='series') - sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False) + sql.to_sql(s, "test_series", self.conn, index=False) s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) tm.assert_frame_equal(s.to_frame(), s2) def test_to_sql_panel(self): panel = tm.makePanel() self.assertRaises(NotImplementedError, sql.to_sql, panel, - 'test_panel', self.conn, flavor='sqlite') + 'test_panel', self.conn) def test_roundtrip(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', - con=self.conn, flavor='sqlite') + con=self.conn) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -609,7 +607,7 @@ def test_roundtrip(self): def test_roundtrip_chunksize(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, - index=False, flavor='sqlite', chunksize=2) + index=False, chunksize=2) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -764,27 +762,25 @@ def test_integer_col_names(self): if_exists='replace') def test_get_schema(self): - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', - con=self.conn) + create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn) self.assertTrue('CREATE' in create_sql) def test_get_schema_dtypes(self): float_frame = DataFrame({'a': [1.1, 1.2], 'b': [2.1, 2.2]}) dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER' - create_sql = sql.get_schema(float_frame, 'test', 'sqlite', + create_sql = sql.get_schema(float_frame, 'test', con=self.conn, dtype={'b': dtype}) self.assertTrue('CREATE' in create_sql) self.assertTrue('INTEGER' in create_sql) def test_get_schema_keys(self): frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]}) - create_sql = sql.get_schema(frame, 'test', 'sqlite', - con=self.conn, keys='Col1') + create_sql = sql.get_schema(frame, 'test', con=self.conn, keys='Col1') constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' self.assertTrue(constraint_sentence in create_sql) # multiple columns as key (GH10385) - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', + create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn, keys=['A', 'B']) constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' self.assertTrue(constraint_sentence in create_sql) @@ -1044,8 +1040,8 @@ def test_sql_open_close(self): with tm.ensure_clean() as name: conn = self.connect(name) - sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, - flavor="sqlite", index=False) + sql.to_sql(self.test_frame3, "test_frame3_legacy", + conn, index=False) conn.close() conn = self.connect(name) @@ -1067,12 +1063,11 @@ def test_safe_names_warning(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space # warns on create table with spaces in names with tm.assert_produces_warning(): - sql.to_sql(df, "test_frame3_legacy", self.conn, - flavor="sqlite", index=False) + sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) def test_get_schema2(self): # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite') + create_sql = sql.get_schema(self.test_frame1, 'test') self.assertTrue('CREATE' in create_sql) def test_tquery(self): @@ -1098,7 +1093,7 @@ def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], utc=True)}) - db = sql.SQLiteDatabase(self.conn, self.flavor) 
+ db = sql.SQLiteDatabase(self.conn) table = sql.SQLiteTable("test_type", db, frame=df) schema = table.sql_schema() self.assertEqual(self._get_sqlite_column_type(schema, 'time'), @@ -1908,16 +1903,12 @@ def connect(cls): def setUp(self): self.conn = self.connect() - self.pandasSQL = sql.SQLiteDatabase(self.conn, 'sqlite') + self.pandasSQL = sql.SQLiteDatabase(self.conn) self._load_iris_data() self._load_test1_data() - def test_invalid_flavor(self): - self.assertRaises( - NotImplementedError, sql.SQLiteDatabase, self.conn, 'oracle') - def test_read_sql(self): self._read_sql_iris() @@ -1965,7 +1956,7 @@ def test_execute_sql(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False, flavor=self.flavor) + df.to_sql('test_date', self.conn, index=False) res = read_sql_query('SELECT * FROM test_date', self.conn) if self.flavor == 'sqlite': # comes back as strings @@ -1976,7 +1967,7 @@ def test_datetime_date(self): def test_datetime_time(self): # test support for datetime.time, GH #8341 df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False, flavor=self.flavor) + df.to_sql('test_time', self.conn, index=False) res = read_sql_query('SELECT * FROM test_time', self.conn) if self.flavor == 'sqlite': # comes back as strings @@ -2051,130 +2042,22 @@ def test_illegal_names(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) # Raise error on blank - self.assertRaises(ValueError, df.to_sql, "", self.conn, - flavor=self.flavor) + self.assertRaises(ValueError, df.to_sql, "", self.conn) for ndx, weird_name in enumerate( ['test_weird_name]', 'test_weird_name[', 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'', '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', '99beginswithnumber', '12345', u'\xe9']): - df.to_sql(weird_name, self.conn, flavor=self.flavor) + df.to_sql(weird_name, self.conn) sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name]) c_tbl = 'test_weird_col_name%d' % ndx - df2.to_sql(c_tbl, self.conn, flavor=self.flavor) + df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) -class TestMySQLLegacy(MySQLMixIn, TestSQLiteFallback): - """ - Test the legacy mode against a MySQL database. 
- - """ - flavor = 'mysql' - - @classmethod - def setUpClass(cls): - cls.setup_driver() - - # test connection - try: - cls.connect() - except cls.driver.err.OperationalError: - raise nose.SkipTest( - "{0} - can't connect to MySQL server".format(cls)) - - @classmethod - def setup_driver(cls): - try: - import pymysql - cls.driver = pymysql - except ImportError: - raise nose.SkipTest('pymysql not installed') - - @classmethod - def connect(cls): - return cls.driver.connect(host='127.0.0.1', user='root', passwd='', - db='pandas_nosetest') - - def _count_rows(self, table_name): - cur = self._get_exec() - cur.execute( - "SELECT count(*) AS count_1 FROM %s" % table_name) - rows = cur.fetchall() - return rows[0][0] - - def setUp(self): - try: - self.conn = self.connect() - except self.driver.err.OperationalError: - raise nose.SkipTest("Can't connect to MySQL server") - - self.pandasSQL = sql.SQLiteDatabase(self.conn, 'mysql') - - self._load_iris_data() - self._load_test1_data() - - def test_a_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn, - flavor='mysql') - self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='mysql'), - 'Table not written to DB') - - def _get_index_columns(self, tbl_name): - ixs = sql.read_sql_query( - "SHOW INDEX IN %s" % tbl_name, self.conn) - ix_cols = {} - for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - if ix_name not in ix_cols: - ix_cols[ix_name] = [] - ix_cols[ix_name].append(ix_col) - return list(ix_cols.values()) - - # TODO: cruft? - # def test_to_sql_save_index(self): - # self._to_sql_save_index() - - # for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - # if ix_name not in ix_cols: - # ix_cols[ix_name] = [] - # ix_cols[ix_name].append(ix_col) - # return ix_cols.values() - - def test_to_sql_save_index(self): - self._to_sql_save_index() - - def test_illegal_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) - - # These tables and columns should be ok - for ndx, ok_name in enumerate(['99beginswithnumber', '12345']): - df.to_sql(ok_name, self.conn, flavor=self.flavor, index=False, - if_exists='replace') - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', ok_name]) - - df2.to_sql('test_ok_col_name', self.conn, - flavor=self.flavor, index=False, - if_exists='replace') - - # For MySQL, these should raise ValueError - for ndx, illegal_name in enumerate( - ['test_illegal_name]', 'test_illegal_name[', - 'test_illegal_name`', 'test_illegal_name"', - 'test_illegal_name\'', '']): - self.assertRaises(ValueError, df.to_sql, illegal_name, self.conn, - flavor=self.flavor, index=False) - - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', illegal_name]) - self.assertRaises(ValueError, df2.to_sql, - 'test_illegal_col_name%d' % ndx, - self.conn, flavor=self.flavor, index=False) - - # ----------------------------------------------------------------------------- # -- Old tests from 0.13.1 (before refactor using sqlalchemy) @@ -2228,7 +2111,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.ix[0, 0] = np.nan - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(create_sql) @@ -2247,7 +2130,7 @@ def test_write_row_by_row(self): def test_execute(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(create_sql) ins = "INSERT INTO test VALUES (?, 
?, ?, ?)" @@ -2262,7 +2145,7 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') lines = create_sql.splitlines() for l in lines: tokens = l.split(' ') @@ -2270,7 +2153,7 @@ def test_schema(self): self.assertTrue(tokens[1] == 'DATETIME') frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],) + create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) lines = create_sql.splitlines() self.assertTrue('PRIMARY KEY ("A", "B")' in create_sql) cur = self.conn.cursor() @@ -2425,44 +2308,68 @@ def clean_up(test_table_to_drop): frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail') + sql.to_sql(frame=df_if_exists_1, con=self.conn, + name=table_name, if_exists='fail') self.assertRaises(ValueError, sql.to_sql, frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='sqlite', if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='sqlite', if_exists='append', index=False) + if_exists='append', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) +class TestSQLFlavorDeprecation(tm.TestCase): + """ + gh-13611: test that the 'flavor' parameter + is appropriately deprecated by checking the + functions that directly raise the warning + """ + + con = 1234 # don't need real connection for this + funcs = ['SQLiteDatabase', 'pandasSQL_builder'] + + def test_unsupported_flavor(self): + msg = 'is not supported' + + for func in self.funcs: + tm.assertRaisesRegexp(ValueError, msg, getattr(sql, func), + self.con, flavor='mysql') + + def test_deprecated_flavor(self): + for func in self.funcs: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + getattr(sql, func)(self.con, flavor='sqlite') + + +@unittest.skip("gh-13611: there is no support for MySQL " + "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn, tm.TestCase): @classmethod @@ -2531,7 +2438,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.ix[0, 0] = np.nan drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) @@ -2553,7 +2460,7 @@ def test_chunksize_read_type(self): drop_sql = "DROP TABLE IF EXISTS test" cur = self.conn.cursor() cur.execute(drop_sql) - sql.to_sql(frame, name='test', 
con=self.conn, flavor='mysql') + sql.to_sql(frame, name='test', con=self.conn) query = "select * from test" chunksize = 5 chunk_gen = pd.read_sql_query(sql=query, con=self.conn, @@ -2565,7 +2472,7 @@ def test_execute(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") @@ -2584,7 +2491,7 @@ def test_execute(self): def test_schema(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') lines = create_sql.splitlines() for l in lines: tokens = l.split(' ') @@ -2593,7 +2500,7 @@ def test_schema(self): frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],) + create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) lines = create_sql.splitlines() self.assertTrue('PRIMARY KEY (`A`, `B`)' in create_sql) cur = self.conn.cursor() @@ -2666,8 +2573,7 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + sql.to_sql(frame, name='test_table', con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. @@ -2687,7 +2593,7 @@ def _check_roundtrip(self, frame): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) sql.to_sql(frame2, name='test_table2', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) result = sql.read_sql("select * from test_table2", self.conn, index_col='Idx') expected = frame.copy() @@ -2707,7 +2613,7 @@ def test_tquery(self): cur = self.conn.cursor() cur.execute(drop_sql) sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) result = sql.tquery("select A from test_table", self.conn) expected = Series(frame.A.values, frame.index) # not to have name result = Series(result, frame.index) @@ -2733,7 +2639,7 @@ def test_uquery(self): cur = self.conn.cursor() cur.execute(drop_sql) sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' self.assertEqual(sql.uquery(stmt, con=self.conn), 1) @@ -2753,7 +2659,7 @@ def test_keyword_as_column_names(self): _skip_if_no_pymysql() df = DataFrame({'From': np.ones(5)}) sql.to_sql(df, con=self.conn, name='testkeywords', - if_exists='replace', flavor='mysql', index=False) + if_exists='replace', index=False) def test_if_exists(self): _skip_if_no_pymysql() @@ -2776,39 +2682,37 @@ def clean_up(test_table_to_drop): frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertRaises(ValueError, sql.to_sql, frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', 
if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='mysql', if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='mysql', if_exists='append', index=False) + if_exists='append', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) From 786edc7ed737d9f912613f29a62997017e729a37 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 20 Jul 2016 13:23:02 -0400 Subject: [PATCH 39/50] ENH: add time-window capability to .rolling xref #13327 closes #936 Author: Jeff Reback Closes #13513 from jreback/rolling and squashes the following commits: d8f3d73 [Jeff Reback] ENH: add time-window capability to .rolling --- ci/lint.sh | 14 +- doc/source/computation.rst | 85 ++ doc/source/timeseries.rst | 6 +- doc/source/whatsnew/v0.19.0.txt | 63 +- pandas/core/generic.py | 5 +- pandas/core/window.py | 363 +++++- pandas/tests/test_window.py | 573 ++++++++- pandas/window.pyx | 1921 +++++++++++++++++++++---------- setup.py | 6 +- 9 files changed, 2349 insertions(+), 687 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 9f582f72fcdd7..144febcfcece5 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -17,7 +17,19 @@ if [ "$LINT" ]; then fi done - echo "Linting DONE" + echo "Linting *.py DONE" + + echo "Linting *.pyx" + for path in 'window.pyx' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126,E128 + if [ $? -ne "0" ]; then + RET=1 + fi + + done + echo "Linting *.pyx DONE" echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 59675e33e724b..12e0ecfba97da 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -391,6 +391,91 @@ For some windowing functions, additional parameters must be specified: such that the weights are normalized with respect to each other. Weights of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. +.. _stats.moments.ts: + +Time-aware Rolling +~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +New in version 0.19.0 are the ability to pass an offset (or convertible) to a ``.rolling()`` method and have it produce +variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring +within the indicated time delta. + +This can be particularly useful for a non-regular time frequency index. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. 
ipython:: python
+
+    dft.rolling(2).sum()
+    dft.rolling(2, min_periods=1).sum()
+
+Specifying an offset allows a more intuitive specification of the rolling frequency.
+
+.. ipython:: python
+
+    dft.rolling('2s').sum()
+
+Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation.
+
+
+.. ipython:: python
+
+
+    dft = DataFrame({'B': [0, 1, 2, np.nan, 4]},
+                    index = pd.Index([pd.Timestamp('20130101 09:00:00'),
+                                      pd.Timestamp('20130101 09:00:02'),
+                                      pd.Timestamp('20130101 09:00:03'),
+                                      pd.Timestamp('20130101 09:00:05'),
+                                      pd.Timestamp('20130101 09:00:06')],
+                                     name='foo'))
+
+    dft
+    dft.rolling(2).sum()
+
+
+Using the time-specification generates variable windows for this sparse data.
+
+.. ipython:: python
+
+    dft.rolling('2s').sum()
+
+Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the
+default of the index) in a DataFrame.
+
+.. ipython:: python
+
+    dft = dft.reset_index()
+    dft
+    dft.rolling('2s', on='foo').sum()
+
+.. _stats.moments.ts-versus-resampling:
+
+Time-aware Rolling vs. Resampling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They
+both operate and perform reductive operations on time-indexed pandas objects.
+
+When using ``.rolling()`` with an offset, the offset is a time-delta. Take a backwards-in-time looking window, and
+aggregate all of the values in that window (including the end-point, but not the start-point). This is the new value
+at that point in the result. These are variable sized windows in time-space for each point of the input. You will get
+a same sized result as the input.
+
+When using ``.resample()`` with an offset, construct a new index that is the frequency of the offset. For each frequency
+bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this
+aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result
+will have the shape of a regular frequency between the min and the max of the original input object.
+
+To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation.
+
 Centering Windows
 ~~~~~~~~~~~~~~~~~
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index f6a1e169afe9d..fd31eb1b584a8 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -1284,7 +1284,11 @@ performing resampling operations during frequency conversion (e.g., converting
 secondly data into 5-minutely data). This is extremely common in, but not
 limited to, financial applications.
 
-``resample`` is a time-based groupby, followed by a reduction method on each of its groups.
+``.resample()`` is a time-based groupby, followed by a reduction method on each of its groups.
+
+..
note:: + + ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion `here ` See some :ref:`cookbook examples ` for some advanced strategies diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 57b0d8895f67b..cdae0d5c27c7d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -3,16 +3,17 @@ v0.19.0 (August ??, 2016) ------------------------- -This is a major release from 0.18.2 and includes a small number of API changes, several new features, +This is a major release from 0.18.1 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` +- ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` -.. contents:: What's new in v0.18.2 +.. contents:: What's new in v0.19.0 :local: :backlinks: none @@ -131,6 +132,64 @@ that forward filling happens automatically taking the most recent non-NaN value. This returns a merged DataFrame with the entries in the same order as the original left passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. +.. _whatsnew_0190.enhancements.rolling_ts: + +``.rolling()`` are now time-series aware +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`) +See the full documentation :ref:`here `. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + +.. ipython:: python + + + dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + + dft + dft.rolling(2).sum() + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + .. 
_whatsnew_0190.enhancements.read_csv_dupe_col_names_support: :func:`read_csv` has improved support for duplicate column names diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e59bec2dbd7e0..dd4be571ef2b4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5342,11 +5342,12 @@ def _add_series_or_dataframe_operations(cls): @Appender(rwindow.rolling.__doc__) def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, axis=0): + win_type=None, on=None, axis=0): axis = self._get_axis_number(axis) return rwindow.rolling(self, window=window, min_periods=min_periods, freq=freq, - center=center, win_type=win_type, axis=axis) + center=center, win_type=win_type, + on=on, axis=axis) cls.rolling = rolling diff --git a/pandas/core/window.py b/pandas/core/window.py index bc4d34529287b..9e2a27adc25a7 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -11,7 +11,11 @@ import numpy as np from collections import defaultdict -from pandas.types.generic import ABCSeries, ABCDataFrame +from pandas.types.generic import (ABCSeries, + ABCDataFrame, + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex) from pandas.types.common import (is_integer, is_bool, is_float_dtype, @@ -26,11 +30,14 @@ GroupByMixin) import pandas.core.common as com import pandas._window as _window +from pandas.tseries.offsets import DateOffset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import Substitution, Appender +from pandas.util.decorators import (Substitution, Appender, + cache_readonly) from textwrap import dedent + _shared_docs = dict() _doc_template = """ @@ -47,19 +54,21 @@ class _Window(PandasObject, SelectionMixin): _attributes = ['window', 'min_periods', 'freq', 'center', 'win_type', - 'axis'] + 'axis', 'on'] exclusions = set() def __init__(self, obj, window=None, min_periods=None, freq=None, - center=False, win_type=None, axis=0, **kwargs): + center=False, win_type=None, axis=0, on=None, **kwargs): if freq is not None: warnings.warn("The freq kw is deprecated and will be removed in a " "future version. 
You can resample prior to passing " "to a window function", FutureWarning, stacklevel=3) + self.__dict__.update(kwargs) self.blocks = [] self.obj = obj + self.on = on self.window = window self.min_periods = min_periods self.freq = freq @@ -72,6 +81,18 @@ def __init__(self, obj, window=None, min_periods=None, freq=None, def _constructor(self): return Window + @property + def is_datetimelike(self): + return None + + @property + def _on(self): + return None + + @property + def is_freq_type(self): + return self.win_type == 'freq' + def validate(self): if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") @@ -83,6 +104,7 @@ def _convert_freq(self, how=None): """ resample according to the how, return a new object """ obj = self._selected_obj + index = None if (self.freq is not None and isinstance(obj, (ABCSeries, ABCDataFrame))): if how is not None: @@ -92,13 +114,24 @@ def _convert_freq(self, how=None): stacklevel=6) obj = obj.resample(self.freq).aggregate(how or 'asfreq') - return obj + + return obj, index def _create_blocks(self, how): """ split data into blocks & return conformed data """ - obj = self._convert_freq(how) - return obj.as_blocks(copy=False).values(), obj + obj, index = self._convert_freq(how) + if index is not None: + index = self._on + + # filter out the on from the object + if self.on is not None: + if obj.ndim == 2: + obj = obj.reindex(columns=obj.columns.difference([self.on]), + copy=False) + blocks = obj.as_blocks(copy=False).values() + + return blocks, obj, index def _gotitem(self, key, ndim, subset=None): """ @@ -152,6 +185,21 @@ def __unicode__(self): return "{klass} [{attrs}]".format(klass=self._window_type, attrs=','.join(attrs)) + def _get_index(self, index=None): + """ + Return index as ndarrays + + Returns + ------- + tuple of (index, index_as_ndarray) + """ + + if self.is_freq_type: + if index is None: + index = self._on + return index, index.asi8 + return index, index + def _prep_values(self, values=None, kill_inf=True, how=None): if values is None: @@ -187,8 +235,8 @@ def _wrap_result(self, result, block=None, obj=None): if obj is None: obj = self._selected_obj - index = obj.index + if isinstance(result, np.ndarray): # coerce if necessary @@ -215,6 +263,9 @@ def _wrap_results(self, results, blocks, obj): obj : conformed data (may be resampled) """ + from pandas import Series + from pandas.core.index import _ensure_index + final = [] for result, block in zip(results, blocks): @@ -223,9 +274,31 @@ def _wrap_results(self, results, blocks, obj): return result final.append(result) + # if we have an 'on' column + # we want to put it back into the results + # in the same location + columns = self._selected_obj.columns + if self.on is not None \ + and not self._on.equals(obj.index): + + name = self._on.name + final.append(Series(self._on, index=obj.index, name=name)) + + if self._selection is not None: + + selection = _ensure_index(self._selection) + + # need to reorder to include original location of + # the on column (if its not already there) + if name not in selection: + columns = self.obj.columns + indexer = columns.get_indexer(selection.tolist() + [name]) + columns = columns.take(sorted(indexer)) + if not len(final): return obj.astype('float64') - return pd.concat(final, axis=1).reindex(columns=obj.columns) + return pd.concat(final, axis=1).reindex(columns=columns, + copy=False) def _center_window(self, result, window): """ center the result in the window """ @@ -271,18 +344,24 @@ def aggregate(self, arg, *args, 
**kwargs): class Window(_Window): """ - Provides rolling transformations. + Provides rolling window calculcations. .. versionadded:: 0.18.0 Parameters ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. + window : int, or offset + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. This is + new in 0.19.0 min_periods : int, default None Minimum number of observations in window required to have a value - (otherwise result is NA). + (otherwise result is NA). For a window that is specified by an offset, + this will default to 1. freq : string or DateOffset object, optional (default None) (DEPRECATED) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. @@ -290,11 +369,91 @@ class Window(_Window): Set the labels at the center of the window. win_type : string, default None Provide a window type. See the notes below. - axis : int, default 0 + on : string, optional + For a DataFrame, column on which to calculate + the rolling window, rather than the index + + .. versionadded:: 0.19.0 + + axis : int or string, default 0 Returns ------- - a Window sub-classed for the particular operation + a Window or Rolling sub-classed for the particular operation + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum() + B + 0 NaN + 1 1.0 + 2 2.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. + + >>> df.rolling(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicity set the min_periods + + >>> df.rolling(2, min_periods=1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + ....: index = [pd.Timestamp('20130101 09:00:00'), + ....: pd.Timestamp('20130101 09:00:02'), + ....: pd.Timestamp('20130101 09:00:03'), + ....: pd.Timestamp('20130101 09:00:05'), + ....: pd.Timestamp('20130101 09:00:06')]) + + >>> df + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. + + >>> df.rolling('2s').sum() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 Notes ----- @@ -305,7 +464,10 @@ class Window(_Window): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - The recognized window types are: + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: * ``boxcar`` * ``triang`` @@ -321,7 +483,8 @@ class Window(_Window): * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). 
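    As a small illustration (the series below is ad-hoc and requires scipy),
    the extra parameter noted for a window type is passed to the aggregation:

    >>> s = pd.Series(np.arange(10.0))
    >>> s.rolling(window=5, win_type='gaussian').mean(std=2)  # doctest: +SKIP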
-""" + + """ def validate(self): super(Window, self).validate() @@ -329,7 +492,7 @@ def validate(self): window = self.window if isinstance(window, (list, tuple, np.ndarray)): pass - elif com.is_integer(window): + elif is_integer(window): if window < 0: raise ValueError("window must be non-negative") try: @@ -400,7 +563,7 @@ def _apply_window(self, mean=True, how=None, **kwargs): window = self._prep_window(**kwargs) center = self.center - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: @@ -529,7 +692,8 @@ def _apply(self, func, name=None, window=None, center=None, if check_minp is None: check_minp = _use_window - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) + index, indexi = self._get_index(index=index) results = [] for b in blocks: try: @@ -551,9 +715,10 @@ def _apply(self, func, name=None, window=None, center=None, def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) - # GH #12373: rolling functions error on float32 data - return cfunc(_ensure_float64(arg), - window, minp, **kwargs) + # ensure we are only rolling on floats + arg = _ensure_float64(arg) + return cfunc(arg, + window, minp, indexi, **kwargs) # calculation function if center: @@ -587,11 +752,13 @@ class _Rolling_and_Expanding(_Rolling): observations inside provided window.""" def count(self): - obj = self._convert_freq() + + blocks, obj, index = self._create_blocks(how=None) + index, indexi = self._get_index(index=index) + window = self._get_window() window = min(window, len(obj)) if not self.center else window - blocks, obj = self._create_blocks(how=None) results = [] for b in blocks: @@ -625,10 +792,12 @@ def apply(self, func, args=(), kwargs={}): _level = kwargs.pop('_level', None) # noqa window = self._get_window() offset = _offset(window, self.center) + index, indexi = self._get_index() def f(arg, window, min_periods): minp = _use_window(min_periods, window) - return _window.roll_generic(arg, window, minp, offset, func, args, + return _window.roll_generic(arg, window, minp, indexi, + offset, func, args, kwargs) return self._apply(f, func, args=args, kwargs=kwargs, @@ -695,10 +864,12 @@ def median(self, how=None, **kwargs): def std(self, ddof=1, *args, **kwargs): nv.validate_window_func('std', args, kwargs) window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(_window.roll_var(arg, window, minp, ddof)) + return _zsqrt(_window.roll_var(arg, window, minp, indexi, + ddof)) return self._apply(f, 'std', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -740,10 +911,12 @@ def kurt(self, **kwargs): def quantile(self, quantile, **kwargs): window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return _window.roll_quantile(arg, window, minp, quantile) + return _window.roll_quantile(arg, window, minp, indexi, + quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) @@ -823,43 +996,63 @@ def _get_corr(a, b): class Rolling(_Rolling_and_Expanding): - """ - Provides rolling window calculcations. - - .. versionadded:: 0.18.0 - Parameters - ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. 
- min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. - Specified as a frequency string or DateOffset object. - center : boolean, default False - Set the labels at the center of the window. - axis : int, default 0 + @cache_readonly + def is_datetimelike(self): + return isinstance(self._on, + (ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex)) + + @cache_readonly + def _on(self): + + if self.on is None: + return self.obj.index + elif (isinstance(self.obj, ABCDataFrame) and + self.on in self.obj.columns): + return pd.Index(self.obj[self.on]) + else: + raise ValueError("invalid on specified as {0}, " + "must be a column (if DataFrame) " + "or None".format(self.on)) - Returns - ------- - a Window sub-classed for the particular operation + def validate(self): + super(Rolling, self).validate() - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. + # we allow rolling on a datetimelike index + if (self.is_datetimelike and + isinstance(self.window, (compat.string_types, DateOffset))): - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - """ + # must be monotonic for on + if not self._on.is_monotonic: + formatted = self.on or 'index' + raise ValueError("{0} must be " + "monotonic".format(formatted)) - def validate(self): - super(Rolling, self).validate() - if not is_integer(self.window): + from pandas.tseries.frequencies import to_offset + try: + freq = to_offset(self.window) + except (TypeError, ValueError): + raise ValueError("passed window {0} in not " + "compat with a datetimelike " + "index".format(self.window)) + + # we don't allow center + if self.center: + raise NotImplementedError("center is not implemented " + "for datetimelike and offset " + "based windows") + + # this will raise ValueError on non-fixed freqs + self.window = freq.nanos + self.win_type = 'freq' + + # min_periods must be an integer + if self.min_periods is None: + self.min_periods = 1 + + elif not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: raise ValueError("window must be non-negative") @@ -876,6 +1069,11 @@ def aggregate(self, arg, *args, **kwargs): @Appender(_doc_template) @Appender(_shared_docs['count']) def count(self): + + # different impl for freq counting + if self.is_freq_type: + return self._apply('roll_count', 'count') + return super(Rolling, self).count() @Substitution(name='rolling') @@ -993,12 +1191,31 @@ class Expanding(_Rolling_and_Expanding): Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. - axis : int, default 0 + axis : int or string, default 0 Returns ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.expanding(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + Notes ----- By default, the result is set to the right edge of the window. 
This can be @@ -1205,6 +1422,25 @@ class EWM(_Rolling): ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + Notes ----- Exactly one of center of mass, span, half-life, and alpha must be provided. @@ -1248,6 +1484,7 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, self.adjust = adjust self.ignore_na = ignore_na self.axis = axis + self.on = None @property def _constructor(self): @@ -1276,7 +1513,7 @@ def _apply(self, func, how=None, **kwargs): y : type of input argument """ - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3693ebdb12e2f..7a35682eee3b0 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -11,7 +11,7 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, - notnull, concat) + notnull, concat, Timestamp) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow @@ -101,7 +101,7 @@ def tests_skip_nuisance(self): expected = pd.concat([r[['A', 'B']].sum(), df[['C']]], axis=1) result = r.sum() - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -319,6 +319,13 @@ class TestRolling(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.rolling(2).sum() + df.rolling(2, min_periods=1).sum() + def test_constructor(self): # GH 12669 @@ -372,6 +379,12 @@ class TestExpanding(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.expanding(2).sum() + def test_constructor(self): # GH 12669 @@ -408,6 +421,12 @@ class TestEWM(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.ewm(com=0.5).mean() + def test_constructor(self): for o in [self.series, self.frame]: c = o.ewm @@ -565,6 +584,7 @@ def _create_data(self): def test_dtypes(self): self._create_data() for f_name, d_name in product(self.funcs.keys(), self.data.keys()): + f = self.funcs[f_name] d = self.data[d_name] exp = self.expects[d_name][f_name] @@ -958,6 +978,7 @@ def test_rolling_median(self): name='median') def test_rolling_min(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_min, np.min, name='min') @@ -970,6 +991,7 @@ def test_rolling_min(self): window=3, min_periods=5) def test_rolling_max(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_max, np.max, name='max') @@ -2890,6 +2912,7 @@ def test_rolling_median_memory_error(self): Series(np.random.randn(n)).rolling(window=2, center=False).median() def test_rolling_min_max_numeric_types(self): + # GH12373 types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] types_test.extend([np.dtype("{}{}".format(sign, width)) @@ -2961,6 +2984,7 @@ def test_rolling(self): r = g.rolling(window=4) for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() 
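+            # the grouped rolling result should match applying the same
+            # rolling window within each group via apply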
expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -3007,6 +3031,7 @@ def test_expanding(self): r = g.expanding() for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) tm.assert_frame_equal(result, expected) @@ -3047,3 +3072,547 @@ def test_expanding_apply(self): result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) tm.assert_frame_equal(result, expected) + + +class TestRollingTS(tm.TestCase): + + # rolling time-series friendly + # xref GH13327 + + def setUp(self): + + self.regular = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}).set_index('A') + + self.ragged = DataFrame({'B': range(5)}) + self.ragged.index = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=[Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')]) + df + df.rolling('2s').sum() + + def test_valid(self): + + df = self.regular + + # not a valid freq + with self.assertRaises(ValueError): + df.rolling(window='foobar') + + # not a datetimelike index + with self.assertRaises(ValueError): + df.reset_index().rolling(window='foobar') + + # non-fixed freqs + for freq in ['2MS', pd.offsets.MonthBegin(2)]: + with self.assertRaises(ValueError): + df.rolling(window=freq) + + for freq in ['1D', pd.offsets.Day(2), '2ms']: + df.rolling(window=freq) + + # non-integer min_periods + for minp in [1.0, 'foo', np.array([1, 2, 3])]: + with self.assertRaises(ValueError): + df.rolling(window='1D', min_periods=minp) + + # center is not implemented + with self.assertRaises(NotImplementedError): + df.rolling(window='1D', center=True) + + def test_on(self): + + df = self.regular + + # not a valid column + with self.assertRaises(ValueError): + df.rolling(window='2s', on='foobar') + + # column is valid + df = df.copy() + df['C'] = pd.date_range('20130101', periods=len(df)) + df.rolling(window='2d', on='C').sum() + + # invalid columns + with self.assertRaises(ValueError): + df.rolling(window='2d', on='B') + + # ok even though on non-selected + df.rolling(window='2d', on='C').B.sum() + + def test_monotonic_on(self): + + # on/index must be monotonic + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}) + + self.assertTrue(df.A.is_monotonic) + df.rolling('2s', on='A').sum() + + df = df.set_index('A') + self.assertTrue(df.index.is_monotonic) + df.rolling('2s').sum() + + # non-monotonic + df.index = reversed(df.index.tolist()) + self.assertFalse(df.index.is_monotonic) + + with self.assertRaises(ValueError): + df.rolling('2s').sum() + + df = df.reset_index() + with self.assertRaises(ValueError): + df.rolling('2s', on='A').sum() + + def test_frame_on(self): + + df = DataFrame({'B': range(5), + 'C': pd.date_range('20130101 09:00:00', + periods=5, + freq='3s')}) + + df['A'] = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + # we are doing simulating using 'on' + expected = (df.set_index('A') + .rolling('2s') + .B + .sum() + .reset_index(drop=True) + ) + + 
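+        # rolling with on='A' should be equivalent to making 'A' the
+        # index first and then rolling over that index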
result = (df.rolling('2s', on='A') + .B + .sum() + ) + tm.assert_series_equal(result, expected) + + # test as a frame + # we should be ignoring the 'on' as an aggregation column + # note that the expected is setting, computing, and reseting + # so the columns need to be switched compared + # to the actual result where they are ordered as in the + # original + expected = (df.set_index('A') + .rolling('2s')[['B']] + .sum() + .reset_index()[['B', 'A']] + ) + + result = (df.rolling('2s', on='A')[['B']] + .sum() + ) + tm.assert_frame_equal(result, expected) + + def test_frame_on2(self): + + # using multiple aggregation columns + df = DataFrame({'A': [0, 1, 2, 3, 4], + 'B': [0, 1, 2, np.nan, 4], + 'C': pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')])}, + columns=['A', 'C', 'B']) + + expected1 = DataFrame({'A': [0., 1, 3, 3, 7], + 'B': [0, 1, 3, np.nan, 4], + 'C': df['C']}, + columns=['A', 'C', 'B']) + + result = df.rolling('2s', on='C').sum() + expected = expected1 + tm.assert_frame_equal(result, expected) + + expected = Series([0, 1, 3, np.nan, 4], name='B') + result = df.rolling('2s', on='C').B.sum() + tm.assert_series_equal(result, expected) + + expected = expected1[['A', 'B', 'C']] + result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum() + tm.assert_frame_equal(result, expected) + + def test_basic_regular(self): + + df = self.regular.copy() + + df.index = pd.date_range('20130101', periods=5, freq='D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='1D').sum() + tm.assert_frame_equal(result, expected) + + df.index = pd.date_range('20130101', periods=5, freq='2D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1).sum() + result = df.rolling(window='2D').sum() + tm.assert_frame_equal(result, expected) + + def test_min_periods(self): + + # compare for min_periods + df = self.regular + + # these slightly different + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s').sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + def test_ragged_sum(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 3, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, np.nan, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s').sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 9] + 
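+        # e.g. the last '4s' window ends at 09:00:06, so it covers the
+        # points at 09:00:03, 09:00:05 and 09:00:06: 2 + 3 + 4 == 9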
tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=3).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 10] + tm.assert_frame_equal(result, expected) + + def test_ragged_mean(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_median(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_quantile(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_std(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.5] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + tm.assert_frame_equal(result, expected) + + def test_ragged_var(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.25] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.] 
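+        # the last '5s' window covers the B values 1, 2, 3 and 4, whose
+        # sample variance (ddof=1) is 5 / 3 == 1 + 2 / 3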
+ tm.assert_frame_equal(result, expected) + + def test_ragged_skew(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_kurt(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 4 + [-1.2] + tm.assert_frame_equal(result, expected) + + def test_ragged_count(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 1, 1, 1] + tm.assert_frame_equal(result, expected) + + df = self.ragged + result = df.rolling(window='1s').count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 2, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).count() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 2, np.nan, 2] + tm.assert_frame_equal(result, expected) + + def test_regular_min(self): + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [0.0, 1, 2, 3, 4]}).set_index('A') + result = df.rolling('1s').min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [5, 4, 3, 4, 5]}).set_index('A') + + tm.assert_frame_equal(result, expected) + result = df.rolling('2s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling('5s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 3] + tm.assert_frame_equal(result, expected) + + def test_ragged_min(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_perf_min(self): + + N = 10000 + + dfp = DataFrame({'B': np.random.randn(N)}, + index=pd.date_range('20130101', + periods=N, + freq='s')) + expected = dfp.rolling(2, min_periods=1).min() + result = dfp.rolling('2s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + expected = dfp.rolling(200, min_periods=1).min() + result = dfp.rolling('200s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + def test_ragged_max(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', 
min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_ragged_apply(self): + + df = self.ragged + + f = lambda x: 1 + result = df.rolling(window='1s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + def test_all(self): + + # simple comparision of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window='1s') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = er.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = r.apply(lambda x: 1) + expected = er.apply(lambda x: 1) + tm.assert_frame_equal(result, expected) + + def test_all2(self): + + # more sophisticated comparision of integer vs. + # time-based windowing + df = DataFrame({'B': np.arange(50)}, + index=pd.date_range('20130101', + periods=50, freq='H') + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window='5H') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() + expected = df.groupby(df.index.day).apply( + agg_by_day).reset_index(level=0, drop=True) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/window.pyx b/pandas/window.pyx index bfe9152477a40..8235d68e2a88b 100644 --- a/pandas/window.pyx +++ b/pandas/window.pyx @@ -1,3 +1,6 @@ +# cython: profile=False +# cython: boundscheck=False, wraparound=False, cdivision=True + from numpy cimport * cimport numpy as np import numpy as np @@ -51,9 +54,10 @@ cdef double nan = NaN cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -# this is our util.pxd from util cimport numeric +from skiplist cimport * + cdef extern from "src/headers/math.h": double sqrt(double x) nogil int signbit(double) nogil @@ -69,16 +73,37 @@ include "skiplist.pyx" # - In Cython x * x is faster than x ** 2 for C types, this should be # periodically revisited to see if it's still true. 
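The reworked ``_check_minp`` just below now accepts ``minp=None``; a rough
sketch of the intended behaviour (illustrative values only, not calls made
anywhere in the patch):

    _check_minp(win=5, minp=None, N=10)  # -> 1, None now defaults to 1
    _check_minp(win=5, minp=3, N=10)     # -> 3
    _check_minp(win=5, minp=7, N=10)     # raises ValueError, minp > win
    _check_minp(win=5, minp=3, N=2)      # -> 3; the window can never fill, so all NaN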
# -# - -def _check_minp(win, minp, N, floor=1): + +def _check_minp(win, minp, N, floor=None): + """ + Parameters + ---------- + win: int + minp: int or None + N: len of window + floor: int, optional + default 1 + + Returns + ------- + minimum period + """ + + if minp is None: + minp = 1 + if not util.is_integer_object(minp): + raise ValueError("min_periods must be an integer") if minp > win: - raise ValueError('min_periods (%d) must be <= window (%d)' - % (minp, win)) + raise ValueError("min_periods (%d) must be <= " + "window (%d)" % (minp, win)) elif minp > N: minp = N + 1 elif minp < 0: raise ValueError('min_periods must be >= 0') + if floor is None: + floor = 1 + return max(minp, floor) # original C implementation by N. Devillard. @@ -96,757 +121,1227 @@ def _check_minp(win, minp, N, floor=1): # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -#------------------------------------------------------------------------------- -# Rolling sum -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_sum(ndarray[double_t] input, int win, int minp): - cdef double val, prev, sum_x = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) +# ---------------------------------------------------------------------- +# The indexer objects for rolling +# These define start/end indexers to compute offsets - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - # Not NaN - if val == val: - nobs += 1 - sum_x += val +cdef class WindowIndexer: - output[i] = NaN + cdef: + ndarray start, end + int64_t N, minp, win + bint is_variable - for i from minp - 1 <= i < N: - val = input[i] + def get_data(self): + return (self.start, self.end, self.N, + self.win, self.minp, + self.is_variable) - if val == val: - nobs += 1 - sum_x += val - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 +cdef class MockFixedWindowIndexer(WindowIndexer): + """ - if nobs >= minp: - output[i] = sum_x - else: - output[i] = NaN + We are just checking parameters of the indexer, + and returning a consistent API with fixed/variable + indexers. 
- return output + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring -#------------------------------------------------------------------------------- -# Rolling mean -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_mean(ndarray[double_t] input, - int win, int minp): - cdef: - double val, prev, result, sum_x = 0 - Py_ssize_t nobs = 0, i, neg_ct = 0 - Py_ssize_t N = len(input) + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): - cdef ndarray[double_t] output = np.empty(N, dtype=float) - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) + self.start = np.empty(0, dtype='int64') + self.end = np.empty(0, dtype='int64') + self.win = win - # Not NaN - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - output[i] = NaN +cdef class FixedWindowIndexer(WindowIndexer): + """ + create a fixed length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments - for i from minp - 1 <= i < N: - val = input[i] + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring the unit - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): + cdef ndarray start_s, start_e, end_s, end_e - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - if signbit(prev): - neg_ct -= 1 + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) - if nobs >= minp: - result = sum_x / nobs - if neg_ct == 0 and result < 0: - # all positive - output[i] = 0 - elif neg_ct == nobs and result > 0: - # all negative - output[i] = 0 - else: - output[i] = result - else: - output[i] = NaN + start_s = np.zeros(win, dtype='int64') + start_e = np.arange(win, self.N, dtype='int64') - win + 1 + self.start = np.concatenate([start_s, start_e]) - return output + end_s = np.arange(win, dtype='int64') + 1 + end_e = start_e + win + self.end = np.concatenate([end_s, end_e]) + self.win = win -#------------------------------------------------------------------------------- -# Exponentially weighted moving average -def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): +cdef class VariableWindowIndexer(WindowIndexer): """ - Compute exponentially-weighted moving average using center-of-mass. 
+ create a variable length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments Parameters ---------- - input : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: ndarray + index of the input - Returns - ------- - y : ndarray """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + ndarray index): - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output + self.is_variable = 1 + self.N = len(index) + self.minp = _check_minp(win, minp, self.N) - minp = max(minp, 1) + self.start = np.empty(self.N, dtype='int64') + self.start.fill(-1) - cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - cdef Py_ssize_t i, nobs + self.end = np.empty(self.N, dtype='int64') + self.end.fill(-1) - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha + self.build(index, win) - weighted_avg = input[0] - is_observation = (weighted_avg == weighted_avg) - nobs = int(is_observation) - output[0] = weighted_avg if (nobs >= minp) else NaN - old_wt = 1. + # max window size + self.win = (self.end - self.start).max() - for i from 1 <= i < N: - cur = input[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - if is_observation or (not ignore_na): - old_wt *= old_wt_factor - if is_observation: - if weighted_avg != cur: # avoid numerical errors on constant series - weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif is_observation: - weighted_avg = cur + def build(self, ndarray[int64_t] index, int64_t win): - output[i] = weighted_avg if (nobs >= minp) else NaN + cdef: + ndarray[int64_t] start, end + int64_t start_bound, end_bound, N + Py_ssize_t i, j - return output + start = self.start + end = self.end + N = self.N -#------------------------------------------------------------------------------- -# Exponentially weighted moving covariance + start[0] = 0 + end[0] = 1 -def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, N): + end_bound = index[i] + start_bound = index[i] - win + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + +def get_window_indexer(input, win, minp, index, floor=None, + use_mock=True): """ - Compute exponentially-weighted moving variance using center-of-mass. 
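The ``build`` method above is the heart of the time-based windows: for each
position ``i`` it records the half-open span of row offsets whose timestamps
fall within ``(index[i] - win, index[i]]``. A rough pure-Python sketch of the
same idea, assuming a monotonic ``int64`` index (e.g. ``DatetimeIndex.asi8``)
with ``win`` in the same units:

    def variable_window_bounds(index, win):
        n = len(index)
        start = [0] * n
        end = [1] * n
        for i in range(1, n):
            # the window at i covers (index[i] - win, index[i]]
            start_bound = index[i] - win
            start[i] = i
            for j in range(start[i - 1], i):
                if index[j] > start_bound:
                    start[i] = j
                    break
            end[i] = i + 1
        return start, end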
+ return the correct window indexer for the computation Parameters ---------- - input_x : ndarray (float64 type) - input_y : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - bias: int + input: 1d ndarray + win: integer, window size + minp: integer, minimum periods + index: 1d ndarray, optional + index to the input array + floor: optional + unit for flooring the unit + use_mock: boolean, default True + if we are a fixed indexer, return a mock indexer + instead of the FixedWindow Indexer. This is a type + compat Indexer that allows us to use a standard + code path with all of the indexers. Returns ------- - y : ndarray - """ + tuple of 1d int64 ndarrays of the offsets & data about the window - cdef Py_ssize_t N = len(input_x) - if len(input_y) != N: - raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) + """ - cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y - cdef Py_ssize_t i, nobs + if index is not None: + indexer = VariableWindowIndexer(input, win, minp, index) + elif use_mock: + indexer = MockFixedWindowIndexer(input, win, minp, index, floor) + else: + indexer = FixedWindowIndexer(input, win, minp, index, floor) + return indexer.get_data() - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha +# ---------------------------------------------------------------------- +# Rolling count +# this is only an impl for index not None, IOW, freq aware - mean_x = input_x[0] - mean_y = input_y[0] - is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) - nobs = int(is_observation) - if not is_observation: - mean_x = NaN - mean_y = NaN - output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN - cov = 0. - sum_wt = 1. - sum_wt2 = 1. - old_wt = 1. - for i from 1 <= i < N: - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - if mean_x != cur_x: # avoid numerical errors on constant series - mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) - if mean_y != cur_y: # avoid numerical errors on constant series - mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y +def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, count_x = 0.0 + int64_t s, e, nobs, N + Py_ssize_t i, j + ndarray[int64_t] start, end + ndarray[double_t] output - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - output[i] = ((numerator / denominator) * cov) if (denominator > 0.) 
else NaN - else: - output[i] = cov - else: - output[i] = NaN + start, end, N, win, minp, _ = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) - return output + with nogil: -#---------------------------------------------------------------------- -# Rolling variance + for i in range(0, N): + s = start[i] + e = end[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - """ - Numerically stable implementation using Welford's method. - """ - cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta - cdef Py_ssize_t i - cdef Py_ssize_t N = len(input) + if i == 0: - cdef ndarray[double_t] output = np.empty(N, dtype=float) + # setup + count_x = 0.0 + for j in range(s, e): + val = input[j] + if val == val: + count_x += 1.0 - minp = _check_minp(win, minp, N) + else: - # Check for windows larger than array, addresses #7297 - win = min(win, N) + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + count_x -= 1.0 - with nogil: - # Over the first window, observations can only be added, never removed - for i from 0 <= i < win: - val = input[i] + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + count_x += 1.0 - # Not NaN - if val == val: - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 + if count_x >= minp: + output[i] = count_x else: - val = NaN + output[i] = NaN - output[i] = val + return output - # After the first window, observations can both be added and removed - for i from win <= i < N: - val = input[i] - prev = input[i - win] +# ---------------------------------------------------------------------- +# Rolling sum - if val == val: - if prev == prev: - # Adding one observation and removing another one - delta = val - prev - prev -= mean_x - mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta - else: - # Adding one observation and not removing any - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - elif prev == prev: - # Adding no new observation, but removing one - nobs -= 1 - if nobs: - delta = (prev - mean_x) - mean_x -= delta / nobs - ssqdm_x -= delta * (prev - mean_x) - else: - mean_x = 0 - ssqdm_x = 0 - # Variance is unchanged if no observation is added or removed - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - output[i] = val +cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil: + cdef double result - return output + if nobs >= minp: + result = sum_x + else: + result = NaN + return result -#------------------------------------------------------------------------------- -# Rolling skewness -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_skew(ndarray[double_t] input, int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) +cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ add a value from the sum calc """ - # 3 components of the skewness equation - cdef double A, B, C, R + # Not NaN + if val == val: + 
nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val +cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ remove a value from the sum calc """ - output[i] = NaN + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val - for i from minp - 1 <= i < N: - val = input[i] - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val +def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) - nobs -= 1 - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: - output[i] = NaN - else: - R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) - else: - output[i] = NaN + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster - return output + if is_variable: -#------------------------------------------------------------------------------- -# Rolling kurtosis -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_kurt(ndarray[double_t] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) + # variable window + with nogil: - cdef ndarray[double_t] output = np.empty(N, dtype=float) + for i in range(0, N): + s = start[i] + e = end[i] - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K + if i == 0: - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + add_sum(input[j], &nobs, &sum_x) - # Not NaN - if val == val: - nobs += 1 + else: - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val + # calculate deletes + for j in range(start[i - 1], s): + remove_sum(input[j], &nobs, &sum_x) - output[i] = NaN + # calculate adds + for j in range(end[i - 1], e): + add_sum(input[j], &nobs, &sum_x) - for i from minp - 1 <= i < N: - val = input[i] + output[i] = calc_sum(minp, nobs, sum_x) - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val + else: - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev + # fixed window - nobs -= 1 + with nogil: - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - if B == 0 or nobs < 4: - output[i] = NaN + for i in range(0, minp - 1): + add_sum(input[i], &nobs, &sum_x) + output[i] = NaN - else: - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) + for i in range(minp - 1, N): + val = input[i] + add_sum(val, &nobs, &sum_x) 
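+                # slide the fixed window: the newest value was added above;
+                # once the window is full, the value that just dropped out of
+                # it is removed again below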
- output[i] = K + if i > win - 1: + prev_x = input[i - win] + remove_sum(prev_x, &nobs, &sum_x) - else: - output[i] = NaN + output[i] = calc_sum(minp, nobs, sum_x) return output -#------------------------------------------------------------------------------- -# Rolling median, min, max +# ---------------------------------------------------------------------- +# Rolling mean -from skiplist cimport * -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_median_c(ndarray[float64_t] arg, int win, int minp): - cdef: - double val, res, prev - bint err=0 - int ret=0 - skiplist_t *sl - Py_ssize_t midpoint, nobs = 0, i +cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs, + Py_ssize_t neg_ct, double sum_x) nogil: + cdef double result + if nobs >= minp: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + # all positive + result = 0 + elif neg_ct == nobs and result > 0: + # all negative + result = 0 + else: + pass + else: + result = NaN + return result - cdef Py_ssize_t N = len(arg) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - sl = skiplist_init(win) - if sl == NULL: - raise MemoryError("skiplist_init failed") +cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ add a value from the mean calc """ - minp = _check_minp(win, minp, N) + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val + if signbit(val): + neg_ct[0] = neg_ct[0] + 1 - with nogil: - for i from 0 <= i < minp - 1: - val = arg[i] - # Not NaN - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - output[i] = NaN +cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ remove a value from the mean calc """ + + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val + if signbit(val): + neg_ct[0] = neg_ct[0] - 1 - with nogil: - if not err: - for i from minp - 1 <= i < N: - val = arg[i] +def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, result, sum_x = 0 + int64_t s, e + bint is_variable + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) if i > win - 1: - prev = arg[i - win] + prev_x = input[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + return output + +# ---------------------------------------------------------------------- +# Rolling variance + + +cdef inline double 
calc_var(int64_t minp, int ddof, double nobs, + double ssqdm_x) nogil: + cdef double result + + # Variance is unchanged if no observation is added or removed + if (nobs >= minp) and (nobs > ddof): + + # pathological case + if nobs == 1: + result = 0 + else: + result = ssqdm_x / (nobs - ddof) + if result < 0: + result = 0 + else: + result = NaN + + return result + + +cdef inline void add_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ add a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + +cdef inline void remove_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ remove a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + if nobs[0]: + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] - delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + else: + mean_x[0] = 0 + ssqdm_x[0] = 0 + + +def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, + object index, int ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef: + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + int64_t s, e + bint is_variable + Py_ssize_t i, j, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # Check for windows larger than array, addresses #7297 + win = min(win, N) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we + # have 2 paths but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + # calculate deletes + for j in range(start[i - 1], s): + remove_var(input[j], &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + else: + + with nogil: + + # Over the first window, observations can only be added, never + # removed + for i from 0 <= i < win: + add_var(input[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + # After the first window, observations can both be added and + # removed + for i from win <= i < N: + val = input[i] + prev = input[i - win] + + if val == val: if prev == prev: - skiplist_remove(sl, prev) - nobs -= 1 + # Adding one observation and removing another one + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + + else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + return output + + +# ---------------------------------------------------------------------- +# Rolling skewness + +cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, + double xxx) nogil: + cdef double result, dnobs + cdef double A, B, C, R + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + B = xx / dnobs - A * A + C = xxx / dnobs - A * 
A * A - 3 * A * B + if B <= 0 or nobs < 3: + result = NaN + else: + R = sqrt(B) + result = ((sqrt(dnobs * (dnobs - 1.)) * C) / + ((dnobs - 2) * R * R * R)) + else: + result = NaN + + return result + +cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ add a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + +cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ remove a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + + +def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + + if i > win - 1: + prev = input[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling kurtosis + + +cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, + double xxx, double xxxx) nogil: + cdef double result, dnobs + cdef double A, B, C, D, R, K + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + R = A * A + B = xx / dnobs - R + R = R * A + C = xxx / dnobs - R - 3 * A * B + R = R * A + D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A + + if B == 0 or nobs < 4: + result = NaN + else: + K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) + result = K / ((dnobs - 2.) 
* (dnobs - 3.)) + else: + result = NaN + + return result + +cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ add a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + xxxx[0] = xxxx[0] + val * val * val * val + +cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ remove a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + xxxx[0] = xxxx[0] - val * val * val * val + + +def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + else: + + with nogil: + + for i from 0 <= i < minp - 1: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + + if i > win - 1: + prev = input[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling median, min, max + + +def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, res, prev + bint err=0, is_variable + int ret=0 + skiplist_t *sl + Py_ssize_t i, j + int64_t nobs = 0, N, s, e + int midpoint + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + if sl == NULL: + raise MemoryError("skiplist_init failed") + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + val = input[i] if val == val: nobs += 1 err = skiplist_insert(sl, val) != 1 if err: break - if nobs >= minp: - midpoint = nobs / 2 - if nobs % 2: - res = skiplist_get(sl, midpoint, &ret) - else: - res = (skiplist_get(sl, midpoint, &ret) + - skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist_remove(sl, val) + nobs -= 1 + + # calculate adds + for j in 
range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + if nobs >= minp: + midpoint = (nobs / 2) + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) else: - res = NaN + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN - output[i] = res + output[i] = res - skiplist_destroy(sl) + skiplist_destroy(sl) if err: raise MemoryError("skiplist_insert failed") return output -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Moving maximum / minimum code taken from Bottleneck under the terms # of its Simplified BSD license # https://github.com/kwgoodman/bottleneck -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_max(ndarray[numeric] a, int window, int minp): + +cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: + + if numeric in cython.floating: + if ai == ai: + nobs[0] = nobs[0] + 1 + elif is_max: + if numeric == cython.float: + ai = MINfloat32 + else: + ai = MINfloat64 + else: + if numeric == cython.float: + ai = MAXfloat32 + else: + ai = MAXfloat64 + + else: + nobs[0] = nobs[0] + 1 + + return ai + + +cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: + """ remove a value from the mm calc """ + if numeric in cython.floating and aold == aold: + nobs[0] = nobs[0] - 1 + + +cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, + numeric value) nogil: + cdef numeric result + + if numeric in cython.floating: + if nobs >= minp: + result = value + else: + result = NaN + else: + result = value + + return result + + +def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - a: numpy array + input: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN + index: ndarray, optional + index for window computation """ - return _roll_min_max(a, window, minp, 1) + return _roll_min_max(input, win, minp, index, is_max=1) + -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_min(ndarray[numeric] a, int window, int minp): +def roll_min(ndarray[numeric] input, int64_t win, int64_t minp, + object index): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - a: numpy array + input: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN + index: ndarray, optional + index for window computation """ - return _roll_min_max(a, window, minp, 0) - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef _roll_min_max(ndarray[numeric] a, int window, int minp, bint is_max): - "Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs." 
- cdef numeric ai, aold - cdef Py_ssize_t count - cdef Py_ssize_t* death - cdef numeric* ring - cdef numeric* minvalue - cdef numeric* end - cdef numeric* last - cdef Py_ssize_t i0 - cdef np.npy_intp *dim - dim = PyArray_DIMS(a) - cdef Py_ssize_t n0 = dim[0] - cdef np.npy_intp *dims = [n0] - cdef bint should_replace - cdef np.ndarray[numeric, ndim=1] y = PyArray_EMPTY(1, dims, PyArray_TYPE(a), 0) - - if window < 1: - raise ValueError('Invalid window size %d' - % (window)) - - if minp > window: - raise ValueError('Invalid min_periods size %d greater than window %d' - % (minp, window)) - - minp = _check_minp(window, minp, n0) - with nogil: - ring = malloc(window * sizeof(numeric)) - death = malloc(window * sizeof(Py_ssize_t)) - end = ring + window - last = ring + return _roll_min_max(input, win, minp, index, is_max=0) + +cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index, bint is_max): + """ + Moving min/max of 1d array of any numeric type along axis=0 + ignoring NaNs. + """ + + cdef: + numeric ai + bint is_variable, should_replace + int64_t s, e, N, i, j, removed + Py_ssize_t nobs = 0 + ndarray[int64_t] starti, endi + ndarray[numeric, ndim=1] output + cdef: + int64_t* death + numeric* ring + numeric* minvalue + numeric* end + numeric* last + + cdef: + cdef numeric r + + starti, endi, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index) + + output = np.empty(N, dtype=input.dtype) + + if is_variable: + + with nogil: + + for i in range(N): + s = starti[i] + e = endi[i] + + r = input[s] + nobs = 0 + for j in range(s, e): + + # adds, death at the i offset + ai = init_mm(input[j], &nobs, is_max) + + if is_max: + if ai > r: + r = ai + else: + if ai < r: + r = ai + + output[i] = calc_mm(minp, nobs, r) + + else: + + # setup the rings of death! 
+ ring = malloc(win * sizeof(numeric)) + death = malloc(win * sizeof(int64_t)) + + end = ring + win + last = ring minvalue = ring - ai = a[0] - if numeric in cython.floating: - if ai == ai: - minvalue[0] = ai - elif is_max: - minvalue[0] = MINfloat64 - else: - minvalue[0] = MAXfloat64 - else: - minvalue[0] = ai - death[0] = window - - count = 0 - for i0 in range(n0): - ai = a[i0] - if numeric in cython.floating: - if ai == ai: - count += 1 - elif is_max: - ai = MINfloat64 + ai = input[0] + minvalue[0] = init_mm(input[0], &nobs, is_max) + death[0] = win + nobs = 0 + + with nogil: + + for i in range(N): + ai = init_mm(input[i], &nobs, is_max) + + if i >= win: + remove_mm(input[i - win], &nobs) + + if death[minvalue - ring] == i: + minvalue = minvalue + 1 + if minvalue >= end: + minvalue = ring + + if is_max: + should_replace = ai >= minvalue[0] else: - ai = MAXfloat64 - else: - count += 1 - if i0 >= window: - aold = a[i0 - window] - if aold == aold: - count -= 1 - if death[minvalue-ring] == i0: - minvalue += 1 - if minvalue >= end: - minvalue = ring - should_replace = ai >= minvalue[0] if is_max else ai <= minvalue[0] - if should_replace: - minvalue[0] = ai - death[minvalue-ring] = i0 + window - last = minvalue - else: - should_replace = last[0] <= ai if is_max else last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - should_replace = last[0] <= ai if is_max else last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i0 + window - if numeric in cython.floating: - if count >= minp: - y[i0] = minvalue[0] + should_replace = ai <= minvalue[0] + if should_replace: + + minvalue[0] = ai + death[minvalue - ring] = i + win + last = minvalue + else: - y[i0] = NaN - else: - y[i0] = minvalue[0] - for i0 in range(minp - 1): - if numeric in cython.floating: - y[i0] = NaN - else: - y[i0] = 0 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i + win + + output[i] = calc_mm(minp, nobs, minvalue[0]) + + for i in range(minp - 1): + if numeric in cython.floating: + output[i] = NaN + else: + output[i] = 0 + + free(ring) + free(death) + + # print("output: {0}".format(output)) + return output - free(ring) - free(death) - return y -def roll_quantile(ndarray[float64_t, cast=True] input, int win, - int minp, double quantile): +def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, + int64_t minp, object index, double quantile): """ O(N log(window)) implementation using skip list """ - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - + cdef: + double val, prev, midpoint + IndexableSkiplist skiplist + int64_t nobs = 0, i, j, s, e, N + Py_ssize_t idx + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) skiplist = IndexableSkiplist(win) - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] + for i in range(0, N): + s = 
start[i] + e = end[i] - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) + if i == 0: - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] + # setup + val = input[i] + if val == val: + nobs += 1 + skiplist.insert(val) - if i > win - 1: - prev = input[i - win] + else: - if prev == prev: - skiplist.remove(prev) - nobs -= 1 + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist.remove(val) + nobs -= 1 - if val == val: - nobs += 1 - skiplist.insert(val) + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + skiplist.insert(val) if nobs >= minp: - idx = int((quantile / 1.) * (nobs - 1)) + idx = int(quantile * (nobs - 1)) output[i] = skiplist.get(idx) else: output[i] = NaN return output + def roll_generic(ndarray[float64_t, cast=True] input, - int win, int minp, int offset, - object func, object args, object kwargs): - cdef ndarray[double_t] output, counts, bufarr - cdef Py_ssize_t i, n - cdef float64_t *buf - cdef float64_t *oldbuf + int64_t win, int64_t minp, object index, + int offset, object func, + object args, object kwargs): + cdef: + ndarray[double_t] output, counts, bufarr + float64_t *buf + float64_t *oldbuf + int64_t nobs = 0, i, j, s, e, N + bint is_variable + ndarray[int64_t] start, end if not input.flags.c_contiguous: input = input.copy('C') @@ -855,36 +1350,60 @@ def roll_generic(ndarray[float64_t, cast=True] input, if n == 0: return input - minp = _check_minp(win, minp, n, floor=0) - output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index, + floor=0) + output = np.empty(N, dtype=float) - # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, n) - offset): - if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) - else: - output[i] = NaN + counts = roll_sum(np.concatenate([np.isfinite(input).astype(float), + np.array([0.] 
* offset)]), + win, minp, index)[offset:] - # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data - for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf - if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) - else: - output[i] = NaN - bufarr.data = oldbuf + if is_variable: - # truncated windows at the end - for i from int_max(n - offset, 0) <= i < n: - if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) - else: - output[i] = NaN + # variable window + if offset != 0: + raise ValueError("unable to roll_generic with a non-zero offset") + + for i in range(0, N): + s = start[i] + e = end[i] + + if counts[i] >= minp: + output[i] = func(input[s:e], *args, **kwargs) + else: + output[i] = NaN + + else: + + # truncated windows at the beginning, through first full-length window + for i from 0 <= i < (int_min(win, N) - offset): + if counts[i] >= minp: + output[i] = func(input[0: (i + offset + 1)], *args, **kwargs) + else: + output[i] = NaN + + # remaining full-length windows + buf = input.data + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + for i from (win - offset) <= i < (N - offset): + buf = buf + 1 + bufarr.data = buf + if counts[i] >= minp: + output[i] = func(bufarr, *args, **kwargs) + else: + output[i] = NaN + bufarr.data = oldbuf + + # truncated windows at the end + for i from int_max(N - offset, 0) <= i < N: + if counts[i] >= minp: + output[i] = func(input[int_max(i + offset - win + 1, 0): N], + *args, + **kwargs) + else: + output[i] = NaN return output @@ -952,3 +1471,179 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, output[in_i] = NaN return output + +# ---------------------------------------------------------------------- +# Exponentially weighted moving average + + +def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, + int minp): + """ + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + weighted_avg = input[0] + is_observation = (weighted_avg == weighted_avg) + nobs = int(is_observation) + output[0] = weighted_avg if (nobs >= minp) else NaN + old_wt = 1. + + for i from 1 <= i < N: + cur = input[i] + is_observation = (cur == cur) + nobs += int(is_observation) + if weighted_avg == weighted_avg: + + if is_observation or (not ignore_na): + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ((old_wt * weighted_avg) + + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. 
+ elif is_observation: + weighted_avg = cur + + output[i] = weighted_avg if (nobs >= minp) else NaN + + return output + +# ---------------------------------------------------------------------- +# Exponentially weighted moving covariance + + +def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, + double_t com, int adjust, int ignore_na, int minp, int bias): + """ + Compute exponentially-weighted moving variance using center-of-mass. + + Parameters + ---------- + input_x : ndarray (float64 type) + input_y : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + bias: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input_x) + if len(input_y) != N: + raise ValueError("arrays are of different lengths " + "(%d and %d)" % (N, len(input_y))) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + mean_x = input_x[0] + mean_y = input_y[0] + is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) + nobs = int(is_observation) + if not is_observation: + mean_x = NaN + mean_y = NaN + output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN + cov = 0. + sum_wt = 1. + sum_wt2 = 1. + old_wt = 1. + + for i from 1 <= i < N: + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += int(is_observation) + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + + # avoid numerical errors on constant series + if mean_x != cur_x: + mean_x = ((old_wt * old_mean_x) + + (new_wt * cur_x)) / (old_wt + new_wt) + + # avoid numerical errors on constant series + if mean_y != cur_y: + mean_y = ((old_wt * old_mean_y) + + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * + (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * + (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. 
+ elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + if (denominator > 0.): + output[i] = ((numerator / denominator) * cov) + else: + output[i] = NaN + else: + output[i] = cov + else: + output[i] = NaN + + return output diff --git a/setup.py b/setup.py index 0bff49c4976b8..58965fe9ae6d6 100755 --- a/setup.py +++ b/setup.py @@ -430,9 +430,9 @@ def pxd(name): 'depends': [srcpath('generated', suffix='.pyx'), srcpath('join', suffix='.pyx')]}, _window={'pyxfile': 'window', - 'pxdfiles': ['src/skiplist','src/util'], - 'depends': ['pandas/src/skiplist.pyx', - 'pandas/src/skiplist.h']}, + 'pxdfiles': ['src/skiplist', 'src/util'], + 'depends': ['pandas/src/skiplist.pyx', + 'pandas/src/skiplist.h']}, parser={'pyxfile': 'parser', 'depends': ['pandas/src/parser/tokenizer.h', 'pandas/src/parser/io.h', From 57b373c97a8cd72f29a6206c5859661d8b926a97 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Thu, 21 Jul 2016 02:25:33 +0900 Subject: [PATCH 40/50] CLN: Remove a test case about Timestamp to TestTimestamp (#13722) --- pandas/tseries/tests/test_timedeltas.py | 1 - pandas/tseries/tests/test_timeseries.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 36ae479c3dfcc..659101cb4cad2 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -188,7 +188,6 @@ def test_construction(self): self.assertEqual(Timedelta('').value, iNaT) self.assertEqual(Timedelta('nat').value, iNaT) self.assertEqual(Timedelta('NAT').value, iNaT) - self.assertTrue(isnull(Timestamp('nat'))) self.assertTrue(isnull(Timedelta('nat'))) # offset diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 59fc147ead4eb..9c97749c87103 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4389,6 +4389,8 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) + self.assertTrue(isnull(Timestamp('nat'))) + def test_roundtrip(self): # test value to string and back conversions From b25a2a1259f33ce8123b7f239f109ae42155d02c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 20 Jul 2016 17:11:18 -0400 Subject: [PATCH 41/50] DOC/DEPR: pivot_annual closes #736 Author: sinhrks Closes #13706 from sinhrks/pivot_annual and squashes the following commits: d097bab [sinhrks] DOC/DEPR: pivot_annual --- doc/source/cookbook.rst | 13 +++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/tests/test_util.py | 9 ++++++--- pandas/tseries/util.py | 8 ++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 0dbc79415af0b..38a816060e1bc 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -679,6 +679,19 @@ The :ref:`Pivot ` docs. 'Employed' : lambda x : sum(x), 'Grade' : lambda x : sum(x) / len(x)}) +`Plot pandas DataFrame with year over year data +`__ + +To create year and month crosstabulation: + +.. 
ipython:: python + + df = pd.DataFrame({'value': np.random.randn(36)}, + index=pd.date_range('2011-01-01', freq='M', periods=36)) + + pd.pivot_table(df, index=df.index.month, columns=df.index.year, + values='value', aggfunc='sum') + Apply ***** diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cdae0d5c27c7d..ee77660795852 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -589,7 +589,7 @@ Deprecations - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) - top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - +- ``pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) .. _whatsnew_0190.prior_deprecations: diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py index 9c5c9b7a03445..9d992995df3a7 100644 --- a/pandas/tseries/tests/test_util.py +++ b/pandas/tseries/tests/test_util.py @@ -21,7 +21,8 @@ def test_daily(self): rng = date_range('1/1/2000', '12/31/2004', freq='D') ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_annual(ts, 'D') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'D') doy = ts.index.dayofyear doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 @@ -53,7 +54,8 @@ def test_hourly(self): hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 hoy += 1 - annual = pivot_annual(ts_hourly) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts_hourly) ts_hourly = ts_hourly.astype(float) for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: @@ -78,7 +80,8 @@ def test_monthly(self): rng = date_range('1/1/2000', '12/31/2004', freq='M') ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_annual(ts, 'M') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'M') month = ts.index.month for i in range(1, 13): diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 98a93d22b09a6..7bac0567ea5c6 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -1,3 +1,5 @@ +import warnings + from pandas.compat import lrange import numpy as np from pandas.types.common import _ensure_platform_int @@ -7,6 +9,8 @@ def pivot_annual(series, freq=None): """ + Deprecated. Use ``pivot_table`` instead. + Group a series by years, taking leap years into account. The output has as many rows as distinct years in the original series, @@ -35,6 +39,10 @@ def pivot_annual(series, freq=None): ------- annual : DataFrame """ + + msg = "pivot_annual is deprecated. 
Use pivot_table instead" + warnings.warn(msg, FutureWarning) + index = series.index year = index.year years = nanops.unique1d(year) From 016b35276eea344b861147dfff2d4ff8ae52aadc Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 20 Jul 2016 17:22:45 -0400 Subject: [PATCH 42/50] PERF: Improve Period hashing closes #12817 Author: sinhrks Closes #13705 from sinhrks/period_hash and squashes the following commits: e1fb7f4 [sinhrks] PERF: Improve Period hasing --- asv_bench/benchmarks/period.py | 26 +++++- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/src/period.pyx | 2 +- pandas/tseries/tests/test_base.py | 138 ++++++++++++++++++++++------ pandas/tseries/tests/test_period.py | 13 +++ 5 files changed, 152 insertions(+), 29 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 012030a71ac82..c1b89ae1db75b 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,4 @@ -from pandas import PeriodIndex, date_range +from pandas import Series, Period, PeriodIndex, date_range class create_period_index_from_date_range(object): @@ -7,3 +7,27 @@ class create_period_index_from_date_range(object): def time_period_index(self): # Simulate irregular PeriodIndex PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') + + +class period_algorithm(object): + goal_time = 0.2 + + def setup(self): + data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), + Period('2011-03', freq='M'), Period('2011-04', freq='M')] + self.s = Series(data * 1000) + self.i = PeriodIndex(data, freq='M') + + def time_period_series_drop_duplicates(self): + self.s.drop_duplicates() + + def time_period_index_drop_duplicates(self): + self.i.drop_duplicates() + + def time_period_series_value_counts(self): + self.s.value_counts() + + def time_period_index_value_counts(self): + self.i.value_counts() + + diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index ee77660795852..73ce39b66fc27 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -628,6 +628,8 @@ Performance Improvements - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index.difference`` (:issue:`12044`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) +- Improved performance of hashing ``Period`` (:issue:`12817`) + .. 
_whatsnew_0190.bug_fixes: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 37f265ede07e7..45743d1cf70ff 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -727,7 +727,7 @@ cdef class _Period(object): (type(self).__name__, type(other).__name__)) def __hash__(self): - return hash((self.ordinal, self.freq)) + return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 4aa1e2f5d33dd..05f7d9d9ce7b8 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -491,13 +491,15 @@ def test_value_counts_unique(self): for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + tz=tz) exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, tz=tz) @@ -507,15 +509,20 @@ def test_value_counts_unique(self): '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + tz=tz) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], tz=tz) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), + expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -654,6 +661,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -1303,23 +1331,29 @@ def test_value_counts_unique(self): exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - 
tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = TimedeltaIndex( - ['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00', - '1 days 08:00:00', '1 days 08:00:00', pd.NaT]) + idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', + '1 days 09:00:00', '1 days 08:00:00', + '1 days 08:00:00', pd.NaT]) exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT - ]) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', + pd.NaT]) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -1454,6 +1488,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') @@ -2121,8 +2176,8 @@ def test_value_counts_unique(self): # GH 7735 idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') + idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + freq='H') exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', '2011-01-01 15:00', @@ -2131,24 +2186,31 @@ def test_value_counts_unique(self): '2011-01-01 10:00', '2011-01-01 09:00'], freq='H') expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) - expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.period_range('2011-01-01 09:00', freq='H', + periods=10) tm.assert_index_equal(idx.unique(), expected) idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') - exp_idx = PeriodIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + freq='H') expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = PeriodIndex( - 
['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], freq='H') expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -2164,6 +2226,28 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertEqual(idx.freq, result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.period_range('2011-01-01', '2011-01-31', freq='D', + name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_order_compat(self): def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c90cbbf80086a..e3a67289a587b 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -462,6 +462,19 @@ def test_period_deprecated_freq(self): p = Period('2016-03-01 09:00', freq=exp) tm.assertIsInstance(p, Period) + def test_hash(self): + self.assertEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01-01', freq='D')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='3M')), + hash(Period('2011-01', freq='2M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-02', freq='M'))) + def test_repr(self): p = Period('Jan-2000') self.assertIn('2000-01', repr(p)) From 49621311e7812b4bacd487421057ef3f79434bdd Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 20 Jul 2016 17:25:12 -0400 Subject: [PATCH 43/50] MAINT: Removed some warnings in tests Per discussion with @jreback here. 
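For reference, a minimal sketch of the behaviour the reworked categorical ``value_counts`` tests assert (illustration only, not part of this patch; assumes the 0.19 development behaviour where NaN is a missing value and never a category):

    import pandas as pd

    # None entries are missing values, not categories
    s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None],
                                 categories=["a", "b"]))

    s.value_counts()              # a -> 2, b -> 1 (NaN excluded by default)
    s.value_counts(dropna=False)  # NaN -> 3, a -> 2, b -> 1
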
Author: gfyoung Closes #13702 from gfyoung/test-warnings-remove and squashes the following commits: e7292d3 [gfyoung] MAINT: Removed some warnings in tests --- pandas/core/internals.py | 2 +- pandas/tests/test_categorical.py | 81 ++++++++++++-------------------- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ff12cfddbe9cd..8e77486457546 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1490,7 +1490,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] - mask = mask.reshape(new_values.shape) + mask = _safe_reshape(mask, new_values.shape) new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 35b1b8c1bf341..57b8bb1531551 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101,E1103,W0232 -import os import sys from datetime import datetime from distutils.version import LooseVersion @@ -2906,54 +2905,41 @@ def test_value_counts(self): tm.assert_series_equal(res, exp) def test_value_counts_with_nan(self): - # https://github.com/pydata/pandas/issues/9443 + # see gh-9443 + # sanity check s = pd.Series(["a", "b", "a"], dtype="category") - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - s = pd.Series(["a", "b", None, "a", None, None], dtype="category") - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"]))) - # When we aren't sorting by counts, and np.nan isn't a - # category, it should be last. 
- tm.assert_series_equal( - s.value_counts(dropna=False, sort=False), - pd.Series([2, 1, 3], - index=pd.CategoricalIndex(["a", "b", np.nan]))) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical(["a", "b", "a"], - categories=["a", "b", np.nan])) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - # internal categories are different because of NaN - exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=True), exp, - check_categorical=False) - exp = pd.Series([2, 1, 0], - index=pd.CategoricalIndex(["a", "b", np.nan])) - tm.assert_series_equal(s.value_counts(dropna=False), exp, - check_categorical=False) + # same Series via two different constructions --> same behaviour + series = [ + pd.Series(["a", "b", None, "a", None, None], dtype="category"), + pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b"])) + ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], - categories=["a", "b", np.nan])) + for s in series: + # None is a NaN value, so we exclude its count here + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=True), exp, - check_categorical=False) - exp = pd.Series([3, 2, 1], - index=pd.CategoricalIndex([np.nan, "a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=False), exp, - check_categorical=False) + # we don't exclude the count of None and sort by counts + exp = pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"])) + res = s.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. + exp = pd.Series([2, 1, 3], index=pd.CategoricalIndex(["a", "b", np.nan])) + res = s.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) def test_groupby(self): @@ -4113,16 +4099,11 @@ def f(): res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) - # make sure that fillna takes both missing values and NA categories - # into account - c = Categorical(["a", "b", np.nan]) - with tm.assert_produces_warning(FutureWarning): - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) - - c[0] = np.nan + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) - cat_exp = Categorical(["a", "b", "a"], categories=["a", "b", np.nan]) + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") From 634e95d8d0f79bcaded9e92b4bbce46dd9805da4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 20 Jul 2016 17:27:51 -0400 Subject: [PATCH 44/50] CLN: removed the 'diff' method for Index Deprecated all the way back in `0.15.0` here. 
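The removed method was only a deprecated alias of ``Index.difference``, so migration is a one-line change. A minimal sketch (illustration only, not part of this patch; the exact ``Index`` repr shown is an assumption for this pandas era):

    import pandas as pd

    left = pd.Index([1, 2, 3, 4])
    right = pd.Index([3, 4, 5])

    # previously: left.diff(right), which emitted a FutureWarning
    left.difference(right)  # Int64Index([1, 2], dtype='int64')
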
Author: gfyoung Closes #13669 from gfyoung/remove-index-diff and squashes the following commits: 7dca659 [gfyoung] CLN: removed the 'diff' method for Index --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/base.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 73ce39b66fc27..5727b917fd08c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -602,6 +602,7 @@ Removal of prior version deprecations/changes - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) +- ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 567d2a458dafa..850d049ef9f45 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1965,8 +1965,6 @@ def difference(self, other): return this._shallow_copy(the_diff, name=result_name) - diff = deprecate('diff', difference) - def symmetric_difference(self, other, result_name=None): """ Compute the symmetric difference of two Index objects. From a2e1917218de222d76ffb61822f1971e1059c849 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 01:38:14 +0200 Subject: [PATCH 45/50] BUG: Add check for array lengths in from_arrays method (GH13599) --- pandas/indexes/multi.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 365a971f82a3b..983930e0ad135 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -848,6 +848,12 @@ def from_arrays(cls, arrays, sortorder=None, names=None): name = None if names is None else names[0] return Index(arrays[0], name=name) + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i-1]): + raise ValueError('all arrays must be same length') + cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] levels = [c.categories for c in cats] labels = [c.codes for c in cats] From e401cf14b82213e419b410a1a7181ffd885307c0 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 02:28:22 +0200 Subject: [PATCH 46/50] BUG: Add test for array length mismatch --- pandas/tests/indexes/test_multi.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 2734e90a1971b..7fa61fe168e61 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -632,6 +632,13 @@ def test_from_arrays_index_series_period(self): tm.assert_index_equal(result, result2) + def test_from_arrays_different_lengths(self): + # GH13599 + idx1 = [1, 2, 3] + idx2 = ['a', 'b'] + assertRaisesRegexp(ValueError, '^all arrays must be same length$', + pd.MultiIndex.from_arrays, [idx1, idx2]) + def test_from_product(self): first = ['foo', 'bar', 'buz'] From 93296ff08855d173986bfdca0e87ec70bec3feb1 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 02:33:14 +0200 Subject: [PATCH 47/50] BUG: Fix minor issue with new test for from_arrays --- 
pandas/tests/indexes/test_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 7fa61fe168e61..47c44bbe9abe5 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -637,7 +637,7 @@ def test_from_arrays_different_lengths(self): idx1 = [1, 2, 3] idx2 = ['a', 'b'] assertRaisesRegexp(ValueError, '^all arrays must be same length$', - pd.MultiIndex.from_arrays, [idx1, idx2]) + MultiIndex.from_arrays, [idx1, idx2]) def test_from_product(self): From 72fb52d5e95e29573aa01adf521ae7b632ae1536 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 21:27:01 +0200 Subject: [PATCH 48/50] Minor fix for linter --- pandas/indexes/multi.py | 2 +- pandas/tests/indexes/test_multi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 983930e0ad135..184744915bd8d 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -851,7 +851,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i-1]): + if len(arrays[i]) != len(arrays[i - 1]): raise ValueError('all arrays must be same length') cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 47c44bbe9abe5..27a5eb0c7d458 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -637,7 +637,7 @@ def test_from_arrays_different_lengths(self): idx1 = [1, 2, 3] idx2 = ['a', 'b'] assertRaisesRegexp(ValueError, '^all arrays must be same length$', - MultiIndex.from_arrays, [idx1, idx2]) + MultiIndex.from_arrays, [idx1, idx2]) def test_from_product(self): From 57d52505de2b9d8e85233eadf3be7f653ac24276 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 13 Jul 2016 21:40:20 +0200 Subject: [PATCH 49/50] Update whatsnew entry --- doc/source/whatsnew/v0.19.0.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5727b917fd08c..d6e0e13f376df 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -727,5 +727,8 @@ Bug Fixes - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) + - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) + +-Bug in ``MultiIndex.from_arrays`` didn't check for arrays lengths (:issue:`13599`) From bb6a95228ec5a80635b21cbd261ab7c5a38269fe Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Sun, 17 Jul 2016 01:41:02 +0200 Subject: [PATCH 50/50] Fix minor typo --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tests/indexes/test_multi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d6e0e13f376df..a584a0c078be8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -731,4 +731,4 @@ Bug Fixes 
- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) --Bug in ``MultiIndex.from_arrays`` didn't check for arrays lengths (:issue:`13599`) +-Bug in ``MultiIndex.from_arrays`` didn't check for input array lengths (:issue:`13599`) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 27a5eb0c7d458..173a33aaffd6d 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -637,7 +637,7 @@ def test_from_arrays_different_lengths(self): idx1 = [1, 2, 3] idx2 = ['a', 'b'] assertRaisesRegexp(ValueError, '^all arrays must be same length$', - MultiIndex.from_arrays, [idx1, idx2]) + MultiIndex.from_arrays, [idx1, idx2]) def test_from_product(self):