From cc6e0598ce9962f9de6477eaf7fa7f3b2e3ceb4c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 17 Apr 2017 07:30:51 -0400 Subject: [PATCH 1/2] CLN: move/reorg pandas.tools -> pandas.core.reshape xref #13634 --- doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/__init__.py | 10 +- pandas/core/algorithms.py | 2 +- pandas/core/api.py | 2 +- pandas/core/base.py | 4 +- pandas/core/categorical.py | 2 +- pandas/core/computation/expr.py | 2 +- pandas/core/frame.py | 20 +- pandas/core/groupby.py | 20 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/core/panel.py | 4 +- .../{tests/tools => core/reshape}/__init__.py | 0 pandas/core/reshape/api.py | 5 + pandas/{tools => core/reshape}/concat.py | 0 pandas/core/reshape/merge.py | 1482 +++++++++++++++++ pandas/{tools => core/reshape}/pivot.py | 5 +- pandas/core/{ => reshape}/reshape.py | 2 +- pandas/{tools => core/reshape}/tile.py | 0 pandas/{tools => core/reshape}/util.py | 0 pandas/core/series.py | 4 +- pandas/io/formats/format.py | 4 +- pandas/plotting/_core.py | 2 +- pandas/tests/reshape/__init__.py | 0 .../data/allow_exact_matches.csv | 0 .../allow_exact_matches_and_tolerance.csv | 0 pandas/tests/{tools => reshape}/data/asof.csv | 0 .../tests/{tools => reshape}/data/asof2.csv | 0 .../{tools => reshape}/data/cut_data.csv | 0 .../tests/{tools => reshape}/data/quotes.csv | 0 .../tests/{tools => reshape}/data/quotes2.csv | 0 .../{tools => reshape}/data/tolerance.csv | 0 .../tests/{tools => reshape}/data/trades.csv | 0 .../tests/{tools => reshape}/data/trades2.csv | 0 .../tests/{tools => reshape}/test_concat.py | 0 .../tests/{tools => reshape}/test_hashing.py | 0 pandas/tests/{tools => reshape}/test_join.py | 2 +- pandas/tests/{tools => reshape}/test_merge.py | 4 +- .../{tools => reshape}/test_merge_asof.py | 2 +- .../{tools => reshape}/test_merge_ordered.py | 0 pandas/tests/{tools => reshape}/test_pivot.py | 2 +- pandas/tests/{ => reshape}/test_reshape.py | 5 +- pandas/tests/{tools => reshape}/test_tile.py | 2 +- .../test_union_categoricals.py | 0 pandas/tests/{tools => reshape}/test_util.py | 2 +- pandas/tests/sparse/test_series.py | 2 +- pandas/tests/test_algos.py | 2 +- pandas/tests/test_panel.py | 6 +- pandas/tools/merge.py | 1482 +---------------- setup.py | 4 +- 50 files changed, 1548 insertions(+), 1544 deletions(-) rename pandas/{tests/tools => core/reshape}/__init__.py (100%) create mode 100644 pandas/core/reshape/api.py rename pandas/{tools => core/reshape}/concat.py (100%) create mode 100644 pandas/core/reshape/merge.py rename pandas/{tools => core/reshape}/pivot.py (99%) rename pandas/core/{ => reshape}/reshape.py (99%) rename pandas/{tools => core/reshape}/tile.py (100%) rename pandas/{tools => core/reshape}/util.py (100%) create mode 100644 pandas/tests/reshape/__init__.py rename pandas/tests/{tools => reshape}/data/allow_exact_matches.csv (100%) rename pandas/tests/{tools => reshape}/data/allow_exact_matches_and_tolerance.csv (100%) rename pandas/tests/{tools => reshape}/data/asof.csv (100%) rename pandas/tests/{tools => reshape}/data/asof2.csv (100%) rename pandas/tests/{tools => reshape}/data/cut_data.csv (100%) rename pandas/tests/{tools => reshape}/data/quotes.csv (100%) rename pandas/tests/{tools => reshape}/data/quotes2.csv (100%) rename pandas/tests/{tools => reshape}/data/tolerance.csv (100%) rename pandas/tests/{tools => reshape}/data/trades.csv (100%) rename pandas/tests/{tools => reshape}/data/trades2.csv (100%) rename pandas/tests/{tools => reshape}/test_concat.py (100%) rename 
pandas/tests/{tools => reshape}/test_hashing.py (100%) rename pandas/tests/{tools => reshape}/test_join.py (99%) rename pandas/tests/{tools => reshape}/test_merge.py (99%) rename pandas/tests/{tools => reshape}/test_merge_asof.py (99%) rename pandas/tests/{tools => reshape}/test_merge_ordered.py (100%) rename pandas/tests/{tools => reshape}/test_pivot.py (99%) rename pandas/tests/{ => reshape}/test_reshape.py (99%) rename pandas/tests/{tools => reshape}/test_tile.py (99%) rename pandas/tests/{tools => reshape}/test_union_categoricals.py (100%) rename pandas/tests/{tools => reshape}/test_util.py (99%) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9df82b8ac7338..12d343f7fe4c4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1335,7 +1335,7 @@ If indicated, a deprecation warning will be issued if you reference theses modul .. csv-table:: :header: "Previous Location", "New Location", "Deprecated" - :widths: 30, 30, 4 + :widths: 30, 30, 20 "pandas.lib", "pandas._libs.lib", "X" "pandas.tslib", "pandas._libs.tslib", "X" @@ -1349,6 +1349,7 @@ If indicated, a deprecation warning will be issued if you reference theses modul "pandas.parser", "pandas.io.libparsers", "X" "pandas.formats", "pandas.io.formats", "" "pandas.sparse", "pandas.core.sparse", "" + "pandas.tools", "pandas.core.reshape", "pandas.tools.plotting" "pandas.types", "pandas.core.dtypes", "" "pandas.io.sas.saslib", "pandas.io.sas.libsas", "" "pandas._join", "pandas._libs.join", "" diff --git a/pandas/__init__.py b/pandas/__init__.py index 5f6d54fd904b1..9e5830306db0d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -44,11 +44,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.core.computation.api import * - -from pandas.tools.concat import concat -from pandas.tools.merge import (merge, ordered_merge, - merge_ordered, merge_asof) -from pandas.tools.pivot import pivot_table, crosstab +from pandas.core.reshape.api import * # deprecate tools.plotting, plot_params and scatter_matrix on the top namespace import pandas.tools.plotting @@ -58,9 +54,7 @@ 'pandas.scatter_matrix', pandas.plotting.scatter_matrix, 'pandas.plotting.scatter_matrix') -from pandas.tools.tile import cut, qcut -from pandas.tools.util import to_numeric -from pandas.core.reshape import melt +from pandas.core.reshape.util import to_numeric from pandas.util.print_versions import show_versions from pandas.io.api import * from pandas.util._tester import test diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6df7fce631a3c..63df4b3d94bc8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -605,7 +605,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if bins is not None: try: - from pandas.tools.tile import cut + from pandas.core.reshape.tile import cut values = Series(values) ii = cut(values, bins, include_lowest=True) except TypeError: diff --git a/pandas/core/api.py b/pandas/core/api.py index 865fe367873d8..aa8266995c6b9 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -18,7 +18,7 @@ from pandas.core.frame import DataFrame from pandas.core.panel import Panel, WidePanel from pandas.core.panel4d import Panel4D -from pandas.core.reshape import (pivot_simple as pivot, get_dummies, +from pandas.core.reshape.reshape import (pivot_simple as pivot, get_dummies, lreshape, wide_to_long) from pandas.core.indexing import IndexSlice diff --git a/pandas/core/base.py b/pandas/core/base.py index
e30751a6582f9..87c649c5fbd79 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -522,7 +522,7 @@ def nested_renaming_depr(level=4): len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat def _agg_1dim(name, how, subset=None): """ @@ -671,7 +671,7 @@ def is_any_frame(): return result, True def _aggregate_multiple_funcs(self, arg, _level, _axis): - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a12cec33fb350..a3667e9322959 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1995,7 +1995,7 @@ def describe(self): counts = self.value_counts(dropna=False) freqs = counts / float(counts.sum()) - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat result = concat([counts, freqs], axis=1) result.columns = ['counts', 'freqs'] result.index.name = 'categories' diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 51785ebcd9ec8..73c27f4d772ca 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -13,7 +13,7 @@ from pandas.core.base import StringMixin from pandas.core import common as com import pandas.io.formats.printing as printing -from pandas.tools.util import compose +from pandas.core.reshape.util import compose from pandas.core.computation.ops import ( _cmp_ops_syms, _bool_ops_syms, _arith_ops_syms, _unary_ops_syms, is_term) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 732ce7ce695b0..9b9039455b948 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3956,7 +3956,7 @@ def pivot(self, index=None, columns=None, values=None): """ - from pandas.core.reshape import pivot + from pandas.core.reshape.reshape import pivot return pivot(self, index=index, columns=columns, values=values) def stack(self, level=-1, dropna=True): @@ -3992,7 +3992,7 @@ def stack(self, level=-1, dropna=True): ------- stacked : DataFrame or Series """ - from pandas.core.reshape import stack, stack_multiple + from pandas.core.reshape.reshape import stack, stack_multiple if isinstance(level, (tuple, list)): return stack_multiple(self, level, dropna=dropna) @@ -4057,7 +4057,7 @@ def unstack(self, level=-1, fill_value=None): ------- unstacked : DataFrame or Series """ - from pandas.core.reshape import unstack + from pandas.core.reshape.reshape import unstack return unstack(self, level, fill_value) _shared_docs['melt'] = (""" @@ -4159,7 +4159,7 @@ def unstack(self, level=-1, fill_value=None): other='melt')) def melt(self, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): - from pandas.core.reshape import melt + from pandas.core.reshape.reshape import melt return melt(self, id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name, col_level=col_level) @@ -4609,7 +4609,7 @@ def append(self, other, ignore_index=False, verify_integrity=False): if (self.columns.get_indexer(other.columns) >= 0).all(): other = other.loc[:, self.columns] - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat if isinstance(other, (list, tuple)): to_concat = [self] + other else: @@ -4741,8 +4741,8 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', def _join_compat(self, other, on=None, how='left', 
lsuffix='', rsuffix='', sort=False): - from pandas.tools.merge import merge - from pandas.tools.concat import concat + from pandas.core.reshape.merge import merge + from pandas.core.reshape.concat import concat if isinstance(other, Series): if other.name is None: @@ -4786,7 +4786,7 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False): - from pandas.tools.merge import merge + from pandas.core.reshape.merge import merge return merge(self, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, @@ -4846,7 +4846,7 @@ def round(self, decimals=0, *args, **kwargs): Series.round """ - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat def _dict_round(df, decimals): for col, vals in df.iteritems(): @@ -5523,7 +5523,7 @@ def isin(self, values): """ if isinstance(values, dict): from collections import defaultdict - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat values = defaultdict(list, values) return concat((self.iloc[:, [i]].isin(values[col]) for i, col in enumerate(self.columns)), axis=1) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3fd41f3456732..47f8f22725d48 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -870,7 +870,7 @@ def _wrap_applied_output(self, *args, **kwargs): raise AbstractMethodError(self) def _concat_objects(self, keys, values, not_indexed_same=False): - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat def reset_identity(values): # reset the identities of the components @@ -2985,7 +2985,7 @@ def transform(self, func, *args, **kwargs): s = klass(res, indexer) results.append(s) - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat result = concat(results).sort_index() # we will only try to coerce the result type if @@ -3126,8 +3126,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): from functools import partial - from pandas.tools.tile import cut - from pandas.tools.merge import _get_join_indexers + from pandas.core.reshape.tile import cut + from pandas.core.reshape.merge import _get_join_indexers if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level @@ -3509,7 +3509,7 @@ def _decide_output_index(self, output, labels): def _wrap_applied_output(self, keys, values, not_indexed_same=False): from pandas.core.index import _all_indexes_same - from pandas.tools.util import to_numeric + from pandas.core.reshape.util import to_numeric if len(keys) == 0: return DataFrame(index=keys) @@ -3600,7 +3600,7 @@ def first_non_None_value(values): # still a series # path added as of GH 5545 elif all_indexed_same: - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat return concat(values) if not all_indexed_same: @@ -3633,7 +3633,7 @@ def first_non_None_value(values): else: # GH5788 instead of stacking; concat gets the # dtypes correct - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat result = concat(values, keys=key_index, names=key_index.names, axis=self.axis).unstack() @@ -3684,7 +3684,7 @@ def first_non_None_value(values): not_indexed_same=not_indexed_same) def 
_transform_general(self, func, *args, **kwargs): - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat applied = [] obj = self._obj_with_exclusions @@ -4071,7 +4071,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions) def _apply_to_column_groupbys(self, func): - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat return concat( (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), @@ -4151,7 +4151,7 @@ def groupby_series(obj, col=None): if isinstance(obj, Series): results = groupby_series(obj) else: - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat results = [groupby_series(obj[col], col) for col in obj.columns] results = concat(results, axis=1) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5149d45514e2e..705b7a186dced 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3064,7 +3064,7 @@ def _join_multi(self, other, how, return_indexers=True): "implemented") def _join_non_unique(self, other, how='left', return_indexers=False): - from pandas.tools.merge import _get_join_indexers + from pandas.core.reshape.merge import _get_join_indexers left_idx, right_idx = _get_join_indexers([self.values], [other._values], how=how, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 40e7118ca0f6a..6d9a9aa691f66 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1170,7 +1170,7 @@ def from_product(cls, iterables, sortorder=None, names=None): MultiIndex.from_tuples : Convert list of tuples to MultiIndex """ from pandas.core.categorical import _factorize_from_iterables - from pandas.tools.util import cartesian_product + from pandas.core.reshape.util import cartesian_product labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index fefe75163d033..39d2ebdeec3ac 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -33,7 +33,7 @@ create_block_manager_from_blocks) from pandas.core.ops import _op_descriptions from pandas.core.series import Series -from pandas.tools.util import cartesian_product +from pandas.core.reshape.util import cartesian_product from pandas.util.decorators import (deprecate, Appender) _shared_doc_kwargs = dict( @@ -1294,7 +1294,7 @@ def join(self, other, how='left', lsuffix='', rsuffix=''): ------- joined : Panel """ - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat if isinstance(other, Panel): join_major, join_minor = self._get_join_index(other, how) diff --git a/pandas/tests/tools/__init__.py b/pandas/core/reshape/__init__.py similarity index 100% rename from pandas/tests/tools/__init__.py rename to pandas/core/reshape/__init__.py diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py new file mode 100644 index 0000000000000..cd10fbf76cf2b --- /dev/null +++ b/pandas/core/reshape/api.py @@ -0,0 +1,5 @@ +from pandas.core.reshape.concat import concat +from pandas.core.reshape.reshape import melt +from pandas.core.reshape.merge import merge, ordered_merge, merge_ordered, merge_asof +from pandas.core.reshape.pivot import pivot_table, crosstab +from pandas.core.reshape.tile import cut, qcut diff --git a/pandas/tools/concat.py b/pandas/core/reshape/concat.py similarity index 100% rename from pandas/tools/concat.py rename to pandas/core/reshape/concat.py diff --git 
a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py new file mode 100644 index 0000000000000..0a4b65b425913 --- /dev/null +++ b/pandas/core/reshape/merge.py @@ -0,0 +1,1482 @@ +""" +SQL-style merge routines +""" + +import copy +import warnings +import string + +import numpy as np +from pandas.compat import range, lzip, zip, map, filter +import pandas.compat as compat + +import pandas as pd +from pandas import (Categorical, Series, DataFrame, + Index, MultiIndex, Timedelta) +from pandas.core.frame import _merge_doc +from pandas.core.dtypes.common import ( + is_datetime64tz_dtype, + is_datetime64_dtype, + needs_i8_conversion, + is_int64_dtype, + is_categorical_dtype, + is_integer_dtype, + is_float_dtype, + is_numeric_dtype, + is_integer, + is_int_or_datetime_dtype, + is_dtype_equal, + is_bool, + is_list_like, + _ensure_int64, + _ensure_float64, + _ensure_object, + _get_dtype) +from pandas.core.dtypes.missing import na_value_for_dtype +from pandas.core.internals import (items_overlap_with_suffix, + concatenate_block_managers) +from pandas.util.decorators import Appender, Substitution + +from pandas.core.sorting import is_int64_overflow_possible +import pandas.core.algorithms as algos +import pandas.core.common as com +from pandas._libs import hashtable as libhashtable, join as libjoin, lib + + +@Substitution('\nleft : DataFrame') +@Appender(_merge_doc, indents=0) +def merge(left, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=False, + suffixes=('_x', '_y'), copy=True, indicator=False): + op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, + right_on=right_on, left_index=left_index, + right_index=right_index, sort=sort, suffixes=suffixes, + copy=copy, indicator=indicator) + return op.get_result() + + +if __debug__: + merge.__doc__ = _merge_doc % '\nleft : DataFrame' + + +class MergeError(ValueError): + pass + + +def _groupby_and_merge(by, on, left, right, _merge_pieces, + check_duplicates=True): + """ + groupby & merge; we are always performing a left-by type operation + + Parameters + ---------- + by: field to group + on: duplicates field + left: left frame + right: right frame + _merge_pieces: function for merging + check_duplicates: boolean, default True + should we check & clean duplicates + """ + + pieces = [] + if not isinstance(by, (list, tuple)): + by = [by] + + lby = left.groupby(by, sort=False) + + # if we can groupby the rhs + # then we can get vastly better perf + try: + + # we will check & remove duplicates if indicated + if check_duplicates: + if on is None: + on = [] + elif not isinstance(on, (list, tuple)): + on = [on] + + if right.duplicated(by + on).any(): + right = right.drop_duplicates(by + on, keep='last') + rby = right.groupby(by, sort=False) + except KeyError: + rby = None + + for key, lhs in lby: + + if rby is None: + rhs = right + else: + try: + rhs = right.take(rby.indices[key]) + except KeyError: + # key doesn't exist in left + lcols = lhs.columns.tolist() + cols = lcols + [r for r in right.columns + if r not in set(lcols)] + merged = lhs.reindex(columns=cols) + merged.index = range(len(merged)) + pieces.append(merged) + continue + + merged = _merge_pieces(lhs, rhs) + + # make sure join keys are in the merged + # TODO, should _merge_pieces do this? 
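+        # put each group key back onto its merged piece when the column + # survived the merge; failures here are deliberately swallowed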
+ for k in by: + try: + if k in merged: + merged[k] = key + except: + pass + + pieces.append(merged) + + # preserve the original order + # if we have a missing piece this can be reset + from pandas.core.reshape.concat import concat + result = concat(pieces, ignore_index=True) + result = result.reindex(columns=pieces[0].columns, copy=False) + return result, lby + + +def ordered_merge(left, right, on=None, + left_on=None, right_on=None, + left_by=None, right_by=None, + fill_method=None, suffixes=('_x', '_y')): + + warnings.warn("ordered_merge is deprecated and replaced by merge_ordered", + FutureWarning, stacklevel=2) + return merge_ordered(left, right, on=on, + left_on=left_on, right_on=right_on, + left_by=left_by, right_by=right_by, + fill_method=fill_method, suffixes=suffixes) + + +def merge_ordered(left, right, on=None, + left_on=None, right_on=None, + left_by=None, right_by=None, + fill_method=None, suffixes=('_x', '_y'), + how='outer'): + """Perform merge with optional filling/interpolation designed for ordered + data like time series data. Optionally perform group-wise merge (see + examples) + + Parameters + ---------- + left : DataFrame + right : DataFrame + on : label or list + Field names to join on. Must be found in both DataFrames. + left_on : label or list, or array-like + Field names to join on in left DataFrame. Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns + right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs + left_by : column name or list of column names + Group left DataFrame by group columns and merge piece by piece with + right DataFrame + right_by : column name or list of column names + Group right DataFrame by group columns and merge piece by piece with + left DataFrame + fill_method : {'ffill', None}, default None + Interpolation method for data + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively + how : {'left', 'right', 'outer', 'inner'}, default 'outer' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join) + + .. versionadded:: 0.19.0 + + Examples + -------- + >>> A >>> B + key lvalue group key rvalue + 0 a 1 a 0 b 1 + 1 c 2 a 1 c 2 + 2 e 3 a 2 d 3 + 3 a 1 b + 4 c 2 b + 5 e 3 b + + >>> ordered_merge(A, B, fill_method='ffill', left_by='group') + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1 + 2 c 2 a 2 + 3 d 2 a 3 + 4 e 3 a 3 + 5 f 3 a 4 + 6 a 1 b NaN + 7 b 1 b 1 + 8 c 2 b 2 + 9 d 2 b 3 + 10 e 3 b 3 + 11 f 3 b 4 + + Returns + ------- + merged : DataFrame + The output type will be the same as 'left', if it is a subclass + of DataFrame.
+ + See also + -------- + merge + merge_asof + + """ + def _merger(x, y): + # perform the ordered merge operation + op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, + suffixes=suffixes, fill_method=fill_method, + how=how) + return op.get_result() + + if left_by is not None and right_by is not None: + raise ValueError('Can only group either left or right frames') + elif left_by is not None: + result, _ = _groupby_and_merge(left_by, on, left, right, + lambda x, y: _merger(x, y), + check_duplicates=False) + elif right_by is not None: + result, _ = _groupby_and_merge(right_by, on, right, left, + lambda x, y: _merger(y, x), + check_duplicates=False) + else: + result = _merger(left, right) + return result + + +ordered_merge.__doc__ = merge_ordered.__doc__ + + +def merge_asof(left, right, on=None, + left_on=None, right_on=None, + left_index=False, right_index=False, + by=None, left_by=None, right_by=None, + suffixes=('_x', '_y'), + tolerance=None, + allow_exact_matches=True, + direction='backward'): + """Perform an asof merge. This is similar to a left-join except that we + match on nearest key rather than equal keys. + + Both DataFrames must be sorted by the key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + 'on' key is less than or equal to the left's key. + + - A "forward" search selects the first row in the right DataFrame whose + 'on' key is greater than or equal to the left's key. + + - A "nearest" search selects the row in the right DataFrame whose 'on' + key is closest in absolute distance to the left's key. + + The default is "backward" and matches the behavior of versions below 0.20.0. + The direction parameter was added in version 0.20.0 and introduces + "forward" and "nearest". + + Optionally match on equivalent keys with 'by' before searching with 'on'. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + left : DataFrame + right : DataFrame + on : label + Field name to join on. Must be found in both DataFrames. + The data MUST be ordered. Furthermore this must be a numeric column, + such as datetimelike, integer, or float. On or left_on/right_on + must be given. + left_on : label + Field name to join on in left DataFrame. + right_on : label + Field name to join on in right DataFrame. + left_index : boolean + Use the index of the left DataFrame as the join key. + + .. versionadded:: 0.19.2 + + right_index : boolean + Use the index of the right DataFrame as the join key. + + .. versionadded:: 0.19.2 + + by : column name or list of column names + Match on these columns before performing merge operation. + left_by : column name + Field names to match on in the left DataFrame. + + .. versionadded:: 0.19.2 + + right_by : column name + Field names to match on in the right DataFrame. + + .. versionadded:: 0.19.2 + + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively. + tolerance : integer or Timedelta, optional, default None + Select asof tolerance within this range; must be compatible + with the merge index. + allow_exact_matches : boolean, default True + + - If True, allow matching with the same 'on' value + (i.e. less-than-or-equal-to / greater-than-or-equal-to) + - If False, don't match the same 'on' value + (i.e., strictly less-than / strictly greater-than) + + direction : 'backward' (default), 'forward', or 'nearest' + Whether to search for prior, subsequent, or closest matches. + + ..
versionadded:: 0.20.0 + + Returns + ------- + merged : DataFrame + + Examples + -------- + >>> left + a left_val + 0 1 a + 1 5 b + 2 10 c + + >>> right + a right_val + 0 1 1 + 1 2 2 + 2 3 3 + 3 6 6 + 4 7 7 + + >>> pd.merge_asof(left, right, on='a') + a left_val right_val + 0 1 a 1 + 1 5 b 3 + 2 10 c 7 + + >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) + a left_val right_val + 0 1 a NaN + 1 5 b 3.0 + 2 10 c 7.0 + + >>> pd.merge_asof(left, right, on='a', direction='forward') + a left_val right_val + 0 1 a 1.0 + 1 5 b 6.0 + 2 10 c NaN + + >>> pd.merge_asof(left, right, on='a', direction='nearest') + a left_val right_val + 0 1 a 1 + 1 5 b 6 + 2 10 c 7 + + We can use indexed DataFrames as well. + + >>> left + left_val + 1 a + 5 b + 10 c + + >>> right + right_val + 1 1 + 2 2 + 3 3 + 6 6 + 7 7 + + >>> pd.merge_asof(left, right, left_index=True, right_index=True) + left_val right_val + 1 a 1 + 5 b 3 + 10 c 7 + + Here is a real-world time-series example + + >>> quotes + time ticker bid ask + 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 + 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 + 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 + 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 + 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 + 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 + 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 + 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + + >>> trades + time ticker price quantity + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 + + By default we are taking the asof of the quotes + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker') + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 2ms between the quote time and the trade time + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker', + ... tolerance=pd.Timedelta('2ms')) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 10ms between the quote time and the trade time + and we exclude exact matches on time. However *prior* data will + propagate forward + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker', + ... tolerance=pd.Timedelta('10ms'), + ...
allow_exact_matches=False) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + See also + -------- + merge + merge_ordered + + """ + op = _AsOfMerge(left, right, + on=on, left_on=left_on, right_on=right_on, + left_index=left_index, right_index=right_index, + by=by, left_by=left_by, right_by=right_by, + suffixes=suffixes, + how='asof', tolerance=tolerance, + allow_exact_matches=allow_exact_matches, + direction=direction) + return op.get_result() + + +# TODO: transformations?? +# TODO: only copy DataFrames when modification necessary +class _MergeOperation(object): + """ + Perform a database (SQL) merge operation between two DataFrame objects + using either columns as keys or their row indexes + """ + _merge_type = 'merge' + + def __init__(self, left, right, how='inner', on=None, + left_on=None, right_on=None, axis=1, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True, indicator=False): + self.left = self.orig_left = left + self.right = self.orig_right = right + self.how = how + self.axis = axis + + self.on = com._maybe_make_list(on) + self.left_on = com._maybe_make_list(left_on) + self.right_on = com._maybe_make_list(right_on) + + self.copy = copy + self.suffixes = suffixes + self.sort = sort + + self.left_index = left_index + self.right_index = right_index + + self.indicator = indicator + + if isinstance(self.indicator, compat.string_types): + self.indicator_name = self.indicator + elif isinstance(self.indicator, bool): + self.indicator_name = '_merge' if self.indicator else None + else: + raise ValueError( + 'indicator option can only accept boolean or string arguments') + + if not isinstance(left, DataFrame): + raise ValueError( + 'can not merge DataFrame with instance of ' + 'type {0}'.format(type(left))) + if not isinstance(right, DataFrame): + raise ValueError( + 'can not merge DataFrame with instance of ' + 'type {0}'.format(type(right))) + + if not is_bool(left_index): + raise ValueError( + 'left_index parameter must be of type bool, not ' + '{0}'.format(type(left_index))) + if not is_bool(right_index): + raise ValueError( + 'right_index parameter must be of type bool, not ' + '{0}'.format(type(right_index))) + + # warn user when merging between different levels + if left.columns.nlevels != right.columns.nlevels: + msg = ('merging between different levels can give an unintended ' + 'result ({0} levels on the left, {1} on the right)') + msg = msg.format(left.columns.nlevels, right.columns.nlevels) + warnings.warn(msg, UserWarning) + + self._validate_specification() + + # note this function has side effects + (self.left_join_keys, + self.right_join_keys, + self.join_names) = self._get_merge_keys() + + # validate the merge keys dtypes. 
We may need to coerce + # to avoid incompat dtypes + self._maybe_coerce_merge_keys() + + def get_result(self): + if self.indicator: + self.left, self.right = self._indicator_pre_merge( + self.left, self.right) + + join_index, left_indexer, right_indexer = self._get_join_info() + + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) + + lindexers = {1: left_indexer} if left_indexer is not None else {} + rindexers = {1: right_indexer} if right_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) + + typ = self.left._constructor + result = typ(result_data).__finalize__(self, method=self._merge_type) + + if self.indicator: + result = self._indicator_post_merge(result) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + def _indicator_pre_merge(self, left, right): + + columns = left.columns.union(right.columns) + + for i in ['_left_indicator', '_right_indicator']: + if i in columns: + raise ValueError("Cannot use `indicator=True` option when " + "data contains a column named {}".format(i)) + if self.indicator_name in columns: + raise ValueError( + "Cannot use name of an existing column for indicator column") + + left = left.copy() + right = right.copy() + + left['_left_indicator'] = 1 + left['_left_indicator'] = left['_left_indicator'].astype('int8') + + right['_right_indicator'] = 2 + right['_right_indicator'] = right['_right_indicator'].astype('int8') + + return left, right + + def _indicator_post_merge(self, result): + + result['_left_indicator'] = result['_left_indicator'].fillna(0) + result['_right_indicator'] = result['_right_indicator'].fillna(0) + + result[self.indicator_name] = Categorical((result['_left_indicator'] + + result['_right_indicator']), + categories=[1, 2, 3]) + result[self.indicator_name] = ( + result[self.indicator_name] + .cat.rename_categories(['left_only', 'right_only', 'both'])) + + result = result.drop(labels=['_left_indicator', '_right_indicator'], + axis=1) + return result + + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): + + left_has_missing = None + right_has_missing = None + + keys = zip(self.join_names, self.left_on, self.right_on) + for i, (name, lname, rname) in enumerate(keys): + if not _should_fill(lname, rname): + continue + + take_left, take_right = None, None + + if name in result: + + if left_indexer is not None and right_indexer is not None: + if name in self.left: + + if left_has_missing is None: + left_has_missing = (left_indexer == -1).any() + + if left_has_missing: + take_right = self.right_join_keys[i] + + if not is_dtype_equal(result[name].dtype, + self.left[name].dtype): + take_left = self.left[name]._values + + elif name in self.right: + + if right_has_missing is None: + right_has_missing = (right_indexer == -1).any() + + if right_has_missing: + take_left = self.left_join_keys[i] + + if not is_dtype_equal(result[name].dtype, + self.right[name].dtype): + take_right = self.right[name]._values + + elif left_indexer is not None \ + and isinstance(self.left_join_keys[i], np.ndarray): + + take_left = self.left_join_keys[i] + take_right = self.right_join_keys[i] + + if take_left is not None or take_right is not None: + + if take_left is None: + lvals = result[name]._values + else: + lfill = na_value_for_dtype(take_left.dtype) + lvals = 
algos.take_1d(take_left, left_indexer, + fill_value=lfill) + + if take_right is None: + rvals = result[name]._values + else: + rfill = na_value_for_dtype(take_right.dtype) + rvals = algos.take_1d(take_right, right_indexer, + fill_value=rfill) + + # if we have an all missing left_indexer + # make sure to just use the right values + mask = left_indexer == -1 + if mask.all(): + key_col = rvals + else: + key_col = Index(lvals).where(~mask, rvals) + + if name in result: + result[name] = key_col + else: + result.insert(i, name or 'key_%d' % i, key_col) + + def _get_join_indexers(self): + """ return the join indexers """ + return _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, + how=self.how) + + def _get_join_info(self): + left_ax = self.left._data.axes[self.axis] + right_ax = self.right._data.axes[self.axis] + + if self.left_index and self.right_index and self.how != 'asof': + join_index, left_indexer, right_indexer = \ + left_ax.join(right_ax, how=self.how, return_indexers=True, + sort=self.sort) + elif self.right_index and self.how == 'left': + join_index, left_indexer, right_indexer = \ + _left_join_on_index(left_ax, right_ax, self.left_join_keys, + sort=self.sort) + + elif self.left_index and self.how == 'right': + join_index, right_indexer, left_indexer = \ + _left_join_on_index(right_ax, left_ax, self.right_join_keys, + sort=self.sort) + else: + (left_indexer, + right_indexer) = self._get_join_indexers() + + if self.right_index: + if len(self.left) > 0: + join_index = self.left.index.take(left_indexer) + else: + join_index = self.right.index.take(right_indexer) + left_indexer = np.array([-1] * len(join_index)) + elif self.left_index: + if len(self.right) > 0: + join_index = self.right.index.take(right_indexer) + else: + join_index = self.left.index.take(left_indexer) + right_indexer = np.array([-1] * len(join_index)) + else: + join_index = Index(np.arange(len(left_indexer))) + + if len(join_index) == 0: + join_index = join_index.astype(object) + return join_index, left_indexer, right_indexer + + def _get_merge_keys(self): + """ + Note: has side effects (copy/delete key columns) + + Parameters + ---------- + left + right + on + + Returns + ------- + left_keys, right_keys + """ + left_keys = [] + right_keys = [] + join_names = [] + right_drop = [] + left_drop = [] + left, right = self.left, self.right + + is_lkey = lambda x: isinstance( + x, (np.ndarray, Series)) and len(x) == len(left) + is_rkey = lambda x: isinstance( + x, (np.ndarray, Series)) and len(x) == len(right) + + # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A + # user could, for example, request 'left_index' and 'left_by'. In a + # regular pd.merge(), users cannot specify both 'left_index' and + # 'left_on'. (Instead, users have a MultiIndex). That means the + # self.left_on in this function is always empty in a pd.merge(), but + # a pd.merge_asof(left_index=True, left_by=...) will result in a + # self.left_on array with a None in the middle of it. This requires + # a work-around as designated in the code below. + # See _validate_specification() for where this happens. + + # ugh, spaghetti re #733 + if _any(self.left_on) and _any(self.right_on): + for lk, rk in zip(self.left_on, self.right_on): + if is_lkey(lk): + left_keys.append(lk) + if is_rkey(rk): + right_keys.append(rk) + join_names.append(None) # what to do? 
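+ # both sides are array-like keys here, so there is no column + # name to carry over; _maybe_add_join_keys falls back to a + # generated 'key_%d' name in that case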
+ else: + if rk is not None: + right_keys.append(right[rk]._values) + join_names.append(rk) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + join_names.append(right.index.name) + else: + if not is_rkey(rk): + if rk is not None: + right_keys.append(right[rk]._values) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + if lk is not None and lk == rk: + # avoid key upcast in corner case (length-0) + if len(left) > 0: + right_drop.append(rk) + else: + left_drop.append(lk) + else: + right_keys.append(rk) + if lk is not None: + left_keys.append(left[lk]._values) + join_names.append(lk) + else: + # work-around for merge_asof(left_index=True) + left_keys.append(left.index) + join_names.append(left.index.name) + elif _any(self.left_on): + for k in self.left_on: + if is_lkey(k): + left_keys.append(k) + join_names.append(None) + else: + left_keys.append(left[k]._values) + join_names.append(k) + if isinstance(self.right.index, MultiIndex): + right_keys = [lev._values.take(lab) + for lev, lab in zip(self.right.index.levels, + self.right.index.labels)] + else: + right_keys = [self.right.index.values] + elif _any(self.right_on): + for k in self.right_on: + if is_rkey(k): + right_keys.append(k) + join_names.append(None) + else: + right_keys.append(right[k]._values) + join_names.append(k) + if isinstance(self.left.index, MultiIndex): + left_keys = [lev._values.take(lab) + for lev, lab in zip(self.left.index.levels, + self.left.index.labels)] + else: + left_keys = [self.left.index.values] + + if left_drop: + self.left = self.left.drop(left_drop, axis=1) + + if right_drop: + self.right = self.right.drop(right_drop, axis=1) + + return left_keys, right_keys, join_names + + def _maybe_coerce_merge_keys(self): + # we have valid merge keys, but we may have to further + # coerce these if they are originally incompatible types + # + # for example if these are categorical, but are not dtype_equal + # or if we have object and integer dtypes + + for lk, rk, name in zip(self.left_join_keys, + self.right_join_keys, + self.join_names): + if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): + continue + + # if either left or right is a categorical + # then they must match exactly in categories & ordered + if is_categorical_dtype(lk) and is_categorical_dtype(rk): + if lk.is_dtype_equal(rk): + continue + elif is_categorical_dtype(lk) or is_categorical_dtype(rk): + pass + + elif is_dtype_equal(lk.dtype, rk.dtype): + continue + + # if we are numeric, then allow differing + # kinds to proceed, e.g. int64 and int8 + # further if we are object, but we infer to + # the same, then proceed + if (is_numeric_dtype(lk) and is_numeric_dtype(rk)): + if lk.dtype.kind == rk.dtype.kind: + continue + + # let's infer and see if we are ok + if lib.infer_dtype(lk) == lib.infer_dtype(rk): + continue + + # Houston, we have a problem! + # let's coerce to object + if name in self.left.columns: + self.left = self.left.assign( + **{name: self.left[name].astype(object)}) + if name in self.right.columns: + self.right = self.right.assign( + **{name: self.right[name].astype(object)}) + + def _validate_specification(self): + # Hm, any way to make this logic less complicated??
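+ # the branches below normalize on/left_on/right_on into a matched + # (left_on, right_on) pair: with nothing given, fall back to the index + # flags or the common columns; 'on' is mirrored to both sides; a + # one-sided key list is padded with None, one per index level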
+ if self.on is None and self.left_on is None and self.right_on is None: + + if self.left_index and self.right_index: + self.left_on, self.right_on = (), () + elif self.left_index: + if self.right_on is None: + raise MergeError('Must pass right_on or right_index=True') + elif self.right_index: + if self.left_on is None: + raise MergeError('Must pass left_on or left_index=True') + else: + # use the common columns + common_cols = self.left.columns.intersection( + self.right.columns) + if len(common_cols) == 0: + raise MergeError('No common columns to perform merge on') + if not common_cols.is_unique: + raise MergeError("Data columns not unique: %s" + % repr(common_cols)) + self.left_on = self.right_on = common_cols + elif self.on is not None: + if self.left_on is not None or self.right_on is not None: + raise MergeError('Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.') + self.left_on = self.right_on = self.on + elif self.left_on is not None: + n = len(self.left_on) + if self.right_index: + if len(self.left_on) != self.right.index.nlevels: + raise ValueError('len(left_on) must equal the number ' + 'of levels in the index of "right"') + self.right_on = [None] * n + elif self.right_on is not None: + n = len(self.right_on) + if self.left_index: + if len(self.right_on) != self.left.index.nlevels: + raise ValueError('len(right_on) must equal the number ' + 'of levels in the index of "left"') + self.left_on = [None] * n + if len(self.right_on) != len(self.left_on): + raise ValueError("len(right_on) must equal len(left_on)") + + +def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', + **kwargs): + """ + + Parameters + ---------- + left_keys: ndarray, Index, Series + right_keys: ndarray, Index, Series + sort: boolean, default False + how: string {'inner', 'outer', 'left', 'right'}, default 'inner' + + Returns + ------- + tuple of (left_indexer, right_indexer) + indexers into the left_keys, right_keys + + """ + from functools import partial + + assert len(left_keys) == len(right_keys), \ + 'left_key and right_keys must be the same length' + + # bind `sort` arg. of _factorize_keys + fkeys = partial(_factorize_keys, sort=sort) + + # get left & right join labels and num. of levels at each location + llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys))) + + # get flat i8 keys from label lists + lkey, rkey = _get_join_keys(llab, rlab, shape, sort) + + # factorize keys to a dense i8 space + # `count` is the num. 
of unique keys + # set(lkey) | set(rkey) == range(count) + lkey, rkey, count = fkeys(lkey, rkey) + + # preserve left frame order if how == 'left' and sort == False + kwargs = copy.copy(kwargs) + if how == 'left': + kwargs['sort'] = sort + join_func = _join_functions[how] + + return join_func(lkey, rkey, count, **kwargs) + + +class _OrderedMerge(_MergeOperation): + _merge_type = 'ordered_merge' + + def __init__(self, left, right, on=None, left_on=None, right_on=None, + left_index=False, right_index=False, axis=1, + suffixes=('_x', '_y'), copy=True, + fill_method=None, how='outer'): + + self.fill_method = fill_method + _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, + left_index=left_index, + right_index=right_index, + right_on=right_on, axis=axis, + how=how, suffixes=suffixes, + sort=True # factorize sorts + ) + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + # this is a bit kludgy + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) + + if self.fill_method == 'ffill': + left_join_indexer = libjoin.ffill_indexer(left_indexer) + right_join_indexer = libjoin.ffill_indexer(right_indexer) + else: + left_join_indexer = left_indexer + right_join_indexer = right_indexer + + lindexers = { + 1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = { + 1: right_join_indexer} if right_join_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) + + typ = self.left._constructor + result = typ(result_data).__finalize__(self, method=self._merge_type) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + +def _asof_function(direction, on_type): + return getattr(libjoin, 'asof_join_%s_%s' % (direction, on_type), None) + + +def _asof_by_function(direction, on_type, by_type): + return getattr(libjoin, 'asof_join_%s_%s_by_%s' % + (direction, on_type, by_type), None) + + +_type_casters = { + 'int64_t': _ensure_int64, + 'double': _ensure_float64, + 'object': _ensure_object, +} + +_cython_types = { + 'uint8': 'uint8_t', + 'uint32': 'uint32_t', + 'uint16': 'uint16_t', + 'uint64': 'uint64_t', + 'int8': 'int8_t', + 'int32': 'int32_t', + 'int16': 'int16_t', + 'int64': 'int64_t', + 'float16': 'error', + 'float32': 'float', + 'float64': 'double', +} + + +def _get_cython_type(dtype): + """ Given a dtype, return a C name like 'int64_t' or 'double' """ + type_name = _get_dtype(dtype).name + ctype = _cython_types.get(type_name, 'object') + if ctype == 'error': + raise MergeError('unsupported type: ' + type_name) + return ctype + + +def _get_cython_type_upcast(dtype): + """ Upcast a dtype to 'int64_t', 'double', or 'object' """ + if is_integer_dtype(dtype): + return 'int64_t' + elif is_float_dtype(dtype): + return 'double' + else: + return 'object' + + +class _AsOfMerge(_OrderedMerge): + _merge_type = 'asof_merge' + + def __init__(self, left, right, on=None, left_on=None, right_on=None, + left_index=False, right_index=False, + by=None, left_by=None, right_by=None, + axis=1, suffixes=('_x', '_y'), copy=True, + fill_method=None, + how='asof', tolerance=None, + allow_exact_matches=True, + direction='backward'): + + self.by = by + self.left_by = left_by + self.right_by = right_by + self.tolerance = tolerance + self.allow_exact_matches = allow_exact_matches 
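+ # asof-specific options; tolerance and allow_exact_matches are + # validated later in _get_merge_keys, direction in _validate_specification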
+ self.direction = direction + + _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, left_index=left_index, + right_index=right_index, axis=axis, + how=how, suffixes=suffixes, + fill_method=fill_method) + + def _validate_specification(self): + super(_AsOfMerge, self)._validate_specification() + + # we only allow on to be a single item for on + if len(self.left_on) != 1 and not self.left_index: + raise MergeError("can only asof on a key for left") + + if len(self.right_on) != 1 and not self.right_index: + raise MergeError("can only asof on a key for right") + + if self.left_index and isinstance(self.left.index, MultiIndex): + raise MergeError("left can only have one index") + + if self.right_index and isinstance(self.right.index, MultiIndex): + raise MergeError("right can only have one index") + + # set 'by' columns + if self.by is not None: + if self.left_by is not None or self.right_by is not None: + raise MergeError('Can only pass by OR left_by ' + 'and right_by') + self.left_by = self.right_by = self.by + if self.left_by is None and self.right_by is not None: + raise MergeError('missing left_by') + if self.left_by is not None and self.right_by is None: + raise MergeError('missing right_by') + + # add 'by' to our key-list so we can have it in the + # output as a key + if self.left_by is not None: + if not is_list_like(self.left_by): + self.left_by = [self.left_by] + if not is_list_like(self.right_by): + self.right_by = [self.right_by] + + if len(self.left_by) != len(self.right_by): + raise MergeError('left_by and right_by must be same length') + + self.left_on = self.left_by + list(self.left_on) + self.right_on = self.right_by + list(self.right_on) + + # check 'direction' is valid + if self.direction not in ['backward', 'forward', 'nearest']: + raise MergeError('direction invalid: ' + self.direction) + + @property + def _asof_key(self): + """ This is our asof key, the 'on' """ + return self.left_on[-1] + + def _get_merge_keys(self): + + # note this function has side effects + (left_join_keys, + right_join_keys, + join_names) = super(_AsOfMerge, self)._get_merge_keys() + + # validate index types are the same + for lk, rk in zip(left_join_keys, right_join_keys): + if not is_dtype_equal(lk.dtype, rk.dtype): + raise MergeError("incompatible merge keys, " + "must be the same type") + + # validate tolerance; must be a Timedelta if we have a DTI + if self.tolerance is not None: + + if self.left_index: + lt = self.left.index + else: + lt = left_join_keys[-1] + + msg = "incompatible tolerance, must be compat " \ + "with type {0}".format(type(lt)) + + if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): + if not isinstance(self.tolerance, Timedelta): + raise MergeError(msg) + if self.tolerance < Timedelta(0): + raise MergeError("tolerance must be positive") + + elif is_int64_dtype(lt): + if not is_integer(self.tolerance): + raise MergeError(msg) + if self.tolerance < 0: + raise MergeError("tolerance must be positive") + + else: + raise MergeError("key must be integer or timestamp") + + # validate allow_exact_matches + if not is_bool(self.allow_exact_matches): + raise MergeError("allow_exact_matches must be boolean, " + "passed {0}".format(self.allow_exact_matches)) + + return left_join_keys, right_join_keys, join_names + + def _get_join_indexers(self): + """ return the join indexers """ + + def flip(xs): + """ unlike np.transpose, this returns an array of tuples """ + labels = list(string.ascii_lowercase[:len(xs)]) + dtypes = [x.dtype for x in xs] + 
labeled_dtypes = list(zip(labels, dtypes)) + return np.array(lzip(*xs), labeled_dtypes) + + # values to compare + left_values = (self.left.index.values if self.left_index else + self.left_join_keys[-1]) + right_values = (self.right.index.values if self.right_index else + self.right_join_keys[-1]) + tolerance = self.tolerance + + # we require sortedness in the join keys + msg = " keys must be sorted" + if not Index(left_values).is_monotonic: + raise ValueError('left' + msg) + if not Index(right_values).is_monotonic: + raise ValueError('right' + msg) + + # initial type conversion as needed + if needs_i8_conversion(left_values): + left_values = left_values.view('i8') + right_values = right_values.view('i8') + if tolerance is not None: + tolerance = tolerance.value + + # a "by" parameter requires special handling + if self.left_by is not None: + # remove 'on' parameter from values if one existed + if self.left_index and self.right_index: + left_by_values = self.left_join_keys + right_by_values = self.right_join_keys + else: + left_by_values = self.left_join_keys[0:-1] + right_by_values = self.right_join_keys[0:-1] + + # get tuple representation of values if more than one + if len(left_by_values) == 1: + left_by_values = left_by_values[0] + right_by_values = right_by_values[0] + else: + left_by_values = flip(left_by_values) + right_by_values = flip(right_by_values) + + # upcast 'by' parameter because HashTable is limited + by_type = _get_cython_type_upcast(left_by_values.dtype) + by_type_caster = _type_casters[by_type] + left_by_values = by_type_caster(left_by_values) + right_by_values = by_type_caster(right_by_values) + + # choose appropriate function by type + on_type = _get_cython_type(left_values.dtype) + func = _asof_by_function(self.direction, on_type, by_type) + return func(left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance) + else: + # choose appropriate function by type + on_type = _get_cython_type(left_values.dtype) + func = _asof_function(self.direction, on_type) + return func(left_values, + right_values, + self.allow_exact_matches, + tolerance) + + +def _get_multiindex_indexer(join_keys, index, sort): + from functools import partial + + # bind `sort` argument + fkeys = partial(_factorize_keys, sort=sort) + + # left & right join labels and num.
of levels at each location + rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys))) + if sort: + rlab = list(map(np.take, rlab, index.labels)) + else: + i8copy = lambda a: a.astype('i8', subok=False, copy=True) + rlab = list(map(i8copy, index.labels)) + + # fix right labels if there were any nulls + for i in range(len(join_keys)): + mask = index.labels[i] == -1 + if mask.any(): + # check if there already was any nulls at this location + # if there was, it is factorized to `shape[i] - 1` + a = join_keys[i][llab[i] == shape[i] - 1] + if a.size == 0 or not a[0] != a[0]: + shape[i] += 1 + + rlab[i][mask] = shape[i] - 1 + + # get flat i8 join keys + lkey, rkey = _get_join_keys(llab, rlab, shape, sort) + + # factorize keys to a dense i8 space + lkey, rkey, count = fkeys(lkey, rkey) + + return libjoin.left_outer_join(lkey, rkey, count, sort=sort) + + +def _get_single_indexer(join_key, index, sort=False): + left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) + + left_indexer, right_indexer = libjoin.left_outer_join( + _ensure_int64(left_key), + _ensure_int64(right_key), + count, sort=sort) + + return left_indexer, right_indexer + + +def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): + if len(join_keys) > 1: + if not ((isinstance(right_ax, MultiIndex) and + len(join_keys) == right_ax.nlevels)): + raise AssertionError("If more than one join key is given then " + "'right_ax' must be a MultiIndex and the " + "number of join keys must be the number of " + "levels in right_ax") + + left_indexer, right_indexer = \ + _get_multiindex_indexer(join_keys, right_ax, sort=sort) + else: + jkey = join_keys[0] + + left_indexer, right_indexer = \ + _get_single_indexer(jkey, right_ax, sort=sort) + + if sort or len(left_ax) != len(left_indexer): + # if asked to sort or there are 1-to-many matches + join_index = left_ax.take(left_indexer) + return join_index, left_indexer, right_indexer + + # left frame preserves order & length of its index + return left_ax, None, right_indexer + + +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + + +_join_functions = { + 'inner': libjoin.inner_join, + 'left': libjoin.left_outer_join, + 'right': _right_outer_join, + 'outer': libjoin.full_outer_join, +} + + +def _factorize_keys(lk, rk, sort=True): + if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): + lk = lk.values + rk = rk.values + + # if we exactly match in categories, allow us to use codes + if (is_categorical_dtype(lk) and + is_categorical_dtype(rk) and + lk.is_dtype_equal(rk)): + return lk.codes, rk.codes, len(lk.categories) + + if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): + klass = libhashtable.Int64Factorizer + lk = _ensure_int64(com._values_from_object(lk)) + rk = _ensure_int64(com._values_from_object(rk)) + else: + klass = libhashtable.Factorizer + lk = _ensure_object(lk) + rk = _ensure_object(rk) + + rizer = klass(max(len(lk), len(rk))) + + llab = rizer.factorize(lk) + rlab = rizer.factorize(rk) + + count = rizer.get_count() + + if sort: + uniques = rizer.uniques.to_array() + llab, rlab = _sort_labels(uniques, llab, rlab) + + # NA group + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + + if lany or rany: + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 + + return llab, rlab, count + + +def _sort_labels(uniques, left, right): + if not 
isinstance(uniques, np.ndarray): + # tuplesafe + uniques = Index(uniques).values + + l = len(left) + labels = np.concatenate([left, right]) + + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + new_labels = _ensure_int64(new_labels) + new_left, new_right = new_labels[:l], new_labels[l:] + + return new_left, new_right + + +def _get_join_keys(llab, rlab, shape, sort): + + # how many levels can be done without overflow + pred = lambda i: not is_int64_overflow_possible(shape[:i]) + nlev = next(filter(pred, range(len(shape), 0, -1))) + + # get keys for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype='i8') + lkey = stride * llab[0].astype('i8', subok=False, copy=False) + rkey = stride * rlab[0].astype('i8', subok=False, copy=False) + + for i in range(1, nlev): + stride //= shape[i] + lkey += llab[i] * stride + rkey += rlab[i] * stride + + if nlev == len(shape): # all done! + return lkey, rkey + + # densify current keys to avoid overflow + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + + llab = [lkey] + llab[nlev:] + rlab = [rkey] + rlab[nlev:] + shape = [count] + shape[nlev:] + + return _get_join_keys(llab, rlab, shape, sort) + + +def _should_fill(lname, rname): + if (not isinstance(lname, compat.string_types) or + not isinstance(rname, compat.string_types)): + return True + return lname == rname + + +def _any(x): + return x is not None and len(x) > 0 and any([y is not None for y in x]) diff --git a/pandas/tools/pivot.py b/pandas/core/reshape/pivot.py similarity index 99% rename from pandas/tools/pivot.py rename to pandas/core/reshape/pivot.py index 11ca2e548f171..1c5250615d410 100644 --- a/pandas/tools/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -2,9 +2,10 @@ from pandas.core.dtypes.common import is_list_like, is_scalar -from pandas import Series, DataFrame, MultiIndex, Index, concat +from pandas.core.reshape.concat import concat +from pandas import Series, DataFrame, MultiIndex, Index from pandas.core.groupby import Grouper -from pandas.tools.util import cartesian_product +from pandas.core.reshape.util import cartesian_product from pandas.compat import range, lrange, zip from pandas import compat import pandas.core.common as com diff --git a/pandas/core/reshape.py b/pandas/core/reshape/reshape.py similarity index 99% rename from pandas/core/reshape.py rename to pandas/core/reshape/reshape.py index b3a06d85967f2..bfd5320af13fb 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1151,7 +1151,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, -------- Series.str.get_dummies """ - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat from itertools import cycle if isinstance(data, DataFrame): diff --git a/pandas/tools/tile.py b/pandas/core/reshape/tile.py similarity index 100% rename from pandas/tools/tile.py rename to pandas/core/reshape/tile.py diff --git a/pandas/tools/util.py b/pandas/core/reshape/util.py similarity index 100% rename from pandas/tools/util.py rename to pandas/core/reshape/util.py diff --git a/pandas/core/series.py b/pandas/core/series.py index 9022bff092ac3..38c6d5bae59fe 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1557,7 +1557,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): """ - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat if isinstance(to_append, (list, tuple)): to_concat = [self] + to_append @@ -2035,7 +2035,7 @@ def unstack(self, level=-1, 
fill_value=None): ------- unstacked : DataFrame """ - from pandas.core.reshape import unstack + from pandas.core.reshape.reshape import unstack return unstack(self, level, fill_value) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 20df60eb96299..8ffa6d01ad8f0 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -168,7 +168,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True, self._chk_truncate() def _chk_truncate(self): - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat max_rows = self.max_rows truncate_v = max_rows and (len(self.series) > max_rows) series = self.series @@ -409,7 +409,7 @@ def _chk_truncate(self): Checks whether the frame should be truncated. If so, slices the frame up. """ - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat # Column of which first element is used to determine width of a dot col self.tr_size_col = -1 diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 374244acfe173..934c05ba5f130 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2317,7 +2317,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: - from pandas.tools.concat import concat + from pandas.core.reshape.concat import concat keys, frames = zip(*grouped) if grouped.axis == 0: df = concat(frames, keys=keys, axis=1) diff --git a/pandas/tests/reshape/__init__.py b/pandas/tests/reshape/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/tools/data/allow_exact_matches.csv b/pandas/tests/reshape/data/allow_exact_matches.csv similarity index 100% rename from pandas/tests/tools/data/allow_exact_matches.csv rename to pandas/tests/reshape/data/allow_exact_matches.csv diff --git a/pandas/tests/tools/data/allow_exact_matches_and_tolerance.csv b/pandas/tests/reshape/data/allow_exact_matches_and_tolerance.csv similarity index 100% rename from pandas/tests/tools/data/allow_exact_matches_and_tolerance.csv rename to pandas/tests/reshape/data/allow_exact_matches_and_tolerance.csv diff --git a/pandas/tests/tools/data/asof.csv b/pandas/tests/reshape/data/asof.csv similarity index 100% rename from pandas/tests/tools/data/asof.csv rename to pandas/tests/reshape/data/asof.csv diff --git a/pandas/tests/tools/data/asof2.csv b/pandas/tests/reshape/data/asof2.csv similarity index 100% rename from pandas/tests/tools/data/asof2.csv rename to pandas/tests/reshape/data/asof2.csv diff --git a/pandas/tests/tools/data/cut_data.csv b/pandas/tests/reshape/data/cut_data.csv similarity index 100% rename from pandas/tests/tools/data/cut_data.csv rename to pandas/tests/reshape/data/cut_data.csv diff --git a/pandas/tests/tools/data/quotes.csv b/pandas/tests/reshape/data/quotes.csv similarity index 100% rename from pandas/tests/tools/data/quotes.csv rename to pandas/tests/reshape/data/quotes.csv diff --git a/pandas/tests/tools/data/quotes2.csv b/pandas/tests/reshape/data/quotes2.csv similarity index 100% rename from pandas/tests/tools/data/quotes2.csv rename to pandas/tests/reshape/data/quotes2.csv diff --git a/pandas/tests/tools/data/tolerance.csv b/pandas/tests/reshape/data/tolerance.csv similarity index 100% rename from pandas/tests/tools/data/tolerance.csv rename to pandas/tests/reshape/data/tolerance.csv diff --git 
a/pandas/tests/tools/data/trades.csv b/pandas/tests/reshape/data/trades.csv similarity index 100% rename from pandas/tests/tools/data/trades.csv rename to pandas/tests/reshape/data/trades.csv diff --git a/pandas/tests/tools/data/trades2.csv b/pandas/tests/reshape/data/trades2.csv similarity index 100% rename from pandas/tests/tools/data/trades2.csv rename to pandas/tests/reshape/data/trades2.csv diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/reshape/test_concat.py similarity index 100% rename from pandas/tests/tools/test_concat.py rename to pandas/tests/reshape/test_concat.py diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/reshape/test_hashing.py similarity index 100% rename from pandas/tests/tools/test_hashing.py rename to pandas/tests/reshape/test_hashing.py diff --git a/pandas/tests/tools/test_join.py b/pandas/tests/reshape/test_join.py similarity index 99% rename from pandas/tests/tools/test_join.py rename to pandas/tests/reshape/test_join.py index 8571a1ff16701..51e5beadee8a7 100644 --- a/pandas/tests/tools/test_join.py +++ b/pandas/tests/reshape/test_join.py @@ -12,7 +12,7 @@ from pandas._libs import join as libjoin import pandas.util.testing as tm -from pandas.tests.tools.test_merge import get_test_data, N, NGROUPS +from pandas.tests.reshape.test_merge import get_test_data, N, NGROUPS a_ = np.array diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/reshape/test_merge.py similarity index 99% rename from pandas/tests/tools/test_merge.py rename to pandas/tests/reshape/test_merge.py index cc4a97df33801..67a8c5084eef6 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -9,8 +9,8 @@ import pandas as pd from pandas.compat import lrange, lzip -from pandas.tools.concat import concat -from pandas.tools.merge import merge, MergeError +from pandas.core.reshape.concat import concat +from pandas.core.reshape.merge import merge, MergeError from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype diff --git a/pandas/tests/tools/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py similarity index 99% rename from pandas/tests/tools/test_merge_asof.py rename to pandas/tests/reshape/test_merge_asof.py index c9460cc74c94a..865c413bad11e 100644 --- a/pandas/tests/tools/test_merge_asof.py +++ b/pandas/tests/reshape/test_merge_asof.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import (merge_asof, read_csv, to_datetime, Timedelta) -from pandas.tools.merge import MergeError +from pandas.core.reshape.merge import MergeError from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal diff --git a/pandas/tests/tools/test_merge_ordered.py b/pandas/tests/reshape/test_merge_ordered.py similarity index 100% rename from pandas/tests/tools/test_merge_ordered.py rename to pandas/tests/reshape/test_merge_ordered.py diff --git a/pandas/tests/tools/test_pivot.py b/pandas/tests/reshape/test_pivot.py similarity index 99% rename from pandas/tests/tools/test_pivot.py rename to pandas/tests/reshape/test_pivot.py index c8dfaf5e29bc6..88d25b9d053c3 100644 --- a/pandas/tests/tools/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import (DataFrame, Series, Index, MultiIndex, Grouper, date_range, concat) -from pandas.tools.pivot import pivot_table, crosstab +from pandas.core.reshape.pivot import pivot_table, 
crosstab from pandas.compat import range, product import pandas.util.testing as tm from pandas.tseries.util import pivot_annual, isleapyear diff --git a/pandas/tests/test_reshape.py b/pandas/tests/reshape/test_reshape.py similarity index 99% rename from pandas/tests/test_reshape.py rename to pandas/tests/reshape/test_reshape.py index ee255c1863b41..0eb1e5ff3cf11 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -9,7 +9,8 @@ from pandas.util.testing import assert_frame_equal -from pandas.core.reshape import (melt, lreshape, get_dummies, wide_to_long) +from pandas.core.reshape.reshape import ( + melt, lreshape, get_dummies, wide_to_long) import pandas.util.testing as tm from pandas.compat import range, u @@ -662,7 +663,7 @@ def test_preserve_categorical_dtype(self): expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], index=midx, columns=cidx) - from pandas.core.reshape import make_axis_dummies + from pandas.core.reshape.reshape import make_axis_dummies result = make_axis_dummies(df) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/reshape/test_tile.py similarity index 99% rename from pandas/tests/tools/test_tile.py rename to pandas/tests/reshape/test_tile.py index 742568870c3c3..1cdd87dc67bd8 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -10,7 +10,7 @@ import pandas.util.testing as tm from pandas.core.algorithms import quantile -import pandas.tools.tile as tmod +import pandas.core.reshape.tile as tmod class TestCut(tm.TestCase): diff --git a/pandas/tests/tools/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py similarity index 100% rename from pandas/tests/tools/test_union_categoricals.py rename to pandas/tests/reshape/test_union_categoricals.py diff --git a/pandas/tests/tools/test_util.py b/pandas/tests/reshape/test_util.py similarity index 99% rename from pandas/tests/tools/test_util.py rename to pandas/tests/reshape/test_util.py index 3ac7d8b32516e..95998f4491e09 100644 --- a/pandas/tests/tools/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import (date_range, Index, _np_version_under1p9) import pandas.util.testing as tm -from pandas.tools.util import cartesian_product, to_numeric +from pandas.core.reshape.util import cartesian_product, to_numeric CURRENT_LOCALE = locale.getlocale() LOCALE_OVERRIDE = os.environ.get('LOCALE_OVERRIDE', None) diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index f5a27a8161909..b8e74073e9eb9 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -12,7 +12,7 @@ import pandas.util.testing as tm from pandas.compat import range from pandas import compat -from pandas.tools.util import cartesian_product +from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cd1ec915d3aeb..6f4c145d74cd1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -583,7 +583,7 @@ class TestValueCounts(tm.TestCase): def test_value_counts(self): np.random.seed(1234) - from pandas.tools.tile import cut + from pandas.core.reshape.tile import cut arr = np.random.randn(4) factor = cut(arr, 4) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 55e0e512169fb..69a844e2e64e4 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ 
-2584,7 +2584,7 @@ def test_truncate(self): wp.major_axis[2]) def test_axis_dummies(self): - from pandas.core.reshape import make_axis_dummies + from pandas.core.reshape.reshape import make_axis_dummies minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) self.assertEqual(len(minor_dummies.columns), @@ -2604,7 +2604,7 @@ def test_axis_dummies(self): # TODO: test correctness def test_get_dummies(self): - from pandas.core.reshape import get_dummies, make_axis_dummies + from pandas.core.reshape.reshape import get_dummies, make_axis_dummies self.panel['Label'] = self.panel.index.labels[1] minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) @@ -2655,7 +2655,7 @@ def test_join(self): def test_pivot(self): with catch_warnings(record=True): - from pandas.core.reshape import _slow_pivot + from pandas.core.reshape.reshape import _slow_pivot one, two, three = (np.array([1, 2, 3, 4, 5]), np.array(['a', 'b', 'c', 'd', 'e']), diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 53208fbdd5529..cd58aa2c7f923 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1,46 +1,4 @@ -""" -SQL-style merge routines -""" - -import copy import warnings -import string - -import numpy as np -from pandas.compat import range, lzip, zip, map, filter -import pandas.compat as compat - -import pandas as pd -from pandas import (Categorical, Series, DataFrame, - Index, MultiIndex, Timedelta) -from pandas.core.frame import _merge_doc -from pandas.core.dtypes.common import ( - is_datetime64tz_dtype, - is_datetime64_dtype, - needs_i8_conversion, - is_int64_dtype, - is_categorical_dtype, - is_integer_dtype, - is_float_dtype, - is_numeric_dtype, - is_integer, - is_int_or_datetime_dtype, - is_dtype_equal, - is_bool, - is_list_like, - _ensure_int64, - _ensure_float64, - _ensure_object, - _get_dtype) -from pandas.core.dtypes.missing import na_value_for_dtype -from pandas.core.internals import (items_overlap_with_suffix, - concatenate_block_managers) -from pandas.util.decorators import Appender, Substitution - -from pandas.core.sorting import is_int64_overflow_possible -import pandas.core.algorithms as algos -import pandas.core.common as com -from pandas._libs import hashtable as libhashtable, join as libjoin, lib # back-compat of pseudo-public API @@ -51,1447 +9,9 @@ def wrapper(*args, **kwargs): "import from the public API: " "pandas.concat instead", FutureWarning, stacklevel=3) + import pandas as pd return pd.concat(*args, **kwargs) return wrapper concat = concat_wrap() - - -@Substitution('\nleft : DataFrame') -@Appender(_merge_doc, indents=0) -def merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False): - op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator) - return op.get_result() - - -if __debug__: - merge.__doc__ = _merge_doc % '\nleft : DataFrame' - - -class MergeError(ValueError): - pass - - -def _groupby_and_merge(by, on, left, right, _merge_pieces, - check_duplicates=True): - """ - groupby & merge; we are always performing a left-by type operation - - Parameters - ---------- - by: field to group - on: duplicates field - left: left frame - right: right frame - _merge_pieces: function for merging - check_duplicates: boolean, default True - should we check & clean duplicates - """ - - pieces = [] - if 
not isinstance(by, (list, tuple)): - by = [by] - - lby = left.groupby(by, sort=False) - - # if we can groupby the rhs - # then we can get vastly better perf - try: - - # we will check & remove duplicates if indicated - if check_duplicates: - if on is None: - on = [] - elif not isinstance(on, (list, tuple)): - on = [on] - - if right.duplicated(by + on).any(): - right = right.drop_duplicates(by + on, keep='last') - rby = right.groupby(by, sort=False) - except KeyError: - rby = None - - for key, lhs in lby: - - if rby is None: - rhs = right - else: - try: - rhs = right.take(rby.indices[key]) - except KeyError: - # key doesn't exist in left - lcols = lhs.columns.tolist() - cols = lcols + [r for r in right.columns - if r not in set(lcols)] - merged = lhs.reindex(columns=cols) - merged.index = range(len(merged)) - pieces.append(merged) - continue - - merged = _merge_pieces(lhs, rhs) - - # make sure join keys are in the merged - # TODO, should _merge_pieces do this? - for k in by: - try: - if k in merged: - merged[k] = key - except: - pass - - pieces.append(merged) - - # preserve the original order - # if we have a missing piece this can be reset - from pandas.tools.concat import concat - result = concat(pieces, ignore_index=True) - result = result.reindex(columns=pieces[0].columns, copy=False) - return result, lby - - -def ordered_merge(left, right, on=None, - left_on=None, right_on=None, - left_by=None, right_by=None, - fill_method=None, suffixes=('_x', '_y')): - - warnings.warn("ordered_merge is deprecated and replaced by merge_ordered", - FutureWarning, stacklevel=2) - return merge_ordered(left, right, on=on, - left_on=left_on, right_on=right_on, - left_by=left_by, right_by=right_by, - fill_method=fill_method, suffixes=suffixes) - - -def merge_ordered(left, right, on=None, - left_on=None, right_on=None, - left_by=None, right_by=None, - fill_method=None, suffixes=('_x', '_y'), - how='outer'): - """Perform merge with optional filling/interpolation designed for ordered - data like time series data. Optionally perform group-wise merge (see - examples) - - Parameters - ---------- - left : DataFrame - right : DataFrame - on : label or list - Field names to join on. Must be found in both DataFrames. - left_on : label or list, or array-like - Field names to join on in left DataFrame. Can be a vector or list of - vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns - right_on : label or list, or array-like - Field names to join on in right DataFrame or vector/list of vectors per - left_on docs - left_by : column name or list of column names - Group left DataFrame by group columns and merge piece by piece with - right DataFrame - right_by : column name or list of column names - Group right DataFrame by group columns and merge piece by piece with - left DataFrame - fill_method : {'ffill', None}, default None - Interpolation method for data - suffixes : 2-length sequence (tuple, list, ...) - Suffix to apply to overlapping column names in the left and right - side, respectively - how : {'left', 'right', 'outer', 'inner'}, default 'outer' - * left: use only keys from left frame (SQL: left outer join) - * right: use only keys from right frame (SQL: right outer join) - * outer: use union of keys from both frames (SQL: full outer join) - * inner: use intersection of keys from both frames (SQL: inner join) - - .. 
versionadded:: 0.19.0 - - Examples - -------- - >>> A >>> B - key lvalue group key rvalue - 0 a 1 a 0 b 1 - 1 c 2 a 1 c 2 - 2 e 3 a 2 d 3 - 3 a 1 b - 4 c 2 b - 5 e 3 b - - >>> ordered_merge(A, B, fill_method='ffill', left_by='group') - key lvalue group rvalue - 0 a 1 a NaN - 1 b 1 a 1 - 2 c 2 a 2 - 3 d 2 a 3 - 4 e 3 a 3 - 5 f 3 a 4 - 6 a 1 b NaN - 7 b 1 b 1 - 8 c 2 b 2 - 9 d 2 b 3 - 10 e 3 b 3 - 11 f 3 b 4 - - Returns - ------- - merged : DataFrame - The output type will be the same as 'left', if it is a subclass - of DataFrame. - - See also - -------- - merge - merge_asof - - """ - def _merger(x, y): - # perform the ordered merge operation - op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, - suffixes=suffixes, fill_method=fill_method, - how=how) - return op.get_result() - - if left_by is not None and right_by is not None: - raise ValueError('Can only group either left or right frames') - elif left_by is not None: - result, _ = _groupby_and_merge(left_by, on, left, right, - lambda x, y: _merger(x, y), - check_duplicates=False) - elif right_by is not None: - result, _ = _groupby_and_merge(right_by, on, right, left, - lambda x, y: _merger(y, x), - check_duplicates=False) - else: - result = _merger(left, right) - return result - - -ordered_merge.__doc__ = merge_ordered.__doc__ - - -def merge_asof(left, right, on=None, - left_on=None, right_on=None, - left_index=False, right_index=False, - by=None, left_by=None, right_by=None, - suffixes=('_x', '_y'), - tolerance=None, - allow_exact_matches=True, - direction='backward'): - """Perform an asof merge. This is similar to a left-join except that we - match on nearest key rather than equal keys. - - Both DataFrames must be sorted by the key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - 'on' key is less than or equal to the left's key. - - - A "forward" search selects the first row in the right DataFrame whose - 'on' key is greater than or equal to the left's key. - - - A "nearest" search selects the row in the right DataFrame whose 'on' - key is closest in absolute distance to the left's key. - - The default is "backward" and is compatible with the behavior in - versions below 0.20.0. - The direction parameter was added in version 0.20.0 and introduces - "forward" and "nearest". - - Optionally match on equivalent keys with 'by' before searching with 'on'. - - .. versionadded:: 0.19.0 - - Parameters - ---------- - left : DataFrame - right : DataFrame - on : label - Field name to join on. Must be found in both DataFrames. - The data MUST be ordered. Furthermore this must be a numeric column, - such as datetimelike, integer, or float. On or left_on/right_on - must be given. - left_on : label - Field name to join on in left DataFrame. - right_on : label - Field name to join on in right DataFrame. - left_index : boolean - Use the index of the left DataFrame as the join key. - - .. versionadded:: 0.19.2 - - right_index : boolean - Use the index of the right DataFrame as the join key. - - .. versionadded:: 0.19.2 - - by : column name or list of column names - Match on these columns before performing merge operation. - left_by : column name - Field names to match on in the left DataFrame. - - .. versionadded:: 0.19.2 - - right_by : column name - Field names to match on in the right DataFrame. - - .. versionadded:: 0.19.2 - - suffixes : 2-length sequence (tuple, list, ...) - Suffix to apply to overlapping column names in the left and right - side, respectively. 
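
As a quick illustration of the suffixes behavior just described (a minimal sketch; the frames here are hypothetical, and suffixes works the same way as in a plain pd.merge):

>>> left = pd.DataFrame({'key': [1, 2], 'val': [10, 20]})
>>> right = pd.DataFrame({'key': [1, 2], 'val': [30, 40]})
>>> pd.merge(left, right, on='key', suffixes=('_l', '_r')).columns.tolist()
['key', 'val_l', 'val_r']
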
- tolerance : integer or Timedelta, optional, default None - Select asof tolerance within this range; must be compatible - with the merge index. - allow_exact_matches : boolean, default True - - - If True, allow matching with the same 'on' value - (i.e. less-than-or-equal-to / greater-than-or-equal-to) - - If False, don't match the same 'on' value - (i.e., strictly less-than / strictly greater-than) - - direction : 'backward' (default), 'forward', or 'nearest' - Whether to search for prior, subsequent, or closest matches. - - .. versionadded:: 0.20.0 - - Returns - ------- - merged : DataFrame - - Examples - -------- - >>> left - a left_val - 0 1 a - 1 5 b - 2 10 c - - >>> right - a right_val - 0 1 1 - 1 2 2 - 2 3 3 - 3 6 6 - 4 7 7 - - >>> pd.merge_asof(left, right, on='a') - a left_val right_val - 0 1 a 1 - 1 5 b 3 - 2 10 c 7 - - >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) - a left_val right_val - 0 1 a NaN - 1 5 b 3.0 - 2 10 c 7.0 - - >>> pd.merge_asof(left, right, on='a', direction='forward') - a left_val right_val - 0 1 a 1.0 - 1 5 b 6.0 - 2 10 c NaN - - >>> pd.merge_asof(left, right, on='a', direction='nearest') - a left_val right_val - 0 1 a 1 - 1 5 b 6 - 2 10 c 7 - - We can use indexed DataFrames as well. - - >>> left - left_val - 1 a - 5 b - 10 c - - >>> right - right_val - 1 1 - 2 2 - 3 3 - 6 6 - 7 7 - - >>> pd.merge_asof(left, right, left_index=True, right_index=True) - left_val right_val - 1 a 1 - 5 b 3 - 10 c 7 - - Here is a real-world time-series example - - >>> quotes - time ticker bid ask - 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 - 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 - 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 - 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 - 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 - 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 - 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 - 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 - - >>> trades - time ticker price quantity - 0 2016-05-25 13:30:00.023 MSFT 51.95 75 - 1 2016-05-25 13:30:00.038 MSFT 51.95 155 - 2 2016-05-25 13:30:00.048 GOOG 720.77 100 - 3 2016-05-25 13:30:00.048 GOOG 720.92 100 - 4 2016-05-25 13:30:00.048 AAPL 98.00 100 - - By default we are taking the asof of the quotes - - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker') - time ticker price quantity bid ask - 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 - 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 - 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 - 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 - 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN - - We only asof within 2ms between the quote time and the trade time - - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('2ms')) - time ticker price quantity bid ask - 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 - 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN - 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 - 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 - 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN - - We only asof within 10ms between the quote time and the trade time - and we exclude exact matches on time. However *prior* data will - propagate forward - - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('10ms'), - ... 
allow_exact_matches=False) - time ticker price quantity bid ask - 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN - 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 - 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 - 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 - 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN - - See also - -------- - merge - merge_ordered - - """ - op = _AsOfMerge(left, right, - on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index, - by=by, left_by=left_by, right_by=right_by, - suffixes=suffixes, - how='asof', tolerance=tolerance, - allow_exact_matches=allow_exact_matches, - direction=direction) - return op.get_result() - - -# TODO: transformations?? -# TODO: only copy DataFrames when modification necessary -class _MergeOperation(object): - """ - Perform a database (SQL) merge operation between two DataFrame objects - using either columns as keys or their row indexes - """ - _merge_type = 'merge' - - def __init__(self, left, right, how='inner', on=None, - left_on=None, right_on=None, axis=1, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False): - self.left = self.orig_left = left - self.right = self.orig_right = right - self.how = how - self.axis = axis - - self.on = com._maybe_make_list(on) - self.left_on = com._maybe_make_list(left_on) - self.right_on = com._maybe_make_list(right_on) - - self.copy = copy - self.suffixes = suffixes - self.sort = sort - - self.left_index = left_index - self.right_index = right_index - - self.indicator = indicator - - if isinstance(self.indicator, compat.string_types): - self.indicator_name = self.indicator - elif isinstance(self.indicator, bool): - self.indicator_name = '_merge' if self.indicator else None - else: - raise ValueError( - 'indicator option can only accept boolean or string arguments') - - if not isinstance(left, DataFrame): - raise ValueError( - 'can not merge DataFrame with instance of ' - 'type {0}'.format(type(left))) - if not isinstance(right, DataFrame): - raise ValueError( - 'can not merge DataFrame with instance of ' - 'type {0}'.format(type(right))) - - if not is_bool(left_index): - raise ValueError( - 'left_index parameter must be of type bool, not ' - '{0}'.format(type(left_index))) - if not is_bool(right_index): - raise ValueError( - 'right_index parameter must be of type bool, not ' - '{0}'.format(type(right_index))) - - # warn user when merging between different levels - if left.columns.nlevels != right.columns.nlevels: - msg = ('merging between different levels can give an unintended ' - 'result ({0} levels on the left, {1} on the right)') - msg = msg.format(left.columns.nlevels, right.columns.nlevels) - warnings.warn(msg, UserWarning) - - self._validate_specification() - - # note this function has side effects - (self.left_join_keys, - self.right_join_keys, - self.join_names) = self._get_merge_keys() - - # validate the merge keys dtypes. 
We may need to coerce - # to avoid incompat dtypes - self._maybe_coerce_merge_keys() - - def get_result(self): - if self.indicator: - self.left, self.right = self._indicator_pre_merge( - self.left, self.right) - - join_index, left_indexer, right_indexer = self._get_join_info() - - ldata, rdata = self.left._data, self.right._data - lsuf, rsuf = self.suffixes - - llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) - - lindexers = {1: left_indexer} if left_indexer is not None else {} - rindexers = {1: right_indexer} if right_indexer is not None else {} - - result_data = concatenate_block_managers( - [(ldata, lindexers), (rdata, rindexers)], - axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) - - typ = self.left._constructor - result = typ(result_data).__finalize__(self, method=self._merge_type) - - if self.indicator: - result = self._indicator_post_merge(result) - - self._maybe_add_join_keys(result, left_indexer, right_indexer) - - return result - - def _indicator_pre_merge(self, left, right): - - columns = left.columns.union(right.columns) - - for i in ['_left_indicator', '_right_indicator']: - if i in columns: - raise ValueError("Cannot use `indicator=True` option when " - "data contains a column named {}".format(i)) - if self.indicator_name in columns: - raise ValueError( - "Cannot use name of an existing column for indicator column") - - left = left.copy() - right = right.copy() - - left['_left_indicator'] = 1 - left['_left_indicator'] = left['_left_indicator'].astype('int8') - - right['_right_indicator'] = 2 - right['_right_indicator'] = right['_right_indicator'].astype('int8') - - return left, right - - def _indicator_post_merge(self, result): - - result['_left_indicator'] = result['_left_indicator'].fillna(0) - result['_right_indicator'] = result['_right_indicator'].fillna(0) - - result[self.indicator_name] = Categorical((result['_left_indicator'] + - result['_right_indicator']), - categories=[1, 2, 3]) - result[self.indicator_name] = ( - result[self.indicator_name] - .cat.rename_categories(['left_only', 'right_only', 'both'])) - - result = result.drop(labels=['_left_indicator', '_right_indicator'], - axis=1) - return result - - def _maybe_add_join_keys(self, result, left_indexer, right_indexer): - - left_has_missing = None - right_has_missing = None - - keys = zip(self.join_names, self.left_on, self.right_on) - for i, (name, lname, rname) in enumerate(keys): - if not _should_fill(lname, rname): - continue - - take_left, take_right = None, None - - if name in result: - - if left_indexer is not None and right_indexer is not None: - if name in self.left: - - if left_has_missing is None: - left_has_missing = (left_indexer == -1).any() - - if left_has_missing: - take_right = self.right_join_keys[i] - - if not is_dtype_equal(result[name].dtype, - self.left[name].dtype): - take_left = self.left[name]._values - - elif name in self.right: - - if right_has_missing is None: - right_has_missing = (right_indexer == -1).any() - - if right_has_missing: - take_left = self.left_join_keys[i] - - if not is_dtype_equal(result[name].dtype, - self.right[name].dtype): - take_right = self.right[name]._values - - elif left_indexer is not None \ - and isinstance(self.left_join_keys[i], np.ndarray): - - take_left = self.left_join_keys[i] - take_right = self.right_join_keys[i] - - if take_left is not None or take_right is not None: - - if take_left is None: - lvals = result[name]._values - else: - lfill = na_value_for_dtype(take_left.dtype) - lvals = 
algos.take_1d(take_left, left_indexer, - fill_value=lfill) - - if take_right is None: - rvals = result[name]._values - else: - rfill = na_value_for_dtype(take_right.dtype) - rvals = algos.take_1d(take_right, right_indexer, - fill_value=rfill) - - # if we have an all missing left_indexer - # make sure to just use the right values - mask = left_indexer == -1 - if mask.all(): - key_col = rvals - else: - key_col = Index(lvals).where(~mask, rvals) - - if name in result: - result[name] = key_col - else: - result.insert(i, name or 'key_%d' % i, key_col) - - def _get_join_indexers(self): - """ return the join indexers """ - return _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, - how=self.how) - - def _get_join_info(self): - left_ax = self.left._data.axes[self.axis] - right_ax = self.right._data.axes[self.axis] - - if self.left_index and self.right_index and self.how != 'asof': - join_index, left_indexer, right_indexer = \ - left_ax.join(right_ax, how=self.how, return_indexers=True, - sort=self.sort) - elif self.right_index and self.how == 'left': - join_index, left_indexer, right_indexer = \ - _left_join_on_index(left_ax, right_ax, self.left_join_keys, - sort=self.sort) - - elif self.left_index and self.how == 'right': - join_index, right_indexer, left_indexer = \ - _left_join_on_index(right_ax, left_ax, self.right_join_keys, - sort=self.sort) - else: - (left_indexer, - right_indexer) = self._get_join_indexers() - - if self.right_index: - if len(self.left) > 0: - join_index = self.left.index.take(left_indexer) - else: - join_index = self.right.index.take(right_indexer) - left_indexer = np.array([-1] * len(join_index)) - elif self.left_index: - if len(self.right) > 0: - join_index = self.right.index.take(right_indexer) - else: - join_index = self.left.index.take(left_indexer) - right_indexer = np.array([-1] * len(join_index)) - else: - join_index = Index(np.arange(len(left_indexer))) - - if len(join_index) == 0: - join_index = join_index.astype(object) - return join_index, left_indexer, right_indexer - - def _get_merge_keys(self): - """ - Note: has side effects (copy/delete key columns) - - Parameters - ---------- - left - right - on - - Returns - ------- - left_keys, right_keys - """ - left_keys = [] - right_keys = [] - join_names = [] - right_drop = [] - left_drop = [] - left, right = self.left, self.right - - is_lkey = lambda x: isinstance( - x, (np.ndarray, Series)) and len(x) == len(left) - is_rkey = lambda x: isinstance( - x, (np.ndarray, Series)) and len(x) == len(right) - - # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A - # user could, for example, request 'left_index' and 'left_by'. In a - # regular pd.merge(), users cannot specify both 'left_index' and - # 'left_on'. (Instead, users have a MultiIndex). That means the - # self.left_on in this function is always empty in a pd.merge(), but - # a pd.merge_asof(left_index=True, left_by=...) will result in a - # self.left_on array with a None in the middle of it. This requires - # a work-around as designated in the code below. - # See _validate_specification() for where this happens. - - # ugh, spaghetti re #733 - if _any(self.left_on) and _any(self.right_on): - for lk, rk in zip(self.left_on, self.right_on): - if is_lkey(lk): - left_keys.append(lk) - if is_rkey(rk): - right_keys.append(rk) - join_names.append(None) # what to do? 
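
When both sides pass raw arrays as join keys there is no column name to carry over, so join_names records None here and _maybe_add_join_keys above falls back to a generated 'key_%d' name. A minimal sketch of the observable effect (hypothetical frames, assuming pandas and NumPy are imported):

>>> import numpy as np
>>> left = pd.DataFrame({'v': [1, 2]})
>>> right = pd.DataFrame({'w': [3, 4]})
>>> 'key_0' in pd.merge(left, right, left_on=np.array([0, 1]),
...                     right_on=np.array([0, 1])).columns
True
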
- else: - if rk is not None: - right_keys.append(right[rk]._values) - join_names.append(rk) - else: - # work-around for merge_asof(right_index=True) - right_keys.append(right.index) - join_names.append(right.index.name) - else: - if not is_rkey(rk): - if rk is not None: - right_keys.append(right[rk]._values) - else: - # work-around for merge_asof(right_index=True) - right_keys.append(right.index) - if lk is not None and lk == rk: - # avoid key upcast in corner case (length-0) - if len(left) > 0: - right_drop.append(rk) - else: - left_drop.append(lk) - else: - right_keys.append(rk) - if lk is not None: - left_keys.append(left[lk]._values) - join_names.append(lk) - else: - # work-around for merge_asof(left_index=True) - left_keys.append(left.index) - join_names.append(left.index.name) - elif _any(self.left_on): - for k in self.left_on: - if is_lkey(k): - left_keys.append(k) - join_names.append(None) - else: - left_keys.append(left[k]._values) - join_names.append(k) - if isinstance(self.right.index, MultiIndex): - right_keys = [lev._values.take(lab) - for lev, lab in zip(self.right.index.levels, - self.right.index.labels)] - else: - right_keys = [self.right.index.values] - elif _any(self.right_on): - for k in self.right_on: - if is_rkey(k): - right_keys.append(k) - join_names.append(None) - else: - right_keys.append(right[k]._values) - join_names.append(k) - if isinstance(self.left.index, MultiIndex): - left_keys = [lev._values.take(lab) - for lev, lab in zip(self.left.index.levels, - self.left.index.labels)] - else: - left_keys = [self.left.index.values] - - if left_drop: - self.left = self.left.drop(left_drop, axis=1) - - if right_drop: - self.right = self.right.drop(right_drop, axis=1) - - return left_keys, right_keys, join_names - - def _maybe_coerce_merge_keys(self): - # we have valid merge keys, but we may have to further - # coerce these if they are originally incompatible types - # - # for example if these are categorical, but are not dtype_equal - # or if we have object and integer dtypes - - for lk, rk, name in zip(self.left_join_keys, - self.right_join_keys, - self.join_names): - if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): - continue - - # if either left or right is a categorical - # then they must match exactly in categories & ordered - if is_categorical_dtype(lk) and is_categorical_dtype(rk): - if lk.is_dtype_equal(rk): - continue - elif is_categorical_dtype(lk) or is_categorical_dtype(rk): - pass - - elif is_dtype_equal(lk.dtype, rk.dtype): - continue - - # if we are numeric, then allow differing - # kinds to proceed, eg. int64 and int8 - # further if we are object, but we infer to - # the same, then proceed - if (is_numeric_dtype(lk) and is_numeric_dtype(rk)): - if lk.dtype.kind == rk.dtype.kind: - continue - - # let's infer and see if we are ok - if lib.infer_dtype(lk) == lib.infer_dtype(rk): - continue - - # Houston, we have a problem! - # let's coerce to object - if name in self.left.columns: - self.left = self.left.assign( - **{name: self.left[name].astype(object)}) - if name in self.right.columns: - self.right = self.right.assign( - **{name: self.right[name].astype(object)}) - - def _validate_specification(self): - # Hm, any way to make this logic less complicated?? 
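
In practice these rules make the ways of specifying keys mutually exclusive; for example, combining on with left_on raises. A sketch of the error path, assuming two hypothetical frames sharing a column 'k':

>>> left = pd.DataFrame({'k': [1], 'a': [1]})
>>> right = pd.DataFrame({'k': [1], 'b': [2]})
>>> pd.merge(left, right, on='k', left_on='k')
Traceback (most recent call last):
    ...
MergeError: Can only pass argument "on" OR "left_on" and "right_on", not a combination of both.
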
- if self.on is None and self.left_on is None and self.right_on is None: - - if self.left_index and self.right_index: - self.left_on, self.right_on = (), () - elif self.left_index: - if self.right_on is None: - raise MergeError('Must pass right_on or right_index=True') - elif self.right_index: - if self.left_on is None: - raise MergeError('Must pass left_on or left_index=True') - else: - # use the common columns - common_cols = self.left.columns.intersection( - self.right.columns) - if len(common_cols) == 0: - raise MergeError('No common columns to perform merge on') - if not common_cols.is_unique: - raise MergeError("Data columns not unique: %s" - % repr(common_cols)) - self.left_on = self.right_on = common_cols - elif self.on is not None: - if self.left_on is not None or self.right_on is not None: - raise MergeError('Can only pass argument "on" OR "left_on" ' - 'and "right_on", not a combination of both.') - self.left_on = self.right_on = self.on - elif self.left_on is not None: - n = len(self.left_on) - if self.right_index: - if len(self.left_on) != self.right.index.nlevels: - raise ValueError('len(left_on) must equal the number ' - 'of levels in the index of "right"') - self.right_on = [None] * n - elif self.right_on is not None: - n = len(self.right_on) - if self.left_index: - if len(self.right_on) != self.left.index.nlevels: - raise ValueError('len(right_on) must equal the number ' - 'of levels in the index of "left"') - self.left_on = [None] * n - if len(self.right_on) != len(self.left_on): - raise ValueError("len(right_on) must equal len(left_on)") - - -def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', - **kwargs): - """ - - Parameters - ---------- - left_keys: ndarray, Index, Series - right_keys: ndarray, Index, Series - sort: boolean, default False - how: string {'inner', 'outer', 'left', 'right'}, default 'inner' - - Returns - ------- - tuple of (left_indexer, right_indexer) - indexers into the left_keys, right_keys - - """ - from functools import partial - - assert len(left_keys) == len(right_keys), \ - 'left_key and right_keys must be the same length' - - # bind `sort` arg. of _factorize_keys - fkeys = partial(_factorize_keys, sort=sort) - - # get left & right join labels and num. of levels at each location - llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys))) - - # get flat i8 keys from label lists - lkey, rkey = _get_join_keys(llab, rlab, shape, sort) - - # factorize keys to a dense i8 space - # `count` is the num. 
of unique keys - # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = fkeys(lkey, rkey) - - # preserve left frame order if how == 'left' and sort == False - kwargs = copy.copy(kwargs) - if how == 'left': - kwargs['sort'] = sort - join_func = _join_functions[how] - - return join_func(lkey, rkey, count, **kwargs) - - -class _OrderedMerge(_MergeOperation): - _merge_type = 'ordered_merge' - - def __init__(self, left, right, on=None, left_on=None, right_on=None, - left_index=False, right_index=False, axis=1, - suffixes=('_x', '_y'), copy=True, - fill_method=None, how='outer'): - - self.fill_method = fill_method - _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, - left_index=left_index, - right_index=right_index, - right_on=right_on, axis=axis, - how=how, suffixes=suffixes, - sort=True # factorize sorts - ) - - def get_result(self): - join_index, left_indexer, right_indexer = self._get_join_info() - - # this is a bit kludgy - ldata, rdata = self.left._data, self.right._data - lsuf, rsuf = self.suffixes - - llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) - - if self.fill_method == 'ffill': - left_join_indexer = libjoin.ffill_indexer(left_indexer) - right_join_indexer = libjoin.ffill_indexer(right_indexer) - else: - left_join_indexer = left_indexer - right_join_indexer = right_indexer - - lindexers = { - 1: left_join_indexer} if left_join_indexer is not None else {} - rindexers = { - 1: right_join_indexer} if right_join_indexer is not None else {} - - result_data = concatenate_block_managers( - [(ldata, lindexers), (rdata, rindexers)], - axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) - - typ = self.left._constructor - result = typ(result_data).__finalize__(self, method=self._merge_type) - - self._maybe_add_join_keys(result, left_indexer, right_indexer) - - return result - - -def _asof_function(direction, on_type): - return getattr(libjoin, 'asof_join_%s_%s' % (direction, on_type), None) - - -def _asof_by_function(direction, on_type, by_type): - return getattr(libjoin, 'asof_join_%s_%s_by_%s' % - (direction, on_type, by_type), None) - - -_type_casters = { - 'int64_t': _ensure_int64, - 'double': _ensure_float64, - 'object': _ensure_object, -} - -_cython_types = { - 'uint8': 'uint8_t', - 'uint32': 'uint32_t', - 'uint16': 'uint16_t', - 'uint64': 'uint64_t', - 'int8': 'int8_t', - 'int32': 'int32_t', - 'int16': 'int16_t', - 'int64': 'int64_t', - 'float16': 'error', - 'float32': 'float', - 'float64': 'double', -} - - -def _get_cython_type(dtype): - """ Given a dtype, return a C name like 'int64_t' or 'double' """ - type_name = _get_dtype(dtype).name - ctype = _cython_types.get(type_name, 'object') - if ctype == 'error': - raise MergeError('unsupported type: ' + type_name) - return ctype - - -def _get_cython_type_upcast(dtype): - """ Upcast a dtype to 'int64_t', 'double', or 'object' """ - if is_integer_dtype(dtype): - return 'int64_t' - elif is_float_dtype(dtype): - return 'double' - else: - return 'object' - - -class _AsOfMerge(_OrderedMerge): - _merge_type = 'asof_merge' - - def __init__(self, left, right, on=None, left_on=None, right_on=None, - left_index=False, right_index=False, - by=None, left_by=None, right_by=None, - axis=1, suffixes=('_x', '_y'), copy=True, - fill_method=None, - how='asof', tolerance=None, - allow_exact_matches=True, - direction='backward'): - - self.by = by - self.left_by = left_by - self.right_by = right_by - self.tolerance = tolerance - self.allow_exact_matches = allow_exact_matches 
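
The options stored on the _AsOfMerge instance here drive the indexer selection later on. As a rough, hypothetical sketch of their user-facing effect (integer 'on' keys, so a plain integer tolerance is allowed):

>>> left = pd.DataFrame({'t': [1, 5, 10], 'v': list('abc')})
>>> right = pd.DataFrame({'t': [2, 6, 11], 'w': [1, 2, 3]})
>>> pd.merge_asof(left, right, on='t', direction='forward',
...               tolerance=2)['w'].tolist()
[1, 2, 3]
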
- self.direction = direction - - _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, axis=axis, - how=how, suffixes=suffixes, - fill_method=fill_method) - - def _validate_specification(self): - super(_AsOfMerge, self)._validate_specification() - - # we only allow on to be a single item for on - if len(self.left_on) != 1 and not self.left_index: - raise MergeError("can only asof on a key for left") - - if len(self.right_on) != 1 and not self.right_index: - raise MergeError("can only asof on a key for right") - - if self.left_index and isinstance(self.left.index, MultiIndex): - raise MergeError("left can only have one index") - - if self.right_index and isinstance(self.right.index, MultiIndex): - raise MergeError("right can only have one index") - - # set 'by' columns - if self.by is not None: - if self.left_by is not None or self.right_by is not None: - raise MergeError('Can only pass by OR left_by ' - 'and right_by') - self.left_by = self.right_by = self.by - if self.left_by is None and self.right_by is not None: - raise MergeError('missing left_by') - if self.left_by is not None and self.right_by is None: - raise MergeError('missing right_by') - - # add 'by' to our key-list so we can have it in the - # output as a key - if self.left_by is not None: - if not is_list_like(self.left_by): - self.left_by = [self.left_by] - if not is_list_like(self.right_by): - self.right_by = [self.right_by] - - if len(self.left_by) != len(self.right_by): - raise MergeError('left_by and right_by must be same length') - - self.left_on = self.left_by + list(self.left_on) - self.right_on = self.right_by + list(self.right_on) - - # check 'direction' is valid - if self.direction not in ['backward', 'forward', 'nearest']: - raise MergeError('direction invalid: ' + self.direction) - - @property - def _asof_key(self): - """ This is our asof key, the 'on' """ - return self.left_on[-1] - - def _get_merge_keys(self): - - # note this function has side effects - (left_join_keys, - right_join_keys, - join_names) = super(_AsOfMerge, self)._get_merge_keys() - - # validate index types are the same - for lk, rk in zip(left_join_keys, right_join_keys): - if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys, " - "must be the same type") - - # validate tolerance; must be a Timedelta if we have a DTI - if self.tolerance is not None: - - if self.left_index: - lt = self.left.index - else: - lt = left_join_keys[-1] - - msg = "incompatible tolerance, must be compat " \ - "with type {0}".format(type(lt)) - - if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): - if not isinstance(self.tolerance, Timedelta): - raise MergeError(msg) - if self.tolerance < Timedelta(0): - raise MergeError("tolerance must be positive") - - elif is_int64_dtype(lt): - if not is_integer(self.tolerance): - raise MergeError(msg) - if self.tolerance < 0: - raise MergeError("tolerance must be positive") - - else: - raise MergeError("key must be integer or timestamp") - - # validate allow_exact_matches - if not is_bool(self.allow_exact_matches): - raise MergeError("allow_exact_matches must be boolean, " - "passed {0}".format(self.allow_exact_matches)) - - return left_join_keys, right_join_keys, join_names - - def _get_join_indexers(self): - """ return the join indexers """ - - def flip(xs): - """ unlike np.transpose, this returns an array of tuples """ - labels = list(string.ascii_lowercase[:len(xs)]) - dtypes = [x.dtype for x in xs] - 
labeled_dtypes = list(zip(labels, dtypes)) - return np.array(lzip(*xs), labeled_dtypes) - - # values to compare - left_values = (self.left.index.values if self.left_index else - self.left_join_keys[-1]) - right_values = (self.right.index.values if self.right_index else - self.right_join_keys[-1]) - tolerance = self.tolerance - - # we required sortedness in the join keys - msg = " keys must be sorted" - if not Index(left_values).is_monotonic: - raise ValueError('left' + msg) - if not Index(right_values).is_monotonic: - raise ValueError('right' + msg) - - # initial type conversion as needed - if needs_i8_conversion(left_values): - left_values = left_values.view('i8') - right_values = right_values.view('i8') - if tolerance is not None: - tolerance = tolerance.value - - # a "by" parameter requires special handling - if self.left_by is not None: - # remove 'on' parameter from values if one existed - if self.left_index and self.right_index: - left_by_values = self.left_join_keys - right_by_values = self.right_join_keys - else: - left_by_values = self.left_join_keys[0:-1] - right_by_values = self.right_join_keys[0:-1] - - # get tuple representation of values if more than one - if len(left_by_values) == 1: - left_by_values = left_by_values[0] - right_by_values = right_by_values[0] - else: - left_by_values = flip(left_by_values) - right_by_values = flip(right_by_values) - - # upcast 'by' parameter because HashTable is limited - by_type = _get_cython_type_upcast(left_by_values.dtype) - by_type_caster = _type_casters[by_type] - left_by_values = by_type_caster(left_by_values) - right_by_values = by_type_caster(right_by_values) - - # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) - func = _asof_by_function(self.direction, on_type, by_type) - return func(left_values, - right_values, - left_by_values, - right_by_values, - self.allow_exact_matches, - tolerance) - else: - # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) - func = _asof_function(self.direction, on_type) - return func(left_values, - right_values, - self.allow_exact_matches, - tolerance) - - -def _get_multiindex_indexer(join_keys, index, sort): - from functools import partial - - # bind `sort` argument - fkeys = partial(_factorize_keys, sort=sort) - - # left & right join labels and num. 
of levels at each location - rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys))) - if sort: - rlab = list(map(np.take, rlab, index.labels)) - else: - i8copy = lambda a: a.astype('i8', subok=False, copy=True) - rlab = list(map(i8copy, index.labels)) - - # fix right labels if there were any nulls - for i in range(len(join_keys)): - mask = index.labels[i] == -1 - if mask.any(): - # check if there already was any nulls at this location - # if there was, it is factorized to `shape[i] - 1` - a = join_keys[i][llab[i] == shape[i] - 1] - if a.size == 0 or not a[0] != a[0]: - shape[i] += 1 - - rlab[i][mask] = shape[i] - 1 - - # get flat i8 join keys - lkey, rkey = _get_join_keys(llab, rlab, shape, sort) - - # factorize keys to a dense i8 space - lkey, rkey, count = fkeys(lkey, rkey) - - return libjoin.left_outer_join(lkey, rkey, count, sort=sort) - - -def _get_single_indexer(join_key, index, sort=False): - left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) - - left_indexer, right_indexer = libjoin.left_outer_join( - _ensure_int64(left_key), - _ensure_int64(right_key), - count, sort=sort) - - return left_indexer, right_indexer - - -def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): - if len(join_keys) > 1: - if not ((isinstance(right_ax, MultiIndex) and - len(join_keys) == right_ax.nlevels)): - raise AssertionError("If more than one join key is given then " - "'right_ax' must be a MultiIndex and the " - "number of join keys must be the number of " - "levels in right_ax") - - left_indexer, right_indexer = \ - _get_multiindex_indexer(join_keys, right_ax, sort=sort) - else: - jkey = join_keys[0] - - left_indexer, right_indexer = \ - _get_single_indexer(jkey, right_ax, sort=sort) - - if sort or len(left_ax) != len(left_indexer): - # if asked to sort or there are 1-to-many matches - join_index = left_ax.take(left_indexer) - return join_index, left_indexer, right_indexer - - # left frame preserves order & length of its index - return left_ax, None, right_indexer - - -def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) - return left_indexer, right_indexer - - -_join_functions = { - 'inner': libjoin.inner_join, - 'left': libjoin.left_outer_join, - 'right': _right_outer_join, - 'outer': libjoin.full_outer_join, -} - - -def _factorize_keys(lk, rk, sort=True): - if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = lk.values - rk = rk.values - - # if we exactly match in categories, allow us to use codes - if (is_categorical_dtype(lk) and - is_categorical_dtype(rk) and - lk.is_dtype_equal(rk)): - return lk.codes, rk.codes, len(lk.categories) - - if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): - klass = libhashtable.Int64Factorizer - lk = _ensure_int64(com._values_from_object(lk)) - rk = _ensure_int64(com._values_from_object(rk)) - else: - klass = libhashtable.Factorizer - lk = _ensure_object(lk) - rk = _ensure_object(rk) - - rizer = klass(max(len(lk), len(rk))) - - llab = rizer.factorize(lk) - rlab = rizer.factorize(rk) - - count = rizer.get_count() - - if sort: - uniques = rizer.uniques.to_array() - llab, rlab = _sort_labels(uniques, llab, rlab) - - # NA group - lmask = llab == -1 - lany = lmask.any() - rmask = rlab == -1 - rany = rmask.any() - - if lany or rany: - if lany: - np.putmask(llab, lmask, count) - if rany: - np.putmask(rlab, rmask, count) - count += 1 - - return llab, rlab, count - - -def _sort_labels(uniques, left, right): - if not 
isinstance(uniques, np.ndarray): - # tuplesafe - uniques = Index(uniques).values - - l = len(left) - labels = np.concatenate([left, right]) - - _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) - new_labels = _ensure_int64(new_labels) - new_left, new_right = new_labels[:l], new_labels[l:] - - return new_left, new_right - - -def _get_join_keys(llab, rlab, shape, sort): - - # how many levels can be done without overflow - pred = lambda i: not is_int64_overflow_possible(shape[:i]) - nlev = next(filter(pred, range(len(shape), 0, -1))) - - # get keys for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype='i8') - lkey = stride * llab[0].astype('i8', subok=False, copy=False) - rkey = stride * rlab[0].astype('i8', subok=False, copy=False) - - for i in range(1, nlev): - stride //= shape[i] - lkey += llab[i] * stride - rkey += rlab[i] * stride - - if nlev == len(shape): # all done! - return lkey, rkey - - # densify current keys to avoid overflow - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) - - llab = [lkey] + llab[nlev:] - rlab = [rkey] + rlab[nlev:] - shape = [count] + shape[nlev:] - - return _get_join_keys(llab, rlab, shape, sort) - - -def _should_fill(lname, rname): - if (not isinstance(lname, compat.string_types) or - not isinstance(rname, compat.string_types)): - return True - return lname == rname - - -def _any(x): - return x is not None and len(x) > 0 and any([y is not None for y in x]) diff --git a/setup.py b/setup.py index 6fc66e2355c0f..69b9a974b9935 100755 --- a/setup.py +++ b/setup.py @@ -642,6 +642,7 @@ def pxd(name): 'pandas.core.dtypes', 'pandas.core.indexes', 'pandas.core.computation', + 'pandas.core.reshape', 'pandas.core.sparse', 'pandas.errors', 'pandas.io', @@ -673,7 +674,6 @@ def pxd(name): 'pandas.tests.series', 'pandas.tests.scalar', 'pandas.tests.tseries', - 'pandas.tests.tools', 'pandas.tests.plotting', 'pandas.tools', 'pandas.tseries', @@ -703,7 +703,7 @@ def pxd(name): 'data/html_encoding/*.html', 'json/data/*.json'], 'pandas.tests.io.formats': ['data/*.csv'], - 'pandas.tests.tools': ['data/*.csv'], + 'pandas.tests.reshape': ['data/*.csv'], 'pandas.tests.tseries': ['data/*.pickle'], 'pandas.io.formats': ['templates/*.tpl'] }, From 376cef5308ee7feb2e65c07bfdbf48f59eed1b8e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 17 Apr 2017 18:37:49 -0400 Subject: [PATCH 2/2] move to_numeric --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/__init__.py | 1 - pandas/core/api.py | 6 +- pandas/core/dtypes/cast.py | 164 ++- pandas/core/groupby.py | 2 +- pandas/core/reshape/api.py | 5 +- pandas/core/reshape/merge.py | 1 - pandas/core/reshape/util.py | 171 +-- pandas/tests/dtypes/test_cast.py | 370 ++++- pandas/tests/dtypes/test_convert.py | 0 pandas/tests/reshape/test_util.py | 440 +----- pandas/tests/test_generic.py | 2076 --------------------------- pandas/tests/test_util.py | 78 +- 13 files changed, 620 insertions(+), 2698 deletions(-) create mode 100644 pandas/tests/dtypes/test_convert.py delete mode 100644 pandas/tests/test_generic.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 12d343f7fe4c4..13f4e8e8f26e9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1335,7 +1335,7 @@ If indicated, a deprecation warning will be issued if you reference theses modul .. 
csv-table:: :header: "Previous Location", "New Location", "Deprecated" - :widths: 30, 30, 20 + :widths: 30, 30, 4 "pandas.lib", "pandas._libs.lib", "X" "pandas.tslib", "pandas._libs.tslib", "X" @@ -1349,7 +1349,7 @@ If indicated, a deprecation warning will be issued if you reference theses modul "pandas.parser", "pandas.io.libparsers", "X" "pandas.formats", "pandas.io.formats", "" "pandas.sparse", "pandas.core.sparse", "" - "pandas.tools", "pandas.core.tools", "pandas.tools.plotting" + "pandas.tools", "pandas.core.reshape", "" "pandas.types", "pandas.core.dtypes", "" "pandas.io.sas.saslib", "pandas.io.sas.libsas", "" "pandas._join", "pandas._libs.join", "" diff --git a/pandas/__init__.py b/pandas/__init__.py index 9e5830306db0d..43fa362b66ed5 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -54,7 +54,6 @@ 'pandas.scatter_matrix', pandas.plotting.scatter_matrix, 'pandas.plotting.scatter_matrix') -from pandas.core.reshape.util import to_numeric from pandas.util.print_versions import show_versions from pandas.io.api import * from pandas.util._tester import test diff --git a/pandas/core/api.py b/pandas/core/api.py index aa8266995c6b9..f3191283b85eb 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -18,10 +18,12 @@ from pandas.core.frame import DataFrame from pandas.core.panel import Panel, WidePanel from pandas.core.panel4d import Panel4D -from pandas.core.reshape.reshape import (pivot_simple as pivot, get_dummies, - lreshape, wide_to_long) +from pandas.core.reshape.reshape import ( + pivot_simple as pivot, get_dummies, + lreshape, wide_to_long) from pandas.core.indexing import IndexSlice +from pandas.core.dtypes.cast import to_numeric from pandas.tseries.offsets import DateOffset from pandas.tseries.tools import to_datetime from pandas.tseries.index import (DatetimeIndex, Timestamp, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3954fb5c93da8..3c1f480787d3a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -5,6 +5,7 @@ import numpy as np import warnings +import pandas as pd from pandas._libs import tslib, lib from pandas._libs.tslib import iNaT from pandas.compat import string_types, text_type, PY3 @@ -18,6 +19,8 @@ is_integer_dtype, is_datetime_or_timedelta_dtype, is_bool_dtype, is_scalar, + is_numeric_dtype, is_decimal, + is_number, _string_dtypes, _coerce_to_dtype, _ensure_int8, _ensure_int16, @@ -25,7 +28,8 @@ _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _POSSIBLY_CAST_DTYPES) from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype -from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries +from .generic import (ABCDatetimeIndex, ABCPeriodIndex, + ABCSeries, ABCIndexClass) from .missing import isnull, notnull from .inference import is_list_like @@ -1025,3 +1029,161 @@ def find_common_type(types): return np.object return np.find_common_type(types, []) + + +def to_numeric(arg, errors='raise', downcast=None): + """ + Convert argument to a numeric type. 
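(Illustrative aside, not part of the patch.) The is_series / is_index / is_scalars branches later in this hunk make the return type follow the input container. A minimal sketch of that contract using only the public API, mirroring the docstring examples:

    import pandas as pd

    pd.to_numeric(pd.Series(['1', '2']))  # -> Series([1, 2], dtype int64)
    pd.to_numeric(pd.Index(['1', '2']))   # -> Int64Index([1, 2])
    pd.to_numeric(['1', '2'])             # -> ndarray([1, 2])
    pd.to_numeric('1.5')                  # -> plain Python float 1.5
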
+ + Parameters + ---------- + arg : list, tuple, 1-d array, or Series + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception + - If 'coerce', then invalid parsing will be set as NaN + - If 'ignore', then invalid parsing will return the input + downcast : {'integer', 'signed', 'unsigned', 'float'} , default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + .. versionadded:: 0.19.0 + + Returns + ------- + ret : numeric if parsing succeeded. + Return type depends on input. Series if Series, otherwise ndarray + + Examples + -------- + Take separate series and convert to numeric, coercing when told to + + >>> import pandas as pd + >>> s = pd.Series(['1.0', '2', -3]) + >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast='float') + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast='signed') + 0 1 + 1 2 + 2 -3 + dtype: int8 + >>> s = pd.Series(['apple', '1.0', '2', -3]) + >>> pd.to_numeric(s, errors='ignore') + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object + >>> pd.to_numeric(s, errors='coerce') + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 + """ + if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): + raise ValueError('invalid downcasting method provided') + + is_series = False + is_index = False + is_scalars = False + + if isinstance(arg, ABCSeries): + is_series = True + values = arg.values + elif isinstance(arg, ABCIndexClass): + is_index = True + values = arg.asi8 + if values is None: + values = arg.values + elif isinstance(arg, (list, tuple)): + values = np.array(arg, dtype='O') + elif is_scalar(arg): + if is_decimal(arg): + return float(arg) + if is_number(arg): + return arg + is_scalars = True + values = np.array([arg], dtype='O') + elif getattr(arg, 'ndim', 1) > 1: + raise TypeError('arg must be a list, tuple, 1-d array, or Series') + else: + values = arg + + try: + if is_numeric_dtype(values): + pass + elif is_datetime_or_timedelta_dtype(values): + values = values.astype(np.int64) + else: + values = _ensure_object(values) + coerce_numeric = False if errors in ('ignore', 'raise') else True + values = lib.maybe_convert_numeric(values, set(), + coerce_numeric=coerce_numeric) + + except Exception: + if errors == 'raise': + raise + + # attempt downcast only if the data has been successfully converted + # to a numerical dtype and if a downcast method has been specified + if downcast is not None and is_numeric_dtype(values): + typecodes = None + + if downcast in ('integer', 'signed'): + typecodes = np.typecodes['Integer'] + elif downcast == 'unsigned' and np.min(values) >= 0: + typecodes = np.typecodes['UnsignedInteger'] + elif downcast == 
'float': + typecodes = np.typecodes['Float'] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported + float_32_char = np.dtype(np.float32).char + float_32_ind = typecodes.index(float_32_char) + typecodes = typecodes[float_32_ind:] + + if typecodes is not None: + # from smallest to largest + for dtype in typecodes: + if np.dtype(dtype).itemsize <= values.dtype.itemsize: + values = maybe_downcast_to_dtype(values, dtype) + + # successful conversion + if values.dtype == dtype: + break + + if is_series: + return pd.Series(values, index=arg.index, name=arg.name) + elif is_index: + # because we want to coerce to numeric if possible, + # do not use _shallow_copy_with_infer + return pd.Index(values, name=arg.name) + elif is_scalars: + return values[0] + else: + return values diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 47f8f22725d48..8f788aed3950d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3509,7 +3509,7 @@ def _decide_output_index(self, output, labels): def _wrap_applied_output(self, keys, values, not_indexed_same=False): from pandas.core.index import _all_indexes_same - from pandas.core.reshape.util import to_numeric + from pandas.core.dtypes.cast import to_numeric if len(keys) == 0: return DataFrame(index=keys) diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index cd10fbf76cf2b..c75e0341918bb 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,5 +1,8 @@ +# flake8: noqa + from pandas.core.reshape.concat import concat from pandas.core.reshape.reshape import melt -from pandas.core.reshape.merge import merge, ordered_merge, merge_ordered, merge_asof +from pandas.core.reshape.merge import ( + merge, ordered_merge, merge_ordered, merge_asof) from pandas.core.reshape.pivot import pivot_table, crosstab from pandas.core.reshape.tile import cut, qcut diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0a4b65b425913..1ca3786ecc174 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -10,7 +10,6 @@ from pandas.compat import range, lzip, zip, map, filter import pandas.compat as compat -import pandas as pd from pandas import (Categorical, Series, DataFrame, Index, MultiIndex, Timedelta) from pandas.core.frame import _merge_doc diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index baf968440858d..2fe82e5d6bc57 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -1,18 +1,7 @@ import numpy as np -import pandas._libs.lib as lib -from pandas.core.dtypes.common import ( - is_number, - is_numeric_dtype, - is_datetime_or_timedelta_dtype, - is_list_like, - _ensure_object, - is_decimal, - is_scalar as isscalar) +from pandas.core.dtypes.common import is_list_like -from pandas.core.dtypes.cast import maybe_downcast_to_dtype - -import pandas as pd from pandas.compat import reduce from pandas.core.index import Index from pandas.core import common as com @@ -85,161 +74,3 @@ def compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) - - -def to_numeric(arg, errors='raise', downcast=None): - """ - Convert argument to a numeric type. 
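(Illustrative aside, not part of the patch.) Both this deleted copy and its replacement in pandas/core/dtypes/cast.py pick a downcast target by walking NumPy's typecode strings from the smallest dtype upward. A standalone sketch of that walk; pick_downcast_dtype is a hypothetical helper name, and the lossless check is approximated with a round-trip compare instead of the internal maybe_downcast_to_dtype:

    import numpy as np

    def pick_downcast_dtype(values, downcast):
        # Mirror of the selection logic in to_numeric: choose the candidate
        # typecode string, then take the first dtype that fits losslessly.
        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and values.min() >= 0:
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            codes = np.typecodes['Float']
            # pandas supports nothing smaller than float32
            typecodes = codes[codes.index(np.dtype(np.float32).char):]
        else:
            return values.dtype
        for code in typecodes:  # typecodes run smallest to largest
            dtype = np.dtype(code)
            if (dtype.itemsize <= values.dtype.itemsize and
                    np.array_equal(values, values.astype(dtype))):
                return dtype
        return values.dtype

    print(pick_downcast_dtype(np.array([1, 2, 3], dtype=np.int64),
                              'integer'))  # int8
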
- - Parameters - ---------- - arg : list, tuple, 1-d array, or Series - errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception - - If 'coerce', then invalid parsing will be set as NaN - - If 'ignore', then invalid parsing will return the input - downcast : {'integer', 'signed', 'unsigned', 'float'} , default None - If not None, and if the data has been successfully cast to a - numerical dtype (or if the data was numeric to begin with), - downcast that resulting data to the smallest numerical dtype - possible according to the following rules: - - - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - - 'float': smallest float dtype (min.: np.float32) - - As this behaviour is separate from the core conversion to - numeric values, any errors raised during the downcasting - will be surfaced regardless of the value of the 'errors' input. - - In addition, downcasting will only occur if the size - of the resulting data's dtype is strictly larger than - the dtype it is to be cast to, so if none of the dtypes - checked satisfy that specification, no downcasting will be - performed on the data. - - .. versionadded:: 0.19.0 - - Returns - ------- - ret : numeric if parsing succeeded. - Return type depends on input. Series if Series, otherwise ndarray - - Examples - -------- - Take separate series and convert to numeric, coercing when told to - - >>> import pandas as pd - >>> s = pd.Series(['1.0', '2', -3]) - >>> pd.to_numeric(s) - 0 1.0 - 1 2.0 - 2 -3.0 - dtype: float64 - >>> pd.to_numeric(s, downcast='float') - 0 1.0 - 1 2.0 - 2 -3.0 - dtype: float32 - >>> pd.to_numeric(s, downcast='signed') - 0 1 - 1 2 - 2 -3 - dtype: int8 - >>> s = pd.Series(['apple', '1.0', '2', -3]) - >>> pd.to_numeric(s, errors='ignore') - 0 apple - 1 1.0 - 2 2 - 3 -3 - dtype: object - >>> pd.to_numeric(s, errors='coerce') - 0 NaN - 1 1.0 - 2 2.0 - 3 -3.0 - dtype: float64 - """ - if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): - raise ValueError('invalid downcasting method provided') - - is_series = False - is_index = False - is_scalar = False - - if isinstance(arg, pd.Series): - is_series = True - values = arg.values - elif isinstance(arg, pd.Index): - is_index = True - values = arg.asi8 - if values is None: - values = arg.values - elif isinstance(arg, (list, tuple)): - values = np.array(arg, dtype='O') - elif isscalar(arg): - if is_decimal(arg): - return float(arg) - if is_number(arg): - return arg - is_scalar = True - values = np.array([arg], dtype='O') - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a list, tuple, 1-d array, or Series') - else: - values = arg - - try: - if is_numeric_dtype(values): - pass - elif is_datetime_or_timedelta_dtype(values): - values = values.astype(np.int64) - else: - values = _ensure_object(values) - coerce_numeric = False if errors in ('ignore', 'raise') else True - values = lib.maybe_convert_numeric(values, set(), - coerce_numeric=coerce_numeric) - - except Exception: - if errors == 'raise': - raise - - # attempt downcast only if the data has been successfully converted - # to a numerical dtype and if a downcast method has been specified - if downcast is not None and is_numeric_dtype(values): - typecodes = None - - if downcast in ('integer', 'signed'): - typecodes = np.typecodes['Integer'] - elif downcast == 'unsigned' and np.min(values) >= 0: - typecodes = np.typecodes['UnsignedInteger'] - elif downcast == 'float': - 
typecodes = np.typecodes['Float'] - - # pandas support goes only to np.float32, - # as float dtypes smaller than that are - # extremely rare and not well supported - float_32_char = np.dtype(np.float32).char - float_32_ind = typecodes.index(float_32_char) - typecodes = typecodes[float_32_ind:] - - if typecodes is not None: - # from smallest to largest - for dtype in typecodes: - if np.dtype(dtype).itemsize <= values.dtype.itemsize: - values = maybe_downcast_to_dtype(values, dtype) - - # successful conversion - if values.dtype == dtype: - break - - if is_series: - return pd.Series(values, index=arg.index, name=arg.name) - elif is_index: - # because we want to coerce to numeric if possible, - # do not use _shallow_copy_with_infer - return Index(values, name=arg.name) - elif is_scalar: - return values[0] - else: - return values diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index a1490426ebf9d..e59784d233367 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -6,10 +6,14 @@ """ import pytest +import decimal from datetime import datetime, timedelta, date import numpy as np -from pandas import Timedelta, Timestamp, DatetimeIndex +import pandas as pd +from pandas import (Timedelta, Timestamp, DatetimeIndex, + to_numeric, _np_version_under1p9) + from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, maybe_convert_objects, @@ -24,6 +28,8 @@ PeriodDtype) from pandas.util import testing as tm +from numpy import iinfo + class TestMaybeDowncast(tm.TestCase): @@ -321,3 +327,365 @@ def test_period_dtype(self): np.dtype('datetime64[ns]'), np.object, np.int64]: self.assertEqual(find_common_type([dtype, dtype2]), np.object) self.assertEqual(find_common_type([dtype2, dtype]), np.object) + + +class TestToNumeric(tm.TestCase): + + def test_series(self): + s = pd.Series(['1', '-3.14', '7']) + res = to_numeric(s) + expected = pd.Series([1, -3.14, 7]) + tm.assert_series_equal(res, expected) + + s = pd.Series(['1', '-3.14', 7]) + res = to_numeric(s) + tm.assert_series_equal(res, expected) + + def test_series_numeric(self): + s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX') + res = to_numeric(s) + tm.assert_series_equal(res, s) + + s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX') + res = to_numeric(s) + tm.assert_series_equal(res, s) + + # bool is regarded as numeric + s = pd.Series([True, False, True, True], + index=list('ABCD'), name='XXX') + res = to_numeric(s) + tm.assert_series_equal(res, s) + + def test_error(self): + s = pd.Series([1, -3.14, 'apple']) + msg = 'Unable to parse string "apple" at position 2' + with tm.assertRaisesRegexp(ValueError, msg): + to_numeric(s, errors='raise') + + res = to_numeric(s, errors='ignore') + expected = pd.Series([1, -3.14, 'apple']) + tm.assert_series_equal(res, expected) + + res = to_numeric(s, errors='coerce') + expected = pd.Series([1, -3.14, np.nan]) + tm.assert_series_equal(res, expected) + + s = pd.Series(['orange', 1, -3.14, 'apple']) + msg = 'Unable to parse string "orange" at position 0' + with tm.assertRaisesRegexp(ValueError, msg): + to_numeric(s, errors='raise') + + def test_error_seen_bool(self): + s = pd.Series([True, False, 'apple']) + msg = 'Unable to parse string "apple" at position 2' + with tm.assertRaisesRegexp(ValueError, msg): + to_numeric(s, errors='raise') + + res = to_numeric(s, errors='ignore') + expected = pd.Series([True, False, 'apple']) + tm.assert_series_equal(res, expected) + + # coerces to float + res = to_numeric(s, errors='coerce') + 
expected = pd.Series([1., 0., np.nan]) + tm.assert_series_equal(res, expected) + + def test_list(self): + s = ['1', '-3.14', '7'] + res = to_numeric(s) + expected = np.array([1, -3.14, 7]) + tm.assert_numpy_array_equal(res, expected) + + def test_list_numeric(self): + s = [1, 3, 4, 5] + res = to_numeric(s) + tm.assert_numpy_array_equal(res, np.array(s, dtype=np.int64)) + + s = [1., 3., 4., 5.] + res = to_numeric(s) + tm.assert_numpy_array_equal(res, np.array(s)) + + # bool is regarded as numeric + s = [True, False, True, True] + res = to_numeric(s) + tm.assert_numpy_array_equal(res, np.array(s)) + + def test_numeric(self): + s = pd.Series([1, -3.14, 7], dtype='O') + res = to_numeric(s) + expected = pd.Series([1, -3.14, 7]) + tm.assert_series_equal(res, expected) + + s = pd.Series([1, -3.14, 7]) + res = to_numeric(s) + tm.assert_series_equal(res, expected) + + # GH 14827 + df = pd.DataFrame(dict( + a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'], + b=[1.0, 2.0, 3.0, 4.0], + )) + expected = pd.DataFrame(dict( + a=[1.2, 3.14, np.inf, 0.1], + b=[1.0, 2.0, 3.0, 4.0], + )) + + # Test to_numeric over one column + df_copy = df.copy() + df_copy['a'] = df_copy['a'].apply(to_numeric) + tm.assert_frame_equal(df_copy, expected) + + # Test to_numeric over multiple columns + df_copy = df.copy() + df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric) + tm.assert_frame_equal(df_copy, expected) + + def test_numeric_lists_and_arrays(self): + # Test to_numeric with embedded lists and arrays + df = pd.DataFrame(dict( + a=[[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1] + )) + df['a'] = df['a'].apply(to_numeric) + expected = pd.DataFrame(dict( + a=[[3.14, 1.0], 1.6, 0.1], + )) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame(dict( + a=[np.array([decimal.Decimal(3.14), 1.0]), 0.1] + )) + df['a'] = df['a'].apply(to_numeric) + expected = pd.DataFrame(dict( + a=[[3.14, 1.0], 0.1], + )) + tm.assert_frame_equal(df, expected) + + def test_all_nan(self): + s = pd.Series(['a', 'b', 'c']) + res = to_numeric(s, errors='coerce') + expected = pd.Series([np.nan, np.nan, np.nan]) + tm.assert_series_equal(res, expected) + + def test_type_check(self): + # GH 11776 + df = pd.DataFrame({'a': [1, -3.14, 7], 'b': ['4', '5', '6']}) + with tm.assertRaisesRegexp(TypeError, "1-d array"): + to_numeric(df) + for errors in ['ignore', 'raise', 'coerce']: + with tm.assertRaisesRegexp(TypeError, "1-d array"): + to_numeric(df, errors=errors) + + def test_scalar(self): + self.assertEqual(pd.to_numeric(1), 1) + self.assertEqual(pd.to_numeric(1.1), 1.1) + + self.assertEqual(pd.to_numeric('1'), 1) + self.assertEqual(pd.to_numeric('1.1'), 1.1) + + with tm.assertRaises(ValueError): + to_numeric('XX', errors='raise') + + self.assertEqual(to_numeric('XX', errors='ignore'), 'XX') + self.assertTrue(np.isnan(to_numeric('XX', errors='coerce'))) + + def test_numeric_dtypes(self): + idx = pd.Index([1, 2, 3], name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, idx) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.values) + + idx = pd.Index([1., np.nan, 3., np.nan], name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, idx) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.values) + + def test_str(self): + idx = 
pd.Index(['1', '2', '3'], name='xxx') + exp = np.array([1, 2, 3], dtype='int64') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(exp, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(exp, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, exp) + + idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx') + exp = np.array([1.5, 2.7, 3.4]) + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(exp, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(exp, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, exp) + + def test_datetimelike(self): + for tz in [None, 'US/Eastern', 'Asia/Tokyo']: + idx = pd.date_range('20130101', periods=3, tz=tz, name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.asi8) + + def test_timedelta(self): + idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.asi8) + + def test_period(self): + idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) + + # ToDo: enable when we can support native PeriodDtype + # res = pd.to_numeric(pd.Series(idx, name='xxx')) + # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + + def test_non_hashable(self): + # Test for Bug #13324 + s = pd.Series([[10.0, 2], 1.0, 'apple']) + res = pd.to_numeric(s, errors='coerce') + tm.assert_series_equal(res, pd.Series([np.nan, 1.0, np.nan])) + + res = pd.to_numeric(s, errors='ignore') + tm.assert_series_equal(res, pd.Series([[10.0, 2], 1.0, 'apple'])) + + with self.assertRaisesRegexp(TypeError, "Invalid object type"): + pd.to_numeric(s) + + def test_downcast(self): + # see gh-13352 + mixed_data = ['1', 2, 3] + int_data = [1, 2, 3] + date_data = np.array(['1970-01-02', '1970-01-03', + '1970-01-04'], dtype='datetime64[D]') + + invalid_downcast = 'unsigned-integer' + msg = 'invalid downcasting method provided' + + smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) + smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) + + # support below np.float32 is rare and far between + float_32_char = np.dtype(np.float32).char + smallest_float_dtype = float_32_char + + for data in (mixed_data, int_data, date_data): + with self.assertRaisesRegexp(ValueError, msg): + pd.to_numeric(data, downcast=invalid_downcast) + + expected = np.array([1, 2, 3], dtype=np.int64) + + res = pd.to_numeric(data) + tm.assert_numpy_array_equal(res, expected) + + res = pd.to_numeric(data, downcast=None) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_int_dtype) + + for signed_downcast in ('integer', 'signed'): + res = pd.to_numeric(data, downcast=signed_downcast) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, 
expected) + + expected = np.array([1, 2, 3], dtype=smallest_float_dtype) + res = pd.to_numeric(data, downcast='float') + tm.assert_numpy_array_equal(res, expected) + + # if we can't successfully cast the given + # data to a numeric dtype, do not bother + # with the downcast parameter + data = ['foo', 2, 3] + expected = np.array(data, dtype=object) + res = pd.to_numeric(data, errors='ignore', + downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an unsigned integer because + # we have a negative number + data = ['-1', 2, 3] + expected = np.array([-1, 2, 3], dtype=np.int64) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an integer (signed or unsigned) + # because we have a float number + data = (['1.1', 2, 3], + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00]) + expected = (np.array([1.1, 2, 3], dtype=np.float64), + np.array([10000.0, 20000, 3000, + 40000.36, 50000, 50000.00], dtype=np.float64)) + + for _data, _expected in zip(data, expected): + for downcast in ('integer', 'signed', 'unsigned'): + res = pd.to_numeric(_data, downcast=downcast) + tm.assert_numpy_array_equal(res, _expected) + + # the smallest integer dtype need not be np.(u)int8 + data = ['256', 257, 258] + + for downcast, expected_dtype in zip( + ['integer', 'signed', 'unsigned'], + [np.int16, np.int16, np.uint16]): + expected = np.array([256, 257, 258], dtype=expected_dtype) + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) + + def test_downcast_limits(self): + # Test the limits of each downcast. Bug: #14401. + # Check to make sure numpy is new enough to run this test. + if _np_version_under1p9: + pytest.skip("Numpy version is under 1.9") + + i = 'integer' + u = 'unsigned' + dtype_downcast_min_max = [ + ('int8', i, [iinfo(np.int8).min, iinfo(np.int8).max]), + ('int16', i, [iinfo(np.int16).min, iinfo(np.int16).max]), + ('int32', i, [iinfo(np.int32).min, iinfo(np.int32).max]), + ('int64', i, [iinfo(np.int64).min, iinfo(np.int64).max]), + ('uint8', u, [iinfo(np.uint8).min, iinfo(np.uint8).max]), + ('uint16', u, [iinfo(np.uint16).min, iinfo(np.uint16).max]), + ('uint32', u, [iinfo(np.uint32).min, iinfo(np.uint32).max]), + ('uint64', u, [iinfo(np.uint64).min, iinfo(np.uint64).max]), + ('int16', i, [iinfo(np.int8).min, iinfo(np.int8).max + 1]), + ('int32', i, [iinfo(np.int16).min, iinfo(np.int16).max + 1]), + ('int64', i, [iinfo(np.int32).min, iinfo(np.int32).max + 1]), + ('int16', i, [iinfo(np.int8).min - 1, iinfo(np.int16).max]), + ('int32', i, [iinfo(np.int16).min - 1, iinfo(np.int32).max]), + ('int64', i, [iinfo(np.int32).min - 1, iinfo(np.int64).max]), + ('uint16', u, [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]), + ('uint32', u, [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]), + ('uint64', u, [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]) + ] + + for dtype, downcast, min_max in dtype_downcast_min_max: + series = pd.to_numeric(pd.Series(min_max), downcast=downcast) + assert series.dtype == dtype diff --git a/pandas/tests/dtypes/test_convert.py b/pandas/tests/dtypes/test_convert.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 95998f4491e09..fd3a683e80397 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -1,19 +1,8 @@ -import os -import locale -import codecs -import pytest -import decimal import numpy as np -from numpy import iinfo - 
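(Illustrative aside, not part of the patch.) After this hunk, test_util.py keeps only the cartesian_product tests; per the import added just below, the helper now lives under pandas.core.reshape.util. A quick usage sketch with made-up sample data:

    from pandas.core.reshape.util import cartesian_product

    letters, numbers = list('ABC'), [1, 22]
    a, b = cartesian_product([letters, numbers])
    print(a)  # ['A' 'A' 'B' 'B' 'C' 'C']  (first factor repeated)
    print(b)  # [ 1 22  1 22  1 22]        (second factor tiled)
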
-import pandas as pd -from pandas import (date_range, Index, _np_version_under1p9) +from pandas import date_range, Index import pandas.util.testing as tm -from pandas.core.reshape.util import cartesian_product, to_numeric - -CURRENT_LOCALE = locale.getlocale() -LOCALE_OVERRIDE = os.environ.get('LOCALE_OVERRIDE', None) +from pandas.core.reshape.util import cartesian_product class TestCartesianProduct(tm.TestCase): @@ -58,428 +47,3 @@ def test_invalid_input(self): msg = "Input must be a list-like of list-likes" for X in invalid_inputs: tm.assertRaisesRegexp(TypeError, msg, cartesian_product, X=X) - - -class TestLocaleUtils(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestLocaleUtils, cls).setUpClass() - cls.locales = tm.get_locales() - - if not cls.locales: - pytest.skip("No locales found") - - tm._skip_if_windows() - - @classmethod - def tearDownClass(cls): - super(TestLocaleUtils, cls).tearDownClass() - del cls.locales - - def test_get_locales(self): - # all systems should have at least a single locale - assert len(tm.get_locales()) > 0 - - def test_get_locales_prefix(self): - if len(self.locales) == 1: - pytest.skip("Only a single locale found, no point in " - "trying to test filtering locale prefixes") - first_locale = self.locales[0] - assert len(tm.get_locales(prefix=first_locale[:2])) > 0 - - def test_set_locale(self): - if len(self.locales) == 1: - pytest.skip("Only a single locale found, no point in " - "trying to test setting another locale") - - if all(x is None for x in CURRENT_LOCALE): - # Not sure why, but on some travis runs with pytest, - # getlocale() returned (None, None). - pytest.skip("CURRENT_LOCALE is not set.") - - if LOCALE_OVERRIDE is None: - lang, enc = 'it_CH', 'UTF-8' - elif LOCALE_OVERRIDE == 'C': - lang, enc = 'en_US', 'ascii' - else: - lang, enc = LOCALE_OVERRIDE.split('.') - - enc = codecs.lookup(enc).name - new_locale = lang, enc - - if not tm._can_set_locale(new_locale): - with tm.assertRaises(locale.Error): - with tm.set_locale(new_locale): - pass - else: - with tm.set_locale(new_locale) as normalized_locale: - new_lang, new_enc = normalized_locale.split('.') - new_enc = codecs.lookup(enc).name - normalized_locale = new_lang, new_enc - self.assertEqual(normalized_locale, new_locale) - - current_locale = locale.getlocale() - self.assertEqual(current_locale, CURRENT_LOCALE) - - -class TestToNumeric(tm.TestCase): - - def test_series(self): - s = pd.Series(['1', '-3.14', '7']) - res = to_numeric(s) - expected = pd.Series([1, -3.14, 7]) - tm.assert_series_equal(res, expected) - - s = pd.Series(['1', '-3.14', 7]) - res = to_numeric(s) - tm.assert_series_equal(res, expected) - - def test_series_numeric(self): - s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX') - res = to_numeric(s) - tm.assert_series_equal(res, s) - - s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX') - res = to_numeric(s) - tm.assert_series_equal(res, s) - - # bool is regarded as numeric - s = pd.Series([True, False, True, True], - index=list('ABCD'), name='XXX') - res = to_numeric(s) - tm.assert_series_equal(res, s) - - def test_error(self): - s = pd.Series([1, -3.14, 'apple']) - msg = 'Unable to parse string "apple" at position 2' - with tm.assertRaisesRegexp(ValueError, msg): - to_numeric(s, errors='raise') - - res = to_numeric(s, errors='ignore') - expected = pd.Series([1, -3.14, 'apple']) - tm.assert_series_equal(res, expected) - - res = to_numeric(s, errors='coerce') - expected = pd.Series([1, -3.14, np.nan]) - tm.assert_series_equal(res, expected) - 
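(Illustrative aside, not part of the patch.) The three errors= policies that test_error exercises, condensed into one snippet; the error message text is the one asserted in these tests:

    import pandas as pd

    s = pd.Series([1, -3.14, 'apple'])
    pd.to_numeric(s, errors='ignore')   # input returned unchanged
    pd.to_numeric(s, errors='coerce')   # 'apple' -> NaN, dtype float64
    try:
        pd.to_numeric(s, errors='raise')
    except ValueError as err:
        print(err)  # Unable to parse string "apple" at position 2
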
- s = pd.Series(['orange', 1, -3.14, 'apple']) - msg = 'Unable to parse string "orange" at position 0' - with tm.assertRaisesRegexp(ValueError, msg): - to_numeric(s, errors='raise') - - def test_error_seen_bool(self): - s = pd.Series([True, False, 'apple']) - msg = 'Unable to parse string "apple" at position 2' - with tm.assertRaisesRegexp(ValueError, msg): - to_numeric(s, errors='raise') - - res = to_numeric(s, errors='ignore') - expected = pd.Series([True, False, 'apple']) - tm.assert_series_equal(res, expected) - - # coerces to float - res = to_numeric(s, errors='coerce') - expected = pd.Series([1., 0., np.nan]) - tm.assert_series_equal(res, expected) - - def test_list(self): - s = ['1', '-3.14', '7'] - res = to_numeric(s) - expected = np.array([1, -3.14, 7]) - tm.assert_numpy_array_equal(res, expected) - - def test_list_numeric(self): - s = [1, 3, 4, 5] - res = to_numeric(s) - tm.assert_numpy_array_equal(res, np.array(s, dtype=np.int64)) - - s = [1., 3., 4., 5.] - res = to_numeric(s) - tm.assert_numpy_array_equal(res, np.array(s)) - - # bool is regarded as numeric - s = [True, False, True, True] - res = to_numeric(s) - tm.assert_numpy_array_equal(res, np.array(s)) - - def test_numeric(self): - s = pd.Series([1, -3.14, 7], dtype='O') - res = to_numeric(s) - expected = pd.Series([1, -3.14, 7]) - tm.assert_series_equal(res, expected) - - s = pd.Series([1, -3.14, 7]) - res = to_numeric(s) - tm.assert_series_equal(res, expected) - - # GH 14827 - df = pd.DataFrame(dict( - a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'], - b=[1.0, 2.0, 3.0, 4.0], - )) - expected = pd.DataFrame(dict( - a=[1.2, 3.14, np.inf, 0.1], - b=[1.0, 2.0, 3.0, 4.0], - )) - - # Test to_numeric over one column - df_copy = df.copy() - df_copy['a'] = df_copy['a'].apply(to_numeric) - tm.assert_frame_equal(df_copy, expected) - - # Test to_numeric over multiple columns - df_copy = df.copy() - df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric) - tm.assert_frame_equal(df_copy, expected) - - def test_numeric_lists_and_arrays(self): - # Test to_numeric with embedded lists and arrays - df = pd.DataFrame(dict( - a=[[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1] - )) - df['a'] = df['a'].apply(to_numeric) - expected = pd.DataFrame(dict( - a=[[3.14, 1.0], 1.6, 0.1], - )) - tm.assert_frame_equal(df, expected) - - df = pd.DataFrame(dict( - a=[np.array([decimal.Decimal(3.14), 1.0]), 0.1] - )) - df['a'] = df['a'].apply(to_numeric) - expected = pd.DataFrame(dict( - a=[[3.14, 1.0], 0.1], - )) - tm.assert_frame_equal(df, expected) - - def test_all_nan(self): - s = pd.Series(['a', 'b', 'c']) - res = to_numeric(s, errors='coerce') - expected = pd.Series([np.nan, np.nan, np.nan]) - tm.assert_series_equal(res, expected) - - def test_type_check(self): - # GH 11776 - df = pd.DataFrame({'a': [1, -3.14, 7], 'b': ['4', '5', '6']}) - with tm.assertRaisesRegexp(TypeError, "1-d array"): - to_numeric(df) - for errors in ['ignore', 'raise', 'coerce']: - with tm.assertRaisesRegexp(TypeError, "1-d array"): - to_numeric(df, errors=errors) - - def test_scalar(self): - self.assertEqual(pd.to_numeric(1), 1) - self.assertEqual(pd.to_numeric(1.1), 1.1) - - self.assertEqual(pd.to_numeric('1'), 1) - self.assertEqual(pd.to_numeric('1.1'), 1.1) - - with tm.assertRaises(ValueError): - to_numeric('XX', errors='raise') - - self.assertEqual(to_numeric('XX', errors='ignore'), 'XX') - self.assertTrue(np.isnan(to_numeric('XX', errors='coerce'))) - - def test_numeric_dtypes(self): - idx = pd.Index([1, 2, 3], name='xxx') - res = 
pd.to_numeric(idx) - tm.assert_index_equal(res, idx) - - res = pd.to_numeric(pd.Series(idx, name='xxx')) - tm.assert_series_equal(res, pd.Series(idx, name='xxx')) - - res = pd.to_numeric(idx.values) - tm.assert_numpy_array_equal(res, idx.values) - - idx = pd.Index([1., np.nan, 3., np.nan], name='xxx') - res = pd.to_numeric(idx) - tm.assert_index_equal(res, idx) - - res = pd.to_numeric(pd.Series(idx, name='xxx')) - tm.assert_series_equal(res, pd.Series(idx, name='xxx')) - - res = pd.to_numeric(idx.values) - tm.assert_numpy_array_equal(res, idx.values) - - def test_str(self): - idx = pd.Index(['1', '2', '3'], name='xxx') - exp = np.array([1, 2, 3], dtype='int64') - res = pd.to_numeric(idx) - tm.assert_index_equal(res, pd.Index(exp, name='xxx')) - - res = pd.to_numeric(pd.Series(idx, name='xxx')) - tm.assert_series_equal(res, pd.Series(exp, name='xxx')) - - res = pd.to_numeric(idx.values) - tm.assert_numpy_array_equal(res, exp) - - idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx') - exp = np.array([1.5, 2.7, 3.4]) - res = pd.to_numeric(idx) - tm.assert_index_equal(res, pd.Index(exp, name='xxx')) - - res = pd.to_numeric(pd.Series(idx, name='xxx')) - tm.assert_series_equal(res, pd.Series(exp, name='xxx')) - - res = pd.to_numeric(idx.values) - tm.assert_numpy_array_equal(res, exp) - - def test_datetimelike(self): - for tz in [None, 'US/Eastern', 'Asia/Tokyo']: - idx = pd.date_range('20130101', periods=3, tz=tz, name='xxx') - res = pd.to_numeric(idx) - tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) - - res = pd.to_numeric(pd.Series(idx, name='xxx')) - tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) - - res = pd.to_numeric(idx.values) - tm.assert_numpy_array_equal(res, idx.asi8) - - def test_timedelta(self): - idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx') - res = pd.to_numeric(idx) - tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) - - res = pd.to_numeric(pd.Series(idx, name='xxx')) - tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) - - res = pd.to_numeric(idx.values) - tm.assert_numpy_array_equal(res, idx.asi8) - - def test_period(self): - idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx') - res = pd.to_numeric(idx) - tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) - - # ToDo: enable when we can support native PeriodDtype - # res = pd.to_numeric(pd.Series(idx, name='xxx')) - # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) - - def test_non_hashable(self): - # Test for Bug #13324 - s = pd.Series([[10.0, 2], 1.0, 'apple']) - res = pd.to_numeric(s, errors='coerce') - tm.assert_series_equal(res, pd.Series([np.nan, 1.0, np.nan])) - - res = pd.to_numeric(s, errors='ignore') - tm.assert_series_equal(res, pd.Series([[10.0, 2], 1.0, 'apple'])) - - with self.assertRaisesRegexp(TypeError, "Invalid object type"): - pd.to_numeric(s) - - def test_downcast(self): - # see gh-13352 - mixed_data = ['1', 2, 3] - int_data = [1, 2, 3] - date_data = np.array(['1970-01-02', '1970-01-03', - '1970-01-04'], dtype='datetime64[D]') - - invalid_downcast = 'unsigned-integer' - msg = 'invalid downcasting method provided' - - smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) - smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) - - # support below np.float32 is rare and far between - float_32_char = np.dtype(np.float32).char - smallest_float_dtype = float_32_char - - for data in (mixed_data, int_data, date_data): - with self.assertRaisesRegexp(ValueError, msg): - pd.to_numeric(data, 
downcast=invalid_downcast) - - expected = np.array([1, 2, 3], dtype=np.int64) - - res = pd.to_numeric(data) - tm.assert_numpy_array_equal(res, expected) - - res = pd.to_numeric(data, downcast=None) - tm.assert_numpy_array_equal(res, expected) - - expected = np.array([1, 2, 3], dtype=smallest_int_dtype) - - for signed_downcast in ('integer', 'signed'): - res = pd.to_numeric(data, downcast=signed_downcast) - tm.assert_numpy_array_equal(res, expected) - - expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) - res = pd.to_numeric(data, downcast='unsigned') - tm.assert_numpy_array_equal(res, expected) - - expected = np.array([1, 2, 3], dtype=smallest_float_dtype) - res = pd.to_numeric(data, downcast='float') - tm.assert_numpy_array_equal(res, expected) - - # if we can't successfully cast the given - # data to a numeric dtype, do not bother - # with the downcast parameter - data = ['foo', 2, 3] - expected = np.array(data, dtype=object) - res = pd.to_numeric(data, errors='ignore', - downcast='unsigned') - tm.assert_numpy_array_equal(res, expected) - - # cannot cast to an unsigned integer because - # we have a negative number - data = ['-1', 2, 3] - expected = np.array([-1, 2, 3], dtype=np.int64) - res = pd.to_numeric(data, downcast='unsigned') - tm.assert_numpy_array_equal(res, expected) - - # cannot cast to an integer (signed or unsigned) - # because we have a float number - data = (['1.1', 2, 3], - [10000.0, 20000, 3000, 40000.36, 50000, 50000.00]) - expected = (np.array([1.1, 2, 3], dtype=np.float64), - np.array([10000.0, 20000, 3000, - 40000.36, 50000, 50000.00], dtype=np.float64)) - - for _data, _expected in zip(data, expected): - for downcast in ('integer', 'signed', 'unsigned'): - res = pd.to_numeric(_data, downcast=downcast) - tm.assert_numpy_array_equal(res, _expected) - - # the smallest integer dtype need not be np.(u)int8 - data = ['256', 257, 258] - - for downcast, expected_dtype in zip( - ['integer', 'signed', 'unsigned'], - [np.int16, np.int16, np.uint16]): - expected = np.array([256, 257, 258], dtype=expected_dtype) - res = pd.to_numeric(data, downcast=downcast) - tm.assert_numpy_array_equal(res, expected) - - def test_downcast_limits(self): - # Test the limits of each downcast. Bug: #14401. - # Check to make sure numpy is new enough to run this test. 
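(Illustrative aside, not part of the patch.) test_downcast_limits pins the boundary behavior: a value one past a dtype's max pushes the walk to the next size up. Two rows of its table, as a standalone check:

    import numpy as np
    import pandas as pd

    s = pd.Series([np.iinfo(np.int8).min, np.iinfo(np.int8).max])
    assert pd.to_numeric(s, downcast='integer').dtype == np.int8

    # one past int8's max no longer fits, so the walk lands on int16
    s = pd.Series([np.iinfo(np.int8).min, np.iinfo(np.int8).max + 1])
    assert pd.to_numeric(s, downcast='integer').dtype == np.int16
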
- if _np_version_under1p9: - pytest.skip("Numpy version is under 1.9") - - i = 'integer' - u = 'unsigned' - dtype_downcast_min_max = [ - ('int8', i, [iinfo(np.int8).min, iinfo(np.int8).max]), - ('int16', i, [iinfo(np.int16).min, iinfo(np.int16).max]), - ('int32', i, [iinfo(np.int32).min, iinfo(np.int32).max]), - ('int64', i, [iinfo(np.int64).min, iinfo(np.int64).max]), - ('uint8', u, [iinfo(np.uint8).min, iinfo(np.uint8).max]), - ('uint16', u, [iinfo(np.uint16).min, iinfo(np.uint16).max]), - ('uint32', u, [iinfo(np.uint32).min, iinfo(np.uint32).max]), - ('uint64', u, [iinfo(np.uint64).min, iinfo(np.uint64).max]), - ('int16', i, [iinfo(np.int8).min, iinfo(np.int8).max + 1]), - ('int32', i, [iinfo(np.int16).min, iinfo(np.int16).max + 1]), - ('int64', i, [iinfo(np.int32).min, iinfo(np.int32).max + 1]), - ('int16', i, [iinfo(np.int8).min - 1, iinfo(np.int16).max]), - ('int32', i, [iinfo(np.int16).min - 1, iinfo(np.int32).max]), - ('int64', i, [iinfo(np.int32).min - 1, iinfo(np.int64).max]), - ('uint16', u, [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]), - ('uint32', u, [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]), - ('uint64', u, [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]) - ] - - for dtype, downcast, min_max in dtype_downcast_min_max: - series = pd.to_numeric(pd.Series(min_max), downcast=downcast) - assert series.dtype == dtype diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py deleted file mode 100644 index d740d8bd26581..0000000000000 --- a/pandas/tests/test_generic.py +++ /dev/null @@ -1,2076 +0,0 @@ -# -*- coding: utf-8 -*- -# pylint: disable-msg=E1101,W0612 - -from operator import methodcaller -from copy import copy, deepcopy -from warnings import catch_warnings - -import pytest -import numpy as np -from numpy import nan -import pandas as pd - -from distutils.version import LooseVersion -from pandas.core.dtypes.common import is_scalar -from pandas import (Index, Series, DataFrame, Panel, isnull, - date_range, period_range, Panel4D) -from pandas.core.index import MultiIndex - -import pandas.io.formats.printing as printing - -from pandas.compat import range, zip, PY3 -from pandas import compat -from pandas.util.testing import (assertRaisesRegexp, - assert_series_equal, - assert_frame_equal, - assert_panel_equal, - assert_panel4d_equal, - assert_almost_equal) - -import pandas.util.testing as tm - - -# ---------------------------------------------------------------------- -# Generic types test cases - - -class Generic(object): - - def setUp(self): - pass - - @property - def _ndim(self): - return self._typ._AXIS_LEN - - def _axes(self): - """ return the axes for my object typ """ - return self._typ._AXIS_ORDERS - - def _construct(self, shape, value=None, dtype=None, **kwargs): - """ construct an object for the given shape - if value is specified use that if its a scalar - if value is an array, repeat it as needed """ - - if isinstance(shape, int): - shape = tuple([shape] * self._ndim) - if value is not None: - if is_scalar(value): - if value == 'empty': - arr = None - - # remove the info axis - kwargs.pop(self._typ._info_axis_name, None) - else: - arr = np.empty(shape, dtype=dtype) - arr.fill(value) - else: - fshape = np.prod(shape) - arr = value.ravel() - new_shape = fshape / arr.shape[0] - if fshape % arr.shape[0] != 0: - raise Exception("invalid value passed in _construct") - - arr = np.repeat(arr, new_shape).reshape(shape) - else: - arr = np.random.randn(*shape) - return self._typ(arr, dtype=dtype, **kwargs) - - def _compare(self, result, expected): - 
self._comparator(result, expected) - - def test_rename(self): - - # single axis - idx = list('ABCD') - # relabeling values passed into self.rename - args = [ - str.lower, - {x: x.lower() for x in idx}, - Series({x: x.lower() for x in idx}), - ] - - for axis in self._axes(): - kwargs = {axis: idx} - obj = self._construct(4, **kwargs) - - for arg in args: - # rename a single axis - result = obj.rename(**{axis: arg}) - expected = obj.copy() - setattr(expected, axis, list('abcd')) - self._compare(result, expected) - - # multiple axes at once - - def test_rename_axis(self): - idx = list('ABCD') - # relabeling values passed into self.rename - args = [ - str.lower, - {x: x.lower() for x in idx}, - Series({x: x.lower() for x in idx}), - ] - - for axis in self._axes(): - kwargs = {axis: idx} - obj = self._construct(4, **kwargs) - - for arg in args: - # rename a single axis - result = obj.rename_axis(arg, axis=axis) - expected = obj.copy() - setattr(expected, axis, list('abcd')) - self._compare(result, expected) - # scalar values - for arg in ['foo', None]: - result = obj.rename_axis(arg, axis=axis) - expected = obj.copy() - getattr(expected, axis).name = arg - self._compare(result, expected) - - def test_get_numeric_data(self): - - n = 4 - kwargs = {} - for i in range(self._ndim): - kwargs[self._typ._AXIS_NAMES[i]] = list(range(n)) - - # get the numeric data - o = self._construct(n, **kwargs) - result = o._get_numeric_data() - self._compare(result, o) - - # non-inclusion - result = o._get_bool_data() - expected = self._construct(n, value='empty', **kwargs) - self._compare(result, expected) - - # get the bool data - arr = np.array([True, True, False, True]) - o = self._construct(n, value=arr, **kwargs) - result = o._get_numeric_data() - self._compare(result, o) - - # _get_numeric_data is includes _get_bool_data, so can't test for - # non-inclusion - - def test_get_default(self): - - # GH 7725 - d0 = "a", "b", "c", "d" - d1 = np.arange(4, dtype='int64') - others = "e", 10 - - for data, index in ((d0, d1), (d1, d0)): - s = Series(data, index=index) - for i, d in zip(index, data): - self.assertEqual(s.get(i), d) - self.assertEqual(s.get(i, d), d) - self.assertEqual(s.get(i, "z"), d) - for other in others: - self.assertEqual(s.get(other, "z"), "z") - self.assertEqual(s.get(other, other), other) - - def test_nonzero(self): - - # GH 4633 - # look at the boolean/nonzero behavior for objects - obj = self._construct(shape=4) - self.assertRaises(ValueError, lambda: bool(obj == 0)) - self.assertRaises(ValueError, lambda: bool(obj == 1)) - self.assertRaises(ValueError, lambda: bool(obj)) - - obj = self._construct(shape=4, value=1) - self.assertRaises(ValueError, lambda: bool(obj == 0)) - self.assertRaises(ValueError, lambda: bool(obj == 1)) - self.assertRaises(ValueError, lambda: bool(obj)) - - obj = self._construct(shape=4, value=np.nan) - self.assertRaises(ValueError, lambda: bool(obj == 0)) - self.assertRaises(ValueError, lambda: bool(obj == 1)) - self.assertRaises(ValueError, lambda: bool(obj)) - - # empty - obj = self._construct(shape=0) - self.assertRaises(ValueError, lambda: bool(obj)) - - # invalid behaviors - - obj1 = self._construct(shape=4, value=1) - obj2 = self._construct(shape=4, value=1) - - def f(): - if obj1: - printing.pprint_thing("this works and shouldn't") - - self.assertRaises(ValueError, f) - self.assertRaises(ValueError, lambda: obj1 and obj2) - self.assertRaises(ValueError, lambda: obj1 or obj2) - self.assertRaises(ValueError, lambda: not obj1) - - def 
test_numpy_1_7_compat_numeric_methods(self): - # GH 4435 - # numpy in 1.7 tries to pass addtional arguments to pandas functions - - o = self._construct(shape=4) - for op in ['min', 'max', 'max', 'var', 'std', 'prod', 'sum', 'cumsum', - 'cumprod', 'median', 'skew', 'kurt', 'compound', 'cummax', - 'cummin', 'all', 'any']: - f = getattr(np, op, None) - if f is not None: - f(o) - - def test_downcast(self): - # test close downcasting - - o = self._construct(shape=4, value=9, dtype=np.int64) - result = o.copy() - result._data = o._data.downcast(dtypes='infer') - self._compare(result, o) - - o = self._construct(shape=4, value=9.) - expected = o.astype(np.int64) - result = o.copy() - result._data = o._data.downcast(dtypes='infer') - self._compare(result, expected) - - o = self._construct(shape=4, value=9.5) - result = o.copy() - result._data = o._data.downcast(dtypes='infer') - self._compare(result, o) - - # are close - o = self._construct(shape=4, value=9.000000000005) - result = o.copy() - result._data = o._data.downcast(dtypes='infer') - expected = o.astype(np.int64) - self._compare(result, expected) - - def test_constructor_compound_dtypes(self): - # GH 5191 - # compound dtypes should raise not-implementederror - - def f(dtype): - return self._construct(shape=3, dtype=dtype) - - self.assertRaises(NotImplementedError, f, [("A", "datetime64[h]"), - ("B", "str"), - ("C", "int32")]) - - # these work (though results may be unexpected) - f('int64') - f('float64') - f('M8[ns]') - - def check_metadata(self, x, y=None): - for m in x._metadata: - v = getattr(x, m, None) - if y is None: - self.assertIsNone(v) - else: - self.assertEqual(v, getattr(y, m, None)) - - def test_metadata_propagation(self): - # check that the metadata matches up on the resulting ops - - o = self._construct(shape=3) - o.name = 'foo' - o2 = self._construct(shape=3) - o2.name = 'bar' - - # TODO - # Once panel can do non-trivial combine operations - # (currently there is an a raise in the Panel arith_ops to prevent - # this, though it actually does work) - # can remove all of these try: except: blocks on the actual operations - - # ---------- - # preserving - # ---------- - - # simple ops with scalars - for op in ['__add__', '__sub__', '__truediv__', '__mul__']: - result = getattr(o, op)(1) - self.check_metadata(o, result) - - # ops with like - for op in ['__add__', '__sub__', '__truediv__', '__mul__']: - try: - result = getattr(o, op)(o) - self.check_metadata(o, result) - except (ValueError, AttributeError): - pass - - # simple boolean - for op in ['__eq__', '__le__', '__ge__']: - v1 = getattr(o, op)(o) - self.check_metadata(o, v1) - - try: - self.check_metadata(o, v1 & v1) - except (ValueError): - pass - - try: - self.check_metadata(o, v1 | v1) - except (ValueError): - pass - - # combine_first - try: - result = o.combine_first(o2) - self.check_metadata(o, result) - except (AttributeError): - pass - - # --------------------------- - # non-preserving (by default) - # --------------------------- - - # add non-like - try: - result = o + o2 - self.check_metadata(result) - except (ValueError, AttributeError): - pass - - # simple boolean - for op in ['__eq__', '__le__', '__ge__']: - - # this is a name matching op - v1 = getattr(o, op)(o) - - v2 = getattr(o, op)(o2) - self.check_metadata(v2) - - try: - self.check_metadata(v1 & v2) - except (ValueError): - pass - - try: - self.check_metadata(v1 | v2) - except (ValueError): - pass - - def test_head_tail(self): - # GH5370 - - o = self._construct(shape=10) - - # check all index types - for 
index in [tm.makeFloatIndex, tm.makeIntIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeDateIndex, - tm.makePeriodIndex]: - axis = o._get_axis_name(0) - setattr(o, axis, index(len(getattr(o, axis)))) - - # Panel + dims - try: - o.head() - except (NotImplementedError): - pytest.skip('not implemented on {0}'.format( - o.__class__.__name__)) - - self._compare(o.head(), o.iloc[:5]) - self._compare(o.tail(), o.iloc[-5:]) - - # 0-len - self._compare(o.head(0), o.iloc[0:0]) - self._compare(o.tail(0), o.iloc[0:0]) - - # bounded - self._compare(o.head(len(o) + 1), o) - self._compare(o.tail(len(o) + 1), o) - - # neg index - self._compare(o.head(-3), o.head(7)) - self._compare(o.tail(-3), o.tail(7)) - - def test_sample(self): - # Fixes issue: 2419 - - o = self._construct(shape=10) - - ### - # Check behavior of random_state argument - ### - - # Check for stability when receives seed or random state -- run 10 - # times. - for test in range(10): - seed = np.random.randint(0, 100) - self._compare( - o.sample(n=4, random_state=seed), o.sample(n=4, - random_state=seed)) - self._compare( - o.sample(frac=0.7, random_state=seed), o.sample( - frac=0.7, random_state=seed)) - - self._compare( - o.sample(n=4, random_state=np.random.RandomState(test)), - o.sample(n=4, random_state=np.random.RandomState(test))) - - self._compare( - o.sample(frac=0.7, random_state=np.random.RandomState(test)), - o.sample(frac=0.7, random_state=np.random.RandomState(test))) - - os1, os2 = [], [] - for _ in range(2): - np.random.seed(test) - os1.append(o.sample(n=4)) - os2.append(o.sample(frac=0.7)) - self._compare(*os1) - self._compare(*os2) - - # Check for error when random_state argument invalid. - with tm.assertRaises(ValueError): - o.sample(random_state='astring!') - - ### - # Check behavior of `frac` and `N` - ### - - # Giving both frac and N throws error - with tm.assertRaises(ValueError): - o.sample(n=3, frac=0.3) - - # Check that raises right error for negative lengths - with tm.assertRaises(ValueError): - o.sample(n=-3) - with tm.assertRaises(ValueError): - o.sample(frac=-0.3) - - # Make sure float values of `n` give error - with tm.assertRaises(ValueError): - o.sample(n=3.2) - - # Check lengths are right - self.assertTrue(len(o.sample(n=4) == 4)) - self.assertTrue(len(o.sample(frac=0.34) == 3)) - self.assertTrue(len(o.sample(frac=0.36) == 4)) - - ### - # Check weights - ### - - # Weight length must be right - with tm.assertRaises(ValueError): - o.sample(n=3, weights=[0, 1]) - - with tm.assertRaises(ValueError): - bad_weights = [0.5] * 11 - o.sample(n=3, weights=bad_weights) - - with tm.assertRaises(ValueError): - bad_weight_series = Series([0, 0, 0.2]) - o.sample(n=4, weights=bad_weight_series) - - # Check won't accept negative weights - with tm.assertRaises(ValueError): - bad_weights = [-0.1] * 10 - o.sample(n=3, weights=bad_weights) - - # Check inf and -inf throw errors: - with tm.assertRaises(ValueError): - weights_with_inf = [0.1] * 10 - weights_with_inf[0] = np.inf - o.sample(n=3, weights=weights_with_inf) - - with tm.assertRaises(ValueError): - weights_with_ninf = [0.1] * 10 - weights_with_ninf[0] = -np.inf - o.sample(n=3, weights=weights_with_ninf) - - # All zeros raises errors - zero_weights = [0] * 10 - with tm.assertRaises(ValueError): - o.sample(n=3, weights=zero_weights) - - # All missing weights - nan_weights = [np.nan] * 10 - with tm.assertRaises(ValueError): - o.sample(n=3, weights=nan_weights) - - # Check np.nan are replaced by zeros. 
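(Illustrative aside, not part of the patch.) The sampling assertions just below rely on NaN and None weights being treated as zero, so only the single nonzero position can ever be drawn. The same behavior as a standalone snippet:

    import numpy as np
    import pandas as pd

    s = pd.Series(range(10))
    weights = [np.nan] * 10
    weights[5] = 0.5
    assert s.sample(n=1, weights=weights).index[0] == 5
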
-        weights_with_nan = [np.nan] * 10
-        weights_with_nan[5] = 0.5
-        self._compare(
-            o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6])
-
-        # Check None are also replaced by zeros.
-        weights_with_None = [None] * 10
-        weights_with_None[5] = 0.5
-        self._compare(
-            o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6])
-
-    def test_size_compat(self):
-        # GH8846
-        # size property should be defined
-
-        o = self._construct(shape=10)
-        self.assertTrue(o.size == np.prod(o.shape))
-        self.assertTrue(o.size == 10 ** len(o.axes))
-
-    def test_split_compat(self):
-        # xref GH8846
-        o = self._construct(shape=10)
-        self.assertTrue(len(np.array_split(o, 5)) == 5)
-        self.assertTrue(len(np.array_split(o, 2)) == 2)
-
-    def test_unexpected_keyword(self):  # GH8597
-        df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe'])
-        ca = pd.Categorical([0, 0, 2, 2, 3, np.nan])
-        ts = df['joe'].copy()
-        ts[2] = np.nan
-
-        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
-            df.drop('joe', axis=1, in_place=True)
-
-        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
-            df.reindex([1, 0], inplace=True)
-
-        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
-            ca.fillna(0, inplace=True)
-
-        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
-            ts.fillna(0, in_place=True)
-
-    # See gh-12301
-    def test_stat_unexpected_keyword(self):
-        obj = self._construct(5)
-        starwars = 'Star Wars'
-        errmsg = 'unexpected keyword'
-
-        with assertRaisesRegexp(TypeError, errmsg):
-            obj.max(epic=starwars)  # stat_function
-        with assertRaisesRegexp(TypeError, errmsg):
-            obj.var(epic=starwars)  # stat_function_ddof
-        with assertRaisesRegexp(TypeError, errmsg):
-            obj.sum(epic=starwars)  # cum_function
-        with assertRaisesRegexp(TypeError, errmsg):
-            obj.any(epic=starwars)  # logical_function
-
-    def test_api_compat(self):
-
-        # GH 12021
-        # compat for __name__, __qualname__
-
-        obj = self._construct(5)
-        for func in ['sum', 'cumsum', 'any', 'var']:
-            f = getattr(obj, func)
-            self.assertEqual(f.__name__, func)
-            if PY3:
-                self.assertTrue(f.__qualname__.endswith(func))
-
-    def test_stat_non_defaults_args(self):
-        obj = self._construct(5)
-        out = np.array([0])
-        errmsg = "the 'out' parameter is not supported"
-
-        with assertRaisesRegexp(ValueError, errmsg):
-            obj.max(out=out)  # stat_function
-        with assertRaisesRegexp(ValueError, errmsg):
-            obj.var(out=out)  # stat_function_ddof
-        with assertRaisesRegexp(ValueError, errmsg):
-            obj.sum(out=out)  # cum_function
-        with assertRaisesRegexp(ValueError, errmsg):
-            obj.any(out=out)  # logical_function
-
-    def test_clip(self):
-        lower = 1
-        upper = 3
-        col = np.arange(5)
-
-        obj = self._construct(len(col), value=col)
-
-        if isinstance(obj, Panel):
-            msg = "clip is not supported yet for panels"
-            tm.assertRaisesRegexp(NotImplementedError, msg,
-                                  obj.clip, lower=lower,
-                                  upper=upper)
-
-        else:
-            out = obj.clip(lower=lower, upper=upper)
-            expected = self._construct(len(col), value=col
-                                       .clip(lower, upper))
-            self._compare(out, expected)
-
-            bad_axis = 'foo'
-            msg = ('No axis named {axis} '
-                   'for object').format(axis=bad_axis)
-            assertRaisesRegexp(ValueError, msg, obj.clip,
-                               lower=lower, upper=upper,
-                               axis=bad_axis)
-
-    def test_truncate_out_of_bounds(self):
-        # GH11382
-
-        # small
-        shape = [int(2e3)] + ([1] * (self._ndim - 1))
-        small = self._construct(shape, dtype='int8')
-        self._compare(small.truncate(), small)
-        self._compare(small.truncate(before=0, after=3e3), small)
-        self._compare(small.truncate(before=-1, after=2e3), small)
-
-        # big
-        shape = [int(2e6)] + ([1] * (self._ndim - 1))
-        big = self._construct(shape, dtype='int8')
-        self._compare(big.truncate(), big)
-        self._compare(big.truncate(before=0, after=3e6), big)
-        self._compare(big.truncate(before=-1, after=2e6), big)
-
-    def test_numpy_clip(self):
-        lower = 1
-        upper = 3
-        col = np.arange(5)
-
-        obj = self._construct(len(col), value=col)
-
-        if isinstance(obj, Panel):
-            msg = "clip is not supported yet for panels"
-            tm.assertRaisesRegexp(NotImplementedError, msg,
-                                  np.clip, obj,
-                                  lower, upper)
-        else:
-            out = np.clip(obj, lower, upper)
-            expected = self._construct(len(col), value=col
-                                       .clip(lower, upper))
-            self._compare(out, expected)
-
-            msg = "the 'out' parameter is not supported"
-            tm.assertRaisesRegexp(ValueError, msg,
-                                  np.clip, obj,
-                                  lower, upper, out=col)
-
-    def test_validate_bool_args(self):
-        df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
-        invalid_values = [1, "True", [1, 2, 3], 5.0]
-
-        for value in invalid_values:
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).rename_axis(mapper={'a': 'x', 'b': 'y'},
-                                                 axis=1, inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).drop('a', axis=1, inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).sort_index(inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df)._consolidate(inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).fillna(value=0, inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).replace(to_replace=1, value=7,
-                                             inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).interpolate(inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df)._where(cond=df.a > 2, inplace=value)
-
-            with self.assertRaises(ValueError):
-                super(DataFrame, df).mask(cond=df.a > 2, inplace=value)
-
-    def test_copy_and_deepcopy(self):
-        # GH 15444
-        for shape in [0, 1, 2]:
-            obj = self._construct(shape)
-            for func in [copy,
-                         deepcopy,
-                         lambda x: x.copy(deep=False),
-                         lambda x: x.copy(deep=True)]:
-                obj_copy = func(obj)
-                assert obj_copy is not obj
-                self._compare(obj_copy, obj)
-
-
-class TestSeries(tm.TestCase, Generic):
-    _typ = Series
-    _comparator = lambda self, x, y: assert_series_equal(x, y)
-
-    def setUp(self):
-        self.ts = tm.makeTimeSeries()  # Was at top level in test_series
-        self.ts.name = 'ts'
-
-        self.series = tm.makeStringSeries()
-        self.series.name = 'series'
-
-    def test_rename_mi(self):
-        s = Series([11, 21, 31],
-                   index=MultiIndex.from_tuples(
-                       [("A", x) for x in ["a", "B", "c"]]))
-        s.rename(str.lower)
-
-    def test_set_axis_name(self):
-        s = Series([1, 2, 3], index=['a', 'b', 'c'])
-        funcs = ['rename_axis', '_set_axis_name']
-        name = 'foo'
-        for func in funcs:
-            result = methodcaller(func, name)(s)
-            self.assertTrue(s.index.name is None)
-            self.assertEqual(result.index.name, name)
-
-    def test_set_axis_name_mi(self):
-        s = Series([11, 21, 31], index=MultiIndex.from_tuples(
-            [("A", x) for x in ["a", "B", "c"]],
-            names=['l1', 'l2'])
-        )
-        funcs = ['rename_axis', '_set_axis_name']
-        for func in funcs:
-            result = methodcaller(func, ['L1', 'L2'])(s)
-            self.assertTrue(s.index.name is None)
-            self.assertEqual(s.index.names, ['l1', 'l2'])
-            self.assertTrue(result.index.name is None)
-            self.assertTrue(result.index.names, ['L1', 'L2'])
-
-    def test_set_axis_name_raises(self):
-        s = pd.Series([1])
-        with tm.assertRaises(ValueError):
-            s._set_axis_name(name='a', axis=1)
-
-    def test_get_numeric_data_preserve_dtype(self):
-
-        # get the numeric data
-        o = Series([1, 2, 3])
-        result = o._get_numeric_data()
-        self._compare(result, o)
-
-        o = Series([1, '2', 3.])
-        result = o._get_numeric_data()
-        expected = Series([], dtype=object, index=pd.Index([], dtype=object))
-        self._compare(result, expected)
-
-        o = Series([True, False, True])
-        result = o._get_numeric_data()
-        self._compare(result, o)
-
-        o = Series([True, False, True])
-        result = o._get_bool_data()
-        self._compare(result, o)
-
-        o = Series(date_range('20130101', periods=3))
-        result = o._get_numeric_data()
-        expected = Series([], dtype='M8[ns]', index=pd.Index([], dtype=object))
-        self._compare(result, expected)
-
-    def test_nonzero_single_element(self):
-
-        # allow single item via bool method
-        s = Series([True])
-        self.assertTrue(s.bool())
-
-        s = Series([False])
-        self.assertFalse(s.bool())
-
-        # single item nan to raise
-        for s in [Series([np.nan]), Series([pd.NaT]), Series([True]),
-                  Series([False])]:
-            self.assertRaises(ValueError, lambda: bool(s))
-
-        for s in [Series([np.nan]), Series([pd.NaT])]:
-            self.assertRaises(ValueError, lambda: s.bool())
-
-        # multiple bool are still an error
-        for s in [Series([True, True]), Series([False, False])]:
-            self.assertRaises(ValueError, lambda: bool(s))
-            self.assertRaises(ValueError, lambda: s.bool())
-
-        # single non-bool are an error
-        for s in [Series([1]), Series([0]), Series(['a']), Series([0.0])]:
-            self.assertRaises(ValueError, lambda: bool(s))
-            self.assertRaises(ValueError, lambda: s.bool())
-
-    def test_metadata_propagation_indiv(self):
-        # check that the metadata matches up on the resulting ops
-
-        o = Series(range(3), range(3))
-        o.name = 'foo'
-        o2 = Series(range(3), range(3))
-        o2.name = 'bar'
-
-        result = o.T
-        self.check_metadata(o, result)
-
-        # resample
-        ts = Series(np.random.rand(1000),
-                    index=date_range('20130101', periods=1000, freq='s'),
-                    name='foo')
-        result = ts.resample('1T').mean()
-        self.check_metadata(ts, result)
-
-        result = ts.resample('1T').min()
-        self.check_metadata(ts, result)
-
-        result = ts.resample('1T').apply(lambda x: x.sum())
-        self.check_metadata(ts, result)
-
-        _metadata = Series._metadata
-        _finalize = Series.__finalize__
-        Series._metadata = ['name', 'filename']
-        o.filename = 'foo'
-        o2.filename = 'bar'
-
-        def finalize(self, other, method=None, **kwargs):
-            for name in self._metadata:
-                if method == 'concat' and name == 'filename':
-                    value = '+'.join([getattr(
-                        o, name) for o in other.objs if getattr(o, name, None)
-                    ])
-                    object.__setattr__(self, name, value)
-                else:
-                    object.__setattr__(self, name, getattr(other, name, None))
-
-            return self
-
-        Series.__finalize__ = finalize
-
-        result = pd.concat([o, o2])
-        self.assertEqual(result.filename, 'foo+bar')
-        self.assertIsNone(result.name)
-
-        # reset
-        Series._metadata = _metadata
-        Series.__finalize__ = _finalize
-
-    def test_describe(self):
-        self.series.describe()
-        self.ts.describe()
-
-    def test_describe_objects(self):
-        s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
-        result = s.describe()
-        expected = Series({'count': 7, 'unique': 4,
-                           'top': 'a', 'freq': 3, 'second': 'b',
-                           'second_freq': 2}, index=result.index)
-        assert_series_equal(result, expected)
-
-        dt = list(self.ts.index)
-        dt.append(dt[0])
-        ser = Series(dt)
-        rs = ser.describe()
-        min_date = min(dt)
-        max_date = max(dt)
-        xp = Series({'count': len(dt),
-                     'unique': len(self.ts.index),
-                     'first': min_date, 'last': max_date, 'freq': 2,
-                     'top': min_date}, index=rs.index)
-        assert_series_equal(rs, xp)
-
-    def test_describe_empty(self):
-        result = pd.Series().describe()
-
-        self.assertEqual(result['count'], 0)
-        self.assertTrue(result.drop('count').isnull().all())
-
-        nanSeries = Series([np.nan])
-        nanSeries.name = 'NaN'
-        result = nanSeries.describe()
-        self.assertEqual(result['count'], 0)
-        self.assertTrue(result.drop('count').isnull().all())
-
-    def test_describe_none(self):
-        noneSeries = Series([None])
-        noneSeries.name = 'None'
-        expected = Series([0, 0], index=['count', 'unique'], name='None')
-        assert_series_equal(noneSeries.describe(), expected)
-
-    def test_to_xarray(self):
-
-        tm._skip_if_no_xarray()
-        import xarray
-        from xarray import DataArray
-
-        s = Series([])
-        s.index.name = 'foo'
-        result = s.to_xarray()
-        self.assertEqual(len(result), 0)
-        self.assertEqual(len(result.coords), 1)
-        assert_almost_equal(list(result.coords.keys()), ['foo'])
-        self.assertIsInstance(result, DataArray)
-
-        def testit(index, check_index_type=True, check_categorical=True):
-            s = Series(range(6), index=index(6))
-            s.index.name = 'foo'
-            result = s.to_xarray()
-            repr(result)
-            self.assertEqual(len(result), 6)
-            self.assertEqual(len(result.coords), 1)
-            assert_almost_equal(list(result.coords.keys()), ['foo'])
-            self.assertIsInstance(result, DataArray)
-
-            # idempotency
-            assert_series_equal(result.to_series(), s,
-                                check_index_type=check_index_type,
-                                check_categorical=check_categorical)
-
-        l = [tm.makeFloatIndex, tm.makeIntIndex,
-             tm.makeStringIndex, tm.makeUnicodeIndex,
-             tm.makeDateIndex, tm.makePeriodIndex,
-             tm.makeTimedeltaIndex]
-
-        if LooseVersion(xarray.__version__) >= '0.8.0':
-            l.append(tm.makeCategoricalIndex)
-
-        for index in l:
-            testit(index)
-
-        s = Series(range(6))
-        s.index.name = 'foo'
-        s.index = pd.MultiIndex.from_product([['a', 'b'], range(3)],
-                                             names=['one', 'two'])
-        result = s.to_xarray()
-        self.assertEqual(len(result), 2)
-        assert_almost_equal(list(result.coords.keys()), ['one', 'two'])
-        self.assertIsInstance(result, DataArray)
-        assert_series_equal(result.to_series(), s)
-
-
-class TestDataFrame(tm.TestCase, Generic):
-    _typ = DataFrame
-    _comparator = lambda self, x, y: assert_frame_equal(x, y)
-
-    def test_rename_mi(self):
-        df = DataFrame([
-            11, 21, 31
-        ], index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]))
-        df.rename(str.lower)
-
-    def test_set_axis_name(self):
-        df = pd.DataFrame([[1, 2], [3, 4]])
-        funcs = ['_set_axis_name', 'rename_axis']
-        for func in funcs:
-            result = methodcaller(func, 'foo')(df)
-            self.assertTrue(df.index.name is None)
-            self.assertEqual(result.index.name, 'foo')
-
-            result = methodcaller(func, 'cols', axis=1)(df)
-            self.assertTrue(df.columns.name is None)
-            self.assertEqual(result.columns.name, 'cols')
-
-    def test_set_axis_name_mi(self):
-        df = DataFrame(
-            np.empty((3, 3)),
-            index=MultiIndex.from_tuples([("A", x) for x in list('aBc')]),
-            columns=MultiIndex.from_tuples([('C', x) for x in list('xyz')])
-        )
-
-        level_names = ['L1', 'L2']
-        funcs = ['_set_axis_name', 'rename_axis']
-        for func in funcs:
-            result = methodcaller(func, level_names)(df)
-            self.assertEqual(result.index.names, level_names)
-            self.assertEqual(result.columns.names, [None, None])
-
-            result = methodcaller(func, level_names, axis=1)(df)
-            self.assertEqual(result.columns.names, ["L1", "L2"])
-            self.assertEqual(result.index.names, [None, None])
-
-    def test_nonzero_single_element(self):
-
-        # allow single item via bool method
-        df = DataFrame([[True]])
-        self.assertTrue(df.bool())
-
-        df = DataFrame([[False]])
-        self.assertFalse(df.bool())
-
-        df = DataFrame([[False, False]])
-        self.assertRaises(ValueError, lambda: df.bool())
-        self.assertRaises(ValueError, lambda: bool(df))
-
-    def test_get_numeric_data_preserve_dtype(self):
-
-        # get the numeric data
-        o = DataFrame({'A': [1, '2', 3.]})
-        result = o._get_numeric_data()
-        expected = DataFrame(index=[0, 1, 2], dtype=object)
-        self._compare(result, expected)
-
-    def test_describe(self):
-        tm.makeDataFrame().describe()
-        tm.makeMixedDataFrame().describe()
-        tm.makeTimeDataFrame().describe()
-
-    def test_describe_percentiles_percent_or_raw(self):
-        msg = 'percentiles should all be in the interval \\[0, 1\\]'
-
-        df = tm.makeDataFrame()
-        with tm.assertRaisesRegexp(ValueError, msg):
-            df.describe(percentiles=[10, 50, 100])
-
-        with tm.assertRaisesRegexp(ValueError, msg):
-            df.describe(percentiles=[2])
-
-        with tm.assertRaisesRegexp(ValueError, msg):
-            df.describe(percentiles=[-2])
-
-    def test_describe_percentiles_equivalence(self):
-        df = tm.makeDataFrame()
-        d1 = df.describe()
-        d2 = df.describe(percentiles=[.25, .75])
-        assert_frame_equal(d1, d2)
-
-    def test_describe_percentiles_insert_median(self):
-        df = tm.makeDataFrame()
-        d1 = df.describe(percentiles=[.25, .75])
-        d2 = df.describe(percentiles=[.25, .5, .75])
-        assert_frame_equal(d1, d2)
-        self.assertTrue('25%' in d1.index)
-        self.assertTrue('75%' in d2.index)
-
-        # none above
-        d1 = df.describe(percentiles=[.25, .45])
-        d2 = df.describe(percentiles=[.25, .45, .5])
-        assert_frame_equal(d1, d2)
-        self.assertTrue('25%' in d1.index)
-        self.assertTrue('45%' in d2.index)
-
-        # none below
-        d1 = df.describe(percentiles=[.75, 1])
-        d2 = df.describe(percentiles=[.5, .75, 1])
-        assert_frame_equal(d1, d2)
-        self.assertTrue('75%' in d1.index)
-        self.assertTrue('100%' in d2.index)
-
-        # edge
-        d1 = df.describe(percentiles=[0, 1])
-        d2 = df.describe(percentiles=[0, .5, 1])
-        assert_frame_equal(d1, d2)
-        self.assertTrue('0%' in d1.index)
-        self.assertTrue('100%' in d2.index)
-
-    def test_describe_percentiles_insert_median_ndarray(self):
-        # GH14908
-        df = tm.makeDataFrame()
-        result = df.describe(percentiles=np.array([.25, .75]))
-        expected = df.describe(percentiles=[.25, .75])
-        assert_frame_equal(result, expected)
-
-    def test_describe_percentiles_unique(self):
-        # GH13104
-        df = tm.makeDataFrame()
-        with self.assertRaises(ValueError):
-            df.describe(percentiles=[0.1, 0.2, 0.4, 0.5, 0.2, 0.6])
-        with self.assertRaises(ValueError):
-            df.describe(percentiles=[0.1, 0.2, 0.4, 0.2, 0.6])
-
-    def test_describe_percentiles_formatting(self):
-        # GH13104
-        df = tm.makeDataFrame()
-
-        # default
-        result = df.describe().index
-        expected = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%',
-                          'max'],
-                         dtype='object')
-        tm.assert_index_equal(result, expected)
-
-        result = df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999,
-                                          0.9995, 0.9999]).index
-        expected = Index(['count', 'mean', 'std', 'min', '0.01%', '0.05%',
-                          '0.1%', '50%', '99.9%', '99.95%', '99.99%', 'max'],
-                         dtype='object')
-        tm.assert_index_equal(result, expected)
-
-        result = df.describe(percentiles=[0.00499, 0.005, 0.25, 0.50,
-                                          0.75]).index
-        expected = Index(['count', 'mean', 'std', 'min', '0.499%', '0.5%',
-                          '25%', '50%', '75%', 'max'],
-                         dtype='object')
-        tm.assert_index_equal(result, expected)
-
-        result = df.describe(percentiles=[0.00499, 0.01001, 0.25, 0.50,
-                                          0.75]).index
-        expected = Index(['count', 'mean', 'std', 'min', '0.5%', '1.0%',
-                          '25%', '50%', '75%', 'max'],
-                         dtype='object')
-        tm.assert_index_equal(result, expected)
-
-    def test_describe_column_index_type(self):
-        # GH13288
-        df = pd.DataFrame([1, 2, 3, 4])
-        df.columns = pd.Index([0], dtype=object)
-        result = df.describe().columns
-        expected = Index([0], dtype=object)
-        tm.assert_index_equal(result, expected)
-
-        df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]})
-        result = df.describe().columns
-        expected = Index([0], dtype=object)
-        tm.assert_index_equal(result, expected)
-
-    def test_describe_no_numeric(self):
-        df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
-                        'B': ['a', 'b', 'c', 'd'] * 6})
-        desc = df.describe()
-        expected = DataFrame(dict((k, v.describe())
-                                  for k, v in compat.iteritems(df)),
-                             columns=df.columns)
-        assert_frame_equal(desc, expected)
-
-        ts = tm.makeTimeSeries()
-        df = DataFrame({'time': ts.index})
-        desc = df.describe()
-        self.assertEqual(desc.time['first'], min(ts.index))
-
-    def test_describe_empty(self):
-        df = DataFrame()
-        tm.assertRaisesRegexp(ValueError, 'DataFrame without columns',
-                              df.describe)
-
-        df = DataFrame(columns=['A', 'B'])
-        result = df.describe()
-        expected = DataFrame(0, columns=['A', 'B'], index=['count', 'unique'])
-        tm.assert_frame_equal(result, expected)
-
-    def test_describe_empty_int_columns(self):
-        df = DataFrame([[0, 1], [1, 2]])
-        desc = df[df[0] < 0].describe()  # works
-        assert_series_equal(desc.xs('count'),
-                            Series([0, 0], dtype=float, name='count'))
-        self.assertTrue(isnull(desc.iloc[1:]).all().all())
-
-    def test_describe_objects(self):
-        df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']})
-        result = df.describe()
-        expected = DataFrame({"C1": [3, 2, 'a', 2], "C2": [3, 2, 'd', 2]},
-                             index=['count', 'unique', 'top', 'freq'])
-        assert_frame_equal(result, expected)
-
-        df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')
-                        })
-        df.loc[4] = pd.Timestamp('2010-01-04')
-        result = df.describe()
-        expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-04'), 2,
-                                     pd.Timestamp('2010-01-01'),
-                                     pd.Timestamp('2010-01-04')]},
-                             index=['count', 'unique', 'top', 'freq',
-                                    'first', 'last'])
-        assert_frame_equal(result, expected)
-
-        # mix time and str
-        df['C2'] = ['a', 'a', 'b', 'c', 'a']
-        result = df.describe()
-        expected['C2'] = [5, 3, 'a', 3, np.nan, np.nan]
-        assert_frame_equal(result, expected)
-
-        # just str
-        expected = DataFrame({'C2': [5, 3, 'a', 4]},
-                             index=['count', 'unique', 'top', 'freq'])
-        result = df[['C2']].describe()
-
-        # mix of time, str, numeric
-        df['C3'] = [2, 4, 6, 8, 2]
-        result = df.describe()
-        expected = DataFrame({"C3": [5., 4.4, 2.607681, 2., 2., 4., 6., 8.]},
-                             index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
-        assert_frame_equal(result, expected)
-        assert_frame_equal(df.describe(), df[['C3']].describe())
-
-        assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe())
-        assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe())
-
-    def test_describe_typefiltering(self):
-        df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
-                        'catB': ['a', 'b', 'c', 'd'] * 6,
-                        'numC': np.arange(24, dtype='int64'),
-                        'numD': np.arange(24.) + .5,
-                        'ts': tm.makeTimeSeries()[:24].index})
-
-        descN = df.describe()
-        expected_cols = ['numC', 'numD', ]
-        expected = DataFrame(dict((k, df[k].describe())
-                                  for k in expected_cols),
-                             columns=expected_cols)
-        assert_frame_equal(descN, expected)
-
-        desc = df.describe(include=['number'])
-        assert_frame_equal(desc, descN)
-        desc = df.describe(exclude=['object', 'datetime'])
-        assert_frame_equal(desc, descN)
-        desc = df.describe(include=['float'])
-        assert_frame_equal(desc, descN.drop('numC', 1))
-
-        descC = df.describe(include=['O'])
-        expected_cols = ['catA', 'catB']
-        expected = DataFrame(dict((k, df[k].describe())
-                                  for k in expected_cols),
-                             columns=expected_cols)
-        assert_frame_equal(descC, expected)
-
-        descD = df.describe(include=['datetime'])
-        assert_series_equal(descD.ts, df.ts.describe())
-
-        desc = df.describe(include=['object', 'number', 'datetime'])
-        assert_frame_equal(desc.loc[:, ["numC", "numD"]].dropna(), descN)
-        assert_frame_equal(desc.loc[:, ["catA", "catB"]].dropna(), descC)
-        descDs = descD.sort_index()  # the index order change for mixed-types
-        assert_frame_equal(desc.loc[:, "ts":].dropna().sort_index(), descDs)
-
-        desc = df.loc[:, 'catA':'catB'].describe(include='all')
-        assert_frame_equal(desc, descC)
-        desc = df.loc[:, 'numC':'numD'].describe(include='all')
-        assert_frame_equal(desc, descN)
-
-        desc = df.describe(percentiles=[], include='all')
-        cnt = Series(data=[4, 4, 6, 6, 6],
-                     index=['catA', 'catB', 'numC', 'numD', 'ts'])
-        assert_series_equal(desc.count(), cnt)
-        self.assertTrue('count' in desc.index)
-        self.assertTrue('unique' in desc.index)
-        self.assertTrue('50%' in desc.index)
-        self.assertTrue('first' in desc.index)
-
-        desc = df.drop("ts", 1).describe(percentiles=[], include='all')
-        assert_series_equal(desc.count(), cnt.drop("ts"))
-        self.assertTrue('first' not in desc.index)
-        desc = df.drop(["numC", "numD"], 1).describe(percentiles=[],
-                                                     include='all')
-        assert_series_equal(desc.count(), cnt.drop(["numC", "numD"]))
-        self.assertTrue('50%' not in desc.index)
-
-    def test_describe_typefiltering_category_bool(self):
-        df = DataFrame({'A_cat': pd.Categorical(['foo', 'foo', 'bar'] * 8),
-                        'B_str': ['a', 'b', 'c', 'd'] * 6,
-                        'C_bool': [True] * 12 + [False] * 12,
-                        'D_num': np.arange(24.) + .5,
-                        'E_ts': tm.makeTimeSeries()[:24].index})
-
-        desc = df.describe()
-        expected_cols = ['D_num']
-        expected = DataFrame(dict((k, df[k].describe())
-                                  for k in expected_cols),
-                             columns=expected_cols)
-        assert_frame_equal(desc, expected)
-
-        desc = df.describe(include=["category"])
-        self.assertTrue(desc.columns.tolist() == ["A_cat"])
-
-        # 'all' includes numpy-dtypes + category
-        desc1 = df.describe(include="all")
-        desc2 = df.describe(include=[np.generic, "category"])
-        assert_frame_equal(desc1, desc2)
-
-    def test_describe_timedelta(self):
-        df = DataFrame({"td": pd.to_timedelta(np.arange(24) % 20, "D")})
-        self.assertTrue(df.describe().loc["mean"][0] == pd.to_timedelta(
-            "8d4h"))
-
-    def test_describe_typefiltering_dupcol(self):
-        df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
-                        'catB': ['a', 'b', 'c', 'd'] * 6,
-                        'numC': np.arange(24),
-                        'numD': np.arange(24.) + .5,
-                        'ts': tm.makeTimeSeries()[:24].index})
-        s = df.describe(include='all').shape[1]
-        df = pd.concat([df, df], axis=1)
-        s2 = df.describe(include='all').shape[1]
-        self.assertTrue(s2 == 2 * s)
-
-    def test_describe_typefiltering_groupby(self):
-        df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
-                        'catB': ['a', 'b', 'c', 'd'] * 6,
-                        'numC': np.arange(24),
-                        'numD': np.arange(24.) + .5,
-                        'ts': tm.makeTimeSeries()[:24].index})
-        G = df.groupby('catA')
-        self.assertTrue(G.describe(include=['number']).shape == (2, 16))
-        self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
-                                                                           33))
-        self.assertTrue(G.describe(include='all').shape == (2, 52))
-
-    def test_describe_multi_index_df_column_names(self):
-        """ Test that column names persist after the describe operation."""
-
-        df = pd.DataFrame(
-            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
-             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
-             'C': np.random.randn(8),
-             'D': np.random.randn(8)})
-
-        # GH 11517
-        # test for hierarchical index
-        hierarchical_index_df = df.groupby(['A', 'B']).mean().T
-        self.assertTrue(hierarchical_index_df.columns.names == ['A', 'B'])
-        self.assertTrue(hierarchical_index_df.describe().columns.names ==
-                        ['A', 'B'])
-
-        # test for non-hierarchical index
-        non_hierarchical_index_df = df.groupby(['A']).mean().T
-        self.assertTrue(non_hierarchical_index_df.columns.names == ['A'])
-        self.assertTrue(non_hierarchical_index_df.describe().columns.names ==
-                        ['A'])
-
-    def test_metadata_propagation_indiv(self):
-
-        # groupby
-        df = DataFrame(
-            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
-             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
-             'C': np.random.randn(8),
-             'D': np.random.randn(8)})
-        result = df.groupby('A').sum()
-        self.check_metadata(df, result)
-
-        # resample
-        df = DataFrame(np.random.randn(1000, 2),
-                       index=date_range('20130101', periods=1000, freq='s'))
-        result = df.resample('1T')
-        self.check_metadata(df, result)
-
-        # merging with override
-        # GH 6923
-        _metadata = DataFrame._metadata
-        _finalize = DataFrame.__finalize__
-
-        np.random.seed(10)
-        df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b'])
-        df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd'])
-        DataFrame._metadata = ['filename']
-        df1.filename = 'fname1.csv'
-        df2.filename = 'fname2.csv'
-
-        def finalize(self, other, method=None, **kwargs):
-
-            for name in self._metadata:
-                if method == 'merge':
-                    left, right = other.left, other.right
-                    value = getattr(left, name, '') + '|' + getattr(right,
-                                                                    name, '')
-                    object.__setattr__(self, name, value)
-                else:
-                    object.__setattr__(self, name, getattr(other, name, ''))
-
-            return self
-
-        DataFrame.__finalize__ = finalize
-        result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner')
-        self.assertEqual(result.filename, 'fname1.csv|fname2.csv')
-
-        # concat
-        # GH 6927
-        DataFrame._metadata = ['filename']
-        df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list('ab'))
-        df1.filename = 'foo'
-
-        def finalize(self, other, method=None, **kwargs):
-            for name in self._metadata:
-                if method == 'concat':
-                    value = '+'.join([getattr(
-                        o, name) for o in other.objs if getattr(o, name, None)
-                    ])
-                    object.__setattr__(self, name, value)
-                else:
-                    object.__setattr__(self, name, getattr(other, name, None))
-
-            return self
-
-        DataFrame.__finalize__ = finalize
-
-        result = pd.concat([df1, df1])
-        self.assertEqual(result.filename, 'foo+foo')
-
-        # reset
-        DataFrame._metadata = _metadata
-        DataFrame.__finalize__ = _finalize
-
-    def test_tz_convert_and_localize(self):
-        l0 = date_range('20140701', periods=5, freq='D')
-
-        # TODO: l1 should be a PeriodIndex for testing
-        #       after GH2106 is addressed
-        with tm.assertRaises(NotImplementedError):
-            period_range('20140701', periods=1).tz_convert('UTC')
-        with tm.assertRaises(NotImplementedError):
-            period_range('20140701', periods=1).tz_localize('UTC')
-        # l1 = period_range('20140701', periods=5, freq='D')
-        l1 = date_range('20140701', periods=5, freq='D')
-
-        int_idx = Index(range(5))
-
-        for fn in ['tz_localize', 'tz_convert']:
-
-            if fn == 'tz_convert':
-                l0 = l0.tz_localize('UTC')
-                l1 = l1.tz_localize('UTC')
-
-            for idx in [l0, l1]:
-
-                l0_expected = getattr(idx, fn)('US/Pacific')
-                l1_expected = getattr(idx, fn)('US/Pacific')
-
-                df1 = DataFrame(np.ones(5), index=l0)
-                df1 = getattr(df1, fn)('US/Pacific')
-                self.assert_index_equal(df1.index, l0_expected)
-
-                # MultiIndex
-                # GH7846
-                df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
-
-                df3 = getattr(df2, fn)('US/Pacific', level=0)
-                self.assertFalse(df3.index.levels[0].equals(l0))
-                self.assert_index_equal(df3.index.levels[0], l0_expected)
-                self.assert_index_equal(df3.index.levels[1], l1)
-                self.assertFalse(df3.index.levels[1].equals(l1_expected))
-
-                df3 = getattr(df2, fn)('US/Pacific', level=1)
-                self.assert_index_equal(df3.index.levels[0], l0)
-                self.assertFalse(df3.index.levels[0].equals(l0_expected))
-                self.assert_index_equal(df3.index.levels[1], l1_expected)
-                self.assertFalse(df3.index.levels[1].equals(l1))
-
-                df4 = DataFrame(np.ones(5),
-                                MultiIndex.from_arrays([int_idx, l0]))
-
-                # TODO: untested
-                df5 = getattr(df4, fn)('US/Pacific', level=1)  # noqa
-
-                self.assert_index_equal(df3.index.levels[0], l0)
-                self.assertFalse(df3.index.levels[0].equals(l0_expected))
-                self.assert_index_equal(df3.index.levels[1], l1_expected)
-                self.assertFalse(df3.index.levels[1].equals(l1))
-
-        # Bad Inputs
-        for fn in ['tz_localize', 'tz_convert']:
-            # Not DatetimeIndex / PeriodIndex
-            with tm.assertRaisesRegexp(TypeError, 'DatetimeIndex'):
-                df = DataFrame(index=int_idx)
-                df = getattr(df, fn)('US/Pacific')
-
-            # Not DatetimeIndex / PeriodIndex
-            with tm.assertRaisesRegexp(TypeError, 'DatetimeIndex'):
-                df = DataFrame(np.ones(5),
-                               MultiIndex.from_arrays([int_idx, l0]))
-                df = getattr(df, fn)('US/Pacific', level=0)
-
-            # Invalid level
-            with tm.assertRaisesRegexp(ValueError, 'not valid'):
-                df = DataFrame(index=l0)
-                df = getattr(df, fn)('US/Pacific', level=1)
-
-    def test_set_attribute(self):
-        # Test for consistent setattr behavior when an attribute and a column
-        # have the same name (Issue #8994)
-        df = DataFrame({'x': [1, 2, 3]})
-
-        df.y = 2
-        df['y'] = [2, 4, 6]
-        df.y = 5
-
-        self.assertEqual(df.y, 5)
-        assert_series_equal(df['y'], Series([2, 4, 6], name='y'))
-
-    def test_pct_change(self):
-        # GH 11150
-        pnl = DataFrame([np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(
-            0, 40, 10)]).astype(np.float64)
-        pnl.iat[1, 0] = np.nan
-        pnl.iat[1, 1] = np.nan
-        pnl.iat[2, 3] = 60
-
-        mask = pnl.isnull()
-
-        for axis in range(2):
-            expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(
-                axis=axis) - 1
-            expected[mask] = np.nan
-            result = pnl.pct_change(axis=axis, fill_method='pad')
-
-            self.assert_frame_equal(result, expected)
-
-    def test_to_xarray(self):
-
-        tm._skip_if_no_xarray()
-        from xarray import Dataset
-
-        df = DataFrame({'a': list('abc'),
-                        'b': list(range(1, 4)),
-                        'c': np.arange(3, 6).astype('u1'),
-                        'd': np.arange(4.0, 7.0, dtype='float64'),
-                        'e': [True, False, True],
-                        'f': pd.Categorical(list('abc')),
-                        'g': pd.date_range('20130101', periods=3),
-                        'h': pd.date_range('20130101',
-                                           periods=3,
-                                           tz='US/Eastern')}
-                       )
-
-        df.index.name = 'foo'
-        result = df[0:0].to_xarray()
-        self.assertEqual(result.dims['foo'], 0)
-        self.assertIsInstance(result, Dataset)
-
-        for index in [tm.makeFloatIndex, tm.makeIntIndex,
-                      tm.makeStringIndex, tm.makeUnicodeIndex,
-                      tm.makeDateIndex, tm.makePeriodIndex,
-                      tm.makeCategoricalIndex, tm.makeTimedeltaIndex]:
-            df.index = index(3)
-            df.index.name = 'foo'
-            df.columns.name = 'bar'
-            result = df.to_xarray()
-            self.assertEqual(result.dims['foo'], 3)
-            self.assertEqual(len(result.coords), 1)
-            self.assertEqual(len(result.data_vars), 8)
-            assert_almost_equal(list(result.coords.keys()), ['foo'])
-            self.assertIsInstance(result, Dataset)
-
-            # idempotency
-            # categoricals are not preserved
-            # datetimes w/tz are not preserved
-            # column names are lost
-            expected = df.copy()
-            expected['f'] = expected['f'].astype(object)
-            expected['h'] = expected['h'].astype('datetime64[ns]')
-            expected.columns.name = None
-            assert_frame_equal(result.to_dataframe(), expected,
-                               check_index_type=False, check_categorical=False)
-
-        # available in 0.7.1
-        # MultiIndex
-        df.index = pd.MultiIndex.from_product([['a'], range(3)],
-                                              names=['one', 'two'])
-        result = df.to_xarray()
-        self.assertEqual(result.dims['one'], 1)
-        self.assertEqual(result.dims['two'], 3)
-        self.assertEqual(len(result.coords), 2)
-        self.assertEqual(len(result.data_vars), 8)
-        assert_almost_equal(list(result.coords.keys()), ['one', 'two'])
-        self.assertIsInstance(result, Dataset)
-
-        result = result.to_dataframe()
-        expected = df.copy()
-        expected['f'] = expected['f'].astype(object)
-        expected['h'] = expected['h'].astype('datetime64[ns]')
-        expected.columns.name = None
-        assert_frame_equal(result,
-                           expected,
-                           check_index_type=False)
-
-    def test_deepcopy_empty(self):
-        # This test covers empty frame copying with non-empty column sets
-        # as reported in issue GH15370
-        empty_frame = DataFrame(data=[], index=[], columns=['A'])
-        empty_frame_copy = deepcopy(empty_frame)
-
-        self._compare(empty_frame_copy, empty_frame)
-
-
-class TestPanel(tm.TestCase, Generic):
-    _typ = Panel
-    _comparator = lambda self, x, y: assert_panel_equal(x, y, by_blocks=True)
-
-    def test_to_xarray(self):
-
-        tm._skip_if_no_xarray()
-        from xarray import DataArray
-
-        with catch_warnings(record=True):
-            p = tm.makePanel()
-
-            result = p.to_xarray()
-            self.assertIsInstance(result, DataArray)
-            self.assertEqual(len(result.coords), 3)
-            assert_almost_equal(list(result.coords.keys()),
-                                ['items', 'major_axis', 'minor_axis'])
-            self.assertEqual(len(result.dims), 3)
-
-            # idempotency
-            assert_panel_equal(result.to_pandas(), p)
-
-
-class TestPanel4D(tm.TestCase, Generic):
-    _typ = Panel4D
-    _comparator = lambda self, x, y: assert_panel4d_equal(x, y, by_blocks=True)
-
-    def test_sample(self):
-        pytest.skip("sample on Panel4D")
-
-    def test_to_xarray(self):
-
-        tm._skip_if_no_xarray()
-        from xarray import DataArray
-
-        with catch_warnings(record=True):
-            p = tm.makePanel4D()
-
-            result = p.to_xarray()
-            self.assertIsInstance(result, DataArray)
-            self.assertEqual(len(result.coords), 4)
-            assert_almost_equal(list(result.coords.keys()),
-                                ['labels', 'items', 'major_axis',
-                                 'minor_axis'])
-            self.assertEqual(len(result.dims), 4)
-
-            # non-convertible
-            self.assertRaises(ValueError, lambda: result.to_pandas())
-
-
-# run all the tests, but wrap each in a warning catcher
-for t in ['test_rename', 'test_rename_axis', 'test_get_numeric_data',
-          'test_get_default', 'test_nonzero',
-          'test_numpy_1_7_compat_numeric_methods',
-          'test_downcast', 'test_constructor_compound_dtypes',
-          'test_head_tail',
-          'test_size_compat', 'test_split_compat',
-          'test_unexpected_keyword',
-          'test_stat_unexpected_keyword', 'test_api_compat',
-          'test_stat_non_defaults_args',
-          'test_clip', 'test_truncate_out_of_bounds', 'test_numpy_clip',
-          'test_metadata_propagation', 'test_copy_and_deepcopy',
-          'test_sample']:
-
-    def f():
-        def tester(self):
-            with catch_warnings(record=True):
-                return getattr(super(TestPanel, self), t)()
-        return tester
-
-    setattr(TestPanel, t, f())
-
-    def f():
-        def tester(self):
-            with catch_warnings(record=True):
-                return getattr(super(TestPanel4D, self), t)()
-        return tester
-
-    setattr(TestPanel4D, t, f())
-
-
-class TestNDFrame(tm.TestCase):
-    # tests that don't fit elsewhere
-
-    def test_sample(sel):
-        # Fixes issue: 2419
-        # additional specific object based tests
-
-        # A few dataframe test with degenerate weights.
-        easy_weight_list = [0] * 10
-        easy_weight_list[5] = 1
-
-        df = pd.DataFrame({'col1': range(10, 20),
-                           'col2': range(20, 30),
-                           'colString': ['a'] * 10,
-                           'easyweights': easy_weight_list})
-        sample1 = df.sample(n=1, weights='easyweights')
-        assert_frame_equal(sample1, df.iloc[5:6])
-
-        # Ensure proper error if string given as weight for Series, panel, or
-        # DataFrame with axis = 1.
-        s = Series(range(10))
-        with tm.assertRaises(ValueError):
-            s.sample(n=3, weights='weight_column')
-
-        with catch_warnings(record=True):
-            panel = Panel(items=[0, 1, 2], major_axis=[2, 3, 4],
-                          minor_axis=[3, 4, 5])
-            with tm.assertRaises(ValueError):
-                panel.sample(n=1, weights='weight_column')
-
-        with tm.assertRaises(ValueError):
-            df.sample(n=1, weights='weight_column', axis=1)
-
-        # Check weighting key error
-        with tm.assertRaises(KeyError):
-            df.sample(n=3, weights='not_a_real_column_name')
-
-        # Check that re-normalizes weights that don't sum to one.
-        weights_less_than_1 = [0] * 10
-        weights_less_than_1[0] = 0.5
-        tm.assert_frame_equal(
-            df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])
-
-        ###
-        # Test axis argument
-        ###
-
-        # Test axis argument
-        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
-        second_column_weight = [0, 1]
-        assert_frame_equal(
-            df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])
-
-        # Different axis arg types
-        assert_frame_equal(df.sample(n=1, axis='columns',
-                                     weights=second_column_weight),
-                           df[['col2']])
-
-        weight = [0] * 10
-        weight[5] = 0.5
-        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
-                           df.iloc[5:6])
-        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
-                           df.iloc[5:6])
-
-        # Check out of range axis values
-        with tm.assertRaises(ValueError):
-            df.sample(n=1, axis=2)
-
-        with tm.assertRaises(ValueError):
-            df.sample(n=1, axis='not_a_name')
-
-        with tm.assertRaises(ValueError):
-            s = pd.Series(range(10))
-            s.sample(n=1, axis=1)
-
-        # Test weight length compared to correct axis
-        with tm.assertRaises(ValueError):
-            df.sample(n=1, axis=1, weights=[0.5] * 10)
-
-        # Check weights with axis = 1
-        easy_weight_list = [0] * 3
-        easy_weight_list[2] = 1
-
-        df = pd.DataFrame({'col1': range(10, 20),
-                           'col2': range(20, 30),
-                           'colString': ['a'] * 10})
-        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
-        assert_frame_equal(sample1, df[['colString']])
-
-        # Test default axes
-        with catch_warnings(record=True):
-            p = Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6],
-                      minor_axis=[1, 3, 5])
-            assert_panel_equal(
-                p.sample(n=3, random_state=42), p.sample(n=3, axis=1,
-                                                         random_state=42))
-            assert_frame_equal(
-                df.sample(n=3, random_state=42), df.sample(n=3, axis=0,
-                                                           random_state=42))
-
-        # Test that function aligns weights with frame
-        df = DataFrame(
-            {'col1': [5, 6, 7],
-             'col2': ['a', 'b', 'c'], }, index=[9, 5, 3])
-        s = Series([1, 0, 0], index=[3, 5, 9])
-        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))
-
-        # Weights have index values to be dropped because not in
-        # sampled DataFrame
-        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
-        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))
-
-        # Weights have empty values to be filed with zeros
-        s3 = Series([0.01, 0], index=[3, 5])
-        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))
-
-        # No overlap in weight and sampled DataFrame indices
-        s4 = Series([1, 0], index=[1, 2])
-        with tm.assertRaises(ValueError):
-            df.sample(1, weights=s4)
-
-    def test_squeeze(self):
-        # noop
-        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
-                  tm.makeObjectSeries()]:
-            tm.assert_series_equal(s.squeeze(), s)
-        for df in [tm.makeTimeDataFrame()]:
-            tm.assert_frame_equal(df.squeeze(), df)
-        with catch_warnings(record=True):
-            for p in [tm.makePanel()]:
-                tm.assert_panel_equal(p.squeeze(), p)
-        with catch_warnings(record=True):
-            for p4d in [tm.makePanel4D()]:
-                tm.assert_panel4d_equal(p4d.squeeze(), p4d)
-
-        # squeezing
-        df = tm.makeTimeDataFrame().reindex(columns=['A'])
-        tm.assert_series_equal(df.squeeze(), df['A'])
-
-        with catch_warnings(record=True):
-            p = tm.makePanel().reindex(items=['ItemA'])
-            tm.assert_frame_equal(p.squeeze(), p['ItemA'])
-
-            p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A'])
-            tm.assert_series_equal(p.squeeze(), p.loc['ItemA', :, 'A'])
-
-        with catch_warnings(record=True):
-            p4d = tm.makePanel4D().reindex(labels=['label1'])
-            tm.assert_panel_equal(p4d.squeeze(), p4d['label1'])
-
-        with catch_warnings(record=True):
-            p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA'])
-            tm.assert_frame_equal(p4d.squeeze(), p4d.loc['label1', 'ItemA'])
-
-        # don't fail with 0 length dimensions GH11229 & GH8999
-        empty_series = Series([], name='five')
-        empty_frame = DataFrame([empty_series])
-        with catch_warnings(record=True):
-            empty_panel = Panel({'six': empty_frame})
-
-        [tm.assert_series_equal(empty_series, higher_dim.squeeze())
-         for higher_dim in [empty_series, empty_frame, empty_panel]]
-
-        # axis argument
-        df = tm.makeTimeDataFrame(nper=1).iloc[:, :1]
-        assert df.shape == (1, 1)
-        tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0])
-        tm.assert_series_equal(df.squeeze(axis='index'), df.iloc[0])
-        tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0])
-        tm.assert_series_equal(df.squeeze(axis='columns'), df.iloc[:, 0])
-        assert df.squeeze() == df.iloc[0, 0]
-        tm.assertRaises(ValueError, df.squeeze, axis=2)
-        tm.assertRaises(ValueError, df.squeeze, axis='x')
-
-        df = tm.makeTimeDataFrame(3)
-        tm.assert_frame_equal(df.squeeze(axis=0), df)
-
-    def test_numpy_squeeze(self):
-        s = tm.makeFloatSeries()
-        tm.assert_series_equal(np.squeeze(s), s)
-
-        df = tm.makeTimeDataFrame().reindex(columns=['A'])
-        tm.assert_series_equal(np.squeeze(df), df['A'])
-
-    def test_transpose(self):
-        msg = (r"transpose\(\) got multiple values for "
-               r"keyword argument 'axes'")
-        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
-                  tm.makeObjectSeries()]:
-            # calls implementation in pandas/core/base.py
-            tm.assert_series_equal(s.transpose(), s)
-        for df in [tm.makeTimeDataFrame()]:
-            tm.assert_frame_equal(df.transpose().transpose(), df)
-
-        with catch_warnings(record=True):
-            for p in [tm.makePanel()]:
-                tm.assert_panel_equal(p.transpose(2, 0, 1)
-                                      .transpose(1, 2, 0), p)
-                tm.assertRaisesRegexp(TypeError, msg, p.transpose,
-                                      2, 0, 1, axes=(2, 0, 1))
-
-        with catch_warnings(record=True):
-            for p4d in [tm.makePanel4D()]:
-                tm.assert_panel4d_equal(p4d.transpose(2, 0, 3, 1)
-                                        .transpose(1, 3, 0, 2), p4d)
-                tm.assertRaisesRegexp(TypeError, msg, p4d.transpose,
-                                      2, 0, 3, 1, axes=(2, 0, 3, 1))
-
-    def test_numpy_transpose(self):
-        msg = "the 'axes' parameter is not supported"
-
-        s = tm.makeFloatSeries()
-        tm.assert_series_equal(
-            np.transpose(s), s)
-        tm.assertRaisesRegexp(ValueError, msg,
-                              np.transpose, s, axes=1)
-
-        df = tm.makeTimeDataFrame()
-        tm.assert_frame_equal(np.transpose(
-            np.transpose(df)), df)
-        tm.assertRaisesRegexp(ValueError, msg,
-                              np.transpose, df, axes=1)
-
-        with catch_warnings(record=True):
-            p = tm.makePanel()
-            tm.assert_panel_equal(np.transpose(
-                np.transpose(p, axes=(2, 0, 1)),
-                axes=(1, 2, 0)), p)
-
-        with catch_warnings(record=True):
-            p4d = tm.makePanel4D()
-            tm.assert_panel4d_equal(np.transpose(
-                np.transpose(p4d, axes=(2, 0, 3, 1)),
-                axes=(1, 3, 0, 2)), p4d)
-
-    def test_take(self):
-        indices = [1, 5, -2, 6, 3, -1]
-        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
-                  tm.makeObjectSeries()]:
-            out = s.take(indices)
-            expected = Series(data=s.values.take(indices),
-                              index=s.index.take(indices), dtype=s.dtype)
-            tm.assert_series_equal(out, expected)
-        for df in [tm.makeTimeDataFrame()]:
-            out = df.take(indices)
-            expected = DataFrame(data=df.values.take(indices, axis=0),
-                                 index=df.index.take(indices),
-                                 columns=df.columns)
-            tm.assert_frame_equal(out, expected)
-
-        indices = [-3, 2, 0, 1]
-        with catch_warnings(record=True):
-            for p in [tm.makePanel()]:
-                out = p.take(indices)
-                expected = Panel(data=p.values.take(indices, axis=0),
-                                 items=p.items.take(indices),
-                                 major_axis=p.major_axis,
-                                 minor_axis=p.minor_axis)
-                tm.assert_panel_equal(out, expected)
-
-        with catch_warnings(record=True):
-            for p4d in [tm.makePanel4D()]:
-                out = p4d.take(indices)
-                expected = Panel4D(data=p4d.values.take(indices, axis=0),
-                                   labels=p4d.labels.take(indices),
-                                   major_axis=p4d.major_axis,
-                                   minor_axis=p4d.minor_axis,
-                                   items=p4d.items)
-                tm.assert_panel4d_equal(out, expected)
-
-    def test_take_invalid_kwargs(self):
-        indices = [-3, 2, 0, 1]
-        s = tm.makeFloatSeries()
-        df = tm.makeTimeDataFrame()
-
-        with catch_warnings(record=True):
-            p = tm.makePanel()
-            p4d = tm.makePanel4D()
-
-        for obj in (s, df, p, p4d):
-            msg = r"take\(\) got an unexpected keyword argument 'foo'"
-            tm.assertRaisesRegexp(TypeError, msg, obj.take,
-                                  indices, foo=2)
-
-            msg = "the 'out' parameter is not supported"
-            tm.assertRaisesRegexp(ValueError, msg, obj.take,
-                                  indices, out=indices)
-
-            msg = "the 'mode' parameter is not supported"
-            tm.assertRaisesRegexp(ValueError, msg, obj.take,
-                                  indices, mode='clip')
-
-    def test_equals(self):
-        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
-        s2 = s1.copy()
-        self.assertTrue(s1.equals(s2))
-
-        s1[1] = 99
-        self.assertFalse(s1.equals(s2))
-
-        # NaNs compare as equal
-        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
-        s2 = s1.copy()
-        self.assertTrue(s1.equals(s2))
-
-        s2[0] = 9.9
-        self.assertFalse(s1.equals(s2))
-
-        idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
-        s1 = Series([1, 2, np.nan], index=idx)
-        s2 = s1.copy()
-        self.assertTrue(s1.equals(s2))
-
-        # Add object dtype column with nans
-        index = np.random.random(10)
-        df1 = DataFrame(
-            np.random.random(10, ), index=index, columns=['floats'])
-        df1['text'] = 'the sky is so blue. we could use more chocolate.'.split(
-        )
-        df1['start'] = date_range('2000-1-1', periods=10, freq='T')
-        df1['end'] = date_range('2000-1-1', periods=10, freq='D')
-        df1['diff'] = df1['end'] - df1['start']
-        df1['bool'] = (np.arange(10) % 3 == 0)
-        df1.loc[::2] = nan
-        df2 = df1.copy()
-        self.assertTrue(df1['text'].equals(df2['text']))
-        self.assertTrue(df1['start'].equals(df2['start']))
-        self.assertTrue(df1['end'].equals(df2['end']))
-        self.assertTrue(df1['diff'].equals(df2['diff']))
-        self.assertTrue(df1['bool'].equals(df2['bool']))
-        self.assertTrue(df1.equals(df2))
-        self.assertFalse(df1.equals(object))
-
-        # different dtype
-        different = df1.copy()
-        different['floats'] = different['floats'].astype('float32')
-        self.assertFalse(df1.equals(different))
-
-        # different index
-        different_index = -index
-        different = df2.set_index(different_index)
-        self.assertFalse(df1.equals(different))
-
-        # different columns
-        different = df2.copy()
-        different.columns = df2.columns[::-1]
-        self.assertFalse(df1.equals(different))
-
-        # DatetimeIndex
-        index = pd.date_range('2000-1-1', periods=10, freq='T')
-        df1 = df1.set_index(index)
-        df2 = df1.copy()
-        self.assertTrue(df1.equals(df2))
-
-        # MultiIndex
-        df3 = df1.set_index(['text'], append=True)
-        df2 = df1.set_index(['text'], append=True)
-        self.assertTrue(df3.equals(df2))
-
-        df2 = df1.set_index(['floats'], append=True)
-        self.assertFalse(df3.equals(df2))
-
-        # NaN in index
-        df3 = df1.set_index(['floats'], append=True)
-        df2 = df1.set_index(['floats'], append=True)
-        self.assertTrue(df3.equals(df2))
-
-        # GH 8437
-        a = pd.Series([False, np.nan])
-        b = pd.Series([False, np.nan])
-        c = pd.Series(index=range(2))
-        d = pd.Series(index=range(2))
-        e = pd.Series(index=range(2))
-        f = pd.Series(index=range(2))
-        c[:-1] = d[:-1] = e[0] = f[0] = False
-        self.assertTrue(a.equals(a))
-        self.assertTrue(a.equals(b))
-        self.assertTrue(a.equals(c))
-        self.assertTrue(a.equals(d))
-        self.assertFalse(a.equals(e))
-        self.assertTrue(e.equals(f))
-
-    def test_describe_raises(self):
-        with catch_warnings(record=True):
-            with tm.assertRaises(NotImplementedError):
-                tm.makePanel().describe()
-
-    def test_pipe(self):
-        df = DataFrame({'A': [1, 2, 3]})
-        f = lambda x, y: x ** y
-        result = df.pipe(f, 2)
-        expected = DataFrame({'A': [1, 4, 9]})
-        self.assert_frame_equal(result, expected)
-
-        result = df.A.pipe(f, 2)
-        self.assert_series_equal(result, expected.A)
-
-    def test_pipe_tuple(self):
-        df = DataFrame({'A': [1, 2, 3]})
-        f = lambda x, y: y
-        result = df.pipe((f, 'y'), 0)
-        self.assert_frame_equal(result, df)
-
-        result = df.A.pipe((f, 'y'), 0)
-        self.assert_series_equal(result, df.A)
-
-    def test_pipe_tuple_error(self):
-        df = DataFrame({"A": [1, 2, 3]})
-        f = lambda x, y: y
-        with tm.assertRaises(ValueError):
-            df.pipe((f, 'y'), x=1, y=0)
-
-        with tm.assertRaises(ValueError):
-            df.A.pipe((f, 'y'), x=1, y=0)
-
-    def test_pipe_panel(self):
-        with catch_warnings(record=True):
-            wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})})
-            f = lambda x, y: x + y
-            result = wp.pipe(f, 2)
-            expected = wp + 2
-            assert_panel_equal(result, expected)
-
-            result = wp.pipe((f, 'y'), x=1)
-            expected = wp + 1
-            assert_panel_equal(result, expected)
-
-            with tm.assertRaises(ValueError):
-                result = wp.pipe((f, 'y'), x=1, y=1)
diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py
index 2793cc14df19a..1fa436df0910d 100644
--- a/pandas/tests/test_util.py
+++ b/pandas/tests/test_util.py
@@ -1,8 +1,12 @@
 # -*- coding: utf-8 -*-
-from collections import OrderedDict
+import os
+import locale
+import codecs
 import sys
-import unittest
 from uuid import uuid4
+from collections import OrderedDict
+
+import pytest
 from pandas.util._move import move_into_mutable_buffer, BadMove, stolenbuf
 from pandas.util.decorators import deprecate_kwarg
 from pandas.util.validators import (validate_args, validate_kwargs,
@@ -11,6 +15,9 @@
 
 import pandas.util.testing as tm
 
+CURRENT_LOCALE = locale.getlocale()
+LOCALE_OVERRIDE = os.environ.get('LOCALE_OVERRIDE', None)
+
 
 class TestDecorators(tm.TestCase):
 
@@ -352,9 +359,9 @@ def test_exactly_one_ref(self):
         # materialize as bytearray to show that it is mutable
         self.assertEqual(bytearray(as_stolen_buf), b'test')
 
-    @unittest.skipIf(
+    @pytest.mark.skipif(
         sys.version_info[0] > 2,
-        'bytes objects cannot be interned in py3',
+        reason='bytes objects cannot be interned in py3',
     )
     def test_interned(self):
         salt = uuid4().hex
@@ -401,3 +408,66 @@ def test_numpy_errstate_is_default():
     from pandas.compat import numpy  # noqa
     # The errstate should be unchanged after that import.
     assert np.geterr() == expected
+
+
+class TestLocaleUtils(tm.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestLocaleUtils, cls).setUpClass()
+        cls.locales = tm.get_locales()
+
+        if not cls.locales:
+            pytest.skip("No locales found")
+
+        tm._skip_if_windows()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TestLocaleUtils, cls).tearDownClass()
+        del cls.locales
+
+    def test_get_locales(self):
+        # all systems should have at least a single locale
+        assert len(tm.get_locales()) > 0
+
+    def test_get_locales_prefix(self):
+        if len(self.locales) == 1:
+            pytest.skip("Only a single locale found, no point in "
+                        "trying to test filtering locale prefixes")
+        first_locale = self.locales[0]
+        assert len(tm.get_locales(prefix=first_locale[:2])) > 0
+
+    def test_set_locale(self):
+        if len(self.locales) == 1:
+            pytest.skip("Only a single locale found, no point in "
+                        "trying to test setting another locale")
+
+        if all(x is None for x in CURRENT_LOCALE):
+            # Not sure why, but on some travis runs with pytest,
+            # getlocale() returned (None, None).
+            pytest.skip("CURRENT_LOCALE is not set.")
+
+        if LOCALE_OVERRIDE is None:
+            lang, enc = 'it_CH', 'UTF-8'
+        elif LOCALE_OVERRIDE == 'C':
+            lang, enc = 'en_US', 'ascii'
+        else:
+            lang, enc = LOCALE_OVERRIDE.split('.')
+
+        enc = codecs.lookup(enc).name
+        new_locale = lang, enc
+
+        if not tm._can_set_locale(new_locale):
+            with tm.assertRaises(locale.Error):
+                with tm.set_locale(new_locale):
+                    pass
+        else:
+            with tm.set_locale(new_locale) as normalized_locale:
+                new_lang, new_enc = normalized_locale.split('.')
+                new_enc = codecs.lookup(enc).name
+                normalized_locale = new_lang, new_enc
+                self.assertEqual(normalized_locale, new_locale)
+
+            current_locale = locale.getlocale()
+            self.assertEqual(current_locale, CURRENT_LOCALE)