diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 9211ffb5cfde5..6ea217c4a72a7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -181,8 +181,8 @@ def __init__(self, left, right, how='inner', on=None, elif isinstance(self.indicator, bool): self.indicator_name = '_merge' if self.indicator else None else: - raise ValueError('indicator option can only accept boolean or string arguments') - + raise ValueError( + 'indicator option can only accept boolean or string arguments') # note this function has side effects (self.left_join_keys, @@ -191,7 +191,8 @@ def __init__(self, left, right, how='inner', on=None, def get_result(self): if self.indicator: - self.left, self.right = self._indicator_pre_merge(self.left, self.right) + self.left, self.right = self._indicator_pre_merge( + self.left, self.right) join_index, left_indexer, right_indexer = self._get_join_info() @@ -225,9 +226,11 @@ def _indicator_pre_merge(self, left, right): for i in ['_left_indicator', '_right_indicator']: if i in columns: - raise ValueError("Cannot use `indicator=True` option when data contains a column named {}".format(i)) + raise ValueError("Cannot use `indicator=True` option when " + "data contains a column named {}".format(i)) if self.indicator_name in columns: - raise ValueError("Cannot use name of an existing column for indicator column") + raise ValueError( + "Cannot use name of an existing column for indicator column") left = left.copy() right = right.copy() @@ -245,11 +248,15 @@ def _indicator_post_merge(self, result): result['_left_indicator'] = result['_left_indicator'].fillna(0) result['_right_indicator'] = result['_right_indicator'].fillna(0) - result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3]) - result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both']) - - result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1) + result[self.indicator_name] = Categorical((result['_left_indicator'] + + result['_right_indicator']), + categories=[1, 2, 3]) + result[self.indicator_name] = ( + result[self.indicator_name] + .cat.rename_categories(['left_only', 'right_only', 'both'])) + result = result.drop(labels=['_left_indicator', '_right_indicator'], + axis=1) return result def _maybe_add_join_keys(self, result, left_indexer, right_indexer): @@ -274,8 +281,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): continue right_na_indexer = right_indexer.take(na_indexer) - result.iloc[na_indexer,key_indexer] = com.take_1d(self.right_join_keys[i], - right_na_indexer) + result.iloc[na_indexer, key_indexer] = ( + com.take_1d(self.right_join_keys[i], + right_na_indexer)) elif name in self.right: if len(self.right) == 0: continue @@ -285,8 +293,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): continue left_na_indexer = left_indexer.take(na_indexer) - result.iloc[na_indexer,key_indexer] = com.take_1d(self.left_join_keys[i], - left_na_indexer) + result.iloc[na_indexer, key_indexer] = ( + com.take_1d(self.left_join_keys[i], + left_na_indexer)) elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): @@ -384,8 +393,10 @@ def _get_merge_keys(self): left_drop = [] left, right = self.left, self.right - is_lkey = lambda x: isinstance(x, (np.ndarray, ABCSeries)) and len(x) == len(left) - is_rkey = lambda x: isinstance(x, (np.ndarray, ABCSeries)) and len(x) == len(right) + is_lkey = lambda x: isinstance( 
+ x, (np.ndarray, ABCSeries)) and len(x) == len(left) + is_rkey = lambda x: isinstance( + x, (np.ndarray, ABCSeries)) and len(x) == len(right) # ugh, spaghetti re #733 if _any(self.left_on) and _any(self.right_on): @@ -507,13 +518,13 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): from functools import partial assert len(left_keys) == len(right_keys), \ - 'left_key and right_keys must be the same length' + 'left_key and right_keys must be the same length' # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) # get left & right join labels and num. of levels at each location - llab, rlab, shape = map(list, zip( * map(fkeys, left_keys, right_keys))) + llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys))) # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -524,7 +535,7 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): lkey, rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False - kwargs = {'sort':sort} if how == 'left' else {} + kwargs = {'sort': sort} if how == 'left' else {} join_func = _join_functions[how] return join_func(lkey, rkey, count, **kwargs) @@ -563,8 +574,10 @@ def get_result(self): left_join_indexer = left_indexer right_join_indexer = right_indexer - lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} - rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} + lindexers = { + 1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = { + 1: right_join_indexer} if right_join_indexer is not None else {} result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], @@ -586,7 +599,7 @@ def _get_multiindex_indexer(join_keys, index, sort): fkeys = partial(_factorize_keys, sort=sort) # left & right join labels and num. of levels at each location - rlab, llab, shape = map(list, zip( * map(fkeys, index.levels, join_keys))) + rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys))) if sort: rlab = list(map(np.take, rlab, index.labels)) else: @@ -751,12 +764,13 @@ def _get_join_keys(llab, rlab, shape, sort): return _get_join_keys(llab, rlab, shape, sort) -#---------------------------------------------------------------------- +# --------------------------------------------------------------------- # Concatenate DataFrame objects def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, copy=True): + keys=None, levels=None, names=None, verify_integrity=False, + copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. 
Can also add a layer of hierarchical indexing on the @@ -885,10 +899,11 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, else: # filter out the empties # if we have not multi-index possibiltes - df = DataFrame([ obj.shape for obj in objs ]).sum(1) - non_empties = df[df!=0] - if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None): - objs = [ objs[i] for i in non_empties.index ] + df = DataFrame([obj.shape for obj in objs]).sum(1) + non_empties = df[df != 0] + if (len(non_empties) and (keys is None and names is None and + levels is None and join_axes is None)): + objs = [objs[i] for i in non_empties.index] sample = objs[0] if sample is None: @@ -917,12 +932,12 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, if ndim == max_ndim: pass - elif ndim != max_ndim-1: + elif ndim != max_ndim - 1: raise ValueError("cannot concatenate unaligned mixed " "dimensional NDFrame objects") else: - name = getattr(obj,'name',None) + name = getattr(obj, 'name', None) if ignore_index or name is None: name = current_column current_column += 1 @@ -931,7 +946,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, # to line up if self._is_frame and axis == 1: name = 0 - obj = sample._constructor({ name : obj }) + obj = sample._constructor({name: obj}) self.objs.append(obj) @@ -957,17 +972,23 @@ def get_result(self): if self.axis == 0: new_data = com._concat_compat([x._values for x in self.objs]) name = com._consensus_name_attr(self.objs) - return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat') + return (Series(new_data, index=self.new_axes[0], name=name) + .__finalize__(self, method='concat')) # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) index, columns = self.new_axes tmpdf = DataFrame(data, index=index) - # checks if the column variable already stores valid column names (because set via the 'key' argument - # in the 'concat' function call. If that's not the case, use the series names as column names - if columns.equals(Index(np.arange(len(self.objs)))) and not self.ignore_index: - columns = np.array([ data[i].name for i in range(len(data)) ], dtype='object') + # checks if the column variable already stores valid column + # names (because set via the 'key' argument in the 'concat' + # function call. 
If that's not the case, use the series names + # as column names + if (columns.equals(Index(np.arange(len(self.objs)))) and + not self.ignore_index): + columns = np.array([data[i].name + for i in range(len(data))], + dtype='object') indexer = isnull(columns) if indexer.any(): columns[indexer] = np.arange(len(indexer[indexer])) @@ -992,11 +1013,13 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) + mgrs_indexers, self.new_axes, + concat_axis=self.axis, copy=self.copy) if not self.copy: new_data._consolidate_inplace() - return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') + return (self.objs[0]._from_axes(new_data, self.new_axes) + .__finalize__(self, method='concat')) def _get_result_dim(self): if self._is_series and self.axis == 1: @@ -1091,7 +1114,7 @@ def _maybe_check_integrity(self, concat_index): if not concat_index.is_unique: overlap = concat_index.get_duplicates() raise ValueError('Indexes have overlapping values: %s' - % str(overlap)) + % str(overlap)) def _concat_indexes(indexes): @@ -1106,7 +1129,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array(zp, ordered=True).categories for zp in zipped] + levels = [Categorical.from_array( + zp, ordered=True).categories for zp in zipped] else: levels = [_ensure_index(x) for x in levels] else: @@ -1152,7 +1176,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = list(names) else: # make sure that all of the passed indices have the same nlevels - if not len(set([ i.nlevels for i in indexes ])) == 1: + if not len(set([i.nlevels for i in indexes])) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") @@ -1201,7 +1225,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): def _should_fill(lname, rname): - if not isinstance(lname, compat.string_types) or not isinstance(rname, compat.string_types): + if (not isinstance(lname, compat.string_types) or + not isinstance(rname, compat.string_types)): return True return lname == rname diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 97bd1f86d01cf..7a04847947bf2 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -24,13 +24,17 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', ---------- data : DataFrame values : column to aggregate, optional - index : a column, Grouper, array which has the same length as data, or list of them. - Keys to group by on the pivot table index. - If an array is passed, it is being used as the same manner as column values. - columns : a column, Grouper, array which has the same length as data, or list of them. - Keys to group by on the pivot table column. - If an array is passed, it is being used as the same manner as column values. - aggfunc : function, default numpy.mean, or list of functions + index : column, Grouper, array, or list of the previous + If an array is passed, it must be the same length as the data. The list + can contain any of the other types (except list). + Keys to group by on the pivot table index. If an array is passed, it + is being used as the same manner as column values. + columns : column, Grouper, array, or list of the previous + If an array is passed, it must be the same length as the data. 
The list + can contain any of the other types (except list). + Keys to group by on the pivot table column. If an array is passed, it + is being used as the same manner as column values. + aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) @@ -78,7 +82,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pieces = [] keys = [] for func in aggfunc: - table = pivot_table(data, values=values, index=index, columns=columns, + table = pivot_table(data, values=values, index=index, + columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) @@ -124,7 +129,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) table = table.reindex_axis(m, axis=0) except AttributeError: - pass # it's a single level + pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) @@ -197,7 +202,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, result, margin_keys, row_margin = marginal_result_set else: marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, margins_name) + table, data, rows, cols, aggfunc, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set @@ -273,7 +278,8 @@ def _all_key(key): except TypeError: # we cannot reshape, so coerce the axis - piece.set_axis(cat_axis, piece._get_axis(cat_axis)._to_safe_for_reshape()) + piece.set_axis(cat_axis, piece._get_axis( + cat_axis)._to_safe_for_reshape()) piece[all_key] = margin[key] table_pieces.append(piece) @@ -349,13 +355,15 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (np.isscalar(by) or isinstance(by, (np.ndarray, Index, Series, Grouper)) + elif (np.isscalar(by) or isinstance(by, (np.ndarray, Index, + Series, Grouper)) or hasattr(by, '__call__')): by = [by] else: by = list(by) return by + def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True): """ diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 43bcd2373df69..8f7c0a2b1be9a 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1,6 +1,5 @@ # being a bit too dynamic # pylint: disable=E1101 -import datetime import warnings import re from math import ceil @@ -17,10 +16,7 @@ from pandas.core.generic import _shared_docs, _shared_doc_kwargs from pandas.core.index import Index, MultiIndex from pandas.core.series import Series, remove_na -from pandas.tseries.index import DatetimeIndex -from pandas.tseries.period import PeriodIndex, Period -import pandas.tseries.frequencies as frequencies -from pandas.tseries.offsets import DateOffset +from pandas.tseries.period import PeriodIndex from pandas.compat import range, lrange, lmap, map, zip, string_types import pandas.compat as compat from pandas.util.decorators import Appender @@ -36,65 +32,65 @@ # to True. 
mpl_stylesheet = { 'axes.axisbelow': True, - 'axes.color_cycle': ['#348ABD', - '#7A68A6', - '#A60628', - '#467821', - '#CF4457', - '#188487', - '#E24A33'], - 'axes.edgecolor': '#bcbcbc', - 'axes.facecolor': '#eeeeee', - 'axes.grid': True, - 'axes.labelcolor': '#555555', - 'axes.labelsize': 'large', - 'axes.linewidth': 1.0, - 'axes.titlesize': 'x-large', - 'figure.edgecolor': 'white', - 'figure.facecolor': 'white', - 'figure.figsize': (6.0, 4.0), - 'figure.subplot.hspace': 0.5, - 'font.family': 'monospace', - 'font.monospace': ['Andale Mono', - 'Nimbus Mono L', - 'Courier New', - 'Courier', - 'Fixed', - 'Terminal', - 'monospace'], - 'font.size': 10, - 'interactive': True, - 'keymap.all_axes': ['a'], - 'keymap.back': ['left', 'c', 'backspace'], - 'keymap.forward': ['right', 'v'], - 'keymap.fullscreen': ['f'], - 'keymap.grid': ['g'], - 'keymap.home': ['h', 'r', 'home'], - 'keymap.pan': ['p'], - 'keymap.save': ['s'], - 'keymap.xscale': ['L', 'k'], - 'keymap.yscale': ['l'], - 'keymap.zoom': ['o'], - 'legend.fancybox': True, - 'lines.antialiased': True, - 'lines.linewidth': 1.0, - 'patch.antialiased': True, - 'patch.edgecolor': '#EEEEEE', - 'patch.facecolor': '#348ABD', - 'patch.linewidth': 0.5, - 'toolbar': 'toolbar2', - 'xtick.color': '#555555', - 'xtick.direction': 'in', - 'xtick.major.pad': 6.0, - 'xtick.major.size': 0.0, - 'xtick.minor.pad': 6.0, - 'xtick.minor.size': 0.0, - 'ytick.color': '#555555', - 'ytick.direction': 'in', - 'ytick.major.pad': 6.0, - 'ytick.major.size': 0.0, - 'ytick.minor.pad': 6.0, - 'ytick.minor.size': 0.0 + 'axes.color_cycle': ['#348ABD', + '#7A68A6', + '#A60628', + '#467821', + '#CF4457', + '#188487', + '#E24A33'], + 'axes.edgecolor': '#bcbcbc', + 'axes.facecolor': '#eeeeee', + 'axes.grid': True, + 'axes.labelcolor': '#555555', + 'axes.labelsize': 'large', + 'axes.linewidth': 1.0, + 'axes.titlesize': 'x-large', + 'figure.edgecolor': 'white', + 'figure.facecolor': 'white', + 'figure.figsize': (6.0, 4.0), + 'figure.subplot.hspace': 0.5, + 'font.family': 'monospace', + 'font.monospace': ['Andale Mono', + 'Nimbus Mono L', + 'Courier New', + 'Courier', + 'Fixed', + 'Terminal', + 'monospace'], + 'font.size': 10, + 'interactive': True, + 'keymap.all_axes': ['a'], + 'keymap.back': ['left', 'c', 'backspace'], + 'keymap.forward': ['right', 'v'], + 'keymap.fullscreen': ['f'], + 'keymap.grid': ['g'], + 'keymap.home': ['h', 'r', 'home'], + 'keymap.pan': ['p'], + 'keymap.save': ['s'], + 'keymap.xscale': ['L', 'k'], + 'keymap.yscale': ['l'], + 'keymap.zoom': ['o'], + 'legend.fancybox': True, + 'lines.antialiased': True, + 'lines.linewidth': 1.0, + 'patch.antialiased': True, + 'patch.edgecolor': '#EEEEEE', + 'patch.facecolor': '#348ABD', + 'patch.linewidth': 0.5, + 'toolbar': 'toolbar2', + 'xtick.color': '#555555', + 'xtick.direction': 'in', + 'xtick.major.pad': 6.0, + 'xtick.major.size': 0.0, + 'xtick.minor.pad': 6.0, + 'xtick.minor.size': 0.0, + 'ytick.color': '#555555', + 'ytick.direction': 'in', + 'ytick.major.pad': 6.0, + 'ytick.major.size': 0.0, + 'ytick.minor.pad': 6.0, + 'ytick.minor.size': 0.0 } @@ -106,6 +102,7 @@ def _mpl_le_1_2_1(): except ImportError: return False + def _mpl_ge_1_3_1(): try: import matplotlib @@ -116,18 +113,20 @@ def _mpl_ge_1_3_1(): except ImportError: return False + def _mpl_ge_1_4_0(): try: import matplotlib - return (matplotlib.__version__ >= LooseVersion('1.4') + return (matplotlib.__version__ >= LooseVersion('1.4') or matplotlib.__version__[0] == '0') except ImportError: return False + def _mpl_ge_1_5_0(): try: import matplotlib - return 
(matplotlib.__version__ >= LooseVersion('1.5') + return (matplotlib.__version__ >= LooseVersion('1.5') or matplotlib.__version__[0] == '0') except ImportError: return False @@ -142,6 +141,7 @@ def _mpl_ge_1_5_0(): def _get_standard_kind(kind): return {'density': 'kde'}.get(kind, kind) + def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None): import matplotlib.pyplot as plt @@ -164,7 +164,8 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', # need to call list() on the result to copy so we don't # modify the global rcParams below try: - colors = [c['color'] for c in list(plt.rcParams['axes.prop_cycle'])] + colors = [c['color'] + for c in list(plt.rcParams['axes.prop_cycle'])] except KeyError: colors = list(plt.rcParams.get('axes.color_cycle', list('bgrcmyk'))) @@ -172,6 +173,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', colors = list(colors) elif color_type == 'random': import random + def random_color(column): random.seed(column) return [random.random() for _ in range(3)] @@ -183,6 +185,7 @@ def random_color(column): if isinstance(colors, compat.string_types): import matplotlib.colors conv = matplotlib.colors.ColorConverter() + def _maybe_valid_colors(colors): try: [conv.to_rgba(c) for c in colors] @@ -207,7 +210,7 @@ def _maybe_valid_colors(colors): pass if len(colors) != num_colors: - multiple = num_colors//len(colors) - 1 + multiple = num_colors // len(colors) - 1 mod = num_colors % len(colors) colors += multiple * colors @@ -215,6 +218,7 @@ def _maybe_valid_colors(colors): return colors + class _Options(dict): """ Stores pandas plotting options. @@ -319,7 +323,6 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, >>> scatter_matrix(df, alpha=0.2) """ import matplotlib.pyplot as plt - from matplotlib.artist import setp df = frame._get_numeric_data() n = df.columns.size @@ -345,7 +348,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, values = df[a].values[mask[a].values] rmin_, rmax_ = np.min(values), np.max(values) rdelta_ext = (rmax_ - rmin_) * range_padding / 2. - boundaries_list.append((rmin_ - rdelta_ext, rmax_+ rdelta_ext)) + boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) for i, a in zip(lrange(n), df.columns): for j, b in zip(lrange(n), df.columns): @@ -379,9 +382,9 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, ax.set_xlabel(b) ax.set_ylabel(a) - if j!= 0: + if j != 0: ax.yaxis.set_visible(False) - if i != n-1: + if i != n - 1: ax.xaxis.set_visible(False) if len(df.columns) > 1: @@ -413,6 +416,7 @@ def _gcf(): import matplotlib.pyplot as plt return plt.gcf() + def _get_marker_compat(marker): import matplotlib.lines as mlines import matplotlib as mpl @@ -422,6 +426,7 @@ def _get_marker_compat(marker): return 'o' return marker + def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): """RadViz - a multivariate data visualization algorithm @@ -506,18 +511,22 @@ def normalize(series): ax.axis('equal') return ax + @deprecate_kwarg(old_arg_name='data', new_arg_name='frame') def andrews_curves(frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds): """ - Generates a matplotlib plot of Andrews curves, for visualising clusters of multivariate data. + Generates a matplotlib plot of Andrews curves, for visualising clusters of + multivariate data. 
Andrews curves have the functional form: - f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) + x_4 sin(2t) + x_5 cos(2t) + ... + f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) + + x_4 sin(2t) + x_5 cos(2t) + ... - Where x coefficients correspond to the values of each dimension and t is linearly spaced between -pi and +pi. Each - row of frame then corresponds to a single curve. + Where x coefficients correspond to the values of each dimension and t is + linearly spaced between -pi and +pi. Each row of frame then corresponds to + a single curve. Parameters: ----------- @@ -547,12 +556,14 @@ def f(t): x1 = amplitudes[0] result = x1 / sqrt(2.0) - # Take the rest of the coefficients and resize them appropriately. Take a copy of amplitudes as otherwise - # numpy deletes the element from amplitudes itself. + # Take the rest of the coefficients and resize them + # appropriately. Take a copy of amplitudes as otherwise numpy + # deletes the element from amplitudes itself. coeffs = np.delete(np.copy(amplitudes), 0) coeffs.resize(int((coeffs.size + 1) / 2), 2) - # Generate the harmonics and arguments for the sin and cos functions. + # Generate the harmonics and arguments for the sin and cos + # functions. harmonics = np.arange(0, coeffs.shape[0]) + 1 trig_args = np.outer(harmonics, t) @@ -652,6 +663,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): plt.setp(axis.get_yticklabels(), fontsize=8) return fig + @deprecate_kwarg(old_arg_name='colors', new_arg_name='color') @deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3) def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, @@ -692,12 +704,14 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, >>> from pandas import read_csv >>> from pandas.tools.plotting import parallel_coordinates >>> from matplotlib import pyplot as plt - >>> df = read_csv('https://raw.github.com/pydata/pandas/master/pandas/tests/data/iris.csv') - >>> parallel_coordinates(df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) + >>> df = read_csv('https://raw.github.com/pydata/pandas/master' + '/pandas/tests/data/iris.csv') + >>> parallel_coordinates(df, 'Name', color=('#556270', + '#4ECDC4', '#C7F464')) >>> plt.show() """ if axvlines_kwds is None: - axvlines_kwds = {'linewidth':1,'color':'black'} + axvlines_kwds = {'linewidth': 1, 'color': 'black'} import matplotlib.pyplot as plt n = len(frame) @@ -811,7 +825,8 @@ def autocorrelation_plot(series, ax=None, **kwds): c0 = np.sum((data - mean) ** 2) / float(n) def r(h): - return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + return ((data[:n - h] - mean) * + (data[h:] - mean)).sum() / float(n) / c0 x = np.arange(n) + 1 y = lmap(r, x) z95 = 1.959963984540054 @@ -873,10 +888,11 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, if sharex is None: if ax is None: - self.sharex = True + self.sharex = True else: - # if we get an axis, the users should do the visibility setting... - self.sharex = False + # if we get an axis, the users should do the visibility + # setting... + self.sharex = False else: self.sharex = sharex @@ -968,10 +984,11 @@ def _validate_color_args(self): # need only a single match for s in styles: if re.match('^[a-z]+?', s) is not None: - raise ValueError("Cannot pass 'style' string with a color " - "symbol and 'color' keyword argument. 
Please" - " use one or the other or pass 'style' " - "without a color symbol") + raise ValueError( + "Cannot pass 'style' string with a color " + "symbol and 'color' keyword argument. Please" + " use one or the other or pass 'style' " + "without a color symbol") def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: @@ -979,10 +996,11 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - if self.sort_columns: - columns = com._try_sort(data.columns) - else: - columns = data.columns + # TODO: unused? + # if self.sort_columns: + # columns = com._try_sort(data.columns) + # else: + # columns = data.columns for col, values in data.iteritems(): if keep_index is True: @@ -1147,7 +1165,7 @@ def _post_plot_logic_common(self, ax, data): self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) - else: # pragma no cover + else: # pragma no cover raise ValueError def _post_plot_logic(self, ax, data): @@ -1206,7 +1224,7 @@ def legend_title(self): return ','.join(stringified) def _add_legend_handle(self, handle, label, index=None): - if not label is None: + if label is not None: if self.mark_right and index is not None: if self.on_right(index): label = label + ' (right)' @@ -1221,7 +1239,7 @@ def _make_legend(self): title = '' if not self.subplots: - if not leg is None: + if leg is not None: title = leg.get_title().get_text() handles = leg.legendHandles labels = [x.get_text() for x in leg.get_texts()] @@ -1233,7 +1251,7 @@ def _make_legend(self): handles += self.legend_handles labels += self.legend_labels - if not self.legend_title is None: + if self.legend_title is not None: title = self.legend_title if len(handles) > 0: @@ -1312,7 +1330,8 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): if is_errorbar: return ax.errorbar(x, y, **kwds) else: - # prevent style kwarg from going to errorbar, where it is unsupported + # prevent style kwarg from going to errorbar, where it is + # unsupported if style is not None: args = (x, y, style) else: @@ -1449,10 +1468,10 @@ def match_labels(data, e): # asymmetrical error bars if err.ndim == 3: if (err_shape[0] != self.nseries) or \ - (err_shape[1] != 2) or \ - (err_shape[2] != len(self.data)): + (err_shape[1] != 2) or \ + (err_shape[2] != len(self.data)): msg = "Asymmetrical error bars should be provided " + \ - "with the shape (%u, 2, %u)" % \ + "with the shape (%u, 2, %u)" % \ (self.nseries, len(self.data)) raise ValueError(msg) @@ -1492,7 +1511,7 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): def _get_subplots(self): from matplotlib.axes import Subplot return [ax for ax in self.axes[0].get_figure().get_axes() - if isinstance(ax, Subplot)] + if isinstance(ax, Subplot)] def _get_axes_layout(self): axes = self._get_subplots() @@ -1594,7 +1613,8 @@ def _make_plot(self): if len(errors_x) > 0 or len(errors_y) > 0: err_kwds = dict(errors_x, **errors_y) err_kwds['ecolor'] = scatter.get_facecolor()[0] - ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds) + ax.errorbar(data[x].values, data[y].values, + linestyle='none', **err_kwds) class HexBinPlot(PlanePlot): @@ -1691,7 +1711,8 @@ def _make_plot(self): @classmethod def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): - # column_num is used to get the target column from protf in line and area plots + # column_num is used to get the target column from protf in 
line and + # area plots if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) @@ -1753,8 +1774,10 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): elif (values <= 0).all(): return ax._stacker_neg_prior[stacking_id] + values - raise ValueError('When stacked is True, each column must be either all positive or negative.' - '{0} contains both positive and negative values'.format(label)) + raise ValueError('When stacked is True, each column must be either ' + 'all positive or negative.' + '{0} contains both positive and negative values' + .format(label)) @classmethod def _update_stacker(cls, ax, stacking_id, values): @@ -1820,10 +1843,10 @@ def _plot(cls, ax, x, y, style=None, column_num=None, else: start = np.zeros(len(y)) - if not 'color' in kwds: + if 'color' not in kwds: kwds['color'] = lines[0].get_color() - if cls.mpl_ge_1_5_0(): # mpl 1.5 added real support for poly legends + if cls.mpl_ge_1_5_0(): # mpl 1.5 added real support for poly legends kwds.pop('label') ax.fill_between(xdata, start, y_values, **kwds) cls._update_stacker(ax, stacking_id, y) @@ -1861,7 +1884,7 @@ def __init__(self, data, **kwargs): self.bottom = kwargs.pop('bottom', 0) self.left = kwargs.pop('left', 0) - self.log = kwargs.pop('log',False) + self.log = kwargs.pop('log', False) MPLPlot.__init__(self, data, **kwargs) if self.stacked or self.subplots: @@ -1915,7 +1938,7 @@ def _make_plot(self): label = com.pprint_thing(label) if (('yerr' in kwds) or ('xerr' in kwds)) \ - and (kwds.get('ecolor') is None): + and (kwds.get('ecolor') is None): kwds['ecolor'] = mpl.rcParams['xtick.color'] start = 0 @@ -1926,20 +1949,23 @@ def _make_plot(self): if self.subplots: w = self.bar_width / 2 rect = self._plot(ax, self.ax_pos + w, y, self.bar_width, - start=start, label=label, log=self.log, **kwds) + start=start, label=label, + log=self.log, **kwds) ax.set_title(label) elif self.stacked: mask = y > 0 start = np.where(mask, pos_prior, neg_prior) + self._start_base w = self.bar_width / 2 rect = self._plot(ax, self.ax_pos + w, y, self.bar_width, - start=start, label=label, log=self.log, **kwds) + start=start, label=label, + log=self.log, **kwds) pos_prior = pos_prior + np.where(mask, y, 0) neg_prior = neg_prior + np.where(mask, 0, y) else: w = self.bar_width / K rect = self._plot(ax, self.ax_pos + (i + 0.5) * w, y, w, - start=start, label=label, log=self.log, **kwds) + start=start, label=label, + log=self.log, **kwds) self._add_legend_handle(rect, label, index=i) def _post_plot_logic(self, ax, data): @@ -2000,9 +2026,10 @@ def _args_adjust(self): values = np.ravel(values) values = values[~com.isnull(values)] - hist, self.bins = np.histogram(values, bins=self.bins, - range=self.kwds.get('range', None), - weights=self.kwds.get('weights', None)) + hist, self.bins = np.histogram( + values, bins=self.bins, + range=self.kwds.get('range', None), + weights=self.kwds.get('weights', None)) if com.is_list_like(self.bottom): self.bottom = np.array(self.bottom) @@ -2015,7 +2042,8 @@ def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, y = y[~com.isnull(y)] base = np.zeros(len(bins) - 1) - bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds['label']) + bottom = bottom + \ + cls._get_stacked_values(ax, stacking_id, base, kwds['label']) # ignore style n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds) cls._update_stacker(ax, stacking_id, n) @@ -2134,7 +2162,8 @@ def _validate_color_args(self): pass def 
_make_plot(self): - colors = self._get_colors(num_colors=len(self.data), color_kwds='colors') + colors = self._get_colors( + num_colors=len(self.data), color_kwds='colors') self.kwds.setdefault('colors', colors) for i, (label, y) in enumerate(self._iter_data()): @@ -2190,14 +2219,16 @@ class BoxPlot(LinePlot): def __init__(self, data, return_type=None, **kwargs): # Do not call LinePlot.__init__ which may fill nan if return_type not in self._valid_return_types: - raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + raise ValueError( + "return_type must be {None, 'axes', 'dict', 'both'}") self.return_type = return_type MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): if self.subplots: - # Disable label ax sharing. Otherwise, all subplots shows last column label + # Disable label ax sharing. Otherwise, all subplots shows last + # column label if self.orientation == 'vertical': self.sharex = False else: @@ -2233,8 +2264,10 @@ def _validate_color_args(self): valid_keys = ['boxes', 'whiskers', 'medians', 'caps'] for key, values in compat.iteritems(self.color): if key not in valid_keys: - raise ValueError("color dict contains invalid key '{0}' " - "The key must be either {1}".format(key, valid_keys)) + raise ValueError("color dict contains invalid " + "key '{0}' " + "The key must be either {1}" + .format(key, valid_keys)) else: self.color = None @@ -2332,7 +2365,8 @@ def result(self): # kinds supported by both dataframe and series -_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', 'box'] +_common_kinds = ['line', 'bar', 'barh', + 'kde', 'density', 'area', 'hist', 'box'] # kinds supported by dataframe _dataframe_kinds = ['scatter', 'hexbin'] # kinds supported only by series or dataframe single column @@ -2529,7 +2563,8 @@ def _plot(data, x=None, y=None, subplots=False, be transposed to meet matplotlib's default layout. If a Series or DataFrame is passed, use passed data to draw a table. yerr : DataFrame, Series, array-like, dict and str - See :ref:`Plotting with Error Bars ` for detail. + See :ref:`Plotting with Error Bars ` for + detail. xerr : same types as yerr. 
%(klass_unique)s mark_right : boolean, default True @@ -2555,14 +2590,14 @@ def _plot(data, x=None, y=None, subplots=False, @Appender(_shared_docs['plot'] % _shared_doc_df_kwargs) -def plot_frame(data, x=None, y=None, kind='line', ax=None, # Dataframe unique - subplots=False, sharex=None, sharey=False, layout=None, # Dataframe unique +def plot_frame(data, x=None, y=None, kind='line', ax=None, + subplots=False, sharex=None, sharey=False, layout=None, figsize=None, use_index=True, title=None, grid=None, legend=True, style=None, logx=False, logy=False, loglog=False, xticks=None, yticks=None, xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, table=False, yerr=None, xerr=None, - secondary_y=False, sort_columns=False, # Dataframe unique + secondary_y=False, sort_columns=False, **kwds): return _plot(data, kind=kind, x=x, y=y, ax=ax, subplots=subplots, sharex=sharex, sharey=sharey, @@ -2836,10 +2871,11 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, """ if by is not None: - axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, - sharex=sharex, sharey=sharey, layout=layout, bins=bins, - xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds) + axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, + figsize=figsize, sharex=sharex, sharey=sharey, + layout=layout, bins=bins, xlabelsize=xlabelsize, + xrot=xrot, ylabelsize=ylabelsize, + yrot=yrot, **kwds) return axes if column is not None: @@ -2861,14 +2897,15 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, ax.grid(grid) _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) + ylabelsize=ylabelsize, yrot=yrot) fig.subplots_adjust(wspace=0.3, hspace=0.3) return axes def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds): + xrot=None, ylabelsize=None, yrot=None, figsize=None, + bins=10, **kwds): """ Draw histogram of the input series using matplotlib @@ -2910,7 +2947,7 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize)) if (figsize is not None and tuple(figsize) != - tuple(fig.get_size_inches())): + tuple(fig.get_size_inches())): fig.set_size_inches(*figsize, forward=True) if ax is None: ax = fig.gca() @@ -2923,16 +2960,16 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, axes = np.array([ax]) _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) + ylabelsize=ylabelsize, yrot=yrot) else: if 'figure' in kwds: raise ValueError("Cannot pass 'figure' when using the " "'by' argument, since a new 'Figure' instance " "will be created") - axes = grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize, bins=bins, - xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds) + axes = grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize, + bins=bins, xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot, **kwds) if hasattr(axes, 'ndim'): if axes.ndim == 1 and len(axes) == 1: @@ -2976,7 +3013,7 @@ def plot_group(group, ax): figsize=figsize, layout=layout, rot=rot) _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) + ylabelsize=ylabelsize, yrot=yrot) fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3) @@ -3032,8 +3069,8 @@ def 
boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, if subplots is True: naxes = len(grouped) fig, axes = _subplots(naxes=naxes, squeeze=False, - ax=ax, sharex=False, sharey=True, figsize=figsize, - layout=layout) + ax=ax, sharex=False, sharey=True, + figsize=figsize, layout=layout) axes = _flatten(axes) ret = compat.OrderedDict() @@ -3042,7 +3079,8 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, rot=rot, grid=grid, **kwds) ax.set_title(com.pprint_thing(key)) ret[key] = d - fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, + right=0.9, wspace=0.2) else: from pandas.tools.merge import concat keys, frames = zip(*grouped) @@ -3054,7 +3092,8 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, else: df = frames[0] ret = df.boxplot(column=column, fontsize=fontsize, rot=rot, - grid=grid, ax=ax, figsize=figsize, layout=layout, **kwds) + grid=grid, ax=ax, figsize=figsize, + layout=layout, **kwds) return ret @@ -3092,8 +3131,8 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, def _grouped_plot_by_column(plotf, data, columns=None, by=None, numeric_only=True, grid=False, - figsize=None, ax=None, layout=None, return_type=None, - **kwargs): + figsize=None, ax=None, layout=None, + return_type=None, **kwargs): grouped = data.groupby(by) if columns is None: if not isinstance(by, (list, tuple)): @@ -3129,7 +3168,6 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, def table(ax, data, rowLabels=None, colLabels=None, **kwargs): - """ Helper function to convert DataFrame and Series to matplotlib.table @@ -3140,7 +3178,8 @@ def table(ax, data, rowLabels=None, colLabels=None, data for table contents `kwargs`: keywords, optional keyword arguments which passed to matplotlib.table.table. - If `rowLabels` or `colLabels` is not specified, data index or column name will be used. + If `rowLabels` or `colLabels` is not specified, data index or column + name will be used. Returns ------- @@ -3164,7 +3203,8 @@ def table(ax, data, rowLabels=None, colLabels=None, import matplotlib.table table = matplotlib.table.table(ax, cellText=cellText, - rowLabels=rowLabels, colLabels=colLabels, **kwargs) + rowLabels=rowLabels, + colLabels=colLabels, **kwargs) return table @@ -3177,7 +3217,7 @@ def _get_layout(nplots, layout=None, layout_type='box'): # Python 2 compat ceil_ = lambda x: int(ceil(x)) - if nrows == -1 and ncols >0: + if nrows == -1 and ncols > 0: layout = nrows, ncols = (ceil_(float(nplots) / ncols), ncols) elif ncols == -1 and nrows > 0: layout = nrows, ncols = (nrows, ceil_(float(nplots) / nrows)) @@ -3186,8 +3226,8 @@ def _get_layout(nplots, layout=None, layout_type='box'): raise ValueError(msg) if nrows * ncols < nplots: - raise ValueError('Layout of %sx%s must be larger than required size %s' % - (nrows, ncols, nplots)) + raise ValueError('Layout of %sx%s must be larger than ' + 'required size %s' % (nrows, ncols, nplots)) return layout @@ -3215,7 +3255,8 @@ def _get_layout(nplots, layout=None, layout_type='box'): def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, - subplot_kw=None, ax=None, layout=None, layout_type='box', **fig_kw): + subplot_kw=None, ax=None, layout=None, layout_type='box', + **fig_kw): """Create a figure with a set of subplots already made. 
This utility wrapper makes it convenient to create common layouts of @@ -3300,27 +3341,29 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if com.is_list_like(ax): ax = _flatten(ax) if layout is not None: - warnings.warn("When passing multiple axes, layout keyword is ignored", UserWarning) + warnings.warn("When passing multiple axes, layout keyword is " + "ignored", UserWarning) if sharex or sharey: - warnings.warn("When passing multiple axes, sharex and sharey are ignored." - "These settings must be specified when creating axes", UserWarning) + warnings.warn("When passing multiple axes, sharex and sharey " + "are ignored. These settings must be specified " + "when creating axes", UserWarning) if len(ax) == naxes: fig = ax[0].get_figure() return fig, ax else: - raise ValueError("The number of passed axes must be {0}, the same as " - "the output plot".format(naxes)) + raise ValueError("The number of passed axes must be {0}, the " + "same as the output plot".format(naxes)) fig = ax.get_figure() - # if ax is passed and a number of subplots is 1, return ax as it is + # if ax is passed and a number of subplots is 1, return ax as it is if naxes == 1: if squeeze: return fig, ax else: return fig, _flatten(ax) else: - warnings.warn("To output multiple subplots, the figure containing the passed axes " - "is being cleared", UserWarning) + warnings.warn("To output multiple subplots, the figure containing " + "the passed axes is being cleared", UserWarning) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) @@ -3399,7 +3442,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): try: # first find out the ax layout, # so that we can correctly handle 'gaps" - layout = np.zeros((nrows+1,ncols+1), dtype=np.bool) + layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool) for ax in axarr: layout[ax.rowNum, ax.colNum] = ax.get_visible() @@ -3407,9 +3450,10 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): # only the last row of subplots should get x labels -> all # other off layout handles the case that the subplot is # the last in the column, because below is no subplot/gap. 
- if not layout[ax.rowNum+1, ax.colNum]: + if not layout[ax.rowNum + 1, ax.colNum]: continue - if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: + if sharex or len(ax.get_shared_x_axes() + .get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) except IndexError: @@ -3418,21 +3462,21 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): for ax in axarr: if ax.is_last_row(): continue - if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: + if sharex or len(ax.get_shared_x_axes() + .get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) if ncols > 1: for ax in axarr: - # only the first column should get y labels -> set all other to off - # as we only have labels in the first column and we always have a subplot there, - # we can skip the layout test + # only the first column should get y labels -> set all other to + # off as we only have labels in the first column and we always + # have a subplot there, we can skip the layout test if ax.is_first_col(): continue if sharey or len(ax.get_shared_y_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.yaxis) - def _flatten(axes): if not com.is_list_like(axes): return np.array([axes]) @@ -3479,6 +3523,7 @@ def _set_ticks_props(axes, xlabelsize=None, xrot=None, class BasePlotMethods(PandasObject): + def __init__(self, data): self._data = data @@ -3499,14 +3544,15 @@ class SeriesPlotMethods(BasePlotMethods): with the ``kind`` argument: ``s.plot(kind='line')`` is equivalent to ``s.plot.line()`` """ - def __call__(self, kind='line', ax=None, # Series unique + + def __call__(self, kind='line', ax=None, figsize=None, use_index=True, title=None, grid=None, - legend=False, style=None, logx=False, logy=False, loglog=False, - xticks=None, yticks=None, xlim=None, ylim=None, + legend=False, style=None, logx=False, logy=False, + loglog=False, xticks=None, yticks=None, + xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, table=False, yerr=None, xerr=None, - label=None, secondary_y=False, # Series unique - **kwds): + label=None, secondary_y=False, **kwds): return plot_series(self._data, kind=kind, ax=ax, figsize=figsize, use_index=use_index, title=title, grid=grid, legend=legend, style=style, logx=logx, logy=logy, @@ -3671,15 +3717,15 @@ class FramePlotMethods(BasePlotMethods): method with the ``kind`` argument: ``df.plot(kind='line')`` is equivalent to ``df.plot.line()`` """ - def __call__(self, x=None, y=None, kind='line', ax=None, # Dataframe unique - subplots=False, sharex=None, sharey=False, layout=None, # Dataframe unique + + def __call__(self, x=None, y=None, kind='line', ax=None, + subplots=False, sharex=None, sharey=False, layout=None, figsize=None, use_index=True, title=None, grid=None, legend=True, style=None, logx=False, logy=False, loglog=False, xticks=None, yticks=None, xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, table=False, yerr=None, xerr=None, - secondary_y=False, sort_columns=False, # Dataframe unique - **kwds): + secondary_y=False, sort_columns=False, **kwds): return plot_frame(self._data, kind=kind, x=x, y=y, ax=ax, subplots=subplots, sharex=sharex, sharey=sharey, layout=layout, figsize=figsize, use_index=use_index, @@ -3913,8 +3959,8 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, import pandas.tools.plotting as plots import pandas.core.frame as fr - reload(plots) - reload(fr) + reload(plots) # noqa + reload(fr) # noqa from pandas.core.frame import DataFrame data = DataFrame([[3, 6, -5], [4, 8, 2], [4, 9, -6], diff --git
a/pandas/tools/rplot.py b/pandas/tools/rplot.py index bc834689ffce8..5a748b60aae9c 100644 --- a/pandas/tools/rplot.py +++ b/pandas/tools/rplot.py @@ -15,7 +15,8 @@ "The rplot trellis plotting interface is deprecated and will be " "removed in a future version. We refer to external packages " "like seaborn for similar but more refined functionality. \n\n" - "See our docs http://pandas.pydata.org/pandas-docs/stable/visualization.html#rplot " + "See our docs http://pandas.pydata.org/pandas-docs/stable" + "/visualization.html#rplot " "for some example how to convert your existing code to these " "packages.", FutureWarning, stacklevel=2) @@ -26,19 +27,23 @@ class Scale: """ pass + class ScaleGradient(Scale): """ A mapping between a data attribute value and a point in colour space between two specified colours. """ + def __init__(self, column, colour1, colour2): """Initialize ScaleGradient instance. Parameters: ----------- column: string, pandas DataFrame column name - colour1: tuple, 3 element tuple with float values representing an RGB colour - colour2: tuple, 3 element tuple with float values representing an RGB colour + colour1: tuple + 3 element tuple with float values representing an RGB colour + colour2: tuple + 3 element tuple with float values representing an RGB colour """ self.column = column self.colour1 = colour1 @@ -55,7 +60,8 @@ def __call__(self, data, index): Returns: -------- - A three element tuple representing an RGB somewhere between colour1 and colour2 + A three element tuple representing an RGB somewhere between colour1 and + colour2 """ x = data[self.column].iget(index) a = min(data[self.column]) @@ -67,20 +73,25 @@ def __call__(self, data, index): g1 + (g2 - g1) * x_scaled, b1 + (b2 - b1) * x_scaled) + class ScaleGradient2(Scale): """ Create a mapping between a data attribute value and a point in colour space in a line of three specified colours. """ + def __init__(self, column, colour1, colour2, colour3): """Initialize ScaleGradient2 instance. Parameters: ----------- column: string, pandas DataFrame column name - colour1: tuple, 3 element tuple with float values representing an RGB colour - colour2: tuple, 3 element tuple with float values representing an RGB colour - colour3: tuple, 3 element tuple with float values representing an RGB colour + colour1: tuple + 3 element tuple with float values representing an RGB colour + colour2: tuple + 3 element tuple with float values representing an RGB colour + colour3: tuple + 3 element tuple with float values representing an RGB colour """ self.column = column self.colour1 = colour1 @@ -119,12 +130,15 @@ def __call__(self, data, index): g2 + (g3 - g2) * x_scaled, b2 + (b3 - b2) * x_scaled) + class ScaleSize(Scale): """ Provide a mapping between a DataFrame column and matplotlib scatter plot shape size. """ - def __init__(self, column, min_size=5.0, max_size=100.0, transform=lambda x: x): + + def __init__(self, column, min_size=5.0, max_size=100.0, + transform=lambda x: x): """Initialize ScaleSize instance. Parameters: @@ -132,7 +146,9 @@ def __init__(self, column, min_size=5.0, max_size=100.0, transform=lambda x: x): column: string, a column name min_size: float, minimum point size max_size: float, maximum point size - transform: a one argument function of form float -> float (e.g. lambda x: log(x)) + transform: function + a one argument function of form float -> float (e.g. 
lambda x: + log(x)) """ self.column = column self.min_size = min_size @@ -152,13 +168,15 @@ def __call__(self, data, index): a = float(min(data[self.column])) b = float(max(data[self.column])) return self.transform(self.min_size + ((x - a) / (b - a)) * - (self.max_size - self.min_size)) + (self.max_size - self.min_size)) + class ScaleShape(Scale): """ Provides a mapping between matplotlib marker shapes and attribute values. """ + def __init__(self, column): """Initialize ScaleShape instance. @@ -185,14 +203,17 @@ def __call__(self, data, index): """ values = sorted(list(set(data[self.column]))) if len(values) > len(self.shapes): - raise ValueError("Too many different values of the categorical attribute for ScaleShape") + raise ValueError("Too many different values of the categorical " + "attribute for ScaleShape") x = data[self.column].iget(index) return self.shapes[values.index(x)] + class ScaleRandomColour(Scale): """ Maps a random colour to a DataFrame attribute. """ + def __init__(self, column): """Initialize ScaleRandomColour instance. @@ -215,10 +236,12 @@ def __call__(self, data, index): random.seed(data[self.column].iget(index)) return [random.random() for _ in range(3)] + class ScaleConstant(Scale): """ Constant returning scale. Usually used automatically. """ + def __init__(self, value): """Initialize ScaleConstant instance. @@ -243,6 +266,7 @@ def __call__(self, data, index): """ return self.value + def default_aes(x=None, y=None): """Create the default aesthetics dictionary. @@ -256,14 +280,15 @@ def default_aes(x=None, y=None): a dictionary with aesthetics bindings """ return { - 'x' : x, - 'y' : y, - 'size' : ScaleConstant(40.0), - 'colour' : ScaleConstant('grey'), - 'shape' : ScaleConstant('o'), - 'alpha' : ScaleConstant(1.0), + 'x': x, + 'y': y, + 'size': ScaleConstant(40.0), + 'colour': ScaleConstant('grey'), + 'shape': ScaleConstant('o'), + 'alpha': ScaleConstant(1.0), } + def make_aes(x=None, y=None, size=None, colour=None, shape=None, alpha=None): """Create an empty aesthetics dictionary. 
@@ -288,35 +313,48 @@ def make_aes(x=None, y=None, size=None, colour=None, shape=None, alpha=None): shape = ScaleConstant(shape) if not hasattr(alpha, '__call__') and alpha is not None: alpha = ScaleConstant(alpha) - if any([isinstance(size, scale) for scale in [ScaleConstant, ScaleSize]]) or size is None: + if any([isinstance(size, scale) + for scale in [ScaleConstant, ScaleSize]]) or size is None: pass else: - raise ValueError('size mapping should be done through ScaleConstant or ScaleSize') - if any([isinstance(colour, scale) for scale in [ScaleConstant, ScaleGradient, ScaleGradient2, ScaleRandomColour]]) or colour is None: + raise ValueError( + 'size mapping should be done through ScaleConstant or ScaleSize') + if (any([isinstance(colour, scale) + for scale in [ScaleConstant, ScaleGradient, + ScaleGradient2, ScaleRandomColour]]) or + colour is None): pass else: - raise ValueError('colour mapping should be done through ScaleConstant, ScaleRandomColour, ScaleGradient or ScaleGradient2') - if any([isinstance(shape, scale) for scale in [ScaleConstant, ScaleShape]]) or shape is None: + raise ValueError('colour mapping should be done through ' + 'ScaleConstant, ScaleRandomColour, ScaleGradient ' + 'or ScaleGradient2') + if (any([isinstance(shape, scale) + for scale in [ScaleConstant, ScaleShape]]) or + shape is None): pass else: - raise ValueError('shape mapping should be done through ScaleConstant or ScaleShape') - if any([isinstance(alpha, scale) for scale in [ScaleConstant]]) or alpha is None: + raise ValueError('shape mapping should be done through ScaleConstant ' + 'or ScaleShape') + if (any([isinstance(alpha, scale) for scale in [ScaleConstant]]) or + alpha is None): pass else: raise ValueError('alpha mapping should be done through ScaleConstant') return { - 'x' : x, - 'y' : y, - 'size' : size, - 'colour' : colour, - 'shape' : shape, - 'alpha' : alpha, + 'x': x, + 'y': y, + 'size': size, + 'colour': colour, + 'shape': shape, + 'alpha': alpha, } + class Layer: """ Layer object representing a single plot layer. """ + def __init__(self, data=None, **kwds): """Initialize layer object. @@ -343,7 +381,9 @@ def work(self, fig=None, ax=None): """ return fig, ax + class GeomPoint(Layer): + def work(self, fig=None, ax=None): """Render the layer on a matplotlib axis. You can specify either a figure or an axis to draw on. @@ -375,10 +415,10 @@ def work(self, fig=None, ax=None): marker_value = shape_scaler(self.data, index) alpha_value = alpha(self.data, index) patch = ax.scatter(x, y, - s=size_value, - c=colour_value, - marker=marker_value, - alpha=alpha_value) + s=size_value, + c=colour_value, + marker=marker_value, + alpha=alpha_value) label = [] if colour_scaler.categorical: label += [colour_scaler.column, row[colour_scaler.column]] @@ -389,10 +429,12 @@ def work(self, fig=None, ax=None): ax.set_ylabel(self.aes['y']) return fig, ax + class GeomPolyFit(Layer): """ Draw a polynomial fit of specified degree. """ + def __init__(self, degree, lw=2.0, colour='grey'): """Initialize GeomPolyFit object. @@ -436,10 +478,12 @@ def work(self, fig=None, ax=None): ax.plot(x_, y_, lw=self.lw, c=self.colour) return fig, ax + class GeomScatter(Layer): """ An efficient scatter plot, use this instead of GeomPoint for speed. """ + def __init__(self, marker='o', colour='lightblue', alpha=1.0): """Initialize GeomScatter instance. 
@@ -476,10 +520,12 @@ def work(self, fig=None, ax=None): ax.scatter(x, y, marker=self.marker, c=self.colour, alpha=self.alpha) return fig, ax + class GeomHistogram(Layer): """ An efficient histogram, use this instead of GeomBar for speed. """ + def __init__(self, bins=10, colour='lightblue'): """Initialize GeomHistogram instance. @@ -514,10 +560,12 @@ def work(self, fig=None, ax=None): ax.set_xlabel(self.aes['x']) return fig, ax + class GeomDensity(Layer): """ A kernel density estimation plot. """ + def work(self, fig=None, ax=None): """Draw a one dimensional kernel density plot. You can specify either a figure or an axis to draw on. @@ -543,7 +591,9 @@ def work(self, fig=None, ax=None): ax.plot(ind, gkde.evaluate(ind)) return fig, ax + class GeomDensity2D(Layer): + def work(self, fig=None, ax=None): """Draw a two dimensional kernel density plot. You can specify either a figure or an axis to draw on. @@ -564,7 +614,10 @@ def work(self, fig=None, ax=None): ax = fig.gca() x = self.data[self.aes['x']] y = self.data[self.aes['y']] - rvs = np.array([x, y]) + + # TODO: unused? + # rvs = np.array([x, y]) + x_min = x.min() x_max = x.max() y_min = y.min() @@ -578,7 +631,9 @@ def work(self, fig=None, ax=None): ax.contour(Z, extent=[x_min, x_max, y_min, y_max]) return fig, ax + class TrellisGrid(Layer): + def __init__(self, by): """Initialize TrellisGrid instance. @@ -589,12 +644,14 @@ def __init__(self, by): if len(by) != 2: raise ValueError("You must give a list of length 2 to group by") elif by[0] == '.' and by[1] == '.': - raise ValueError("At least one of grouping attributes must not be a dot") + raise ValueError( + "At least one of grouping attributes must not be a dot") self.by = by def trellis(self, layers): - """Create a trellis structure for a list of layers. - Each layer will be cloned with different data into a two dimensional grid. + """ + Create a trellis structure for a list of layers. Each layer will be + cloned with different data into a two dimensional grid. Parameters: ----------- @@ -602,7 +659,8 @@ def trellis(self, layers): Returns: -------- - trellised_layers: Clones of each layer in the list arranged in a trellised lattice + trellised_layers: Clones of each layer in the list arranged in a + trellised lattice """ trellised_layers = [] for layer in layers: @@ -628,8 +686,10 @@ def trellis(self, layers): else: self.rows = len(shingle1) self.cols = len(shingle2) - trellised = [[None for _ in range(self.cols)] for _ in range(self.rows)] - self.group_grid = [[None for _ in range(self.cols)] for _ in range(self.rows)] + trellised = [[None for _ in range(self.cols)] + for _ in range(self.rows)] + self.group_grid = [[None for _ in range( + self.cols)] for _ in range(self.rows)] row = 0 col = 0 for group, data in grouped: @@ -644,6 +704,7 @@ def trellis(self, layers): trellised_layers.append(trellised) return trellised_layers + def dictionary_union(dict1, dict2): """Take two dictionaries, return dictionary union. @@ -666,6 +727,7 @@ def dictionary_union(dict1, dict2): result[key2] = dict2[key2] return result + def merge_aes(layer1, layer2): """Merges the aesthetics dictionaries for the two layers. Look up sequence_layers function. Which layer is first and which @@ -680,11 +742,16 @@ def merge_aes(layer1, layer2): if layer2.aes[key] is None: layer2.aes[key] = layer1.aes[key] + def sequence_layers(layers): """ Go through the list of layers and fill in the missing bits of information.
@@ -666,6 +727,7 @@ def dictionary_union(dict1, dict2):
             result[key2] = dict2[key2]
     return result
 
+
 def merge_aes(layer1, layer2):
     """Merges the aesthetics dictionaries for the two layers.
     Look up sequence_layers function. Which layer is first and which
@@ -680,11 +742,16 @@ def merge_aes(layer1, layer2):
         if layer2.aes[key] is None:
             layer2.aes[key] = layer1.aes[key]
 
+
 def sequence_layers(layers):
-    """Go through the list of layers and fill in the missing bits of information.
+    """
+    Go through the list of layers and fill in the missing bits of information.
 
     The basic rules are this:
-    * If the current layer has data set to None, take the data from previous layer.
-    * For each aesthetic mapping, if that mapping is set to None, take it from previous layer.
+
+    * If the current layer has data set to None, take the data from previous
+      layer.
+    * For each aesthetic mapping, if that mapping is set to None, take it from
+      previous layer.
 
     Parameters:
     -----------
@@ -696,8 +763,11 @@ def sequence_layers(layers):
         merge_aes(layer1, layer2)
     return layers
 
+
 def sequence_grids(layer_grids):
-    """Go through the list of layer girds and perform the same thing as sequence_layers.
+    """
+    Go through the list of layer grids and perform the same thing as
+    sequence_layers.
 
     Parameters:
     -----------
@@ -711,8 +781,11 @@ def sequence_grids(layer_grids):
         merge_aes(layer1, layer2)
     return layer_grids
 
+
 def work_grid(grid, fig):
-    """Take a two dimensional grid, add subplots to a figure for each cell and do layer work.
+    """
+    Take a two dimensional grid, add subplots to a figure for each cell and do
+    layer work.
 
     Parameters:
     -----------
@@ -728,10 +801,12 @@ def work_grid(grid, fig):
     axes = [[None for _ in range(ncols)] for _ in range(nrows)]
     for row in range(nrows):
         for col in range(ncols):
-            axes[row][col] = fig.add_subplot(nrows, ncols, ncols * row + col + 1)
+            axes[row][col] = fig.add_subplot(
+                nrows, ncols, ncols * row + col + 1)
             grid[row][col].work(ax=axes[row][col])
     return axes
 
+
 def adjust_subplots(fig, axes, trellis, layers):
     """Adjust the subtplots on matplotlib figure with the
     fact that we have a trellis plot in mind.
@@ -763,20 +838,29 @@ def adjust_subplots(fig, axes, trellis, layers):
             axis.get_xaxis().set_ticks([])
             axis.set_xlabel('')
         if trellis.by[0] == '.':
-            label1 = "%s = %s" % (trellis.by[1], trellis.group_grid[index // trellis.cols][index % trellis.cols])
+            label1 = "%s = %s" % (trellis.by[1], trellis.group_grid[
+                index // trellis.cols][index % trellis.cols])
             label2 = None
         elif trellis.by[1] == '.':
-            label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[index // trellis.cols][index % trellis.cols])
+            label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[
+                index // trellis.cols][index % trellis.cols])
             label2 = None
         else:
-            label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[index // trellis.cols][index % trellis.cols][0])
-            label2 = "%s = %s" % (trellis.by[1], trellis.group_grid[index // trellis.cols][index % trellis.cols][1])
+            label1 = "%s = %s" % (
+                trellis.by[0],
+                trellis.group_grid[index // trellis.cols]
+                [index % trellis.cols][0])
+            label2 = "%s = %s" % (
+                trellis.by[1],
+                trellis.group_grid[index // trellis.cols]
+                [index % trellis.cols][1])
         if label2 is not None:
             axis.table(cellText=[[label1], [label2]],
-                loc='top', cellLoc='center',
-                cellColours=[['lightgrey'], ['lightgrey']])
+                       loc='top', cellLoc='center',
+                       cellColours=[['lightgrey'], ['lightgrey']])
         else:
-            axis.table(cellText=[[label1]], loc='top', cellLoc='center', cellColours=[['lightgrey']])
+            axis.table(cellText=[[label1]], loc='top',
+                       cellLoc='center', cellColours=[['lightgrey']])
     # Flatten the layer grid
     layers = [layer for row in layers for layer in row]
     legend = {}
@@ -800,15 +884,19 @@ def adjust_subplots(fig, axes, trellis, layers):
                 col1, val1, col2, val2 = key
                 labels.append("%s, %s" % (str(val1), str(val2)))
             else:
-                raise ValueError("Maximum 2 categorical attributes to display a lengend of")
+                raise ValueError(
+                    "Maximum 2 categorical attributes to display a legend of")
     if len(legend):
         fig.legend(patches, labels, loc='upper right')
     fig.subplots_adjust(wspace=0.05, hspace=0.2)
 
+
 class RPlot:
     """
-    The main plot object. Add layers to an instance of this object to create a plot.
+    The main plot object. Add layers to an instance of this object to create a
+    plot.
     """
+
     def __init__(self, data, x=None, y=None):
         """Initialize RPlot instance.
 
@@ -819,7 +907,6 @@ def __init__(self, data, x=None, y=None):
         y: string, DataFrame column name
         """
         self.layers = [Layer(data, **default_aes(x=x, y=y))]
-        trellised = False
 
     def add(self, layer):
         """Add a layer to RPlot instance.
@@ -829,7 +916,8 @@ def add(self, layer):
         layer: Layer instance
         """
         if not isinstance(layer, Layer):
-            raise TypeError("The operand on the right side of + must be a Layer instance")
+            raise TypeError(
+                "The operand on the right side of + must be a Layer instance")
         self.layers.append(layer)
 
     def render(self, fig=None):
@@ -873,13 +961,13 @@ def render(self, fig=None):
                     col1, val1, col2, val2 = key
                     labels.append("%s, %s" % (str(val1), str(val2)))
                 else:
-                    raise ValueError("Maximum 2 categorical attributes to display a lengend of")
+                    raise ValueError("Maximum 2 categorical attributes to "
+                                     "display a legend of")
             if len(legend):
                 fig.legend(patches, labels, loc='upper right')
         else:
-            # We have a trellised plot.
-            # First let's remove all other TrellisGrid instances from the layer list,
-            # including this one.
+            # We have a trellised plot. First let's remove all other
+            # TrellisGrid instances from the layer list, including this one.
             new_layers = []
             for layer in self.layers:
                 if not isinstance(layer, TrellisGrid):
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 6db2d2e15f699..9e64e0eeb2792 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -9,7 +9,7 @@
 import random
 
 import pandas as pd
-from pandas.compat import range, lrange, lzip, zip, StringIO
+from pandas.compat import range, lrange, lzip, StringIO
 from pandas import compat
 from pandas.tseries.index import DatetimeIndex
 from pandas.tools.merge import merge, concat, ordered_merge, MergeError
@@ -18,7 +18,8 @@
                                  assert_almost_equal,
                                  makeCustomDataframe as mkdf,
                                  assertRaisesRegexp)
-from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
+from pandas import (isnull, DataFrame, Index, MultiIndex, Panel,
+                    Series, date_range, read_csv)
 import pandas.algos as algos
 import pandas.util.testing as tm
 from numpy.testing.decorators import slow
@@ -414,7 +415,7 @@ def test_join_inner_multiindex(self):
         data = np.random.randn(len(key1))
         data = DataFrame({'key1': key1, 'key2': key2,
-                         'data': data})
+                          'data': data})
 
         index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                    ['one', 'two', 'three']],
@@ -459,8 +460,8 @@ def test_join_hierarchical_mixed(self):
 
     def test_join_float64_float32(self):
 
-        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype = np.float64)
-        b = DataFrame(randn(10, 1), columns=['c'], dtype = np.float32)
+        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
+        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
         joined = a.join(b)
         self.assertEqual(joined.dtypes['a'], 'float64')
         self.assertEqual(joined.dtypes['b'], 'float64')
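For reference, a small sketch of the dtype behaviour this test pins down (only the assertions visible above are reproduced; the join is index-aligned since the frames share no column names):

    import numpy as np
    from numpy.random import randn
    from pandas import DataFrame

    a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
    b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
    joined = a.join(b)  # row-index alignment; no key columns involved
    assert joined.dtypes['a'] == np.float64
    assert joined.dtypes['b'] == np.float64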
@@ -470,7 +471,7 @@ def test_join_float64_float32(self):
         b = np.random.random(100).astype('float64')
         c = np.random.random(100).astype('float32')
         df = DataFrame({'a': a, 'b': b, 'c': c})
-        xpdf = DataFrame({'a': a, 'b': b, 'c': c })
+        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
         s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
         rs = df.merge(s, left_on='a', right_index=True)
         self.assertEqual(rs.dtypes['a'], 'int64')
@@ -785,14 +786,14 @@ def test_merge_left_empty_right_notempty(self):
         right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                              columns=['x', 'y', 'z'])
 
-        exp_out = pd.DataFrame({'a': np.array([np.nan]*3, dtype=object),
-                                'b': np.array([np.nan]*3, dtype=object),
-                                'c': np.array([np.nan]*3, dtype=object),
+        exp_out = pd.DataFrame({'a': np.array([np.nan] * 3, dtype=object),
+                                'b': np.array([np.nan] * 3, dtype=object),
+                                'c': np.array([np.nan] * 3, dtype=object),
                                 'x': [1, 4, 7],
                                 'y': [2, 5, 8],
                                 'z': [3, 6, 9]},
                                columns=['a', 'b', 'c', 'x', 'y', 'z'])
-        exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
+        exp_in = exp_out[0:0]  # make empty DataFrame keeping dtype
         # result will have object dtype
         exp_in.index = exp_in.index.astype(object)
 
@@ -820,11 +821,11 @@ def test_merge_left_notempty_right_empty(self):
         exp_out = pd.DataFrame({'a': [1, 4, 7],
                                 'b': [2, 5, 8],
                                 'c': [3, 6, 9],
-                                'x': np.array([np.nan]*3, dtype=object),
-                                'y': np.array([np.nan]*3, dtype=object),
-                                'z': np.array([np.nan]*3, dtype=object)},
+                                'x': np.array([np.nan] * 3, dtype=object),
+                                'y': np.array([np.nan] * 3, dtype=object),
+                                'z': np.array([np.nan] * 3, dtype=object)},
                                columns=['a', 'b', 'c', 'x', 'y', 'z'])
-        exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
+        exp_in = exp_out[0:0]  # make empty DataFrame keeping dtype
         # result will have object dtype
         exp_in.index = exp_in.index.astype(object)
 
@@ -871,24 +872,30 @@ def test_merge_nosort(self):
         self.assertTrue((df.var3.unique() == result.var3.unique()).all())
 
     def test_merge_nan_right(self):
-        df1 = DataFrame({"i1" : [0, 1], "i2" : [0, 1]})
-        df2 = DataFrame({"i1" : [0], "i3" : [0]})
+        df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
+        df2 = DataFrame({"i1": [0], "i3": [0]})
         result = df1.join(df2, on="i1", rsuffix="_")
-        expected = DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
-                              'i1_': {0: 0, 1: np.nan}, 'i3': {0: 0.0, 1: np.nan},
-                              None: {0: 0, 1: 0}}).set_index(None).reset_index()[['i1', 'i2', 'i1_', 'i3']]
+        expected = (DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
+                               'i1_': {0: 0, 1: np.nan},
+                               'i3': {0: 0.0, 1: np.nan},
+                               None: {0: 0, 1: 0}})
+                    .set_index(None)
+                    .reset_index()[['i1', 'i2', 'i1_', 'i3']])
         assert_frame_equal(result, expected, check_dtype=False)
 
-        df1 = DataFrame({"i1" : [0, 1], "i2" : [0.5, 1.5]})
-        df2 = DataFrame({"i1" : [0], "i3" : [0.7]})
+        df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
+        df2 = DataFrame({"i1": [0], "i3": [0.7]})
         result = df1.join(df2, rsuffix="_", on='i1')
-        expected = DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
-                              'i2': {0: 0.5, 1: 1.5}, 'i3': {0: 0.69999999999999996,
-                                                             1: nan}})[['i1', 'i2', 'i1_', 'i3']]
+        expected = (DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
+                               'i2': {0: 0.5, 1: 1.5},
+                               'i3': {0: 0.69999999999999996,
+                                      1: nan}})
+                    [['i1', 'i2', 'i1_', 'i3']])
         assert_frame_equal(result, expected)
 
     def test_merge_type(self):
         class NotADataFrame(DataFrame):
+
             @property
             def _constructor(self):
                 return NotADataFrame
 
@@ -905,20 +912,24 @@ def test_append_dtype_coerce(self):
 
         import datetime as dt
         from pandas import NaT
 
-        df1 = DataFrame(index=[1,2], data=[dt.datetime(2013,1,1,0,0),
-                                           dt.datetime(2013,1,2,0,0)],
+        df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
+                                            dt.datetime(2013, 1, 2, 0, 0)],
                         columns=['start_time'])
-        df2 = DataFrame(index=[4,5], data=[[dt.datetime(2013,1,3,0,0),
-                                            dt.datetime(2013,1,3,6,10)],
-                                           [dt.datetime(2013,1,4,0,0),
-                                            dt.datetime(2013,1,4,7,10)]],
-                        columns=['start_time','end_time'])
-
-        expected = concat([
-            Series([NaT,NaT,dt.datetime(2013,1,3,6,10),dt.datetime(2013,1,4,7,10)],name='end_time'),
-            Series([dt.datetime(2013,1,1,0,0),dt.datetime(2013,1,2,0,0),dt.datetime(2013,1,3,0,0),dt.datetime(2013,1,4,0,0)],name='start_time'),
-        ],axis=1)
-        result = df1.append(df2,ignore_index=True)
+        df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
+                                             dt.datetime(2013, 1, 3, 6, 10)],
+                                            [dt.datetime(2013, 1, 4, 0, 0),
+                                             dt.datetime(2013, 1, 4, 7, 10)]],
+                        columns=['start_time', 'end_time'])
+
+        expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10),
+                                   dt.datetime(2013, 1, 4, 7, 10)],
+                                  name='end_time'),
+                           Series([dt.datetime(2013, 1, 1, 0, 0),
+                                   dt.datetime(2013, 1, 2, 0, 0),
+                                   dt.datetime(2013, 1, 3, 0, 0),
+                                   dt.datetime(2013, 1, 4, 0, 0)],
+                                  name='start_time')], axis=1)
+        result = df1.append(df2, ignore_index=True)
         assert_frame_equal(result, expected)
 
     def test_join_append_timedeltas(self):
@@ -934,18 +945,18 @@ def test_join_append_timedeltas(self):
             df = df.append(d, ignore_index=True)
         result = df.append(d, ignore_index=True)
         expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
-                                    dt.datetime(2013, 11, 5, 5, 56) ],
-                              't': [ dt.timedelta(0, 22500),
-                                     dt.timedelta(0, 22500) ]})
+                                    dt.datetime(2013, 11, 5, 5, 56)],
+                              't': [dt.timedelta(0, 22500),
+                                    dt.timedelta(0, 22500)]})
         assert_frame_equal(result, expected)
 
         td = np.timedelta64(300000000)
-        lhs = DataFrame(Series([td,td],index=["A","B"]))
-        rhs = DataFrame(Series([td],index=["A"]))
+        lhs = DataFrame(Series([td, td], index=["A", "B"]))
+        rhs = DataFrame(Series([td], index=["A"]))
 
-        from pandas import NaT
-        result = lhs.join(rhs,rsuffix='r', how="left")
-        expected = DataFrame({ '0' : Series([td,td],index=list('AB')), '0r' : Series([td,NaT],index=list('AB')) })
+        result = lhs.join(rhs, rsuffix='r', how="left")
+        expected = DataFrame({'0': Series([td, td], index=list('AB')),
+                              '0r': Series([td, NaT], index=list('AB'))})
         assert_frame_equal(result, expected)
 
     def test_overlapping_columns_error_message(self):
@@ -959,10 +970,10 @@ def test_overlapping_columns_error_message(self):
         df.columns = ['key', 'foo', 'foo']
         df2.columns = ['key', 'bar', 'bar']
         expected = DataFrame({'key': [1, 2, 3],
-                             'v1': [4, 5, 6],
-                             'v2': [7, 8, 9],
-                             'v3': [4, 5, 6],
-                             'v4': [7, 8, 9]})
+                              'v1': [4, 5, 6],
+                              'v2': [7, 8, 9],
+                              'v3': [4, 5, 6],
+                              'v4': [7, 8, 9]})
         expected.columns = ['key', 'foo', 'foo', 'bar', 'bar']
         assert_frame_equal(merge(df, df2), expected)
 
@@ -973,48 +984,58 @@ def test_overlapping_columns_error_message(self):
 
     def test_merge_on_datetime64tz(self):
 
         # GH11405
-        left = pd.DataFrame({'key' : pd.date_range('20151010',periods=2,tz='US/Eastern'),
-                             'value' : [1,2]})
-        right = pd.DataFrame({'key' : pd.date_range('20151011',periods=3,tz='US/Eastern'),
-                              'value' : [1,2,3]})
-
-        expected = DataFrame({'key' : pd.date_range('20151010',periods=4,tz='US/Eastern'),
-                              'value_x' : [1,2,np.nan,np.nan],
-                              'value_y' : [np.nan,1,2,3]})
+        left = pd.DataFrame({'key': pd.date_range('20151010', periods=2,
+                                                  tz='US/Eastern'),
+                             'value': [1, 2]})
+        right = pd.DataFrame({'key': pd.date_range('20151011', periods=3,
+                                                   tz='US/Eastern'),
+                              'value': [1, 2, 3]})
+
+        expected = DataFrame({'key': pd.date_range('20151010', periods=4,
+                                                   tz='US/Eastern'),
+                              'value_x': [1, 2, np.nan, np.nan],
+                              'value_y': [np.nan, 1, 2, 3]})
         result = pd.merge(left, right, on='key', how='outer')
         assert_frame_equal(result, expected)
 
-        left = pd.DataFrame({'value' : pd.date_range('20151010',periods=2,tz='US/Eastern'),
-                             'key' : [1,2]})
-        right = pd.DataFrame({'value' : pd.date_range('20151011',periods=2,tz='US/Eastern'),
-                              'key' : [2,3]})
-        expected = DataFrame({'value_x' : list(pd.date_range('20151010',periods=2,tz='US/Eastern')) + [pd.NaT],
-                              'value_y' : [pd.NaT] + list(pd.date_range('20151011',periods=2,tz='US/Eastern')),
-                              'key' : [1.,2,3]})
+        left = pd.DataFrame({'value': pd.date_range('20151010', periods=2,
+                                                    tz='US/Eastern'),
+                             'key': [1, 2]})
+        right = pd.DataFrame({'value': pd.date_range('20151011', periods=2,
+                                                     tz='US/Eastern'),
+                              'key': [2, 3]})
+        expected = DataFrame({
+            'value_x': list(pd.date_range('20151010', periods=2,
+                                          tz='US/Eastern')) + [pd.NaT],
+            'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2,
+                                                     tz='US/Eastern')),
+            'key': [1., 2, 3]})
         result = pd.merge(left, right, on='key', how='outer')
         assert_frame_equal(result, expected)
 
     def test_indicator(self):
         # PR #10054. xref #7412 and closes #8790.
-        df1 = DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]})
+        df1 = DataFrame({'col1': [0, 1], 'col_left': [
+            'a', 'b'], 'col_conflict': [1, 2]})
         df1_copy = df1.copy()
 
-        df2 = DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2],
-                         'col_conflict':[1,2,3,4,5]})
+        df2 = DataFrame({'col1': [1, 2, 3, 4, 5], 'col_right': [2, 2, 2, 2, 2],
+                         'col_conflict': [1, 2, 3, 4, 5]})
         df2_copy = df2.copy()
 
-        df_result = DataFrame({'col1':[0,1,2,3,4,5],
-                               'col_conflict_x':[1,2,np.nan,np.nan,np.nan,np.nan],
-                               'col_left':['a','b', np.nan,np.nan,np.nan,np.nan],
-                               'col_conflict_y':[np.nan,1,2,3,4,5],
-                               'col_right':[np.nan, 2,2,2,2,2]},
-                              dtype='float64')
-        df_result['_merge'] = Categorical(['left_only','both','right_only',
-                                           'right_only','right_only','right_only']
-                                          , categories=['left_only', 'right_only', 'both'])
+        df_result = DataFrame({
+            'col1': [0, 1, 2, 3, 4, 5],
+            'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan],
+            'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan],
+            'col_conflict_y': [np.nan, 1, 2, 3, 4, 5],
+            'col_right': [np.nan, 2, 2, 2, 2, 2]}, dtype='float64')
+        df_result['_merge'] = Categorical(
+            ['left_only', 'both', 'right_only',
+             'right_only', 'right_only', 'right_only'],
+            categories=['left_only', 'right_only', 'both'])
 
         df_result = df_result[['col1', 'col_conflict_x', 'col_left',
-                               'col_conflict_y', 'col_right', '_merge' ]]
+                               'col_conflict_y', 'col_right', '_merge']]
 
         test = merge(df1, df2, on='col1', how='outer', indicator=True)
         assert_frame_equal(test, df_result)
@@ -1027,11 +1048,14 @@ def test_indicator(self):
 
         # Check with custom name
         df_result_custom_name = df_result
-        df_result_custom_name = df_result_custom_name.rename(columns={'_merge':'custom_name'})
+        df_result_custom_name = df_result_custom_name.rename(
+            columns={'_merge': 'custom_name'})
 
-        test_custom_name = merge(df1, df2, on='col1', how='outer', indicator='custom_name')
+        test_custom_name = merge(
+            df1, df2, on='col1', how='outer', indicator='custom_name')
         assert_frame_equal(test_custom_name, df_result_custom_name)
-        test_custom_name = df1.merge(df2, on='col1', how='outer', indicator='custom_name')
+        test_custom_name = df1.merge(
+            df2, on='col1', how='outer', indicator='custom_name')
         assert_frame_equal(test_custom_name, df_result_custom_name)
 
         # Check only accepts strings and booleans
@@ -1059,35 +1083,41 @@ def test_indicator(self):
 
         # Check if working name in df
         for i in ['_right_indicator', '_left_indicator', '_merge']:
-            df_badcolumn = DataFrame({'col1':[1,2], i:[2,2]})
+            df_badcolumn = DataFrame({'col1': [1, 2], i: [2, 2]})
 
             with tm.assertRaises(ValueError):
-                merge(df1, df_badcolumn, on='col1', how='outer', indicator=True)
+                merge(df1, df_badcolumn, on='col1',
+                      how='outer', indicator=True)
             with tm.assertRaises(ValueError):
                 df1.merge(df_badcolumn, on='col1', how='outer', indicator=True)
 
         # Check for name conflict with custom name
-        df_badcolumn = DataFrame({'col1':[1,2], 'custom_column_name':[2,2]})
+        df_badcolumn = DataFrame(
+            {'col1': [1, 2], 'custom_column_name': [2, 2]})
 
         with tm.assertRaises(ValueError):
-            merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name')
+            merge(df1, df_badcolumn, on='col1', how='outer',
+                  indicator='custom_column_name')
         with tm.assertRaises(ValueError):
-            df1.merge(df_badcolumn, on='col1', how='outer', indicator='custom_column_name')
+            df1.merge(df_badcolumn, on='col1', how='outer',
+                      indicator='custom_column_name')
 
         # Merge on multiple columns
-        df3 = DataFrame({'col1':[0,1], 'col2':['a','b']})
+        df3 = DataFrame({'col1': [0, 1], 'col2': ['a', 'b']})
 
-        df4 = DataFrame({'col1':[1,1,3], 'col2':['b','x','y']})
+        df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']})
 
-        hand_coded_result = DataFrame({'col1':[0,1,1,3.0],
-                                       'col2':['a','b','x','y']})
+        hand_coded_result = DataFrame({'col1': [0, 1, 1, 3.0],
+                                       'col2': ['a', 'b', 'x', 'y']})
         hand_coded_result['_merge'] = Categorical(
-            ['left_only','both','right_only','right_only']
-            , categories=['left_only', 'right_only', 'both'])
+            ['left_only', 'both', 'right_only', 'right_only'],
+            categories=['left_only', 'right_only', 'both'])
 
-        test5 = merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True)
+        test5 = merge(df3, df4, on=['col1', 'col2'],
+                      how='outer', indicator=True)
         assert_frame_equal(test5, hand_coded_result)
-        test5 = df3.merge(df4, on=['col1', 'col2'], how='outer', indicator=True)
+        test5 = df3.merge(df4, on=['col1', 'col2'],
+                          how='outer', indicator=True)
         assert_frame_equal(test5, hand_coded_result)
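A minimal sketch of the indicator behaviour these tests exercise (hypothetical data; the categories are exactly those asserted above):

    import pandas as pd

    df1 = pd.DataFrame({'col1': [0, 1]})
    df2 = pd.DataFrame({'col1': [1, 2]})
    out = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
    # out['_merge'] is a Categorical with categories
    # ['left_only', 'right_only', 'both']; indicator='name' renames the column
    print(out['_merge'])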
@@ -1099,7 +1129,8 @@ def _check_merge(x, y):
                        sort=True)
         expected = expected.set_index('index')
 
-        assert_frame_equal(result, expected, check_names=False)  # TODO check_names on merge?
+        # TODO check_names on merge?
+        assert_frame_equal(result, expected, check_names=False)
 
 
 class TestMergeMulti(tm.TestCase):
@@ -1147,7 +1178,8 @@ def test_left_join_multi_index(self):
         def bind_cols(df):
             iord = lambda a: 0 if a != a else ord(a)
             f = lambda ts: ts.map(iord) - ord('a')
-            return f(df['1st']) + f(df['3rd'])* 1e2 + df['2nd'].fillna(0) * 1e4
+            return (f(df['1st']) + f(df['3rd']) * 1e2 +
+                    df['2nd'].fillna(0) * 1e4)
 
         def run_asserts(left, right):
             for sort in [False, True]:
@@ -1157,14 +1189,15 @@ def run_asserts(left, right):
                 self.assertFalse(res['4th'].isnull().any())
                 self.assertFalse(res['5th'].isnull().any())
 
-                tm.assert_series_equal(res['4th'], - res['5th'], check_names=False)
+                tm.assert_series_equal(
+                    res['4th'], - res['5th'], check_names=False)
                 result = bind_cols(res.iloc[:, :-2])
                 tm.assert_series_equal(res['4th'], result, check_names=False)
                 self.assertTrue(result.name is None)
 
                 if sort:
-                    tm.assert_frame_equal(res,
-                                          res.sort_values(icols, kind='mergesort'))
+                    tm.assert_frame_equal(
+                        res, res.sort_values(icols, kind='mergesort'))
 
                 out = merge(left, right.reset_index(), on=icols,
                             sort=sort, how='left')
@@ -1203,10 +1236,11 @@ def test_merge_right_vs_left(self):
         # compare left vs right merge with multikey
         for sort in [False, True]:
             merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
-                                     right_index=True, how='left', sort=sort)
+                                      right_index=True, how='left', sort=sort)
 
             merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
-                                        left_index=True, how='right', sort=sort)
+                                         left_index=True, how='right',
+                                         sort=sort)
 
             merged2 = merged2.ix[:, merged1.columns]
             assert_frame_equal(merged1, merged2)
@@ -1225,13 +1259,13 @@ def test_compress_group_combinations(self):
                          'value2': np.random.randn(10000)})
 
         # just to hit the label compression code path
-        merged = merge(df, df2, how='outer')
+        merge(df, df2, how='outer')
 
     def test_left_join_index_preserve_order(self):
 
         left = DataFrame({'k1': [0, 1, 2] * 8,
                           'k2': ['foo', 'bar'] * 12,
-                          'v': np.array(np.arange(24),dtype=np.int64) })
+                          'v': np.array(np.arange(24), dtype=np.int64)})
 
         index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
         right = DataFrame({'v2': [5, 7]}, index=index)
@@ -1240,18 +1274,19 @@ def test_left_join_index_preserve_order(self):
 
         expected = left.copy()
         expected['v2'] = np.nan
-        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'),'v2'] = 5
-        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7
+        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
 
         tm.assert_frame_equal(result, expected)
-        tm.assert_frame_equal(result.sort_values(['k1', 'k2'], kind='mergesort'),
-                              left.join(right, on=['k1', 'k2'], sort=True))
+        tm.assert_frame_equal(
+            result.sort_values(['k1', 'k2'], kind='mergesort'),
+            left.join(right, on=['k1', 'k2'], sort=True))
 
         # test join with multi dtypes blocks
         left = DataFrame({'k1': [0, 1, 2] * 8,
                           'k2': ['foo', 'bar'] * 12,
-                          'k3' : np.array([0, 1, 2]*8, dtype=np.float32),
-                          'v': np.array(np.arange(24),dtype=np.int32) })
+                          'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
+                          'v': np.array(np.arange(24), dtype=np.int32)})
 
         index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
         right = DataFrame({'v2': [5, 7]}, index=index)
@@ -1260,12 +1295,13 @@ def test_left_join_index_preserve_order(self):
 
         expected = left.copy()
         expected['v2'] = np.nan
-        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'),'v2'] = 5
-        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7
+        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
 
         tm.assert_frame_equal(result, expected)
-        tm.assert_frame_equal(result.sort_values(['k1', 'k2'], kind='mergesort'),
-                              left.join(right, on=['k1', 'k2'], sort=True))
+        tm.assert_frame_equal(
+            result.sort_values(['k1', 'k2'], kind='mergesort'),
+            left.join(right, on=['k1', 'k2'], sort=True))
 
         # do a right join for an extra test
         joined = merge(right, left, left_index=True,
@@ -1288,18 +1324,18 @@ def test_left_join_index_multi_match_multiindex(self):
             index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])
 
         right = DataFrame([
-            ['W', 'R', 'C',  0],
-            ['W', 'Q', 'B',  3],
-            ['W', 'Q', 'B',  8],
-            ['X', 'Y', 'A',  1],
-            ['X', 'Y', 'A',  4],
-            ['X', 'Y', 'B',  5],
-            ['X', 'Y', 'C',  6],
-            ['X', 'Y', 'C',  9],
+            ['W', 'R', 'C', 0],
+            ['W', 'Q', 'B', 3],
+            ['W', 'Q', 'B', 8],
+            ['X', 'Y', 'A', 1],
+            ['X', 'Y', 'A', 4],
+            ['X', 'Y', 'B', 5],
+            ['X', 'Y', 'C', 6],
+            ['X', 'Y', 'C', 9],
             ['X', 'Q', 'C', -6],
             ['X', 'R', 'C', -9],
-            ['V', 'Y', 'C',  7],
-            ['V', 'R', 'D',  2],
+            ['V', 'Y', 'C', 7],
+            ['V', 'R', 'D', 2],
             ['V', 'R', 'D', -1],
             ['V', 'Q', 'A', -3]],
             columns=['col1', 'col2', 'col3', 'val'])
@@ -1308,20 +1344,20 @@ def test_left_join_index_multi_match_multiindex(self):
         result = left.join(right, on=['cola', 'colb', 'colc'], how='left')
 
         expected = DataFrame([
-            ['X', 'Y', 'C', 'a',  6],
-            ['X', 'Y', 'C', 'a',  9],
+            ['X', 'Y', 'C', 'a', 6],
+            ['X', 'Y', 'C', 'a', 9],
             ['W', 'Y', 'C', 'e', nan],
-            ['V', 'Q', 'A', 'h',  -3],
-            ['V', 'R', 'D', 'i',  2],
-            ['V', 'R', 'D', 'i',  -1],
+            ['V', 'Q', 'A', 'h', -3],
+            ['V', 'R', 'D', 'i', 2],
+            ['V', 'R', 'D', 'i', -1],
             ['X', 'Y', 'D', 'b', nan],
-            ['X', 'Y', 'A', 'c',  1],
-            ['X', 'Y', 'A', 'c',  4],
-            ['W', 'Q', 'B', 'f',  3],
-            ['W', 'Q', 'B', 'f',  8],
-            ['W', 'R', 'C', 'g',  0],
-            ['V', 'Y', 'C', 'j',  7],
-            ['X', 'Y', 'B', 'd',  5]],
+            ['X', 'Y', 'A', 'c', 1],
+            ['X', 'Y', 'A', 'c', 4],
+            ['W', 'Q', 'B', 'f', 3],
+            ['W', 'Q', 'B', 'f', 8],
+            ['W', 'R', 'C', 'g', 0],
+            ['V', 'Y', 'C', 'j', 7],
+            ['X', 'Y', 'B', 'd', 5]],
             columns=['cola', 'colb', 'colc', 'tag', 'val'],
             index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])
 
@@ -1330,8 +1366,9 @@ def test_left_join_index_multi_match_multiindex(self):
 
         result = left.join(right, on=['cola', 'colb', 'colc'],
                            how='left', sort=True)
-        tm.assert_frame_equal(result,
-                              expected.sort_values(['cola', 'colb', 'colc'], kind='mergesort'))
+        tm.assert_frame_equal(
+            result,
+            expected.sort_values(['cola', 'colb', 'colc'], kind='mergesort'))
 
         # GH7331 - maintain left frame order in left merge
         right.reset_index(inplace=True)
@@ -1378,7 +1415,8 @@ def test_left_join_index_multi_match(self):
         tm.assert_frame_equal(result, expected)
 
         result = left.join(right, on='tag', how='left', sort=True)
-        tm.assert_frame_equal(result, expected.sort_values('tag', kind='mergesort'))
+        tm.assert_frame_equal(
+            result, expected.sort_values('tag', kind='mergesort'))
 
         # GH7331 - maintain left frame order in left merge
         result = merge(left, right.reset_index(), how='left', on='tag')
@@ -1388,13 +1426,14 @@ def test_join_multi_dtypes(self):
 
         # test with multi dtypes in the join index
-        def _test(dtype1,dtype2):
+        def _test(dtype1, dtype2):
             left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
                               'k2': ['foo', 'bar'] * 12,
-                              'v': np.array(np.arange(24),dtype=np.int64) })
+                              'v': np.array(np.arange(24), dtype=np.int64)})
 
             index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
-            right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
+            right = DataFrame(
+                {'v2': np.array([5, 7], dtype=dtype2)}, index=index)
 
             result = left.join(right, on=['k1', 'k2'])
 
@@ -1402,9 +1441,9 @@ def _test(dtype1, dtype2):
             if dtype2.kind == 'i':
                 dtype2 = np.dtype('float64')
-            expected['v2'] = np.array(np.nan,dtype=dtype2)
-            expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'),'v2'] = 5
-            expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7
+            expected['v2'] = np.array(np.nan, dtype=dtype2)
+            expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+            expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
 
             tm.assert_frame_equal(result, expected)
 
@@ -1412,9 +1451,9 @@ def _test(dtype1, dtype2):
             expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
             tm.assert_frame_equal(result, expected)
 
-        for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
-            for d2 in [np.int64,np.float64,np.float32,np.float16]:
-                _test(np.dtype(d1),np.dtype(d2))
+        for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]:
+            for d2 in [np.int64, np.float64, np.float32, np.float16]:
+                _test(np.dtype(d1), np.dtype(d2))
 
     def test_left_merge_na_buglet(self):
         left = DataFrame({'id': list('abcde'), 'v1': randn(5),
@@ -1517,7 +1556,8 @@ def test_int64_overflow_issues(self):
 
         # add duplicates to left frame
         left = concat([left, left], ignore_index=True)
 
-        right = DataFrame(np.random.randint(low, high, (n // 2, 7)).astype('int64'),
+        right = DataFrame(np.random.randint(low, high, (n // 2, 7))
+                          .astype('int64'),
                           columns=list('ABCDEFG'))
 
         # add duplicates & overlap with left to the right frame
@@ -1588,52 +1628,78 @@ def verify_order(df):
             assert_frame_equal(frame, align(res),
                                check_dtype=how not in ('right', 'outer'))
 
-
     def test_join_multi_levels(self):
 
         # GH 3662
         # merge multi-levels
-
-        household = DataFrame(dict(household_id = [1,2,3],
-                                   male = [0,1,0],
-                                   wealth = [196087.3,316478.7,294750]),
-                              columns = ['household_id','male','wealth']).set_index('household_id')
-        portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
-                                   asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
-                                   name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell","AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan],
-                                   share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
-                              columns = ['household_id','asset_id','name','share']).set_index(['household_id','asset_id'])
+        household = (
+            DataFrame(
+                dict(household_id=[1, 2, 3],
+                     male=[0, 1, 0],
+                     wealth=[196087.3, 316478.7, 294750]),
+                columns=['household_id', 'male', 'wealth'])
+            .set_index('household_id'))
+        portfolio = (
+            DataFrame(
+                dict(household_id=[1, 2, 2, 3, 3, 3, 4],
+                     asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
+                               "gb00b03mlx29", "lu0197800237", "nl0000289965",
+                               np.nan],
+                     name=["ABN Amro", "Robeco", "Royal Dutch Shell",
+                           "Royal Dutch Shell",
+                           "AAB Eastern Europe Equity Fund",
+                           "Postbank BioTech Fonds", np.nan],
+                     share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
+                columns=['household_id', 'asset_id', 'name', 'share'])
+            .set_index(['household_id', 'asset_id']))
         result = household.join(portfolio, how='inner')
-        expected = DataFrame(dict(male = [0,1,1,0,0,0],
-                                  wealth = [ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0 ],
-                                  name = ['ABN Amro','Robeco','Royal Dutch Shell','Royal Dutch Shell','AAB Eastern Europe Equity Fund','Postbank BioTech Fonds'],
-                                  share = [1.00,0.40,0.60,0.15,0.60,0.25],
-                                  household_id = [1,2,2,3,3,3],
-                                  asset_id = ['nl0000301109','nl0000289783','gb00b03mlx29','gb00b03mlx29','lu0197800237','nl0000289965']),
-                             ).set_index(['household_id','asset_id']).reindex(columns=['male','wealth','name','share'])
-        assert_frame_equal(result,expected)
+        expected = (
+            DataFrame(
+                dict(male=[0, 1, 1, 0, 0, 0],
+                     wealth=[196087.3, 316478.7, 316478.7,
+                             294750.0, 294750.0, 294750.0],
+                     name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
+                           'Royal Dutch Shell',
+                           'AAB Eastern Europe Equity Fund',
+                           'Postbank BioTech Fonds'],
+                     share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
+                     household_id=[1, 2, 2, 3, 3, 3],
+                     asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
+                               'gb00b03mlx29', 'lu0197800237',
+                               'nl0000289965']))
+            .set_index(['household_id', 'asset_id'])
+            .reindex(columns=['male', 'wealth', 'name', 'share']))
+        assert_frame_equal(result, expected)
 
-        assert_frame_equal(result,expected)
+        assert_frame_equal(result, expected)
 
         # equivalency
-        result2 = merge(household.reset_index(),portfolio.reset_index(),on=['household_id'],how='inner').set_index(['household_id','asset_id'])
-        assert_frame_equal(result2,expected)
+        result2 = (merge(household.reset_index(), portfolio.reset_index(),
+                         on=['household_id'], how='inner')
+                   .set_index(['household_id', 'asset_id']))
+        assert_frame_equal(result2, expected)
 
         result = household.join(portfolio, how='outer')
-        expected = concat([expected,DataFrame(dict(share = [1.00]),
-                                              index=MultiIndex.from_tuples([(4,np.nan)],
-                                                                           names=['household_id','asset_id']))],
-                          axis=0).reindex(columns=expected.columns)
-        assert_frame_equal(result,expected)
+        expected = (concat([
+            expected,
+            (DataFrame(
+                dict(share=[1.00]),
+                index=MultiIndex.from_tuples(
+                    [(4, np.nan)],
+                    names=['household_id', 'asset_id'])))
+        ], axis=0).reindex(columns=expected.columns))
+        assert_frame_equal(result, expected)
 
         # invalid cases
         household.index.name = 'foo'
+
         def f():
             household.join(portfolio, how='inner')
         self.assertRaises(ValueError, f)
 
         portfolio2 = portfolio.copy()
-        portfolio2.index.set_names(['household_id','foo'])
+        portfolio2.index.set_names(['household_id', 'foo'])
+
         def f():
             portfolio2.join(portfolio, how='inner')
         self.assertRaises(ValueError, f)
@@ -1642,45 +1708,72 @@ def test_join_multi_levels2(self):
 
         # some more advanced merges
         # GH6360
-        household = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
-                                   asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
-                                   share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
-                              columns = ['household_id','asset_id','share']).set_index(['household_id','asset_id'])
+        household = (
+            DataFrame(
+                dict(household_id=[1, 2, 2, 3, 3, 3, 4],
+                     asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
+                               "gb00b03mlx29", "lu0197800237", "nl0000289965",
+                               np.nan],
+                     share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
+                columns=['household_id', 'asset_id', 'share'])
+            .set_index(['household_id', 'asset_id']))
 
         log_return = DataFrame(dict(
-            asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"],
-            t = [233, 234, 235, 180, 181],
-            log_return = [.09604978, -.06524096, .03532373, .03025441, .036997]
-        )).set_index(["asset_id","t"])
-
-        expected = DataFrame(dict(
-            household_id = [2, 2, 2, 3, 3, 3, 3, 3],
-            asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"],
-            t = [233, 234, 235, 233, 234, 235, 180, 181],
-            share = [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
-            log_return = [.09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997]
-        )).set_index(["household_id", "asset_id", "t"]).reindex(columns=['share','log_return'])
+        log_return = DataFrame(dict(
+            asset_id=["gb00b03mlx29", "gb00b03mlx29",
+                      "gb00b03mlx29", "lu0197800237", "lu0197800237"],
+            t=[233, 234, 235, 180, 181],
+            log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
+        )).set_index(["asset_id", "t"])
+
+        expected = (
+            DataFrame(dict(
+                household_id=[2, 2, 2, 3, 3, 3, 3, 3],
+                asset_id=["gb00b03mlx29", "gb00b03mlx29",
+                          "gb00b03mlx29", "gb00b03mlx29",
+                          "gb00b03mlx29", "gb00b03mlx29",
+                          "lu0197800237", "lu0197800237"],
+                t=[233, 234, 235, 233, 234, 235, 180, 181],
+                share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
+                log_return=[.09604978, -.06524096, .03532373,
+                            .09604978, -.06524096, .03532373,
+                            .03025441, .036997]
+            ))
+            .set_index(["household_id", "asset_id", "t"])
+            .reindex(columns=['share', 'log_return']))
 
         def f():
             household.join(log_return, how='inner')
         self.assertRaises(NotImplementedError, f)
 
         # this is the equivalency
-        result = merge(household.reset_index(),log_return.reset_index(),on=['asset_id'],how='inner').set_index(['household_id','asset_id','t'])
-        assert_frame_equal(result,expected)
+        result = (merge(household.reset_index(), log_return.reset_index(),
+                        on=['asset_id'], how='inner')
+                  .set_index(['household_id', 'asset_id', 't']))
+        assert_frame_equal(result, expected)
 
-        expected = DataFrame(dict(
-            household_id = [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
-            asset_id = ["nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", "nl0000289965", None],
-            t = [None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
-            share = [1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
-            log_return = [None, None, .09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997, None, None]
-        )).set_index(["household_id", "asset_id", "t"])
+        expected = (
+            DataFrame(dict(
+                household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
+                asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
+                          "gb00b03mlx29", "gb00b03mlx29",
+                          "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
+                          "lu0197800237", "lu0197800237",
+                          "nl0000289965", None],
+                t=[None, None, 233, 234, 235, 233, 234,
+                   235, 180, 181, None, None],
+                share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15,
+                       0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
+                log_return=[None, None, .09604978, -.06524096, .03532373,
+                            .09604978, -.06524096, .03532373,
+                            .03025441, .036997, None, None]
+            ))
+            .set_index(["household_id", "asset_id", "t"]))
 
         def f():
             household.join(log_return, how='outer')
         self.assertRaises(NotImplementedError, f)
 
+
 def _check_join(left, right, result, join_col, how='left',
                 lsuffix='_x', rsuffix='_y'):
@@ -1722,7 +1815,7 @@ def _restrict_to_columns(group, columns, suffix):
     found = [c for c in group.columns
              if c in columns or c.replace(suffix, '') in columns]
 
-    # filter
+    # filter
     group = group.ix[:, found]
 
     # get rid of suffixes, if any
@@ -1823,7 +1916,8 @@ def test_append(self):
         # GH 6129
         df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
         row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
-        expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {'x': 3, 'y': 4, 'z': 6}, 'c' : {'z' : 7}})
+        expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {
+            'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}})
         result = df.append(row)
         assert_frame_equal(result, expected)
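The GH 6129 case in test_append above boils down to: appending a named Series adds one row labelled with the Series name, creating any missing columns. A short sketch, taken directly from the data in the test:

    from pandas import DataFrame, Series

    df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
    row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
    result = df.append(row)
    # row label 'z' comes from the Series name; the new column 'c'
    # is NaN for the pre-existing rows 'x' and 'y'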
@@ -1938,32 +2032,35 @@ def test_append_missing_column_proper_upcast(self):
 
     def test_concat_copy(self):
 
         df = DataFrame(np.random.randn(4, 3))
-        df2 = DataFrame(np.random.randint(0,10,size=4).reshape(4,1))
-        df3 = DataFrame({5 : 'foo'},index=range(4))
+        df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
+        df3 = DataFrame({5: 'foo'}, index=range(4))
 
         # these are actual copies
-        result = concat([df,df2,df3],axis=1,copy=True)
+        result = concat([df, df2, df3], axis=1, copy=True)
         for b in result._data.blocks:
             self.assertIsNone(b.values.base)
 
         # these are the same
-        result = concat([df,df2,df3],axis=1,copy=False)
+        result = concat([df, df2, df3], axis=1, copy=False)
         for b in result._data.blocks:
             if b.is_float:
-                self.assertTrue(b.values.base is df._data.blocks[0].values.base)
+                self.assertTrue(
+                    b.values.base is df._data.blocks[0].values.base)
             elif b.is_integer:
-                self.assertTrue(b.values.base is df2._data.blocks[0].values.base)
+                self.assertTrue(
+                    b.values.base is df2._data.blocks[0].values.base)
             elif b.is_object:
                 self.assertIsNotNone(b.values.base)
 
         # float block was consolidated
-        df4 = DataFrame(np.random.randn(4,1))
-        result = concat([df,df2,df3,df4],axis=1,copy=False)
+        df4 = DataFrame(np.random.randn(4, 1))
+        result = concat([df, df2, df3, df4], axis=1, copy=False)
         for b in result._data.blocks:
             if b.is_float:
                 self.assertIsNone(b.values.base)
             elif b.is_integer:
-                self.assertTrue(b.values.base is df2._data.blocks[0].values.base)
+                self.assertTrue(
+                    b.values.base is df2._data.blocks[0].values.base)
             elif b.is_object:
                 self.assertIsNotNone(b.values.base)
 
@@ -1984,7 +2081,7 @@ def test_concat_with_group_keys(self):
         result = concat([df, df], keys=[0, 1])
         exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
-                                            [0, 1, 2, 0, 1, 2]])
+                                             [0, 1, 2, 0, 1, 2]])
         expected = DataFrame(np.r_[df.values, df.values],
                              index=exp_index2)
         tm.assert_frame_equal(result, expected)
@@ -2015,10 +2112,11 @@ def test_concat_keys_specific_levels(self):
         self.assertEqual(result.columns.names[0], 'group_key')
 
     def test_concat_dataframe_keys_bug(self):
-        t1 = DataFrame({'value': Series([1, 2, 3],
-                                        index=Index(['a', 'b', 'c'], name='id'))})
-        t2 = DataFrame({'value': Series([7, 8],
-                                        index=Index(['a', 'b'], name='id'))})
+        t1 = DataFrame({
+            'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'],
+                                                   name='id'))})
+        t2 = DataFrame({
+            'value': Series([7, 8], index=Index(['a', 'b'], name='id'))})
 
         # it works
         result = concat([t1, t2], axis=1, keys=['t1', 't2'])
@@ -2027,20 +2125,23 @@ def test_concat_dataframe_keys_bug(self):
 
     def test_concat_series_partial_columns_names(self):
         # GH10698
-        foo = Series([1,2], name='foo')
-        bar = Series([1,2])
-        baz = Series([4,5])
+        foo = Series([1, 2], name='foo')
+        bar = Series([1, 2])
+        baz = Series([4, 5])
 
         result = concat([foo, bar, baz], axis=1)
-        expected = DataFrame({'foo' : [1,2], 0 : [1,2], 1 : [4,5]}, columns=['foo',0,1])
+        expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [
+            4, 5]}, columns=['foo', 0, 1])
         tm.assert_frame_equal(result, expected)
 
-        result = concat([foo, bar, baz], axis=1, keys=['red','blue','yellow'])
-        expected = DataFrame({'red' : [1,2], 'blue' : [1,2], 'yellow' : [4,5]}, columns=['red','blue','yellow'])
+        result = concat([foo, bar, baz], axis=1, keys=[
+            'red', 'blue', 'yellow'])
+        expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [
+            4, 5]}, columns=['red', 'blue', 'yellow'])
         tm.assert_frame_equal(result, expected)
 
         result = concat([foo, bar, baz], axis=1, ignore_index=True)
-        expected = DataFrame({0 : [1,2], 1 : [1,2], 2 : [4,5]})
+        expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
         tm.assert_frame_equal(result, expected)
 
     def test_concat_dict(self):
@@ -2109,8 +2210,9 @@ def test_concat_multiindex_with_tz(self):
         df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
         df = df.set_index(['dt', 'b'])
 
-        exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2,
-                                 tz='US/Pacific', name='dt')
+        exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02',
+                                  '2014-01-03'] * 2,
+                                 tz='US/Pacific', name='dt')
         exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b')
         exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
         expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2},
@@ -2214,70 +2316,81 @@ def test_dups_index(self):
 
         # GH 4771
 
         # single dtypes
-        df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C'])
+        df = DataFrame(np.random.randint(0, 10, size=40).reshape(
+            10, 4), columns=['A', 'A', 'C', 'C'])
 
-        result = concat([df,df],axis=1)
-        assert_frame_equal(result.iloc[:,:4],df)
-        assert_frame_equal(result.iloc[:,4:],df)
+        result = concat([df, df], axis=1)
+        assert_frame_equal(result.iloc[:, :4], df)
+        assert_frame_equal(result.iloc[:, 4:], df)
 
-        result = concat([df,df],axis=0)
-        assert_frame_equal(result.iloc[:10],df)
-        assert_frame_equal(result.iloc[10:],df)
+        result = concat([df, df], axis=0)
+        assert_frame_equal(result.iloc[:10], df)
+        assert_frame_equal(result.iloc[10:], df)
 
         # multi dtypes
-        df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
-                     DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
+        df = concat([DataFrame(np.random.randn(10, 4),
+                               columns=['A', 'A', 'B', 'B']),
+                     DataFrame(np.random.randint(0, 10, size=20)
+                               .reshape(10, 2),
+                               columns=['A', 'C'])],
                     axis=1)
 
-        result = concat([df,df],axis=1)
-        assert_frame_equal(result.iloc[:,:6],df)
-        assert_frame_equal(result.iloc[:,6:],df)
+        result = concat([df, df], axis=1)
+        assert_frame_equal(result.iloc[:, :6], df)
+        assert_frame_equal(result.iloc[:, 6:], df)
 
-        result = concat([df,df],axis=0)
-        assert_frame_equal(result.iloc[:10],df)
-        assert_frame_equal(result.iloc[10:],df)
+        result = concat([df, df], axis=0)
+        assert_frame_equal(result.iloc[:10], df)
+        assert_frame_equal(result.iloc[10:], df)
 
         # append
-        result = df.iloc[0:8,:].append(df.iloc[8:])
+        result = df.iloc[0:8, :].append(df.iloc[8:])
         assert_frame_equal(result, df)
 
-        result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10])
+        result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10])
         assert_frame_equal(result, df)
 
-        expected = concat([df,df],axis=0)
+        expected = concat([df, df], axis=0)
         result = df.append(df)
         assert_frame_equal(result, expected)
 
     def test_with_mixed_tuples(self):
         # 10697
         # columns have mixed tuples, so handle properly
-        df1 = DataFrame({ u'A' : 'foo', (u'B',1) : 'bar' },index=range(2))
-        df2 = DataFrame({ u'B' : 'foo', (u'B',1) : 'bar' },index=range(2))
-        result = concat([df1,df2])
+        df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2))
+        df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2))
+
+        # it works
+        concat([df1, df2])
 
     def test_join_dups(self):
 
         # joining dups
-        df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
-                     DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
+        df = concat([DataFrame(np.random.randn(10, 4),
+                               columns=['A', 'A', 'B', 'B']),
+                     DataFrame(np.random.randint(0, 10, size=20)
+                               .reshape(10, 2),
+                               columns=['A', 'C'])],
                     axis=1)
 
-        expected = concat([df,df],axis=1)
-        result = df.join(df,rsuffix='_2')
+        expected = concat([df, df], axis=1)
+        result = df.join(df, rsuffix='_2')
         result.columns = expected.columns
         assert_frame_equal(result, expected)
 
         # GH 4975, invalid join on dups
-        w = DataFrame(np.random.randn(4,2), columns=["x", "y"])
-        x = DataFrame(np.random.randn(4,2), columns=["x", "y"])
-        y = DataFrame(np.random.randn(4,2), columns=["x", "y"])
-        z = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
 
-        dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer")
+        dta = x.merge(y, left_index=True, right_index=True).merge(
+            z, left_index=True, right_index=True, how="outer")
         dta = dta.merge(w, left_index=True, right_index=True)
-        expected = concat([x,y,z,w],axis=1)
-        expected.columns=['x_x','y_x','x_y','y_y','x_x','y_x','x_y','y_y']
-        assert_frame_equal(dta,expected)
+        expected = concat([x, y, z, w], axis=1)
+        expected.columns = ['x_x', 'y_x', 'x_y',
+                            'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
+        assert_frame_equal(dta, expected)
 
     def test_handle_empty_objects(self):
         df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
@@ -2291,22 +2404,23 @@ def test_handle_empty_objects(self):
 
         expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']]
         expected['foo'] = expected['foo'].astype('O')
-        expected.loc[0:4,'foo'] = 'bar'
+        expected.loc[0:4, 'foo'] = 'bar'
 
         tm.assert_frame_equal(concatted, expected)
 
         # empty as first element with time series
         # GH3259
-        df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
+        df = DataFrame(dict(A=range(10000)), index=date_range(
+            '20130101', periods=10000, freq='s'))
         empty = DataFrame()
-        result = concat([df,empty],axis=1)
+        result = concat([df, empty], axis=1)
         assert_frame_equal(result, df)
-        result = concat([empty,df],axis=1)
+        result = concat([empty, df], axis=1)
         assert_frame_equal(result, df)
 
-        result = concat([df,empty])
+        result = concat([df, empty])
         assert_frame_equal(result, df)
-        result = concat([empty,df])
+        result = concat([empty, df])
         assert_frame_equal(result, df)
 
     def test_concat_mixed_objs(self):
@@ -2315,56 +2429,64 @@ def test_concat_mixed_objs(self):
 
         # G2385
 
         # axis 1
-        index=date_range('01-Jan-2013', periods=10, freq='H')
+        index = date_range('01-Jan-2013', periods=10, freq='H')
         arr = np.arange(10, dtype='int64')
         s1 = Series(arr, index=index)
         s2 = Series(arr, index=index)
-        df = DataFrame(arr.reshape(-1,1), index=index)
+        df = DataFrame(arr.reshape(-1, 1), index=index)
 
-        expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
-        result = concat([df,df], axis=1)
+        expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
+                             index=index, columns=[0, 0])
+        result = concat([df, df], axis=1)
         assert_frame_equal(result, expected)
 
-        expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
-        result = concat([s1,s2], axis=1)
+        expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
+                             index=index, columns=[0, 1])
+        result = concat([s1, s2], axis=1)
         assert_frame_equal(result, expected)
 
-        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
-        result = concat([s1,s2,s1], axis=1)
+        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+                             index=index, columns=[0, 1, 2])
+        result = concat([s1, s2, s1], axis=1)
         assert_frame_equal(result, expected)
 
-        expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
-        result = concat([s1,df,s2,s2,s1], axis=1)
+        expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5),
+                             index=index, columns=[0, 0, 1, 2, 3])
+        result = concat([s1, df, s2, s2, s1], axis=1)
         assert_frame_equal(result, expected)
 
         # with names
         s1.name = 'foo'
-        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
-        result = concat([s1,df,s2], axis=1)
+        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+                             index=index, columns=['foo', 0, 0])
+        result = concat([s1, df, s2], axis=1)
         assert_frame_equal(result, expected)
 
         s2.name = 'bar'
-        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
-        result = concat([s1,df,s2], axis=1)
+        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+                             index=index, columns=['foo', 0, 'bar'])
+        result = concat([s1, df, s2], axis=1)
         assert_frame_equal(result, expected)
 
         # ignore index
-        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
-        result = concat([s1,df,s2], axis=1, ignore_index=True)
+        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+                             index=index, columns=[0, 1, 2])
+        result = concat([s1, df, s2], axis=1, ignore_index=True)
         assert_frame_equal(result, expected)
 
         # axis 0
-        expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
-        result = concat([s1,df,s2])
+        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1),
+                             index=index.tolist() * 3, columns=[0])
+        result = concat([s1, df, s2])
        assert_frame_equal(result, expected)
 
-        expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
-        result = concat([s1,df,s2], ignore_index=True)
+        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
+        result = concat([s1, df, s2], ignore_index=True)
         assert_frame_equal(result, expected)
 
         # invalid concatente of mixed dims
         panel = tm.makePanel()
-        self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))
+        self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1))
 
     def test_panel_join(self):
         panel = tm.makePanel()
@@ -2576,7 +2698,8 @@ def test_concat_series_axis1(self):
         s2.name = None
         result = concat([s, s2], axis=1)
-        self.assertTrue(np.array_equal(result.columns, Index(['A', 0], dtype='object')))
+        self.assertTrue(np.array_equal(
+            result.columns, Index(['A', 0], dtype='object')))
 
         # must reindex, #2603
         s = Series(randn(3), index=['c', 'a', 'b'], name='A')
@@ -2614,7 +2737,7 @@ def test_concat_datetime64_block(self):
 
     def test_concat_timedelta64_block(self):
         from pandas import to_timedelta
 
-        rng = to_timedelta(np.arange(10),unit='s')
+        rng = to_timedelta(np.arange(10), unit='s')
 
         df = DataFrame({'time': rng})
 
@@ -2640,8 +2763,8 @@ def test_concat_bug_1719(self):
         ts1 = tm.makeTimeSeries()
         ts2 = tm.makeTimeSeries()[::2]
 
-        ## to join with union
-        ## these two are of different length!
+        # to join with union
+        # these two are of different length!
         left = concat([ts1, ts2], join='outer', axis=1)
         right = concat([ts2, ts1], join='outer', axis=1)
 
@@ -2654,22 +2777,24 @@ def test_concat_bug_2972(self):
         result = concat([ts0, ts1], axis=1)
 
         expected = DataFrame({0: ts0, 1: ts1})
-        expected.columns=['same name', 'same name']
+        expected.columns = ['same name', 'same name']
         assert_frame_equal(result, expected)
 
     def test_concat_bug_3602(self):
 
         # GH 3602, duplicate columns
-        df1 = DataFrame({'firmNo' : [0,0,0,0], 'stringvar' : ['rrr', 'rrr', 'rrr', 'rrr'], 'prc' : [6,6,6,6] })
-        df2 = DataFrame({'misc' : [1,2,3,4], 'prc' : [6,6,6,6], 'C' : [9,10,11,12]})
-        expected = DataFrame([[0,6,'rrr',9,1,6],
-                              [0,6,'rrr',10,2,6],
-                              [0,6,'rrr',11,3,6],
-                              [0,6,'rrr',12,4,6]])
-        expected.columns = ['firmNo','prc','stringvar','C','misc','prc']
-
-        result = concat([df1,df2],axis=1)
-        assert_frame_equal(result,expected)
+        df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'stringvar': [
+            'rrr', 'rrr', 'rrr', 'rrr'], 'prc': [6, 6, 6, 6]})
+        df2 = DataFrame({'misc': [1, 2, 3, 4], 'prc': [
+            6, 6, 6, 6], 'C': [9, 10, 11, 12]})
+        expected = DataFrame([[0, 6, 'rrr', 9, 1, 6],
+                              [0, 6, 'rrr', 10, 2, 6],
+                              [0, 6, 'rrr', 11, 3, 6],
+                              [0, 6, 'rrr', 12, 4, 6]])
+        expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc']
+
+        result = concat([df1, df2], axis=1)
+        assert_frame_equal(result, expected)
 
     def test_concat_series_axis1_same_names_ignore_index(self):
         dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
@@ -2689,29 +2814,38 @@ def test_concat_iterables(self):
         expected = DataFrame([1, 2, 3, 4, 5, 6])
         assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
         assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
-        assert_frame_equal(concat((df for df in (df1, df2)), ignore_index=True), expected)
-        assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)
+        assert_frame_equal(concat((df for df in (df1, df2)),
+                                  ignore_index=True), expected)
+        assert_frame_equal(
+            concat(deque((df1, df2)), ignore_index=True), expected)
+
         class CustomIterator1(object):
+
             def __len__(self):
                 return 2
+
             def __getitem__(self, index):
                 try:
                     return {0: df1, 1: df2}[index]
                 except KeyError:
                     raise IndexError
-        assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected)
+        assert_frame_equal(pd.concat(CustomIterator1(),
+                                     ignore_index=True), expected)
+
         class CustomIterator2(Iterable):
+
             def __iter__(self):
                 yield df1
                 yield df2
-        assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected)
+        assert_frame_equal(pd.concat(CustomIterator2(),
+                                     ignore_index=True), expected)
 
     def test_concat_invalid(self):
 
         # trying to concat a ndframe with a non-ndframe
         df1 = mkdf(10, 2)
-        for obj in [1, dict(), [1, 2], (1, 2) ]:
-            self.assertRaises(TypeError, lambda x: concat([ df1, obj ]))
+        for obj in [1, dict(), [1, 2], (1, 2)]:
+            self.assertRaises(TypeError, lambda x: concat([df1, obj]))
 
     def test_concat_invalid_first_argument(self):
         df1 = mkdf(10, 2)
@@ -2719,7 +2853,7 @@ def test_concat_invalid_first_argument(self):
         self.assertRaises(TypeError, concat, df1, df2)
 
         # generator ok though
-        concat(DataFrame(np.random.rand(5,5)) for _ in range(3))
+        concat(DataFrame(np.random.rand(5, 5)) for _ in range(3))
 
         # text reader ok
         # GH6583
@@ -2735,7 +2869,8 @@ def test_concat_invalid_first_argument(self):
         reader = read_csv(StringIO(data), chunksize=1)
         result = concat(reader, ignore_index=True)
         expected = read_csv(StringIO(data))
-        assert_frame_equal(result,expected)
+        assert_frame_equal(result, expected)
+
 
 class TestOrderedMerge(tm.TestCase):
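test_concat_invalid_first_argument above also covers concatenating a chunked reader; in sketch form, with tiny inline data:

    from pandas import concat, read_csv
    from pandas.compat import StringIO

    data = 'x,y\n1,2\n3,4\n'
    reader = read_csv(StringIO(data), chunksize=1)  # iterable TextFileReader
    result = concat(reader, ignore_index=True)      # same as one full read_csv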
@@ -2789,6 +2924,7 @@ def test_multigroup(self):
 
     def test_merge_type(self):
         class NotADataFrame(DataFrame):
+
             @property
             def _constructor(self):
                 return NotADataFrame
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
index cb7e9102b21a0..d303f489d9dea 100644
--- a/pandas/tools/tests/test_pivot.py
+++ b/pandas/tools/tests/test_pivot.py
@@ -1,4 +1,4 @@
-import datetime
+from datetime import datetime, date, timedelta
 
 import numpy as np
 from numpy.testing import assert_equal
@@ -32,9 +32,11 @@ def setUp(self):
     def test_pivot_table(self):
         index = ['A', 'B']
         columns = 'C'
-        table = pivot_table(self.data, values='D', index=index, columns=columns)
+        table = pivot_table(self.data, values='D',
+                            index=index, columns=columns)
 
-        table2 = self.data.pivot_table(values='D', index=index, columns=columns)
+        table2 = self.data.pivot_table(
+            values='D', index=index, columns=columns)
         tm.assert_frame_equal(table, table2)
 
         # this works
@@ -50,13 +52,14 @@ def test_pivot_table(self):
         else:
             self.assertEqual(table.columns.name, columns[0])
 
-        expected = self.data.groupby(index + [columns])['D'].agg(np.mean).unstack()
+        expected = self.data.groupby(
+            index + [columns])['D'].agg(np.mean).unstack()
         tm.assert_frame_equal(table, expected)
 
     def test_pivot_table_nocols(self):
         df = DataFrame({'rows': ['a', 'b', 'c'],
                         'cols': ['x', 'y', 'z'],
-                        'values': [1,2,3]})
+                        'values': [1, 2, 3]})
         rs = df.pivot_table(columns='cols', aggfunc=np.sum)
         xp = df.pivot_table(index='cols', aggfunc=np.sum).T
         tm.assert_frame_equal(rs, xp)
@@ -70,9 +73,12 @@ def test_pivot_table_dropna(self):
             'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
             'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
             'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
-            'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}})
-        pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
-        pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)
+            'quantity': {0: 2000000, 1: 500000,
+                         2: 1000000, 3: 1000000}})
+        pv_col = df.pivot_table('quantity', 'month', [
+            'customer', 'product'], dropna=False)
+        pv_ind = df.pivot_table(
+            'quantity', ['customer', 'product'], 'month', dropna=False)
 
         m = MultiIndex.from_tuples([(u('A'), u('a')),
                                     (u('A'), u('b')),
@@ -90,9 +96,9 @@ def test_pivot_table_dropna(self):
         assert_equal(pv_col.columns.values, m.values)
         assert_equal(pv_ind.index.values, m.values)
 
-
     def test_pass_array(self):
-        result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C)
+        result = self.data.pivot_table(
+            'D', index=self.data.A, columns=self.data.C)
         expected = self.data.pivot_table('D', index='A', columns='C')
         tm.assert_frame_equal(result, expected)
 
@@ -113,21 +119,25 @@ def test_pivot_table_multiple(self):
     def test_pivot_dtypes(self):
 
         # can convert dtypes
-        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']})
+        f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [
+            1, 2, 3, 4], 'i': ['a', 'b', 'a', 'b']})
         self.assertEqual(f.dtypes['v'], 'int64')
 
-        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum)
+        z = pivot_table(f, values='v', index=['a'], columns=[
+            'i'], fill_value=0, aggfunc=np.sum)
         result = z.get_dtype_counts()
-        expected = Series(dict(int64 = 2))
+        expected = Series(dict(int64=2))
         tm.assert_series_equal(result, expected)
 
         # cannot convert dtypes
-        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']})
'bat', 'cat', 'bat'], 'v': [ + 1.5, 2.5, 3.5, 4.5], 'i': ['a', 'b', 'a', 'b']}) self.assertEqual(f.dtypes['v'], 'float64') - z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean) + z = pivot_table(f, values='v', index=['a'], columns=[ + 'i'], fill_value=0, aggfunc=np.mean) result = z.get_dtype_counts() - expected = Series(dict(float64 = 2)) + expected = Series(dict(float64=2)) tm.assert_series_equal(result, expected) def test_pivot_multi_values(self): @@ -160,20 +170,20 @@ def test_pivot_multi_functions(self): def test_pivot_index_with_nan(self): # GH 3588 nan = np.nan - df = DataFrame({'a':['R1', 'R2', nan, 'R4'], - 'b':['C1', 'C2', 'C3' , 'C4'], - 'c':[10, 15, 17, 20]}) - result = df.pivot('a','b','c') - expected = DataFrame([[nan,nan,17,nan],[10,nan,nan,nan], - [nan,15,nan,nan],[nan,nan,nan,20]], - index = Index([nan,'R1','R2','R4'], name='a'), - columns = Index(['C1','C2','C3','C4'], name='b')) + df = DataFrame({'a': ['R1', 'R2', nan, 'R4'], + 'b': ['C1', 'C2', 'C3', 'C4'], + 'c': [10, 15, 17, 20]}) + result = df.pivot('a', 'b', 'c') + expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan], + [nan, 15, nan, nan], [nan, nan, nan, 20]], + index=Index([nan, 'R1', 'R2', 'R4'], name='a'), + columns=Index(['C1', 'C2', 'C3', 'C4'], name='b')) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T) # GH9491 - df = DataFrame({'a':pd.date_range('2014-02-01', periods=6, freq='D'), - 'c':100 + np.arange(6)}) + df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'), + 'c': 100 + np.arange(6)}) df['b'] = df['a'] - pd.Timestamp('2014-02-02') df.loc[1, 'a'] = df.loc[3, 'a'] = nan df.loc[1, 'b'] = df.loc[4, 'b'] = nan @@ -188,39 +198,46 @@ def test_pivot_index_with_nan(self): def test_pivot_with_tz(self): # GH 5878 - df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0), - datetime.datetime(2013, 1, 2, 9, 0), - datetime.datetime(2013, 1, 1, 9, 0), - datetime.datetime(2013, 1, 2, 9, 0)], - 'dt2': [datetime.datetime(2014, 1, 1, 9, 0), - datetime.datetime(2014, 1, 1, 9, 0), - datetime.datetime(2014, 1, 2, 9, 0), - datetime.datetime(2014, 1, 2, 9, 0)], - 'data1': np.arange(4,dtype='int64'), - 'data2': np.arange(4,dtype='int64')}) + df = DataFrame({'dt1': [datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0)], + 'dt2': [datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0)], + 'data1': np.arange(4, dtype='int64'), + 'data2': np.arange(4, dtype='int64')}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) - exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2, + exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', + '2014/01/02 09:00'] * 2, name='dt2', tz='Asia/Tokyo') exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], - name='dt1', tz='US/Pacific'), + index=pd.DatetimeIndex(['2013/01/01 09:00', + '2013/01/02 09:00'], + name='dt1', + tz='US/Pacific'), columns=exp_col) - pv = df.pivot(index='dt1', columns='dt2') + pv = df.pivot(index='dt1', columns='dt2') tm.assert_frame_equal(pv, expected) expected = DataFrame([[0, 2], [1, 3]], - index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], 
- name='dt1', tz='US/Pacific'), - columns=pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], - name='dt2', tz='Asia/Tokyo')) - - pv = df.pivot(index='dt1', columns='dt2', values='data1') + index=pd.DatetimeIndex(['2013/01/01 09:00', + '2013/01/02 09:00'], + name='dt1', + tz='US/Pacific'), + columns=pd.DatetimeIndex(['2014/01/01 09:00', + '2014/01/02 09:00'], + name='dt2', + tz='Asia/Tokyo')) + + pv = df.pivot(index='dt1', columns='dt2', values='data1') tm.assert_frame_equal(pv, expected) def test_margins(self): @@ -287,19 +304,19 @@ def _check_output(result, values_col, index=['A', 'B'], # issue number #8349: pivot_table with margins and dictionary aggfunc data = [ {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2013, - 'MONTH': 12, 'DAYS': 3, 'SALARY': 17}, + 'MONTH': 12, 'DAYS': 3, 'SALARY': 17}, {'JOB': 'Employ', 'NAME': - 'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23}, + 'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23}, {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, - 'MONTH': 1, 'DAYS': 10, 'SALARY': 100}, + 'MONTH': 1, 'DAYS': 10, 'SALARY': 100}, {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, - 'MONTH': 1, 'DAYS': 11, 'SALARY': 110}, + 'MONTH': 1, 'DAYS': 11, 'SALARY': 110}, {'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, - 'MONTH': 1, 'DAYS': 15, 'SALARY': 200}, + 'MONTH': 1, 'DAYS': 15, 'SALARY': 200}, {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, - 'MONTH': 2, 'DAYS': 8, 'SALARY': 80}, + 'MONTH': 2, 'DAYS': 8, 'SALARY': 80}, {'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, - 'MONTH': 2, 'DAYS': 5, 'SALARY': 190}, + 'MONTH': 2, 'DAYS': 5, 'SALARY': 190}, ] df = DataFrame(data) @@ -328,14 +345,16 @@ def _check_output(result, values_col, index=['A', 'B'], def test_pivot_integer_columns(self): # caused by upstream bug in unstack - d = datetime.date.min + d = date.min data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], - [d + datetime.timedelta(i) for i in range(20)], [1.0])) + [d + timedelta(i) + for i in range(20)], [1.0])) df = DataFrame(data) table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) df2 = df.rename(columns=str) - table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2']) + table2 = df2.pivot_table( + values='4', index=['0', '1', '3'], columns=['2']) tm.assert_frame_equal(table, table2, check_names=False) @@ -382,7 +401,8 @@ def test_pivot_columns_lexsorted(self): iproduct = np.random.randint(0, len(products), n) items['Index'] = products['Index'][iproduct] items['Symbol'] = products['Symbol'][iproduct] - dr = pd.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) + dr = pd.date_range(date(2000, 1, 1), + date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items['Year'] = dates.year items['Month'] = dates.month @@ -406,24 +426,32 @@ def test_pivot_complex_aggfunc(self): def test_margins_no_values_no_cols(self): # Regression test on pivot table: no values or cols passed. 
- result = self.data[['A', 'B']].pivot_table(index=['A', 'B'], aggfunc=len, margins=True) + result = self.data[['A', 'B']].pivot_table( + index=['A', 'B'], aggfunc=len, margins=True) result_list = result.tolist() self.assertEqual(sum(result_list[:-1]), result_list[-1]) def test_margins_no_values_two_rows(self): - # Regression test on pivot table: no values passed but rows are a multi-index - result = self.data[['A', 'B', 'C']].pivot_table(index=['A', 'B'], columns='C', aggfunc=len, margins=True) + # Regression test on pivot table: no values passed but rows are a + # multi-index + result = self.data[['A', 'B', 'C']].pivot_table( + index=['A', 'B'], columns='C', aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) def test_margins_no_values_one_row_one_col(self): - # Regression test on pivot table: no values passed but row and col defined - result = self.data[['A', 'B']].pivot_table(index='A', columns='B', aggfunc=len, margins=True) + # Regression test on pivot table: no values passed but row and col + # defined + result = self.data[['A', 'B']].pivot_table( + index='A', columns='B', aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0]) def test_margins_no_values_two_row_two_cols(self): - # Regression test on pivot table: no values passed but rows and cols are multi-indexed - self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'] - result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) + # Regression test on pivot table: no values passed but rows and cols + # are multi-indexed + self.data['D'] = ['a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k'] + result = self.data[['A', 'B', 'C', 'D']].pivot_table( + index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) def test_pivot_table_with_margins_set_margin_name(self): @@ -447,30 +475,37 @@ def test_pivot_table_with_margins_set_margin_name(self): def test_pivot_timegrouper(self): df = DataFrame({ - 'Branch' : 'A A A A A A A B'.split(), + 'Branch': 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), - datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), - datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date') - - expected = DataFrame(np.array([10, 18, 3],dtype='int64').reshape(1, 3), - index=[datetime.datetime(2013, 12, 31)], + 'Date': [datetime(2013, 1, 1), + datetime(2013, 1, 1), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 12, 2), + datetime(2013, 12, 2), ]}).set_index('Date') + + expected = DataFrame(np.array([10, 18, 3], dtype='int64') + .reshape(1, 3), + index=[datetime(2013, 12, 31)], columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', values='Quantity', aggfunc=np.sum) - tm.assert_frame_equal(result,expected) + tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), values='Quantity', aggfunc=np.sum) - tm.assert_frame_equal(result,expected.T) + tm.assert_frame_equal(result, expected.T) - expected = DataFrame(np.array([1, np.nan, 3, 9, 18, 
np.nan]).reshape(2, 3), - index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)], + expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]) + .reshape(2, 3), + index=[datetime(2013, 1, 1), + datetime(2013, 7, 1)], columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' @@ -485,57 +520,80 @@ def test_pivot_timegrouper(self): # passing the name df = df.reset_index() - result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer', + result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), + columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'), + result = pivot_table(df, index='Buyer', + columns=Grouper(freq='6MS', key='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) - self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'), - columns='Buyer', values='Quantity', aggfunc=np.sum)) - self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum)) + self.assertRaises(KeyError, lambda: pivot_table( + df, index=Grouper(freq='6MS', key='foo'), + columns='Buyer', values='Quantity', aggfunc=np.sum)) + self.assertRaises(KeyError, lambda: pivot_table( + df, index='Buyer', + columns=Grouper(freq='6MS', key='foo'), + values='Quantity', aggfunc=np.sum)) # passing the level df = df.set_index('Date') - result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer', - values='Quantity', aggfunc=np.sum) + result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), + columns='Buyer', values='Quantity', + aggfunc=np.sum) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'), + result = pivot_table(df, index='Buyer', + columns=Grouper(freq='6MS', level='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) - self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'), - columns='Buyer', values='Quantity', aggfunc=np.sum)) - self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum)) + self.assertRaises(ValueError, lambda: pivot_table( + df, index=Grouper(freq='6MS', level='foo'), + columns='Buyer', values='Quantity', aggfunc=np.sum)) + self.assertRaises(ValueError, lambda: pivot_table( + df, index='Buyer', + columns=Grouper(freq='6MS', level='foo'), + values='Quantity', aggfunc=np.sum)) # double grouper df = DataFrame({ - 'Branch' : 'A A A A A A A B'.split(), + 'Branch': 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1,3,5,1,8,1,9,3], - 'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5), - datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0), - datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0), - datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)], - 'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5), - datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0), - datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0), - datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]}) + 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], + 
'Date': [datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 11, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 12, 5, 14, 0)], + 'PayDay': [datetime(2013, 10, 4, 0, 0), + datetime(2013, 10, 15, 13, 5), + datetime(2013, 9, 5, 20, 0), + datetime(2013, 11, 2, 10, 0), + datetime(2013, 10, 7, 20, 0), + datetime(2013, 9, 5, 10, 0), + datetime(2013, 12, 30, 12, 0), + datetime(2013, 11, 20, 14, 0), ]}) result = pivot_table(df, index=Grouper(freq='M', key='Date'), columns=Grouper(freq='M', key='PayDay'), values='Quantity', aggfunc=np.sum) - expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, - np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4), - index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), - datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)], - columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), - datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)]) + expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, + 6, np.nan, 1, 9, + np.nan, 9, np.nan, np.nan, np.nan, + np.nan, 3, np.nan]).reshape(4, 4), + index=[datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31)], + columns=[datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31)]) expected.index.name = 'Date' expected.columns.name = 'PayDay' @@ -546,74 +604,97 @@ def test_pivot_timegrouper(self): values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) - tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)), - (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)), - (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)), - (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)), - (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)), - (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),] + tuples = [(datetime(2013, 9, 30), datetime(2013, 10, 31)), + (datetime(2013, 10, 31), + datetime(2013, 9, 30)), + (datetime(2013, 10, 31), + datetime(2013, 11, 30)), + (datetime(2013, 10, 31), + datetime(2013, 12, 31)), + (datetime(2013, 11, 30), + datetime(2013, 10, 31)), + (datetime(2013, 12, 31), datetime(2013, 11, 30)), ] idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan, - 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2), + 9, np.nan, 9, np.nan, + np.nan, 3]).reshape(6, 2), index=idx, columns=['A', 'B']) expected.columns.name = 'Branch' - result = pivot_table(df, index=[Grouper(freq='M', key='Date'), - Grouper(freq='M', key='PayDay')], columns=['Branch'], - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, index=[Grouper(freq='M', key='Date'), + Grouper(freq='M', key='PayDay')], columns=['Branch'], + values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'), - Grouper(freq='M', key='PayDay')], + result = pivot_table(df, index=['Branch'], + columns=[Grouper(freq='M', key='Date'), + Grouper(freq='M', key='PayDay')], values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) def test_pivot_datetime_tz(self): - dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', - '2011-07-19 07:00:00', 
'2011-07-19 08:00:00', '2011-07-19 09:00:00'] - dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', - '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] + dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', + '2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'] + dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', + '2013-01-01 15:00:00', + '2013-02-01 15:00:00', '2013-02-01 15:00:00', + '2013-02-01 15:00:00'] df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, - 'value1': np.arange(6,dtype='int64'), 'value2': [1, 2] * 3}) + 'value1': np.arange(6, dtype='int64'), + 'value2': [1, 2] * 3}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) - exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1') + exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], + tz='US/Pacific', name='dt1') exp_col1 = Index(['value1', 'value1']) exp_col2 = Index(['a', 'b'], name='label') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) - result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1']) + result = pivot_table(df, index=['dt1'], columns=[ + 'label'], values=['value1']) tm.assert_frame_equal(result, expected) - - exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean']) + exp_col1 = Index(['sum', 'sum', 'sum', 'sum', + 'mean', 'mean', 'mean', 'mean']) exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) - exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4, + exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', + '2013-02-01 15:00:00'] * 4, tz='Asia/Tokyo', name='dt2') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], - [2, 5, 1, 2, 2, 5, 1, 2]], dtype='int64'), + [2, 5, 1, 2, 2, 5, 1, 2]], + dtype='int64'), index=exp_idx, columns=exp_col) - result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'], + result = pivot_table(df, index=['dt1'], columns=['dt2'], + values=['value1', 'value2'], aggfunc=[np.sum, np.mean]) tm.assert_frame_equal(result, expected) def test_pivot_dtaccessor(self): # GH 8103 - dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', - '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', - '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] + dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', + '2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'] + dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', + '2013-01-01 15:00:00', + '2013-02-01 15:00:00', '2013-02-01 15:00:00', + '2013-02-01 15:00:00'] df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, - 'value1': np.arange(6,dtype='int64'), 'value2': [1, 2] * 3}) + 'value1': np.arange(6, dtype='int64'), + 'value2': [1, 2] * 3}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d)) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d)) @@ -621,31 +702,37 @@ def test_pivot_dtaccessor(self): values='value1') 
exp_idx = Index(['a', 'b'], name='label') - expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, - index=exp_idx, columns=Index([7, 8, 9],name='dt1')) + expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=exp_idx, + columns=Index([7, 8, 9], name='dt1')) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=df['dt2'].dt.month, columns=df['dt1'].dt.hour, + result = pivot_table(df, index=df['dt2'].dt.month, + columns=df['dt1'].dt.hour, values='value1') - expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, - index=Index([1, 2],name='dt2'), columns=Index([7, 8, 9],name='dt1')) + expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=Index([1, 2], name='dt2'), + columns=Index([7, 8, 9], name='dt1')) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df['dt2'].dt.year.values, columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') - exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3],names=['dt1','dt2']) - expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]],dtype='int64'), + exp_col = MultiIndex.from_arrays( + [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2']) + expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'), index=[2013], columns=exp_col) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=np.array(['X', 'X', 'X', 'X', 'Y', 'Y']), + result = pivot_table(df, index=np.array(['X', 'X', 'X', + 'X', 'Y', 'Y']), columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan], - [np.nan, np.nan, np.nan, 4, np.nan, 5]]), + [np.nan, np.nan, np.nan, + 4, np.nan, 5]]), index=['X', 'Y'], columns=exp_col) tm.assert_frame_equal(result, expected) @@ -748,16 +835,20 @@ def test_crosstab_pass_values(self): df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values}) - expected = df.pivot_table('values', index=['foo', 'bar'], columns='baz', - aggfunc=np.sum) + expected = df.pivot_table('values', index=['foo', 'bar'], + columns='baz', aggfunc=np.sum) tm.assert_frame_equal(table, expected) def test_crosstab_dropna(self): # GH 3820 - a = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'], dtype=object) - b = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'], dtype=object) - c = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny', 'shiny'], dtype=object) - res = crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False) + a = np.array(['foo', 'foo', 'foo', 'bar', + 'bar', 'foo', 'foo'], dtype=object) + b = np.array(['one', 'one', 'two', 'one', + 'two', 'two', 'two'], dtype=object) + c = np.array(['dull', 'dull', 'dull', 'dull', + 'dull', 'shiny', 'shiny'], dtype=object) + res = crosstab(a, [b, c], rownames=['a'], + colnames=['b', 'c'], dropna=False) m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), ('two', 'dull'), ('two', 'shiny')]) assert_equal(res.columns.values, m.values) @@ -768,9 +859,9 @@ def test_categorical_margins(self): 'y': np.arange(8) // 4, 'z': np.arange(8) % 2}) - expected = pd.DataFrame([[1.0, 2.0, 1.5],[5, 6, 5.5],[3, 4, 3.5]]) - expected.index = Index([0,1,'All'],name='y') - expected.columns = Index([0,1,'All'],name='z') + expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected.index = Index([0, 1, 'All'], name='y') + expected.columns = Index([0, 1, 'All'], name='z') data = df.copy() table = data.pivot_table('x', 'y', 'z', margins=True) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 
eac6973bffb25..63dc769f2ed75 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -4,7 +4,7 @@ import numpy as np from pandas.compat import zip -from pandas import DataFrame, Series, unique +from pandas import Series import pandas.util.testing as tm from pandas.util.testing import assertRaisesRegexp import pandas.core.common as com @@ -113,7 +113,7 @@ def test_na_handling(self): def test_inf_handling(self): data = np.arange(6) - data_ser = Series(data,dtype='int64') + data_ser = Series(data, dtype='int64') result = cut(data, [-np.inf, 2, 4, np.inf]) result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) @@ -151,7 +151,8 @@ def test_qcut_specify_quantiles(self): self.assertTrue(factor.equals(expected)) def test_qcut_all_bins_same(self): - assertRaisesRegexp(ValueError, "edges.*unique", qcut, [0,0,0,0,0,0,0,0,0,0], 3) + assertRaisesRegexp(ValueError, "edges.*unique", qcut, + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) def test_cut_out_of_bounds(self): arr = np.random.randn(100) @@ -230,19 +231,21 @@ def test_qcut_binning_issues(self): def test_cut_return_categorical(self): from pandas import Categorical - s = Series([0,1,2,3,4,5,6,7,8]) - res = cut(s,3) - exp = Series(Categorical.from_codes([0,0,0,1,1,1,2,2,2], - ["(-0.008, 2.667]", "(2.667, 5.333]", "(5.333, 8]"], + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = cut(s, 3) + exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], + ["(-0.008, 2.667]", + "(2.667, 5.333]", "(5.333, 8]"], ordered=True)) tm.assert_series_equal(res, exp) def test_qcut_return_categorical(self): from pandas import Categorical - s = Series([0,1,2,3,4,5,6,7,8]) - res = qcut(s,[0,0.333,0.666,1]) - exp = Series(Categorical.from_codes([0,0,0,1,1,1,2,2,2], - ["[0, 2.664]", "(2.664, 5.328]", "(5.328, 8]"], + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = qcut(s, [0, 0.333, 0.666, 1]) + exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], + ["[0, 2.664]", + "(2.664, 5.328]", "(5.328, 8]"], ordered=True)) tm.assert_series_equal(res, exp) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index a00b27c81e668..8a40f65af869a 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -2,7 +2,7 @@ import locale import codecs import nose -from nose.tools import assert_raises, assert_true +from nose.tools import assert_raises import numpy as np from numpy.testing import assert_equal @@ -22,7 +22,7 @@ def test_simple(self): x, y = list('ABC'), [1, 22] result = cartesian_product([x, y]) expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), - np.array([ 1, 22, 1, 22, 1, 22])] + np.array([1, 22, 1, 22, 1, 22])] assert_equal(result, expected) def test_datetimeindex(self): @@ -91,6 +91,7 @@ def test_set_locale(self): class TestToNumeric(tm.TestCase): + def test_series(self): s = pd.Series(['1', '-3.14', '7']) res = to_numeric(s) @@ -130,7 +131,7 @@ def test_numeric(self): tm.assert_series_equal(res, expected) def test_all_nan(self): - s = pd.Series(['a','b','c']) + s = pd.Series(['a', 'b', 'c']) res = to_numeric(s, errors='coerce') expected = pd.Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(res, expected) @@ -147,4 +148,3 @@ def test_type_check(self): if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) - diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 416addfcf2ad5..f66ace14ccf50 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -2,9 +2,8 @@ Quantilization functions and related stuff """ -from 
pandas.core.api import DataFrame, Series +from pandas.core.api import Series from pandas.core.categorical import Categorical -from pandas.core.index import _ensure_index import pandas.core.algorithms as algos import pandas.core.common as com import pandas.core.nanops as nanops @@ -34,8 +33,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, right == True (the default), then the bins [1,2,3,4] indicate (1,2], (2,3], (3,4]. labels : array or boolean, default None - Used as labels for the resulting bins. Must be of the same length as the resulting - bins. If False, return only integer indicators of the bins. + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. @@ -47,9 +47,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Returns ------- out : Categorical or Series or array of integers if labels is False - The return type (Categorical or Series) depends on the input: a Series of type category if - input is a Series else Categorical. Bins are represented as categories when categorical - data is returned. + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. bins : ndarray of floats Returned only if `retbins` is True. @@ -66,10 +66,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) - ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], (6.533, 9.7], (0.191, 3.367]] + ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], + (6.533, 9.7], (0.191, 3.367]] Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, labels=["good","medium","bad"]) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, + labels=["good","medium","bad"]) [good, good, good, medium, bad, good] Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) @@ -109,11 +111,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') - return _bins_to_cuts(x, bins, right=right, labels=labels,retbins=retbins, precision=precision, + return _bins_to_cuts(x, bins, right=right, labels=labels, + retbins=retbins, precision=precision, include_lowest=include_lowest) - def qcut(x, q, labels=None, retbins=False, precision=3): """ Quantile-based discretization function. Discretize variable into @@ -128,8 +130,9 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles labels : array or boolean, default None - Used as labels for the resulting bins. Must be of the same length as the resulting - bins. If False, return only integer indicators of the bins. + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. 
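The docstring hunks above describe the `cut`/`qcut` return types: a `Categorical` for array input, a Series of dtype `category` for Series input, with bins represented as categories. A short sketch of that behavior, mirroring the return-type tests in `test_tile.py`; the interval labels themselves are illustrative since their printed precision depends on the data:

```python
import pandas as pd

s = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8])

# Series in -> Series of dtype 'category' out; bins become categories.
binned = pd.cut(s, 3)
quartiles = pd.qcut(s, [0, 0.333, 0.666, 1])
assert str(binned.dtype) == 'category'
assert str(quartiles.dtype) == 'category'

# labels=False returns plain integer bin indicators instead.
codes = pd.cut(s, 3, labels=False)
assert list(codes[:3]) == [0, 0, 0]
```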
@@ -139,9 +142,9 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Returns ------- out : Categorical or Series or array of integers if labels is False - The return type (Categorical or Series) depends on the input: a Series of type category if - input is a Series else Categorical. Bins are represented as categories when categorical - data is returned. + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. bins : ndarray of floats Returned only if `retbins` is True. @@ -165,9 +168,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3): else: quantiles = q bins = algos.quantile(x, quantiles) - return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,precision=precision, - include_lowest=True) - + return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, + precision=precision, include_lowest=True) def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, diff --git a/pandas/tools/util.py b/pandas/tools/util.py index c3ebadfdb9e0b..3b7becdf64a10 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -36,7 +36,7 @@ def cartesian_product(X): return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]), np.product(a[i])) - for i, x in enumerate(X)] + for i, x in enumerate(X)] def _compose2(f, g):
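For context on the `cartesian_product` helper whose indentation this last hunk fixes, here is a minimal sketch mirroring `test_simple` from `test_util.py` above; the import path assumes this era's module layout (`pandas.tools.util`):

```python
import numpy as np
from pandas.tools.util import cartesian_product  # era-specific path

x, y = list('ABC'), [1, 22]
result = cartesian_product([x, y])

# Each input is repeated/tiled so the pair (result[0][i], result[1][i])
# walks the full cross product of the inputs:
np.testing.assert_equal(result[0], np.array(['A', 'A', 'B', 'B', 'C', 'C']))
np.testing.assert_equal(result[1], np.array([1, 22, 1, 22, 1, 22]))
```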