From c47a8216feadcf0b86ce00a8196636041429442a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 4 Jan 2016 17:01:06 +0100
Subject: [PATCH] Reformat with YAPF

---
 pandas/core/categorical.py |  381 +++++++----
 pandas/core/common.py      |  434 +++++++-----
 pandas/core/frame.py       | 1285 ++++++++++++++++++++++++------------
 3 files changed, 1395 insertions(+), 705 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 462ead70c9f93..212b05b0bf5e4 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -13,17 +13,17 @@
 from pandas.core.missing import interpolate_2d
 from pandas.util.decorators import cache_readonly, deprecate_kwarg

-from pandas.core.common import (ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
-                                isnull, notnull, is_dtype_equal,
-                                is_categorical_dtype, is_integer_dtype, is_object_dtype,
-                                _possibly_infer_to_datetimelike, get_dtype_kinds,
-                                is_list_like, is_sequence, is_null_slice, is_bool,
-                                _ensure_platform_int, _ensure_object, _ensure_int64,
-                                _coerce_indexer_dtype, take_1d)
+from pandas.core.common import (
+    ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, isnull,
+    notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype,
+    is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds,
+    is_list_like, is_sequence, is_null_slice, is_bool, _ensure_platform_int,
+    _ensure_object, _ensure_int64, _coerce_indexer_dtype, take_1d)
 from pandas.core.dtypes import CategoricalDtype
 from pandas.util.terminal import get_terminal_size
 from pandas.core.config import get_option

+
 def _cat_compare_op(op):
     def f(self, other):
         # On python2, you can usually compare any type to any type, and Categoricals can be
@@ -31,15 +31,18 @@ def f(self, other):
         # the same or not is kind of insane, so be a bit stricter here and use the python3 idea
         # of comparing only things of equal type.
         if not self.ordered:
-            if op in ['__lt__', '__gt__','__le__','__ge__']:
-                raise TypeError("Unordered Categoricals can only compare equality or not")
+            if op in ['__lt__', '__gt__', '__le__', '__ge__']:
+                raise TypeError(
+                    "Unordered Categoricals can only compare equality or not")
         if isinstance(other, Categorical):
             # Two Categoricals can only be compared if the categories are the same
             if (len(self.categories) != len(other.categories)) or \
                     not ((self.categories == other.categories).all()):
-                raise TypeError("Categoricals can only be compared if 'categories' are the same")
+                raise TypeError(
+                    "Categoricals can only be compared if 'categories' are the same")
             if not (self.ordered == other.ordered):
-                raise TypeError("Categoricals can only be compared if 'ordered' is the same")
+                raise TypeError(
+                    "Categoricals can only be compared if 'ordered' is the same")
             na_mask = (self._codes == -1) | (other._codes == -1)
             f = getattr(self._codes, op)
             ret = f(other._codes)
@@ -73,23 +76,25 @@ def f(self, other):

         # allow categorical vs object dtype array comparisons for equality
         # these are only positional comparisons
-        if op in ['__eq__','__ne__']:
-            return getattr(np.array(self),op)(np.array(other))
+        if op in ['__eq__', '__ne__']:
+            return getattr(np.array(self), op)(np.array(other))

         msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
               "compare values, use 'np.asarray(cat) <op> other'."
-        raise TypeError(msg.format(op=op,typ=type(other)))
+        raise TypeError(msg.format(op=op, typ=type(other)))

     f.__name__ = op

     return f

+
 def maybe_to_categorical(array):
     """ coerce to a categorical if a series is given """
     if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
         return array._values
     return array

+
 _codes_doc = """The category codes of this categorical.

 Level codes are an array of integers which are the positions of the real
@@ -124,8 +129,9 @@ def maybe_to_categorical(array):
     remove_unused_categories
     set_categories
 """
-class Categorical(PandasObject):

+
+class Categorical(PandasObject):
     """
     Represents a categorical variable in classic R / S-plus fashion

@@ -185,7 +191,6 @@ class Categorical(PandasObject):
     """
     dtype = CategoricalDtype()
     """The dtype (always "category")"""
-
     """Whether or not this Categorical is ordered.

    Only ordered `Categoricals` can be sorted (according to the order
@@ -203,13 +208,20 @@ class Categorical(PandasObject):
     __array_priority__ = 1000
     _typ = 'categorical'

-    def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False,
+    def __init__(self,
+                 values,
+                 categories=None,
+                 ordered=False,
+                 name=None,
+                 fastpath=False,
                  levels=None):

         if fastpath:
             # fast path
             self._codes = _coerce_indexer_dtype(values, categories)
-            self._categories = self._validate_categories(categories, fastpath=isinstance(categories, ABCIndexClass))
+            self._categories = self._validate_categories(
+                categories,
+                fastpath=isinstance(categories, ABCIndexClass))
             self._ordered = ordered
             return

@@ -220,13 +232,17 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F

         # TODO: Remove after deprecation period in 2017/ after 0.18
         if not levels is None:
-            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
-                 FutureWarning, stacklevel=2)
+            warn(
+                "Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
+                FutureWarning,
+                stacklevel=2)
             if categories is None:
                 categories = levels
             else:
-                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
-                                 "use only 'categories'", stacklevel=2)
+                raise ValueError(
+                    "Cannot pass in both 'categories' and (deprecated) 'levels', "
+                    "use only 'categories'",
+                    stacklevel=2)

         # sanitize input
         if is_categorical_dtype(values):
@@ -250,7 +266,8 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
             # which is fine, but since factorize does this correctly no need here
             # this is an issue because _sanitize_array also coerces np.nan to a string
             # under certain versions of numpy as well
-            values = _possibly_infer_to_datetimelike(values, convert_dates=True)
+            values = _possibly_infer_to_datetimelike(values,
+                                                     convert_dates=True)
             if not isinstance(values, np.ndarray):
                 values = _convert_to_list_like(values)
                 from pandas.core.series import _sanitize_array
@@ -260,7 +277,6 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
                 dtype = 'object' if isnull(values).any() else None
                 values = _sanitize_array(values, None, dtype=dtype)

-
         if categories is None:
             try:
                 codes, categories = factorize(values, sort=True)
@@ -269,12 +285,14 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
                 if ordered:
                     # raise, as we don't have a sortable data structure and so the user should
                     # give us one by specifying categories
-                    raise TypeError("'values' is not ordered, please explicitly specify the "
-                                    "categories order by passing in a categories argument.")
+                    raise TypeError(
"'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument.") except ValueError: ### FIXME #### - raise NotImplementedError("> 1 ndim Categorical are not supported at this time") + raise NotImplementedError( + "> 1 ndim Categorical are not supported at this time") categories = self._validate_categories(categories) @@ -290,12 +308,19 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F # TODO: check for old style usage. These warnings should be removes after 0.18/ in 2016 if is_integer_dtype(values) and not is_integer_dtype(categories): - warn("Values and categories have different dtypes. Did you mean to use\n" - "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2) - - if len(values) and is_integer_dtype(values) and (codes == -1).all(): - warn("None of the categories were found in values. Did you mean to use\n" - "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2) + warn( + "Values and categories have different dtypes. Did you mean to use\n" + "'Categorical.from_codes(codes, categories)'?", + RuntimeWarning, + stacklevel=2) + + if len(values) and is_integer_dtype(values) and ( + codes == -1).all(): + warn( + "None of the categories were found in values. Did you mean to use\n" + "'Categorical.from_codes(codes, categories)'?", + RuntimeWarning, + stacklevel=2) self.set_ordered(ordered or False, inplace=True) self._categories = categories @@ -303,8 +328,10 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F def copy(self): """ Copy constructor. """ - return Categorical(values=self._codes.copy(),categories=self.categories, - ordered=self.ordered, fastpath=True) + return Categorical(values=self._codes.copy(), + categories=self.categories, + ordered=self.ordered, + fastpath=True) def astype(self, dtype): """ coerce this type to another dtype """ @@ -379,14 +406,19 @@ def from_codes(cls, codes, categories, ordered=False, name=None): try: codes = np.asarray(codes, np.int64) except: - raise ValueError("codes need to be convertible to an arrays of integers") + raise ValueError( + "codes need to be convertible to an arrays of integers") categories = cls._validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): - raise ValueError("codes need to be between -1 and len(categories)-1") + raise ValueError( + "codes need to be between -1 and len(categories)-1") - return Categorical(codes, categories=categories, ordered=ordered, fastpath=True) + return Categorical(codes, + categories=categories, + ordered=ordered, + fastpath=True) _codes = None @@ -416,7 +448,9 @@ def _get_labels(self): Deprecated, use .codes! """ - warn("'labels' is deprecated. Use 'codes' instead", FutureWarning, stacklevel=2) + warn("'labels' is deprecated. Use 'codes' instead", + FutureWarning, + stacklevel=2) return self.codes labels = property(fget=_get_labels, fset=_set_codes) @@ -441,7 +475,8 @@ def _validate_categories(cls, categories, fastpath=False): # on categories with NaNs, int values would be converted to float. # Use "object" dtype to prevent this. 
             if isnull(categories).any():
-                without_na = np.array([x for x in categories if notnull(x)])
+                without_na = np.array([x for x in categories if notnull(x)
+                                       ])
                 with_na = np.array(categories)
                 if with_na.dtype != without_na.dtype:
                     dtype = "object"
@@ -478,9 +513,11 @@ def _set_categories(self, categories, fastpath=False):

         """
         categories = self._validate_categories(categories, fastpath=fastpath)
-        if not fastpath and not self._categories is None and len(categories) != len(self._categories):
-            raise ValueError("new categories need to have the same number of items than the old "
-                             "categories!")
+        if not fastpath and not self._categories is None and len(
+                categories) != len(self._categories):
+            raise ValueError(
+                "new categories need to have the same number of items as the old "
+                "categories!")

         self._categories = categories

@@ -489,16 +526,22 @@ def _get_categories(self):
         # categories is an Index, which is immutable -> no need to copy
         return self._categories

-    categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc)
+    categories = property(fget=_get_categories,
+                          fset=_set_categories,
+                          doc=_categories_doc)

     def _set_levels(self, levels):
         """ set new levels (deprecated, use "categories") """
-        warn("Assigning to 'levels' is deprecated, use 'categories'", FutureWarning, stacklevel=2)
+        warn("Assigning to 'levels' is deprecated, use 'categories'",
+             FutureWarning,
+             stacklevel=2)
         self.categories = levels

     def _get_levels(self):
         """ Gets the levels (deprecated, use "categories") """
-        warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning, stacklevel=2)
+        warn("Accessing 'levels' is deprecated, use 'categories'",
+             FutureWarning,
+             stacklevel=2)
         return self.categories

     # TODO: Remove after deprecation period in 2017/ after 0.18
@@ -508,7 +551,8 @@ def _get_levels(self):

     def _set_ordered(self, value):
         """ Sets the ordered attribute to the boolean value """
-        warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", FutureWarning,
+        warn("Setting 'ordered' directly is deprecated, use 'set_ordered'",
+             FutureWarning,
             stacklevel=2)
         self.set_ordered(value, inplace=True)
@@ -560,7 +604,11 @@ def _get_ordered(self):

     ordered = property(fget=_get_ordered, fset=_set_ordered)

-    def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
+    def set_categories(self,
+                       new_categories,
+                       ordered=None,
+                       rename=False,
+                       inplace=False):
         """ Sets the categories to the specified new_categories.
        `new_categories` can include new categories (which will result in unused categories) or
@@ -611,7 +659,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
         new_categories = self._validate_categories(new_categories)
         cat = self if inplace else self.copy()
         if rename:
-            if not cat._categories is None and len(new_categories) < len(cat._categories):
+            if not cat._categories is None and len(new_categories) < len(
+                    cat._categories):
                 # remove all _codes which are larger and set to -1/NaN
                 self._codes[self._codes >= len(new_categories)] = -1
         else:
@@ -697,8 +746,11 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False):
         set_categories
         """
         if set(self._categories) != set(new_categories):
-            raise ValueError("items in new_categories are not the same as in old categories")
-        return self.set_categories(new_categories, ordered=ordered, inplace=inplace)
+            raise ValueError(
+                "items in new_categories are not the same as in old categories")
+        return self.set_categories(new_categories,
+                                   ordered=ordered,
+                                   inplace=inplace)

     def add_categories(self, new_categories, inplace=False):
         """ Add new categories.
@@ -735,7 +787,8 @@ def add_categories(self, new_categories, inplace=False):
             new_categories = [new_categories]
         already_included = set(new_categories) & set(self._categories)
         if len(already_included) != 0:
-            msg = "new categories must not include old categories: %s" % str(already_included)
+            msg = "new categories must not include old categories: %s" % str(
+                already_included)
             raise ValueError(msg)
         new_categories = list(self._categories) + list(new_categories)
         cat = self if inplace else self.copy()
@@ -780,7 +833,7 @@ def remove_categories(self, removals, inplace=False):

         removal_set = set(list(removals))
         not_included = removal_set - set(self._categories)
-        new_categories = [ c for c in self._categories if c not in removal_set ]
+        new_categories = [c for c in self._categories if c not in removal_set]

         # GH 10156
         if any(isnull(removals)):
@@ -788,12 +841,14 @@ def remove_categories(self, removals, inplace=False):
             new_categories = [x for x in new_categories if notnull(x)]

         if len(not_included) != 0:
-            raise ValueError("removals must all be in old categories: %s" % str(not_included))
+            raise ValueError("removals must all be in old categories: %s" %
+                             str(not_included))

-        return self.set_categories(new_categories, ordered=self.ordered, rename=False,
+        return self.set_categories(new_categories,
+                                   ordered=self.ordered,
+                                   rename=False,
                                    inplace=inplace)

-
     def remove_unused_categories(self, inplace=False):
         """ Removes categories which are not used.
@@ -827,7 +882,6 @@ def remove_unused_categories(self, inplace=False):
         if not inplace:
             return cat

-
     __eq__ = _cat_compare_op('__eq__')
     __ne__ = _cat_compare_op('__ne__')
     __lt__ = _cat_compare_op('__lt__')
@@ -889,7 +943,7 @@ def __array__(self, dtype=None):
             dtype as categorical.categories.dtype
         """
         ret = take_1d(self.categories.values, self._codes)
-        if dtype and not is_dtype_equal(dtype,self.categories.dtype):
+        if dtype and not is_dtype_equal(dtype, self.categories.dtype):
             return np.asarray(ret, dtype)
         return ret

@@ -1001,11 +1055,13 @@ def searchsorted(self, v, side='left', sorter=None):
         array([3, 5])       # eggs after donuts, after switching milk and donuts
         """
         if not self.ordered:
-            raise ValueError("Categorical not ordered\n"
-                             "you can use .as_ordered() to change the Categorical to an ordered one\n")
+            raise ValueError(
+                "Categorical not ordered\n"
+                "you can use .as_ordered() to change the Categorical to an ordered one\n")

         from pandas.core.series import Series
-        values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
+        values_as_codes = self.categories.values.searchsorted(
+            Series(v).values, side)
         return self.codes.searchsorted(values_as_codes, sorter=sorter)

     def isnull(self):
@@ -1031,7 +1087,7 @@ def isnull(self):
         if np.nan in self.categories:
             nan_pos = np.where(isnull(self.categories))[0]
             # we only have one NA in categories
-            ret = np.logical_or(ret , self._codes == nan_pos)
+            ret = np.logical_or(ret, self._codes == nan_pos)
         return ret

     def notnull(self):
@@ -1101,8 +1157,10 @@ def value_counts(self, dropna=True):
             count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)

-        ix = Categorical(ix, categories=cat,
-                         ordered=obj.ordered, fastpath=True)
+        ix = Categorical(ix,
+                         categories=cat,
+                         ordered=obj.ordered,
+                         fastpath=True)

         return Series(count, index=CategoricalIndex(ix), dtype='int64')

@@ -1125,8 +1183,10 @@ def get_values(self):
     def check_for_ordered(self, op):
         """ assert that we are ordered """
         if not self.ordered:
-            raise TypeError("Categorical is not ordered for operation {op}\n"
-                            "you can use .as_ordered() to change the Categorical to an ordered one\n".format(op=op))
+            raise TypeError(
+                "Categorical is not ordered for operation {op}\n"
+                "you can use .as_ordered() to change the Categorical to an ordered one\n".format(
+                    op=op))

     def argsort(self, ascending=True, **kwargs):
         """ Implements ndarray.argsort.
@@ -1169,7 +1229,7 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
         --------
         Category.sort
         """
-        if na_position not in ['last','first']:
+        if na_position not in ['last', 'first']:
             raise ValueError('invalid na_position: {!r}'.format(na_position))

         codes = np.sort(self._codes)
@@ -1177,19 +1237,19 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             codes = codes[::-1]

         # NaN handling
-        na_mask = (codes==-1)
+        na_mask = (codes == -1)
         if na_mask.any():
             n_nans = len(codes[na_mask])
-            if na_position=="first" and not ascending:
+            if na_position == "first" and not ascending:
                 # in this case sort to the front
                 new_codes = codes.copy()
                 new_codes[0:n_nans] = -1
                 new_codes[n_nans:] = codes[~na_mask]
                 codes = new_codes
-            elif na_position=="last" and not ascending:
+            elif na_position == "last" and not ascending:
                 # ... and to the end
                 new_codes = codes.copy()
-                pos = len(codes)-n_nans
+                pos = len(codes) - n_nans
                 new_codes[0:pos] = codes[~na_mask]
                 new_codes[pos:] = -1
                 codes = new_codes
@@ -1197,7 +1257,9 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             self._codes = codes
             return
         else:
-            return Categorical(values=codes,categories=self.categories, ordered=self.ordered,
+            return Categorical(values=codes,
+                               categories=self.categories,
+                               ordered=self.ordered,
                                fastpath=True)

     def order(self, inplace=False, ascending=True, na_position='last'):
@@ -1229,8 +1291,11 @@ def order(self, inplace=False, ascending=True, na_position='last'):
         Category.sort
         """
         warn("order is deprecated, use sort_values(...)",
-             FutureWarning, stacklevel=2)
-        return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position)
+             FutureWarning,
+             stacklevel=2)
+        return self.sort_values(inplace=inplace,
+                                ascending=ascending,
+                                na_position=na_position)

     def sort(self, inplace=True, ascending=True, na_position='last'):
         """ Sorts the Category inplace by category value.
@@ -1257,7 +1322,8 @@ def sort(self, inplace=True, ascending=True, na_position='last'):
         --------
         Category.sort_values
         """
-        return self.sort_values(inplace=inplace, ascending=ascending,
+        return self.sort_values(inplace=inplace,
+                                ascending=ascending,
                                 na_position=na_position)

     def ravel(self, order='C'):
@@ -1340,8 +1406,8 @@ def fillna(self, value=None, method=None, limit=None):

         if method is not None:
             values = self.to_dense().reshape(-1, len(self))
-            values = interpolate_2d(
-                values, method, 0, None, value).astype(self.categories.dtype)[0]
+            values = interpolate_2d(values, method, 0, None,
+                                    value).astype(self.categories.dtype)[0]
             values = _get_codes_for_values(values, self.categories)

         else:
@@ -1349,12 +1415,14 @@ def fillna(self, value=None, method=None, limit=None):
             if not isnull(value) and value not in self.categories:
                 raise ValueError("fill value must be in categories")

-            mask = values==-1
+            mask = values == -1
             if mask.any():
                 values = values.copy()
                 values[mask] = self.categories.get_loc(value)

-        return Categorical(values, categories=self.categories, ordered=self.ordered,
+        return Categorical(values,
+                           categories=self.categories,
+                           ordered=self.ordered,
                            fastpath=True)

     def take_nd(self, indexer, allow_fill=True, fill_value=None):
@@ -1368,7 +1436,9 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None):
             assert isnull(fill_value)

         codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
-        result = Categorical(codes, categories=self.categories, ordered=self.ordered,
+        result = Categorical(codes,
+                             categories=self.categories,
+                             ordered=self.ordered,
                              fastpath=True)
         return result

@@ -1384,11 +1454,14 @@ def _slice(self, slicer):
         # in a 2-d case be passed (slice(None),....)
         if isinstance(slicer, tuple) and len(slicer) == 2:
             if not is_null_slice(slicer[0]):
-                raise AssertionError("invalid slicing for a 1-ndim categorical")
+                raise AssertionError(
+                    "invalid slicing for a 1-ndim categorical")
             slicer = slicer[1]

         _codes = self._codes[slicer]
-        return Categorical(values=_codes,categories=self.categories, ordered=self.ordered,
+        return Categorical(values=_codes,
+                           categories=self.categories,
+                           ordered=self.ordered,
                            fastpath=True)

     def __len__(self):
@@ -1403,8 +1476,7 @@ def _tidy_repr(self, max_vals=10, footer=True):
         """ a short repr displaying only max_vals and an optional (but default footer) """
         num = max_vals // 2
         head = self[:num]._get_repr(length=False, footer=False)
-        tail = self[-(max_vals - num):]._get_repr(length=False,
-                                                  footer=False)
+        tail = self[-(max_vals - num):]._get_repr(length=False, footer=False)

         result = '%s, ..., %s' % (head[:-1], tail[1:])
         if footer:
@@ -1414,8 +1486,8 @@ def _repr_categories(self):
         """ return the base repr for the categories """
-        max_categories = (10 if get_option("display.max_categories") == 0
-                          else get_option("display.max_categories"))
+        max_categories = (10 if get_option("display.max_categories") == 0 else
+                          get_option("display.max_categories"))
         from pandas.core import format as fmt
         if len(self.categories) > max_categories:
             num = max_categories // 2
@@ -1433,7 +1505,8 @@ def _repr_categories_info(self):
         """ Returns a string representation of the footer."""

         category_strs = self._repr_categories()
-        dtype = getattr(self.categories, 'dtype_str', str(self.categories.dtype))
+        dtype = getattr(self.categories, 'dtype_str',
+                        str(self.categories.dtype))

         levheader = "Categories (%d, %s): " % (len(self.categories), dtype)
         width, height = get_terminal_size()
@@ -1443,20 +1516,20 @@ def _repr_categories_info(self):
             max_width = 0
         levstring = ""
         start = True
-        cur_col_len = len(levheader) # header
+        cur_col_len = len(levheader)  # header
         sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
-        linesep = sep.rstrip() + "\n" # remove whitespace
+        linesep = sep.rstrip() + "\n"  # remove whitespace
         for val in category_strs:
             if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
                 levstring += linesep + (" " * (len(levheader) + 1))
-                cur_col_len = len(levheader) + 1 # header + a whitespace
+                cur_col_len = len(levheader) + 1  # header + a whitespace
             elif not start:
                 levstring += sep
                 cur_col_len += len(val)
             levstring += val
             start = False
         # replace to simple save space by
-        return levheader + "["+levstring.replace(" < ... < ", " ... ")+"]"
+        return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"

     def _repr_footer(self):

@@ -1479,9 +1552,9 @@ def __unicode__(self):
         elif len(self._codes) > 0:
             result = self._get_repr(length=len(self) > _maxlen)
         else:
-            result = '[], %s' % self._get_repr(length=False,
-                                               footer=True,
-                                               ).replace("\n",", ")
+            result = '[], %s' % self._get_repr(
+                length=False,
+                footer=True, ).replace("\n", ", ")

         return result

@@ -1500,8 +1573,10 @@ def __getitem__(self, key):
             else:
                 return self.categories[i]
         else:
-            return Categorical(values=self._codes[key], categories=self.categories,
-                               ordered=self.ordered, fastpath=True)
+            return Categorical(values=self._codes[key],
+                               categories=self.categories,
+                               ordered=self.ordered,
+                               fastpath=True)

     def __setitem__(self, key, value):
         """ Item assignment.
@@ -1518,8 +1593,9 @@ def __setitem__(self, key, value):
         # require identical categories set
         if isinstance(value, Categorical):
             if not value.categories.equals(self.categories):
-                raise ValueError("Cannot set a Categorical with another, without identical "
-                                 "categories")
+                raise ValueError(
+                    "Cannot set a Categorical with another, without identical "
+                    "categories")

         rvalue = value if is_list_like(value) else [value]

@@ -1528,8 +1604,9 @@ def __setitem__(self, key, value):

         # no assignments of values not in categories, but it's always ok to set something to np.nan
         if len(to_add) and not isnull(to_add).all():
-            raise ValueError("cannot setitem on a Categorical with a new category,"
-                             " set the categories first")
+            raise ValueError(
+                "cannot setitem on a Categorical with a new category,"
+                " set the categories first")

         # set by position
         if isinstance(key, (int, np.integer)):
@@ -1541,12 +1618,14 @@ def __setitem__(self, key, value):
             # in a 2-d case be passed (slice(None),....)
             if len(key) == 2:
                 if not is_null_slice(key[0]):
-                    raise AssertionError("invalid slicing for a 1-ndim categorical")
+                    raise AssertionError(
+                        "invalid slicing for a 1-ndim categorical")
                 key = key[1]
             elif len(key) == 1:
                 key = key[0]
             else:
-                raise AssertionError("invalid slicing for a 1-ndim categorical")
+                raise AssertionError(
+                    "invalid slicing for a 1-ndim categorical")

         # slicing in Series or Categorical
         elif isinstance(key, slice):
@@ -1574,12 +1653,20 @@ def __setitem__(self, key, value):
         self._codes[key] = lindexer

     #### reduction ops ####
-    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
-                filter_type=None, **kwds):
+    def _reduce(self,
+                op,
+                name,
+                axis=0,
+                skipna=True,
+                numeric_only=None,
+                filter_type=None,
+                **kwds):
         """ perform the reduction type operation """
-        func = getattr(self,name,None)
+        func = getattr(self, name, None)
         if func is None:
-            raise TypeError("Categorical cannot perform the operation {op}".format(op=name))
+            raise TypeError(
+                "Categorical cannot perform the operation {op}".format(op=
+                                                                       name))
         return func(numeric_only=numeric_only, **kwds)

     def min(self, numeric_only=None, **kwargs):
@@ -1607,7 +1694,6 @@ def min(self, numeric_only=None, **kwargs):
         else:
             return self.categories[pointer]

-
     def max(self, numeric_only=None, **kwargs):
         """ The maximum value of the object.
@@ -1647,8 +1733,11 @@ def mode(self):

         import pandas.hashtable as htable
         good = self._codes != -1
-        result = Categorical(sorted(htable.mode_int64(_ensure_int64(self._codes[good]))),
-                             categories=self.categories,ordered=self.ordered, fastpath=True)
+        result = Categorical(
+            sorted(htable.mode_int64(_ensure_int64(self._codes[good]))),
+            categories=self.categories,
+            ordered=self.ordered,
+            fastpath=True)
         return result

     def unique(self):
@@ -1690,7 +1779,8 @@ def equals(self, other):
         -------
         are_equal : boolean
         """
-        return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes)
+        return self.is_dtype_equal(other) and np.array_equal(self._codes,
+                                                             other._codes)

     def is_dtype_equal(self, other):
         """
@@ -1707,7 +1797,8 @@ def is_dtype_equal(self, other):
         """

         try:
-            return self.categories.equals(other.categories) and self.ordered == other.ordered
+            return self.categories.equals(
+                other.categories) and self.ordered == other.ordered
         except (AttributeError, TypeError):
             return False

@@ -1723,8 +1814,8 @@ def describe(self):
         freqs = counts / float(counts.sum())

         from pandas.tools.merge import concat
-        result = concat([counts,freqs],axis=1)
-        result.columns = ['counts','freqs']
+        result = concat([counts, freqs], axis=1)
+        result.columns = ['counts', 'freqs']
         result.index.name = 'categories'

         return result
@@ -1739,12 +1830,14 @@ def repeat(self, repeats):
         """
         codes = self._codes.repeat(repeats)
-        return Categorical(values=codes, categories=self.categories,
-                           ordered=self.ordered, fastpath=True)
-
+        return Categorical(values=codes,
+                           categories=self.categories,
+                           ordered=self.ordered,
+                           fastpath=True)

 ##### The Series.cat accessor #####

+
 class CategoricalAccessor(PandasDelegate, NoNewAttributesMixin):
     """
     Accessor object for categorical properties of the Series values.
@@ -1790,29 +1883,28 @@ def _delegate_method(self, name, *args, **kwargs):
         if not res is None:
             return Series(res, index=self.index)

-CategoricalAccessor._add_delegate_accessors(delegate=Categorical,
-                                            accessors=["categories", "ordered"],
-                                            typ='property')
-CategoricalAccessor._add_delegate_accessors(delegate=Categorical,
-                                            accessors=["rename_categories",
-                                                       "reorder_categories",
-                                                       "add_categories",
-                                                       "remove_categories",
-                                                       "remove_unused_categories",
-                                                       "set_categories",
-                                                       "as_ordered",
-                                                       "as_unordered"],
-                                            typ='method')
+
+CategoricalAccessor._add_delegate_accessors(
+    delegate=Categorical,
+    accessors=["categories", "ordered"],
+    typ='property')
+CategoricalAccessor._add_delegate_accessors(
+    delegate=Categorical,
+    accessors=["rename_categories", "reorder_categories", "add_categories",
+               "remove_categories", "remove_unused_categories",
+               "set_categories", "as_ordered", "as_unordered"],
+    typ='method')

 ##### utility routines #####

+
 def _get_codes_for_values(values, categories):
     """
     utility routine to turn values into codes given the specified categories
     """

     from pandas.core.algorithms import _get_data_algo, _hashtables
-    if not is_dtype_equal(values.dtype,categories.dtype):
+    if not is_dtype_equal(values.dtype, categories.dtype):
         values = _ensure_object(values)
         categories = _ensure_object(categories)

@@ -1822,13 +1914,14 @@ def _get_codes_for_values(values, categories):
     t.map_locations(cats)
     return _coerce_indexer_dtype(t.lookup(vals), cats)

+
 def _convert_to_list_like(list_like):
     if hasattr(list_like, "dtype"):
         return list_like
     if isinstance(list_like, list):
         return list_like
-    if (is_sequence(list_like) or isinstance(list_like, tuple)
-            or isinstance(list_like, types.GeneratorType)):
+    if (is_sequence(list_like) or isinstance(list_like, tuple) or
+            isinstance(list_like, types.GeneratorType)):
         return list(list_like)
     elif np.isscalar(list_like):
         return [list_like]
@@ -1836,6 +1929,7 @@ def _convert_to_list_like(list_like):
         # is this reached?
         return [list_like]

+
 def _concat_compat(to_concat, axis=0):
     """Concatenate an object/categorical array of arrays, each of which is a
     single dtype
@@ -1862,8 +1956,11 @@ def convert_categorical(x):
     if get_dtype_kinds(to_concat) - set(['object', 'category']):
         # convert to object type and perform a regular concat
         from pandas.core.common import _concat_compat
-        return _concat_compat([np.array(x, copy=False, dtype=object)
-                               for x in to_concat], axis=0)
+        return _concat_compat(
+            [np.array(x,
+                      copy=False,
+                      dtype=object) for x in to_concat],
+            axis=0)

     # we could have object blocks and categoricals here
     # if we only have a single categoricals then combine everything
@@ -1882,11 +1979,15 @@ def convert_categorical(x):
     if len(categoricals) == len(to_concat):
         # concating numeric types is much faster than concating object types
         # and fastpath takes a shorter path through the constructor
-        return Categorical(np.concatenate([x.codes for x in to_concat], axis=0),
-                           rawcats,
-                           ordered=categoricals[0].ordered,
-                           fastpath=True)
+        return Categorical(
+            np.concatenate(
+                [x.codes for x in to_concat],
+                axis=0),
+            rawcats,
+            ordered=categoricals[0].ordered,
+            fastpath=True)
     else:
-        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
-                                   axis=0)
+        concatted = np.concatenate(
+            list(map(convert_categorical, to_concat)),
+            axis=0)
         return Categorical(concatted, rawcats)
diff --git a/pandas/core/common.py b/pandas/core/common.py
index e81b58a3f7eef..1859b3325a9ac 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -21,6 +21,7 @@
 from pandas.core.dtypes import CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType
 from pandas.core.config import get_option

+
 class PandasError(Exception):
     pass

@@ -41,6 +42,7 @@ class AbstractMethodError(NotImplementedError):
     """Raise this error instead of NotImplementedError for abstract methods
     while keeping compatibility with Python 2 and Python 3.
""" + def __init__(self, class_instance): self.class_instance = class_instance @@ -48,63 +50,68 @@ def __str__(self): return "This method must be defined in the concrete class of " \ + self.class_instance.__class__.__name__ + _POSSIBLY_CAST_DTYPES = set([np.dtype(t).name - for t in ['O', 'int8', - 'uint8', 'int16', 'uint16', 'int32', - 'uint32', 'int64', 'uint64']]) + for t in ['O', 'int8', 'uint8', 'int16', 'uint16', + 'int32', 'uint32', 'int64', 'uint64']]) _NS_DTYPE = np.dtype('M8[ns]') _TD_DTYPE = np.dtype('m8[ns]') _INT64_DTYPE = np.dtype(np.int64) -_DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'M8[ns]', - 'm8[ns]', 'm8[ns]']]) +_DATELIKE_DTYPES = set([np.dtype( + t) for t in ['M8[ns]', 'M8[ns]', 'm8[ns]', 'm8[ns]' + ]]) _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max _int64_max = np.iinfo(np.int64).max + # define abstract base classes to enable isinstance type checking on our # objects def create_pandas_abc_type(name, attr, comp): @classmethod def _check(cls, inst): return getattr(inst, attr, '_typ') in comp - dct = dict(__instancecheck__=_check, - __subclasscheck__=_check) - meta = type("ABCBase", (type,), dct) + + dct = dict(__instancecheck__=_check, __subclasscheck__=_check) + meta = type("ABCBase", (type, ), dct) return meta(name, tuple(), dct) -ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) -ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) -ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) -ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) -ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)) -ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)) -ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) -ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) -ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", - "int64index", - "float64index", - "multiindex", - "datetimeindex", - "timedeltaindex", - "periodindex", - "categoricalindex")) - -ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) -ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) -ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", )) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", + ("int64index", )) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", + ("float64index", )) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", + ("multiindex", )) +ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", + ("datetimeindex", )) +ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", + ("timedeltaindex", )) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", + ("periodindex", )) +ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", + ("categoricalindex", )) +ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ( + "index", "int64index", "float64index", "multiindex", "datetimeindex", + "timedeltaindex", "periodindex", "categoricalindex")) + +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) +ABCPanel = 
+ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel", ))
 ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
                                          ('sparse_series',
                                           'sparse_time_series'))
 ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
                                         ('sparse_array', 'sparse_series'))
-ABCCategorical = create_pandas_abc_type("ABCCategorical","_typ",("categorical"))
-ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",))
+ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ",
+                                        ("categorical"))
+ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", ))

-class _ABCGeneric(type):

+class _ABCGeneric(type):
     def __instancecheck__(cls, inst):
         return hasattr(inst, "_data")

@@ -136,6 +143,7 @@ class to receive bound method
     else:
         setattr(cls, name, func)

+
 def isnull(obj):
     """Detect missing values (NaN in numeric arrays, None/NaN in object arrays)

@@ -198,6 +206,7 @@ def _isnull_old(obj):
     else:
         return obj is None

+
 _isnull = _isnull_new

@@ -263,6 +272,7 @@ def _isnull_ndarraylike(obj):

     return result

+
 def _isnull_ndarraylike_old(obj):
     values = getattr(obj, 'values', obj)
     dtype = values.dtype
@@ -316,6 +326,7 @@ def notnull(obj):
         return not res
     return ~res

+
 def is_null_datelike_scalar(other):
     """ test whether the object is a null datelike, e.g. Nat
     but guard against passing a non-scalar """
@@ -324,13 +335,14 @@ def is_null_datelike_scalar(other):
     elif np.isscalar(other):

         # a timedelta
-        if hasattr(other,'dtype'):
+        if hasattr(other, 'dtype'):
             return other.view('i8') == tslib.iNaT
         elif is_integer(other) and other == tslib.iNaT:
             return True
         return isnull(other)
     return False

+
 def array_equivalent(left, right, strict_nan=False):
     """
     True if two arrays, left and right, have equal non-NaN elements, and NaNs in
@@ -366,19 +378,21 @@ def array_equivalent(left, right, strict_nan=False):
     if left.shape != right.shape:
         return False

     # Object arrays can contain None, NaN and NaT.
-    if issubclass(left.dtype.type, np.object_) or issubclass(right.dtype.type, np.object_):
+    if issubclass(left.dtype.type, np.object_) or issubclass(right.dtype.type,
+                                                             np.object_):

         if not strict_nan:
             # pd.isnull considers NaN and None to be equivalent.
-            return lib.array_equivalent_object(_ensure_object(left.ravel()),
-                                               _ensure_object(right.ravel()))
+            return lib.array_equivalent_object(
+                _ensure_object(left.ravel()), _ensure_object(right.ravel()))

         for left_value, right_value in zip(left, right):
             if left_value is tslib.NaT and right_value is not tslib.NaT:
                 return False

             elif isinstance(left_value, float) and np.isnan(left_value):
-                if not isinstance(right_value, float) or not np.isnan(right_value):
+                if not isinstance(right_value,
+                                  float) or not np.isnan(right_value):
                     return False

             else:
                 if left_value != right_value:
@@ -396,6 +410,7 @@ def array_equivalent(left, right, strict_nan=False):
     # NaNs cannot occur otherwise.
     return np.array_equal(left, right)

+
 def _iterable_not_string(x):
     return (isinstance(x, collections.Iterable) and
             not isinstance(x, compat.string_types))
@@ -502,6 +517,7 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
             if fill_wrap is not None:
                 fill_value = fill_wrap(fill_value)
             f(arr, indexer, out, fill_value=fill_value)
+
     return wrapper

@@ -509,6 +525,7 @@ def _convert_wrapper(f, conv_dtype):
     def wrapper(arr, indexer, out, fill_value=np.nan):
         arr = arr.astype(conv_dtype)
         f(arr, indexer, out, fill_value=fill_value)
+
     return wrapper

@@ -569,15 +586,14 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
     ('float32', 'float64'): algos.take_1d_float32_float64,
     ('float64', 'float64'): algos.take_1d_float64_float64,
     ('object', 'object'): algos.take_1d_object_object,
-    ('bool', 'bool'):
-    _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8),
-    ('bool', 'object'):
-    _view_wrapper(algos.take_1d_bool_object, np.uint8, None),
-    ('datetime64[ns]', 'datetime64[ns]'):
-    _view_wrapper(algos.take_1d_int64_int64, np.int64, np.int64, np.int64)
+    ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8,
+                                    np.uint8),
+    ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8,
+                                      None),
+    ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
+        algos.take_1d_int64_int64, np.int64, np.int64, np.int64)
 }

-
 _take_2d_axis0_dict = {
     ('int8', 'int8'): algos.take_2d_axis0_int8_int8,
     ('int8', 'int32'): algos.take_2d_axis0_int8_int32,
@@ -596,16 +612,17 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
     ('float32', 'float64'): algos.take_2d_axis0_float32_float64,
     ('float64', 'float64'): algos.take_2d_axis0_float64_float64,
     ('object', 'object'): algos.take_2d_axis0_object_object,
-    ('bool', 'bool'):
-    _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8),
-    ('bool', 'object'):
-    _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None),
-    ('datetime64[ns]', 'datetime64[ns]'):
-    _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64,
-                  fill_wrap=np.int64)
+    ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8,
+                                    np.uint8),
+    ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object,
+                                      np.uint8, None),
+    ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
+        algos.take_2d_axis0_int64_int64,
+        np.int64,
+        np.int64,
+        fill_wrap=np.int64)
 }

-
 _take_2d_axis1_dict = {
     ('int8', 'int8'): algos.take_2d_axis1_int8_int8,
     ('int8', 'int32'): algos.take_2d_axis1_int8_int32,
@@ -624,16 +641,17 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
     ('float32', 'float64'): algos.take_2d_axis1_float32_float64,
     ('float64', 'float64'): algos.take_2d_axis1_float64_float64,
     ('object', 'object'): algos.take_2d_axis1_object_object,
-    ('bool', 'bool'):
-    _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8),
-    ('bool', 'object'):
-    _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None),
-    ('datetime64[ns]', 'datetime64[ns]'):
-    _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64,
-                  fill_wrap=np.int64)
+    ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8,
+                                    np.uint8),
+    ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object,
+                                      np.uint8, None),
+    ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
+        algos.take_2d_axis1_int64_int64,
+        np.int64,
+        np.int64,
+        fill_wrap=np.int64)
 }

-
 _take_2d_multi_dict = {
     ('int8', 'int8'): algos.take_2d_multi_int8_int8,
     ('int8', 'int32'): algos.take_2d_multi_int8_int32,
@@ -652,13 +670,15 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
     ('float32', 'float64'): algos.take_2d_multi_float32_float64,
     ('float64', 'float64'): algos.take_2d_multi_float64_float64,
     ('object', 'object'): algos.take_2d_multi_object_object,
-    ('bool', 'bool'):
-    _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8),
-    ('bool', 'object'):
-    _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None),
-    ('datetime64[ns]', 'datetime64[ns]'):
-    _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64,
-                  fill_wrap=np.int64)
+    ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8,
+                                    np.uint8),
+    ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object,
+                                      np.uint8, None),
+    ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
+        algos.take_2d_multi_int64_int64,
+        np.int64,
+        np.int64,
+        fill_wrap=np.int64)
 }

@@ -689,13 +709,23 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):

     def func(arr, indexer, out, fill_value=np.nan):
         indexer = _ensure_int64(indexer)
-        _take_nd_generic(arr, indexer, out, axis=axis,
-                         fill_value=fill_value, mask_info=mask_info)
+        _take_nd_generic(arr,
+                         indexer,
+                         out,
+                         axis=axis,
+                         fill_value=fill_value,
+                         mask_info=mask_info)
+
     return func

-def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
-            mask_info=None, allow_fill=True):
+def take_nd(arr,
+            indexer,
+            axis=0,
+            out=None,
+            fill_value=np.nan,
+            mask_info=None,
+            allow_fill=True):
     """
     Specialized Cython take which sets NaN values in one pass

@@ -726,7 +756,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,

     # dispatch to internal type takes
     if is_categorical(arr):
-        return arr.take_nd(indexer, fill_value=fill_value,
+        return arr.take_nd(indexer,
+                           fill_value=fill_value,
                            allow_fill=allow_fill)
     elif is_datetimetz(arr):
         return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
@@ -786,8 +817,11 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
         else:
             out = np.empty(out_shape, dtype=dtype)

-    func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype,
-                                 axis=axis, mask_info=mask_info)
+    func = _get_take_nd_function(arr.ndim,
+                                 arr.dtype,
+                                 out.dtype,
+                                 axis=axis,
+                                 mask_info=mask_info)
     indexer = _ensure_int64(indexer)
     func(arr, indexer, out, fill_value)

@@ -799,8 +833,12 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
 take_1d = take_nd

-def take_2d_multi(arr, indexer, out=None, fill_value=np.nan,
-                  mask_info=None, allow_fill=True):
+def take_2d_multi(arr,
+                  indexer,
+                  out=None,
+                  fill_value=np.nan,
+                  mask_info=None,
+                  allow_fill=True):
     """
     Specialized Cython take which sets NaN values in one pass
     """
@@ -858,12 +896,18 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan,
         if func is not None:
             func = _convert_wrapper(func, out.dtype)
         if func is None:
+
             def func(arr, indexer, out, fill_value=np.nan):
-                _take_2d_multi_generic(arr, indexer, out,
-                                       fill_value=fill_value, mask_info=mask_info)
+                _take_2d_multi_generic(arr,
+                                       indexer,
+                                       out,
+                                       fill_value=fill_value,
+                                       mask_info=mask_info)
+
         func(arr, indexer, out=out, fill_value=fill_value)
     return out

+
 _diff_special = {
     'float64': algos.diff_2d_float64,
     'float32': algos.diff_2d_float32,
@@ -873,6 +917,7 @@ def func(arr, indexer, out, fill_value=np.nan):
     'int8': algos.diff_2d_int8,
 }

+
 def diff(arr, n, axis=0):
     """ difference of n between self, analogous to s-s.shift(n) """

@@ -931,10 +976,12 @@ def diff(arr, n, axis=0):

     if is_timedelta:
         from pandas import TimedeltaIndex
-        out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(out_arr.shape).astype('timedelta64[ns]')
+        out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(
+            out_arr.shape).astype('timedelta64[ns]')

     return out_arr

+
 def _coerce_indexer_dtype(indexer, categories):
     """ coerce the indexer input array to the smallest dtype possible """
     l = len(categories)
@@ -946,6 +993,7 @@ def _coerce_indexer_dtype(indexer, categories):
         return _ensure_int32(indexer)
     return _ensure_int64(indexer)

+
 def _coerce_to_dtypes(result, dtypes):
     """ given a dtypes and a result set, coerce the result elements to the
         dtypes
@@ -965,7 +1013,7 @@ def conv(r, dtype):
                 r = _coerce_scalar_to_timedelta_type(r)
             elif dtype == np.bool_:
                 # messy. non 0/1 integers do not get converted.
-                if is_integer(r) and r not in [0,1]:
+                if is_integer(r) and r not in [0, 1]:
                     return int(r)
                 r = bool(r)
             elif dtype.kind == 'f':
@@ -989,15 +1037,15 @@ def _infer_fill_value(val):

     if not is_list_like(val):
         val = [val]
-    val = np.array(val,copy=False)
+    val = np.array(val, copy=False)
     if is_datetimelike(val):
-        return np.array('NaT',dtype=val.dtype)
+        return np.array('NaT', dtype=val.dtype)
     elif is_object_dtype(val.dtype):
         dtype = lib.infer_dtype(_ensure_object(val))
-        if dtype in ['datetime','datetime64']:
-            return np.array('NaT',dtype=_NS_DTYPE)
-        elif dtype in ['timedelta','timedelta64']:
-            return np.array('NaT',dtype=_TD_DTYPE)
+        if dtype in ['datetime', 'datetime64']:
+            return np.array('NaT', dtype=_NS_DTYPE)
+        elif dtype in ['timedelta', 'timedelta64']:
+            return np.array('NaT', dtype=_TD_DTYPE)
     return np.nan

@@ -1025,12 +1073,13 @@ def _infer_dtype_from_scalar(val):

         dtype = np.object_

-    elif isinstance(val, (np.datetime64, datetime)) and getattr(val,'tzinfo',None) is None:
+    elif isinstance(val, (np.datetime64,
+                          datetime)) and getattr(val, 'tzinfo', None) is None:
         val = lib.Timestamp(val).value
         dtype = np.dtype('M8[ns]')

     elif isinstance(val, (np.timedelta64, timedelta)):
-        val = tslib.convert_to_timedelta(val,'ns')
+        val = tslib.convert_to_timedelta(val, 'ns')
         dtype = np.dtype('m8[ns]')

     elif is_bool(val):
@@ -1190,7 +1239,7 @@ def changeit():
     # we have a scalar or len 0 ndarray
     # and its nan and we are changing some values
     if (np.isscalar(other) or
-            (isinstance(other, np.ndarray) and other.ndim < 1)):
+        (isinstance(other, np.ndarray) and other.ndim < 1)):
         if isnull(other):
             return changeit()

@@ -1281,7 +1330,8 @@ def _possibly_downcast_to_dtype(result, dtype):

             # don't allow upcasts here (except if empty)
             if dtype.kind == result.dtype.kind:
-                if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape):
+                if result.dtype.itemsize <= dtype.itemsize and np.prod(
+                        result.shape):
                     return result

             if issubclass(dtype.type, np.floating):
@@ -1317,7 +1367,7 @@ def _possibly_downcast_to_dtype(result, dtype):
                 return new_result

     # a datetimelike
-    elif dtype.kind in ['M','m'] and result.dtype.kind in ['i']:
+    elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']:
         try:
             result = result.astype(dtype)
         except:
@@ -1339,7 +1389,7 @@ def _maybe_convert_string_to_object(values):
     if isinstance(values, string_types):
         values = np.array([values], dtype=object)
     elif (isinstance(values, np.ndarray) and
-            issubclass(values.dtype.type, (np.string_, np.unicode_))):
+          issubclass(values.dtype.type, (np.string_, np.unicode_))):
         values = values.astype(object)
     return values
@@ -1386,9 +1436,9 @@ def _fill_zeros(result, x, y, name, fill):
         return result

     if name.startswith(('r', '__r')):
-        x,y = y,x
+        x, y = y, x

-    is_typed_variable = (hasattr(y, 'dtype') or hasattr(y,'type'))
+    is_typed_variable = (hasattr(y, 'dtype') or hasattr(y, 'type'))
     is_scalar = lib.isscalar(y)

     if not is_typed_variable and not is_scalar:
@@ -1433,18 +1483,18 @@ def _consensus_name_attr(objs):
             return None
     return name

-
 #----------------------------------------------------------------------
 # Lots of little utilities

+
 def _validate_date_like_dtype(dtype):
     try:
         typ = np.datetime_data(dtype)[0]
     except ValueError as e:
         raise TypeError('%s' % e)
     if typ != 'generic' and typ != 'ns':
-        raise ValueError('%r is too specific of a frequency, try passing %r'
-                         % (dtype.name, dtype.type.__name__))
+        raise ValueError('%r is too specific of a frequency, try passing %r' %
+                         (dtype.name, dtype.type.__name__))


 def _invalidate_string_dtypes(dtype_set):
@@ -1481,7 +1531,7 @@ def _get_dtype_from_object(dtype):
             dtype += '64'

         try:
-            return _get_dtype_from_object(getattr(np,dtype))
+            return _get_dtype_from_object(getattr(np, dtype))
         except AttributeError:
             # handles cases like _get_dtype(int)
             # i.e., python objects that are valid dtypes (unlike user-defined
@@ -1522,6 +1572,7 @@ def _maybe_box_datetimelike(value):

     return value

+
 _values_from_object = lib.values_from_object

@@ -1568,7 +1619,7 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):

         if is_datetime64 or is_datetime64tz or is_timedelta64:

             # force the dtype if needed
-            if is_datetime64 and not is_dtype_equal(dtype,_NS_DTYPE):
+            if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE):
                 if dtype.name == 'datetime64[ns]':
                     dtype = _NS_DTYPE
                 else:
@@ -1576,7 +1627,7 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):
                         "cannot convert datetimelike to dtype [%s]" % dtype)
             elif is_datetime64tz:
                 pass
-            elif is_timedelta64 and not is_dtype_equal(dtype,_TD_DTYPE):
+            elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE):
                 if dtype.name == 'timedelta64[ns]':
                     dtype = _TD_DTYPE
                 else:
@@ -1587,20 +1638,23 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):
                 if value == tslib.iNaT or isnull(value):
                     value = tslib.iNaT
             else:
-                value = np.array(value,copy=False)
+                value = np.array(value, copy=False)

                 # have a scalar array-like (e.g. NaT)
                 if value.ndim == 0:
                     value = tslib.iNaT

                 # we have an array of datetime or timedeltas & nulls
-                elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype):
+                elif np.prod(value.shape) or not is_dtype_equal(value.dtype,
+                                                                dtype):
                     try:
                         if is_datetime64:
                             value = to_datetime(value, errors=errors)._values
                         elif is_datetime64tz:
                             # input has to be UTC at this point, so just localize
-                            value = to_datetime(value, errors=errors).tz_localize(dtype.tz)
+                            value = to_datetime(
+                                value,
+                                errors=errors).tz_localize(dtype.tz)
                         elif is_timedelta64:
                             value = to_timedelta(value, errors=errors)._values
                     except (AttributeError, ValueError):
@@ -1669,7 +1723,7 @@ def _possibly_infer_to_datetimelike(value, convert_dates=False):
     v = value
     if not is_list_like(v):
         v = [v]
-    v = np.array(v,copy=False)
+    v = np.array(v, copy=False)
     shape = v.shape
     if not v.ndim == 1:
         v = v.ravel()
@@ -1707,10 +1761,11 @@ def _try_timedelta(v):
             return v

     # do a quick inference for perf
-    sample = v[:min(3,len(v))]
+    sample = v[:min(3, len(v))]
     inferred_type = lib.infer_dtype(sample)

-    if inferred_type in ['datetime', 'datetime64'] or (convert_dates and inferred_type in ['date']):
+    if inferred_type in ['datetime', 'datetime64'] or (
+            convert_dates and inferred_type in ['date']):
         value = _try_datetime(v)
     elif inferred_type in ['timedelta', 'timedelta64']:
         value = _try_timedelta(v)
@@ -1757,7 +1812,7 @@ def is_bool_indexer(key):
 def _default_index(n):
     from pandas.core.index import Int64Index
     values = np.arange(n, dtype=np.int64)
-    result = Int64Index(values,name=None)
+    result = Int64Index(values, name=None)
     result.is_unique = True
     return result

@@ -1784,6 +1839,7 @@ def _mut_exclusive(**kwargs):
 def _not_none(*args):
     return (arg for arg in args if arg is not None)

+
 def _any_none(*args):
     for arg in args:
         if arg is None:
@@ -1813,8 +1869,6 @@ def _count_not_none(*args):
 # miscellaneous python tools

-
-
 def adjoin(space, *lists, **kwargs):
     """
     Glues together two sets of strings using the amount of space requested.
@@ -1849,6 +1903,7 @@ def adjoin(space, *lists, **kwargs):
         out_lines.append(_join_unicode(lines))
     return _join_unicode(out_lines, sep='\n')

+
 def _justify(texts, max_len, mode='right'):
     """
     Perform ljust, center, rjust against string or list-like
@@ -1860,6 +1915,7 @@ def _justify(texts, max_len, mode='right'):
     else:
         return [x.rjust(max_len) for x in texts]

+
 def _join_unicode(lines, sep=''):
     try:
         return sep.join(lines)
@@ -1932,7 +1988,6 @@ def _long_prod(vals):


 class groupby(dict):
-
     """
     A simple groupby different from the one in itertools.
@@ -1944,6 +1999,7 @@ def __init__(self, seq, key=lambda x: x):
         for value in seq:
             k = key(value)
             self.setdefault(k, []).append(value)
+
     try:
         __iter__ = dict.iteritems
     except AttributeError:  # pragma: no cover
@@ -1985,8 +2041,7 @@ def intersection(*seqs):

 def _asarray_tuplesafe(values, dtype=None):
     from pandas.core.index import Index

-    if not (isinstance(values, (list, tuple))
-            or hasattr(values, '__array__')):
+    if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')):
         values = list(values)
     elif isinstance(values, Index):
         return values.values
@@ -2041,19 +2096,17 @@ def _maybe_make_list(obj):

 is_bool = lib.is_bool

-
 is_integer = lib.is_integer

-
 is_float = lib.is_float

-
 is_complex = lib.is_complex


 def is_string_like(obj):
     return isinstance(obj, (compat.text_type, compat.string_types))

+
 def is_iterator(obj):
     # python 3 generators have __next__ instead of next
     return hasattr(obj, 'next') or hasattr(obj, '__next__')
@@ -2062,6 +2115,7 @@ def is_iterator(obj):
 def is_number(obj):
     return isinstance(obj, (numbers.Number, np.number))

+
 def is_period_arraylike(arr):
     """ return if we are period arraylike / PeriodIndex """
     if isinstance(arr, pd.PeriodIndex):
@@ -2070,6 +2124,7 @@ def is_period_arraylike(arr):
         return arr.dtype == object and lib.infer_dtype(arr) == 'period'
     return getattr(arr, 'inferred_type', None) == 'period'

+
 def is_datetime_arraylike(arr):
     """ return if we are datetime arraylike / DatetimeIndex """
     if isinstance(arr, ABCDatetimeIndex):
@@ -2078,8 +2133,11 @@ def is_datetime_arraylike(arr):
         return arr.dtype == object and lib.infer_dtype(arr) == 'datetime'
     return getattr(arr, 'inferred_type', None) == 'datetime'

+
 def is_datetimelike(arr):
-    return arr.dtype in _DATELIKE_DTYPES or isinstance(arr, ABCPeriodIndex) or is_datetimetz(arr)
+    return arr.dtype in _DATELIKE_DTYPES or isinstance(
+        arr, ABCPeriodIndex) or is_datetimetz(arr)
+

 def _coerce_to_dtype(dtype):
     """ coerce a string / np.dtype to a dtype """
@@ -2091,6 +2149,7 @@ def _coerce_to_dtype(dtype):
         dtype = np.dtype(dtype)
     return dtype

+
 def _get_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype
@@ -2110,6 +2169,7 @@ def _get_dtype(arr_or_dtype):
         arr_or_dtype = arr_or_dtype.dtype
     return np.dtype(arr_or_dtype)

+
 def _get_dtype_type(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype.type
@@ -2130,6 +2190,7 @@ def _get_dtype_type(arr_or_dtype):
     except AttributeError:
         return type(None)

+
 def is_dtype_equal(source, target):
     """ return a boolean if the dtypes are equal """
     try:
@@ -2142,6 +2203,7 @@ def is_dtype_equal(source, target):
         # object == category will hit this
         return False

+
 def is_any_int_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.integer)
@@ -2152,15 +2214,18 @@ def is_integer_dtype(arr_or_dtype):
     return (issubclass(tipo, np.integer) and
             not issubclass(tipo, (np.datetime64, np.timedelta64)))

+
 def is_int64_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.int64)

+
 def is_int_or_datetime_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return (issubclass(tipo, np.integer) or
             issubclass(tipo, (np.datetime64, np.timedelta64)))

+
 def is_datetime64_dtype(arr_or_dtype):
     try:
         tipo = _get_dtype_type(arr_or_dtype)
@@ -2168,11 +2233,15 @@ def is_datetime64_dtype(arr_or_dtype):
         return False
     return issubclass(tipo, np.datetime64)

+
 def is_datetime64tz_dtype(arr_or_dtype):
     return DatetimeTZDtype.is_dtype(arr_or_dtype)

+
 def is_datetime64_any_dtype(arr_or_dtype):
-    return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype)
is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) + return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype( + arr_or_dtype) + def is_datetime64_ns_dtype(arr_or_dtype): try: @@ -2181,6 +2250,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): return False return tipo == _NS_DTYPE + def is_timedelta64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.timedelta64) @@ -2214,62 +2284,67 @@ def is_numeric_v_string_like(a, b): is_a_scalar_string_like = not is_a_array and is_string_like(a) is_b_scalar_string_like = not is_b_array and is_string_like(b) - return ( - is_a_numeric_array and is_b_scalar_string_like) or ( + return (is_a_numeric_array and is_b_scalar_string_like) or ( is_b_numeric_array and is_a_scalar_string_like) or ( - is_a_numeric_array and is_b_string_array) or ( - is_b_numeric_array and is_a_string_array - ) + is_a_numeric_array and is_b_string_array) or ( + is_b_numeric_array and is_a_string_array) + def is_datetimelike_v_numeric(a, b): # return if we have an i8 convertible and numeric comparision - if not hasattr(a,'dtype'): + if not hasattr(a, 'dtype'): a = np.asarray(a) if not hasattr(b, 'dtype'): b = np.asarray(b) is_numeric = lambda x: is_integer_dtype(x) or is_float_dtype(x) is_datetimelike = needs_i8_conversion - return (is_datetimelike(a) and is_numeric(b)) or ( - is_datetimelike(b) and is_numeric(a)) + return (is_datetimelike(a) and is_numeric(b)) or (is_datetimelike(b) and + is_numeric(a)) + def is_datetimelike_v_object(a, b): # return if we have an i8 convertible and object comparision - if not hasattr(a,'dtype'): + if not hasattr(a, 'dtype'): a = np.asarray(a) if not hasattr(b, 'dtype'): b = np.asarray(b) f = lambda x: is_object_dtype(x) is_object = lambda x: is_integer_dtype(x) or is_float_dtype(x) is_datetimelike = needs_i8_conversion - return (is_datetimelike(a) and is_object(b)) or ( - is_datetimelike(b) and is_object(a)) + return (is_datetimelike(a) and is_object(b)) or (is_datetimelike(b) and + is_object(a)) needs_i8_conversion = lambda arr_or_dtype: is_datetime_or_timedelta_dtype(arr_or_dtype) or \ is_datetime64tz_dtype(arr_or_dtype) + def i8_boxer(arr_or_dtype): """ return the scalar boxer for the dtype """ - if is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype): + if is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype( + arr_or_dtype): return lib.Timestamp elif is_timedelta64_dtype(arr_or_dtype): - return lambda x: lib.Timedelta(x,unit='ns') + return lambda x: lib.Timedelta(x, unit='ns') raise ValueError("cannot find a scalar boxer for {0}".format(arr_or_dtype)) + def is_numeric_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, (np.number, np.bool_)) - and not issubclass(tipo, (np.datetime64, np.timedelta64))) + return (issubclass(tipo, (np.number, np.bool_)) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) def is_string_dtype(arr_or_dtype): dtype = _get_dtype(arr_or_dtype) return dtype.kind in ('O', 'S', 'U') + def is_string_like_dtype(arr_or_dtype): # exclude object as its a mixed dtype dtype = _get_dtype(arr_or_dtype) return dtype.kind in ('S', 'U') + def is_float_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.floating) @@ -2288,13 +2363,17 @@ def is_bool_dtype(arr_or_dtype): return False return issubclass(tipo, np.bool_) + def is_sparse(array): """ return if we are a sparse array """ return isinstance(array, (ABCSparseArray, ABCSparseSeries)) + def is_datetimetz(array): """ return 
if we are a datetime with tz array """ - return (isinstance(array, ABCDatetimeIndex) and getattr(array,'tz',None) is not None) or is_datetime64tz_dtype(array) + return (isinstance(array, ABCDatetimeIndex) and getattr(array, 'tz', None) + is not None) or is_datetime64tz_dtype(array) + def is_internal_type(value): """ @@ -2309,13 +2388,16 @@ def is_internal_type(value): return True return False + def is_categorical(array): """ return if we are a categorical possibility """ return isinstance(array, ABCCategorical) or is_categorical_dtype(array) + def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) + def is_complex_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.complexfloating) @@ -2340,21 +2422,25 @@ def is_re_compilable(obj): def is_list_like(arg): - return (hasattr(arg, '__iter__') and + return (hasattr(arg, '__iter__') and not isinstance(arg, compat.string_and_binary_types)) + def is_named_tuple(arg): return isinstance(arg, tuple) and hasattr(arg, '_fields') + def is_null_slice(obj): """ we have a null slice """ return (isinstance(obj, slice) and obj.start is None and obj.stop is None and obj.step is None) + def is_full_slice(obj, l): """ we have a full length slice """ - return (isinstance(obj, slice) and obj.start == 0 and - obj.stop == l and obj.step is None) + return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and + obj.step is None) + def is_hashable(arg): """Return True if hash(arg) will succeed, False otherwise. @@ -2413,10 +2499,10 @@ def _get_callable_name(obj): # distinguishing between no name and a name of '' return None + _string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, compat.text_type))) - _ensure_float64 = algos.ensure_float64 _ensure_float32 = algos.ensure_float32 _ensure_int64 = algos.ensure_int64 @@ -2455,7 +2541,7 @@ def _astype_nansafe(arr, dtype, copy=True): # in py3, timedelta64[ns] are int64 elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or - (not compat.PY3 and dtype != _TD_DTYPE)): + (not compat.PY3 and dtype != _TD_DTYPE)): # allow frequency conversions if dtype.kind == 'm': @@ -2523,6 +2609,7 @@ def get_dtype_kinds(l): typs.add(typ) return typs + def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single @@ -2546,6 +2633,7 @@ def is_nonempty(x): return x.shape[axis] > 0 except Exception: return True + nonempty = [x for x in to_concat if is_nonempty(x)] # If all arrays are empty, there's nothing to convert, just short-cut to @@ -2577,14 +2665,16 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) if len(typs) != 1: - if not len(typs-set(['i','u','f'])) or not len(typs-set(['bool','i','u'])): + if not len(typs - set(['i', 'u', 'f'])) or not len(typs - set( + ['bool', 'i', 'u'])): # let numpy coerce pass else: # coerce to object - to_concat = [ x.astype('object') for x in to_concat ] + to_concat = [x.astype('object') for x in to_concat] + + return np.concatenate(to_concat, axis=axis) - return np.concatenate(to_concat,axis=axis) def _where_compat(mask, arr1, arr2): if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE: @@ -2599,6 +2689,7 @@ def _where_compat(mask, arr1, arr2): return np.where(mask, arr1, arr2) + def _dict_compat(d): """ Helper function to convert datetimelike-keyed dicts to Timestamp-keyed dict @@ -2612,20 +2703,23 @@ def _dict_compat(d): dict """ - return dict((_maybe_box_datetimelike(key), value) for key, value in iteritems(d)) + return 
dict((_maybe_box_datetimelike(key), value) + for key, value in iteritems(d)) -def sentinel_factory(): +def sentinel_factory(): class Sentinel(object): pass return Sentinel() + def in_interactive_session(): """ check if we're running in an interactive shell returns True if running under python/ipython interactive shell """ + def check_main(): import __main__ as main return (not hasattr(main, '__file__') or @@ -2647,8 +2741,7 @@ def in_qtconsole(): ip = get_ipython() front_end = ( ip.config.get('KernelApp', {}).get('parent_appname', "") or - ip.config.get('IPKernelApp', {}).get('parent_appname', "") - ) + ip.config.get('IPKernelApp', {}).get('parent_appname', "")) if 'qtconsole' in front_end.lower(): return True except: @@ -2667,8 +2760,7 @@ def in_ipnb(): ip = get_ipython() front_end = ( ip.config.get('KernelApp', {}).get('parent_appname', "") or - ip.config.get('IPKernelApp', {}).get('parent_appname', "") - ) + ip.config.get('IPKernelApp', {}).get('parent_appname', "")) if 'notebook' in front_end.lower(): return True except: @@ -2737,7 +2829,11 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): s = iter(seq) r = [] for i in range(min(nitems, len(seq))): # handle sets, no slicing - r.append(pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) + r.append(pprint_thing( + next(s), + _nest_lvl + 1, + max_seq_items=max_seq_items, + **kwds)) body = ", ".join(r) if nitems < len(seq): @@ -2764,8 +2860,14 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): nitems = max_seq_items or get_option("max_seq_items") or len(seq) for k, v in list(seq.items())[:nitems]: - pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), - pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))) + pairs.append(pfmt % (pprint_thing(k, + _nest_lvl + 1, + max_seq_items=max_seq_items, + **kwds), + pprint_thing(v, + _nest_lvl + 1, + max_seq_items=max_seq_items, + **kwds))) if nitems < len(seq): return fmt % (", ".join(pairs) + ", ...") @@ -2773,8 +2875,12 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): return fmt % ", ".join(pairs) -def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, - quote_strings=False, max_seq_items=None): +def pprint_thing(thing, + _nest_lvl=0, + escape_chars=None, + default_escapes=False, + quote_strings=False, + max_seq_items=None): """ This function is the sanctioned way of converting objects to a unicode representation. @@ -2801,6 +2907,7 @@ def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, result - unicode object on py2, str on py3. Always Unicode. 
""" + def as_escaped_unicode(thing, escape_chars=escape_chars): # Unicode is fine, else we try to decode using utf-8 and 'replace' # if that's not it either, we have no way of knowing and the user @@ -2812,10 +2919,7 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): # either utf-8 or we replace errors result = str(thing).decode('utf-8', "replace") - translate = {'\t': r'\t', - '\n': r'\n', - '\r': r'\r', - } + translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', } if isinstance(escape_chars, dict): if default_escapes: translate.update(escape_chars) @@ -2833,11 +2937,17 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return compat.text_type(thing) elif (isinstance(thing, dict) and _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items) + result = _pprint_dict(thing, + _nest_lvl, + quote_strings=True, + max_seq_items=max_seq_items) elif is_sequence(thing) and _nest_lvl < \ get_option("display.pprint_nest_depth"): - result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, - quote_strings=quote_strings, max_seq_items=max_seq_items) + result = _pprint_seq(thing, + _nest_lvl, + escape_chars=escape_chars, + quote_strings=quote_strings, + max_seq_items=max_seq_items) elif isinstance(thing, compat.string_types) and quote_strings: if compat.PY3: fmt = "'%s'" @@ -2863,8 +2973,8 @@ def console_encode(object, **kwds): set in display.encoding. Use this everywhere where you output to the console. """ - return pprint_thing_encoded(object, - get_option("display.encoding")) + return pprint_thing_encoded(object, get_option("display.encoding")) + def _maybe_match_name(a, b): a_has = hasattr(a, 'name') @@ -2880,6 +2990,7 @@ def _maybe_match_name(a, b): return b.name return None + def _random_state(state=None): """ Helper function for processing random_state arguments. 
@@ -2905,4 +3016,5 @@ def _random_state(state=None): elif state is None: return np.random.RandomState() else: - raise ValueError("random_state must be an integer, a numpy RandomState, or None") + raise ValueError( + "random_state must be an integer, a numpy RandomState, or None") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6207ac5dc5c12..24b5b61468827 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,17 +23,15 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index, _maybe_upcast, is_sequence, - _infer_dtype_from_scalar, _values_from_object, - is_list_like, _maybe_box_datetimelike, - is_categorical_dtype, is_object_dtype, - is_internal_type, is_datetimetz, - _possibly_infer_to_datetimelike, _dict_compat) +from pandas.core.common import ( + isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, + is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like, + _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype, + is_internal_type, is_datetimetz, _possibly_infer_to_datetimelike, + _dict_compat) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import (maybe_droplevels, - convert_to_index_sliceable, +from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, @@ -43,11 +41,11 @@ import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval from numpy import percentile as _quantile -from pandas.compat import(range, map, zip, lrange, lmap, lzip, StringIO, u, - OrderedDict, raise_with_traceback) +from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, + OrderedDict, raise_with_traceback) from pandas import compat -from pandas.util.decorators import (deprecate, Appender, - Substitution, deprecate_kwarg) +from pandas.util.decorators import (deprecate, Appender, Substitution, + deprecate_kwarg) from pandas.tseries.period import PeriodIndex from pandas.tseries.index import DatetimeIndex @@ -68,7 +66,8 @@ #---------------------------------------------------------------------- # Docstring templates -_shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame', +_shared_doc_kwargs = dict(axes='index, columns', + klass='DataFrame', axes_single_arg="{0, 1, 'index', 'columns'}") _numeric_only_doc = """numeric_only : boolean, default None @@ -160,7 +159,6 @@ class DataFrame(NDFrame): - """ Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). Arithmetic operations align on both row and column labels. 
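Editor's aside: the hunks that follow reformat `DataFrame.__init__`'s dispatch on the `data` argument. For orientation, the same constructor keywords (`data`, `index`, `columns`, `dtype`, `copy`) apply whether data is a dict, ndarray, or list of records; a quick usage reminder using standard pandas API only:

import numpy as np
import pandas as pd

# dict of columns with an explicit index
df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}, index=['x', 'y'])
# 2-D ndarray with explicit column labels
df2 = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['c0', 'c1'])
print(df.dtypes)  # a: int64, b: float64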
Can be thought of as a dict-like @@ -208,7 +206,11 @@ def _constructor_expanddim(self): from pandas.core.panel import Panel return Panel - def __init__(self, data=None, index=None, columns=None, dtype=None, + def __init__(self, + data=None, + index=None, + columns=None, + dtype=None, copy=False): if data is None: data = {} @@ -219,8 +221,11 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = data._data if isinstance(data, BlockManager): - mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr(data, + axes=dict(index=index, + columns=columns), + dtype=dtype, + copy=copy) elif isinstance(data, dict): mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): @@ -238,7 +243,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data[mask] = fill_value else: data = data.copy() - mgr = self._init_ndarray(data, index, columns, dtype=dtype, + mgr = self._init_ndarray(data, + index, + columns, + dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): @@ -249,10 +257,15 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, columns = data_columns mgr = self._init_dict(data, index, columns, dtype=dtype) elif getattr(data, 'name', None): - mgr = self._init_dict({data.name: data}, index, columns, + mgr = self._init_dict({data.name: data}, + index, + columns, dtype=dtype) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, + mgr = self._init_ndarray(data, + index, + columns, + dtype=dtype, copy=copy) elif isinstance(data, (list, types.GeneratorType)): if isinstance(data, types.GeneratorType): @@ -273,10 +286,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, else: index = _default_index(len(data)) - mgr = _arrays_to_mgr(arrays, columns, index, columns, + mgr = _arrays_to_mgr(arrays, + columns, + index, + columns, dtype=dtype) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, + mgr = self._init_ndarray(data, + index, + columns, + dtype=dtype, copy=copy) else: mgr = self._init_dict({}, index, columns, dtype=dtype) @@ -298,7 +317,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, values = np.empty((len(index), len(columns)), dtype=dtype) values.fill(data) - mgr = self._init_ndarray(values, index, columns, dtype=dtype, + mgr = self._init_ndarray(values, + index, + columns, + dtype=dtype, copy=False) else: raise PandasError('DataFrame constructor not properly called!') @@ -358,11 +380,9 @@ def _init_dict(self, data, index, columns, dtype=None): columns = data_names = Index(keys) arrays = [data[k] for k in keys] - return _arrays_to_mgr(arrays, data_names, index, columns, - dtype=dtype) + return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) - def _init_ndarray(self, values, index, columns, dtype=None, - copy=False): + def _init_ndarray(self, values, index, columns, dtype=None, copy=False): # input must be a ndarray, list, Series, index if isinstance(values, Series): @@ -395,20 +415,23 @@ def _get_axes(N, K, index=index, columns=columns): # we could have a categorical type passed or coerced to 'category' # recast this to an _arrays_to_mgr - if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype): + if is_categorical_dtype(getattr(values, 'dtype', + None)) or is_categorical_dtype(dtype): - if not hasattr(values,'dtype'): + if not hasattr(values, 'dtype'): values = _prep_ndarray(values, copy=copy) values = 
values.ravel() elif copy: values = values.copy() - index, columns = _get_axes(len(values),1) - return _arrays_to_mgr([ values ], columns, index, columns, + index, columns = _get_axes(len(values), 1) + return _arrays_to_mgr([values], + columns, + index, + columns, dtype=dtype) elif is_datetimetz(values): - return self._init_dict({ 0 : values }, index, columns, - dtype=dtype) + return self._init_dict({0: values}, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -474,7 +497,7 @@ def _repr_fits_horizontal_(self, ignore_width=False): # exceed max columns if ((max_columns and nb_columns > max_columns) or - ((not ignore_width) and width and nb_columns > (width // 2))): + ((not ignore_width) and width and nb_columns > (width // 2))): return False if (ignore_width # used by repr_html under IPython notebook @@ -513,9 +536,8 @@ def _repr_fits_horizontal_(self, ignore_width=False): def _info_repr(self): """True if the repr should show the info view.""" info_repr_option = (get_option("display.large_repr") == "info") - return info_repr_option and not ( - self._repr_fits_horizontal_() and self._repr_fits_vertical_() - ) + return info_repr_option and not (self._repr_fits_horizontal_() and + self._repr_fits_vertical_()) def __unicode__(self): """ @@ -536,8 +558,11 @@ def __unicode__(self): width, _ = fmt.get_console_size() else: width = None - self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols, - line_width=width, show_dimensions=show_dimensions) + self.to_string(buf=buf, + max_rows=max_rows, + max_cols=max_cols, + line_width=width, + show_dimensions=show_dimensions) return buf.getvalue() @@ -560,8 +585,8 @@ def _repr_html_(self): buf = StringIO(u("")) self.info(buf=buf) # need to escape the , should be the first line. - val = buf.getvalue().replace('<', r'<', 1).replace('>', - r'>', 1) + val = buf.getvalue().replace('<', r'<', 1).replace('>', r'>', + 1) return '
<pre>' + val + '</pre>
' if get_option("display.notebook_repr_html"): @@ -569,7 +594,8 @@ def _repr_html_(self): max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") - return self.to_html(max_rows=max_rows, max_cols=max_cols, + return self.to_html(max_rows=max_rows, + max_cols=max_cols, show_dimensions=show_dimensions, notebook=True) else: @@ -610,7 +636,7 @@ def iteritems(self): yield k, self._get_item_cache(k) else: for i, k in enumerate(self.columns): - yield k, self._ixs(i,axis=1) + yield k, self._ixs(i, axis=1) def iterrows(self): """ @@ -711,8 +737,9 @@ def itertuples(self, index=True, name="Pandas"): if name is not None and len(self.columns) + index < 256: # `rename` is unsupported in Python 2.6 try: - itertuple = collections.namedtuple( - name, fields+list(self.columns), rename=True) + itertuple = collections.namedtuple(name, + fields + list(self.columns), + rename=True) return map(itertuple._make, zip(*arrays)) except Exception: pass @@ -758,9 +785,10 @@ def dot(self, other): (lvals.shape, rvals.shape)) if isinstance(other, DataFrame): - return self._constructor(np.dot(lvals, rvals), - index=left.index, - columns=other.columns) + return self._constructor( + np.dot(lvals, rvals), + index=left.index, + columns=other.columns) elif isinstance(other, Series): return Series(np.dot(lvals, rvals), index=left.index) elif isinstance(rvals, (np.ndarray, Index)): @@ -846,20 +874,28 @@ def to_dict(self, orient='dict'): elif orient.lower().startswith('sp'): return {'index': self.index.tolist(), 'columns': self.columns.tolist(), - 'data': lib.map_infer(self.values.ravel(), _maybe_box_datetimelike) + 'data': + lib.map_infer(self.values.ravel(), _maybe_box_datetimelike) .reshape(self.values.shape).tolist()} elif orient.lower().startswith('s'): - return dict((k, _maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) + return dict((k, _maybe_box_datetimelike(v)) + for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [dict((k, _maybe_box_datetimelike(v)) for k, v in zip(self.columns, row)) + return [dict((k, _maybe_box_datetimelike(v)) + for k, v in zip(self.columns, row)) for row in self.values] elif orient.lower().startswith('i'): return dict((k, v.to_dict()) for k, v in self.iterrows()) else: raise ValueError("orient '%s' not understood" % orient) - def to_gbq(self, destination_table, project_id, chunksize=10000, - verbose=True, reauth=False, if_exists='fail'): + def to_gbq(self, + destination_table, + project_id, + chunksize=10000, + verbose=True, + reauth=False, + if_exists='fail'): """Write a DataFrame to a Google BigQuery table. 
THIS IS AN EXPERIMENTAL LIBRARY @@ -888,13 +924,22 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, """ from pandas.io import gbq - return gbq.to_gbq(self, destination_table, project_id=project_id, - chunksize=chunksize, verbose=verbose, - reauth=reauth, if_exists=if_exists) + return gbq.to_gbq(self, + destination_table, + project_id=project_id, + chunksize=chunksize, + verbose=verbose, + reauth=reauth, + if_exists=if_exists) @classmethod - def from_records(cls, data, index=None, exclude=None, columns=None, - coerce_float=False, nrows=None): + def from_records(cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None): """ Convert structured or record ndarray to DataFrame @@ -971,7 +1016,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, columns = _ensure_index(columns) arr_columns = columns else: - arrays, arr_columns = _to_arrays(data, columns, + arrays, arr_columns = _to_arrays(data, + columns, coerce_float=coerce_float) arr_columns = _ensure_index(arr_columns) @@ -1000,7 +1046,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, to_remove = [arr_columns.get_loc(field) for field in index] result_index = MultiIndex.from_arrays( - [arrays[i] for i in to_remove], names=index) + [arrays[i] for i in to_remove], + names=index) exclude.update(index) except Exception: @@ -1014,8 +1061,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = _arrays_to_mgr(arrays, arr_columns, result_index, - columns) + mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns) return cls(mgr) @@ -1125,8 +1171,14 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): return cls(mgr) @classmethod - def from_csv(cls, path, header=0, sep=',', index_col=0, - parse_dates=True, encoding=None, tupleize_cols=False, + def from_csv(cls, + path, + header=0, + sep=',', + index_col=0, + parse_dates=True, + encoding=None, + tupleize_cols=False, infer_datetime_format=False): """ Read CSV file (DISCOURAGED, please use :func:`pandas.read_csv` instead). 
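Editor's aside: the docstring above steers users to `pandas.read_csv`. A minimal equivalent call passing the defaults `from_csv` applies per its signature in this hunk (`index_col=0`, `parse_dates=True`); the sample data is invented for illustration:

import pandas as pd
from io import StringIO

# toy CSV standing in for a file path; only the keywords matter here
csv_data = StringIO("date,value\n2016-01-04,1\n2016-01-05,2\n")
df = pd.read_csv(csv_data, index_col=0, parse_dates=True)
print(df.index.dtype)  # datetime64[ns]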
@@ -1177,9 +1229,13 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, """ from pandas.io.parsers import read_table - return read_table(path, header=header, sep=sep, - parse_dates=parse_dates, index_col=index_col, - encoding=encoding, tupleize_cols=tupleize_cols, + return read_table(path, + header=header, + sep=sep, + parse_dates=parse_dates, + index_col=index_col, + encoding=encoding, + tupleize_cols=tupleize_cols, infer_datetime_format=infer_datetime_format) def to_sparse(self, fill_value=None, kind='block'): @@ -1196,7 +1252,9 @@ def to_sparse(self, fill_value=None, kind='block'): y : SparseDataFrame """ from pandas.core.sparse import SparseDataFrame - return SparseDataFrame(self._series, index=self.index, columns=self.columns, + return SparseDataFrame(self._series, + index=self.index, + columns=self.columns, default_kind=kind, default_fill_value=fill_value) @@ -1244,21 +1302,38 @@ def to_panel(self): new_axes = [selfsorted.columns, major_axis, minor_axis] # create new manager - new_mgr = selfsorted._data.reshape_nd(axes=new_axes, - labels=[major_labels, minor_labels], - shape=shape, - ref_items=selfsorted.columns) + new_mgr = selfsorted._data.reshape_nd( + axes=new_axes, + labels=[major_labels, minor_labels], + shape=shape, + ref_items=selfsorted.columns) return self._constructor_expanddim(new_mgr) to_wide = deprecate('to_wide', to_panel) - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, - columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression=None, quoting=None, - quotechar='"', line_terminator='\n', chunksize=None, - tupleize_cols=False, date_format=None, doublequote=True, - escapechar=None, decimal='.', **kwds): + def to_csv(self, + path_or_buf=None, + sep=",", + na_rep='', + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode='w', + encoding=None, + compression=None, + quoting=None, + quotechar='"', + line_terminator='\n', + chunksize=None, + tupleize_cols=False, + date_format=None, + doublequote=True, + escapechar=None, + decimal='.', + **kwds): """Write DataFrame to a comma-separated values (csv) file Parameters @@ -1320,15 +1395,22 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, .. 
versionadded:: 0.16.0 """ - formatter = fmt.CSVFormatter(self, path_or_buf, + formatter = fmt.CSVFormatter(self, + path_or_buf, line_terminator=line_terminator, - sep=sep, encoding=encoding, + sep=sep, + encoding=encoding, compression=compression, - quoting=quoting, na_rep=na_rep, - float_format=float_format, cols=columns, - header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, + quoting=quoting, + na_rep=na_rep, + float_format=float_format, + cols=columns, + header=header, + index=index, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, engine=kwds.get("engine"), tupleize_cols=tupleize_cols, date_format=date_format, @@ -1340,10 +1422,21 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, if path_or_buf is None: return formatter.path_or_buf.getvalue() - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', + def to_excel(self, + excel_writer, + sheet_name='Sheet1', + na_rep='', + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep='inf', verbose=True): """ Write DataFrame to a excel sheet @@ -1419,14 +1512,21 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', merge_cells=merge_cells, inf_rep=inf_rep) formatted_cells = formatter.get_formatted_cells() - excel_writer.write_cells(formatted_cells, sheet_name, - startrow=startrow, startcol=startcol) + excel_writer.write_cells(formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol) if need_save: excel_writer.save() - def to_stata( - self, fname, convert_dates=None, write_index=True, encoding="latin-1", - byteorder=None, time_stamp=None, data_label=None): + def to_stata(self, + fname, + convert_dates=None, + write_index=True, + encoding="latin-1", + byteorder=None, + time_stamp=None, + data_label=None): """ A class for writing Stata binary dta files from array-like objects @@ -1456,30 +1556,49 @@ def to_stata( >>> writer.write_file() """ from pandas.io.stata import StataWriter - writer = StataWriter(fname, self, convert_dates=convert_dates, - encoding=encoding, byteorder=byteorder, - time_stamp=time_stamp, data_label=data_label, + writer = StataWriter(fname, + self, + convert_dates=convert_dates, + encoding=encoding, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, write_index=write_index) writer.write_file() @Appender(fmt.docstring_to_string, indents=1) - def to_string(self, buf=None, columns=None, col_space=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, line_width=None, max_rows=None, max_cols=None, + def to_string(self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep='NaN', + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + line_width=None, + max_rows=None, + max_cols=None, show_dimensions=False): """ Render a DataFrame to a console-friendly tabular output. 
""" - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, + formatter = fmt.DataFrameFormatter(self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, formatters=formatters, float_format=float_format, sparsify=sparsify, justify=justify, index_names=index_names, - header=header, index=index, + header=header, + index=index, line_width=line_width, max_rows=max_rows, max_cols=max_cols, @@ -1491,11 +1610,25 @@ def to_string(self, buf=None, columns=None, col_space=None, return result @Appender(fmt.docstring_to_string, indents=1) - def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, bold_rows=True, classes=None, escape=True, - max_rows=None, max_cols=None, show_dimensions=False, + def to_html(self, + buf=None, + columns=None, + col_space=None, + colSpace=None, + header=True, + index=True, + na_rep='NaN', + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + bold_rows=True, + classes=None, + escape=True, + max_rows=None, + max_cols=None, + show_dimensions=False, notebook=False): """ Render a DataFrame as an HTML table. @@ -1519,17 +1652,22 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, if colSpace is not None: # pragma: no cover warnings.warn("colSpace is deprecated, use col_space", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) col_space = colSpace - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, + formatter = fmt.DataFrameFormatter(self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, formatters=formatters, float_format=float_format, sparsify=sparsify, justify=justify, index_names=index_names, - header=header, index=index, + header=header, + index=index, bold_rows=bold_rows, escape=escape, max_rows=max_rows, @@ -1542,11 +1680,22 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, return formatter.buf.getvalue() @Appender(fmt.common_docstring + fmt.return_docstring, indents=1) - def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - bold_rows=True, column_format=None, - longtable=None, escape=None): + def to_latex(self, + buf=None, + columns=None, + col_space=None, + colSpace=None, + header=True, + index=True, + na_rep='NaN', + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=True, + column_format=None, + longtable=None, + escape=None): """ Render a DataFrame to a tabular environment table. You can splice this into a LaTeX document. Requires \\usepackage{booktabs}. 
@@ -1571,17 +1720,22 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, if colSpace is not None: # pragma: no cover warnings.warn("colSpace is deprecated, use col_space", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) col_space = colSpace # Get defaults from the pandas config if longtable is None: longtable = get_option("display.latex.longtable") if escape is None: escape = get_option("display.latex.escape") - - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - header=header, index=index, + + formatter = fmt.DataFrameFormatter(self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, formatters=formatters, float_format=float_format, bold_rows=bold_rows, @@ -1593,7 +1747,12 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, if buf is None: return formatter.buf.getvalue() - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): + def info(self, + verbose=None, + buf=None, + max_cols=None, + memory_usage=None, + null_counts=None): """ Concise summary of a DataFrame. @@ -1640,8 +1799,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co # hack if max_cols is None: - max_cols = get_option( - 'display.max_info_columns', len(self.columns) + 1) + max_cols = get_option('display.max_info_columns', + len(self.columns) + 1) max_rows = get_option('display.max_info_rows', len(self) + 1) @@ -1662,8 +1821,8 @@ def _verbose_repr(): if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover - raise AssertionError('Columns must equal counts (%d != %d)' % - (len(cols), len(counts))) + raise AssertionError('Columns must equal counts (%d != %d)' + % (len(cols), len(counts))) tmpl = "%s non-null %s" dtypes = self.dtypes @@ -1675,8 +1834,7 @@ def _verbose_repr(): if show_counts: count = counts.iloc[i] - lines.append(_put_str(col, space) + - tmpl % (count, dtype)) + lines.append(_put_str(col, space) + tmpl % (count, dtype)) def _non_verbose_repr(): lines.append(self.columns.summary(name='Columns')) @@ -1709,18 +1867,17 @@ def _sizeof_fmt(num, size_qualifier): # append memory usage of df to display size_qualifier = '' if memory_usage == 'deep': - deep=True + deep = True else: # size_qualifier is just a best effort; not guaranteed to catch all # cases (e.g., it misses categorical data even with object # categories) - deep=False + deep = False if 'object' in counts or is_object_dtype(self.index): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % - _sizeof_fmt(mem_usage, size_qualifier) - ) + _sizeof_fmt(mem_usage, size_qualifier)) _put_lines(buf, lines) def memory_usage(self, index=True, deep=False): @@ -1751,11 +1908,14 @@ def memory_usage(self, index=True, deep=False): -------- numpy.ndarray.nbytes """ - result = Series([ c.memory_usage(index=False, deep=deep) for col, c in self.iteritems() ], - index=self.columns) + result = Series( + [c.memory_usage(index=False, + deep=deep) for col, c in self.iteritems()], + index=self.columns) if index: - result = Series(self.index.memory_usage(deep=deep), - index=['Index']).append(result) + result = Series( + self.index.memory_usage(deep=deep), + index=['Index']).append(result) return result def transpose(self): @@ -1786,12 +1946,15 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover (vals, idx, cols), object_state = state index = 
_unpickle_array(idx) - dm = DataFrame(vals, index=index, columns=_unpickle_array(cols), + dm = DataFrame(vals, + index=index, + columns=_unpickle_array(cols), copy=False) if object_state is not None: ovals, _, ocols = object_state - objects = DataFrame(ovals, index=index, + objects = DataFrame(ovals, + index=index, columns=_unpickle_array(ocols), copy=False) @@ -1866,7 +2029,8 @@ def irow(self, i, copy=False): """ warnings.warn("irow(i) is deprecated. Please use .iloc[i]", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) return self._ixs(i, axis=0) def icol(self, i): @@ -1874,7 +2038,8 @@ def icol(self, i): DEPRECATED. Use ``.iloc[:, i]`` instead """ warnings.warn("icol(i) is deprecated. Please use .iloc[:,i]", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) return self._ixs(i, axis=1) def _ixs(self, i, axis=0): @@ -1885,7 +2050,6 @@ def _ixs(self, i, axis=0): # irow if axis == 0: - """ Notes ----- @@ -1899,22 +2063,24 @@ def _ixs(self, i, axis=0): if isinstance(label, Index): # a location index by definition result = self.take(i, axis=axis) - copy=True + copy = True else: new_values = self._data.fast_xs(i) if lib.isscalar(new_values): return new_values # if we are a copy, mark as such - copy = isinstance(new_values,np.ndarray) and new_values.base is None - result = Series(new_values, index=self.columns, - name=self.index[i], dtype=new_values.dtype) + copy = isinstance(new_values, + np.ndarray) and new_values.base is None + result = Series(new_values, + index=self.columns, + name=self.index[i], + dtype=new_values.dtype) result._set_is_copy(self, copy=copy) return result # icol else: - """ Notes ----- @@ -1940,9 +2106,10 @@ def _ixs(self, i, axis=0): if index_len and not len(values): values = np.array([np.nan] * index_len, dtype=object) - result = self._constructor_sliced.from_array( - values, index=self.index, - name=label, fastpath=True) + result = self._constructor_sliced.from_array(values, + index=self.index, + name=label, + fastpath=True) # this is a cached value, mark it so result._set_as_cached(label, self) @@ -1954,7 +2121,8 @@ def iget_value(self, i, j): DEPRECATED. Use ``.iat[i, j]`` instead """ warnings.warn("iget_value(i, j) is deprecated. 
Please use .iat[i, j]", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) return self.iat[i, j] def __getitem__(self, key): @@ -2031,15 +2199,19 @@ def _getitem_multilevel(self, key): result.columns = result_columns else: new_values = self.values[:, loc] - result = self._constructor(new_values, index=self.index, - columns=result_columns).__finalize__(self) + result = self._constructor( + new_values, + index=self.index, + columns=result_columns).__finalize__(self) if len(result.columns) == 1: top = result.columns[0] if ((type(top) == str and top == '') or - (type(top) == tuple and top[0] == '')): + (type(top) == tuple and top[0] == '')): result = result[''] if isinstance(result, Series): - result = self._constructor_sliced(result, index=self.index, name=key) + result = self._constructor_sliced(result, + index=self.index, + name=key) result._set_is_copy(self) return result @@ -2271,16 +2443,15 @@ def select_dtypes(self, include=None, exclude=None): 'nonempty') # convert the myriad valid dtypes object to a single representation - include, exclude = map(lambda x: - frozenset(map(com._get_dtype_from_object, x)), - selection) + include, exclude = map( + lambda x: frozenset(map(com._get_dtype_from_object, x)), selection) for dtypes in (include, exclude): com._invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): - raise ValueError('include and exclude overlap on %s' - % (include & exclude)) + raise ValueError('include and exclude overlap on %s' % + (include & exclude)) # empty include/exclude -> defaults to True # three cases (we've already raised if both are empty) @@ -2318,8 +2489,10 @@ def _box_item_values(self, key, values): def _box_col_values(self, values, items): """ provide boxed values for a column """ - return self._constructor_sliced.from_array(values, index=self.index, - name=items, fastpath=True) + return self._constructor_sliced.from_array(values, + index=self.index, + name=items, + fastpath=True) def __setitem__(self, key, value): @@ -2378,16 +2551,16 @@ def _ensure_valid_index(self, value): """ # GH5632, make sure that we are a Series convertible if not len(self.index) and is_list_like(value): - try: - value = Series(value) - except: - raise ValueError('Cannot set a frame with no defined index ' - 'and a value that cannot be converted to a ' - 'Series') - - self._data = self._data.reindex_axis(value.index.copy(), axis=1, - fill_value=np.nan) + try: + value = Series(value) + except: + raise ValueError('Cannot set a frame with no defined index ' + 'and a value that cannot be converted to a ' + 'Series') + self._data = self._data.reindex_axis(value.index.copy(), + axis=1, + fill_value=np.nan) def _set_item(self, key, value): """ @@ -2426,8 +2599,10 @@ def insert(self, loc, column, value, allow_duplicates=False): """ self._ensure_valid_index(value) value = self._sanitize_column(column, value) - self._data.insert( - loc, column, value, allow_duplicates=allow_duplicates) + self._data.insert(loc, + column, + value, + allow_duplicates=allow_duplicates) def assign(self, **kwargs): """ @@ -2598,8 +2773,10 @@ def reindexer(value): def _series(self): result = {} for idx, item in enumerate(self.columns): - result[item] = Series(self._data.iget(idx), index=self.index, - name=item) + result[item] = Series( + self._data.iget(idx), + index=self.index, + name=item) return result def lookup(self, row_labels, col_labels): @@ -2656,8 +2833,8 @@ def lookup(self, row_labels, col_labels): 
#---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_axes(self, axes, level, limit, tolerance, method, - fill_value, copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, + copy): frame = self columns = axes['columns'] @@ -2672,23 +2849,41 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, return frame - def _reindex_index(self, new_index, method, copy, level, fill_value=NA, - limit=None, tolerance=None): - new_index, indexer = self.index.reindex(new_index, method, level, + def _reindex_index(self, + new_index, + method, + copy, + level, + fill_value=NA, + limit=None, + tolerance=None): + new_index, indexer = self.index.reindex(new_index, + method, + level, limit=limit, tolerance=tolerance) - return self._reindex_with_indexers({0: [new_index, indexer]}, - copy=copy, fill_value=fill_value, - allow_dups=False) - - def _reindex_columns(self, new_columns, copy, level, fill_value=NA, - limit=None, tolerance=None): - new_columns, indexer = self.columns.reindex(new_columns, level=level, + return self._reindex_with_indexers( + {0: [new_index, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False) + + def _reindex_columns(self, + new_columns, + copy, + level, + fill_value=NA, + limit=None, + tolerance=None): + new_columns, indexer = self.columns.reindex(new_columns, + level=level, limit=limit, tolerance=tolerance) - return self._reindex_with_indexers({1: [new_columns, indexer]}, - copy=copy, fill_value=fill_value, - allow_dups=False) + return self._reindex_with_indexers( + {1: [new_columns, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False) def _reindex_multi(self, axes, copy, fill_value): """ we are guaranteed non-Nones in the axes! 
""" @@ -2698,56 +2893,99 @@ def _reindex_multi(self, axes, copy, fill_value): if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer - new_values = com.take_2d_multi(self.values, indexer, + new_values = com.take_2d_multi(self.values, + indexer, fill_value=fill_value) - return self._constructor(new_values, index=new_index, + return self._constructor(new_values, + index=new_index, columns=new_columns) else: - return self._reindex_with_indexers({0: [new_index, row_indexer], - 1: [new_columns, col_indexer]}, - copy=copy, - fill_value=fill_value) + return self._reindex_with_indexers( + {0: [new_index, row_indexer], + 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value) @Appender(_shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, + def align(self, + other, + join='outer', + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, broadcast_axis=None): - return super(DataFrame, self).align(other, join=join, axis=axis, level=level, copy=copy, - fill_value=fill_value, method=method, limit=limit, - fill_axis=fill_axis, broadcast_axis=broadcast_axis) + return super(DataFrame, self).align(other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis) @Appender(_shared_docs['reindex'] % _shared_doc_kwargs) def reindex(self, index=None, columns=None, **kwargs): - return super(DataFrame, self).reindex(index=index, columns=columns, + return super(DataFrame, self).reindex(index=index, + columns=columns, **kwargs) @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) - def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): - return super(DataFrame, self).reindex_axis(labels=labels, axis=axis, - method=method, level=level, - copy=copy, limit=limit, + def reindex_axis(self, + labels, + axis=0, + method=None, + level=None, + copy=True, + limit=None, + fill_value=np.nan): + return super(DataFrame, self).reindex_axis(labels=labels, + axis=axis, + method=method, + level=level, + copy=copy, + limit=limit, fill_value=fill_value) @Appender(_shared_docs['rename'] % _shared_doc_kwargs) def rename(self, index=None, columns=None, **kwargs): - return super(DataFrame, self).rename(index=index, columns=columns, + return super(DataFrame, self).rename(index=index, + columns=columns, **kwargs) @Appender(_shared_docs['fillna'] % _shared_doc_kwargs) - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - return super(DataFrame, self).fillna(value=value, method=method, - axis=axis, inplace=inplace, - limit=limit, downcast=downcast, + def fillna(self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs): + return super(DataFrame, self).fillna(value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, **kwargs) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): - return super(DataFrame, self).shift(periods=periods, freq=freq, + return super(DataFrame, self).shift(periods=periods, + freq=freq, axis=axis) - def set_index(self, keys, drop=True, append=False, inplace=False, + def set_index(self, + keys, + drop=True, + append=False, + inplace=False, 
verify_integrity=False): """ Set the DataFrame index (row labels) using one or more existing @@ -2838,7 +3076,11 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not inplace: return frame - def reset_index(self, level=None, drop=False, inplace=False, col_level=0, + def reset_index(self, + level=None, + drop=False, + inplace=False, + col_level=0, col_fill=''): """ For DataFrame with multi-level index, return new DataFrame with @@ -2877,8 +3119,7 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, def _maybe_casted_values(index, labels=None): if isinstance(index, PeriodIndex): values = index.asobject.values - elif (isinstance(index, DatetimeIndex) and - index.tz is not None): + elif (isinstance(index, DatetimeIndex) and index.tz is not None): values = index else: values = index.values @@ -2890,11 +3131,11 @@ def _maybe_casted_values(index, labels=None): mask = labels == -1 values = values.take(labels) if mask.any(): - values, changed = com._maybe_upcast_putmask(values, - mask, np.nan) + values, changed = com._maybe_upcast_putmask(values, mask, + np.nan) return values - new_index = np.arange(len(new_obj),dtype='int64') + new_index = np.arange(len(new_obj), dtype='int64') if isinstance(self.index, MultiIndex): if level is not None: if not isinstance(level, (tuple, list)): @@ -2915,8 +3156,7 @@ def _maybe_casted_values(index, labels=None): if multi_col: if col_fill is None: - col_name = tuple([col_name] * - self.columns.nlevels) + col_name = tuple([col_name] * self.columns.nlevels) else: name_lst = [col_fill] * self.columns.nlevels lev_num = self.columns._get_level_number(col_level) @@ -2947,11 +3187,14 @@ def _maybe_casted_values(index, labels=None): if not inplace: return new_obj - #---------------------------------------------------------------------- # Reindex-based selection methods - def dropna(self, axis=0, how='any', thresh=None, subset=None, + def dropna(self, + axis=0, + how='any', + thresh=None, + subset=None, inplace=False): """ Return object with labels on given axis omitted where alternately any @@ -2979,8 +3222,10 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, if isinstance(axis, (tuple, list)): result = self for ax in axis: - result = result.dropna(how=how, thresh=thresh, - subset=subset, axis=ax) + result = result.dropna(how=how, + thresh=thresh, + subset=subset, + axis=ax) else: axis = self._get_axis_number(axis) agg_axis = 1 - axis @@ -2991,8 +3236,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, indices = ax.get_indexer_for(subset) check = indices == -1 if check.any(): - raise KeyError(list(np.compress(check,subset))) - agg_obj = self.take(indices,axis=agg_axis) + raise KeyError(list(np.compress(check, subset))) + agg_obj = self.take(indices, axis=agg_axis) count = agg_obj.count(axis=agg_axis) @@ -3015,7 +3260,10 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', + 'keep', + mapping={True: 'last', + False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset', stacklevel=3) def drop_duplicates(self, subset=None, keep='first', inplace=False): """ @@ -3049,7 +3297,10 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', + 'keep', + mapping={True: 'last', + False: 'first'}) 
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset', stacklevel=3) def duplicated(self, subset=None, keep='first'): """ @@ -3079,8 +3330,10 @@ def duplicated(self, subset=None, keep='first'): from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT def f(vals): - labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) - return labels.astype('i8',copy=False), len(shape) + labels, shape = factorize(vals, + size_hint=min( + len(self), _SIZE_HINT_LIMIT)) + return labels.astype('i8', copy=False), len(shape) if subset is None: subset = self.columns @@ -3090,7 +3343,7 @@ def f(vals): subset = subset, vals = (self[col].values for col in subset) - labels, shape = map(list, zip( * map(f, vals))) + labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) return Series(duplicated_int64(ids, keep), index=self.index) @@ -3099,8 +3352,13 @@ def f(vals): # Sorting @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs) - def sort_values(self, by, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): + def sort_values(self, + by, + axis=0, + ascending=True, + inplace=False, + kind='quicksort', + na_position='last'): axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -3120,13 +3378,16 @@ def trans(v): if com.needs_i8_conversion(v): return v.view('i8') return v + keys = [] for x in by: k = self[x].values if k.ndim == 2: - raise ValueError('Cannot sort by duplicate column %s' % str(x)) + raise ValueError('Cannot sort by duplicate column %s' % + str(x)) keys.append(trans(k)) - indexer = _lexsort_indexer(keys, orders=ascending, + indexer = _lexsort_indexer(keys, + orders=ascending, na_position=na_position) indexer = com._ensure_platform_int(indexer) else: @@ -3138,28 +3399,38 @@ def trans(v): # try to be helpful if isinstance(self.columns, MultiIndex): - raise ValueError('Cannot sort by column %s in a multi-index' - ' you need to explicity provide all the levels' - % str(by)) + raise ValueError( + 'Cannot sort by column %s in a multi-index' + ' you need to explicity provide all the levels' % + str(by)) - raise ValueError('Cannot sort by duplicate column %s' - % str(by)) + raise ValueError('Cannot sort by duplicate column %s' % + str(by)) if isinstance(ascending, (tuple, list)): ascending = ascending[0] - indexer = _nargsort(k, kind=kind, ascending=ascending, + indexer = _nargsort(k, + kind=kind, + ascending=ascending, na_position=na_position) - new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), - convert=False, verify=False) + new_data = self._data.take(indexer, + axis=self._get_block_manager_axis(axis), + convert=False, + verify=False) if inplace: return self._update_inplace(new_data) else: return self._constructor(new_data).__finalize__(self) - def sort(self, columns=None, axis=0, ascending=True, - inplace=False, kind='quicksort', na_position='last'): + def sort(self, + columns=None, + axis=0, + ascending=True, + inplace=False, + kind='quicksort', + na_position='last'): """ DEPRECATED: use :meth:`DataFrame.sort_values` @@ -3196,26 +3467,46 @@ def sort(self, columns=None, axis=0, ascending=True, if columns is None: warnings.warn("sort(....) is deprecated, use sort_index(.....)", - FutureWarning, stacklevel=2) - return self.sort_index(axis=axis, ascending=ascending, inplace=inplace) - - warnings.warn("sort(columns=....) 
is deprecated, use sort_values(by=.....)", - FutureWarning, stacklevel=2) - return self.sort_values(by=columns, axis=axis, ascending=ascending, - inplace=inplace, kind=kind, na_position=na_position) + FutureWarning, + stacklevel=2) + return self.sort_index(axis=axis, + ascending=ascending, + inplace=inplace) + + warnings.warn( + "sort(columns=....) is deprecated, use sort_values(by=.....)", + FutureWarning, + stacklevel=2) + return self.sort_values(by=columns, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position) @Appender(_shared_docs['sort_index'] % _shared_doc_kwargs) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True, by=None): + def sort_index(self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind='quicksort', + na_position='last', + sort_remaining=True, + by=None): # 10726 if by is not None: - warnings.warn("by argument to sort_index is deprecated, pls use .sort_values(by=...)", - FutureWarning, stacklevel=2) + warnings.warn( + "by argument to sort_index is deprecated, pls use .sort_values(by=...)", + FutureWarning, + stacklevel=2) if level is not None: raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) - + return self.sort_values(by, + axis=axis, + ascending=ascending, + inplace=inplace) axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -3223,7 +3514,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # sort by the index if level is not None: - new_axis, indexer = labels.sortlevel(level, ascending=ascending, + new_axis, indexer = labels.sortlevel(level, + ascending=ascending, sort_remaining=sort_remaining) elif isinstance(labels, MultiIndex): @@ -3234,7 +3526,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, if not labels.is_lexsorted(): labels = MultiIndex.from_tuples(labels.values) - indexer = _lexsort_indexer(labels.labels, orders=ascending, + indexer = _lexsort_indexer(labels.labels, + orders=ascending, na_position=na_position) else: from pandas.core.groupby import _nargsort @@ -3248,19 +3541,27 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return self.copy() - indexer = _nargsort(labels, kind=kind, ascending=ascending, + indexer = _nargsort(labels, + kind=kind, + ascending=ascending, na_position=na_position) - new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), - convert=False, verify=False) + new_data = self._data.take(indexer, + axis=self._get_block_manager_axis(axis), + convert=False, + verify=False) if inplace: return self._update_inplace(new_data) else: return self._constructor(new_data).__finalize__(self) - def sortlevel(self, level=0, axis=0, ascending=True, - inplace=False, sort_remaining=True): + def sortlevel(self, + level=0, + axis=0, + ascending=True, + inplace=False, + sort_remaining=True): """ Sort multilevel index by chosen axis and primary level. Data will be lexicographically sorted by the chosen level followed by the other @@ -3285,9 +3586,11 @@ def sortlevel(self, level=0, axis=0, ascending=True, DataFrame.sort_index(level=...) 
""" - return self.sort_index(level=level, axis=axis, ascending=ascending, - inplace=inplace, sort_remaining=sort_remaining) - + return self.sort_index(level=level, + axis=axis, + ascending=ascending, + inplace=inplace, + sort_remaining=sort_remaining) def _nsorted(self, columns, n, method, keep): if not com.is_list_like(columns): @@ -3295,7 +3598,8 @@ def _nsorted(self, columns, n, method, keep): columns = list(columns) ser = getattr(self[columns[0]], method)(n, keep=keep) ascending = dict(nlargest=False, nsmallest=True)[method] - return self.loc[ser.index].sort_values(columns, ascending=ascending, + return self.loc[ser.index].sort_values(columns, + ascending=ascending, kind='mergesort') def nlargest(self, n, columns, keep='first'): @@ -3406,8 +3710,8 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), - MultiIndex): # pragma: no cover + if not isinstance( + self._get_axis(axis), MultiIndex): # pragma: no cover raise TypeError('Can only reorder levels on a hierarchical axis.') result = self.copy() @@ -3446,7 +3750,8 @@ def _arith_op(left, right): def f(col): r = _arith_op(this[col].values, other[col].values) - return self._constructor_sliced(r, index=new_index, + return self._constructor_sliced(r, + index=new_index, dtype=r.dtype) result = dict([(col, f(col)) for col in this]) @@ -3457,7 +3762,8 @@ def f(col): def f(i): r = _arith_op(this.iloc[:, i].values, other.iloc[:, i].values) - return self._constructor_sliced(r, index=new_index, + return self._constructor_sliced(r, + index=new_index, dtype=r.dtype) result = dict([ @@ -3470,18 +3776,33 @@ def f(i): else: result = _arith_op(this.values, other.values) - return self._constructor(result, index=new_index, - columns=new_columns, copy=False) - - def _combine_series(self, other, func, fill_value=None, axis=None, + return self._constructor(result, + index=new_index, + columns=new_columns, + copy=False) + + def _combine_series(self, + other, + func, + fill_value=None, + axis=None, level=None): if axis is not None: axis = self._get_axis_name(axis) if axis == 'index': - return self._combine_match_index(other, func, level=level, fill_value=fill_value) + return self._combine_match_index(other, + func, + level=level, + fill_value=fill_value) else: - return self._combine_match_columns(other, func, level=level, fill_value=fill_value) - return self._combine_series_infer(other, func, level=level, fill_value=fill_value) + return self._combine_match_columns(other, + func, + level=level, + fill_value=fill_value) + return self._combine_series_infer(other, + func, + level=level, + fill_value=fill_value) def _combine_series_infer(self, other, func, level=None, fill_value=None): if len(other) == 0: @@ -3489,53 +3810,77 @@ def _combine_series_infer(self, other, func, level=None, fill_value=None): if len(self) == 0: # Ambiguous case, use _series so works with DataFrame - return self._constructor(data=self._series, index=self.index, + return self._constructor(data=self._series, + index=self.index, columns=self.columns) - return self._combine_match_columns(other, func, level=level, fill_value=fill_value) + return self._combine_match_columns(other, + func, + level=level, + fill_value=fill_value) def _combine_match_index(self, other, func, level=None, fill_value=None): - left, right = self.align(other, join='outer', axis=0, level=level, copy=False) + left, right = self.align(other, + join='outer', + axis=0, + level=level, + copy=False) if fill_value is not None: 
raise NotImplementedError("fill_value %r not supported." % fill_value) - return self._constructor(func(left.values.T, right.values).T, - index=left.index, - columns=self.columns, copy=False) + return self._constructor( + func(left.values.T, right.values).T, + index=left.index, + columns=self.columns, + copy=False) def _combine_match_columns(self, other, func, level=None, fill_value=None): - left, right = self.align(other, join='outer', axis=1, level=level, copy=False) + left, right = self.align(other, + join='outer', + axis=1, + level=level, + copy=False) if fill_value is not None: raise NotImplementedError("fill_value %r not supported" % fill_value) - new_data = left._data.eval( - func=func, other=right, axes=[left.columns, self.index]) + new_data = left._data.eval(func=func, + other=right, + axes=[left.columns, self.index]) return self._constructor(new_data) def _combine_const(self, other, func, raise_on_error=True): if self.empty: return self - new_data = self._data.eval(func=func, other=other, raise_on_error=raise_on_error) + new_data = self._data.eval(func=func, + other=other, + raise_on_error=raise_on_error) return self._constructor(new_data) def _compare_frame_evaluate(self, other, func, str_rep): # unique if self.columns.is_unique: + def _compare(a, b): return dict([(col, func(a[col], b[col])) for col in a.columns]) + new_data = expressions.evaluate(_compare, str_rep, self, other) - return self._constructor(data=new_data, index=self.index, - columns=self.columns, copy=False) + return self._constructor(data=new_data, + index=self.index, + columns=self.columns, + copy=False) # non-unique else: + def _compare(a, b): return dict([(i, func(a.iloc[:, i], b.iloc[:, i])) for i, col in enumerate(a.columns)]) + new_data = expressions.evaluate(_compare, str_rep, self, other) - result = self._constructor(data=new_data, index=self.index, + result = self._constructor(data=new_data, + index=self.index, copy=False) result.columns = self.columns return result @@ -3663,6 +4008,7 @@ def combine_first(self, other): ------- combined : DataFrame """ + def combiner(x, y, needs_i8_conversion=False): x_values = x.values if hasattr(x, 'values') else x y_values = y.values if hasattr(y, 'values') else y @@ -3673,12 +4019,18 @@ def combiner(x, y, needs_i8_conversion=False): else: mask = isnull(x_values) - return expressions.where(mask, y_values, x_values, + return expressions.where(mask, + y_values, + x_values, raise_on_error=True) return self.combine(other, combiner, overwrite=False) - def update(self, other, join='left', overwrite=True, filter_func=None, + def update(self, + other, + join='left', + overwrite=True, + filter_func=None, raise_conflict=False): """ Modify DataFrame in place using non-NA values from passed @@ -3727,8 +4079,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, else: mask = notnull(this) - self[col] = expressions.where( - mask, this, that, raise_on_error=True) + self[col] = expressions.where(mask, + this, + that, + raise_on_error=True) #---------------------------------------------------------------------- # Misc methods @@ -3923,8 +4277,14 @@ def diff(self, periods=1, axis=0): #---------------------------------------------------------------------- # Function application - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): + def apply(self, + func, + axis=0, + broadcast=False, + raw=False, + reduce=None, + args=(), + **kwds): """ Applies function along input axis of DataFrame. 
@@ -3994,8 +4354,10 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, if isinstance(f, np.ufunc): results = f(self.values) - return self._constructor(data=results, index=self.index, - columns=self.columns, copy=False) + return self._constructor(data=results, + index=self.index, + columns=self.columns, + copy=False) else: if not broadcast: if not all(self.shape): @@ -4015,8 +4377,8 @@ def _apply_empty_result(self, func, axis, reduce, *args, **kwds): if reduce is None: reduce = False try: - reduce = not isinstance(func(_EMPTY_SERIES, *args, **kwds), - Series) + reduce = not isinstance( + func(_EMPTY_SERIES, *args, **kwds), Series) except Exception: pass @@ -4033,8 +4395,7 @@ def _apply_raw(self, func, axis): # TODO: mixed type case if result.ndim == 2: - return DataFrame(result, index=self.index, - columns=self.columns) + return DataFrame(result, index=self.index, columns=self.columns) else: return Series(result, index=self._get_agg_axis(axis)) @@ -4042,8 +4403,8 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): # skip if we are mixed datelike and trying reduce across axes # GH6125 - if reduce and axis==1 and self._is_mixed_type and self._is_datelike_mixed_type: - reduce=False + if reduce and axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: + reduce = False # try to reduce first (by default) # this only matters if the reduction in values is of different dtype @@ -4055,12 +4416,16 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): # Create a dummy Series from an empty array index = self._get_axis(axis) empty_arr = np.empty(len(index), dtype=values.dtype) - dummy = Series(empty_arr, index=self._get_axis(axis), + dummy = Series(empty_arr, + index=self._get_axis(axis), dtype=values.dtype) try: labels = self._get_agg_axis(axis) - result = lib.reduce(values, func, axis=axis, dummy=dummy, + result = lib.reduce(values, + func, + axis=axis, + dummy=dummy, labels=labels) return Series(result, index=labels) except Exception: @@ -4068,16 +4433,20 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): dtype = object if self._is_mixed_type else None if axis == 0: - series_gen = (self._ixs(i,axis=1) for i in range(len(self.columns))) + series_gen = (self._ixs(i, + axis=1) for i in range(len(self.columns))) res_index = self.columns res_columns = self.index elif axis == 1: res_index = self.index res_columns = self.columns values = self.values - series_gen = (Series.from_array(arr, index=res_columns, name=name, dtype=dtype) - for i, (arr, name) in - enumerate(zip(values, res_index))) + series_gen = ( + Series.from_array(arr, + index=res_columns, + name=name, + dtype=dtype) + for i, (arr, name) in enumerate(zip(values, res_index))) else: # pragma : no cover raise AssertionError('Axis must be 0 or 1, got %s' % str(axis)) @@ -4107,7 +4476,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): if i is not None: k = res_index[i] e.args = e.args + ('occurred at index %s' % - com.pprint_thing(k),) + com.pprint_thing(k), ) raise if len(results) > 0 and is_sequence(results[0]): @@ -4143,7 +4512,8 @@ def _apply_broadcast(self, func, axis): for i, col in enumerate(columns): result_values[:, i] = func(target[col]) - result = self._constructor(result_values, index=target.index, + result = self._constructor(result_values, + index=target.index, columns=target.columns) if axis == 1: @@ -4194,6 +4564,7 @@ def infer(x): f = com.i8_boxer(x) x = lib.map_infer(_values_from_object(x), f) 
return lib.map_infer(_values_from_object(x), func) + return self.apply(infer) #---------------------------------------------------------------------- @@ -4262,11 +4633,13 @@ def append(self, other, ignore_index=False, verify_integrity=False): ' or if the Series has a name') index = None if other.name is None else [other.name] - combined_columns = self.columns.tolist() + self.columns.union(other.index).difference(self.columns).tolist() + combined_columns = self.columns.tolist() + self.columns.union( + other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) - other = DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) + other = DataFrame( + other.values.reshape((1, len(other))), + index=index, + columns=combined_columns) other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): @@ -4281,10 +4654,16 @@ def append(self, other, ignore_index=False, verify_integrity=False): to_concat = [self] + other else: to_concat = [self, other] - return concat(to_concat, ignore_index=ignore_index, + return concat(to_concat, + ignore_index=ignore_index, verify_integrity=verify_integrity) - def join(self, other, on=None, how='left', lsuffix='', rsuffix='', + def join(self, + other, + on=None, + how='left', + lsuffix='', + rsuffix='', sort=False): """ Join columns with other DataFrame either on index or on a key @@ -4328,10 +4707,19 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', joined : DataFrame """ # For SparseDataFrame's benefit - return self._join_compat(other, on=on, how=how, lsuffix=lsuffix, - rsuffix=rsuffix, sort=sort) - - def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', + return self._join_compat(other, + on=on, + how=how, + lsuffix=lsuffix, + rsuffix=rsuffix, + sort=sort) + + def _join_compat(self, + other, + on=None, + how='left', + lsuffix='', + rsuffix='', sort=False): from pandas.tools.merge import merge, concat @@ -4341,9 +4729,14 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', other = DataFrame({other.name: other}) if isinstance(other, DataFrame): - return merge(self, other, left_on=on, how=how, - left_index=on is None, right_index=True, - suffixes=(lsuffix, rsuffix), sort=sort) + return merge(self, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort) else: if on is not None: raise ValueError('Joining multiple DataFrames only supported' @@ -4361,27 +4754,50 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', can_concat = all(df.index.is_unique for df in frames) if can_concat: - return concat(frames, axis=1, join=how, join_axes=join_axes, + return concat(frames, + axis=1, + join=how, + join_axes=join_axes, verify_integrity=True) joined = frames[0] for frame in frames[1:]: - joined = merge(joined, frame, how=how, - left_index=True, right_index=True) + joined = merge(joined, + frame, + how=how, + left_index=True, + right_index=True) return joined @Substitution('') @Appender(_merge_doc, indents=2) - def merge(self, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False): + def merge(self, + right, + how='inner', + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=('_x', '_y'), + copy=True, + indicator=False): from pandas.tools.merge import 
merge - return merge(self, right, how=how, on=on, - left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index, sort=sort, - suffixes=suffixes, copy=copy, indicator=indicator) + return merge(self, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator) def round(self, decimals=0, out=None): """ @@ -4456,9 +4872,11 @@ def _series_round(s, decimals): new_cols = [col for col in _dict_round(self, decimals)] elif com.is_integer(decimals): # Dispatch to Series.round - new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] + new_cols = [_series_round(v, decimals) + for _, v in self.iteritems()] else: - raise TypeError("decimals must be an integer, a dict-like or a Series") + raise TypeError( + "decimals must be an integer, a dict-like or a Series") if len(new_cols) > 0: return concat(new_cols, axis=1) @@ -4492,11 +4910,11 @@ def corr(self, method='pearson', min_periods=1): mat = numeric_df.values if method == 'pearson': - correl = _algos.nancorr(com._ensure_float64(mat), - minp=min_periods) + correl = _algos.nancorr(com._ensure_float64(mat), minp=min_periods) elif method == 'spearman': - correl = _algos.nancorr_spearman(com._ensure_float64(mat), - minp=min_periods) + correl = _algos.nancorr_spearman( + com._ensure_float64(mat), + minp=min_periods) else: if min_periods is None: min_periods = 1 @@ -4555,8 +4973,10 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = _algos.nancorr(com._ensure_float64(mat), cov=True, - minp=min_periods) + baseCov = _algos.nancorr( + com._ensure_float64(mat), + cov=True, + minp=min_periods) return self._constructor(baseCov, index=cols, columns=cols) @@ -4634,7 +5054,8 @@ def count(self, axis=0, level=None, numeric_only=False): """ axis = self._get_axis_number(axis) if level is not None: - return self._count_level(level, axis=axis, + return self._count_level(level, + axis=axis, numeric_only=numeric_only) if numeric_only: @@ -4687,8 +5108,7 @@ def _count_level(self, level, axis=0, numeric_only=False): labels = com._ensure_int64(count_axis.labels[level]) counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) - result = DataFrame(counts, index=level_index, - columns=agg_axis) + result = DataFrame(counts, index=level_index, columns=agg_axis) if axis == 1: # Undo our earlier transpose @@ -4696,8 +5116,14 @@ def _count_level(self, level, axis=0, numeric_only=False): else: return result - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce(self, + op, + name, + axis=0, + skipna=True, + numeric_only=None, + filter_type=None, + **kwds): axis = self._get_axis_number(axis) f = lambda x: op(x, axis=axis, skipna=skipna, **kwds) labels = self._get_agg_axis(axis) @@ -4719,7 +5145,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, # this can end up with a non-reduction # but not always. if the types are mixed # with datelike then need to make sure a series - result = self.apply(f,reduce=False) + result = self.apply(f, reduce=False) if result.ndim == self.ndim: result = result.iloc[0] return result @@ -4732,8 +5158,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, data = self._get_bool_data() else: # pragma: no cover e = NotImplementedError("Handling exception with filter_" - "type %s not implemented." 
- % filter_type) + "type %s not implemented." % + filter_type) raise_with_traceback(e) result = f(data.values) labels = data._get_agg_axis(axis) @@ -4946,7 +5372,8 @@ def f(arr, per): quantiles = [[f(vals, x) for x in per] for (_, vals) in data.iteritems()] - result = self._constructor(quantiles, index=data._info_axis, + result = self._constructor(quantiles, + index=data._info_axis, columns=q).T if len(is_dt_col) > 0: result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp) @@ -4958,8 +5385,13 @@ def f(arr, per): result.name = None # For groupby, so it can set an index name return result - def rank(self, axis=0, numeric_only=None, method='average', - na_option='keep', ascending=True, pct=False): + def rank(self, + axis=0, + numeric_only=None, + method='average', + na_option='keep', + ascending=True, + pct=False): """ Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the average of the ranks of those values @@ -4992,10 +5424,14 @@ def rank(self, axis=0, numeric_only=None, method='average', axis = self._get_axis_number(axis) if numeric_only is None: try: - ranks = algos.rank(self.values, axis=axis, method=method, - ascending=ascending, na_option=na_option, + ranks = algos.rank(self.values, + axis=axis, + method=method, + ascending=ascending, + na_option=na_option, pct=pct) - return self._constructor(ranks, index=self.index, + return self._constructor(ranks, + index=self.index, columns=self.columns) except TypeError: numeric_only = True @@ -5003,8 +5439,12 @@ def rank(self, axis=0, numeric_only=None, method='average', data = self._get_numeric_data() else: data = self - ranks = algos.rank(data.values, axis=axis, method=method, - ascending=ascending, na_option=na_option, pct=pct) + ranks = algos.rank(data.values, + axis=axis, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct) return self._constructor(ranks, index=data.index, columns=data.columns) def to_timestamp(self, freq=None, how='start', axis=0, copy=True): @@ -5125,8 +5565,10 @@ def isin(self, values): from collections import defaultdict from pandas.tools.merge import concat values = defaultdict(list, values) - return concat((self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns)), axis=1) + return concat( + (self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns)), + axis=1) elif isinstance(values, Series): if not values.index.is_unique: raise ValueError("ValueError: cannot compute isin with" @@ -5143,10 +5585,10 @@ def isin(self, values): " allowed to be passed to DataFrame.isin(), " "you passed a " "{0!r}".format(type(values).__name__)) - return DataFrame(lib.ismember(self.values.ravel(), - set(values)).reshape(self.shape), - self.index, - self.columns) + return DataFrame( + lib.ismember(self.values.ravel(), + set(values)).reshape(self.shape), self.index, + self.columns) #---------------------------------------------------------------------- # Deprecated stuff @@ -5175,7 +5617,8 @@ def combineAdd(self, other): """ warnings.warn("'combineAdd' is deprecated. Use " "'DataFrame.add(other, fill_value=0.)' instead", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) return self.add(other, fill_value=0.) def combineMult(self, other): @@ -5201,17 +5644,22 @@ def combineMult(self, other): """ warnings.warn("'combineMult' is deprecated. Use " "'DataFrame.mul(other, fill_value=1.)' instead", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) return self.mul(other, fill_value=1.) 
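
For completeness, the replacements named in the two deprecation warnings above, written out as runnable calls (toy frames, not part of the patch). The fill_value supplies the additive or multiplicative identity for slots present on only one side:

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0]})
other = pd.DataFrame({'a': [10.0, None]})

# replaces the deprecated df.combineAdd(other)
summed = df.add(other, fill_value=0.)

# replaces the deprecated df.combineMult(other)
scaled = df.mul(other, fill_value=1.)
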
-DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, - axes_are_reversed=True, aliases={'rows': 0}) +DataFrame._setup_axes(['index', 'columns'], + info_axis=1, + stat_axis=0, + axes_are_reversed=True, + aliases={'rows': 0}) DataFrame._add_numeric_operations() DataFrame._add_series_or_dataframe_operations() _EMPTY_SERIES = Series([]) + def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): """ Segregate Series based on type and coerce into matrices. @@ -5275,8 +5723,8 @@ def extract_index(data): if have_series: if lengths[0] != len(index): - msg = ('array length %d does not match index length %d' - % (lengths[0], len(index))) + msg = ('array length %d does not match index length %d' % + (lengths[0], len(index))) raise ValueError(msg) else: index = Index(np.arange(lengths[0])) @@ -5324,11 +5772,12 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): """ if isinstance(data, DataFrame): if columns is not None: - arrays = [data._ixs(i,axis=1).values for i, col in enumerate(data.columns) - if col in columns] + arrays = [data._ixs(i, + axis=1).values + for i, col in enumerate(data.columns) if col in columns] else: columns = data.columns - arrays = [data._ixs(i,axis=1).values for i in range(len(columns))] + arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] return arrays, columns @@ -5339,22 +5788,26 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, + return _list_to_arrays(data, + columns, + coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], collections.Mapping): - return _list_of_dict_to_arrays(data, columns, + return _list_of_dict_to_arrays(data, + columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], Series): - return _list_of_series_to_arrays(data, columns, + return _list_of_series_to_arrays(data, + columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], Categorical): if columns is None: columns = _default_index(len(data)) return data, columns - elif (isinstance(data, (np.ndarray, Series, Index)) - and data.dtype.names is not None): + elif (isinstance(data, (np.ndarray, Series, Index)) and + data.dtype.names is not None): columns = list(data.dtype.names) arrays = [data[k] for k in columns] @@ -5362,7 +5815,8 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): else: # last ditch effort data = lmap(tuple, data) - return _list_to_arrays(data, columns, + return _list_to_arrays(data, + columns, coerce_float=coerce_float, dtype=dtype) @@ -5408,10 +5862,8 @@ def _reorder_arrays(arrays, arr_columns, columns): # reorder according to the columns if (columns is not None and len(columns) and arr_columns is not None and len(arr_columns)): - indexer = _ensure_index( - arr_columns).get_indexer(columns) - arr_columns = _ensure_index( - [arr_columns[i] for i in indexer]) + indexer = _ensure_index(arr_columns).get_indexer(columns) + arr_columns = _ensure_index([arr_columns[i] for i in indexer]) arrays = [arrays[i] for i in indexer] return arrays, arr_columns @@ -5422,7 +5874,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): else: # list of lists content = list(lib.to_object_array(data).T) - return _convert_object_array(content, columns, dtype=dtype, + return _convert_object_array(content, + columns, + dtype=dtype, coerce_float=coerce_float) @@ -5454,7 
+5908,9 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if values.dtype == np.object_: content = list(values.T) - return _convert_object_array(content, columns, dtype=dtype, + return _convert_object_array(content, + columns, + dtype=dtype, coerce_float=coerce_float) else: return values.T, columns @@ -5470,7 +5926,9 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): data = [(type(d) is dict) and d or dict(d) for d in data] content = list(lib.dicts_to_array(data, list(columns)).T) - return _convert_object_array(content, columns, dtype=dtype, + return _convert_object_array(content, + columns, + dtype=dtype, coerce_float=coerce_float) @@ -5490,7 +5948,7 @@ def convert(arr): arr = com._possibly_cast_to_datetime(arr, dtype) return arr - arrays = [ convert(arr) for arr in content ] + arrays = [convert(arr) for arr in content] return arrays, columns @@ -5537,7 +5995,10 @@ def _homogenize(data, index, dtype=None): else: v = dict(v) v = lib.fast_multiget(v, oindex.values, default=NA) - v = _sanitize_array(v, index, dtype=dtype, copy=False, + v = _sanitize_array(v, + index, + dtype=dtype, + copy=False, raise_cast_failure=False) homogenized.append(v) @@ -5558,29 +6019,45 @@ def _from_nested_dict(data): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) - #---------------------------------------------------------------------- # Add plotting methods to DataFrame import pandas.tools.plotting as gfx -DataFrame.plot = base.AccessorProperty(gfx.FramePlotMethods, gfx.FramePlotMethods) +DataFrame.plot = base.AccessorProperty(gfx.FramePlotMethods, + gfx.FramePlotMethods) DataFrame.hist = gfx.hist_frame @Appender(_shared_docs['boxplot'] % _shared_doc_kwargs) -def boxplot(self, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, +def boxplot(self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, **kwds): import pandas.tools.plotting as plots import matplotlib.pyplot as plt - ax = plots.boxplot(self, column=column, by=by, ax=ax, - fontsize=fontsize, grid=grid, rot=rot, - figsize=figsize, layout=layout, return_type=return_type, + ax = plots.boxplot(self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + grid=grid, + rot=rot, + figsize=figsize, + layout=layout, + return_type=return_type, **kwds) plt.draw_if_interactive() return ax + DataFrame.boxplot = boxplot ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs)
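
End to end, the dominant pattern in this commit's hunks is the same: calls and signatures that overflow the column limit are split to one argument per line, aligned under the opening parenthesis. The reformat was presumably produced by running YAPF with a default PEP 8 style over the touched files; the exact invocation is not recorded in the patch. A self-contained illustration of the resulting call style (toy frame of arbitrary values; sort_values chosen because its hunks above are among the largest):

import pandas as pd

frame = pd.DataFrame({'a': [3, 1, 2], 'b': [1.0, 2.0, 3.0]})

# Post-reformat style: one keyword argument per line, aligned under the
# opening parenthesis. This is also the sort_values call that the
# deprecated DataFrame.sort now delegates to.
result = frame.sort_values(by=['a'],
                           axis=0,
                           ascending=True,
                           inplace=False,
                           kind='quicksort',
                           na_position='last')
print(result)
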