diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index e5347f03b5462..d1c983769ed2a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -12,6 +12,7 @@
 import pandas.hashtable as htable
 from pandas.compat import string_types
 
+
 def match(to_match, values, na_sentinel=-1):
     """
     Compute locations of to_match into values
@@ -44,7 +45,8 @@ def match(to_match, values, na_sentinel=-1):
         # replace but return a numpy array
         # use a Series because it handles dtype conversions properly
         from pandas.core.series import Series
-        result = Series(result.ravel()).replace(-1,na_sentinel).values.reshape(result.shape)
+        result = Series(result.ravel()).replace(-1, na_sentinel).values.\
+            reshape(result.shape)
 
     return result
 
@@ -63,6 +65,7 @@ def unique(values):
     uniques
     """
     values = com._asarray_tuplesafe(values)
+
     f = lambda htype, caster: _unique_generic(values, htype, caster)
     return _hashtable_algo(f, values.dtype)
 
@@ -95,9 +98,9 @@ def isin(comps, values):
         # work-around for numpy < 1.8 and comparisions on py3
         # faster for larger cases to use np.in1d
         if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
-            f = lambda x, y: np.in1d(x,np.asarray(list(y)))
+            f = lambda x, y: np.in1d(x, np.asarray(list(y)))
         else:
-            f = lambda x, y: lib.ismember_int64(x,set(y))
+            f = lambda x, y: lib.ismember_int64(x, set(y))
 
     # may need i8 conversion for proper membership testing
     if com.is_datetime64_dtype(comps):
@@ -115,6 +118,7 @@ def isin(comps, values):
 
     return f(comps, values)
 
+
 def _hashtable_algo(f, dtype, return_dtype=None):
     """
     f(HashTable, type_caster) -> result
@@ -148,8 +152,6 @@ def _unique_generic(values, table_type, type_caster):
     return type_caster(uniques)
 
 
-
-
 def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     """
     Encode input values as an enumerated type or categorical variable
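
For reference (not part of the patch): the two membership strategies kept by the isin() hunk above are interchangeable on int64 data; the np.in1d fallback used for large inputs, or for old numpy on py3, behaves like a per-element set lookup. A minimal sketch with illustrative data:

    import numpy as np

    comps = np.array([1, 2, 3, 4], dtype=np.int64)
    values = {2, 4}
    # same answer the lib.ismember_int64(comps, set(values)) branch gives
    mask = np.in1d(comps, np.asarray(list(values)))
    assert mask.tolist() == [False, True, False, True]
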
@@ -169,12 +171,15 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     -------
     labels : the indexer to the original array
     uniques : ndarray (1-d) or Index
-        the unique values. Index is returned when passed values is Index or Series
+        the unique values. Index is returned when passed values is Index or
+        Series
 
-    note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex
+    note: an array of Periods will ignore sort as it returns an always sorted
+    PeriodIndex
     """
     if order is not None:
-        msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926"
+        msg = "order is deprecated. See " \
+              "https://github.com/pydata/pandas/issues/6926"
         warn(msg, FutureWarning, stacklevel=2)
 
     from pandas.core.index import Index
@@ -203,10 +208,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
 
             # order ints before strings
             ordered = np.concatenate([
-                np.sort(np.array([ e for i, e in enumerate(uniques) if f(e) ],dtype=object)) for f in [ lambda x: not isinstance(x,string_types),
-                                                                                                        lambda x: isinstance(x,string_types) ]
-                ])
-            sorter = com._ensure_platform_int(t.lookup(com._ensure_object(ordered)))
+                np.sort(np.array([e for i, e in enumerate(uniques) if f(e)],
+                        dtype=object)) for f in
+                [lambda x: not isinstance(x, string_types),
+                 lambda x: isinstance(x, string_types)]])
+            sorter = com._ensure_platform_int(t.lookup(
+                com._ensure_object(ordered)))
             reverse_indexer = np.empty(len(sorter), dtype=np.int_)
             reverse_indexer.put(sorter, np.arange(len(sorter)))
 
@@ -276,7 +283,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
     is_period = com.is_period_arraylike(values)
     is_datetimetz = com.is_datetimetz(values)
 
-    if com.is_datetime_or_timedelta_dtype(dtype) or is_period or is_datetimetz:
+    if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
+            is_datetimetz:
 
         if is_period:
             values = PeriodIndex(values)
@@ -300,7 +308,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
         else:
             keys = keys.astype(dtype)
 
-
     elif com.is_integer_dtype(dtype):
         values = com._ensure_int64(values)
         keys, counts = htable.value_count_scalar64(values, dropna)
@@ -322,7 +329,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
     if bins is not None:
         # TODO: This next line should be more efficient
-        result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
+        result = result.reindex(np.arange(len(cat.categories)),
+                                fill_value=0)
         result.index = bins[:-1]
 
     if sort:
@@ -525,12 +533,11 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
 
 
 def nsmallest(arr, n, keep='first'):
-    '''
+    """
    Find the indices of the n smallest values of a numpy array.
 
     Note: Fails silently with NaN.
-
-    '''
+    """
     if keep == 'last':
         arr = arr[::-1]
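
Also for reference: the int-before-string ordering handled in the factorize() hunk above exists because mixed int/str uniques cannot be compared directly on Python 3. A small illustration (data made up; behaviour as of the pandas era this patch targets):

    import numpy as np
    import pandas as pd

    labels, uniques = pd.factorize(np.array([1, 'b', 1, 'a'], dtype=object),
                                   sort=True)
    # ints are ordered before strings:
    # labels -> [0, 2, 0, 1], uniques -> [1, 'a', 'b']
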
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 548b922926f02..7164a593d8e96 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -7,8 +7,7 @@
 from pandas.core import common as com
 import pandas.core.nanops as nanops
 import pandas.lib as lib
-from pandas.util.decorators import (Appender, Substitution,
-                                    cache_readonly, deprecate_kwarg)
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 from pandas.core.common import AbstractMethodError
 
 _shared_docs = dict()
@@ -17,7 +16,6 @@
 
 
 class StringMixin(object):
-
     """implements string methods so long as object defines a `__unicode__`
     method.
 
@@ -26,7 +24,7 @@ class StringMixin(object):
     # side note - this could be made into a metaclass if more than one
     # object needs
 
-    #----------------------------------------------------------------------
+    # ----------------------------------------------------------------------
     # Formatting
 
     def __unicode__(self):
@@ -115,7 +113,7 @@ def _reset_cache(self, key=None):
 
     def __sizeof__(self):
         """ Generates the total memory usage for a object that returns
-        either a value or Series of values 
+        either a value or Series of values
         """
         if hasattr(self, 'memory_usage'):
             mem = self.memory_usage(deep=True)
@@ -131,25 +129,27 @@ def __sizeof__(self):
 
 class NoNewAttributesMixin(object):
     """Mixin which prevents adding new attributes.
 
-    Prevents additional attributes via xxx.attribute = "something" after a call to
-    `self.__freeze()`. Mainly used to prevent the user from using wrong attrirbutes
-    on a accessor (`Series.cat/.str/.dt`).
+    Prevents additional attributes via xxx.attribute = "something" after a
+    call to `self.__freeze()`. Mainly used to prevent the user from using
+    wrong attributes on an accessor (`Series.cat/.str/.dt`).
 
-     If you really want to add a new attribute at a later time, you need to use
-     `object.__setattr__(self, key, value)`.
-     """
+    If you really want to add a new attribute at a later time, you need to use
+    `object.__setattr__(self, key, value)`.
+    """
 
     def _freeze(self):
         """Prevents setting additional attributes"""
         object.__setattr__(self, "__frozen", True)
 
-    # prevent adding any attribute via s.xxx.new_attribute = ...
     def __setattr__(self, key, value):
         # _cache is used by a decorator
-        # dict lookup instead of getattr as getattr is false for getter which error
-        if getattr(self, "__frozen", False) and not (key in type(self).__dict__ or key == "_cache"):
-            raise AttributeError( "You cannot add any new attribute '{key}'".format(key=key))
+        # dict lookup instead of getattr as getattr is false for getter
+        # which error
+        if getattr(self, "__frozen", False) and not \
+                (key in type(self).__dict__ or key == "_cache"):
+            raise AttributeError("You cannot add any new attribute '{key}'".
+                                 format(key=key))
         object.__setattr__(self, key, value)
 
 
@@ -157,7 +157,8 @@ class PandasDelegate(PandasObject):
     """ an abstract base class for delegating methods/properties """
 
     def _delegate_property_get(self, name, *args, **kwargs):
-        raise TypeError("You cannot access the property {name}".format(name=name))
+        raise TypeError("You cannot access the "
+                        "property {name}".format(name=name))
 
     def _delegate_property_set(self, name, value, *args, **kwargs):
         raise TypeError("The property {name} cannot be set".format(name=name))
@@ -166,7 +167,8 @@ def _delegate_method(self, name, *args, **kwargs):
         raise TypeError("You cannot call method {name}".format(name=name))
 
     @classmethod
-    def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False):
+    def _add_delegate_accessors(cls, delegate, accessors, typ,
+                                overwrite=False):
         """
         add accessors to cls from the delegate class
 
@@ -178,20 +180,21 @@ def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False):
         typ : 'property' or 'method'
         overwrite : boolean, default False
             overwrite the method/property in the target class if it exists
-
         """
 
         def _create_delegator_property(name):
 
             def _getter(self):
                 return self._delegate_property_get(name)
+
             def _setter(self, new_values):
                 return self._delegate_property_set(name, new_values)
 
             _getter.__name__ = name
             _setter.__name__ = name
 
-            return property(fget=_getter, fset=_setter, doc=getattr(delegate,name).__doc__)
+            return property(fget=_getter, fset=_setter,
+                            doc=getattr(delegate, name).__doc__)
 
         def _create_delegator_method(name):
 
@@ -199,7 +202,7 @@ def f(self, *args, **kwargs):
                 return self._delegate_method(name, *args, **kwargs)
 
             f.__name__ = name
-            f.__doc__ = getattr(delegate,name).__doc__
+            f.__doc__ = getattr(delegate, name).__doc__
 
             return f
 
@@ -212,7 +215,7 @@ def f(self, *args, **kwargs):
 
             # don't overwrite existing methods/properties
             if overwrite or not hasattr(cls, name):
-                setattr(cls,name,f)
+                setattr(cls, name, f)
 
 
 class AccessorProperty(object):
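
The delegator factories rewrapped above follow a simple pattern; a condensed, self-contained sketch of the same idea (Delegate, Accessor and _make_property are illustrative names, not pandas API):

    class Delegate(object):
        def __init__(self):
            self.x = 1

    def _make_property(name):
        def _getter(self):
            return getattr(self._delegate, name)

        def _setter(self, new_values):
            setattr(self._delegate, name, new_values)

        _getter.__name__ = _setter.__name__ = name
        return property(fget=_getter, fset=_setter)

    class Accessor(object):
        x = _make_property('x')

        def __init__(self):
            self._delegate = Delegate()

    a = Accessor()
    a.x = 5                      # forwarded to the Delegate instance
    assert a._delegate.x == 5
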
@@ -250,17 +253,17 @@ class SpecificationError(GroupByError):
 
 class SelectionMixin(object):
     """
-    mixin implementing the selection & aggregation interface on a
-    group-like object
-    sub-classes need to define: obj, exclusions
+    mixin implementing the selection & aggregation interface on a group-like
+    object sub-classes need to define: obj, exclusions
     """
     _selection = None
-    _internal_names = ['_cache','__setstate__']
+    _internal_names = ['_cache', '__setstate__']
     _internal_names_set = set(_internal_names)
     _builtin_table = {
         builtins.sum: np.sum,
         builtins.max: np.max,
-        builtins.min: np.min,
-    }
+        builtins.min: np.min
+    }
     _cython_table = {
         builtins.sum: 'sum',
         builtins.max: 'max',
@@ -275,7 +278,7 @@ class SelectionMixin(object):
         np.min: 'min',
         np.cumprod: 'cumprod',
         np.cumsum: 'cumsum'
-        }
+    }
 
     @property
     def name(self):
@@ -286,7 +289,8 @@ def name(self):
 
     @property
     def _selection_list(self):
-        if not isinstance(self._selection, (list, tuple, com.ABCSeries, com.ABCIndex, np.ndarray)):
+        if not isinstance(self._selection, (list, tuple, com.ABCSeries,
+                                            com.ABCIndex, np.ndarray)):
             return [self._selection]
         return self._selection
 
@@ -300,7 +304,8 @@ def _selected_obj(self):
 
     @cache_readonly
     def _obj_with_exclusions(self):
-        if self._selection is not None and isinstance(self.obj, com.ABCDataFrame):
+        if self._selection is not None and isinstance(self.obj,
+                                                      com.ABCDataFrame):
             return self.obj.reindex(columns=self._selection_list)
 
         if len(self.exclusions) > 0:
@@ -312,14 +317,15 @@ def __getitem__(self, key):
         if self._selection is not None:
             raise Exception('Column(s) %s already selected' % self._selection)
 
-        if isinstance(key, (list, tuple, com.ABCSeries, com.ABCIndex, np.ndarray)):
+        if isinstance(key, (list, tuple, com.ABCSeries, com.ABCIndex,
+                            np.ndarray)):
             if len(self.obj.columns.intersection(key)) != len(key):
                 bad_keys = list(set(key).difference(self.obj.columns))
                 raise KeyError("Columns not found: %s" % str(bad_keys)[1:-1])
             return self._gotitem(list(key), ndim=2)
 
-        elif not getattr(self,'as_index',False):
+        elif not getattr(self, 'as_index', False):
             if key not in self.obj.columns:
                 raise KeyError("Column not found: %s" % key)
             return self._gotitem(key, ndim=2)
@@ -345,7 +351,8 @@ def _gotitem(self, key, ndim, subset=None):
         """
         raise AbstractMethodError(self)
 
-    _agg_doc = """Aggregate using input function or dict of {column -> function}
+    _agg_doc = """Aggregate using input function or dict of {column ->
+function}
 
     Parameters
     ----------
@@ -395,7 +402,6 @@ def _aggregate(self, arg, *args, **kwargs):
         *args : args to pass on to the function
         **kwargs : kwargs to pass on to the function
 
-
         Returns
         -------
         tuple of result, how
@@ -406,7 +412,7 @@ def _aggregate(self, arg, *args, **kwargs):
             None if not required
         """
 
-        _level = kwargs.pop('_level',None)
+        _level = kwargs.pop('_level', None)
         if isinstance(arg, compat.string_types):
             return getattr(self, arg)(*args, **kwargs), None
 
@@ -431,7 +437,8 @@ def _aggregate(self, arg, *args, **kwargs):
                     subset = obj
 
                 for fname, agg_how in compat.iteritems(arg):
-                    colg = self._gotitem(self._selection, ndim=1, subset=subset)
+                    colg = self._gotitem(self._selection, ndim=1,
+                                         subset=subset)
                     result[fname] = colg.aggregate(agg_how, _level=None)
                     keys.append(fname)
             else:
@@ -442,7 +449,7 @@ def _aggregate(self, arg, *args, **kwargs):
 
             if isinstance(list(result.values())[0], com.ABCDataFrame):
                 from pandas.tools.merge import concat
-                result = concat([ result[k] for k in keys ], keys=keys, axis=1)
+                result = concat([result[k] for k in keys], keys=keys, axis=1)
             else:
                 from pandas import DataFrame
                 result = DataFrame(result)
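
For context, the dict branch rewrapped in _aggregate() above is what backs the {column -> function} form documented in _agg_doc; a small usage sketch with made-up data:

    import pandas as pd

    df = pd.DataFrame({'key': [0, 0, 1], 'a': [1, 2, 3], 'b': [4, 5, 6]})
    out = df.groupby('key').agg({'a': 'sum', 'b': 'max'})
    # one aggregated column per entry in the dict
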
@@ -475,7 +482,7 @@ def _aggregate_multiple_funcs(self, arg, _level):
         keys = []
 
         # degenerate case
-        if obj.ndim==1:
+        if obj.ndim == 1:
             for a in arg:
                 try:
                     colg = self._gotitem(obj.name, ndim=1, subset=obj)
@@ -518,6 +525,7 @@ def _is_builtin_func(self, arg):
         """
         return self._builtin_table.get(arg, arg)
 
+
 class FrozenList(PandasObject, list):
 
     """
@@ -585,6 +593,7 @@ def __repr__(self):
     __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
     pop = append = extend = remove = sort = insert = _disabled
 
+
 class FrozenNDArray(PandasObject, np.ndarray):
 
     # no __array_finalize__ for now because no metadata
@@ -623,7 +632,9 @@ def __unicode__(self):
 
 
 class IndexOpsMixin(object):
-    """ common ops mixin to support a unified inteface / docs for Series / Index """
+    """ common ops mixin to support a unified interface / docs for Series /
+    Index
+    """
 
     # ndarray compatibility
     __array_priority__ = 1000
@@ -632,7 +643,8 @@ def transpose(self):
         """ return the transpose, which is by definition self """
         return self
 
-    T = property(transpose, doc="return the transpose, which is by definition self")
+    T = property(transpose, doc="return the transpose, which is by "
+                                "definition self")
 
     @property
     def shape(self):
@@ -641,11 +653,15 @@ def shape(self):
 
     @property
     def ndim(self):
-        """ return the number of dimensions of the underlying data, by definition 1 """
+        """ return the number of dimensions of the underlying data,
+        by definition 1
+        """
         return 1
 
     def item(self):
-        """ return the first element of the underlying data as a python scalar """
+        """ return the first element of the underlying data as a python
+        scalar
+        """
         try:
             return self.values.item()
         except IndexError:
@@ -685,7 +701,9 @@ def flags(self):
 
     @property
     def base(self):
-        """ return the base object if the memory of the underlying data is shared """
+        """ return the base object if the memory of the underlying data is
+        shared
+        """
         return self.values.base
 
     @property
@@ -729,9 +747,10 @@ def hasnans(self):
     def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                 filter_type=None, **kwds):
         """ perform the reduction type operation if we can """
-        func = getattr(self,name,None)
+        func = getattr(self, name, None)
         if func is None:
-            raise TypeError("{klass} cannot perform the operation {op}".format(klass=self.__class__.__name__,op=name))
+            raise TypeError("{klass} cannot perform the operation {op}".format(
+                klass=self.__class__.__name__, op=name))
         return func(**kwds)
 
     def value_counts(self, normalize=False, sort=True, ascending=False,
@@ -787,7 +806,7 @@ def unique(self):
         """
         from pandas.core.nanops import unique1d
         values = self.values
-        if hasattr(values,'unique'):
+        if hasattr(values, 'unique'):
             return values.unique()
 
         return unique1d(values)
@@ -836,7 +855,7 @@ def memory_usage(self, deep=False):
         --------
         numpy.ndarray.nbytes
         """
-        if hasattr(self.values,'memory_usage'):
+        if hasattr(self.values, 'memory_usage'):
             return self.values.memory_usage(deep=deep)
 
         v = self.values.nbytes
@@ -866,9 +885,9 @@ def factorize(self, sort=False, na_sentinel=-1):
 
     def searchsorted(self, key, side='left'):
         """ np.ndarray searchsorted compat """
-        ### FIXME in GH7447
-        #### needs coercion on the key (DatetimeIndex does alreay)
-        #### needs tests/doc-string
+        # FIXME in GH7447
+        # needs coercion on the key (DatetimeIndex does already)
+        # needs tests/doc-string
         return self.values.searchsorted(key, side=side)
 
     _shared_docs['drop_duplicates'] = (
@@ -889,7 +908,8 @@ def searchsorted(self, key, side='left'):
         deduplicated : %(klass)s
         """)
 
-    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
+                                                   False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
     def drop_duplicates(self, keep='first', inplace=False):
         duplicated = self.duplicated(keep=keep)
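
Not part of the patch: IndexOpsMixin.searchsorted above simply defers to the underlying ndarray, so numpy semantics apply; illustrative data:

    import pandas as pd

    s = pd.Series([1, 3, 5, 7])
    s.searchsorted(4)  # insertion position 2 keeps the values sorted
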
@@ -905,8 +925,10 @@ def drop_duplicates(self, keep='first', inplace=False):
         Parameters
         ----------
         keep : {'first', 'last', False}, default 'first'
-            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
-            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - ``first`` : Mark duplicates as ``True`` except for the first
+              occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last
+              occurrence.
             - False : Mark all duplicates as ``True``.
         take_last : deprecated
 
@@ -915,7 +937,8 @@ def drop_duplicates(self, keep='first', inplace=False):
         duplicated : %(duplicated)s
         """)
 
-    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
+                                                   False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
     def duplicated(self, keep='first'):
         keys = com._values_from_object(com._ensure_object(self.values))
@@ -926,7 +949,7 @@ def duplicated(self, keep='first'):
         except AttributeError:
             return np.array(duplicated, dtype=bool)
 
-    #----------------------------------------------------------------------
+    # ----------------------------------------------------------------------
     # abstracts
 
     def _update_inplace(self, result, **kwargs):
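
The keep semantics documented in the duplicated docstring above, shown on made-up data (not part of the patch):

    import pandas as pd

    s = pd.Series([1, 1, 2])
    s.duplicated(keep='first').tolist()  # [False, True, False]
    s.duplicated(keep='last').tolist()   # [True, False, False]
    s.duplicated(keep=False).tolist()    # [True, True, False]
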
diff --git a/pandas/core/dtypes.py b/pandas/core/dtypes.py
index 69957299aa9bb..1e358694de63e 100644
--- a/pandas/core/dtypes.py
+++ b/pandas/core/dtypes.py
@@ -4,6 +4,7 @@
 import numpy as np
 from pandas import compat
 
+
 class ExtensionDtype(object):
     """
     A np.dtype duck-typed class, suitable for holding a custom dtype.
@@ -60,17 +61,21 @@ def __repr__(self):
         return str(self)
 
     def __hash__(self):
-        raise NotImplementedError("sub-classes should implement an __hash__ method")
+        raise NotImplementedError("sub-classes should implement an __hash__ "
+                                  "method")
 
     def __eq__(self, other):
-        raise NotImplementedError("sub-classes should implement an __eq__ method")
+        raise NotImplementedError("sub-classes should implement an __eq__ "
+                                  "method")
 
     def __ne__(self, other):
         return not self.__eq__(other)
 
     @classmethod
     def is_dtype(cls, dtype):
-        """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) """
+        """ Return a boolean if the passed type is an actual dtype that
+        we can match (via string or type)
+        """
         if hasattr(dtype, 'dtype'):
             dtype = dtype.dtype
         if isinstance(dtype, cls):
@@ -82,16 +87,19 @@ def is_dtype(cls, dtype):
         except:
             return False
 
+
 class CategoricalDtypeType(type):
     """
     the type of CategoricalDtype, this metaclass determines subclass ability
     """
     pass
 
+
 class CategoricalDtype(ExtensionDtype):
     """
-    A np.dtype duck-typed class, suitable for holding a custom categorical dtype.
+    A np.dtype duck-typed class, suitable for holding a custom categorical
+    dtype.
 
     THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object
     """
@@ -113,7 +121,8 @@ def __eq__(self, other):
 
     @classmethod
     def construct_from_string(cls, string):
-        """ attempt to construct this type from a string, raise a TypeError if its not possible """
+        """ attempt to construct this type from a string, raise a TypeError if
+        it's not possible """
         try:
             if string == 'category':
                 return cls()
@@ -122,25 +131,29 @@ def construct_from_string(cls, string):
 
         raise TypeError("cannot construct a CategoricalDtype")
 
+
 class DatetimeTZDtypeType(type):
     """
     the type of DatetimeTZDtype, this metaclass determines subclass ability
     """
     pass
 
+
 class DatetimeTZDtype(ExtensionDtype):
     """
-    A np.dtype duck-typed class, suitable for holding a custom datetime with tz dtype.
+    A np.dtype duck-typed class, suitable for holding a custom datetime with tz
+    dtype.
 
-    THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.datetime64[ns]
+    THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of
+    np.datetime64[ns]
     """
     type = DatetimeTZDtypeType
     kind = 'M'
     str = '|M8[ns]'
     num = 101
     base = np.dtype('M8[ns]')
-    _metadata = ['unit','tz']
+    _metadata = ['unit', 'tz']
     _match = re.compile("(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
 
     def __init__(self, unit, tz=None):
@@ -167,7 +180,8 @@ def __init__(self, unit, tz=None):
             except:
                 raise ValueError("could not construct DatetimeTZDtype")
 
-            raise ValueError("DatetimeTZDtype constructor must have a tz supplied")
+            raise ValueError("DatetimeTZDtype constructor must have a tz "
+                             "supplied")
 
         if unit != 'ns':
             raise ValueError("DatetimeTZDtype only supports ns units")
@@ -176,7 +190,9 @@ def __init__(self, unit, tz=None):
 
     @classmethod
     def construct_from_string(cls, string):
-        """ attempt to construct this type from a string, raise a TypeError if its not possible """
+        """ attempt to construct this type from a string, raise a TypeError if
+        it's not possible
+        """
         try:
             return cls(unit=string)
         except ValueError:
@@ -198,4 +214,5 @@ def __eq__(self, other):
         if isinstance(other, compat.string_types):
             return other == self.name
 
-        return isinstance(other, DatetimeTZDtype) and self.unit == other.unit and self.tz == other.tz
+        return isinstance(other, DatetimeTZDtype) and self.unit == other.unit \
+            and self.tz == other.tz
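
A quick smoke test (not part of the patch) of the construct_from_string round-trip that the dtypes.py hunks above reformat; the module path and behaviour assume the pandas era this patch targets (pandas.core.dtypes was later reorganised):

    from pandas.core.dtypes import CategoricalDtype, DatetimeTZDtype

    assert CategoricalDtype.construct_from_string('category') == \
        CategoricalDtype()
    dt = DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]')
    assert (dt.unit, dt.tz) == ('ns', 'US/Eastern')  # parsed via _match
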