From 28b7bded23f99b07004c40275a628560e5eb7b76 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 13 Nov 2015 00:34:01 +0100 Subject: [PATCH 1/4] TST: Allow for more than one accessor on a Series `.dt` and `.str` can be available when the Series is of type category, which makes `.cat` available. --- pandas/core/series.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cab231e8fb09c..29e9a81d19cd6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2704,12 +2704,10 @@ def _dir_deletions(self): def _dir_additions(self): rv = set() - # these accessors are mutually exclusive, so break loop when one exists for accessor in self._accessors: try: getattr(self, accessor) rv.add(accessor) - break except AttributeError: pass return rv From 149feeff11284124badc355ffa264b01151b7465 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Thu, 12 Nov 2015 14:15:26 +0100 Subject: [PATCH 2/4] Make .str available for Series of type category with strings If a series is of type category and the underlying Categorical has categories of type string, then make it possible to use the `.str` accessor on such a series. The string methods work on the categories (and are therefore fast if we have only a few categories), but return a Series with a dtype other than category (boolean, string,...), so that it is no different if we use `.str` on a series of type string or of type category. 
--- pandas/core/strings.py | 126 +++++++++++++++++++------------ pandas/tests/test_categorical.py | 72 ++++++++++++++++++ 2 files changed, 148 insertions(+), 50 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f1ff7e2178a04..a8907ac192707 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from pandas.compat import zip from pandas.core.common import (isnull, _values_from_object, is_bool_dtype, is_list_like, - is_categorical_dtype, is_object_dtype) + is_categorical_dtype, is_object_dtype, take_1d) import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin from pandas.util.decorators import Appender, deprecate_kwarg @@ -1003,7 +1003,7 @@ def str_encode(arr, encoding, errors="strict"): def _noarg_wrapper(f, docstring=None, **kargs): def wrapper(self): - result = _na_map(f, self.series, **kargs) + result = _na_map(f, self._data, **kargs) return self._wrap_result(result) wrapper.__name__ = f.__name__ @@ -1017,15 +1017,15 @@ def wrapper(self): def _pat_wrapper(f, flags=False, na=False, **kwargs): def wrapper1(self, pat): - result = f(self.series, pat) + result = f(self._data, pat) return self._wrap_result(result) def wrapper2(self, pat, flags=0, **kwargs): - result = f(self.series, pat, flags=flags, **kwargs) + result = f(self._data, pat, flags=flags, **kwargs) return self._wrap_result(result) def wrapper3(self, pat, na=np.nan): - result = f(self.series, pat, na=na) + result = f(self._data, pat, na=na) return self._wrap_result(result) wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 @@ -1059,8 +1059,11 @@ class StringMethods(NoNewAttributesMixin): >>> s.str.replace('_', '') """ - def __init__(self, series): - self.series = series + def __init__(self, data): + self._is_categorical = is_categorical_dtype(data) + self._data = data.cat.categories if self._is_categorical else data + # save orig to blow up categoricals to the right type + self._orig = data 
self._freeze() def __getitem__(self, key): @@ -1078,7 +1081,15 @@ def __iter__(self): i += 1 g = self.get(i) - def _wrap_result(self, result, **kwargs): + def _wrap_result(self, result, use_codes=True, name=None): + + # for category, we do the stuff on the categories, so blow it up + # to the full series again + # But for some operations, we have to do the stuff on the full values, + # so make it possible to skip this step as the method already did this before + # the transformation... + if use_codes and self._is_categorical: + result = take_1d(result, self._orig.cat.codes) # leave as it is to keep extract and get_dummies results # can be merged to _wrap_result_expand in v0.17 @@ -1088,29 +1099,34 @@ def _wrap_result(self, result, **kwargs): if not hasattr(result, 'ndim'): return result - name = kwargs.get('name') or getattr(result, 'name', None) or self.series.name + name = name or getattr(result, 'name', None) or self._orig.name if result.ndim == 1: - if isinstance(self.series, Index): + if isinstance(self._orig, Index): # if result is a boolean np.array, return the np.array # instead of wrapping it into a boolean Index (GH 8875) if is_bool_dtype(result): return result return Index(result, name=name) - return Series(result, index=self.series.index, name=name) + return Series(result, index=self._orig.index, name=name) else: assert result.ndim < 3 - return DataFrame(result, index=self.series.index) + return DataFrame(result, index=self._orig.index) def _wrap_result_expand(self, result, expand=False): if not isinstance(expand, bool): raise ValueError("expand must be True or False") + # for category, we do the stuff on the categories, so blow it up + # to the full series again + if self._is_categorical: + result = take_1d(result, self._orig.cat.codes) + from pandas.core.index import Index, MultiIndex if not hasattr(result, 'ndim'): return result - if isinstance(self.series, Index): + if isinstance(self._orig, Index): name = getattr(result, 'name', None) # if result 
is a boolean np.array, return the np.array # instead of wrapping it into a boolean Index (GH 8875) @@ -1123,36 +1139,38 @@ def _wrap_result_expand(self, result, expand=False): else: return Index(result, name=name) else: - index = self.series.index + index = self._orig.index if expand: def cons_row(x): if is_list_like(x): return x else: return [ x ] - cons = self.series._constructor_expanddim + cons = self._orig._constructor_expanddim data = [cons_row(x) for x in result] return cons(data, index=index) else: name = getattr(result, 'name', None) - cons = self.series._constructor + cons = self._orig._constructor return cons(result, name=name, index=index) @copy(str_cat) def cat(self, others=None, sep=None, na_rep=None): - result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep) - return self._wrap_result(result) + data = self._orig if self._is_categorical else self._data + result = str_cat(data, others=others, sep=sep, na_rep=na_rep) + return self._wrap_result(result, use_codes=(not self._is_categorical)) + @deprecate_kwarg('return_type', 'expand', mapping={'series': False, 'frame': True}) @copy(str_split) def split(self, pat=None, n=-1, expand=False): - result = str_split(self.series, pat, n=n) + result = str_split(self._data, pat, n=n) return self._wrap_result_expand(result, expand=expand) @copy(str_rsplit) def rsplit(self, pat=None, n=-1, expand=False): - result = str_rsplit(self.series, pat, n=n) + result = str_rsplit(self._data, pat, n=n) return self._wrap_result_expand(result, expand=expand) _shared_docs['str_partition'] = (""" @@ -1203,7 +1221,7 @@ def rsplit(self, pat=None, n=-1, expand=False): 'also': 'rpartition : Split the string at the last occurrence of `sep`'}) def partition(self, pat=' ', expand=True): f = lambda x: x.partition(pat) - result = _na_map(f, self.series) + result = _na_map(f, self._data) return self._wrap_result_expand(result, expand=expand) @Appender(_shared_docs['str_partition'] % {'side': 'last', @@ -1211,45 +1229,45 @@ def 
partition(self, pat=' ', expand=True): 'also': 'partition : Split the string at the first occurrence of `sep`'}) def rpartition(self, pat=' ', expand=True): f = lambda x: x.rpartition(pat) - result = _na_map(f, self.series) + result = _na_map(f, self._data) return self._wrap_result_expand(result, expand=expand) @copy(str_get) def get(self, i): - result = str_get(self.series, i) + result = str_get(self._data, i) return self._wrap_result(result) @copy(str_join) def join(self, sep): - result = str_join(self.series, sep) + result = str_join(self._data, sep) return self._wrap_result(result) @copy(str_contains) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains(self.series, pat, case=case, flags=flags, + result = str_contains(self._data, pat, case=case, flags=flags, na=na, regex=regex) return self._wrap_result(result) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): - result = str_match(self.series, pat, case=case, flags=flags, + result = str_match(self._data, pat, case=case, flags=flags, na=na, as_indexer=as_indexer) return self._wrap_result(result) @copy(str_replace) def replace(self, pat, repl, n=-1, case=True, flags=0): - result = str_replace(self.series, pat, repl, n=n, case=case, + result = str_replace(self._data, pat, repl, n=n, case=case, flags=flags) return self._wrap_result(result) @copy(str_repeat) def repeat(self, repeats): - result = str_repeat(self.series, repeats) + result = str_repeat(self._data, repeats) return self._wrap_result(result) @copy(str_pad) def pad(self, width, side='left', fillchar=' '): - result = str_pad(self.series, width, side=side, fillchar=fillchar) + result = str_pad(self._data, width, side=side, fillchar=fillchar) return self._wrap_result(result) _shared_docs['str_pad'] = (""" @@ -1297,27 +1315,27 @@ def zfill(self, width): ------- filled : Series/Index of objects """ - result = str_pad(self.series, width, side='left', fillchar='0') + result = 
str_pad(self._data, width, side='left', fillchar='0') return self._wrap_result(result) @copy(str_slice) def slice(self, start=None, stop=None, step=None): - result = str_slice(self.series, start, stop, step) + result = str_slice(self._data, start, stop, step) return self._wrap_result(result) @copy(str_slice_replace) def slice_replace(self, start=None, stop=None, repl=None): - result = str_slice_replace(self.series, start, stop, repl) + result = str_slice_replace(self._data, start, stop, repl) return self._wrap_result(result) @copy(str_decode) def decode(self, encoding, errors="strict"): - result = str_decode(self.series, encoding, errors) + result = str_decode(self._data, encoding, errors) return self._wrap_result(result) @copy(str_encode) def encode(self, encoding, errors="strict"): - result = str_encode(self.series, encoding, errors) + result = str_encode(self._data, encoding, errors) return self._wrap_result(result) _shared_docs['str_strip'] = (""" @@ -1332,34 +1350,37 @@ def encode(self, encoding, errors="strict"): @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', method='strip')) def strip(self, to_strip=None): - result = str_strip(self.series, to_strip, side='both') + result = str_strip(self._data, to_strip, side='both') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='left side', method='lstrip')) def lstrip(self, to_strip=None): - result = str_strip(self.series, to_strip, side='left') + result = str_strip(self._data, to_strip, side='left') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='right side', method='rstrip')) def rstrip(self, to_strip=None): - result = str_strip(self.series, to_strip, side='right') + result = str_strip(self._data, to_strip, side='right') return self._wrap_result(result) @copy(str_wrap) def wrap(self, width, **kwargs): - result = str_wrap(self.series, width, **kwargs) + result = str_wrap(self._data, width, **kwargs) return 
self._wrap_result(result) @copy(str_get_dummies) def get_dummies(self, sep='|'): - result = str_get_dummies(self.series, sep) - return self._wrap_result(result) + # we need to cast to Series of strings as only that has all + # methods available for making the dummies... + data = self._orig.astype(str) if self._is_categorical else self._data + result = str_get_dummies(data, sep) + return self._wrap_result(result, use_codes=(not self._is_categorical)) @copy(str_translate) def translate(self, table, deletechars=None): - result = str_translate(self.series, table, deletechars) + result = str_translate(self._data, table, deletechars) return self._wrap_result(result) count = _pat_wrapper(str_count, flags=True) @@ -1369,7 +1390,7 @@ def translate(self, table, deletechars=None): @copy(str_extract) def extract(self, pat, flags=0): - result, name = str_extract(self.series, pat, flags=flags) + result, name = str_extract(self._data, pat, flags=flags) return self._wrap_result(result, name=name) _shared_docs['find'] = (""" @@ -1398,13 +1419,13 @@ def extract(self, pat, flags=0): @Appender(_shared_docs['find'] % dict(side='lowest', method='find', also='rfind : Return highest indexes in each strings')) def find(self, sub, start=0, end=None): - result = str_find(self.series, sub, start=start, end=end, side='left') + result = str_find(self._data, sub, start=start, end=end, side='left') return self._wrap_result(result) @Appender(_shared_docs['find'] % dict(side='highest', method='rfind', also='find : Return lowest indexes in each strings')) def rfind(self, sub, start=0, end=None): - result = str_find(self.series, sub, start=start, end=end, side='right') + result = str_find(self._data, sub, start=start, end=end, side='right') return self._wrap_result(result) def normalize(self, form): @@ -1423,7 +1444,7 @@ def normalize(self, form): """ import unicodedata f = lambda x: unicodedata.normalize(form, compat.u_safe(x)) - result = _na_map(f, self.series) + result = _na_map(f, self._data) 
return self._wrap_result(result) _shared_docs['index'] = (""" @@ -1453,13 +1474,13 @@ def normalize(self, form): @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index', also='rindex : Return highest indexes in each strings')) def index(self, sub, start=0, end=None): - result = str_index(self.series, sub, start=start, end=end, side='left') + result = str_index(self._data, sub, start=start, end=end, side='left') return self._wrap_result(result) @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex', also='index : Return lowest indexes in each strings')) def rindex(self, sub, start=0, end=None): - result = str_index(self.series, sub, start=start, end=end, side='right') + result = str_index(self._data, sub, start=start, end=end, side='right') return self._wrap_result(result) _shared_docs['len'] = (""" @@ -1553,9 +1574,14 @@ class StringAccessorMixin(object): def _make_str_accessor(self): from pandas.core.series import Series from pandas.core.index import Index - if isinstance(self, Series) and not is_object_dtype(self.dtype): - # this really should exclude all series with any non-string values, - # but that isn't practical for performance reasons until we have a + if isinstance(self, Series) and not( + (is_categorical_dtype(self.dtype) and + is_object_dtype(self.values.categories)) or + (is_object_dtype(self.dtype))): + # it's neither a string series not a categorical series with strings + # inside the categories. 
+ # this really should exclude all series with any non-string values (instead of test + # for object dtype), but that isn't practical for performance reasons until we have a # str dtype (GH 9343) raise AttributeError("Can only use .str accessor with string " "values, which use np.object_ dtype in " diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 0da4d0e68621d..beab5754d30c3 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3640,6 +3640,78 @@ def test_cat_accessor_no_new_attributes(self): with tm.assertRaisesRegexp(AttributeError, "You cannot add any new attribute"): c.cat.xlabel = "a" + def test_str_accessor_api_for_categorical(self): + # https://github.com/pydata/pandas/issues/10661 + from pandas.core.strings import StringMethods + s = Series(list('aabb')) + s = s + " " + s + c = s.astype('category') + self.assertIsInstance(c.str, StringMethods) + + # str functions, which need special arguments + special_func_defs = [ + ('cat', (list("zyxw"),), {"sep": ","}), + ('center', (10,), {}), + ('contains', ("a",), {}), + ('count', ("a",), {}), + ('decode', ("UTF-8",), {}), + ('encode', ("UTF-8",), {}), + ('endswith', ("a",), {}), + ('extract', ("([a-z]*) ",), {}), + ('find', ("a",), {}), + ('findall', ("a",), {}), + ('index', (" ",), {}), + ('ljust', (10,), {}), + ('match', ("a"), {}), # deprecated... 
+ ('normalize', ("NFC",), {}), + ('pad', (10,), {}), + ('partition', (" ",), {"expand": False}), # not default + ('partition', (" ",), {"expand": True}), # default + ('repeat', (3,), {}), + ('replace', ("a", "z"), {}), + ('rfind', ("a",), {}), + ('rindex', (" ",), {}), + ('rjust', (10,), {}), + ('rpartition', (" ",), {"expand": False}), # not default + ('rpartition', (" ",), {"expand": True}), # default + ('slice', (0,1), {}), + ('slice_replace', (0,1,"z"), {}), + ('split', (" ",), {"expand":False}), #default + ('split', (" ",), {"expand":True}), # not default + ('startswith', ("a",), {}), + ('wrap', (2,), {}), + ('zfill', (10,), {}) + ] + _special_func_names = [f[0] for f in special_func_defs] + + # * get, join: they need a individual elements of type lists, but + # we can't make a categorical with lists as individual categories. + # -> `s.str.split(" ").astype("category")` will error! + # * `translate` has different interfaces for py2 vs. py3 + _ignore_names = ["get", "join", "translate"] + + str_func_names = [f for f in dir(s.str) if not (f.startswith("_") or + f in _special_func_names or + f in _ignore_names)] + + func_defs = [(f, (), {}) for f in str_func_names] + func_defs.extend(special_func_defs) + + + for func, args, kwargs in func_defs: + res = getattr(c.str, func)(*args, **kwargs) + exp = getattr(s.str, func)(*args, **kwargs) + + if isinstance(res, pd.DataFrame): + tm.assert_frame_equal(res, exp) + else: + tm.assert_series_equal(res, exp) + + invalid = Series([1,2,3]).astype('category') + with tm.assertRaisesRegexp(AttributeError, "Can only use .str accessor with string"): + invalid.str + self.assertFalse(hasattr(invalid, 'str')) + def test_pickle_v0_14_1(self): # we have the name warning From a7c65ed6b74227a9d4a49097d2da28c876e85c1f Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Mon, 16 Nov 2015 16:33:18 +0100 Subject: [PATCH 3/4] Make .dt available for Series of type category with datetimes If a series is a type category and the underlying Categorical 
has categories of type datetime, then make it possible to use the .dt assessor on such a series. The string methods work on the categories (and therefore fast if we have only a few categories), but return a Series with a dtype other than category (integer,...), so that it is no different if we use .dt on a series of type datetime or of type category. --- pandas/tests/test_categorical.py | 75 ++++++++++++++++++++++++++++++++ pandas/tests/test_series.py | 12 ++++- pandas/tseries/common.py | 30 +++++++++---- 3 files changed, 107 insertions(+), 10 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index beab5754d30c3..ac2f9e77c3674 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3712,6 +3712,81 @@ def test_str_accessor_api_for_categorical(self): invalid.str self.assertFalse(hasattr(invalid, 'str')) + def test_dt_accessor_api_for_categorical(self): + # https://github.com/pydata/pandas/issues/10661 + from pandas.tseries.common import Properties + from pandas.tseries.index import date_range, DatetimeIndex + from pandas.tseries.period import period_range, PeriodIndex + from pandas.tseries.tdi import timedelta_range, TimedeltaIndex + + s_dr = Series(date_range('1/1/2015', periods=5, tz="MET")) + c_dr = s_dr.astype("category") + + s_pr = Series(period_range('1/1/2015', freq='D', periods=5)) + c_pr = s_pr.astype("category") + + s_tdr = Series(timedelta_range('1 days','10 days')) + c_tdr = s_tdr.astype("category") + + test_data = [ + ("Datetime", DatetimeIndex._datetimelike_ops, s_dr, c_dr), + ("Period", PeriodIndex._datetimelike_ops, s_pr, c_pr), + ("Timedelta", TimedeltaIndex._datetimelike_ops, s_tdr, c_tdr)] + + self.assertIsInstance(c_dr.dt, Properties) + + special_func_defs = [ + ('strftime', ("%Y-%m-%d",), {}), + ('tz_convert', ("EST",), {}), + #('tz_localize', ("UTC",), {}), + ] + _special_func_names = [f[0] for f in special_func_defs] + + # the series is already localized + 
_ignore_names = ['tz_localize'] + + for name, attr_names, s, c in test_data: + func_names = [f for f in dir(s.dt) if not (f.startswith("_") or + f in attr_names or + f in _special_func_names or + f in _ignore_names)] + + func_defs = [(f, (), {}) for f in func_names] + for f_def in special_func_defs: + if f_def[0] in dir(s.dt): + func_defs.append(f_def) + + for func, args, kwargs in func_defs: + res = getattr(c.dt, func)(*args, **kwargs) + exp = getattr(s.dt, func)(*args, **kwargs) + + if isinstance(res, pd.DataFrame): + tm.assert_frame_equal(res, exp) + elif isinstance(res, pd.Series): + tm.assert_series_equal(res, exp) + else: + tm.assert_numpy_array_equal(res, exp) + + for attr in attr_names: + try: + res = getattr(c.dt, attr) + exp = getattr(s.dt, attr) + except Exception as e: + print(name, attr) + raise e + + if isinstance(res, pd.DataFrame): + tm.assert_frame_equal(res, exp) + elif isinstance(res, pd.Series): + tm.assert_series_equal(res, exp) + else: + tm.assert_numpy_array_equal(res, exp) + + invalid = Series([1,2,3]).astype('category') + with tm.assertRaisesRegexp(AttributeError, "Can only use .dt accessor with datetimelike"): + invalid.dt + self.assertFalse(hasattr(invalid, 'str')) + def test_pickle_v0_14_1(self): # we have the name warning diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index f30481ee17f75..9be0784c709bc 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -362,12 +362,20 @@ def test_tab_completion(self): self.assertTrue('str' not in dir(s)) self.assertTrue('cat' not in dir(s)) - # similiarly for .cat + # similiarly for .cat, but with the twist that str and dt should be there + # if the categories are of that type + # first cat and str s = Series(list('abbcd'), dtype="category") self.assertTrue('cat' in dir(s)) - self.assertTrue('str' not in dir(s)) + self.assertTrue('str' in dir(s)) # as it is a string categorical self.assertTrue('dt' not in dir(s)) + # similar to cat and str + s = 
Series(date_range('1/1/2015', periods=5)).astype("category") + self.assertTrue('cat' in dir(s)) + self.assertTrue('str' not in dir(s)) + self.assertTrue('dt' in dir(s)) # as it is a datetime categorical + def test_binop_maybe_preserve_name(self): # names match, preserve result = self.ts * self.ts diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 171f72d37cdd8..31b5281aa86a6 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -10,8 +10,8 @@ from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, is_datetime_arraylike, is_integer_dtype, is_list_like, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, - get_dtype_kinds) + is_timedelta64_dtype, is_categorical_dtype, + get_dtype_kinds, take_1d) def is_datetimelike(data): """ return a boolean if we can be successfully converted to a datetimelike """ @@ -45,26 +45,36 @@ def maybe_to_datetimelike(data, copy=False): raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) index = data.index + name = data.name + orig = data if is_categorical_dtype(data) else None + if orig is not None: + data = orig.values.categories + if is_datetime64_dtype(data.dtype): - return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=data.name) + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, + orig=orig) elif is_datetime64tz_dtype(data.dtype): - return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer', ambiguous='infer'), index, name=data.name) + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer', ambiguous='infer'), + index, data.name, orig=orig) elif is_timedelta64_dtype(data.dtype): - return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=data.name) + return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, + name=name, orig=orig) else: if is_period_arraylike(data): - 
return PeriodProperties(PeriodIndex(data, copy=copy), index, name=data.name) + return PeriodProperties(PeriodIndex(data, copy=copy), index, name=name, orig=orig) if is_datetime_arraylike(data): - return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=data.name) + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, + name=name, orig=orig) raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) class Properties(PandasDelegate, NoNewAttributesMixin): - def __init__(self, values, index, name): + def __init__(self, values, index, name, orig=None): self.values = values self.index = index self.name = name + self.orig = orig self._freeze() def _delegate_property_get(self, name): @@ -79,6 +89,10 @@ def _delegate_property_get(self, name): elif not is_list_like(result): return result + # blow up if we operate on categories + if self.orig is not None: + result = take_1d(result, self.orig.cat.codes) + # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) From 8020bf54d5c1849e70899967b350c5209fe16d5a Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 13 Nov 2015 00:39:43 +0100 Subject: [PATCH 4/4] DOC: whatsnew and docs for multiple accessors Also add some docs in text.rst to mention the performance gains when using ``s_cat.str`` vs ``s.str``. --- doc/source/categorical.rst | 44 +++++++++++++++++++++++++++++++++ doc/source/text.rst | 16 ++++++++++++ doc/source/whatsnew/v0.17.1.txt | 2 ++ 3 files changed, 62 insertions(+) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 4ba52694980d3..6207366b96f63 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -515,6 +515,50 @@ To get a single value `Series` of type ``category`` pass in a list with a single df.loc[["h"],"cats"] +String and datetime accessors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
versionadded:: 0.17.1 + +The accessors ``.dt`` and ``.str`` will work if the ``s.cat.categories`` are of an appropriate +type: + + +.. ipython:: python + + str_s = pd.Series(list('aabb')) + str_cat = str_s.astype('category') + str_cat.str.contains("a") + + date_s = pd.Series(date_range('1/1/2015', periods=5)) + date_cat = date_s.astype('category') + date_cat.dt.day + +.. note:: + + The returned ``Series`` (or ``DataFrame``) is of the same type as if you used the + ``.str.`` / ``.dt.`` on a ``Series`` of that type (and not of + type ``category``!). + +That means, that the returned values from methods and properties on the accessors of a +``Series`` and the returned values from methods and properties on the accessors of this +``Series`` transformed to one of type `category` will be equal: + +.. ipython:: python + + ret_s = str_s.str.contains("a") + ret_cat = str_cat.str.contains("a") + ret_s.dtype == ret_cat.dtype + ret_s == ret_cat + +.. note:: + + The work is done on the ``categories`` and then a new ``Series`` is constructed. This has + some performance implication if you have a ``Series`` of type string, where lots of elements + are repeated (i.e. the number of unique elements in the ``Series`` is a lot smaller than the + length of the ``Series``). In this case it can be faster to convert the original ``Series`` + to one of type ``category`` and use ``.str.`` or ``.dt.`` on that. + Setting ~~~~~~~ diff --git a/doc/source/text.rst b/doc/source/text.rst index ee4f96b41c7de..68ac82a5383c2 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -63,6 +63,22 @@ and replacing any remaining whitespaces with underscores: df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') df +.. note:: + + If you do a lot of string munging and have a ``Series`` where lots of elements are repeated + (i.e. 
the number of unique elements in the ``Series`` is a lot smaller than the length of the + ``Series``), it can be faster to convert the original ``Series`` to one of type + ``category`` and then use ``.str.`` or ``.dt.`` on that. The + performance difference comes from the fact that, for ``Series`` of type ``category``, the + string operations are done on the ``.categories`` and not on each element of the + ``Series``. Please note that a ``Series`` of type ``category`` with string ``.categories`` has + some limitations in comparison of ``Series`` of type string (e.g. you can't add strings to + each other: ``s + " " + s`` won't work if ``s`` is a ``Series`` of type ``category``). Also, + ``.str`` methods which operate on elements of type ``list`` are not available on such a + ``Series``. If you are interested in having these performance gains on all string ``Series``, + please look at `this bug report `_. + + Splitting and Replacing Strings ------------------------------- diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 046791d4287c9..b8702034cd464 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -65,6 +65,8 @@ Enhancements pd.Index([1, np.nan, 3]).fillna(2) +- Series of type ``"category"`` now make ``.str.<...>`` and ``.dt.<...>`` accessor methods / properties available, if the categories are of that type. (:issue:`10661`) + - ``pivot_table`` now has a ``margins_name`` argument so you can use something other than the default of 'All' (:issue:`3335`) .. _whatsnew_0171.api: