diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 13d61957eea00..d7b16eda3495b 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -606,6 +606,9 @@ Bug Fixes - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`) +- Bug in ``Categorical`` which may not be represented properly when its categories contain ``tz`` or ``Period`` values (:issue:`10713`) +- Bug in ``Categorical.__iter__`` which may not return correct ``datetime`` and ``Period`` values (:issue:`10713`) + - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). - Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b0d564caa5826..c9e30ea31dab8 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -12,7 +12,7 @@ import pandas.core.common as com from pandas.util.decorators import cache_readonly, deprecate_kwarg -from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, +from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype, is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds, @@ -1053,15 +1053,12 @@ def get_values(self): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.categories.dtype or dtype string if - periods + A numpy array of the same dtype as categorical.categories.dtype or + Index if datetime / periods """ - - # if we are a period index, return a string repr - if isinstance(self.categories, ABCPeriodIndex): - return take_1d(np.array(self.categories.to_native_types(), dtype=object), - self._codes) - + # if we are a datetime and period index, return Index to keep metadata + if
com.is_datetimelike(self.categories): + return self.categories.take(self._codes) return np.array(self) def check_for_ordered(self, op): @@ -1308,7 +1305,7 @@ def __len__(self): def __iter__(self): """Returns an Iterator over the values of this Categorical.""" - return iter(np.array(self)) + return iter(self.get_values()) def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default footer) """ @@ -1328,7 +1325,7 @@ def _repr_categories(self): max_categories = (10 if get_option("display.max_categories") == 0 else get_option("display.max_categories")) from pandas.core import format as fmt - category_strs = fmt.format_array(self.categories.get_values(), None) + category_strs = fmt.format_array(self.categories, None) if len(category_strs) > max_categories: num = max_categories // 2 head = category_strs[:num] @@ -1343,8 +1340,9 @@ def _repr_categories_info(self): """ Returns a string representation of the footer.""" category_strs = self._repr_categories() - levheader = "Categories (%d, %s): " % (len(self.categories), - self.categories.dtype) + dtype = getattr(self.categories, 'dtype_str', str(self.categories.dtype)) + + levheader = "Categories (%d, %s): " % (len(self.categories), dtype) width, height = get_terminal_size() max_width = get_option("display.width") or width if com.in_ipython_frontend(): @@ -1352,13 +1350,14 @@ def _repr_categories_info(self): max_width = 0 levstring = "" start = True - cur_col_len = len(levheader) + cur_col_len = len(levheader) # header sep_len, sep = (3, " < ") if self.ordered else (2, ", ") + linesep = sep.rstrip() + "\n" # remove whitespace for val in category_strs: if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: - levstring += "\n" + (" "* len(levheader)) - cur_col_len = len(levheader) - if not start: + levstring += linesep + (" " * (len(levheader) + 1)) + cur_col_len = len(levheader) + 1 # header + a whitespace + elif not start: levstring += sep cur_col_len += 
len(val) levstring += val diff --git a/pandas/core/format.py b/pandas/core/format.py index a18d0cfa6f195..4ec4375349764 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -207,7 +207,7 @@ def _get_formatted_index(self): return fmt_index, have_header def _get_formatted_values(self): - return format_array(self.tr_series.get_values(), None, + return format_array(self.tr_series.values, None, float_format=self.float_format, na_rep=self.na_rep) @@ -681,7 +681,7 @@ def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) return format_array( - (frame.iloc[:, i]).get_values(), + frame.iloc[:, i].values, formatter, float_format=self.float_format, na_rep=self.na_rep, space=self.col_space ) @@ -1895,8 +1895,13 @@ def get_formatted_cells(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right'): - if com.is_float_dtype(values.dtype): + + if com.is_categorical_dtype(values): + fmt_klass = CategoricalArrayFormatter + elif com.is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter + elif com.is_period_arraylike(values): + fmt_klass = PeriodArrayFormatter elif com.is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter elif com.is_datetime64_dtype(values.dtype): @@ -1963,6 +1968,8 @@ def _format(x): return '%s' % formatter(x) vals = self.values + if isinstance(vals, Index): + vals = vals.values is_float = lib.map_infer(vals, com.is_float) & notnull(vals) leading_space = is_float.any() @@ -2076,8 +2083,30 @@ def _format_strings(self): values = values.asobject is_dates_only = _is_dates_only(values) formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format)) - fmt_values = [ formatter(x) for x in self.values ] + fmt_values = [ formatter(x) for x in values ] + + return fmt_values + +class PeriodArrayFormatter(IntArrayFormatter): + + def _format_strings(self): + values = np.array(self.values.to_native_types(), dtype=object) + 
formatter = self.formatter or (lambda x: '%s' % x) + fmt_values = [formatter(x) for x in values] + return fmt_values + + +class CategoricalArrayFormatter(GenericArrayFormatter): + + def __init__(self, values, *args, **kwargs): + GenericArrayFormatter.__init__(self, values, *args, **kwargs) + + def _format_strings(self): + fmt_values = format_array(self.values.get_values(), self.formatter, + float_format=self.float_format, + na_rep=self.na_rep, digits=self.digits, + space=self.space, justify=self.justify) return fmt_values diff --git a/pandas/core/index.py b/pandas/core/index.py index ce6c60df2fd94..5b57d602d7e41 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -276,6 +276,11 @@ def dtype(self): """ return the dtype object of the underlying data """ return self._data.dtype + @cache_readonly + def dtype_str(self): + """ return the dtype str of the underlying data """ + return str(self.dtype) + @property def values(self): """ return the underlying data as an ndarray """ @@ -2994,6 +2999,10 @@ def equals(self, other): return False + @property + def _formatter_func(self): + return self.categories._formatter_func + def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a065d03d4ad72..680b370cbca41 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1736,6 +1736,582 @@ def test_repr(self): "Categories (26, object): [a < b < c < d ... 
w < x < y < z]") self.assertEqual(exp,a.__unicode__()) + def test_categorical_repr(self): + c = pd.Categorical([1, 2 ,3]) + exp = """[1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2 ,3, 1, 2 ,3], categories=[1, 2, 3]) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2, 3, 4, 5] * 10) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1, 2, 3, 4, 5]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(np.arange(20)) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_ordered(self): + c = pd.Categorical([1, 2 ,3], ordered=True) + exp = """[1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2 ,3, 1, 2 ,3], categories=[1, 2, 3], ordered=True) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2, 3, 4, 5] * 10, ordered=True) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1 < 2 < 3 < 4 < 5]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(np.arange(20), ordered=True) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0 < 1 < 2 < 3 ... 
16 < 17 < 18 < 19]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_datetime(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + c = pd.Categorical(idx) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + def 
test_categorical_repr_datetime_ordered(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + def 
test_categorical_repr_period(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + c = pd.Categorical(idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_period_ordered(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, 
period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_timedelta(self): + idx = pd.timedelta_range('1 days', periods=5) + c = pd.Categorical(idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + self.assertEqual(repr(c), exp) + + idx = pd.timedelta_range('1 hours', periods=20) + c = pd.Categorical(idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 
+Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_timedelta_ordered(self): + idx = pd.timedelta_range('1 days', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + self.assertEqual(repr(c), exp) + + idx = pd.timedelta_range('1 hours', periods=20) + c = pd.Categorical(idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 
16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + def test_categorical_series_repr(self): + s = pd.Series(pd.Categorical([1, 2 ,3])) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1, 2, 3]""" + self.assertEqual(repr(s), exp) + + s = pd.Series(pd.Categorical(np.arange(10))) + exp = """0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_ordered(self): + s = pd.Series(pd.Categorical([1, 2 ,3], ordered=True)) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1 < 2 < 3]""" + self.assertEqual(repr(s), exp) + + s = pd.Series(pd.Categorical(np.arange(10), ordered=True)) + exp = """0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_datetime(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(s), exp) + + def 
test_categorical_series_repr_datetime_ordered(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_period(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_period_ordered(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 
2011-01-01 13:00 +dtype: category +Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_timedelta(self): + idx = pd.timedelta_range('1 days', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + self.assertEqual(repr(s), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + s = pd.Series(pd.Categorical(idx)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, + 8 days 01:00:00, 9 days 01:00:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_timedelta_ordered(self): + idx = pd.timedelta_range('1 days', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + self.assertEqual(repr(s), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 
+dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 < + 8 days 01:00:00 < 9 days 01:00:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_index_repr(self): + idx = pd.CategoricalIndex(pd.Categorical([1, 2 ,3])) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), exp) + + i = pd.CategoricalIndex(pd.Categorical(np.arange(10))) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_ordered(self): + i = pd.CategoricalIndex(pd.Categorical([1, 2 ,3], ordered=True)) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + i = pd.CategoricalIndex(pd.Categorical(np.arange(10), ordered=True)) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_datetime(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + 
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_datetime_ordered(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + i = pd.CategoricalIndex(pd.Categorical(idx.append(idx), ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00', + '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', + '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_period(self): + # test all length + idx = 
pd.period_range('2011-01-01 09:00', freq='H', periods=1) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=2) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=3) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + i = pd.CategoricalIndex(pd.Categorical(idx.append(idx))) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', + '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', + '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 
2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_period_ordered(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_timedelta(self): + idx = pd.timedelta_range('1 days', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_timedelta_ordered(self): + idx = pd.timedelta_range('1 days', periods=5) + i = 
pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_frame(self): + # normal DataFrame + dt = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + p = pd.period_range('2011-01', freq='M', periods=5) + df = pd.DataFrame({'dt': dt, 'p': p}) + exp = """ dt p +0 2011-01-01 09:00:00-05:00 2011-01 +1 2011-01-01 10:00:00-05:00 2011-02 +2 2011-01-01 11:00:00-05:00 2011-03 +3 2011-01-01 12:00:00-05:00 2011-04 +4 2011-01-01 13:00:00-05:00 2011-05""" + + df = pd.DataFrame({'dt': pd.Categorical(dt), 'p': pd.Categorical(p)}) + self.assertEqual(repr(df), exp) + def test_info(self): # make sure it works diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 2699e780f0edb..79ec18c521a00 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -132,6 +132,15 @@ def test_str(self): self.assertTrue("'foo'" in str(idx)) self.assertTrue(idx.__class__.__name__ in str(idx)) + def test_dtype_str(self): + for idx in self.indices.values(): + dtype = idx.dtype_str + self.assertIsInstance(dtype, compat.string_types) + if isinstance(idx, PeriodIndex): + self.assertEqual(dtype, 'period') + else: + 
self.assertEqual(dtype, str(idx.dtype)) + def test_repr_max_seq_item_setting(self): # GH10182 idx = self.create_index() diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 6413ce9cd5a03..f68073fd54025 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -21,6 +21,8 @@ _values_from_object, ABCSeries, is_integer, is_float, is_object_dtype) from pandas import compat +from pandas.util.decorators import cache_readonly + from pandas.lib import Timestamp, Timedelta import pandas.lib as lib import pandas.tslib as tslib @@ -530,6 +532,11 @@ def shift(self, n): values[mask] = tslib.iNaT return PeriodIndex(data=values, name=self.name, freq=self.freq) + @cache_readonly + def dtype_str(self): + """ return the dtype str of the underlying data """ + return self.inferred_type + @property def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous