|
19 | 19 | import pandas.core.common as com
|
20 | 20 | import pandas.core.datetools as datetools
|
21 | 21 | from pandas import compat, _np_version_under1p7
|
22 |
| -from pandas.compat import map, zip, lrange, string_types, isidentifier |
| 22 | +from pandas.compat import map, zip, lrange, string_types, isidentifier, lmap |
23 | 23 | from pandas.core.common import (isnull, notnull, is_list_like,
|
24 | 24 | _values_from_object, _maybe_promote, _maybe_box_datetimelike,
|
25 | 25 | ABCSeries, SettingWithCopyError, SettingWithCopyWarning)
|
@@ -3478,6 +3478,154 @@ def _convert_timedeltas(x):
|
3478 | 3478 |
|
3479 | 3479 | return np.abs(self)
|
3480 | 3480 |
|
# Docstring template for ``describe``.  The ``%(klass)s`` placeholder is
# filled by ``%``-formatting with ``_shared_doc_kwargs`` at the point where
# the template is attached to the method through ``Appender``.
_shared_docs['describe'] = """
    Generate various summary statistics, excluding NaN values.

    Parameters
    ----------
    percentile_width : float, deprecated
        The ``percentile_width`` argument will be removed in a future
        version. Use ``percentiles`` instead.
        width of the desired uncertainty interval, default is 50,
        which corresponds to lower=25, upper=75
    percentiles : array-like, optional
        The percentiles to include in the output. Should all
        be in the interval [0, 1]. By default `percentiles` is
        [.25, .5, .75], returning the 25th, 50th, and 75th percentiles.

    Returns
    -------
    summary: %(klass)s of summary statistics

    Notes
    -----
    For numeric dtypes the index includes: count, mean, std, min,
    max, and lower, 50, and upper percentiles.

    If self is of object dtypes (e.g. timestamps or strings), the output
    will include the count, unique, most common, and frequency of the
    most common. Timestamps also include the first and last items.

    If multiple values have the highest count, then the
    `count` and `most common` pair will be arbitrarily chosen from
    among those with the highest count.
    """
| 3513 | + |
# Build summary statistics for a 1-D or 2-D object.
#
# The user-facing documentation is supplied by the ``Appender`` decorator
# from the shared ``describe`` template, so no docstring is written here.
#
# Raises NotImplementedError when ``self.ndim >= 3`` and ValueError when
# both ``percentile_width`` and ``percentiles`` are given, or when any
# requested percentile is greater than 1.
@Appender(_shared_docs['describe'] % _shared_doc_kwargs)
def describe(self, percentile_width=None, percentiles=None):
    if self.ndim >= 3:
        msg = "describe is not implemented on Panel or PanelND objects."
        raise NotImplementedError(msg)

    if percentile_width is not None and percentiles is not None:
        msg = "Cannot specify both 'percentile_width' and 'percentiles.'"
        raise ValueError(msg)

    if percentiles is not None:
        # get them all to be in [0, 1]
        percentiles = np.asarray(percentiles)
        if (percentiles > 1).any():
            # Rescale only to suggest a corrected argument in the message.
            percentiles = percentiles / 100.0
            msg = ("percentiles should all be in the interval [0, 1]. "
                   "Try {0} instead.")
            raise ValueError(msg.format(list(percentiles)))
    else:
        # only warn if they change the default
        do_warn = percentile_width is not None
        percentile_width = percentile_width or 50
        # Symmetric interval of the requested width around the median.
        lb = .5 * (1. - percentile_width / 100.)
        ub = 1. - lb
        percentiles = np.array([lb, 0.5, ub])
        if do_warn:
            msg = ("The `percentile_width` keyword is deprecated. "
                   "Use percentiles={0} instead".format(list(percentiles)))
            # stacklevel=2 attributes the warning to the calling frame.
            warnings.warn(msg, FutureWarning, stacklevel=2)

    # median should always be included
    if (percentiles != 0.5).all():  # median isn't included
        lh = percentiles[percentiles < .5]
        uh = percentiles[percentiles > .5]
        percentiles = np.hstack([lh, 0.5, uh])

    # dtypes: numeric only, numeric mixed, objects only
    data = self._get_numeric_data()
    if self.ndim > 1:
        # No numeric entries on the info axis: use the object summary.
        is_object = len(data._info_axis) == 0
    else:
        is_object = not self._is_numeric_mixed_type

    def pretty_name(x):
        # Label a fraction as a percentage, e.g. 0.25 -> '25%'; one decimal
        # place is kept when the percentage is not a whole number.
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, percentiles):
        # Values are ordered to line up with ``stat_index`` built below.
        return ([series.count(), series.mean(), series.std(),
                 series.min()] +
                [series.quantile(x) for x in percentiles] +
                [series.max()])

    def describe_categorical_1d(data):
        # NOTE(review): only object and datetime64 dtypes are handled; any
        # other dtype leaves ``names``/``result`` unbound and fails at the
        # return statement below.
        if data.dtype == object:
            names = ['count', 'unique']
            objcounts = data.value_counts()
            result = [data.count(), len(objcounts)]
            if result[1] > 0:
                names += ['top', 'freq']
                top, freq = objcounts.index[0], objcounts.iloc[0]
                result += [top, freq]

        elif issubclass(data.dtype.type, np.datetime64):
            names = ['count', 'unique']
            # Count on the int64 view of the non-null values.
            asint = data.dropna().values.view('i8')
            objcounts = compat.Counter(asint)
            result = [data.count(), len(objcounts)]
            if result[1] > 0:
                top, freq = objcounts.most_common(1)[0]
                names += ['first', 'last', 'top', 'freq']
                result += [lib.Timestamp(asint.min()),
                           lib.Timestamp(asint.max()),
                           lib.Timestamp(top), freq]

        return pd.Series(result, index=names)

    if is_object:
        if data.ndim == 1:
            return describe_categorical_1d(self)
        else:
            result = pd.DataFrame(dict((k, describe_categorical_1d(v))
                                       for k, v in compat.iteritems(self)),
                                  columns=self._info_axis,
                                  index=['count', 'unique', 'first', 'last',
                                         'top', 'freq'])
            # just objects, no datetime
            if pd.isnull(result.loc['first']).all():
                result = result.drop(['first', 'last'], axis=0)
            return result
    else:
        stat_index = (['count', 'mean', 'std', 'min'] +
                      [pretty_name(x) for x in percentiles] +
                      ['max'])
        if data.ndim == 1:
            return pd.Series(describe_numeric_1d(data, percentiles),
                             index=stat_index)
        else:
            # Columns are taken by position, as in the original loop
            # (presumably so repeated labels still yield a single column
            # each -- confirm).
            destat = [describe_numeric_1d(data.iloc[:, i], percentiles)
                      for i in range(len(data._info_axis))]

            # ``destat`` holds one list per column; transpose it so that
            # each row corresponds to one entry of ``stat_index``.
            return self._constructor(lmap(list, zip(*destat)),
                                     index=stat_index,
                                     columns=data._info_axis)
| 3628 | + |
3481 | 3629 | _shared_docs['pct_change'] = """
|
3482 | 3630 | Percent change over given number of periods.
|
3483 | 3631 |
|
|
0 commit comments