Commit 3038f64

Author: Joshua Bradt (committed)
Merge remote-tracking branch 'upstream/master' into fix-melt
2 parents: e907135 + e884072

43 files changed (+868, -940 lines)

doc/source/whatsnew/v0.20.0.txt

+57, -2

@@ -356,6 +356,59 @@ New Behavior:
     In [11]: index.memory_usage(deep=True)
     Out[11]: 260
 
+.. _whatsnew_0200.api_breaking.groupby_describe:
+
+Groupby Describe Formatting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
+This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
+
+    In [2]: df.groupby('A').describe()
+    Out[2]:
+                    B
+    A
+    1 count  2.000000
+      mean   1.500000
+      std    0.707107
+      min    1.000000
+      25%    1.250000
+      50%    1.500000
+      75%    1.750000
+      max    2.000000
+    2 count  2.000000
+      mean   3.500000
+      std    0.707107
+      min    3.000000
+      25%    3.250000
+      50%    3.500000
+      75%    3.750000
+      max    4.000000
+
+    In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max])
+    Out[3]:
+          B
+       mean       std amin amax
+    A
+    1   1.5  0.707107    1    2
+    2   3.5  0.707107    3    4
+
+New Behavior:
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
+
+    df.groupby('A').describe()
+
+    df.groupby('A').agg([np.mean, np.std, np.min, np.max])
+
 .. _whatsnew_0200.api:
 
 Other API Changes
@@ -371,6 +424,7 @@ Other API Changes
 - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
 - ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
 - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore, ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64``, which also caused loss of precision (:issue:`14064`, :issue:`14305`).
+- Reorganization of timeseries development tests (:issue:`14854`)
 
 .. _whatsnew_0200.deprecations:
 
@@ -385,7 +439,7 @@ Deprecations
 - ``TimedeltaIndex.searchsorted()``, ``DatetimeIndex.searchsorted()``, and ``PeriodIndex.searchsorted()`` have deprecated the ``key`` parameter in favor of ``value`` (:issue:`12662`)
 - ``DataFrame.astype()`` has deprecated the ``raise_on_error`` parameter in favor of ``errors`` (:issue:`14878`)
 - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`)
-
+- importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explicit imports (:issue:`15358`)
 
 .. _whatsnew_0200.prior_deprecations:
 
@@ -484,6 +538,8 @@ Bug Fixes
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
 
 - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
+- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)
+- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)
 - Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`)
 - Bug in ``DataFrame.resample().median()`` if duplicate column names are present (:issue:`14233`)
 
@@ -507,7 +563,6 @@ Bug Fixes
 - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`)
 
 
-- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)
 
 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
 
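A note on the deprecation listed above: a minimal migration sketch, assuming a build that includes this commit (the commented-out import shows the now-deprecated spelling for contrast only):

    import pandas as pd

    # Recommended going forward: use the top-level namespace, which
    # pandas/__init__.py now re-exports from pandas.tools.concat.
    result = pd.concat(
        [pd.DataFrame({'A': [1, 2]}), pd.DataFrame({'A': [3, 4]})],
        ignore_index=True,
    )

    # Deprecated per the whatsnew entry above (GH15358); still importable
    # here, but slated for removal from pandas.tools.merge.
    # from pandas.tools.merge import concat
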
pandas/__init__.py

+4, -2

@@ -42,10 +42,10 @@
 from pandas.sparse.api import *
 from pandas.stats.api import *
 from pandas.tseries.api import *
-from pandas.io.api import *
 from pandas.computation.api import *
 
-from pandas.tools.merge import (merge, concat, ordered_merge,
+from pandas.tools.concat import concat
+from pandas.tools.merge import (merge, ordered_merge,
                                 merge_ordered, merge_asof)
 from pandas.tools.pivot import pivot_table, crosstab
 from pandas.tools.plotting import scatter_matrix, plot_params
@@ -54,6 +54,8 @@
 from pandas.core.reshape import melt
 from pandas.util.print_versions import show_versions
 
+from pandas.io.api import *
+
 # define the testing framework
 import pandas.util.testing
 from pandas.util.nosetester import NoseTester

pandas/core/base.py

+2, -2

@@ -472,7 +472,7 @@ def _aggregate(self, arg, *args, **kwargs):
 
            arg = new_arg
 
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        def _agg_1dim(name, how, subset=None):
            """
@@ -579,7 +579,7 @@ def _agg(arg, func):
            return result, True
 
    def _aggregate_multiple_funcs(self, arg, _level):
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        if self.axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

pandas/core/categorical.py

+1, -1

@@ -1907,7 +1907,7 @@ def describe(self):
        counts = self.value_counts(dropna=False)
        freqs = counts / float(counts.sum())
 
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
        result = concat([counts, freqs], axis=1)
        result.columns = ['counts', 'freqs']
        result.index.name = 'categories'
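
For context, the ``describe`` method touched here stacks the category counts next to their relative frequencies; a small usage sketch (output layout inferred from the surrounding code, not taken from this diff):

    import pandas as pd

    cat = pd.Categorical(['a', 'a', 'b', 'c', 'c', 'c'])

    # Internally this is concat([counts, freqs], axis=1) with columns
    # ['counts', 'freqs'] and the index named 'categories'.
    print(cat.describe())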

pandas/core/frame.py

+5, -4

@@ -4402,7 +4402,7 @@ def append(self, other, ignore_index=False, verify_integrity=False):
            if (self.columns.get_indexer(other.columns) >= 0).all():
                other = other.loc[:, self.columns]
 
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
        if isinstance(other, (list, tuple)):
            to_concat = [self] + other
        else:
@@ -4532,7 +4532,8 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
 
    def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
                     sort=False):
-        from pandas.tools.merge import merge, concat
+        from pandas.tools.merge import merge
+        from pandas.tools.concat import concat
 
        if isinstance(other, Series):
            if other.name is None:
@@ -4636,7 +4637,7 @@ def round(self, decimals=0, *args, **kwargs):
        Series.round
 
        """
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        def _dict_round(df, decimals):
            for col, vals in df.iteritems():
@@ -5306,7 +5307,7 @@ def isin(self, values):
        """
        if isinstance(values, dict):
            from collections import defaultdict
-            from pandas.tools.merge import concat
+            from pandas.tools.concat import concat
            values = defaultdict(list, values)
            return concat((self.iloc[:, [i]].isin(values[col])
                           for i, col in enumerate(self.columns)), axis=1)
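
The last hunk above sits in the dict branch of ``DataFrame.isin``, which matches each column only against the values listed under its own key and concatenates the per-column boolean results; a short illustrative sketch (example values are mine, not from the diff):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

    # Column 'A' is checked against [1, 3] and column 'B' against ['y'];
    # columns missing from the dict default to an empty list of values.
    mask = df.isin({'A': [1, 3], 'B': ['y']})
    print(mask)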

pandas/core/groupby.py

+24, -7

@@ -80,7 +80,6 @@
    'mean', 'sum', 'min', 'max',
    'cumcount',
    'resample',
-    'describe',
    'rank', 'quantile',
    'fillna',
    'mad',
@@ -854,7 +853,7 @@ def _wrap_applied_output(self, *args, **kwargs):
        raise AbstractMethodError(self)
 
    def _concat_objects(self, keys, values, not_indexed_same=False):
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        def reset_identity(values):
            # reset the identities of the components
@@ -1138,6 +1137,16 @@ def ohlc(self):
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))
 
+    @Appender(DataFrame.describe.__doc__)
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def describe(self, **kwargs):
+        self._set_group_selection()
+        result = self.apply(lambda x: x.describe(**kwargs))
+        if self.axis == 1:
+            return result.T
+        return result.unstack()
+
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'):
    def nsmallest(self, n=5, keep='first'):
        return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
 
+    @Appender(Series.describe.__doc__)
+    def describe(self, **kwargs):
+        self._set_group_selection()
+        result = self.apply(lambda x: x.describe(**kwargs))
+        if self.axis == 1:
+            return result.T
+        return result.unstack()
+
    def value_counts(self, normalize=False, sort=True, ascending=False,
                     bins=None, dropna=True):
 
@@ -3507,7 +3524,7 @@ def first_non_None_value(values):
                # still a series
                # path added as of GH 5545
                elif all_indexed_same:
-                    from pandas.tools.merge import concat
+                    from pandas.tools.concat import concat
                    return concat(values)
 
                if not all_indexed_same:
@@ -3540,7 +3557,7 @@ def first_non_None_value(values):
                else:
                    # GH5788 instead of stacking; concat gets the
                    # dtypes correct
-                    from pandas.tools.merge import concat
+                    from pandas.tools.concat import concat
                    result = concat(values, keys=key_index,
                                    names=key_index.names,
                                    axis=self.axis).unstack()
@@ -3588,7 +3605,7 @@ def first_non_None_value(values):
                                not_indexed_same=not_indexed_same)
 
    def _transform_general(self, func, *args, **kwargs):
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        applied = []
        obj = self._obj_with_exclusions
@@ -3980,7 +3997,7 @@ def _iterate_column_groupbys(self):
                               exclusions=self.exclusions)
 
    def _apply_to_column_groupbys(self, func):
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
        return concat(
            (func(col_groupby) for _, col_groupby
             in self._iterate_column_groupbys()),
@@ -4061,7 +4078,7 @@ def groupby_series(obj, col=None):
    if isinstance(obj, Series):
        results = groupby_series(obj)
    else:
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
        results = [groupby_series(obj[col], col) for col in obj.columns]
        results = concat(results, axis=1)
 
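
The two ``describe`` methods added above route through ``apply`` and then unstack the result, which is what produces the column-labelled layout documented in the whatsnew entry; a rough sketch of the equivalence for the default ``axis=0`` case (not the exact internal call sequence, which also adjusts the group selection):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
    grouped = df.groupby('A')

    # New behavior: describe() statistics appear as columns.
    wide = grouped.describe()

    # Roughly what the added SeriesGroupBy.describe does: apply describe()
    # per group, then unstack the statistics into columns. For this frame
    # it should line up with wide['B'].
    manual = grouped['B'].apply(lambda s: s.describe()).unstack()
    print(manual)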

pandas/core/panel.py

+1, -1

@@ -1282,7 +1282,7 @@ def join(self, other, how='left', lsuffix='', rsuffix=''):
        -------
        joined : Panel
        """
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        if isinstance(other, Panel):
            join_major, join_minor = self._get_join_index(other, how)

pandas/core/reshape.py

+1, -1

@@ -1204,7 +1204,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
    --------
    Series.str.get_dummies
    """
-    from pandas.tools.merge import concat
+    from pandas.tools.concat import concat
    from itertools import cycle
 
    if isinstance(data, DataFrame):

pandas/core/series.py

+1, -1

@@ -1588,7 +1588,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False):
 
 
        """
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        if isinstance(to_append, (list, tuple)):
            to_concat = [self] + to_append

pandas/formats/format.py

+2, -2

@@ -165,7 +165,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True,
        self._chk_truncate()
 
    def _chk_truncate(self):
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
        max_rows = self.max_rows
        truncate_v = max_rows and (len(self.series) > max_rows)
        series = self.series
@@ -406,7 +406,7 @@ def _chk_truncate(self):
        Checks whether the frame should be truncated. If so, slices
        the frame up.
        """
-        from pandas.tools.merge import concat
+        from pandas.tools.concat import concat
 
        # Column of which first element is used to determine width of a dot col
        self.tr_size_col = -1

pandas/io/gbq.py

+1, -3

@@ -10,9 +10,7 @@
 import numpy as np
 
 from distutils.version import StrictVersion
-from pandas import compat
-from pandas.core.api import DataFrame
-from pandas.tools.merge import concat
+from pandas import compat, DataFrame, concat
 from pandas.core.common import PandasError
 from pandas.compat import lzip, bytes_to_str
 
pandas/io/pytables.py

+3, -5

@@ -26,13 +26,12 @@
 
 import pandas as pd
 from pandas import (Series, DataFrame, Panel, Panel4D, Index,
-                    MultiIndex, Int64Index, isnull)
+                    MultiIndex, Int64Index, isnull, concat,
+                    SparseSeries, SparseDataFrame, PeriodIndex,
+                    DatetimeIndex, TimedeltaIndex)
 from pandas.core import config
 from pandas.io.common import _stringify_path
-from pandas.sparse.api import SparseSeries, SparseDataFrame
 from pandas.sparse.array import BlockIndex, IntIndex
-from pandas.tseries.api import PeriodIndex, DatetimeIndex
-from pandas.tseries.tdi import TimedeltaIndex
 from pandas.core.base import StringMixin
 from pandas.formats.printing import adjoin, pprint_thing
 from pandas.core.common import _asarray_tuplesafe, PerformanceWarning
@@ -42,7 +41,6 @@
                               _block2d_to_blocknd,
                               _factor_indexer, _block_shape)
 from pandas.core.index import _ensure_index
-from pandas.tools.merge import concat
 from pandas import compat
 from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter
 from pandas.core.config import get_option

pandas/io/tests/json/test_pandas.py

+10

@@ -1044,3 +1044,13 @@ def roundtrip(s, encoding='latin-1'):
 
        for s in examples:
            roundtrip(s)
+
+    def test_data_frame_size_after_to_json(self):
+        # GH15344
+        df = DataFrame({'a': [str(1)]})
+
+        size_before = df.memory_usage(index=True, deep=True).sum()
+        df.to_json()
+        size_after = df.memory_usage(index=True, deep=True).sum()
+
+        self.assertEqual(size_before, size_after)

pandas/src/ujson/python/objToJSON.c

+10

@@ -402,6 +402,16 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
 static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
                              size_t *_outLen) {
     PyObject *obj = (PyObject *)_obj;
+
+#if (PY_VERSION_HEX >= 0x03030000)
+    if (PyUnicode_IS_COMPACT_ASCII(obj)) {
+        Py_ssize_t len;
+        char *data = PyUnicode_AsUTF8AndSize(obj, &len);
+        *_outLen = len;
+        return data;
+    }
+#endif
+
     PyObject *newObj = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
                                             PyUnicode_GET_SIZE(obj), NULL);
 