Skip to content

Commit 3a3e02e

Browse files
committed
sorting example
1 parent 4333937 commit 3a3e02e

File tree

1 file changed

+139
-119
lines changed

1 file changed

+139
-119
lines changed

pandas/tests/frame/test_sorting.py

+139-119
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,13 @@
11
# -*- coding: utf-8 -*-
22

33
from __future__ import print_function
4-
4+
import random
55
import numpy as np
66

7+
import pandas as pd
78
from pandas.compat import lrange
89
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
9-
date_range, NaT)
10+
date_range, NaT, IntervalIndex)
1011

1112
from pandas.util.testing import (assert_series_equal,
1213
assert_frame_equal,
@@ -19,45 +20,6 @@
1920

2021
class TestDataFrameSorting(tm.TestCase, TestData):
2122

22-
def test_sort_index(self):
23-
# GH13496
24-
25-
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
26-
columns=['A', 'B', 'C', 'D'])
27-
28-
# axis=0 : sort rows by index labels
29-
unordered = frame.loc[[3, 2, 4, 1]]
30-
result = unordered.sort_index(axis=0)
31-
expected = frame
32-
assert_frame_equal(result, expected)
33-
34-
result = unordered.sort_index(ascending=False)
35-
expected = frame[::-1]
36-
assert_frame_equal(result, expected)
37-
38-
# axis=1 : sort columns by column names
39-
unordered = frame.iloc[:, [2, 1, 3, 0]]
40-
result = unordered.sort_index(axis=1)
41-
assert_frame_equal(result, frame)
42-
43-
result = unordered.sort_index(axis=1, ascending=False)
44-
expected = frame.iloc[:, ::-1]
45-
assert_frame_equal(result, expected)
46-
47-
def test_sort_index_multiindex(self):
48-
# GH13496
49-
50-
# sort rows by specified level of multi-index
51-
mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC'))
52-
df = DataFrame([[1, 2], [3, 4]], mi)
53-
54-
# MI sort, but no level: sort_remaining has no effect
55-
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
56-
df = DataFrame([[1, 2], [3, 4]], mi)
57-
result = df.sort_index(sort_remaining=False)
58-
expected = df.sort_index()
59-
assert_frame_equal(result, expected)
60-
6123
def test_sort(self):
6224
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
6325
columns=['A', 'B', 'C', 'D'])
@@ -151,21 +113,6 @@ def test_sort_values_inplace(self):
151113
expected = frame.sort_values(by=['A', 'B'], ascending=False)
152114
assert_frame_equal(sorted_df, expected)
153115

154-
def test_sort_index_categorical_index(self):
155-
156-
df = (DataFrame({'A': np.arange(6, dtype='int64'),
157-
'B': Series(list('aabbca'))
158-
.astype('category', categories=list('cab'))})
159-
.set_index('B'))
160-
161-
result = df.sort_index()
162-
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
163-
assert_frame_equal(result, expected)
164-
165-
result = df.sort_index(ascending=False)
166-
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
167-
assert_frame_equal(result, expected)
168-
169116
def test_sort_nan(self):
170117
# GH3917
171118
nan = np.nan
@@ -291,8 +238,86 @@ def test_stable_descending_multicolumn_sort(self):
291238
kind='mergesort')
292239
assert_frame_equal(sorted_df, expected)
293240

241+
def test_sort_datetimes(self):
242+
243+
# GH 3461, argsort / lexsort differences for a datetime column
244+
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
245+
columns=['A'],
246+
index=date_range('20130101', periods=9))
247+
dts = [Timestamp(x)
248+
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
249+
'2005-09-20', '2010-10-04', '2009-05-12',
250+
'2008-11-12', '2010-09-28', '2010-09-28']]
251+
df['B'] = dts[::2] + dts[1::2]
252+
df['C'] = 2.
253+
df['A1'] = 3.
254+
255+
df1 = df.sort_values(by='A')
256+
df2 = df.sort_values(by=['A'])
257+
assert_frame_equal(df1, df2)
258+
259+
df1 = df.sort_values(by='B')
260+
df2 = df.sort_values(by=['B'])
261+
assert_frame_equal(df1, df2)
262+
263+
def test_frame_column_inplace_sort_exception(self):
264+
s = self.frame['A']
265+
with assertRaisesRegexp(ValueError, "This Series is a view"):
266+
s.sort_values(inplace=True)
267+
268+
cp = s.copy()
269+
cp.sort_values() # it works!
270+
271+
def test_sort_nat_values_in_int_column(self):
272+
273+
# GH 14922: "sorting with large float and multiple columns incorrect"
274+
275+
# cause was that the int64 value NaT was considered as "na", which is
276+
# only correct for datetime64 columns.
277+
278+
int_values = (2, int(NaT))
279+
float_values = (2.0, -1.797693e308)
280+
281+
df = DataFrame(dict(int=int_values, float=float_values),
282+
columns=["int", "float"])
283+
284+
df_reversed = DataFrame(dict(int=int_values[::-1],
285+
float=float_values[::-1]),
286+
columns=["int", "float"],
287+
index=[1, 0])
288+
289+
# NaT is not a "na" for int64 columns, so na_position must not
290+
# influence the result:
291+
df_sorted = df.sort_values(["int", "float"], na_position="last")
292+
assert_frame_equal(df_sorted, df_reversed)
293+
294+
df_sorted = df.sort_values(["int", "float"], na_position="first")
295+
assert_frame_equal(df_sorted, df_reversed)
296+
297+
# reverse sorting order
298+
df_sorted = df.sort_values(["int", "float"], ascending=False)
299+
assert_frame_equal(df_sorted, df)
300+
301+
# and now check if NaT is still considered as "na" for datetime64
302+
# columns:
303+
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
304+
float=float_values), columns=["datetime", "float"])
305+
306+
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
307+
float=float_values[::-1]),
308+
columns=["datetime", "float"],
309+
index=[1, 0])
310+
311+
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
312+
assert_frame_equal(df_sorted, df_reversed)
313+
314+
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
315+
assert_frame_equal(df_sorted, df_reversed)
316+
317+
318+
class TestDataFrameSortIndexKinds(tm.TestCase, TestData):
319+
294320
def test_sort_index_multicolumn(self):
295-
import random
296321
A = np.arange(5).repeat(20)
297322
B = np.tile(np.arange(5), 20)
298323
random.shuffle(A)
@@ -448,78 +473,73 @@ def test_sort_index_level(self):
448473
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
449474
assert_frame_equal(df, res)
450475

451-
def test_sort_datetimes(self):
452-
453-
# GH 3461, argsort / lexsort differences for a datetime column
454-
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
455-
columns=['A'],
456-
index=date_range('20130101', periods=9))
457-
dts = [Timestamp(x)
458-
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
459-
'2005-09-20', '2010-10-04', '2009-05-12',
460-
'2008-11-12', '2010-09-28', '2010-09-28']]
461-
df['B'] = dts[::2] + dts[1::2]
462-
df['C'] = 2.
463-
df['A1'] = 3.
464-
465-
df1 = df.sort_values(by='A')
466-
df2 = df.sort_values(by=['A'])
467-
assert_frame_equal(df1, df2)
468-
469-
df1 = df.sort_values(by='B')
470-
df2 = df.sort_values(by=['B'])
471-
assert_frame_equal(df1, df2)
472-
473-
def test_frame_column_inplace_sort_exception(self):
474-
s = self.frame['A']
475-
with assertRaisesRegexp(ValueError, "This Series is a view"):
476-
s.sort_values(inplace=True)
477-
478-
cp = s.copy()
479-
cp.sort_values() # it works!
476+
def test_sort_index_categorical_index(self):
480477

481-
def test_sort_nat_values_in_int_column(self):
478+
df = (DataFrame({'A': np.arange(6, dtype='int64'),
479+
'B': Series(list('aabbca'))
480+
.astype('category', categories=list('cab'))})
481+
.set_index('B'))
482482

483-
# GH 14922: "sorting with large float and multiple columns incorrect"
483+
result = df.sort_index()
484+
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
485+
assert_frame_equal(result, expected)
484486

485-
# cause was that the int64 value NaT was considered as "na", which is
486-
# only correct for datetime64 columns.
487+
result = df.sort_index(ascending=False)
488+
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
489+
assert_frame_equal(result, expected)
487490

488-
int_values = (2, int(NaT))
489-
float_values = (2.0, -1.797693e308)
491+
def test_sort_index(self):
492+
# GH13496
490493

491-
df = DataFrame(dict(int=int_values, float=float_values),
492-
columns=["int", "float"])
494+
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
495+
columns=['A', 'B', 'C', 'D'])
493496

494-
df_reversed = DataFrame(dict(int=int_values[::-1],
495-
float=float_values[::-1]),
496-
columns=["int", "float"],
497-
index=[1, 0])
497+
# axis=0 : sort rows by index labels
498+
unordered = frame.loc[[3, 2, 4, 1]]
499+
result = unordered.sort_index(axis=0)
500+
expected = frame
501+
assert_frame_equal(result, expected)
498502

499-
# NaT is not a "na" for int64 columns, so na_position must not
500-
# influence the result:
501-
df_sorted = df.sort_values(["int", "float"], na_position="last")
502-
assert_frame_equal(df_sorted, df_reversed)
503+
result = unordered.sort_index(ascending=False)
504+
expected = frame[::-1]
505+
assert_frame_equal(result, expected)
503506

504-
df_sorted = df.sort_values(["int", "float"], na_position="first")
505-
assert_frame_equal(df_sorted, df_reversed)
507+
# axis=1 : sort columns by column names
508+
unordered = frame.iloc[:, [2, 1, 3, 0]]
509+
result = unordered.sort_index(axis=1)
510+
assert_frame_equal(result, frame)
506511

507-
# reverse sorting order
508-
df_sorted = df.sort_values(["int", "float"], ascending=False)
509-
assert_frame_equal(df_sorted, df)
512+
result = unordered.sort_index(axis=1, ascending=False)
513+
expected = frame.iloc[:, ::-1]
514+
assert_frame_equal(result, expected)
510515

511-
# and now check if NaT is still considered as "na" for datetime64
512-
# columns:
513-
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
514-
float=float_values), columns=["datetime", "float"])
516+
def test_sort_index_multiindex(self):
517+
# GH13496
515518

516-
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
517-
float=float_values[::-1]),
518-
columns=["datetime", "float"],
519-
index=[1, 0])
519+
# sort rows by specified level of multi-index
520+
mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC'))
521+
df = DataFrame([[1, 2], [3, 4]], mi)
520522

521-
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
522-
assert_frame_equal(df_sorted, df_reversed)
523+
# MI sort, but no level: sort_remaining has no effect
524+
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
525+
df = DataFrame([[1, 2], [3, 4]], mi)
526+
result = df.sort_index(sort_remaining=False)
527+
expected = df.sort_index()
528+
assert_frame_equal(result, expected)
523529

524-
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
525-
assert_frame_equal(df_sorted, df_reversed)
530+
def test_sort_index_intervalindex(self):
531+
# this is a de-facto sort via unstack
532+
# confirming that we sort in the order of the bins
533+
y = Series(np.random.randn(100))
534+
x1 = Series(np.sign(np.random.randn(100)))
535+
x2 = pd.cut(Series(np.random.randn(100)),
536+
bins=[-3, -0.5, 0, 0.5, 3])
537+
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
538+
539+
result = model.groupby(['X1', 'X2']).mean().unstack()
540+
expected = IntervalIndex.from_tuples(
541+
[(-3.0, -0.5), (-0.5, 0.0),
542+
(0.0, 0.5), (0.5, 3.0)],
543+
closed='right')
544+
result = result.columns.levels[1].categories
545+
tm.assert_index_equal(result, expected)

0 commit comments

Comments
 (0)