|
1 | 1 | # -*- coding: utf-8 -*-
|
2 | 2 |
|
3 | 3 | from __future__ import print_function
|
4 |
| - |
| 4 | +import random |
5 | 5 | import numpy as np
|
6 | 6 |
|
| 7 | +import pandas as pd |
7 | 8 | from pandas.compat import lrange
|
8 | 9 | from pandas import (DataFrame, Series, MultiIndex, Timestamp,
|
9 |
| - date_range, NaT) |
| 10 | + date_range, NaT, IntervalIndex) |
10 | 11 |
|
11 | 12 | from pandas.util.testing import (assert_series_equal,
|
12 | 13 | assert_frame_equal,
|
|
19 | 20 |
|
20 | 21 | class TestDataFrameSorting(tm.TestCase, TestData):
|
21 | 22 |
|
22 |
| - def test_sort_index(self): |
23 |
| - # GH13496 |
24 |
| - |
25 |
| - frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], |
26 |
| - columns=['A', 'B', 'C', 'D']) |
27 |
| - |
28 |
| - # axis=0 : sort rows by index labels |
29 |
| - unordered = frame.loc[[3, 2, 4, 1]] |
30 |
| - result = unordered.sort_index(axis=0) |
31 |
| - expected = frame |
32 |
| - assert_frame_equal(result, expected) |
33 |
| - |
34 |
| - result = unordered.sort_index(ascending=False) |
35 |
| - expected = frame[::-1] |
36 |
| - assert_frame_equal(result, expected) |
37 |
| - |
38 |
| - # axis=1 : sort columns by column names |
39 |
| - unordered = frame.iloc[:, [2, 1, 3, 0]] |
40 |
| - result = unordered.sort_index(axis=1) |
41 |
| - assert_frame_equal(result, frame) |
42 |
| - |
43 |
| - result = unordered.sort_index(axis=1, ascending=False) |
44 |
| - expected = frame.iloc[:, ::-1] |
45 |
| - assert_frame_equal(result, expected) |
46 |
| - |
47 |
| - def test_sort_index_multiindex(self): |
48 |
| - # GH13496 |
49 |
| - |
50 |
| - # sort rows by specified level of multi-index |
51 |
| - mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) |
52 |
| - df = DataFrame([[1, 2], [3, 4]], mi) |
53 |
| - |
54 |
| - # MI sort, but no level: sort_level has no effect |
55 |
| - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) |
56 |
| - df = DataFrame([[1, 2], [3, 4]], mi) |
57 |
| - result = df.sort_index(sort_remaining=False) |
58 |
| - expected = df.sort_index() |
59 |
| - assert_frame_equal(result, expected) |
60 |
| - |
61 | 23 | def test_sort(self):
|
62 | 24 | frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
|
63 | 25 | columns=['A', 'B', 'C', 'D'])
|
@@ -151,21 +113,6 @@ def test_sort_values_inplace(self):
|
151 | 113 | expected = frame.sort_values(by=['A', 'B'], ascending=False)
|
152 | 114 | assert_frame_equal(sorted_df, expected)
|
153 | 115 |
|
154 |
| - def test_sort_index_categorical_index(self): |
155 |
| - |
156 |
| - df = (DataFrame({'A': np.arange(6, dtype='int64'), |
157 |
| - 'B': Series(list('aabbca')) |
158 |
| - .astype('category', categories=list('cab'))}) |
159 |
| - .set_index('B')) |
160 |
| - |
161 |
| - result = df.sort_index() |
162 |
| - expected = df.iloc[[4, 0, 1, 5, 2, 3]] |
163 |
| - assert_frame_equal(result, expected) |
164 |
| - |
165 |
| - result = df.sort_index(ascending=False) |
166 |
| - expected = df.iloc[[3, 2, 5, 1, 0, 4]] |
167 |
| - assert_frame_equal(result, expected) |
168 |
| - |
169 | 116 | def test_sort_nan(self):
|
170 | 117 | # GH3917
|
171 | 118 | nan = np.nan
|
@@ -291,8 +238,86 @@ def test_stable_descending_multicolumn_sort(self):
|
291 | 238 | kind='mergesort')
|
292 | 239 | assert_frame_equal(sorted_df, expected)
|
293 | 240 |
|
| 241 | + def test_sort_datetimes(self): |
| 242 | + |
| 243 | + # GH 3461, argsort / lexsort differences for a datetime column |
| 244 | + df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'], |
| 245 | + columns=['A'], |
| 246 | + index=date_range('20130101', periods=9)) |
| 247 | + dts = [Timestamp(x) |
| 248 | + for x in ['2004-02-11', '2004-01-21', '2004-01-26', |
| 249 | + '2005-09-20', '2010-10-04', '2009-05-12', |
| 250 | + '2008-11-12', '2010-09-28', '2010-09-28']] |
| 251 | + df['B'] = dts[::2] + dts[1::2] |
| 252 | + df['C'] = 2. |
| 253 | + df['A1'] = 3. |
| 254 | + |
| 255 | + df1 = df.sort_values(by='A') |
| 256 | + df2 = df.sort_values(by=['A']) |
| 257 | + assert_frame_equal(df1, df2) |
| 258 | + |
| 259 | + df1 = df.sort_values(by='B') |
| 260 | + df2 = df.sort_values(by=['B']) |
| 261 | + assert_frame_equal(df1, df2) |
| 262 | + |
| 263 | + def test_frame_column_inplace_sort_exception(self): |
| 264 | + s = self.frame['A'] |
| 265 | + with assertRaisesRegexp(ValueError, "This Series is a view"): |
| 266 | + s.sort_values(inplace=True) |
| 267 | + |
| 268 | + cp = s.copy() |
| 269 | + cp.sort_values() # it works! |
| 270 | + |
| 271 | + def test_sort_nat_values_in_int_column(self): |
| 272 | + |
| 273 | + # GH 14922: "sorting with large float and multiple columns incorrect" |
| 274 | + |
| 275 | + # cause was that the int64 value NaT was considered as "na". Which is |
| 276 | + # only correct for datetime64 columns. |
| 277 | + |
| 278 | + int_values = (2, int(NaT)) |
| 279 | + float_values = (2.0, -1.797693e308) |
| 280 | + |
| 281 | + df = DataFrame(dict(int=int_values, float=float_values), |
| 282 | + columns=["int", "float"]) |
| 283 | + |
| 284 | + df_reversed = DataFrame(dict(int=int_values[::-1], |
| 285 | + float=float_values[::-1]), |
| 286 | + columns=["int", "float"], |
| 287 | + index=[1, 0]) |
| 288 | + |
| 289 | + # NaT is not a "na" for int64 columns, so na_position must not |
| 290 | + # influence the result: |
| 291 | + df_sorted = df.sort_values(["int", "float"], na_position="last") |
| 292 | + assert_frame_equal(df_sorted, df_reversed) |
| 293 | + |
| 294 | + df_sorted = df.sort_values(["int", "float"], na_position="first") |
| 295 | + assert_frame_equal(df_sorted, df_reversed) |
| 296 | + |
| 297 | + # reverse sorting order |
| 298 | + df_sorted = df.sort_values(["int", "float"], ascending=False) |
| 299 | + assert_frame_equal(df_sorted, df) |
| 300 | + |
| 301 | + # and now check if NaT is still considered as "na" for datetime64 |
| 302 | + # columns: |
| 303 | + df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], |
| 304 | + float=float_values), columns=["datetime", "float"]) |
| 305 | + |
| 306 | + df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], |
| 307 | + float=float_values[::-1]), |
| 308 | + columns=["datetime", "float"], |
| 309 | + index=[1, 0]) |
| 310 | + |
| 311 | + df_sorted = df.sort_values(["datetime", "float"], na_position="first") |
| 312 | + assert_frame_equal(df_sorted, df_reversed) |
| 313 | + |
| 314 | + df_sorted = df.sort_values(["datetime", "float"], na_position="last") |
| 315 | + assert_frame_equal(df_sorted, df_reversed) |
| 316 | + |
| 317 | + |
| 318 | +class TestDataFrameSortIndexKinds(tm.TestCase, TestData): |
| 319 | + |
294 | 320 | def test_sort_index_multicolumn(self):
|
295 |
| - import random |
296 | 321 | A = np.arange(5).repeat(20)
|
297 | 322 | B = np.tile(np.arange(5), 20)
|
298 | 323 | random.shuffle(A)
|
@@ -448,78 +473,73 @@ def test_sort_index_level(self):
|
448 | 473 | res = df.sort_index(level=['A', 'B'], sort_remaining=False)
|
449 | 474 | assert_frame_equal(df, res)
|
450 | 475 |
|
451 |
| - def test_sort_datetimes(self): |
452 |
| - |
453 |
| - # GH 3461, argsort / lexsort differences for a datetime column |
454 |
| - df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'], |
455 |
| - columns=['A'], |
456 |
| - index=date_range('20130101', periods=9)) |
457 |
| - dts = [Timestamp(x) |
458 |
| - for x in ['2004-02-11', '2004-01-21', '2004-01-26', |
459 |
| - '2005-09-20', '2010-10-04', '2009-05-12', |
460 |
| - '2008-11-12', '2010-09-28', '2010-09-28']] |
461 |
| - df['B'] = dts[::2] + dts[1::2] |
462 |
| - df['C'] = 2. |
463 |
| - df['A1'] = 3. |
464 |
| - |
465 |
| - df1 = df.sort_values(by='A') |
466 |
| - df2 = df.sort_values(by=['A']) |
467 |
| - assert_frame_equal(df1, df2) |
468 |
| - |
469 |
| - df1 = df.sort_values(by='B') |
470 |
| - df2 = df.sort_values(by=['B']) |
471 |
| - assert_frame_equal(df1, df2) |
472 |
| - |
473 |
| - def test_frame_column_inplace_sort_exception(self): |
474 |
| - s = self.frame['A'] |
475 |
| - with assertRaisesRegexp(ValueError, "This Series is a view"): |
476 |
| - s.sort_values(inplace=True) |
477 |
| - |
478 |
| - cp = s.copy() |
479 |
| - cp.sort_values() # it works! |
| 476 | + def test_sort_index_categorical_index(self): |
480 | 477 |
|
481 |
| - def test_sort_nat_values_in_int_column(self): |
| 478 | + df = (DataFrame({'A': np.arange(6, dtype='int64'), |
| 479 | + 'B': Series(list('aabbca')) |
| 480 | + .astype('category', categories=list('cab'))}) |
| 481 | + .set_index('B')) |
482 | 482 |
|
483 |
| - # GH 14922: "sorting with large float and multiple columns incorrect" |
| 483 | + result = df.sort_index() |
| 484 | + expected = df.iloc[[4, 0, 1, 5, 2, 3]] |
| 485 | + assert_frame_equal(result, expected) |
484 | 486 |
|
485 |
| - # cause was that the int64 value NaT was considered as "na". Which is |
486 |
| - # only correct for datetime64 columns. |
| 487 | + result = df.sort_index(ascending=False) |
| 488 | + expected = df.iloc[[3, 2, 5, 1, 0, 4]] |
| 489 | + assert_frame_equal(result, expected) |
487 | 490 |
|
488 |
| - int_values = (2, int(NaT)) |
489 |
| - float_values = (2.0, -1.797693e308) |
| 491 | + def test_sort_index(self): |
| 492 | + # GH13496 |
490 | 493 |
|
491 |
| - df = DataFrame(dict(int=int_values, float=float_values), |
492 |
| - columns=["int", "float"]) |
| 494 | + frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], |
| 495 | + columns=['A', 'B', 'C', 'D']) |
493 | 496 |
|
494 |
| - df_reversed = DataFrame(dict(int=int_values[::-1], |
495 |
| - float=float_values[::-1]), |
496 |
| - columns=["int", "float"], |
497 |
| - index=[1, 0]) |
| 497 | + # axis=0 : sort rows by index labels |
| 498 | + unordered = frame.loc[[3, 2, 4, 1]] |
| 499 | + result = unordered.sort_index(axis=0) |
| 500 | + expected = frame |
| 501 | + assert_frame_equal(result, expected) |
498 | 502 |
|
499 |
| - # NaT is not a "na" for int64 columns, so na_position must not |
500 |
| - # influence the result: |
501 |
| - df_sorted = df.sort_values(["int", "float"], na_position="last") |
502 |
| - assert_frame_equal(df_sorted, df_reversed) |
| 503 | + result = unordered.sort_index(ascending=False) |
| 504 | + expected = frame[::-1] |
| 505 | + assert_frame_equal(result, expected) |
503 | 506 |
|
504 |
| - df_sorted = df.sort_values(["int", "float"], na_position="first") |
505 |
| - assert_frame_equal(df_sorted, df_reversed) |
| 507 | + # axis=1 : sort columns by column names |
| 508 | + unordered = frame.iloc[:, [2, 1, 3, 0]] |
| 509 | + result = unordered.sort_index(axis=1) |
| 510 | + assert_frame_equal(result, frame) |
506 | 511 |
|
507 |
| - # reverse sorting order |
508 |
| - df_sorted = df.sort_values(["int", "float"], ascending=False) |
509 |
| - assert_frame_equal(df_sorted, df) |
| 512 | + result = unordered.sort_index(axis=1, ascending=False) |
| 513 | + expected = frame.iloc[:, ::-1] |
| 514 | + assert_frame_equal(result, expected) |
510 | 515 |
|
511 |
| - # and now check if NaT is still considered as "na" for datetime64 |
512 |
| - # columns: |
513 |
| - df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], |
514 |
| - float=float_values), columns=["datetime", "float"]) |
| 516 | + def test_sort_index_multiindex(self): |
| 517 | + # GH13496 |
515 | 518 |
|
516 |
| - df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], |
517 |
| - float=float_values[::-1]), |
518 |
| - columns=["datetime", "float"], |
519 |
| - index=[1, 0]) |
| 519 | + # sort rows by specified level of multi-index |
| 520 | + mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) |
| 521 | + df = DataFrame([[1, 2], [3, 4]], mi) |
520 | 522 |
|
521 |
| - df_sorted = df.sort_values(["datetime", "float"], na_position="first") |
522 |
| - assert_frame_equal(df_sorted, df_reversed) |
| 523 | + # MI sort, but no level: sort_level has no effect |
| 524 | + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) |
| 525 | + df = DataFrame([[1, 2], [3, 4]], mi) |
| 526 | + result = df.sort_index(sort_remaining=False) |
| 527 | + expected = df.sort_index() |
| 528 | + assert_frame_equal(result, expected) |
523 | 529 |
|
524 |
| - df_sorted = df.sort_values(["datetime", "float"], na_position="last") |
525 |
| - assert_frame_equal(df_sorted, df_reversed) |
| 530 | + def test_sort_index_intervalindex(self): |
| 531 | + # this is a de-facto sort via unstack |
| 532 | + # confirming that we sort in the order of the bins |
| 533 | + y = Series(np.random.randn(100)) |
| 534 | + x1 = Series(np.sign(np.random.randn(100))) |
| 535 | + x2 = pd.cut(Series(np.random.randn(100)), |
| 536 | + bins=[-3, -0.5, 0, 0.5, 3]) |
| 537 | + model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2']) |
| 538 | + |
| 539 | + result = model.groupby(['X1', 'X2']).mean().unstack() |
| 540 | + expected = IntervalIndex.from_tuples( |
| 541 | + [(-3.0, -0.5), (-0.5, 0.0), |
| 542 | + (0.0, 0.5), (0.5, 3.0)], |
| 543 | + closed='right') |
| 544 | + result = result.columns.levels[1].categories |
| 545 | + tm.assert_index_equal(result, expected) |
0 commit comments