Skip to content

Commit 1afdbb8

Browse files
committed
Fix GH 14922
Having the integer equivalent of NaT in an int64 column caused incorrect sorting, because this special value was treated as a missing value.
1 parent 3ccb501 commit 1afdbb8

File tree

2 files changed

+20
-3
lines changed

2 files changed

+20
-3
lines changed

pandas/core/algorithms.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
343343

344344
table = hash_klass(size_hint or len(vals))
345345
uniques = vec_klass()
346-
labels = table.get_labels(vals, uniques, 0, na_sentinel, True)
346+
check_nulls = not is_integer_dtype(values)
347+
labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)
347348

348349
labels = _ensure_platform_int(labels)
349350

pandas/tests/frame/test_sorting.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66

77
from pandas.compat import lrange
88
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
9-
date_range)
9+
date_range, NaT)
1010

1111
from pandas.util.testing import (assert_series_equal,
1212
assert_frame_equal,
13-
assertRaisesRegexp)
13+
assertRaisesRegexp,
14+
is_sorted)
1415

1516
import pandas.util.testing as tm
1617

@@ -491,3 +492,18 @@ def test_frame_column_inplace_sort_exception(self):
491492

492493
cp = s.copy()
493494
cp.sort_values() # it works!
495+
496+
def test_sort_nat_values_in_int_column(self):
497+
498+
# GH 14922: sorting by multiple columns was incorrect when an int column held the int equivalent of NaT (paired here with a large-magnitude float)
499+
int_values = (2, int(NaT))
500+
float_values = (2.0, -1.797693e308)
501+
502+
df = DataFrame(dict(int=int_values, float=float_values),
503+
columns=["int", "float"])
504+
505+
df_sorted = df.sort_values(["int", "float"])
506+
df_expected = DataFrame(dict(int=int_values[::-1], float=float_values[::-1]),
507+
columns=["int", "float"], index=[1, 0])
508+
509+
assert_frame_equal(df_sorted, df_expected)

0 commit comments

Comments
 (0)