Skip to content

Commit 03699c6

Browse files
committed
Fix GH 14922
Having the integer equivalent of NaT in an int64 column caused incorrect sorting, because this special value was treated as a missing value.
1 parent 3ccb501 commit 03699c6

File tree

2 files changed

+18
-2
lines changed

2 files changed

+18
-2
lines changed

pandas/core/algorithms.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
343343

344344
table = hash_klass(size_hint or len(vals))
345345
uniques = vec_klass()
346-
labels = table.get_labels(vals, uniques, 0, na_sentinel, True)
346+
check_nulls = not is_integer_dtype(values)
347+
labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)
347348

348349
labels = _ensure_platform_int(labels)
349350

pandas/tests/frame/test_sorting.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from pandas.compat import lrange
88
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
9-
date_range)
9+
date_range, NaT)
1010

1111
from pandas.util.testing import (assert_series_equal,
1212
assert_frame_equal,
@@ -491,3 +491,18 @@ def test_frame_column_inplace_sort_exception(self):
491491

492492
cp = s.copy()
493493
cp.sort_values() # it works!
494+
495+
def test_sort_nat_values_in_int_column(self):
496+
497+
# GH 14922: sorting with a large float and multiple columns was incorrect when an int column held the integer equivalent of NaT
498+
int_values = (2, int(NaT))
499+
float_values = (2.0, -1.797693e308)
500+
501+
df = DataFrame(dict(int=int_values, float=float_values),
502+
columns=["int", "float"])
503+
504+
df_sorted = df.sort_values(["int", "float"])
505+
df_expected = DataFrame(dict(int=int_values[::-1], float=float_values[::-1]),
506+
columns=["int", "float"], index=[1, 0])
507+
508+
assert_frame_equal(df_sorted, df_expected)

0 commit comments

Comments
 (0)