File tree 3 files changed +15
-2
lines changed
3 files changed +15
-2
lines changed Original file line number Diff line number Diff line change @@ -62,6 +62,7 @@ Bug Fixes
62
62
- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` where all-equal values caused a floating-point issue (:issue:`18044`)
63
63
- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
64
64
- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178`, :issue:`18187`)
65
+ - Bug in ``pd.read_csv`` when reading numeric category fields with high cardinality (:issue:`18186`)
65
66
66
67
Conversion
67
68
^^^^^^^^^^
Original file line number Diff line number Diff line change @@ -2228,8 +2228,9 @@ def _concatenate_chunks(list chunks):
2228
2228
arrs = [chunk.pop(name) for chunk in chunks]
2229
2229
# Check each arr for consistent types.
2230
2230
dtypes = set ([a.dtype for a in arrs])
2231
- if len (dtypes) > 1 :
2232
- common_type = np.find_common_type(dtypes, [])
2231
+ numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
2232
+ if len (numpy_dtypes) > 1 :
2233
+ common_type = np.find_common_type(numpy_dtypes, [])
2233
2234
if common_type == np.object:
2234
2235
warning_columns.append(str (name))
2235
2236
Original file line number Diff line number Diff line change @@ -114,6 +114,17 @@ def test_categorical_dtype(self):
114
114
actual = self .read_csv (StringIO (data ), dtype = 'category' )
115
115
tm .assert_frame_equal (actual , expected )
116
116
117
+ @pytest .mark .slow
118
+ def test_categorical_dtype_high_cardinality_numeric (self ):
119
+ # GH 18186
120
+ data = sorted ([str (i ) for i in range (10 ** 6 )])
121
+ expected = pd .DataFrame ({'a' : Categorical (data , ordered = True )})
122
+ actual = self .read_csv (StringIO ('a\n ' + '\n ' .join (data )),
123
+ dtype = 'category' )
124
+ actual .a .cat .reorder_categories (sorted (actual .a .cat .categories ),
125
+ ordered = True , inplace = True )
126
+ tm .assert_frame_equal (actual , expected )
127
+
117
128
def test_categorical_dtype_encoding (self ):
118
129
# GH 10153
119
130
pth = tm .get_data_path ('unicode_series.csv' )
You can’t perform that action at this time.
0 commit comments