Skip to content

Commit bf61eaf

Browse files
committed
Fix bug in read_csv for high cardinality category types (#18186)
1 parent 103ea6f commit bf61eaf

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

doc/source/whatsnew/v0.21.1.txt

+9
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@ Documentation Changes
5656

5757
Bug Fixes
5858
~~~~~~~~~
59+
- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
60+
- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
61+
- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
62+
- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` where a window of all-equal values produced a floating point precision issue (:issue:`18044`)
63+
- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
64+
- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178`, :issue:`18187`)
65+
- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
66+
- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
67+
- Bug in ``pd.read_csv`` when reading numeric category fields with high cardinality (:issue:`18186`)
5968

6069
Conversion
6170
^^^^^^^^^^

pandas/_libs/parsers.pyx

+4-3
Original file line numberDiff line numberDiff line change
@@ -2227,9 +2227,10 @@ def _concatenate_chunks(list chunks):
22272227
for name in names:
22282228
arrs = [chunk.pop(name) for chunk in chunks]
22292229
# Check each arr for consistent types.
2230-
dtypes = set(a.dtype for a in arrs)
2231-
if len(dtypes) > 1:
2232-
common_type = np.find_common_type(dtypes, [])
2230+
dtypes = set([a.dtype for a in arrs])
2231+
numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
2232+
if len(numpy_dtypes) > 1:
2233+
common_type = np.find_common_type(numpy_dtypes, [])
22332234
if common_type == np.object:
22342235
warning_columns.append(str(name))
22352236

pandas/tests/io/parser/dtypes.py

+11
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,17 @@ def test_categorical_dtype(self):
114114
actual = self.read_csv(StringIO(data), dtype='category')
115115
tm.assert_frame_equal(actual, expected)
116116

117+
@pytest.mark.slow
118+
def test_categorical_dtype_high_cardinality_numeric(self):
119+
# GH 18186
120+
data = sorted([str(i) for i in range(10**6)])
121+
expected = pd.DataFrame({'a': Categorical(data, ordered=True)})
122+
actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
123+
dtype='category')
124+
actual.a.cat.reorder_categories(sorted(actual.a.cat.categories),
125+
ordered=True, inplace=True)
126+
tm.assert_frame_equal(actual, expected)
127+
117128
def test_categorical_dtype_encoding(self):
118129
# GH 10153
119130
pth = tm.get_data_path('unicode_series.csv')

0 commit comments

Comments
 (0)