diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 86dcc9dcefa09..558dd14f24b4f 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -82,6 +82,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) +- Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 546f08d651eea..f1bd03a097cd0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2227,9 +2227,10 @@ def _concatenate_chunks(list chunks): for name in names: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. - dtypes = set(a.dtype for a in arrs) - if len(dtypes) > 1: - common_type = np.find_common_type(dtypes, []) + dtypes = {a.dtype for a in arrs} + numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} + if len(numpy_dtypes) > 1: + common_type = np.find_common_type(numpy_dtypes, []) if common_type == np.object: warning_columns.append(str(name)) diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 7d3df6201a390..b91ce04673e29 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -114,6 +114,17 @@ def test_categorical_dtype(self): actual = self.read_csv(StringIO(data), dtype='category') tm.assert_frame_equal(actual, expected) + @pytest.mark.slow + def test_categorical_dtype_high_cardinality_numeric(self): + # GH 18186 + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({'a': Categorical(data, ordered=True)}) + actual = self.read_csv(StringIO('a\n' + '\n'.join(data)), + dtype='category') + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True) + tm.assert_frame_equal(actual, expected) + def test_categorical_dtype_encoding(self): # GH 10153 pth = tm.get_data_path('unicode_series.csv')