From bf61eaf43a130a874c3b33155e517e658727eea7 Mon Sep 17 00:00:00 2001
From: Sam Cohan
Date: Tue, 21 Nov 2017 02:54:23 +0000
Subject: [PATCH 1/2] Fix bug in read_csv for high cardinality category types (#18186)

---
 doc/source/whatsnew/v0.21.1.txt  |  9 +++++++++
 pandas/_libs/parsers.pyx         |  7 ++++---
 pandas/tests/io/parser/dtypes.py | 11 +++++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 86dcc9dcefa09..5ca58c647e0c9 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -56,6 +56,15 @@ Documentation Changes
 
 Bug Fixes
 ~~~~~~~~~
+- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
+- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
+- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
+- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`)
+- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
+- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
+- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
+- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
+- Bug in ``pd.read_csv`` when reading numeric category fields with high cardinality (:issue:`18186`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 546f08d651eea..5bd64f2c879a2 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -2227,9 +2227,10 @@ def _concatenate_chunks(list chunks):
     for name in names:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
-        dtypes = set(a.dtype for a in arrs)
-        if len(dtypes) > 1:
-            common_type = np.find_common_type(dtypes, [])
+        dtypes = set([a.dtype for a in arrs])
+        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
+        if len(numpy_dtypes) > 1:
+            common_type = np.find_common_type(numpy_dtypes, [])
             if common_type == np.object:
                 warning_columns.append(str(name))
 
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 7d3df6201a390..23d08134dbf44 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -114,6 +114,17 @@ def test_categorical_dtype(self):
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.slow
+    def test_categorical_dtype_high_cardinality_numeric(self):
+        # GH 18186
+        data = sorted([str(i) for i in range(10**6)])
+        expected = pd.DataFrame({'a': Categorical(data, ordered=True)})
+        actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
+                               dtype='category')
+        actual.a.cat.reorder_categories(sorted(actual.a.cat.categories),
+                                        ordered=True, inplace=True)
+        tm.assert_frame_equal(actual, expected)
+
     def test_categorical_dtype_encoding(self):
         # GH 10153
         pth = tm.get_data_path('unicode_series.csv')

From ff1945d371b32e18f158c6a8a24f4a6ff4c2e2ba Mon Sep 17 00:00:00 2001
From: Sam Cohan
Date: Tue, 21 Nov 2017 09:02:31 +0000
Subject: [PATCH 2/2] Refactor read_csv bug fix with PR comments (#18186)

---
 doc/source/whatsnew/v0.21.1.txt  | 10 +---------
 pandas/_libs/parsers.pyx         |  2 +-
 pandas/tests/io/parser/dtypes.py |  8 ++++----
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 5ca58c647e0c9..558dd14f24b4f 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -56,15 +56,6 @@ Documentation Changes
 
 Bug Fixes
 ~~~~~~~~~
-- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
-- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
-- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
-- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`)
-- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
-- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
-- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
-- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
-- Bug in ``pd.read_csv`` when reading numeric category fields with high cardinality (:issue:`18186`)
 
 Conversion
 ^^^^^^^^^^
@@ -91,6 +82,7 @@ I/O
 - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
+- Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
 - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
 - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 5bd64f2c879a2..f1bd03a097cd0 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -2227,7 +2227,7 @@ def _concatenate_chunks(list chunks):
     for name in names:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
-        dtypes = set([a.dtype for a in arrs])
+        dtypes = {a.dtype for a in arrs}
         numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
         if len(numpy_dtypes) > 1:
             common_type = np.find_common_type(numpy_dtypes, [])
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 23d08134dbf44..b91ce04673e29 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -117,12 +117,12 @@ def test_categorical_dtype(self):
     @pytest.mark.slow
     def test_categorical_dtype_high_cardinality_numeric(self):
         # GH 18186
-        data = sorted([str(i) for i in range(10**6)])
-        expected = pd.DataFrame({'a': Categorical(data, ordered=True)})
+        data = np.sort([str(i) for i in range(524289)])
+        expected = DataFrame({'a': Categorical(data, ordered=True)})
         actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
                                dtype='category')
-        actual.a.cat.reorder_categories(sorted(actual.a.cat.categories),
-                                        ordered=True, inplace=True)
+        actual["a"] = actual["a"].cat.reorder_categories(
+            np.sort(actual.a.cat.categories), ordered=True)
         tm.assert_frame_equal(actual, expected)
 
     def test_categorical_dtype_encoding(self):
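
Note: the parsers.pyx hunks above only show the dtype bookkeeping, so here is a hedged,
plain-Python sketch of the per-column logic after this fix. The production code is the
Cython function _concatenate_chunks; the DtypeWarning bookkeeping and the
union_categoricals branch are not part of this diff and are assumptions here, and the
helper name concat_column_chunks is made up purely for illustration.

    import numpy as np
    from pandas.api.types import is_categorical_dtype, union_categoricals

    def concat_column_chunks(arrs):
        # Collect the dtype of each chunk's array for this column.
        dtypes = {a.dtype for a in arrs}
        # Key change in this patch: drop CategoricalDtype entries before asking
        # numpy for a common type. With dtype='category' every chunk can carry a
        # different set of categories, so the dtypes differ between chunks and
        # np.find_common_type cannot handle them.
        numpy_dtypes = [x for x in dtypes if not is_categorical_dtype(x)]
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                pass  # the real code records the column name for a DtypeWarning
        if any(is_categorical_dtype(x) for x in dtypes):
            # Assumed branch: categorical chunks are merged with union_categoricals,
            # which unions the per-chunk categories instead of concatenating arrays.
            return union_categoricals(list(arrs))
        return np.concatenate(list(arrs))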
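
Note: roughly the end-to-end scenario exercised by the new
test_categorical_dtype_high_cardinality_numeric test, namely enough distinct numeric
values (524289, the size used in the test) that the default low_memory reader processes
the column in more than one internal chunk, each with its own CategoricalDtype. Before
the fix the chunk-concatenation step fell over on those differing dtypes; afterwards the
column comes back as a single category column. A minimal sketch using only public pandas
API:

    from io import StringIO
    import pandas as pd

    # Build a one-column CSV with 524289 distinct numeric values, mirroring the test.
    data = '\n'.join(str(i) for i in range(524289))
    df = pd.read_csv(StringIO('a\n' + data), dtype='category')

    print(df['a'].dtype)                # category
    print(len(df['a'].cat.categories))  # 524289 categories survive the chunk concat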