diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
index 04f25034638cd..6dc8bffd6dac9 100644
--- a/asv_bench/benchmarks/parser_vb.py
+++ b/asv_bench/benchmarks/parser_vb.py
@@ -114,6 +114,27 @@ def teardown(self):
         os.remove('test.csv')
 
 
+class read_csv_categorical(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
+        df = DataFrame({'a': np.random.choice(group1, N).astype('object'),
+                        'b': np.random.choice(group1, N).astype('object'),
+                        'c': np.random.choice(group1, N).astype('object')})
+        df.to_csv('strings.csv', index=False)
+
+    def time_read_csv_categorical_post(self):
+        read_csv('strings.csv').apply(pd.Categorical)
+
+    def time_read_csv_categorical_direct(self):
+        read_csv('strings.csv', dtype='category')
+
+    def teardown(self):
+        os.remove('strings.csv')
+
+
 class read_table_multiple_date(object):
     goal_time = 0.2
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 2866371cce61a..7917e6b4cdfce 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -500,6 +500,43 @@ worth trying.
    data that was read in. It is important to note that the overall column will be
    marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes.
 
+.. _io.categorical:
+
+Specifying Categorical dtype
+''''''''''''''''''''''''''''
+
+.. versionadded:: 0.19.0
+
+``Categorical`` columns can be parsed directly by specifying ``dtype='category'``:
+
+.. ipython:: python
+
+   data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+
+   pd.read_csv(StringIO(data))
+   pd.read_csv(StringIO(data)).dtypes
+   pd.read_csv(StringIO(data), dtype='category').dtypes
+
+Individual columns can be parsed as a ``Categorical`` using a dict specification:
+
+.. ipython:: python
+
+   pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
+
+.. note::
+
+   The resulting categories will always be parsed as strings (object dtype).
+   If the categories are numeric they can be converted using the
+   :func:`to_numeric` function, or as appropriate, another converter
+   such as :func:`to_datetime`.
+
+   .. ipython:: python
+
+      df = pd.read_csv(StringIO(data), dtype='category')
+      df.dtypes
+      df['col3']
+      df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
+      df['col3']
 
 Naming and Using Columns
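Side note on the ``.cat.categories`` conversion the io.rst note above demonstrates: because the conversion runs over the few unique categories rather than over every row, the same pattern works for datetime-like columns too. A minimal sketch, not part of the patch (the ``ts`` column and its values are invented for illustration):

```python
import pandas as pd
from pandas.compat import StringIO

data = 'ts,value\n2016-01-01,1\n2016-01-02,2\n2016-01-01,3'

df = pd.read_csv(StringIO(data), dtype={'ts': 'category'})

# categories come back as strings; converting them touches only the
# unique values, not every row of the column
df['ts'].cat.categories = pd.to_datetime(df['ts'].cat.categories)
```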
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 59a106291dad8..6c995a6989a38 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -12,6 +12,7 @@ Highlights include:
 - :func:`merge_asof` for asof-style time-series joining, see :ref:`here `
 - ``.rolling()`` is now time-series aware, see :ref:`here `
 - pandas development api, see :ref:`here `
+- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here <whatsnew_0190.enhancements.read_csv_categorical>`
 
 .. contents:: What's new in v0.19.0
     :local:
@@ -195,6 +196,14 @@ default of the index) in a DataFrame.
 :func:`read_csv` has improved support for duplicate column names
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+.. ipython:: python
+   :suppress:
+
+   from pandas.compat import StringIO
+
+.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support:
+
+
 :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether
 they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`)
@@ -222,6 +231,46 @@ New behaviour:
 
    In [2]: pd.read_csv(StringIO(data), names=names)
 
+
+.. _whatsnew_0190.enhancements.read_csv_categorical:
+
+:func:`read_csv` supports parsing ``Categorical`` directly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :func:`read_csv` function now supports parsing a ``Categorical`` column when
+specified as a dtype (:issue:`10153`). Depending on the structure of the data,
+this can result in a faster parse time and lower memory usage compared to
+converting to ``Categorical`` after parsing. See the io :ref:`docs here <io.categorical>`.
+
+.. ipython:: python
+
+   data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+
+   pd.read_csv(StringIO(data))
+   pd.read_csv(StringIO(data)).dtypes
+   pd.read_csv(StringIO(data), dtype='category').dtypes
+
+Individual columns can be parsed as a ``Categorical`` using a dict specification:
+
+.. ipython:: python
+
+   pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
+
+.. note::
+
+   The resulting categories will always be parsed as strings (object dtype).
+   If the categories are numeric they can be converted using the
+   :func:`to_numeric` function, or as appropriate, another converter
+   such as :func:`to_datetime`.
+
+   .. ipython:: python
+
+      df = pd.read_csv(StringIO(data), dtype='category')
+      df.dtypes
+      df['col3']
+      df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
+      df['col3']
+
 .. _whatsnew_0190.enhancements.semi_month_offsets:
 
 Semi-Month Offsets
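The whatsnew entry above claims lower memory usage for the direct parse. A quick way to sanity-check that claim in a session; a sketch, not part of the patch:

```python
import numpy as np
import pandas as pd
from pandas.compat import StringIO

# a single column with many repeats of a few distinct strings
words = np.random.choice(['aaaaaaaa', 'bbbbbbb', 'cccccccc'], 100000)
csv = 'col\n' + '\n'.join(words)

as_object = pd.read_csv(StringIO(csv))                      # object column
as_category = pd.read_csv(StringIO(csv), dtype='category')  # parsed directly

# object storage boxes every row; a categorical stores each distinct
# string once plus a small integer code per row
print(as_object.memory_usage(deep=True).sum())
print(as_category.memory_usage(deep=True).sum())
```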
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 103c9fa2b7ce8..4cea9e1d6b595 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -12,9 +12,10 @@
 
 import pandas as pd
 import pandas.util.testing as tm
 
-from pandas import DataFrame, Series, Index, MultiIndex
+from pandas import DataFrame, Series, Index, MultiIndex, Categorical
 from pandas import compat
 from pandas.compat import StringIO, range, lrange
+from pandas.types.dtypes import CategoricalDtype
 
 
 class CParserTests(object):
@@ -135,6 +136,11 @@ def test_passing_dtype(self):
                           dtype={'A': 'timedelta64', 'B': 'float64'},
                           index_col=0)
 
+        # valid but unsupported - fixed width unicode string
+        self.assertRaises(TypeError, self.read_csv, path,
+                          dtype={'A': 'U8'},
+                          index_col=0)
+
         # see gh-12048: empty frame
         actual = self.read_csv(StringIO('A,B'), dtype=str)
         expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
@@ -184,6 +190,92 @@ def test_pass_dtype(self):
         self.assertEqual(result['one'].dtype, 'u1')
         self.assertEqual(result['two'].dtype, 'object')
 
+    def test_categorical_dtype(self):
+        # GH 10153
+        data = """a,b,c
+1,a,3.4
+1,a,3.4
+2,b,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical(['a', 'a', 'b']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+        actual = self.read_csv(StringIO(data), dtype=CategoricalDtype())
+        tm.assert_frame_equal(actual, expected)
+
+        actual = self.read_csv(StringIO(data), dtype={'a': 'category',
+                                                      'b': 'category',
+                                                      'c': CategoricalDtype()})
+        tm.assert_frame_equal(actual, expected)
+
+        actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
+        expected = pd.DataFrame({'a': [1, 1, 2],
+                                 'b': Categorical(['a', 'a', 'b']),
+                                 'c': [3.4, 3.4, 4.5]})
+        tm.assert_frame_equal(actual, expected)
+
+        actual = self.read_csv(StringIO(data), dtype={1: 'category'})
+        tm.assert_frame_equal(actual, expected)
+
+        # unsorted
+        data = """a,b,c
+1,b,3.4
+1,b,3.4
+2,a,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical(['b', 'b', 'a']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+        # missing
+        data = """a,b,c
+1,b,3.4
+1,nan,3.4
+2,a,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical(['b', np.nan, 'a']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+    def test_categorical_dtype_encoding(self):
+        # GH 10153
+        pth = tm.get_data_path('unicode_series.csv')
+        encoding = 'latin-1'
+        expected = self.read_csv(pth, header=None, encoding=encoding)
+        expected[1] = Categorical(expected[1])
+        actual = self.read_csv(pth, header=None, encoding=encoding,
+                               dtype={1: 'category'})
+        tm.assert_frame_equal(actual, expected)
+
+        pth = tm.get_data_path('utf16_ex.txt')
+        encoding = 'utf-16'
+        expected = self.read_table(pth, encoding=encoding)
+        expected = expected.apply(Categorical)
+        actual = self.read_table(pth, encoding=encoding, dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+    def test_categorical_dtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'])}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'])},
+                                  index=[2, 3])]
+        actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_pass_dtype_as_recarray(self):
         if compat.is_platform_windows() and self.low_memory:
             raise nose.SkipTest(
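The chunksize test above relies on each chunk carrying only the categories it actually saw. A sketch of how a caller might stitch chunks back together without falling back to object dtype (assumes the 0.19-era internal import path for ``union_categoricals``; it later moved to ``pandas.api.types``):

```python
import pandas as pd
from pandas.compat import StringIO
from pandas.types.concat import union_categoricals

data = 'a,b\n1,a\n1,b\n1,b\n2,c'
chunks = pd.read_csv(StringIO(data), dtype={'b': 'category'}, chunksize=2)

# each chunk's 'b' is a Categorical over only that chunk's values;
# union_categoricals merges the pieces and unions their categories
pieces = [chunk['b'].values for chunk in chunks]
combined = union_categoricals(pieces, sort_categories=True)
```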
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index e72e2f90a5213..5af82be5b741b 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -25,6 +25,7 @@ cdef extern from "Python.h":
 cdef extern from "stdlib.h":
     void memcpy(void *dst, void *src, size_t n)
 
+cimport cython
 cimport numpy as cnp
 
 from numpy cimport ndarray, uint8_t, uint64_t
@@ -33,6 +34,15 @@ import numpy as np
 cimport util
 
 import pandas.lib as lib
+from pandas.types.common import (is_categorical_dtype, CategoricalDtype,
+                                 is_integer_dtype, is_float_dtype,
+                                 is_bool_dtype, is_object_dtype,
+                                 is_string_dtype, is_datetime64_dtype,
+                                 pandas_dtype)
+from pandas.core.categorical import Categorical
+from pandas.core.algorithms import take_1d
+from pandas.types.concat import union_categoricals
+from pandas import Index
 
 import time
 import os
@@ -399,11 +409,12 @@ cdef class TextReader:
 
         self._set_quoting(quotechar, quoting)
 
-        # TODO: endianness just a placeholder?
+
+        dtype_order = ['int64', 'float64', 'bool', 'object']
         if quoting == QUOTE_NONNUMERIC:
-            self.dtype_cast_order = ['<f8', '<f8', '|b1', '|O8']
-        else:
-            self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
+            # consistent with csv module semantics, cast all to float
+            dtype_order = dtype_order[1:]
+        self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
 
         if comment is not None:
             if len(comment) > 1:
@@ -472,15 +483,10 @@ cdef class TextReader:
             self.encoding = encoding
 
         if isinstance(dtype, dict):
-            conv = {}
-            for k in dtype:
-                v = dtype[k]
-                if isinstance(v, basestring):
-                    v = np.dtype(v)
-                conv[k] = v
-            dtype = conv
+            dtype = {k: pandas_dtype(dtype[k])
+                     for k in dtype}
         elif dtype is not None:
-            dtype = np.dtype(dtype)
+            dtype = pandas_dtype(dtype)
 
         self.dtype = dtype
@@ -689,6 +695,7 @@ cdef class TextReader:
             int status
             Py_ssize_t size
             char *errors = "strict"
+            cdef StringPath path = _string_path(self.c_encoding)
 
         header = []
@@ -718,20 +725,18 @@ cdef class TextReader:
                 field_count = self.parser.line_fields[hr]
                 start = self.parser.line_start[hr]
 
-                # TODO: Py3 vs. Py2
                 counts = {}
                 unnamed_count = 0
                 for i in range(field_count):
                     word = self.parser.words[start + i]
 
-                    if self.c_encoding == NULL and not PY3:
+                    if path == CSTRING:
                         name = PyBytes_FromString(word)
-                    else:
-                        if self.c_encoding == NULL or self.c_encoding == b'utf-8':
-                            name = PyUnicode_FromString(word)
-                        else:
-                            name = PyUnicode_Decode(word, strlen(word),
-                                                    self.c_encoding, errors)
+                    elif path == UTF8:
+                        name = PyUnicode_FromString(word)
+                    elif path == ENCODED:
+                        name = PyUnicode_Decode(word, strlen(word),
+                                                self.c_encoding, errors)
 
                     if name == '':
                         if self.has_mi_columns:
@@ -1076,17 +1081,12 @@ cdef class TextReader:
                 col_dtype = self.dtype[i]
             else:
                 if self.dtype.names:
-                    col_dtype = self.dtype.descr[i][1]
+                    # structured array
+                    col_dtype = np.dtype(self.dtype.descr[i][1])
                 else:
                     col_dtype = self.dtype
 
         if col_dtype is not None:
-            if not isinstance(col_dtype, basestring):
-                if isinstance(col_dtype, np.dtype):
-                    col_dtype = col_dtype.str
-                else:
-                    col_dtype = np.dtype(col_dtype).str
-
             col_res, na_count = self._convert_with_dtype(col_dtype, i, start,
                                                          end, na_filter, 1,
                                                          na_hashset, na_flist)
@@ -1104,7 +1104,7 @@ cdef class TextReader:
                     dt, i, start, end, na_filter, 0, na_hashset, na_flist)
             except OverflowError:
                 col_res, na_count = self._convert_with_dtype(
-                    '|O8', i, start, end, na_filter, 0, na_hashset, na_flist)
+                    np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist)
 
             if col_res is not None:
                 break
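The hunks above replace string-level dtype munging (``'|O8'``, ``dtype[1] == 'i'``, ...) with real dtype objects, so the conversion code can dispatch through pandas' own introspection helpers. Roughly what that normalization does with user input, an illustrative sketch using the same internal module paths this patch imports from:

```python
import numpy as np
from pandas.types.common import (pandas_dtype, is_categorical_dtype,
                                 is_integer_dtype)

pandas_dtype('int64')     # -> dtype('int64'), a real numpy dtype object
pandas_dtype('category')  # -> CategoricalDtype(), not a numpy dtype

is_integer_dtype(np.dtype('int64'))  # True
is_categorical_dtype('category')     # True; the string form also matches
```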
@@ -1136,90 +1136,88 @@ cdef class TextReader:
                           bint user_dtype,
                           kh_str_t *na_hashset,
                           object na_flist):
-        if dtype[1] == 'i' or dtype[1] == 'u':
-            result, na_count = _try_int64(self.parser, i, start, end,
-                                          na_filter, na_hashset)
+        if is_integer_dtype(dtype):
+            result, na_count = _try_int64(self.parser, i, start, end, na_filter,
+                                          na_hashset)
             if user_dtype and na_count is not None:
                 if na_count > 0:
                     raise ValueError("Integer column has NA values in "
-                                     "column {column}".format(column=i))
+                                     "column {column}".format(column=i))
 
-            if result is not None and dtype[1:] != 'i8':
+            if result is not None and dtype != 'int64':
                 result = result.astype(dtype)
 
             return result, na_count
 
-        elif dtype[1] == 'f':
+        elif is_float_dtype(dtype):
             result, na_count = _try_double(self.parser, i, start, end,
                                            na_filter, na_hashset, na_flist)
 
-            if result is not None and dtype[1:] != 'f8':
+            if result is not None and dtype != 'float64':
                 result = result.astype(dtype)
             return result, na_count
 
-        elif dtype[1] == 'b':
+        elif is_bool_dtype(dtype):
             result, na_count = _try_bool_flex(self.parser, i, start, end,
                                               na_filter, na_hashset,
                                               self.true_set, self.false_set)
             return result, na_count
-        elif dtype[1] == 'c':
-            raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
-
-        elif dtype[1] == 'S':
+        elif dtype.kind == 'S':
             # TODO: na handling
-            width = int(dtype[2:])
+            width = dtype.itemsize
             if width > 0:
                 result = _to_fw_string(self.parser, i, start, end, width)
                 return result, 0
 
             # treat as a regular string parsing
             return self._string_convert(i, start, end, na_filter,
-                                        na_hashset)
-        elif dtype[1] == 'U':
-            width = int(dtype[2:])
+                                        na_hashset)
+        elif dtype.kind == 'U':
+            width = dtype.itemsize
             if width > 0:
-                raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
+                raise TypeError("the dtype %s is not supported for parsing" % dtype)
 
             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
-
-
-        elif dtype[1] == 'O':
+        elif is_categorical_dtype(dtype):
+            codes, cats, na_count = _categorical_convert(self.parser, i, start,
+                                                         end, na_filter, na_hashset,
+                                                         self.c_encoding)
+            # sort categories and recode if necessary
+            cats = Index(cats)
+            if not cats.is_monotonic_increasing:
+                unsorted = cats.copy()
+                cats = cats.sort_values()
+                indexer = cats.get_indexer(unsorted)
+                codes = take_1d(indexer, codes, fill_value=-1)
+
+            return Categorical(codes, categories=cats, ordered=False,
+                               fastpath=True), na_count
+        elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
+        elif is_datetime64_dtype(dtype):
+            raise TypeError("the dtype %s is not supported for parsing, "
+                            "pass this column using parse_dates instead" % dtype)
         else:
-            if dtype[1] == 'M':
-                raise TypeError("the dtype %s is not supported for parsing, "
-                                "pass this column using parse_dates instead" % dtype)
             raise TypeError("the dtype %s is not supported for parsing" % dtype)
 
     cdef _string_convert(self, Py_ssize_t i, int start, int end,
                          bint na_filter, kh_str_t *na_hashset):
-        if PY3:
-            if self.c_encoding != NULL:
-                if self.c_encoding == b"utf-8":
-                    return _string_box_utf8(self.parser, i, start, end,
-                                            na_filter, na_hashset)
-                else:
-                    return _string_box_decode(self.parser, i, start, end,
-                                              na_filter, na_hashset,
-                                              self.c_encoding)
-            else:
-                return _string_box_utf8(self.parser, i, start, end,
-                                        na_filter, na_hashset)
-        else:
-            if self.c_encoding != NULL:
-                if self.c_encoding == b"utf-8":
-                    return _string_box_utf8(self.parser, i, start, end,
-                                            na_filter, na_hashset)
-                else:
-                    return _string_box_decode(self.parser, i, start, end,
-                                              na_filter, na_hashset,
-                                              self.c_encoding)
-            else:
-                return _string_box_factorize(self.parser, i, start, end,
-                                             na_filter, na_hashset)
+
+        cdef StringPath path = _string_path(self.c_encoding)
+
+        if path == UTF8:
+            return _string_box_utf8(self.parser, i, start, end, na_filter,
+                                    na_hashset)
+        elif path == ENCODED:
+            return _string_box_decode(self.parser, i, start, end,
+                                      na_filter, na_hashset, self.c_encoding)
+        elif path == CSTRING:
+            return _string_box_factorize(self.parser, i, start, end,
+                                         na_filter, na_hashset)
+
 
     def _get_converter(self, i, name):
         if self.converters is None:
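The category-sorting dance in the ``is_categorical_dtype`` branch above is easier to see in pure Python. A sketch with hand-rolled inputs mirroring what the tokenizer hands back (categories in first-seen order, using the same 0.19-era helpers the patch imports):

```python
import numpy as np
import pandas as pd
from pandas.core.algorithms import take_1d

# tokenizer output: categories in first-seen order, codes index into them
cats = pd.Index(['b', 'a'])
codes = np.array([0, 0, 1], dtype=np.int64)   # i.e. ['b', 'b', 'a']

if not cats.is_monotonic_increasing:
    unsorted = cats.copy()
    cats = cats.sort_values()                       # Index(['a', 'b'])
    indexer = cats.get_indexer(unsorted)            # old code -> new code
    codes = take_1d(indexer, codes, fill_value=-1)  # NA codes (-1) pass through

result = pd.Categorical(codes, categories=cats, ordered=False, fastpath=True)
# [b, b, a] with categories [a, b] -- same values, sorted categories
```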
@@ -1331,6 +1329,19 @@ def _maybe_upcast(arr):
 
     return arr
 
+cdef enum StringPath:
+    CSTRING
+    UTF8
+    ENCODED
+
+# factored out logic to pick string converter
+cdef inline StringPath _string_path(char *encoding):
+    if encoding != NULL and encoding != b"utf-8":
+        return ENCODED
+    elif PY3 or encoding != NULL:
+        return UTF8
+    else:
+        return CSTRING
 
 # ----------------------------------------------------------------------
 # Type conversions / inference support code
@@ -1500,6 +1511,77 @@ cdef _string_box_decode(parser_t *parser, int col,
 
     return result, na_count
 
+
+@cython.boundscheck(False)
+cdef _categorical_convert(parser_t *parser, int col,
+                          int line_start, int line_end,
+                          bint na_filter, kh_str_t *na_hashset,
+                          char *encoding):
+    "Convert column data into codes, categories"
+    cdef:
+        int error, na_count = 0
+        Py_ssize_t i, size
+        size_t lines
+        coliter_t it
+        const char *word = NULL
+
+        int64_t NA = -1
+        int64_t[:] codes
+        int64_t current_category = 0
+
+        char *errors = "strict"
+        cdef StringPath path = _string_path(encoding)
+
+        int ret = 0
+        kh_str_t *table
+        khiter_t k
+
+    lines = line_end - line_start
+    codes = np.empty(lines, dtype=np.int64)
+
+    # factorize parsed values, creating a hash table
+    # bytes -> category code
+    with nogil:
+        table = kh_init_str()
+        coliter_setup(&it, parser, col, line_start)
+
+        for i in range(lines):
+            COLITER_NEXT(it, word)
+
+            if na_filter:
+                k = kh_get_str(na_hashset, word)
+                # is in NA values
+                if k != na_hashset.n_buckets:
+                    na_count += 1
+                    codes[i] = NA
+                    continue
+
+            k = kh_get_str(table, word)
+            # not in the hash table
+            if k == table.n_buckets:
+                k = kh_put_str(table, word, &ret)
+                table.vals[k] = current_category
+                current_category += 1
+
+            codes[i] = table.vals[k]
+
+    # parse and box categories to python strings
+    result = np.empty(table.n_occupied, dtype=np.object_)
+    if path == ENCODED:
+        for k in range(table.n_buckets):
+            if kh_exist_str(table, k):
+                size = strlen(table.keys[k])
+                result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size,
+                                                         encoding, errors)
+    elif path == UTF8:
+        for k in range(table.n_buckets):
+            if kh_exist_str(table, k):
+                result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
+    elif path == CSTRING:
+        for k in range(table.n_buckets):
+            if kh_exist_str(table, k):
+                result[table.vals[k]] = PyBytes_FromString(table.keys[k])
+
+    kh_destroy_str(table)
+    return np.asarray(codes), result, na_count
 
 cdef _to_fw_string(parser_t *parser, int col, int line_start,
                    int line_end, size_t width):
@@ -1719,6 +1801,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l
         const char *word = NULL
         khiter_t k
 
     na_count[0] = 0
+
     coliter_setup(&it, parser, col, line_start)
 
     if na_filter:
@@ -1836,6 +1919,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
 
     return 0
 
+
 cdef kh_str_t* kset_from_list(list values) except NULL:
     # caller takes responsibility for freeing the hash table
     cdef:
@@ -1924,7 +2008,11 @@ def _concatenate_chunks(list chunks):
             common_type = np.find_common_type(dtypes, [])
             if common_type == np.object:
                 warning_columns.append(str(name))
-            result[name] = np.concatenate(arrs)
+
+            if is_categorical_dtype(dtypes.pop()):
+                result[name] = union_categoricals(arrs, sort_categories=True)
+            else:
+                result[name] = np.concatenate(arrs)
 
     if warning_columns:
         warning_names = ','.join(warning_columns)
diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index 225ba533161b3..e3cc60e2856c2 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -850,6 +850,9 @@ def test_union_categorical(self):
             ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
             ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
 
+            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
+             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),
+
             (pd.date_range('2014-01-01', '2014-01-05'),
              pd.date_range('2014-01-06', '2014-01-07'),
              pd.date_range('2014-01-01', '2014-01-07')),
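For reference, the behavior the new ``test_union_categorical`` case exercises: missing values ride along as code ``-1`` rather than becoming a category of their own. An illustrative session, assuming the 0.19-era internal import path:

```python
import numpy as np
import pandas as pd
from pandas.types.concat import union_categoricals

a = pd.Categorical(['b', 'b', np.nan, 'a'])
b = pd.Categorical(['a', np.nan, 'c'])

result = union_categoricals([a, b])
# values: ['b', 'b', NaN, 'a', 'a', NaN, 'c']
# NaN is never added to the categories themselves:
list(result.categories)  # ['a', 'b', 'c']
```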