wip

chris-b1 · chris-b1 · commit 0f0dba63fb4d · 2016-08-04T18:09:23.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -509,7 +509,7 @@ Specifying Categorical dtype
 
 ``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
 
-.. ipython :: python
+.. ipython:: python
 
    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
 
@@ -525,7 +525,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
 .. note::
 
-   The resulting categories will always be parsed as string (object dtype).
+   The resulting categories will always be parsed as strings (object dtype).
    If the categories are numeric they can be converted using the
    :func:`pd.to_numeric` function, or as appropriate, another converter
    such as :func:`pd.to_datetime`.
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -258,7 +258,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
 .. note::
 
-   The resulting categories will always be parsed as string (object dtype).
+   The resulting categories will always be parsed as strings (object dtype).
    If the categories are numeric they can be converted using the
    :func:`pd.to_numeric` function, or as appropriate, another converter
    such as :func:`pd.to_datetime`.
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -225,8 +225,7 @@ def test_categorical_dtype(self):
 1,b,3.4
 2,a,4.5"""
         expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
-                                 'b': Categorical.from_codes([0, 0, 1],
-                                                             ['b', 'a']),
+                                 'b': Categorical(['b', 'b', 'a']),
                                  'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
@@ -237,8 +236,7 @@ def test_categorical_dtype(self):
 1,nan,3.4
 2,a,4.5"""
         expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
-                                 'b': Categorical.from_codes([0, -1, 1],
-                                                             ['b', 'a']),
+                                 'b': Categorical(['b', np.nan, 'a']),
                                  'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
@@ -248,14 +246,14 @@ def test_categorical_dtype_encoding(self):
         pth = tm.get_data_path('unicode_series.csv')
         encoding = 'latin-1'
         expected = self.read_csv(pth, header=None, encoding=encoding)
+        expected[1] = Categorical(expected[1])
         actual = self.read_csv(pth, header=None, encoding=encoding,
                                dtype={1: 'category'})
-        actual[1] = actual[1].astype(object)
         tm.assert_frame_equal(actual, expected)
 
         pth = tm.get_data_path('utf16_ex.txt')
         encoding = 'utf-16'
         expected = self.read_table(pth, encoding=encoding)
+        expected = expected.apply(Categorical)
         actual = self.read_table(pth, encoding=encoding, dtype='category')
-        actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)
@@ -270,7 +268,8 @@ def test_categorical_dtype_chunksize(self):
         expecteds = [pd.DataFrame({'a': [1, 1],
                                    'b': Categorical(['a', 'b'])}),
                      pd.DataFrame({'a': [1, 2],
-                                   'b': Categorical(['b', 'c'])})]
+                                   'b': Categorical(['b', 'c'])},
+                                  index=[2, 3])]
         actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
                                 chunksize=2)
 
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -40,7 +40,9 @@ from pandas.types.common import (is_categorical_dtype, CategoricalDtype,
                                  is_string_dtype, is_datetime64_dtype,
                                  pandas_dtype)
 from pandas.core.categorical import Categorical
+from pandas.core.algorithms import take_1d
 from pandas.types.concat import union_categoricals
+from pandas import Index
 
 import time
 import os
@@ -1182,6 +1184,16 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(self.parser, i, start,
                                                          end, na_filter, na_hashset,
                                                          self.c_encoding)
+            # sort categories and recode if necessary
+            cats = Index(cats)
+            if not cats.is_monotonic_increasing:
+                unsorted = cats.copy()
+                cats = cats.sort_values()
+                # indexer[old_code] -> position of that category in the
+                # sorted categories; NA codes (-1) stay -1 via fill_value
+                indexer = cats.get_indexer(unsorted)
+                codes = take_1d(indexer, codes, fill_value=-1)
+
             return Categorical(codes, categories=cats, ordered=False,
                                fastpath=True), na_count
         elif is_object_dtype(dtype):
@@ -2000,7 +2012,7 @@ def _concatenate_chunks(list chunks):
                 warning_columns.append(str(name))
 
         if is_categorical_dtype(dtypes.pop()):
-            result[name] = union_categoricals(arrs)
+            result[name] = union_categoricals(arrs, sort_categories=True)
         else:
             result[name] = np.concatenate(arrs)