Skip to content

Commit 0f0dba6

Browse files
committed
wip
1 parent da5c5b5 commit 0f0dba6

File tree

4 files changed

+25
-10
lines changed

4 files changed

+25
-10
lines changed

doc/source/io.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ Specifying Categorical dtype
509509

510510
``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
511511

512-
.. ipython :: python
512+
.. ipython:: python
513513
514514
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
515515
@@ -525,7 +525,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
525525
526526
.. note::
527527

528-
The resulting categories will always be parsed as string (object dtype).
528+
The resulting categories will always be parsed as strings (object dtype).
529529
If the categories are numeric they can be converted using the
530530
:func:`pd.to_numeric` function, or as appropriate, another converter
531531
such as :func:`pd.to_datetime`.

doc/source/whatsnew/v0.19.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
258258

259259
.. note::
260260

261-
The resulting categories will always be parsed as string (object dtype).
261+
The resulting categories will always be parsed as strings (object dtype).
262262
If the categories are numeric they can be converted using the
263263
:func:`pd.to_numeric` function, or as appropriate, another converter
264264
such as :func:`pd.to_datetime`.

pandas/io/tests/parser/c_parser_only.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,7 @@ def test_categorical_dtype(self):
225225
1,b,3.4
226226
2,a,4.5"""
227227
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
228-
'b': Categorical.from_codes([0, 0, 1],
229-
['b', 'a']),
228+
'b': Categorical(['b', 'b', 'a']),
230229
'c': Categorical(['3.4', '3.4', '4.5'])})
231230
actual = self.read_csv(StringIO(data), dtype='category')
232231
tm.assert_frame_equal(actual, expected)
@@ -237,8 +236,7 @@ def test_categorical_dtype(self):
237236
1,nan,3.4
238237
2,a,4.5"""
239238
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
240-
'b': Categorical.from_codes([0, -1, 1],
241-
['b', 'a']),
239+
'b': Categorical(['b', np.nan, 'a']),
242240
'c': Categorical(['3.4', '3.4', '4.5'])})
243241
actual = self.read_csv(StringIO(data), dtype='category')
244242
tm.assert_frame_equal(actual, expected)
@@ -248,14 +246,15 @@ def test_categorical_dtype_encoding(self):
248246
pth = tm.get_data_path('unicode_series.csv')
249247
encoding = 'latin-1'
250248
expected = self.read_csv(pth, header=None, encoding=encoding)
249+
expected[1] = Categorical(expected[1])
251250
actual = self.read_csv(pth, header=None, encoding=encoding,
252251
dtype={1: 'category'})
253-
actual[1] = actual[1].astype(object)
254252
tm.assert_frame_equal(actual, expected)
255253

256254
pth = tm.get_data_path('utf16_ex.txt')
257255
encoding = 'utf-16'
258256
expected = self.read_table(pth, encoding=encoding)
257+
expected = expected.apply(Categorical)
259258
actual = self.read_table(pth, encoding=encoding, dtype='category')
260259
actual = actual.apply(lambda x: x.astype(object))
261260
tm.assert_frame_equal(actual, expected)
@@ -270,7 +269,8 @@ def test_categorical_dtype_chunksize(self):
270269
expecteds = [pd.DataFrame({'a': [1, 1],
271270
'b': Categorical(['a', 'b'])}),
272271
pd.DataFrame({'a': [1, 2],
273-
'b': Categorical(['b', 'c'])})]
272+
'b': Categorical(['b', 'c'])},
273+
index=[2, 3])]
274274
actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
275275
chunksize=2)
276276

pandas/parser.pyx

+16-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ from pandas.types.common import (is_categorical_dtype, CategoricalDtype,
4040
is_string_dtype, is_datetime64_dtype,
4141
pandas_dtype)
4242
from pandas.core.categorical import Categorical
43+
from pandas.core.algorithms import take_1d
4344
from pandas.types.concat import union_categoricals
45+
from pandas import Index
4446

4547
import time
4648
import os
@@ -1182,6 +1184,19 @@ cdef class TextReader:
11821184
codes, cats, na_count = _categorical_convert(self.parser, i, start,
11831185
end, na_filter, na_hashset,
11841186
self.c_encoding)
1187+
print cats
1188+
print codes
1189+
# sort categories and recode if necessary
1190+
cats = Index(cats)
1191+
if not cats.is_monotonic_increasing:
1192+
unsorted = cats.copy()
1193+
cats = cats.sort_values()
1194+
indexer = unsorted.get_indexer(cats)
1195+
codes = take_1d(indexer, codes, fill_value=-1)
1196+
print indexer
1197+
print cats
1198+
print codes
1199+
11851200
return Categorical(codes, categories=cats, ordered=False,
11861201
fastpath=True), na_count
11871202
elif is_object_dtype(dtype):
@@ -2000,7 +2015,7 @@ def _concatenate_chunks(list chunks):
20002015
warning_columns.append(str(name))
20012016

20022017
if is_categorical_dtype(dtypes.pop()):
2003-
result[name] = union_categoricals(arrs)
2018+
result[name] = union_categoricals(arrs, sort_categories=True)
20042019
else:
20052020
result[name] = np.concatenate(arrs)
20062021

0 commit comments

Comments
 (0)