Skip to content

Commit a9db003

Browse files
committed
ENH: making some more progress. string factorization conversion
1 parent d63cee8 commit a9db003

File tree

5 files changed

+304
-57
lines changed

5 files changed

+304
-57
lines changed

pandas/io/tests/test_cparser.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,26 @@ def test_file_handle(self):
5454
finally:
5555
f.close()
5656

57-
# def test_StringIO(self):
58-
# text = open(self.csv1, 'rb').read()
57+
def test_file_handle_mmap(self):
    """Smoke test: read csv1 via an open file handle with memory_map=True."""
    # Open *before* entering the try block: if open() itself raises there
    # is no handle to close, and the finally clause must not replace the
    # real error with a NameError on `f`.
    f = open(self.csv1, 'rb')
    try:
        reader = parser.TextReader(f, memory_map=True)
        # Only checks that reading completes without raising.
        reader.read()
    finally:
        f.close()
64+
65+
def test_StringIO(self):
    """Smoke test: parse the contents of csv1 from an in-memory buffer."""
    # Close the handle deterministically instead of relying on garbage
    # collection of the temporary returned by open().
    f = open(self.csv1, 'rb')
    try:
        text = f.read()
    finally:
        f.close()

    reader = parser.TextReader(BytesIO(text))
    # Only checks that reading completes without raising.
    reader.read()
69+
70+
def test_string_factorize(self):
    """Equal strings in a parsed column must share a single boxed object."""
    # should this be optional?
    data = 'a\nb\na\nb\na'
    columns = parser.TextReader(StringIO(data)).read()

    # Only two distinct values occur in the column, so only two distinct
    # Python objects (by identity) may appear in it.
    object_ids = set(map(id, columns[0]))
    self.assert_(len(object_ids) == 2)
5976

60-
# reader = parser.TextReader(BytesIO(text))
61-
# result = reader.read()
6277

6378
if __name__ == '__main__':
6479
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],

pandas/src/khash.h

+2
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,8 @@ KHASH_SET_INIT_PYOBJECT(pyset)
613613

614614
KHASH_MAP_INIT_STR(str, Py_ssize_t)
615615

616+
KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
617+
616618
KHASH_MAP_INIT_INT(int32, Py_ssize_t)
617619
KHASH_MAP_INIT_INT64(int64, Py_ssize_t)
618620
KHASH_MAP_INIT_FLOAT64(float64, Py_ssize_t)

pandas/src/khash.pxd

+20
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ cdef extern from "khash.h":
5555

5656
bint kh_exist_str(kh_str_t*, khiter_t)
5757

58+
5859
ctypedef struct kh_int64_t:
5960
khint_t n_buckets, size, n_occupied, upper_bound
6061
uint32_t *flags
@@ -102,3 +103,22 @@ cdef extern from "khash.h":
102103
inline void kh_del_int32(kh_int32_t*, khint_t)
103104

104105
bint kh_exist_int32(kh_int32_t*, khiter_t)
106+
107+
# sweep factorize
108+
109+
ctypedef struct kh_strbox_t:
110+
khint_t n_buckets, size, n_occupied, upper_bound
111+
uint32_t *flags
112+
kh_cstr_t *keys
113+
PyObject **vals
114+
115+
inline kh_strbox_t* kh_init_strbox()
116+
inline void kh_destroy_strbox(kh_strbox_t*)
117+
inline void kh_clear_strbox(kh_strbox_t*)
118+
inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t)
119+
inline void kh_resize_strbox(kh_strbox_t*, khint_t)
120+
inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*)
121+
inline void kh_del_strbox(kh_strbox_t*, khint_t)
122+
123+
bint kh_exist_strbox(kh_strbox_t*, khiter_t)
124+

pandas/src/parser.pyx

+71-5
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ import numpy as np
33

44
cnp.import_array()
55

6+
from khash cimport *
7+
8+
from cpython cimport PyString_FromString, Py_INCREF, PyString_AsString
9+
610
cdef extern from "Python.h":
711
ctypedef struct FILE
812
FILE* PyFile_AsFile(object)
@@ -143,8 +147,10 @@ cdef class TextReader:
143147

144148
cdef public:
145149
object delimiter, na_values, converters, thousands, delim_whitespace
150+
object memory_map
146151

147-
def __cinit__(self, source, delimiter=',', header=0, memory_map=False,
152+
def __cinit__(self, source, delimiter=',', header=0,
153+
memory_map=False,
148154
chunksize=DEFAULT_CHUNKSIZE,
149155
delim_whitespace=False,
150156
na_values=None,
@@ -173,6 +179,7 @@ cdef class TextReader:
173179
self.delimiter = delimiter
174180
self.delim_whitespace = delim_whitespace
175181

182+
self.memory_map = memory_map
176183
self.na_values = na_values
177184
self.converters = converters
178185
self.thousands = thousands
@@ -185,16 +192,22 @@ cdef class TextReader:
185192
self.file_handle.close()
186193

187194
cdef _setup_parser_source(self, source):
188-
cdef int status
195+
cdef:
196+
int status
189197

190198
if isinstance(source, (basestring, file)):
191199
if isinstance(source, basestring):
192200
source = open(source, 'rb')
193201
self.should_close = True
194202

195203
self.file_handle = source
196-
status = parser_file_source_init(self.parser,
197-
PyFile_AsFile(source))
204+
205+
if self.memory_map:
206+
status = parser_mmap_init(self.parser,
207+
PyFile_AsFile(source))
208+
else:
209+
status = parser_file_source_init(self.parser,
210+
PyFile_AsFile(source))
198211

199212
if status != 0:
200213
raise Exception('Initializing from file failed')
@@ -209,7 +222,8 @@ cdef class TextReader:
209222
raise ValueError('Only ascii/bytes supported at the moment')
210223

211224
status = parser_array_source_init(self.parser,
212-
<char*> bytes, len(bytes))
225+
PyString_AsString(bytes),
226+
len(bytes))
213227
if status != 0:
214228
raise Exception('Initializing parser from file-like '
215229
'object failed')
@@ -248,6 +262,10 @@ cdef class TextReader:
248262
for i in range(ncols):
249263
col_res = _try_double(self.parser, i, 0, self.parser.lines)
250264

265+
if col_res is None:
266+
col_res = _string_box_factorize(self.parser, i,
267+
0, self.parser.lines)
268+
251269
results[i] = col_res
252270

253271
return results
@@ -256,6 +274,54 @@ class CParserError(Exception):
256274
pass
257275

258276

277+
# ----------------------------------------------------------------------
278+
# Type conversions / inference support code
279+
280+
cdef _string_box_factorize(parser_t *parser, int col,
                           int line_start, int line_end):
    """
    Box the tokens of one parsed column as Python strings, creating only
    one string object per distinct token value.

    Parameters
    ----------
    parser : parser_t*
        Parser whose tokenized data is iterated.
    col : int
        Index of the column to convert.
    line_start, line_end : int
        Only their difference is used here, as the number of tokens to
        pull from the column iterator.
        NOTE(review): coliter_setup() is not given line_start, so the
        iterator presumably always starts at the first line -- confirm.

    Returns
    -------
    ndarray of dtype object with ``line_end - line_start`` entries; equal
    tokens refer to the same Python string object.

    Raises
    ------
    MemoryError
        If the hash table cannot be allocated.
    """
    cdef:
        Py_ssize_t i
        size_t lines
        coliter_t it
        char *word
        cnp.ndarray[object] result

        int ret = 0
        kh_strbox_t *table
        # Typed bucket index; left undeclared it would silently become a
        # Python object and every hash lookup would box/unbox an integer.
        khiter_t k

        object pyval

    lines = line_end - line_start
    # Allocate the output first so that a failure here cannot leak the
    # hash table.
    result = np.empty(lines, dtype=np.object_)

    table = kh_init_strbox()
    if table == NULL:
        raise MemoryError()

    try:
        coliter_setup(&it, parser, col)
        for i in range(lines):
            word = COLITER_NEXT(it)

            k = kh_get_strbox(table, word)

            if k != table.n_buckets:
                # Already boxed. Casting PyObject* to <object> makes Cython
                # take its own reference, so `pyval` is an owned reference.
                pyval = <object> table.vals[k]
            else:
                # First occurrence: PyString_FromString returns a new
                # reference, owned by `pyval`.
                pyval = PyString_FromString(word)

                k = kh_put_strbox(table, word, &ret)
                # The table only borrows the pointer. The object stays
                # alive because `result` holds a reference to it for at
                # least as long as the table exists.
                table.vals[k] = <PyObject*> pyval

            result[i] = pyval
    finally:
        # The table is scratch space for this call only; free it on every
        # exit path so repeated reads do not leak it.
        kh_destroy_strbox(table)

    return result
322+
323+
324+
259325
cdef _try_double(parser_t *parser, int col, int line_start, int line_end):
260326
cdef:
261327
int error

0 commit comments

Comments
 (0)