Skip to content

Commit 4d60df8

Browse files
committed
ENH: speed up DataFrame constructor with nested dict, GH #212
1 parent 7531038 commit 4d60df8

File tree

5 files changed

+111
-65
lines changed

5 files changed

+111
-65
lines changed

RELEASE.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,13 @@ pandas 0.5.0
2525
- Added ability to join on multiple columns in `DataFrame.join` (GH #214)
2626
- Added private `_get_duplicates` function to `Index` for identifying
2727
duplicate values more easily
28+
- Added column attribute access to DataFrame, e.g. `df.A` is equivalent to `df['A']`
29+
if 'A' is a column in the DataFrame (PR #213)
2830

2931
**Improvements to existing features**
3032

33+
- Added a fast Cython function for converting tuples to ndarray. Speeds up
34+
many MultiIndex-related operations
3135
- File parsing functions like `read_csv` and `read_table` will explicitly
3236
check if a parsed index has duplicates and raise a more helpful exception
3337
rather than deferring the check until later
@@ -38,6 +42,7 @@ pandas 0.5.0
3842
5x, regression from 0.3.0
3943
- With new `DataFrame.align` method, speeding up binary operations between
4044
differently-indexed DataFrame objects by 10-25%.
45+
- Significantly sped up conversion of nested dict into DataFrame
4146

4247
**Bug fixes**
4348

pandas/core/frame.py

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from pandas.util import py3compat
3434
import pandas.core.common as common
3535
import pandas.core.datetools as datetools
36-
import pandas._tseries as _tseries
36+
import pandas._tseries as lib
3737

3838
#----------------------------------------------------------------------
3939
# Factory helper methods
@@ -861,7 +861,7 @@ def _getitem_single(self, key):
861861
res = Series(values, index=self.index, name=key)
862862
self._series_cache[key] = res
863863
return res
864-
864+
865865
def __getattr__(self, name):
866866
"""After regular attribute access, try looking up the name of a column.
867867
This allows simpler access to columns for interactive use."""
@@ -3125,7 +3125,7 @@ def _get_index(v):
31253125
if isinstance(v, Series):
31263126
return v.index
31273127
elif isinstance(v, dict):
3128-
return Index(_try_sort(v))
3128+
return v.keys()
31293129

31303130
index = None
31313131
if len(data) == 0:
@@ -3155,26 +3155,45 @@ def _get_index(v):
31553155

31563156
def _union_indexes(indexes):
31573157
if len(indexes) == 1:
3158-
index = indexes[0]
3159-
if _any_special_indexes(indexes):
3158+
result = indexes[0]
3159+
if isinstance(result, list):
3160+
result = Index(sorted(result))
3161+
return result
3162+
3163+
indexes, kind = _sanitize_and_check(indexes)
3164+
3165+
if kind == 'special':
31603166
result = indexes[0]
31613167
for other in indexes[1:]:
31623168
result = result.union(other)
31633169
return result
3164-
else:
3170+
elif kind == 'array':
31653171
index = indexes[0]
31663172
for other in indexes[1:]:
31673173
if not index.equals(other):
3168-
return Index(_tseries.fast_unique_multiple(indexes))
3174+
return Index(lib.fast_unique_multiple(indexes))
31693175

31703176
return index
3177+
else:
3178+
return Index(lib.fast_unique_multiple_list(indexes))
3179+
3180+
3181+
def _sanitize_and_check(indexes):
3182+
kinds = list(set([type(index) for index in indexes]))
3183+
3184+
if list in kinds:
3185+
if len(kinds) > 1:
3186+
indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x
3187+
for x in indexes]
3188+
kinds.remove(list)
3189+
else:
3190+
return indexes, 'list'
31713191

31723192

3173-
def _any_special_indexes(indexes):
3174-
for index in indexes:
3175-
if type(index) != Index:
3176-
return True
3177-
return False
3193+
if len(kinds) > 1 or Index not in kinds:
3194+
return indexes, 'special'
3195+
else:
3196+
return indexes, 'array'
31783197

31793198

31803199
def _check_data_types(data):

pandas/core/index.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class Index(np.ndarray):
3939
----
4040
An Index instance can **only** contain hashable objects
4141
"""
42+
name = None
4243
def __new__(cls, data, dtype=None, copy=False, name=None):
4344
if isinstance(data, np.ndarray):
4445
if dtype is None and issubclass(data.dtype.type, np.integer):
@@ -51,12 +52,6 @@ def __new__(cls, data, dtype=None, copy=False, name=None):
5152
# other iterable of some kind
5253
subarr = _asarray_tuplesafe(data, dtype=object)
5354

54-
# if not isinstance(data, (list, tuple)):
55-
# data = list(data)
56-
57-
# subarr = np.empty(len(data), dtype=object)
58-
# subarr[:] = data
59-
6055
subarr = subarr.view(cls)
6156
subarr.name = name
6257
return subarr

pandas/src/groupby.pyx

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -213,27 +213,6 @@ def group_labels2(ndarray[object] values):
213213

214214
return reverse, labels
215215

216-
@cython.wraparound(False)
217-
@cython.boundscheck(False)
218-
def fast_unique(ndarray[object] values):
219-
cdef:
220-
Py_ssize_t i, n = len(values)
221-
list uniques = []
222-
dict table = {}
223-
object val, stub = 0
224-
225-
for i from 0 <= i < n:
226-
val = values[i]
227-
if val not in table:
228-
table[val] = stub
229-
uniques.append(val)
230-
try:
231-
uniques.sort()
232-
except Exception:
233-
pass
234-
235-
return uniques
236-
237216
@cython.wraparound(False)
238217
@cython.boundscheck(False)
239218
def get_unique_labels(ndarray[object] values, dict idMap):
@@ -248,32 +227,6 @@ def get_unique_labels(ndarray[object] values, dict idMap):
248227

249228
return fillVec
250229

251-
@cython.wraparound(False)
252-
@cython.boundscheck(False)
253-
def fast_unique_multiple(list arrays):
254-
cdef:
255-
ndarray[object] buf
256-
Py_ssize_t k = len(arrays)
257-
Py_ssize_t i, j, n
258-
list uniques = []
259-
dict table = {}
260-
object val, stub = 0
261-
262-
for i from 0 <= i < k:
263-
buf = arrays[i]
264-
n = len(buf)
265-
for j from 0 <= j < n:
266-
val = buf[j]
267-
if val not in table:
268-
table[val] = stub
269-
uniques.append(val)
270-
try:
271-
uniques.sort()
272-
except Exception:
273-
pass
274-
275-
return uniques
276-
277230
# from libcpp.set cimport set as stlset
278231

279232
# cdef fast_unique_int32(ndarray arr):

pandas/src/tseries.pyx

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,80 @@ def list_to_object_array(list obj):
274274

275275
return arr
276276

277+
278+
@cython.wraparound(False)
279+
@cython.boundscheck(False)
280+
def fast_unique(ndarray[object] values):
281+
cdef:
282+
Py_ssize_t i, n = len(values)
283+
list uniques = []
284+
dict table = {}
285+
object val, stub = 0
286+
287+
for i from 0 <= i < n:
288+
val = values[i]
289+
if val not in table:
290+
table[val] = stub
291+
uniques.append(val)
292+
try:
293+
uniques.sort()
294+
except Exception:
295+
pass
296+
297+
return uniques
298+
299+
@cython.wraparound(False)
300+
@cython.boundscheck(False)
301+
def fast_unique_multiple(list arrays):
302+
cdef:
303+
ndarray[object] buf
304+
Py_ssize_t k = len(arrays)
305+
Py_ssize_t i, j, n
306+
list uniques = []
307+
dict table = {}
308+
object val, stub = 0
309+
310+
for i from 0 <= i < k:
311+
buf = arrays[i]
312+
n = len(buf)
313+
for j from 0 <= j < n:
314+
val = buf[j]
315+
if val not in table:
316+
table[val] = stub
317+
uniques.append(val)
318+
try:
319+
uniques.sort()
320+
except Exception:
321+
pass
322+
323+
return uniques
324+
325+
@cython.wraparound(False)
326+
@cython.boundscheck(False)
327+
def fast_unique_multiple_list(list lists):
328+
cdef:
329+
list buf
330+
Py_ssize_t k = len(lists)
331+
Py_ssize_t i, j, n
332+
list uniques = []
333+
dict table = {}
334+
object val, stub = 0
335+
336+
for i from 0 <= i < k:
337+
buf = lists[i]
338+
n = len(buf)
339+
for j from 0 <= j < n:
340+
val = buf[j]
341+
if val not in table:
342+
table[val] = stub
343+
uniques.append(val)
344+
try:
345+
uniques.sort()
346+
except Exception:
347+
pass
348+
349+
return uniques
350+
277351
include "skiplist.pyx"
278352
include "groupby.pyx"
279353
include "moments.pyx"

0 commit comments

Comments
 (0)