Skip to content

Commit b1ca192

Browse files
committed
ENH: index Cython method refactoring, left_join bugfix, sparse bugfix
1 parent cc722a8 commit b1ca192

File tree

5 files changed: 86 additions and 117 deletions.

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ pandas 0.6.1
3838
is about 3x faster than df[column][row] by handling fewer cases (GH #437,
3939
#438). Add similar methods to sparse data structures for compatibility
4040
- Add Qt table widget to sandbox (PR #435)
41+
- DataFrame.align can accept Series arguments, add axis keyword (GH #461)
4142

4243
**Improvements to existing features**
4344

pandas/core/index.py

Lines changed: 64 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,17 @@ class Index(np.ndarray):
4040
----
4141
An Index instance can **only** contain hashable objects
4242
"""
43+
# Cython methods
4344
_map_indices = lib.map_indices_object
4445
_is_monotonic = lib.is_monotonic_object
4546
_groupby = lib.groupby_object
4647
_arrmap = lib.arrmap_object
48+
_left_indexer = lib.left_join_indexer_object
49+
_inner_indexer = lib.inner_join_indexer_object
50+
_outer_indexer = lib.outer_join_indexer_object
51+
_merge_indexer = lib.merge_indexer_object
52+
_pad = lib.pad_object
53+
_backfill = lib.backfill_object
4754

4855
name = None
4956
def __new__(cls, data, dtype=None, copy=False, name=None):
@@ -100,7 +107,11 @@ def values(self):
100107

101108
@cache_readonly
102109
def is_monotonic(self):
103-
return self._is_monotonic(self.values)
110+
try:
111+
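# NOTE(review): original comment says a wrong buffer type raises ValueError, but the handler below catches TypeError — confirm which exception the Cython routine raises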
112+
return self._is_monotonic(self.values)
113+
except TypeError:
114+
return False
104115

105116
@property
106117
def indexMap(self):
@@ -331,12 +342,14 @@ def union(self, other):
331342
if len(self) == 0:
332343
return _ensure_index(other)
333344

334-
if self.is_monotonic and other.is_monotonic:
335-
if other.dtype != np.object_:
336-
other = Index(other, dtype=object)
345+
if self.dtype != other.dtype:
346+
this = self.astype('O')
347+
other = other.astype('O')
348+
return this.union(other)
337349

350+
if self.is_monotonic and other.is_monotonic:
338351
try:
339-
result = lib.outer_join_indexer_object(self, other.values)[0]
352+
result = self._outer_indexer(self, other.values)[0]
340353
except TypeError:
341354
# incomparable objects
342355
result = list(self.values)
@@ -385,17 +398,19 @@ def intersection(self, other):
385398
if not hasattr(other, '__iter__'):
386399
raise Exception('Input must be iterable!')
387400

401+
other = _ensure_index(other)
402+
388403
if self.equals(other):
389404
return self
390405

391-
other = _ensure_index(other)
392-
393-
if other.dtype != np.object_:
394-
other = other.astype(object)
406+
if self.dtype != other.dtype:
407+
this = self.astype('O')
408+
other = other.astype('O')
409+
return this.intersection(other)
395410

396411
if self.is_monotonic and other.is_monotonic:
397-
return Index(lib.inner_join_indexer_object(self,
398-
other.values)[0])
412+
result = self._inner_indexer(self, other.values)[0]
413+
return self._wrap_union_result(other, result)
399414
else:
400415
indexer = self.get_indexer(other.values)
401416
indexer = indexer.take((indexer != -1).nonzero()[0])
@@ -484,20 +499,28 @@ def get_indexer(self, target, method=None):
484499
target = _ensure_index(target)
485500

486501
if self.dtype != target.dtype:
502+
this = Index(self, dtype=object)
487503
target = Index(target, dtype=object)
504+
return this.get_indexer(target, method=method)
505+
# if self.dtype != target.dtype:
506+
# target = Index(target, dtype=object)
488507

489508
if method == 'pad':
490-
indexer = lib.pad_object(self, target, self.indexMap,
491-
target.indexMap)
509+
indexer = self._pad(self, target, self.indexMap, target.indexMap)
492510
elif method == 'backfill':
493-
indexer = lib.backfill_object(self, target, self.indexMap,
494-
target.indexMap)
511+
indexer = self._backfill(self, target, self.indexMap, target.indexMap)
495512
elif method is None:
496-
indexer = lib.merge_indexer_object(target, self.indexMap)
513+
indexer = self._get_indexer_standard(target)
497514
else:
498515
raise ValueError('unrecognized method: %s' % method)
499516
return indexer
500517

518+
def _get_indexer_standard(self, other):
519+
if self.is_monotonic and other.is_monotonic:
520+
return self._left_indexer(other, self)
521+
else:
522+
return self._merge_indexer(other, self.indexMap)
523+
501524
def groupby(self, to_groupby):
502525
return self._groupby(self.values, to_groupby)
503526

@@ -528,6 +551,11 @@ def reindex(self, target, method=None):
528551
return target, indexer
529552

530553
def join(self, other, how='left', return_indexers=False):
554+
if self.dtype != other.dtype:
555+
this = self.astype('O')
556+
other = other.astype('O')
557+
return this.join(other, how=how, return_indexers=return_indexers)
558+
531559
if self.is_monotonic and other.is_monotonic:
532560
return self._join_monotonic(other, how=how,
533561
return_indexers=return_indexers)
@@ -559,25 +587,23 @@ def join(self, other, how='left', return_indexers=False):
559587
def _join_monotonic(self, other, how='left', return_indexers=False):
560588
this_vals = self.values
561589

562-
if self.dtype != other.dtype:
563-
other = Index(other, dtype=object)
590+
# if self.dtype != other.dtype:
591+
# other = Index(other, dtype=object)
564592
other_vals = other.values
565593

566594
if how == 'left':
567595
join_index = self
568596
lidx = None
569-
ridx = lib.left_join_indexer_object(self, other)
597+
ridx = self._left_indexer(self, other)
570598
elif how == 'right':
571599
join_index = other
572-
lidx = lib.left_join_indexer_object(other, self)
600+
lidx = self._left_indexer(other, self)
573601
ridx = None
574602
elif how == 'inner':
575-
join_index, lidx, ridx = lib.inner_join_indexer_object(this_vals,
576-
other_vals)
603+
join_index, lidx, ridx = self._inner_indexer(this_vals, other_vals)
577604
join_index = self._wrap_joined_index(join_index, other)
578605
elif how == 'outer':
579-
join_index, lidx, ridx = lib.outer_join_indexer_object(this_vals,
580-
other_vals)
606+
join_index, lidx, ridx = self._outer_indexer(this_vals, other_vals)
581607
join_index = self._wrap_joined_index(join_index, other)
582608
else: # pragma: no cover
583609
raise Exception('do not recognize join method %s' % how)
@@ -690,6 +716,12 @@ class Int64Index(Index):
690716
_is_monotonic = lib.is_monotonic_int64
691717
_groupby = lib.groupby_int64
692718
_arrmap = lib.arrmap_int64
719+
_left_indexer = lib.left_join_indexer_int64
720+
_inner_indexer = lib.inner_join_indexer_int64
721+
_outer_indexer = lib.outer_join_indexer_int64
722+
_merge_indexer = lib.merge_indexer_int64
723+
_pad = lib.pad_int64
724+
_backfill = lib.backfill_int64
693725

694726
def __new__(cls, data, dtype=None, copy=False, name=None):
695727
if not isinstance(data, np.ndarray):
@@ -747,87 +779,9 @@ def equals(self, other):
747779

748780
return np.array_equal(self, other)
749781

750-
def get_indexer(self, target, method=None):
751-
target = _ensure_index(target)
752-
753-
if self.dtype != target.dtype:
754-
this = Index(self, dtype=object)
755-
target = Index(target, dtype=object)
756-
return this.get_indexer(target, method=method)
757-
758-
method = self._get_method(method)
759-
760-
if method == 'pad':
761-
indexer = lib.pad_int64(self, target, self.indexMap,
762-
target.indexMap)
763-
elif method == 'backfill':
764-
indexer = lib.backfill_int64(self, target, self.indexMap,
765-
target.indexMap)
766-
elif method is None:
767-
indexer = lib.merge_indexer_int64(target, self.indexMap)
768-
else: # pragma: no cover
769-
raise ValueError('unrecognized method: %s' % method)
770-
return indexer
771-
get_indexer.__doc__ = Index.get_indexer.__doc__
772-
773-
def join(self, other, how='left', return_indexers=False):
774-
if not isinstance(other, Int64Index):
775-
return Index.join(self.astype(object), other, how=how,
776-
return_indexers=return_indexers)
777-
778-
if self.is_monotonic and other.is_monotonic:
779-
return self._join_monotonic(other, how=how,
780-
return_indexers=return_indexers)
781-
else:
782-
return Index.join(self, other, how=how,
783-
return_indexers=return_indexers)
784-
785-
def _join_monotonic(self, other, how='left', return_indexers=False):
786-
if how == 'left':
787-
join_index = self
788-
lidx = None
789-
ridx = lib.left_join_indexer_int64(self, other)
790-
elif how == 'right':
791-
join_index = other
792-
lidx = lib.left_join_indexer_int64(other, self)
793-
ridx = None
794-
elif how == 'inner':
795-
join_index, lidx, ridx = lib.inner_join_indexer_int64(self, other)
796-
join_index = Int64Index(join_index)
797-
elif how == 'outer':
798-
join_index, lidx, ridx = lib.outer_join_indexer_int64(self, other)
799-
join_index = Int64Index(join_index)
800-
else: # pragma: no cover
801-
raise Exception('do not recognize join method %s' % how)
802-
803-
if return_indexers:
804-
return join_index, lidx, ridx
805-
else:
806-
return join_index
807-
808-
def intersection(self, other):
809-
if not isinstance(other, Int64Index):
810-
return Index.intersection(self.astype(object), other)
811-
812-
if self.is_monotonic and other.is_monotonic:
813-
result = lib.inner_join_indexer_int64(self, other)[0]
814-
else:
815-
indexer = self.get_indexer(other)
816-
indexer = indexer.take((indexer != -1).nonzero()[0])
817-
return self.take(indexer)
818-
return Int64Index(result)
819-
intersection.__doc__ = Index.intersection.__doc__
820-
821-
def union(self, other):
822-
if not isinstance(other, Int64Index):
823-
return Index.union(self.astype(object), other)
824-
825-
if self.is_monotonic and other.is_monotonic:
826-
result = lib.outer_join_indexer_int64(self, other)[0]
827-
else:
828-
result = np.unique(np.concatenate((self, other)))
829-
return Int64Index(result)
830-
union.__doc__ = Index.union.__doc__
782+
def _wrap_joined_index(self, joined, other):
783+
name = self.name if self.name == other.name else None
784+
return Int64Index(joined, name=name)
831785

832786
class DateIndex(Index):
833787
pass
@@ -1321,14 +1275,13 @@ def get_indexer(self, target, method=None):
13211275
self_index = self.get_tuple_index()
13221276

13231277
if method == 'pad':
1324-
indexer = lib.pad_object(self_index, target_index,
1325-
self_index.indexMap, target.indexMap)
1278+
indexer = self._pad(self_index, target_index, self_index.indexMap,
1279+
target.indexMap)
13261280
elif method == 'backfill':
1327-
indexer = lib.backfill_object(self_index, target_index,
1328-
self_index.indexMap, target.indexMap)
1281+
indexer = self._backfill(self_index, target_index, self_index.indexMap,
1282+
target.indexMap)
13291283
else:
1330-
indexer = lib.merge_indexer_object(target_index,
1331-
self_index.indexMap)
1284+
indexer = self._merge_indexer(target_index, self_index.indexMap)
13321285

13331286
return indexer
13341287

pandas/core/sparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ def copy(self, deep=True):
871871
"""
872872
Make a copy of this SparseDataFrame
873873
"""
874-
series = self._series.copy()
874+
series = dict((k, v.copy()) for k, v in self.iteritems())
875875
return SparseDataFrame(series, index=self.index, columns=self.columns,
876876
default_fill_value=self.default_fill_value,
877877
default_kind=self.default_kind)

pandas/src/generate_code.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,9 +381,12 @@ def left_join_indexer_%(name)s(ndarray[%(c_type)s] left,
381381
lval = left[i]
382382
rval = right[j]
383383
384-
if lval == right[j]:
384+
if lval == rval:
385385
indexer[i] = j
386386
i += 1
387+
while i < nleft - 1 and left[i] == rval:
388+
indexer[i] = j
389+
i += 1
387390
j += 1
388391
elif lval > rval:
389392
indexer[i] = -1

pandas/src/generated.pyx

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1430,9 +1430,12 @@ def left_join_indexer_float64(ndarray[float64_t] left,
14301430
lval = left[i]
14311431
rval = right[j]
14321432

1433-
if lval == right[j]:
1433+
if lval == rval:
14341434
indexer[i] = j
14351435
i += 1
1436+
while i < nleft - 1 and left[i] == rval:
1437+
indexer[i] = j
1438+
i += 1
14361439
j += 1
14371440
elif lval > rval:
14381441
indexer[i] = -1
@@ -1469,9 +1472,12 @@ def left_join_indexer_object(ndarray[object] left,
14691472
lval = left[i]
14701473
rval = right[j]
14711474

1472-
if lval == right[j]:
1475+
if lval == rval:
14731476
indexer[i] = j
14741477
i += 1
1478+
while i < nleft - 1 and left[i] == rval:
1479+
indexer[i] = j
1480+
i += 1
14751481
j += 1
14761482
elif lval > rval:
14771483
indexer[i] = -1
@@ -1508,9 +1514,12 @@ def left_join_indexer_int32(ndarray[int32_t] left,
15081514
lval = left[i]
15091515
rval = right[j]
15101516

1511-
if lval == right[j]:
1517+
if lval == rval:
15121518
indexer[i] = j
15131519
i += 1
1520+
while i < nleft - 1 and left[i] == rval:
1521+
indexer[i] = j
1522+
i += 1
15141523
j += 1
15151524
elif lval > rval:
15161525
indexer[i] = -1
@@ -1547,9 +1556,12 @@ def left_join_indexer_int64(ndarray[int64_t] left,
15471556
lval = left[i]
15481557
rval = right[j]
15491558

1550-
if lval == right[j]:
1559+
if lval == rval:
15511560
indexer[i] = j
15521561
i += 1
1562+
while i < nleft - 1 and left[i] == rval:
1563+
indexer[i] = j
1564+
i += 1
15531565
j += 1
15541566
elif lval > rval:
15551567
indexer[i] = -1

0 commit comments

Comments
 (0)