Skip to content

Commit 3b1c5b7

Browse files
committed
ENH: extensively refactor BlockJoinOperation to support n > 2, Concatenator class to orchestrate concatenations, #273, #479
1 parent eef27e6 commit 3b1c5b7

File tree

11 files changed

+547
-413
lines changed

11 files changed

+547
-413
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ pandas 0.7.0
8989
5-10x in most typical use cases (GH #374)
9090
- Some performance enhancements in constructing a Panel from a dict of
9191
DataFrame objects
92+
- Made ``Index._get_duplicates`` a public method by removing the underscore
9293

9394
**Bug fixes**
9495

pandas/core/frame.py

+17-102
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import numpy.ma as ma
2424

2525
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
26-
_default_index, _stringify, _maybe_upcast)
26+
_default_index, _stringify)
2727
from pandas.core.daterange import DateRange
2828
from pandas.core.generic import NDFrame
2929
from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index
@@ -1638,7 +1638,8 @@ def reindex_like(self, other, method=None, copy=True):
16381638

16391639
truncate = generic.truncate
16401640

1641-
def set_index(self, col_or_cols, drop=True, inplace=False):
1641+
def set_index(self, col_or_cols, drop=True, inplace=False,
1642+
verify_integrity=True):
16421643
"""
16431644
Set the DataFrame index (row labels) using one or more existing
16441645
columns. By default yields a new object.
@@ -1650,6 +1651,10 @@ def set_index(self, col_or_cols, drop=True, inplace=False):
16501651
Delete columns to be used as the new index
16511652
inplace : boolean, default False
16521653
Modify the DataFrame in place (do not create a new object)
1654+
verify_integrity : boolean, default True
1655+
Check the new index for duplicates. Otherwise defer the check until
1656+
necessary. Setting to False will improve the performance of this
1657+
method
16531658
16541659
Returns
16551660
-------
@@ -1674,8 +1679,8 @@ def set_index(self, col_or_cols, drop=True, inplace=False):
16741679

16751680
index = MultiIndex.from_arrays(arrays, names=cols)
16761681

1677-
if not index._verify_integrity():
1678-
duplicates = index._get_duplicates()
1682+
if verify_integrity and not index._verify_integrity():
1683+
duplicates = index.get_duplicates()
16791684
raise Exception('Index has duplicate keys: %s' % duplicates)
16801685

16811686
# clear up memory usage
@@ -2738,60 +2743,13 @@ def append(self, other, ignore_index=False):
27382743
if not self:
27392744
return other.copy()
27402745

2741-
if ignore_index:
2742-
new_index = None
2746+
from pandas.tools.merge import concat
2747+
if isinstance(other, list):
2748+
to_concat = [self] + other
27432749
else:
2744-
new_index = self.index.append(other.index)
2745-
assert(new_index._verify_integrity())
2746-
2747-
if self.columns.equals(other.columns):
2748-
return self._append_same_columns(other, new_index)
2749-
else:
2750-
return self._append_different_columns(other, new_index)
2751-
2752-
def _append_different_columns(self, other, new_index):
2753-
indexer = self.columns.get_indexer(other.columns)
2754-
2755-
if not (indexer == -1).any():
2756-
new_columns = self.columns
2757-
else:
2758-
new_columns = self.columns.union(other.columns)
2759-
2760-
new_data = self._append_column_by_column(other)
2761-
return self._constructor(data=new_data, index=new_index,
2762-
columns=new_columns)
2763-
2764-
def _append_same_columns(self, other, new_index):
2765-
if self._is_mixed_type:
2766-
new_data = self._append_column_by_column(other)
2767-
else:
2768-
new_data = np.concatenate((self.values, other.values), axis=0)
2769-
return self._constructor(new_data, index=new_index,
2770-
columns=self.columns)
2771-
2772-
def _append_column_by_column(self, other):
2773-
def _concat_missing(values, n):
2774-
values = _maybe_upcast(values)
2775-
missing_values = np.empty(n, dtype=values.dtype)
2776-
missing_values.fill(np.nan)
2777-
return values, missing_values
2778-
2779-
new_data = {}
2780-
for col in self:
2781-
values = self._get_raw_column(col)
2782-
if col in other:
2783-
other_values = other._get_raw_column(col)
2784-
else:
2785-
values, other_values = _concat_missing(values, len(other))
2786-
new_data[col] = np.concatenate((values, other_values))
2787-
2788-
for col in other:
2789-
values = other._get_raw_column(col)
2790-
if col not in self:
2791-
values, missing_values = _concat_missing(values, len(self))
2792-
new_data[col] = np.concatenate((missing_values, values))
2793-
2794-
return new_data
2750+
to_concat = [self, other]
2751+
return concat(to_concat, ignore_index=ignore_index,
2752+
verify_integrity=True)
27952753

27962754
def _get_raw_column(self, col):
27972755
return self._data.get(col)
@@ -3618,6 +3576,8 @@ def factor_agg(factor, vec, func):
36183576

36193577

36203578
def extract_index(data):
3579+
from pandas.core.index import _union_indexes
3580+
36213581
index = None
36223582
if len(data) == 0:
36233583
index = NULL_INDEX
@@ -3663,51 +3623,6 @@ def extract_index(data):
36633623
return _ensure_index(index)
36643624

36653625

3666-
def _union_indexes(indexes):
3667-
if len(indexes) == 0:
3668-
return Index([])
3669-
3670-
if len(indexes) == 1:
3671-
result = indexes[0]
3672-
if isinstance(result, list):
3673-
result = Index(sorted(result))
3674-
return result
3675-
3676-
indexes, kind = _sanitize_and_check(indexes)
3677-
3678-
if kind == 'special':
3679-
result = indexes[0]
3680-
for other in indexes[1:]:
3681-
result = result.union(other)
3682-
return result
3683-
elif kind == 'array':
3684-
index = indexes[0]
3685-
for other in indexes[1:]:
3686-
if not index.equals(other):
3687-
return Index(lib.fast_unique_multiple(indexes))
3688-
3689-
return index
3690-
else:
3691-
return Index(lib.fast_unique_multiple_list(indexes))
3692-
3693-
3694-
def _sanitize_and_check(indexes):
3695-
kinds = list(set([type(index) for index in indexes]))
3696-
3697-
if list in kinds:
3698-
if len(kinds) > 1:
3699-
indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x
3700-
for x in indexes]
3701-
kinds.remove(list)
3702-
else:
3703-
return indexes, 'list'
3704-
3705-
3706-
if len(kinds) > 1 or Index not in kinds:
3707-
return indexes, 'special'
3708-
else:
3709-
return indexes, 'array'
3710-
37113626

37123627
def _check_data_types(data):
37133628
have_raw_arrays = False

pandas/core/index.py

+73-11
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import numpy as np
77

8-
from pandas.core.common import (adjoin as _adjoin, _stringify,
8+
from pandas.core.common import (adjoin as _adjoin, _stringify, _try_sort,
99
_is_bool_indexer, _asarray_tuplesafe)
1010
from pandas.util.decorators import cache_readonly
1111
import pandas.core.common as com
@@ -119,6 +119,15 @@ def is_monotonic(self):
119119
except TypeError:
120120
return False
121121

122+
def get_duplicates(self):
    """
    Return the labels that occur more than once in this index.

    Returns
    -------
    duplicates : list
        Sorted list of the entries of ``self.values`` that appear at least
        twice; empty when every label is unique.
    """
    from collections import defaultdict

    counts = defaultdict(int)
    for label in self.values:
        counts[label] += 1
    # ``items`` instead of ``iteritems``: the latter does not exist on
    # Python 3 dicts, whereas ``items`` is available on both major versions.
    return sorted(label for label, count in counts.items() if count > 1)
128+
129+
_get_duplicates = get_duplicates
130+
122131
@property
123132
def indexMap(self):
124133
"{label -> location}"
@@ -143,13 +152,6 @@ def _get_level_number(self, level):
143152
def _verify_integrity(self):
144153
return self._engine.has_integrity
145154

146-
def _get_duplicates(self):
147-
from collections import defaultdict
148-
counter = defaultdict(lambda: 0)
149-
for k in self.values:
150-
counter[k] += 1
151-
return sorted(k for k, v in counter.iteritems() if v > 1)
152-
153155
_allDates = None
154156
def is_all_dates(self):
155157
"""
@@ -1261,9 +1263,6 @@ def append(self, other):
12611263
appended : Index
12621264
"""
12631265
if isinstance(other, (list, tuple)):
1264-
for k in other:
1265-
assert(isinstance(k, MultiIndex))
1266-
12671266
to_concat = (self.values,) + tuple(k.values for k in other)
12681267
else:
12691268
to_concat = self.values, other.values
@@ -1871,3 +1870,66 @@ def _ensure_index(index_like):
18711870
def _validate_join_method(method):
18721871
if method not in ['left', 'right', 'inner', 'outer']:
18731872
raise Exception('do not recognize join method %s' % method)
1873+
1874+
# TODO: handle index names!
1875+
1876+
def _get_combined_index(indexes, intersect=False):
    """
    Combine several indexes into one.

    Parameters
    ----------
    indexes : sequence of index objects
        Reduced to the distinct objects via ``_get_distinct_indexes``
        before combining.
    intersect : boolean, default False
        If True, chain ``intersection`` calls across the distinct indexes;
        otherwise build their union with ``_union_indexes``.

    Returns
    -------
    index
        The sole distinct index when there is only one, otherwise the
        intersection or an ``Index`` wrapping the union.
    """
    distinct = _get_distinct_indexes(indexes)

    # A single distinct object needs no combining at all.
    if len(distinct) == 1:
        return distinct[0]

    if not intersect:
        return Index(_union_indexes(distinct))

    combined = distinct[0]
    for candidate in distinct[1:]:
        combined = combined.intersection(candidate)
    return combined
1887+
1888+
def _get_distinct_indexes(indexes):
1889+
return dict((id(x), x) for x in indexes).values()
1890+
1891+
1892+
def _union_indexes(indexes):
    """
    Build the union of a sequence of indexes (or plain lists).

    Empty input gives ``Index([])``.  A single input is returned as-is,
    except that a plain list is sorted and wrapped in an ``Index``.  For
    two or more inputs the strategy depends on the kind reported by
    ``_sanitize_and_check``:

    * 'special' -- chain ``union`` calls from the first index onwards
    * 'array'   -- return the first index if all are equal to it, else
      ``Index(lib.fast_unique_multiple(indexes))``
    * otherwise -- ``Index(lib.fast_unique_multiple_list(indexes))``
    """
    count = len(indexes)
    if count == 0:
        return Index([])

    if count == 1:
        only = indexes[0]
        if isinstance(only, list):
            only = Index(sorted(only))
        return only

    indexes, kind = _sanitize_and_check(indexes)

    if kind == 'special':
        merged = indexes[0]
        for other in indexes[1:]:
            merged = merged.union(other)
        return merged

    if kind != 'array':
        return Index(lib.fast_unique_multiple_list(indexes))

    first = indexes[0]
    # ``all`` stops at the first mismatch, like the explicit loop it replaces.
    if all(first.equals(other) for other in indexes[1:]):
        return first
    return Index(lib.fast_unique_multiple(indexes))
1918+
1919+
1920+
def _sanitize_and_check(indexes):
    """
    Classify a sequence of indexes and normalise mixed list/Index input.

    Returns
    -------
    (indexes, kind) : tuple
        ``kind`` is 'list' when every element is a plain list (input is
        returned untouched), 'array' when the only type present is exactly
        ``Index``, and 'special' in every other case.  When plain lists are
        mixed with other types, each non-Index element is replaced by
        ``Index(_try_sort(element))`` in the returned sequence.
    """
    kinds = list(set(type(obj) for obj in indexes))

    if list in kinds:
        if len(kinds) == 1:
            # Nothing but plain lists: let the caller take the list path.
            return indexes, 'list'

        converted = []
        for obj in indexes:
            if isinstance(obj, Index):
                converted.append(obj)
            else:
                converted.append(Index(_try_sort(obj)))
        indexes = converted
        kinds.remove(list)

    if len(kinds) == 1 and Index in kinds:
        return indexes, 'array'
    return indexes, 'special'

pandas/core/panel.py

+8-41
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99

1010
from pandas.core.common import (PandasError, _mut_exclusive,
1111
_try_sort, _default_index, _infer_dtype)
12-
from pandas.core.index import Factor, Index, MultiIndex, _ensure_index
12+
from pandas.core.index import (Factor, Index, MultiIndex, _ensure_index,
13+
_get_combined_index, _union_indexes)
1314
from pandas.core.indexing import _NDFrameIndexer
1415
from pandas.core.internals import BlockManager, make_block, form_blocks
15-
from pandas.core.frame import DataFrame, _union_indexes
16+
from pandas.core.frame import DataFrame
1617
from pandas.core.generic import NDFrame
1718
from pandas.util import py3compat
1819
from pandas.util.decorators import deprecate
@@ -1152,52 +1153,18 @@ def _homogenize_dict(frames, intersect=True, dtype=None):
11521153
else:
11531154
adj_frames[k] = v
11541155

1155-
index = _get_combined_index(adj_frames, intersect=intersect)
1156-
columns = _get_combined_columns(adj_frames, intersect=intersect)
1156+
all_indexes = [df.index for df in adj_frames.values()]
1157+
all_columns = [df.columns for df in adj_frames.values()]
1158+
1159+
index = _get_combined_index(all_indexes, intersect=intersect)
1160+
columns = _get_combined_index(all_columns, intersect=intersect)
11571161

11581162
for key, frame in adj_frames.iteritems():
11591163
result[key] = frame.reindex(index=index, columns=columns,
11601164
copy=False)
11611165

11621166
return result, index, columns
11631167

1164-
def _get_combined_columns(frames, intersect=False):
1165-
columns = None
1166-
1167-
if intersect:
1168-
combine = set.intersection
1169-
else:
1170-
combine = set.union
1171-
1172-
for _, frame in frames.iteritems():
1173-
this_cols = set(frame.columns)
1174-
1175-
if columns is None:
1176-
columns = this_cols
1177-
else:
1178-
columns = combine(columns, this_cols)
1179-
1180-
return Index(sorted(columns))
1181-
1182-
def _get_combined_index(frames, intersect=False):
1183-
from pandas.core.frame import _union_indexes
1184-
1185-
indexes = _get_distinct_indexes([df.index for df in frames.values()])
1186-
if len(indexes) == 1:
1187-
return indexes[0]
1188-
if intersect:
1189-
index = indexes[0]
1190-
for other in indexes[1:]:
1191-
index = index.intersection(other)
1192-
return index
1193-
union = _union_indexes(indexes)
1194-
return Index(union)
1195-
1196-
def _get_distinct_indexes(indexes):
1197-
from itertools import groupby
1198-
indexes = sorted(indexes, key=id)
1199-
return [gp.next() for _, gp in groupby(indexes, id)]
1200-
12011168
def _monotonic(arr):
12021169
return not (arr[1:] < arr[:-1]).any()
12031170

pandas/io/parsers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ def get_chunk(self, rows=None):
413413
index = Index(np.arange(len(content)))
414414

415415
if not index._verify_integrity():
416-
dups = index._get_duplicates()
416+
dups = index.get_duplicates()
417417
raise Exception('Index has duplicates: %s' % str(dups))
418418

419419
if len(self.columns) != len(zipped_content):

pandas/sparse/panel.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ def minor_xs(self, key):
426426
SparseWidePanel = SparsePanel
427427

428428
def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
429-
from pandas.core.panel import _get_combined_index, _get_combined_columns
429+
from pandas.core.panel import _get_combined_index
430430
output = {}
431431
for item, df in frames.iteritems():
432432
if not isinstance(df, SparseDataFrame):
@@ -436,9 +436,11 @@ def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
436436
output[item] = df
437437

438438
if index is None:
439-
index = _get_combined_index(output)
439+
all_indexes = [df.index for df in output.values()]
440+
index = _get_combined_index(all_indexes)
440441
if columns is None:
441-
columns = _get_combined_columns(output)
442+
all_columns = [df.columns for df in output.values()]
443+
columns = _get_combined_index(all_columns)
442444

443445
index = _ensure_index(index)
444446
columns = _ensure_index(columns)

0 commit comments

Comments
 (0)