Skip to content

Commit b1b85ae

Browse files
committed
BUG: basic DataFrame constructor refactoring to better support duplicate columns. close #2079
1 parent 81661af commit b1b85ae

File tree

8 files changed

+123
-108
lines changed

8 files changed

+123
-108
lines changed

RELEASE.rst

+1
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ pandas 0.9.1
5555

5656
**Bug fixes**
5757

58+
- Fix some duplicate-column DataFrame constructor issues (#2079)
5859
- Fix bar plot color cycle issues (#2082)
5960
- Implement comparisons on date offsets with fixed delta (#2078)
6061
- Handle inf/-inf correctly in read_* parser functions (#2041)

pandas/core/frame.py

+38 -23
Original file line number | Diff line number | Diff line change
@@ -401,15 +401,14 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
401401
index = _get_names_from_index(data)
402402

403403
if isinstance(data[0], (list, tuple, dict, Series)):
404-
conv_data, columns = _to_sdict(data, columns)
405-
if isinstance(conv_data, dict):
406-
if len(conv_data) == 0 and index is None:
407-
index = np.arange(len(data))
408-
mgr = self._init_dict(conv_data, index, columns,
409-
dtype=dtype)
410-
else:
411-
mgr = self._init_ndarray(conv_data, index, columns,
412-
dtype=dtype, copy=copy)
404+
arrays, columns = _to_arrays(data, columns)
405+
406+
columns = _ensure_index(columns)
407+
408+
if index is None:
409+
index = _default_index(len(data))
410+
mgr = self._init_arrays(arrays, columns, index, columns,
411+
dtype=dtype)
413412
else:
414413
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
415414
copy=copy)
@@ -463,13 +462,20 @@ def _init_dict(self, data, index, columns, dtype=None):
463462
index = _ensure_index(index)
464463

465464
# don't force copy because getting jammed in an ndarray anyway
466-
homogenized = _homogenize(data, index, columns, dtype)
465+
hom_arrays, arr_names = _homogenize(data, index, columns, dtype)
466+
467+
return self._init_arrays(hom_arrays, arr_names, index, columns)
467468

469+
def _init_arrays(self, arrays, arr_names, index, columns, dtype=None):
470+
"""
471+
Segregate Series based on type and coerce into matrices.
472+
Needs to handle a lot of exceptional cases.
473+
"""
468474
# from BlockManager perspective
469475
axes = [columns, index]
470476

471477
# segregates dtypes and forms blocks matching to columns
472-
blocks = form_blocks(homogenized, axes)
478+
blocks = form_blocks(arrays, arr_names, axes)
473479

474480
# consolidate for now
475481
mgr = BlockManager(blocks, axes)
@@ -870,8 +876,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
870876
if isinstance(data, (np.ndarray, DataFrame, dict)):
871877
columns, sdict = _rec_to_dict(data)
872878
else:
873-
sdict, columns = _to_sdict(data, columns,
874-
coerce_float=coerce_float)
879+
arrays, columns = _to_arrays(data, columns,
880+
coerce_float=coerce_float)
881+
sdict = dict(zip(columns, arrays))
875882

876883
if exclude is None:
877884
exclude = set()
@@ -5053,9 +5060,13 @@ def _rec_to_dict(arr):
50535060
return columns, sdict
50545061

50555062

5056-
def _to_sdict(data, columns, coerce_float=False):
5063+
def _to_arrays(data, columns, coerce_float=False):
5064+
"""
5065+
Return list of arrays, columns
5066+
"""
5067+
50575068
if len(data) == 0:
5058-
return {}, columns
5069+
return [], columns if columns is not None else []
50595070
if isinstance(data[0], (list, tuple)):
50605071
return _list_to_sdict(data, columns, coerce_float=coerce_float)
50615072
elif isinstance(data[0], dict):
@@ -5100,14 +5111,15 @@ def _list_of_series_to_sdict(data, columns, coerce_float=False):
51005111
indexer = indexer_cache[id(index)] = index.get_indexer(columns)
51015112
aligned_values.append(com.take_1d(s.values, indexer))
51025113

5114+
# TODO: waste
51035115
values = np.vstack(aligned_values)
51045116

51055117
if values.dtype == np.object_:
51065118
content = list(values.T)
51075119
return _convert_object_array(content, columns,
51085120
coerce_float=coerce_float)
51095121
else:
5110-
return values, columns
5122+
return values.T, columns
51115123

51125124

51135125
def _list_of_dict_to_sdict(data, columns, coerce_float=False):
@@ -5133,9 +5145,10 @@ def _convert_object_array(content, columns, coerce_float=False):
51335145
raise AssertionError('%d columns passed, passed data had %s '
51345146
'columns' % (len(columns), len(content)))
51355147

5136-
sdict = dict((c, lib.maybe_convert_objects(vals, try_float=coerce_float))
5137-
for c, vals in zip(columns, content))
5138-
return sdict, columns
5148+
arrays = [lib.maybe_convert_objects(arr, try_float=coerce_float)
5149+
for arr in content]
5150+
5151+
return arrays, columns
51395152

51405153

51415154
def _get_names_from_index(data):
@@ -5159,13 +5172,14 @@ def _get_names_from_index(data):
51595172
def _homogenize(data, index, columns, dtype=None):
51605173
from pandas.core.series import _sanitize_array
51615174

5162-
homogenized = {}
5163-
51645175
if dtype is not None:
51655176
dtype = np.dtype(dtype)
51665177

51675178
oindex = None
51685179

5180+
homogenized = []
5181+
names = []
5182+
51695183
for k in columns:
51705184
if k not in data:
51715185
# no obvious "empty" int column
@@ -5202,9 +5216,10 @@ def _homogenize(data, index, columns, dtype=None):
52025216
v = _sanitize_array(v, index, dtype=dtype, copy=False,
52035217
raise_cast_failure=False)
52045218

5205-
homogenized[k] = v
5219+
names.append(k)
5220+
homogenized.append(v)
52065221

5207-
return homogenized
5222+
return homogenized, names
52085223

52095224

52105225
def _put_str(s, space):

pandas/core/internals.py

+52 -71
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,7 @@ class Block(object):
1717
"""
1818
__slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim']
1919

20-
def __init__(self, values, items, ref_items, ndim=2,
21-
do_integrity_check=False):
20+
def __init__(self, values, items, ref_items, ndim=2):
2221
if issubclass(values.dtype.type, basestring):
2322
values = np.array(values, dtype=object)
2423

@@ -31,15 +30,6 @@ def __init__(self, values, items, ref_items, ndim=2,
3130
self.items = _ensure_index(items)
3231
self.ref_items = _ensure_index(ref_items)
3332

34-
if do_integrity_check:
35-
self._check_integrity()
36-
37-
def _check_integrity(self):
38-
if len(self.items) < 2:
39-
return
40-
# monotonicity
41-
return (self.ref_locs[1:] > self.ref_locs[:-1]).all()
42-
4333
@property
4434
def ref_locs(self):
4535
if self._ref_locs is None:
@@ -400,13 +390,11 @@ def should_store(self, value):
400390
class DatetimeBlock(Block):
401391
_can_hold_na = True
402392

403-
def __init__(self, values, items, ref_items, ndim=2,
404-
do_integrity_check=False):
393+
def __init__(self, values, items, ref_items, ndim=2):
405394
if values.dtype != _NS_DTYPE:
406395
values = lib.cast_to_nanoseconds(values)
407396

408-
Block.__init__(self, values, items, ref_items, ndim=ndim,
409-
do_integrity_check=do_integrity_check)
397+
Block.__init__(self, values, items, ref_items, ndim=ndim)
410398

411399
def _can_hold_element(self, element):
412400
return com.is_integer(element) or isinstance(element, datetime)
@@ -443,7 +431,7 @@ def get_values(self, dtype):
443431
return self.values
444432

445433

446-
def make_block(values, items, ref_items, do_integrity_check=False):
434+
def make_block(values, items, ref_items):
447435
dtype = values.dtype
448436
vtype = dtype.type
449437

@@ -462,8 +450,7 @@ def make_block(values, items, ref_items, do_integrity_check=False):
462450
else:
463451
klass = ObjectBlock
464452

465-
return klass(values, items, ref_items, ndim=values.ndim,
466-
do_integrity_check=do_integrity_check)
453+
return klass(values, items, ref_items, ndim=values.ndim)
467454

468455
# TODO: flexible with index=None and/or items=None
469456

@@ -548,8 +535,7 @@ def __setstate__(self, state):
548535

549536
blocks = []
550537
for values, items in zip(bvalues, bitems):
551-
blk = make_block(values, items, self.axes[0],
552-
do_integrity_check=True)
538+
blk = make_block(values, items, self.axes[0])
553539
blocks.append(blk)
554540
self.blocks = blocks
555541

@@ -1079,8 +1065,7 @@ def _make_na_block(self, items, ref_items, fill_value=np.nan):
10791065
dtype = com._infer_dtype(fill_value)
10801066
block_values = np.empty(block_shape, dtype=dtype)
10811067
block_values.fill(fill_value)
1082-
na_block = make_block(block_values, items, ref_items,
1083-
do_integrity_check=True)
1068+
na_block = make_block(block_values, items, ref_items)
10841069
return na_block
10851070

10861071
def take(self, indexer, axis=1):
@@ -1236,69 +1221,66 @@ def item_dtypes(self):
12361221
assert(mask.all())
12371222
return result
12381223

1239-
def form_blocks(data, axes):
1224+
def form_blocks(arrays, names, axes):
12401225
# pre-filter out items if we passed it
12411226
items = axes[0]
12421227

1243-
if len(data) < len(items):
1244-
extra_items = items - Index(data.keys())
1228+
if len(arrays) < len(items):
1229+
extra_items = items - Index(names)
12451230
else:
12461231
extra_items = []
12471232

12481233
# put "leftover" items in float bucket, where else?
12491234
# generalize?
1250-
float_dict = {}
1251-
complex_dict = {}
1252-
int_dict = {}
1253-
bool_dict = {}
1254-
object_dict = {}
1255-
datetime_dict = {}
1256-
for k, v in data.iteritems():
1235+
float_items = []
1236+
complex_items = []
1237+
int_items = []
1238+
bool_items = []
1239+
object_items = []
1240+
datetime_items = []
1241+
for k, v in zip(names, arrays):
12571242
if issubclass(v.dtype.type, np.floating):
1258-
float_dict[k] = v
1243+
float_items.append((k, v))
12591244
elif issubclass(v.dtype.type, np.complexfloating):
1260-
complex_dict[k] = v
1245+
complex_items.append((k, v))
12611246
elif issubclass(v.dtype.type, np.datetime64):
12621247
if v.dtype != _NS_DTYPE:
12631248
v = lib.cast_to_nanoseconds(v)
1264-
datetime_dict[k] = v
1249+
1250+
if hasattr(v, 'tz') and v.tz is not None:
1251+
object_items.append((k, v))
1252+
else:
1253+
datetime_items.append((k, v))
12651254
elif issubclass(v.dtype.type, np.integer):
1266-
int_dict[k] = v
1255+
int_items.append((k, v))
12671256
elif v.dtype == np.bool_:
1268-
bool_dict[k] = v
1257+
bool_items.append((k, v))
12691258
else:
1270-
object_dict[k] = v
1259+
object_items.append((k, v))
12711260

12721261
blocks = []
1273-
if len(float_dict):
1274-
float_block = _simple_blockify(float_dict, items, np.float64)
1262+
if len(float_items):
1263+
float_block = _simple_blockify(float_items, items, np.float64)
12751264
blocks.append(float_block)
12761265

1277-
if len(complex_dict):
1278-
complex_block = _simple_blockify(complex_dict, items, np.complex128)
1266+
if len(complex_items):
1267+
complex_block = _simple_blockify(complex_items, items, np.complex128)
12791268
blocks.append(complex_block)
12801269

1281-
if len(int_dict):
1282-
int_block = _simple_blockify(int_dict, items, np.int64)
1270+
if len(int_items):
1271+
int_block = _simple_blockify(int_items, items, np.int64)
12831272
blocks.append(int_block)
12841273

1285-
for k, v in list(datetime_dict.items()):
1286-
# hackeroo
1287-
if hasattr(v, 'tz') and v.tz is not None:
1288-
del datetime_dict[k]
1289-
object_dict[k] = v.asobject
1290-
1291-
if len(datetime_dict):
1292-
datetime_block = _simple_blockify(datetime_dict, items,
1293-
np.dtype('M8[ns]'))
1274+
if len(datetime_items):
1275+
datetime_block = _simple_blockify(datetime_items, items, _NS_DTYPE)
12941276
blocks.append(datetime_block)
12951277

1296-
if len(bool_dict):
1297-
bool_block = _simple_blockify(bool_dict, items, np.bool_)
1278+
if len(bool_items):
1279+
bool_block = _simple_blockify(bool_items, items, np.bool_)
12981280
blocks.append(bool_block)
12991281

1300-
if len(object_dict) > 0:
1301-
object_block = _simple_blockify(object_dict, items, np.object_)
1282+
if len(object_items) > 0:
1283+
object_block = _simple_blockify(object_items, items, np.object_)
13021284
blocks.append(object_block)
13031285

13041286
if len(extra_items):
@@ -1309,22 +1291,21 @@ def form_blocks(data, axes):
13091291

13101292
block_values.fill(nan)
13111293

1312-
na_block = make_block(block_values, extra_items, items,
1313-
do_integrity_check=True)
1294+
na_block = make_block(block_values, extra_items, items)
13141295
blocks.append(na_block)
13151296
blocks = _consolidate(blocks, items)
13161297

13171298
return blocks
13181299

1319-
def _simple_blockify(dct, ref_items, dtype):
1320-
block_items, values = _stack_dict(dct, ref_items, dtype)
1300+
def _simple_blockify(tuples, ref_items, dtype):
1301+
block_items, values = _stack_arrays(tuples, ref_items, dtype)
13211302
# CHECK DTYPE?
13221303
if values.dtype != dtype: # pragma: no cover
13231304
values = values.astype(dtype)
13241305

1325-
return make_block(values, block_items, ref_items, do_integrity_check=True)
1306+
return make_block(values, block_items, ref_items)
13261307

1327-
def _stack_dict(dct, ref_items, dtype):
1308+
def _stack_arrays(tuples, ref_items, dtype):
13281309
from pandas.core.series import Series
13291310

13301311
# fml
@@ -1342,17 +1323,18 @@ def _shape_compat(x):
13421323
else:
13431324
return x.shape
13441325

1326+
names, arrays = zip(*tuples)
1327+
13451328
# index may box values
1346-
items = ref_items[[x in dct for x in ref_items]]
1329+
items = ref_items[ref_items.isin(names)]
13471330

1348-
first = dct[items[0]]
1349-
shape = (len(dct),) + _shape_compat(first)
1331+
first = arrays[0]
1332+
shape = (len(arrays),) + _shape_compat(first)
13501333

13511334
stacked = np.empty(shape, dtype=dtype)
1352-
for i, item in enumerate(items):
1353-
stacked[i] = _asarray_compat(dct[item])
1335+
for i, arr in enumerate(arrays):
1336+
stacked[i] = _asarray_compat(arr)
13541337

1355-
# stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
13561338
return items, stacked
13571339

13581340
def _blocks_to_series_dict(blocks, index=None):
@@ -1419,8 +1401,7 @@ def _merge_blocks(blocks, items):
14191401
return blocks[0]
14201402
new_values = _vstack([b.values for b in blocks])
14211403
new_items = blocks[0].items.append([b.items for b in blocks[1:]])
1422-
new_block = make_block(new_values, new_items, items,
1423-
do_integrity_check=True)
1404+
new_block = make_block(new_values, new_items, items)
14241405
return new_block.reindex_items_from(items)
14251406

14261407
def _union_block_items(blocks):

0 commit comments

Comments
 (0)