Skip to content

Commit 62cff78

Browse files
jreback authored and wesm committed
BUG: create correctly named indexables in HDFStore Tables
Indexable columns were created and named in a legacy format; they are now named like the corresponding indexable in the object, e.g. 'index' for a DataFrame, or 'major_axis'/'minor_axis' for a Panel. This fixes an issue that would arise if we want to support, say, a 'column'-oriented Table (e.g. instead of storing the transpose, store with 'column' indexables) - that orientation is not implemented yet, but the naming scheme now supports it.
1 parent 648d581 commit 62cff78

File tree

3 files changed

+88
-59
lines changed

3 files changed

+88
-59
lines changed

pandas/io/pytables.py

Lines changed: 82 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -855,46 +855,66 @@ def _read_frame_table(self, group, where=None):
855855
return t.read(where)
856856

857857

858-
class Col(object):
859-
""" a column description class
858+
class IndexCol(object):
859+
""" an index column description class
860860
861861
Parameters
862862
----------
863863
864+
axis : axis which I reference
864865
values : the ndarray like converted values
865866
kind : a string description of this type
866867
typ : the pytables type
868+
pos : the position in the pytables
867869
868870
"""
869871
is_indexable = True
870872

871-
def __init__(self, values = None, kind = None, typ = None, cname = None, itemsize = None, name = None, kind_attr = None, **kwargs):
873+
def __init__(self, values = None, kind = None, typ = None, cname = None, itemsize = None, name = None, axis = None, kind_attr = None, pos = None, **kwargs):
872874
self.values = values
873875
self.kind = kind
874876
self.typ = typ
875877
self.itemsize = itemsize
876-
self.name = None
878+
self.name = name
877879
self.cname = cname
878-
self.kind_attr = None
880+
self.kind_attr = kind_attr
881+
self.axis = axis
882+
self.pos = pos
879883
self.table = None
880884

881885
if name is not None:
882886
self.set_name(name, kind_attr)
887+
if pos is not None:
888+
self.set_pos(pos)
883889

884890
def set_name(self, name, kind_attr = None):
891+
""" set the name of this indexer """
885892
self.name = name
886893
self.kind_attr = kind_attr or "%s_kind" % name
887894
if self.cname is None:
888895
self.cname = name
889896

890897
return self
891898

899+
def set_axis(self, axis):
900+
""" set the axis over which I index """
901+
self.axis = axis
902+
903+
return self
904+
905+
def set_pos(self, pos):
906+
""" set the position of this column in the Table """
907+
self.pos = pos
908+
if pos is not None and self.typ is not None:
909+
self.typ._v_pos = pos
910+
return self
911+
892912
def set_table(self, table):
893913
self.table = table
894914
return self
895915

896916
def __repr__(self):
897-
return "name->%s,cname->%s,kind->%s" % (self.name,self.cname,self.kind)
917+
return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % (self.name,self.cname,self.axis,self.pos,self.kind)
898918

899919
__str__ = __repr__
900920

@@ -921,11 +941,6 @@ def attrs(self):
921941
def description(self):
922942
return self.table.description
923943

924-
@property
925-
def pos(self):
926-
""" my column position """
927-
return getattr(self.col,'_v_pos',None)
928-
929944
@property
930945
def col(self):
931946
""" return my current col description """
@@ -948,7 +963,7 @@ def maybe_set_size(self, min_itemsize = None, **kwargs):
948963
min_itemsize = min_itemsize.get(self.name)
949964

950965
if min_itemsize is not None and self.typ.itemsize < min_itemsize:
951-
self.typ = _tables().StringCol(itemsize = min_itemsize, pos = getattr(self.typ,'pos',None))
966+
self.typ = _tables().StringCol(itemsize = min_itemsize, pos = self.pos)
952967

953968
def validate_and_set(self, table, append, **kwargs):
954969
self.set_table(table)
@@ -984,7 +999,7 @@ def set_attr(self):
984999
""" set the kind for this colummn """
9851000
setattr(self.attrs,self.kind_attr,self.kind)
9861001

987-
class DataCol(Col):
1002+
class DataCol(IndexCol):
9881003
""" a data holding column, by definition this is not indexable
9891004
9901005
Parameters
@@ -1072,18 +1087,26 @@ class Table(object):
10721087
parent : my parent HDFStore
10731088
group : the group node where the table resides
10741089
1090+
Attrs in Table Node
1091+
-------------------
1092+
These are attributes that are stored in the main table node, they are necessary
1093+
to recreate these tables when read back in.
1094+
1095+
index_axes: a list of tuples of the (original indexing axis and index column)
1096+
non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis)
1097+
values_axes : a list of the columns which comprise the data of this table
1098+
10751099
"""
10761100
table_type = None
10771101
ndim = None
1078-
axis_names = ['index','column']
10791102

10801103
def __init__(self, parent, group):
10811104
self.parent = parent
10821105
self.group = group
10831106
self.index_axes = []
10841107
self.non_index_axes = []
10851108
self.values_axes = []
1086-
self.selection = None
1109+
self.selection = None
10871110

10881111
@property
10891112
def pandas_type(self):
@@ -1136,22 +1159,17 @@ def attrs(self):
11361159
def description(self):
11371160
return self.table.description
11381161

1139-
@property
1140-
def is_transpose(self):
1141-
""" does my data need transposition """
1142-
return False
1143-
11441162
@property
11451163
def axes(self):
11461164
return itertools.chain(self.index_axes, self.values_axes)
11471165

11481166
def kinds_map(self):
1149-
""" return a diction of columns -> kinds """
1150-
return dict([ (a.cname,a.kind) for a in self.axes ])
1167+
""" return a list of the kinds for each columns """
1168+
return [ (a.cname,a.kind) for a in self.index_axes ]
11511169

11521170
def index_cols(self):
11531171
""" return a list of my index cols """
1154-
return [ i.cname for i in self.index_axes ]
1172+
return [ (i.axis,i.cname) for i in self.index_axes ]
11551173

11561174
def values_cols(self):
11571175
""" return a list of my values cols """
@@ -1184,10 +1202,11 @@ def indexables(self):
11841202
self._indexables = []
11851203

11861204
# index columns
1187-
self._indexables.extend([ Col(name = i) for i in self.attrs.index_cols ])
1205+
self._indexables.extend([ IndexCol(name = name, axis = axis, pos = i) for i, (axis, name) in enumerate(self.attrs.index_cols) ])
11881206

11891207
# data columns
1190-
self._indexables.extend([ DataCol.create_for_block(i = i) for i, c in enumerate(self.attrs.values_cols) ])
1208+
base_pos = len(self._indexables)
1209+
self._indexables.extend([ DataCol.create_for_block(i = i, pos = base_pos + i ) for i, c in enumerate(self.attrs.values_cols) ])
11911210

11921211
return self._indexables
11931212

@@ -1199,7 +1218,7 @@ def create_index(self, columns = None, optlevel = None, kind = None):
11991218
12001219
Parameters
12011220
----------
1202-
columns : None or list_like (the columns to index - currently supports index/column)
1221+
columns : None or list_like (the indexers to index)
12031222
optlevel: optimization level (defaults to 6)
12041223
kind : kind of index (defaults to 'medium')
12051224
@@ -1212,8 +1231,10 @@ def create_index(self, columns = None, optlevel = None, kind = None):
12121231
table = self.table
12131232
if table is None: return
12141233

1234+
self.infer_axes()
1235+
12151236
if columns is None:
1216-
columns = ['index']
1237+
columns = [ self.index_axes[0].name ]
12171238
if not isinstance(columns, (tuple,list)):
12181239
columns = [ columns ]
12191240

@@ -1253,15 +1274,18 @@ def create_axes(self, axes_to_index, obj, validate = True, min_itemsize = None):
12531274
12541275
"""
12551276

1256-
self.index_axes = []
1277+
self.index_axes = []
12571278
self.non_index_axes = []
12581279

12591280
# create axes to index and non_index
12601281
j = 0
12611282
for i, a in enumerate(obj.axes):
1283+
12621284
if i in axes_to_index:
1263-
self.index_axes.append(_convert_index(a).set_name(self.axis_names[j]))
1285+
name = obj._AXIS_NAMES[i]
1286+
self.index_axes.append(_convert_index(a).set_name(name).set_axis(i).set_pos(j))
12641287
j += 1
1288+
12651289
else:
12661290
self.non_index_axes.append((i,list(a)))
12671291

@@ -1289,7 +1313,8 @@ def create_axes(self, axes_to_index, obj, validate = True, min_itemsize = None):
12891313
except (Exception), detail:
12901314
raise Exception("cannot coerce data type -> [dtype->%s]" % b.dtype.name)
12911315

1292-
dc = DataCol.create_for_block(i = i, values = list(b.items), kind = b.dtype.name, typ = atom, data = values)
1316+
dc = DataCol.create_for_block(i = i, values = list(b.items), kind = b.dtype.name, typ = atom, data = values, pos = j)
1317+
j += 1
12931318
self.values_axes.append(dc)
12941319

12951320
def create_description(self, compression = None, complevel = None):
@@ -1352,7 +1377,9 @@ class LegacyTable(Table):
13521377
that can be easily searched
13531378
13541379
"""
1355-
_indexables = [Col(name = 'index'),Col(name = 'column', index_kind = 'columns_kind'), DataCol(name = 'fields', cname = 'values', kind_attr = 'fields') ]
1380+
_indexables = [IndexCol(name = 'index', axis = 0, pos = 0),
1381+
IndexCol(name = 'column', axis = 1, pos = 1, index_kind = 'columns_kind'),
1382+
DataCol( name = 'fields', cname = 'values', kind_attr = 'fields', pos = 2) ]
13561383
table_type = 'legacy'
13571384

13581385
def write(self, **kwargs):
@@ -1482,10 +1509,10 @@ def write(self, axes_to_index, obj, append=False, compression=None,
14821509
a.validate_and_set(table, append)
14831510

14841511
# add the rows
1485-
self._write_data()
1512+
self.write_data()
14861513
self.handle.flush()
14871514

1488-
def _write_data(self):
1515+
def write_data(self):
14891516
""" fast writing of data: requires specific cython routines each axis shape """
14901517

14911518
masks = []
@@ -1632,10 +1659,10 @@ def create_table(parent, group, typ = None, **kwargs):
16321659
def _convert_index(index):
16331660
if isinstance(index, DatetimeIndex):
16341661
converted = index.asi8
1635-
return Col(converted, 'datetime64', _tables().Int64Col())
1662+
return IndexCol(converted, 'datetime64', _tables().Int64Col())
16361663
elif isinstance(index, (Int64Index, PeriodIndex)):
16371664
atom = _tables().Int64Col()
1638-
return Col(index.values, 'integer', atom)
1665+
return IndexCol(index.values, 'integer', atom)
16391666

16401667
if isinstance(index, MultiIndex):
16411668
raise Exception('MultiIndex not supported here!')
@@ -1646,36 +1673,36 @@ def _convert_index(index):
16461673

16471674
if inferred_type == 'datetime64':
16481675
converted = values.view('i8')
1649-
return Col(converted, 'datetime64', _tables().Int64Col())
1676+
return IndexCol(converted, 'datetime64', _tables().Int64Col())
16501677
elif inferred_type == 'datetime':
16511678
converted = np.array([(time.mktime(v.timetuple()) +
16521679
v.microsecond / 1E6) for v in values],
16531680
dtype=np.float64)
1654-
return Col(converted, 'datetime', _tables().Time64Col())
1681+
return IndexCol(converted, 'datetime', _tables().Time64Col())
16551682
elif inferred_type == 'date':
16561683
converted = np.array([time.mktime(v.timetuple()) for v in values],
16571684
dtype=np.int32)
1658-
return Col(converted, 'date', _tables().Time32Col())
1685+
return IndexCol(converted, 'date', _tables().Time32Col())
16591686
elif inferred_type == 'string':
16601687
# atom = _tables().ObjectAtom()
16611688
# return np.asarray(values, dtype='O'), 'object', atom
16621689

16631690
converted = np.array(list(values), dtype=np.str_)
16641691
itemsize = converted.dtype.itemsize
1665-
return Col(converted, 'string', _tables().StringCol(itemsize), itemsize = itemsize)
1692+
return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize = itemsize)
16661693
elif inferred_type == 'unicode':
16671694
atom = _tables().ObjectAtom()
1668-
return Col(np.asarray(values, dtype='O'), 'object', atom)
1695+
return IndexCol(np.asarray(values, dtype='O'), 'object', atom)
16691696
elif inferred_type == 'integer':
16701697
# take a guess for now, hope the values fit
16711698
atom = _tables().Int64Col()
1672-
return Col(np.asarray(values, dtype=np.int64), 'integer', atom)
1699+
return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom)
16731700
elif inferred_type == 'floating':
16741701
atom = _tables().Float64Col()
1675-
return Col(np.asarray(values, dtype=np.float64), 'float', atom)
1702+
return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom)
16761703
else: # pragma: no cover
16771704
atom = _tables().ObjectAtom()
1678-
return Col(np.asarray(values, dtype='O'), 'object', atom)
1705+
return IndexCol(np.asarray(values, dtype='O'), 'object', atom)
16791706

16801707

16811708
def _read_array(group, key):
@@ -1812,13 +1839,16 @@ class Term(object):
18121839
_ops = ['<=','<','>=','>','!=','=']
18131840
_search = re.compile("^(?P<field>\w+)(?P<op>%s)(?P<value>.+)$" % '|'.join(_ops))
18141841
_index = ['index','major_axis','major']
1815-
_column = ['column','minor_axis','minor']
1842+
_column = ['column','columns','minor_axis','minor']
18161843

18171844
def __init__(self, field, op = None, value = None, kinds = None):
18181845
self.field = None
18191846
self.op = None
18201847
self.value = None
1821-
self.kinds = kinds or dict()
1848+
1849+
if kinds is None:
1850+
kinds = []
1851+
self.kinds = dict(kinds)
18221852
self.filter = None
18231853
self.condition = None
18241854

@@ -1871,13 +1901,11 @@ def __init__(self, field, op = None, value = None, kinds = None):
18711901
if self.field is None or self.op is None or self.value is None:
18721902
raise Exception("Could not create this term [%s]" % str(self))
18731903

1874-
# valid field name
1875-
if self.field in self._index:
1876-
self.field = 'index'
1877-
elif self.field in self._column:
1878-
self.field = 'column'
1879-
else:
1880-
raise Exception("field is not a valid index/column for this term [%s]" % str(self))
1904+
# map alias for field names
1905+
if self.field in self._index and len(kinds) > 0:
1906+
self.field = kinds[0][0]
1907+
elif self.field in self._column and len(kinds) > 1:
1908+
self.field = kinds[1][0]
18811909

18821910
# we have valid conditions
18831911
if self.op in ['>','>=','<','<=']:
@@ -1935,7 +1963,8 @@ def eval(self):
19351963

19361964
def convert_value(self, v):
19371965

1938-
if self.field == 'index':
1966+
#### a little hacky here, need to really figure out what we should convert ####x
1967+
if self.field == 'index' or self.field == 'major_axis':
19391968
if self.kind == 'datetime64' :
19401969
return [lib.Timestamp(v).value, None]
19411970
elif isinstance(v, datetime):

pandas/io/tests/test_pytables.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,14 +207,14 @@ def test_append_with_strings(self):
207207
tm.assert_panel_equal(self.store['s1'], expected)
208208

209209
# test dict format
210-
self.store.append('s2', wp, min_itemsize = { 'column' : 20 })
210+
self.store.append('s2', wp, min_itemsize = { 'minor_axis' : 20 })
211211
self.store.append('s2', wp2)
212212
expected = concat([ wp, wp2], axis = 2)
213213
expected = expected.reindex(minor_axis = sorted(expected.minor_axis))
214214
tm.assert_panel_equal(self.store['s2'], expected)
215215

216216
# apply the wrong field (similar to #1)
217-
self.store.append('s3', wp, min_itemsize = { 'index' : 20 })
217+
self.store.append('s3', wp, min_itemsize = { 'major_axis' : 20 })
218218
self.assertRaises(Exception, self.store.append, 's3')
219219

220220
# test truncation of bigger strings
@@ -226,8 +226,8 @@ def test_create_table_index(self):
226226
self.store.append('p5', wp)
227227
self.store.create_table_index('p5')
228228

229-
assert(self.store.handle.root.p5.table.cols.index.is_indexed == True)
230-
assert(self.store.handle.root.p5.table.cols.column.is_indexed == False)
229+
assert(self.store.handle.root.p5.table.cols.major_axis.is_indexed == True)
230+
assert(self.store.handle.root.p5.table.cols.minor_axis.is_indexed == False)
231231

232232
df = tm.makeTimeDataFrame()
233233
self.store.append('f', df[:10])

pandas/lib.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -807,11 +807,11 @@ def create_hdf_rows_3d(ndarray index, ndarray columns,
807807

808808
tup = PyTuple_New(tup_size)
809809

810-
val = columns[c]
810+
val = index[i]
811811
PyTuple_SET_ITEM(tup, 0, val)
812812
Py_INCREF(val)
813813

814-
val = index[i]
814+
val = columns[c]
815815
PyTuple_SET_ITEM(tup, 1, val)
816816
Py_INCREF(val)
817817

0 commit comments

Comments
 (0)