forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpytables.py
4845 lines (3917 loc) · 162 KB
/
pytables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# pylint: disable-msg=E1101,W0613,W0603
"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
import copy
from datetime import date, datetime
from distutils.version import LooseVersion
import itertools
import os
import re
import time
import warnings
import numpy as np
from pandas._libs import algos, lib, writers as libwriters
from pandas._libs.tslibs import timezones
from pandas.compat import PY3, filter, lrange, range, string_types
from pandas.errors import PerformanceWarning
from pandas.core.dtypes.common import (
ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype,
is_datetime64_dtype, is_datetime64tz_dtype, is_list_like,
is_timedelta64_dtype)
from pandas.core.dtypes.missing import array_equivalent
from pandas import (
DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, Panel,
PeriodIndex, Series, SparseDataFrame, SparseSeries, TimedeltaIndex, compat,
concat, isna, to_datetime)
from pandas.core import config
from pandas.core.algorithms import match, unique
from pandas.core.arrays.categorical import (
Categorical, _factorize_from_iterables)
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.base import StringMixin
import pandas.core.common as com
from pandas.core.computation.pytables import Expr, maybe_expression
from pandas.core.config import get_option
from pandas.core.index import ensure_index
from pandas.core.internals import (
BlockManager, _block2d_to_blocknd, _block_shape, _factor_indexer,
make_block)
from pandas.io.common import _stringify_path
from pandas.io.formats.printing import adjoin, pprint_thing
# versioning attribute
_version = '0.15.2'
# encoding
# PY3 encoding if we don't specify
_default_encoding = 'UTF-8'
def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
if isinstance(s, np.bytes_):
s = s.decode('UTF-8')
return s
def _ensure_encoding(encoding):
# set the encoding if we need
if encoding is None:
if PY3:
encoding = _default_encoding
return encoding
def _ensure_str(name):
"""Ensure that an index / column name is a str (python 3) or
unicode (python 2); otherwise they may be np.string dtype.
Non-string dtypes are passed through unchanged.
https://github.com/pandas-dev/pandas/issues/13492
"""
if isinstance(name, compat.string_types):
name = compat.text_type(name)
return name
Term = Expr
def _ensure_term(where, scope_level):
"""
ensure that the where is a Term or a list of Term
this makes sure that we are capturing the scope of variables
that are passed
create the terms here with a frame_level=2 (we are 2 levels down)
"""
# only consider list/tuple here as an ndarray is automatically a coordinate
# list
level = scope_level + 1
if isinstance(where, (list, tuple)):
wlist = []
for w in filter(lambda x: x is not None, where):
if not maybe_expression(w):
wlist.append(w)
else:
wlist.append(Term(w, scope_level=level))
where = wlist
elif maybe_expression(where):
where = Term(where, scope_level=level)
return where
class PossibleDataLossError(Exception):
pass
class ClosedFileError(Exception):
pass
class IncompatibilityWarning(Warning):
pass
incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""
class AttributeConflictWarning(Warning):
pass
attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""
class DuplicateWarning(Warning):
pass
duplicate_doc = """
duplicate entries in table, taking most recently appended
"""
performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""
# formats
_FORMAT_MAP = {
u'f': 'fixed',
u'fixed': 'fixed',
u't': 'table',
u'table': 'table',
}
format_deprecate_doc = """
the table keyword has been deprecated
use the format='fixed(f)|table(t)' keyword instead
fixed(f) : specifies the Fixed format
and is the default for put operations
table(t) : specifies the Table format
and is the default for append operations
"""
# map object types
_TYPE_MAP = {
Series: u'series',
SparseSeries: u'sparse_series',
DataFrame: u'frame',
SparseDataFrame: u'sparse_frame',
Panel: u'wide',
}
# storer class map
_STORER_MAP = {
u'Series': 'LegacySeriesFixed',
u'DataFrame': 'LegacyFrameFixed',
u'DataMatrix': 'LegacyFrameFixed',
u'series': 'SeriesFixed',
u'sparse_series': 'SparseSeriesFixed',
u'frame': 'FrameFixed',
u'sparse_frame': 'SparseFrameFixed',
u'wide': 'PanelFixed',
}
# table class map
_TABLE_MAP = {
u'generic_table': 'GenericTable',
u'appendable_series': 'AppendableSeriesTable',
u'appendable_multiseries': 'AppendableMultiSeriesTable',
u'appendable_frame': 'AppendableFrameTable',
u'appendable_multiframe': 'AppendableMultiFrameTable',
u'appendable_panel': 'AppendablePanelTable',
u'worm': 'WORMTable',
u'legacy_frame': 'LegacyFrameTable',
u'legacy_panel': 'LegacyPanelTable',
}
# axes map
_AXES_MAP = {
DataFrame: [0],
Panel: [1, 2]
}
# register our configuration options
dropna_doc = """
: boolean
drop ALL nan rows when appending to a table
"""
format_doc = """
: format
default format writing format, if None, then
put will default to 'fixed' and append will default to 'table'
"""
with config.config_prefix('io.hdf'):
config.register_option('dropna_table', False, dropna_doc,
validator=config.is_bool)
config.register_option(
'default_format', None, format_doc,
validator=config.is_one_of_factory(['fixed', 'table', None])
)
# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False
def _tables():
global _table_mod
global _table_file_open_policy_is_strict
if _table_mod is None:
import tables
_table_mod = tables
# version requirements
if LooseVersion(tables.__version__) < LooseVersion('3.0.0'):
raise ImportError("PyTables version >= 3.0.0 is required")
# set the file open policy
# return the file open policy; this changes as of pytables 3.1
# depending on the HDF5 version
try:
_table_file_open_policy_is_strict = (
tables.file._FILE_OPEN_POLICY == 'strict')
except AttributeError:
pass
return _table_mod
# interface to/from ###
def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
append=None, **kwargs):
""" store this object, close it if we opened it """
if append:
f = lambda store: store.append(key, value, **kwargs)
else:
f = lambda store: store.put(key, value, **kwargs)
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, string_types):
with HDFStore(path_or_buf, mode=mode, complevel=complevel,
complib=complib) as store:
f(store)
else:
f(path_or_buf)
def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
"""
Read from the store, close it if we opened it.
Retrieve pandas object stored in file, optionally based on where
criteria
Parameters
----------
path_or_buf : string, buffer or path object
Path to the file to open, or an open :class:`pandas.HDFStore` object.
Supports any object implementing the ``__fspath__`` protocol.
This includes :class:`pathlib.Path` and py._path.local.LocalPath
objects.
.. versionadded:: 0.19.0 support for pathlib, py.path.
.. versionadded:: 0.21.0 support for __fspath__ protocol.
key : object, optional
The group identifier in the store. Can be omitted if the HDF file
contains a single pandas object.
mode : {'r', 'r+', 'a'}, optional
Mode to use when opening the file. Ignored if path_or_buf is a
:class:`pandas.HDFStore`. Default is 'r'.
where : list, optional
A list of Term (or convertible) objects.
start : int, optional
Row number to start selection.
stop : int, optional
Row number to stop selection.
columns : list, optional
A list of columns names to return.
iterator : bool, optional
Return an iterator object.
chunksize : int, optional
Number of rows to include in an iteration when using an iterator.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
**kwargs
Additional keyword arguments passed to HDFStore.
Returns
-------
item : object
The selected object. Return type depends on the object stored.
See Also
--------
pandas.DataFrame.to_hdf : write a HDF file from a DataFrame
pandas.HDFStore : low-level access to HDF files
Examples
--------
>>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
>>> df.to_hdf('./store.h5', 'data')
>>> reread = pd.read_hdf('./store.h5')
"""
if mode not in ['r', 'r+', 'a']:
raise ValueError('mode {0} is not allowed while performing a read. '
'Allowed modes are r, r+ and a.'.format(mode))
# grab the scope
if 'where' in kwargs:
kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)
if isinstance(path_or_buf, HDFStore):
if not path_or_buf.is_open:
raise IOError('The HDFStore must be open for reading.')
store = path_or_buf
auto_close = False
else:
path_or_buf = _stringify_path(path_or_buf)
if not isinstance(path_or_buf, string_types):
raise NotImplementedError('Support for generic buffers has not '
'been implemented.')
try:
exists = os.path.exists(path_or_buf)
# if filepath is too long
except (TypeError, ValueError):
exists = False
if not exists:
raise compat.FileNotFoundError(
'File %s does not exist' % path_or_buf)
store = HDFStore(path_or_buf, mode=mode, **kwargs)
# can't auto open/close if we are using an iterator
# so delegate to the iterator
auto_close = True
try:
if key is None:
groups = store.groups()
if len(groups) == 0:
raise ValueError('No dataset in HDF5 file.')
candidate_only_group = groups[0]
# For the HDF file to have only one dataset, all other groups
# should then be metadata groups for that candidate group. (This
# assumes that the groups() method enumerates parent groups
# before their children.)
for group_to_check in groups[1:]:
if not _is_metadata_of(group_to_check, candidate_only_group):
raise ValueError('key must be provided when HDF5 file '
'contains multiple datasets.')
key = candidate_only_group._v_pathname
return store.select(key, auto_close=auto_close, **kwargs)
except (ValueError, TypeError):
# if there is an error, close the store
try:
store.close()
except AttributeError:
pass
raise
def _is_metadata_of(group, parent_group):
"""Check if a given group is a metadata group for a given parent_group."""
if group._v_depth <= parent_group._v_depth:
return False
current = group
while current._v_depth > 1:
parent = current._v_parent
if parent == parent_group and current._v_name == 'meta':
return True
current = current._v_parent
return False
class HDFStore(StringMixin):
"""
dict-like IO interface for storing pandas objects in PyTables
either Fixed or Table format.
Parameters
----------
path : string
File path to HDF5 file
mode : {'a', 'w', 'r', 'r+'}, default 'a'
``'r'``
Read-only; no data can be modified.
``'w'``
Write; a new file is created (an existing file with the same
name would be deleted).
``'a'``
Append; an existing file is opened for reading and writing,
and if the file does not exist it is created.
``'r+'``
It is similar to ``'a'``, but the file must already exist.
complevel : int, 0-9, default None
Specifies a compression level for data.
A value of 0 disables compression.
complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
'blosc:zlib', 'blosc:zstd'}.
Specifying a compression library which is not available issues
a ValueError.
fletcher32 : bool, default False
If applying compression use the fletcher32 checksum
Examples
--------
>>> bar = pd.DataFrame(np.random.randn(10, 4))
>>> store = pd.HDFStore('test.h5')
>>> store['foo'] = bar # write to HDF5
>>> bar = store['foo'] # retrieve
>>> store.close()
"""
def __init__(self, path, mode=None, complevel=None, complib=None,
fletcher32=False, **kwargs):
try:
import tables # noqa
except ImportError as ex: # pragma: no cover
raise ImportError('HDFStore requires PyTables, "{ex}" problem '
'importing'.format(ex=str(ex)))
if complib is not None and complib not in tables.filters.all_complibs:
raise ValueError(
"complib only supports {libs} compression.".format(
libs=tables.filters.all_complibs))
if complib is None and complevel is not None:
complib = tables.filters.default_complib
self._path = _stringify_path(path)
if mode is None:
mode = 'a'
self._mode = mode
self._handle = None
self._complevel = complevel if complevel else 0
self._complib = complib
self._fletcher32 = fletcher32
self._filters = None
self.open(mode=mode, **kwargs)
def __fspath__(self):
return self._path
@property
def root(self):
""" return the root node """
self._check_if_open()
return self._handle.root
@property
def filename(self):
return self._path
def __getitem__(self, key):
return self.get(key)
def __setitem__(self, key, value):
self.put(key, value)
def __delitem__(self, key):
return self.remove(key)
def __getattr__(self, name):
""" allow attribute access to get stores """
try:
return self.get(name)
except (KeyError, ClosedFileError):
pass
raise AttributeError("'%s' object has no attribute '%s'" %
(type(self).__name__, name))
def __contains__(self, key):
""" check for existence of this key
can match the exact pathname or the pathnm w/o the leading '/'
"""
node = self.get_node(key)
if node is not None:
name = node._v_pathname
if name == key or name[1:] == key:
return True
return False
def __len__(self):
return len(self.groups())
def __unicode__(self):
return '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path))
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def keys(self):
"""
Return a (potentially unordered) list of the keys corresponding to the
objects stored in the HDFStore. These are ABSOLUTE path-names (e.g.
have the leading '/'
"""
return [n._v_pathname for n in self.groups()]
def __iter__(self):
return iter(self.keys())
def items(self):
"""
iterate on key->group
"""
for g in self.groups():
yield g._v_pathname, g
iteritems = items
def open(self, mode='a', **kwargs):
"""
Open the file in the specified mode
Parameters
----------
mode : {'a', 'w', 'r', 'r+'}, default 'a'
See HDFStore docstring or tables.open_file for info about modes
"""
tables = _tables()
if self._mode != mode:
# if we are changing a write mode to read, ok
if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
pass
elif mode in ['w']:
# this would truncate, raise here
if self.is_open:
raise PossibleDataLossError(
"Re-opening the file [{0}] with mode [{1}] "
"will delete the current file!"
.format(self._path, self._mode)
)
self._mode = mode
# close and reopen the handle
if self.is_open:
self.close()
if self._complevel and self._complevel > 0:
self._filters = _tables().Filters(self._complevel, self._complib,
fletcher32=self._fletcher32)
try:
self._handle = tables.open_file(self._path, self._mode, **kwargs)
except (IOError) as e: # pragma: no cover
if 'can not be written' in str(e):
print('Opening %s in read-only mode' % self._path)
self._handle = tables.open_file(self._path, 'r', **kwargs)
else:
raise
except (ValueError) as e:
# trap PyTables >= 3.1 FILE_OPEN_POLICY exception
# to provide an updated message
if 'FILE_OPEN_POLICY' in str(e):
e = ValueError(
"PyTables [{version}] no longer supports opening multiple "
"files\n"
"even in read-only mode on this HDF5 version "
"[{hdf_version}]. You can accept this\n"
"and not open the same file multiple times at once,\n"
"upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
"which allows\n"
"files to be opened multiple times at once\n"
.format(version=tables.__version__,
hdf_version=tables.get_hdf5_version()))
raise e
except (Exception) as e:
# trying to read from a non-existent file causes an error which
# is not part of IOError, make it one
if self._mode == 'r' and 'Unable to open/create file' in str(e):
raise IOError(str(e))
raise
def close(self):
"""
Close the PyTables file handle
"""
if self._handle is not None:
self._handle.close()
self._handle = None
@property
def is_open(self):
"""
return a boolean indicating whether the file is open
"""
if self._handle is None:
return False
return bool(self._handle.isopen)
def flush(self, fsync=False):
"""
Force all buffered modifications to be written to disk.
Parameters
----------
fsync : bool (default False)
call ``os.fsync()`` on the file handle to force writing to disk.
Notes
-----
Without ``fsync=True``, flushing may not guarantee that the OS writes
to disk. With fsync, the operation will block until the OS claims the
file has been written; however, other caching layers may still
interfere.
"""
if self._handle is not None:
self._handle.flush()
if fsync:
try:
os.fsync(self._handle.fileno())
except OSError:
pass
def get(self, key):
"""
Retrieve pandas object stored in file
Parameters
----------
key : object
Returns
-------
obj : same type as object stored in file
"""
group = self.get_node(key)
if group is None:
raise KeyError('No object named %s in the file' % key)
return self._read_group(group)
def select(self, key, where=None, start=None, stop=None, columns=None,
iterator=False, chunksize=None, auto_close=False, **kwargs):
"""
Retrieve pandas object stored in file, optionally based on where
criteria
Parameters
----------
key : object
where : list of Term (or convertible) objects, optional
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
columns : a list of columns that if not None, will limit the return
columns
iterator : boolean, return an iterator, default False
chunksize : nrows to include in iteration, return an iterator
auto_close : boolean, should automatically close the store when
finished, default is False
Returns
-------
The selected object
"""
group = self.get_node(key)
if group is None:
raise KeyError('No object named %s in the file' % key)
# create the storer and axes
where = _ensure_term(where, scope_level=1)
s = self._create_storer(group)
s.infer_axes()
# function to call on iteration
def func(_start, _stop, _where):
return s.read(start=_start, stop=_stop,
where=_where,
columns=columns)
# create the iterator
it = TableIterator(self, s, func, where=where, nrows=s.nrows,
start=start, stop=stop, iterator=iterator,
chunksize=chunksize, auto_close=auto_close)
return it.get_result()
def select_as_coordinates(
self, key, where=None, start=None, stop=None, **kwargs):
"""
return the selection as an Index
Parameters
----------
key : object
where : list of Term (or convertible) objects, optional
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
"""
where = _ensure_term(where, scope_level=1)
return self.get_storer(key).read_coordinates(where=where, start=start,
stop=stop, **kwargs)
def select_column(self, key, column, **kwargs):
"""
return a single column from the table. This is generally only useful to
select an indexable
Parameters
----------
key : object
column: the column of interest
Exceptions
----------
raises KeyError if the column is not found (or key is not a valid
store)
raises ValueError if the column can not be extracted individually (it
is part of a data block)
"""
return self.get_storer(key).read_column(column=column, **kwargs)
def select_as_multiple(self, keys, where=None, selector=None, columns=None,
start=None, stop=None, iterator=False,
chunksize=None, auto_close=False, **kwargs):
""" Retrieve pandas objects from multiple tables
Parameters
----------
keys : a list of the tables
selector : the table to apply the where criteria (defaults to keys[0]
if not supplied)
columns : the columns I want back
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
iterator : boolean, return an iterator, default False
chunksize : nrows to include in iteration, return an iterator
Exceptions
----------
raises KeyError if keys or selector is not found or keys is empty
raises TypeError if keys is not a list or tuple
raises ValueError if the tables are not ALL THE SAME DIMENSIONS
"""
# default to single select
where = _ensure_term(where, scope_level=1)
if isinstance(keys, (list, tuple)) and len(keys) == 1:
keys = keys[0]
if isinstance(keys, string_types):
return self.select(key=keys, where=where, columns=columns,
start=start, stop=stop, iterator=iterator,
chunksize=chunksize, **kwargs)
if not isinstance(keys, (list, tuple)):
raise TypeError("keys must be a list/tuple")
if not len(keys):
raise ValueError("keys must have a non-zero length")
if selector is None:
selector = keys[0]
# collect the tables
tbls = [self.get_storer(k) for k in keys]
s = self.get_storer(selector)
# validate rows
nrows = None
for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
if t is None:
raise KeyError("Invalid table [%s]" % k)
if not t.is_table:
raise TypeError(
"object [%s] is not a table, and cannot be used in all "
"select as multiple" % t.pathname
)
if nrows is None:
nrows = t.nrows
elif t.nrows != nrows:
raise ValueError(
"all tables must have exactly the same nrows!")
# axis is the concentation axes
axis = list({t.non_index_axes[0][0] for t in tbls})[0]
def func(_start, _stop, _where):
# retrieve the objs, _where is always passed as a set of
# coordinates here
objs = [t.read(where=_where, columns=columns, start=_start,
stop=_stop, **kwargs) for t in tbls]
# concat and return
return concat(objs, axis=axis,
verify_integrity=False)._consolidate()
# create the iterator
it = TableIterator(self, s, func, where=where, nrows=nrows,
start=start, stop=stop, iterator=iterator,
chunksize=chunksize, auto_close=auto_close)
return it.get_result(coordinates=True)
def put(self, key, value, format=None, append=False, **kwargs):
"""
Store object in HDFStore
Parameters
----------
key : object
value : {Series, DataFrame, Panel}
format : 'fixed(f)|table(t)', default is 'fixed'
fixed(f) : Fixed format
Fast writing/reading. Not-appendable, nor searchable
table(t) : Table format
Write as a PyTables Table structure which may perform
worse but allow more flexible operations like searching
/ selecting subsets of the data
append : boolean, default False
This will force Table format, append the input data to the
existing.
data_columns : list of columns to create as data columns, or True to
use all columns. See
`here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__ # noqa
encoding : default None, provide an encoding for strings
dropna : boolean, default False, do not write an ALL nan row to
the store settable by the option 'io.hdf.dropna_table'
"""
if format is None:
format = get_option("io.hdf.default_format") or 'fixed'
kwargs = self._validate_format(format, kwargs)
self._write_to_group(key, value, append=append, **kwargs)
def remove(self, key, where=None, start=None, stop=None):
"""
Remove pandas object partially by specifying the where condition
Parameters
----------
key : string
Node to remove or delete rows from
where : list of Term (or convertible) objects, optional
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
Returns
-------
number of rows removed (or None if not a Table)
Exceptions
----------
raises KeyError if key is not a valid store
"""
where = _ensure_term(where, scope_level=1)
try:
s = self.get_storer(key)
except KeyError:
# the key is not a valid store, re-raising KeyError
raise
except Exception:
if where is not None:
raise ValueError(
"trying to remove a node with a non-None where clause!")
# we are actually trying to remove a node (with children)
s = self.get_node(key)
if s is not None:
s._f_remove(recursive=True)
return None
# remove the node
if com._all_none(where, start, stop):
s.group._f_remove(recursive=True)
# delete from the table
else:
if not s.is_table:
raise ValueError(
'can only remove with where on objects written as tables')
return s.delete(where=where, start=start, stop=stop)
def append(self, key, value, format=None, append=True, columns=None,
dropna=None, **kwargs):
"""
Append to Table in file. Node must already exist and be Table
format.
Parameters
----------
key : object
value : {Series, DataFrame, Panel}
format: 'table' is the default
table(t) : table format
Write as a PyTables Table structure which may perform
worse but allow more flexible operations like searching
/ selecting subsets of the data
append : boolean, default True, append the input data to the
existing
data_columns : list of columns, or True, default None
List of columns to create as indexed data columns for on-disk
queries, or True to use all columns. By default only the axes
of the object are indexed. See `here
<http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
min_itemsize : dict of columns that specify minimum string sizes
nan_rep : string to use as string nan represenation
chunksize : size to chunk the writing
expectedrows : expected TOTAL row size of this table
encoding : default None, provide an encoding for strings
dropna : boolean, default False, do not write an ALL nan row to
the store settable by the option 'io.hdf.dropna_table'
Notes
-----
Does *not* check if data being appended overlaps with existing
data in the table, so be careful
"""
if columns is not None:
raise TypeError("columns is not a supported keyword in append, "
"try data_columns")
if dropna is None:
dropna = get_option("io.hdf.dropna_table")
if format is None:
format = get_option("io.hdf.default_format") or 'table'
kwargs = self._validate_format(format, kwargs)
self._write_to_group(key, value, append=append, dropna=dropna,
**kwargs)
def append_to_multiple(self, d, value, selector, data_columns=None,
axes=None, dropna=False, **kwargs):
"""
Append to multiple tables
Parameters
----------
d : a dict of table_name to table_columns, None is acceptable as the
values of one node (this will get all the remaining columns)
value : a pandas object
selector : a string that designates the indexable table; all of its
columns will be designed as data_columns, unless data_columns is
passed, in which case these are used
data_columns : list of columns to create as data columns, or True to
use all columns
dropna : if evaluates to True, drop rows from all tables if any single
row in each table has all NaN. Default False.
Notes