import abc
import hashlib
import logging
import os
from threading import RLock

import numpy as np
import pandas as pd
from bson import Binary

from arctic.serialization.numpy_records import PandasSerializer
from .._compression import compress
from ..exceptions import ArcticSerializationException
from .._util import MAX_DOCUMENT_SIZE, NP_OBJECT_DTYPE

ARCTIC_AUTO_EXPAND_CHUNK_SIZE = bool(os.environ.get('ARCTIC_AUTO_EXPAND_CHUNK_SIZE'))

ABC = abc.ABCMeta('ABC', (object,), {})

log = logging.getLogger(__name__)


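# Builds a rolling SHA-1 across chunks: pass the returned hash object back in as
# curr_sha to fold in the next chunk, and call .digest() on it when done.
# Non-bytes items are hashed via their .tostring() buffer.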
def incremental_checksum(item, curr_sha=None, is_bytes=False):
    curr_sha = hashlib.sha1() if curr_sha is None else curr_sha
    curr_sha.update(item if is_bytes else item.tostring())
    return curr_sha


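# Abstract base class for serializers that lazily turn an input object into a
# stream of chunks; concrete subclasses supply the chunk generators and the
# eager serialize() implementation.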
class LazyIncrementalSerializer(ABC):
    def __init__(self, serializer, input_data, chunk_size):
        if chunk_size < 1:
            raise ArcticSerializationException("LazyIncrementalSerializer can't be initialized "
                                               "with chunk_size < 1 ({})".format(chunk_size))
        if not serializer:
            raise ArcticSerializationException("LazyIncrementalSerializer can't be initialized "
                                               "with a None serializer object")
        self.input_data = input_data
        self.chunk_size = chunk_size
        self._serializer = serializer
        self._initialized = False
        self._checksum = None

    @abc.abstractmethod
    def __len__(self):
        pass

    @abc.abstractproperty
    def generator(self):
        pass

    @abc.abstractproperty
    def generator_bytes(self):
        pass

    @abc.abstractproperty
    def serialize(self):
        pass


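# Lazily serializes a pandas DataFrame/Series into numpy record-array chunks of
# at most chunk_size bytes. Object (string) columns are widened to fixed-length
# string dtypes so that every chunk is serialized with the same record dtype.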
class IncrementalPandasToRecArraySerializer(LazyIncrementalSerializer):
    def __init__(self, serializer, input_data, chunk_size, string_max_len=None):
        super(IncrementalPandasToRecArraySerializer, self).__init__(serializer, input_data, chunk_size)
        if not isinstance(serializer, PandasSerializer):
            raise ArcticSerializationException("IncrementalPandasToRecArraySerializer requires a serializer of "
                                               "type PandasSerializer.")
        if not isinstance(input_data, (pd.DataFrame, pd.Series)):
            raise ArcticSerializationException("IncrementalPandasToRecArraySerializer requires a pandas DataFrame or "
                                               "Series as data source input.")
        if string_max_len and string_max_len < 1:
            raise ArcticSerializationException("IncrementalPandasToRecArraySerializer can't be initialized "
                                               "with string_max_len < 1 ({})".format(string_max_len))
        self.string_max_len = string_max_len
        # The state which needs to be lazily initialized
        self._dtype = None
        self._shape = None
        self._rows_per_chunk = 0
        self._total_chunks = 0
        self._has_string_object = False
        self._lock = RLock()

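    # Illustrative example (hypothetical data, not part of the module): for an
    # object column holding ['a', 'bb', 'cccc'] the longest value has length 4,
    # so _dtype_convert_to_max_len_string widens the serialized field dtype to
    # 'U4' (or 'S4' for byte strings), keeping the record dtype stable across
    # all subsequently serialized chunks.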
    def _dtype_convert_to_max_len_string(self, input_ndtype, fname):
        if input_ndtype.type not in (np.string_, np.unicode_):
            return input_ndtype, False
        type_sym = 'S' if input_ndtype.type == np.string_ else 'U'
        max_str_len = len(max(self.input_data[fname].astype(type_sym), key=len))
        str_field_dtype = np.dtype('{}{:d}'.format(type_sym, max_str_len)) if max_str_len > 0 else input_ndtype
        return str_field_dtype, True

    def _get_dtype(self):
        # The serializer is called only after can_convert_to_records_without_objects() has passed,
        # so the resulting recarray contains only numpy types, strings or unicode, not arbitrary objects.

        # Serialize the first few rows to obtain the row size in bytes (only a small sample is serialized here).
        # This also raises an exception early if the data are not serializable.
        first_chunk, serialized_dtypes = self._serializer.serialize(
            self.input_data[0:10] if len(self) > 0 else self.input_data,
            string_max_len=self.string_max_len)

        # This is the common case, where the sampled rows' dtype represents the whole dataframe's dtype well
        if serialized_dtypes is None or \
                len(self.input_data) == 0 or \
                NP_OBJECT_DTYPE not in self.input_data.dtypes.values:
            return first_chunk, serialized_dtypes, False

        # Reaching here means we have at least one column of type object.
        # To serialize incrementally and correctly, we need to know the final dtype (type and fixed length),
        # using length information from all values of the object columns.

        dtype_arr = []
        has_string_object = False
        for field_name in serialized_dtypes.names:  # include all column names, along with the expanded multi-index
            field_dtype = serialized_dtypes[field_name]
            if field_name not in self.input_data or self.input_data.dtypes[field_name] is NP_OBJECT_DTYPE:
                # Note: .hasobject breaks for timezone-aware datetime64 pandas columns, so compare with dtype('O').
                # If a column is an expanded multi-index level or doesn't contain objects, the sampled dtype is safe.
                field_dtype, with_str_object = self._dtype_convert_to_max_len_string(field_dtype, field_name)
                has_string_object |= with_str_object
            dtype_arr.append((field_name, field_dtype))
        return first_chunk, np.dtype(dtype_arr), has_string_object

    def _lazy_init(self):
        if self._initialized:
            return

        with self._lock:
            if self._initialized:  # double-checked locking: another thread may have initialized us while we waited
                return
            # Get the dtype of the serialized array
            # (takes into account object columns, which are converted to fixed-length strings)
            first_chunk, dtype, has_string_object = self._get_dtype()

            # Compute the number of rows which can fit in a chunk
            rows_per_chunk = 0
            if len(self) > 0 and self.chunk_size > 1:
                rows_per_chunk = IncrementalPandasToRecArraySerializer._calculate_rows_per_chunk(self.chunk_size, first_chunk)

            # Initialize the object's state
            self._dtype = dtype
            shp = list(first_chunk.shape)
            shp[0] = len(self)
            self._shape = tuple(shp)
            self._has_string_object = has_string_object
            self._rows_per_chunk = rows_per_chunk
            self._total_chunks = int(np.ceil(float(len(self)) / self._rows_per_chunk)) if rows_per_chunk > 0 else 0
            self._initialized = True

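    # Worked example (illustrative numbers only): if each serialized row occupies
    # 16 bytes and max_chunk_size is 2 * 1024 * 1024 bytes, then
    # rows_per_chunk == int(2097152 / 16) == 131072 rows per chunk.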
    @staticmethod
    def _calculate_rows_per_chunk(max_chunk_size, chunk):
        sze = int(chunk.dtype.itemsize * np.prod(chunk.shape[1:]))
        sze = sze if sze < max_chunk_size else max_chunk_size
        rows_per_chunk = int(max_chunk_size / sze)
        if rows_per_chunk < 1 and ARCTIC_AUTO_EXPAND_CHUNK_SIZE:
            # If a single row is larger than chunk_size, fall back to the maximum document size
            log.warning('Chunk size of {} is too small to fit a row ({}). '
                        'Using maximum document size.'.format(max_chunk_size, MAX_DOCUMENT_SIZE))
            # For huge rows, fall back to a very large document size, just below the maximum allowed by MongoDB
            rows_per_chunk = int(MAX_DOCUMENT_SIZE / sze)
        if rows_per_chunk < 1:
            raise ArcticSerializationException("Serialization failed to split data into max sized chunks.")
        return rows_per_chunk

    def __len__(self):
        return len(self.input_data)

    @property
    def shape(self):
        self._lazy_init()
        return self._shape

    @property
    def dtype(self):
        self._lazy_init()
        return self._dtype

    @property
    def rows_per_chunk(self):
        self._lazy_init()
        return self._rows_per_chunk

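    # Note: the SHA-1 is computed over the compressed chunk bytes and cached after
    # the first call; subsequent calls return the cached digest regardless of the
    # requested range.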
    def checksum(self, from_idx, to_idx):
        if self._checksum is None:
            self._lazy_init()
            total_sha = None
            # generator_bytes yields (chunk_bytes, dtype, start_row, end_row)
            for chunk_bytes, dtype, _, _ in self.generator_bytes(from_idx=from_idx, to_idx=to_idx):
                # TODO: what about compress_array here in batches?
                compressed_chunk = compress(chunk_bytes)
                total_sha = incremental_checksum(compressed_chunk, curr_sha=total_sha, is_bytes=True)
            self._checksum = Binary(total_sha.digest())
        return self._checksum

    def generator(self, from_idx=None, to_idx=None):
        return self._generator(from_idx=from_idx, to_idx=to_idx)

    def generator_bytes(self, from_idx=None, to_idx=None):
        return self._generator(from_idx=from_idx, to_idx=to_idx, get_bytes=True)

    def _generator(self, from_idx, to_idx, get_bytes=False):
        # Note that the range is half-open: [from_idx, to_idx)
        self._lazy_init()

        my_length = len(self)

        # Take into account default arguments and negative indexing (offset from the end)
        from_idx = 0 if from_idx is None else from_idx
        if from_idx < 0:
            from_idx = my_length + from_idx
        to_idx = my_length if to_idx is None else min(to_idx, my_length)
        if to_idx < 0:
            to_idx = my_length + to_idx

        # No data, finish iteration
        if my_length == 0 or from_idx >= my_length or from_idx >= to_idx:
            return

        # Perform serialization for each chunk
        while from_idx < to_idx:
            curr_stop = min(from_idx + self._rows_per_chunk, to_idx)

            chunk, _ = self._serializer.serialize(
                self.input_data[from_idx: curr_stop],
                string_max_len=self.string_max_len,
                forced_dtype=self.dtype if self._has_string_object else None)

            # Convert to raw bytes if requested; lets the gc collect the intermediate recarray as early as possible
            chunk = chunk.tostring() if chunk is not None and get_bytes else chunk

            yield chunk, self.dtype, from_idx, curr_stop
            from_idx = curr_stop

    def serialize(self):
        return self._serializer.serialize(self.input_data, self.string_max_len)
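

# A minimal usage sketch (illustrative only; it assumes DataFrameSerializer is the
# PandasSerializer subclass provided by arctic.serialization.numpy_records):
#
#     from arctic.serialization.numpy_records import DataFrameSerializer
#
#     df = pd.DataFrame({'a': np.arange(1000), 'b': ['x'] * 1000})
#     incremental = IncrementalPandasToRecArraySerializer(DataFrameSerializer(), df, chunk_size=64 * 1024)
#     for recarray_chunk, dtype, start, end in incremental.generator():
#         ...  # e.g. compress and store rows [start, end) of the chunk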