 """

 import re
-from functools import reduce
-from itertools import chain
+import math

 from influxdb_client.client.write.point import _ESCAPE_KEY, _ESCAPE_STRING, _ESCAPE_MEASUREMENT


-def _replace(data_frame):
-    from ...extras import np
-
-    # string columns
-    obj_cols = {k for k, v in dict(data_frame.dtypes).items() if v is np.dtype('O')}
-
-    # number columns
-    other_cols = set(data_frame.columns) - obj_cols
-
-    obj_nans = (f'{k}=nan' for k in obj_cols)
-    other_nans = (f'{k}=nani?' for k in other_cols)
-
-    replacements = [
-        ('|'.join(chain(obj_nans, other_nans)), ''),
-        (',{2,}', ','),
-        ('|'.join([', ,', ', ', ' ,']), ' '),
-    ]
-
-    return replacements
-
-
 def _itertuples(data_frame):
     cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
     return zip(data_frame.index, *cols)


-def _is_nan(x):
-    return x != x
+def _not_nan(x):
+    return x == x
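
Aside: _not_nan relies on NaN being the only common value that compares
unequal to itself, so self-equality doubles as a cheap NaN test. A minimal
sketch:

    nan = float('nan')
    nan == nan        # False: NaN never equals itself
    _not_nan(nan)     # False
    _not_nan(1.0)     # True: ordinary values equal themselves
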


 def _any_not_nan(p, indexes):
-    return any(map(lambda inx: not _is_nan(p[inx]), indexes))
+    return any(map(lambda x: _not_nan(p[x]), indexes))


 def data_frame_to_list_of_points(data_frame, point_settings, **kwargs):
     """Serialize DataFrame into LineProtocols."""
+    # This function is hard to understand, but for good reason:
+    # the approach used here is considerably more efficient
+    # than the alternatives.
+    #
+    # We build up a Python expression that very efficiently converts a data
+    # point tuple into a line-protocol entry, and then evaluate the expression
+    # as a lambda so that we can call it. This avoids the overhead of
+    # invoking a function on every data value - we make only one function
+    # call per row instead.
+
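
To make that concrete, here is a standalone sketch (hypothetical measurement
and columns, not part of this change) of the compile-once technique: build
the formatter source as a string, eval it into a lambda, then pay only one
function call per row:

    source = 'lambda p: f"weather,city={p[1]} temp={p[2]} {p[0]}"'
    fmt = eval(source)
    rows = [(1609459200, 'prague', 22.5), (1609459260, 'brno', 21.0)]
    lines = [fmt(p) for p in rows]
    # lines == ['weather,city=prague temp=22.5 1609459200',
    #           'weather,city=brno temp=21.0 1609459260']
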
     from ...extras import pd, np
     if not isinstance(data_frame, pd.DataFrame):
         raise TypeError('Must be DataFrame, but type was: {0}.'
                         .format(type(data_frame)))

-    if 'data_frame_measurement_name' not in kwargs:
+    data_frame_measurement_name = kwargs.get('data_frame_measurement_name')
+    if data_frame_measurement_name is None:
         raise TypeError('"data_frame_measurement_name" is a Required Argument')

+    data_frame = data_frame.copy(deep=False)
     if isinstance(data_frame.index, pd.PeriodIndex):
         data_frame.index = data_frame.index.to_timestamp()
     else:
+        # TODO: this is almost certainly not what you want
+        # when the index is the default RangeIndex.
+        # Instead, it would probably be better to leave
+        # out the timestamp unless a time column is explicitly
+        # enabled.
         data_frame.index = pd.to_datetime(data_frame.index)

     if data_frame.index.tzinfo is None:
         data_frame.index = data_frame.index.tz_localize('UTC')

-    measurement_name = str(kwargs.get('data_frame_measurement_name')).translate(_ESCAPE_MEASUREMENT)
     data_frame_tag_columns = kwargs.get('data_frame_tag_columns')
     data_frame_tag_columns = set(data_frame_tag_columns or [])

     tags = []
-    fields = []
-    fields_indexes = []
     keys = []
+    fields = []
+    field_indexes = []

     if point_settings.defaultTags:
         for key, value in point_settings.defaultTags.items():
+            # TODO: this overrides any existing values in the column,
+            # which is probably not what a "default" tag value
+            # is meant to do. It might be better to add the
+            # column only when it doesn't already exist,
+            # and to fill in any NaN values with the default
+            # value otherwise.
             data_frame[key] = value
             data_frame_tag_columns.add(key)

-    for index, (key, value) in enumerate(data_frame.dtypes.items()):
+    # Get a list of all the columns sorted by field/tag key.
+    columns = sorted(enumerate(data_frame.dtypes.items()), key=lambda col: col[1][0])
+
+    null_columns = data_frame.isnull().any()
+    for index, (key, value) in columns:
         key = str(key)
+        key_format = f'{{keys[{len(keys)}]}}'
         keys.append(key.translate(_ESCAPE_KEY))
-        key_format = f'{{keys[{index}]}}'
+        # The field index is one more than the column index because the
+        # time index is at column zero in the zipped-together row tuples
+        # produced by _itertuples.
+        field_index = index + 1
+        val_format = f'p[{field_index}]'

-        index_value = index + 1
         if key in data_frame_tag_columns:
-            tags.append({'key': key, 'value': f"{key_format}={{str(p[{index_value}]).translate(_ESCAPE_KEY)}}"})
-        elif issubclass(value.type, np.integer):
-            fields.append(f"{key_format}={{p[{index_value}]}}i")
-            fields_indexes.append(index_value)
-        elif issubclass(value.type, (np.float, np.bool_)):
-            fields.append(f"{key_format}={{p[{index_value}]}}")
-            fields_indexes.append(index_value)
+            if null_columns[index]:
+                key_value = f"""{{
+                        '' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
+                        f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
+                    }}"""
+            else:
+                key_value = f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
+            tags.append(key_value)
+            continue
+        # Note: no comma separator needed for the first field.
+        # It's important to omit it because when the first
+        # field column has no nulls, we don't run the comma-removal
+        # regexp substitution step.
+        sep = '' if len(field_indexes) == 0 else ','
+        if issubclass(value.type, np.integer):
+            field_value = f"{sep}{key_format}={{{val_format}}}i"
+        elif issubclass(value.type, np.bool_):
+            field_value = f'{sep}{key_format}={{{val_format}}}'
+        elif issubclass(value.type, np.floating):
+            if null_columns[index]:
+                field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
+            else:
+                field_value = f'{sep}{key_format}={{{val_format}}}'
         else:
-            fields.append(f"{key_format}=\"{{str(p[{index_value}]).translate(_ESCAPE_STRING)}}\"")
-            fields_indexes.append(index_value)
-
-    tags.sort(key=lambda x: x['key'])
-    tags = ','.join(map(lambda y: y['value'], tags))
-
-    fmt = ('{measurement_name}', f'{"," if tags else ""}', tags,
-           ' ', ','.join(fields), ' {p[0].value}')
-    f = eval("lambda p: f'{}'".format(''.join(fmt)),
-             {'measurement_name': measurement_name, '_ESCAPE_KEY': _ESCAPE_KEY, '_ESCAPE_STRING': _ESCAPE_STRING,
-              'keys': keys})
+            if null_columns[index]:
+                field_value = f"""{{
+                        '' if type({val_format}) == float and math.isnan({val_format}) else
+                        f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
+                    }}"""
+            else:
+                field_value = f'''{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'''
+        field_indexes.append(field_index)
+        fields.append(field_value)
+
+    measurement_name = str(data_frame_measurement_name).translate(_ESCAPE_MEASUREMENT)
+
+    tags = ''.join(tags)
+    fields = ''.join(fields)
+    timestamp = '{p[0].value}'
+
+    f = eval(f'lambda p: f"""{{measurement_name}}{tags} {fields} {timestamp}"""', {
+        'measurement_name': measurement_name,
+        '_ESCAPE_KEY': _ESCAPE_KEY,
+        '_ESCAPE_STRING': _ESCAPE_STRING,
+        'keys': keys,
+        'math': math,
+    })
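
For illustration (a hypothetical frame, assuming a tag column 'city' and an
integer field column 'clicks', neither containing nulls), the eval above
would compile roughly this function:

    f = lambda p: f"""{measurement_name},{keys[0]}={str(p[1]).translate(_ESCAPE_KEY)} {keys[1]}={p[2]}i {p[0].value}"""
    # p is a row tuple (timestamp, city_value, clicks_value) from _itertuples;
    # measurement_name, keys and _ESCAPE_KEY come from the globals dict above.
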

     for k, v in dict(data_frame.dtypes).items():
         if k in data_frame_tag_columns:
             data_frame[k].replace('', np.nan, inplace=True)

-    isnull = data_frame.isnull().any(axis=1)
-
-    if isnull.any():
-        rep = _replace(data_frame)
-        lp = (reduce(lambda a, b: re.sub(*b, a), rep, f(p))
-              for p in filter(lambda x: _any_not_nan(x, fields_indexes), _itertuples(data_frame)))
+    first_field_maybe_null = null_columns[field_indexes[0] - 1]
+    if first_field_maybe_null:
+        # When the first field is null (None/NaN), we'll have
+        # a spurious leading comma which needs to be removed.
+        lp = (re.sub('^((\\ |[^ ])* ),', '\\1', f(p))
+              for p in filter(lambda x: _any_not_nan(x, field_indexes), _itertuples(data_frame)))
         return list(lp)
     else:
         return list(map(f, _itertuples(data_frame)))
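
As a worked example of that substitution (hypothetical values): when the
first field of a row is NaN it renders as the empty string, leaving a
spurious comma after the tag section, which the regexp strips:

    import re
    line = 'm,city=prague ,f2=1.5 1609459200'
    fixed = re.sub('^((\\ |[^ ])* ),', '\\1', line)
    # fixed == 'm,city=prague f2=1.5 1609459200'

The pattern is anchored at the start of the line and keeps everything up to
and including the space that ends the tag section, dropping only the comma
that follows it.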