
Commit 9da33f0 (1 parent: 91dcafb)

chore: influxdb_client/client/write: fix data_frame_to_list_of_points

Fix the possibility of data corruption by using a much simpler regular expression to fix up the results.
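For context, a minimal sketch (mine, not part of the commit) of how the old regexp cleanup could corrupt data: the substitution patterns from the removed `_replace` helper (see the diff below) were applied anywhere in a generated line, including inside quoted string field values.

```python
import re

# Substitution patterns in the style of the old _replace() helper, which
# were applied to every generated line whenever the DataFrame contained
# any null value.
replacements = [
    (',{2,}', ','),        # collapse runs of commas
    (', ,|, | ,', ' '),    # tidy comma/space sequences left by removed fields
]

# A legitimate line-protocol entry whose *string field value* contains ", ".
line = 'm,tag=t msg="hello, world",b=1i 1586044800000000000'
for pattern, repl in replacements:
    line = re.sub(pattern, repl, line)

print(line)
# m,tag=t msg="hello world",b=1i 1586044800000000000
#                  ^ the stored value has been silently altered
```

The new code instead removes at most one spurious leading comma with a single anchored regexp, so field values are never touched.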

File tree: 4 files changed, +302 −110 lines

.gitignore (−1)

```diff
@@ -114,4 +114,3 @@ sandbox
 # OpenAPI-generator
 /.openapi-generator*
 **/writer.pickle
-/tests/data_frame_file.csv
```

influxdb_client/client/write/dataframe_serializer.py (+142 −60)

```diff
@@ -5,115 +5,197 @@
 """
 
 import re
-from functools import reduce
-from itertools import chain
+import math
 
 from influxdb_client.client.write.point import _ESCAPE_KEY, _ESCAPE_STRING, _ESCAPE_MEASUREMENT
 
 
-def _replace(data_frame):
-    from ...extras import np
-
-    # string columns
-    obj_cols = {k for k, v in dict(data_frame.dtypes).items() if v is np.dtype('O')}
-
-    # number columns
-    other_cols = set(data_frame.columns) - obj_cols
-
-    obj_nans = (f'{k}=nan' for k in obj_cols)
-    other_nans = (f'{k}=nani?' for k in other_cols)
-
-    replacements = [
-        ('|'.join(chain(obj_nans, other_nans)), ''),
-        (',{2,}', ','),
-        ('|'.join([', ,', ', ', ' ,']), ' '),
-    ]
-
-    return replacements
-
-
 def _itertuples(data_frame):
     cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
     return zip(data_frame.index, *cols)
 
 
-def _is_nan(x):
-    return x != x
+def _not_nan(x):
+    return x == x
 
 
 def _any_not_nan(p, indexes):
-    return any(map(lambda inx: not _is_nan(p[inx]), indexes))
+    return any(map(lambda x: _not_nan(p[x]), indexes))
 
 
 def data_frame_to_list_of_points(data_frame, point_settings, **kwargs):
     """Serialize DataFrame into LineProtocols."""
+    # This function is hard to understand but for good reason:
+    # the approach used here is considerably more efficient
+    # than the alternatives.
+    #
+    # We build up a Python expression that efficiently converts a data point
+    # tuple into a line-protocol entry, and then evaluate the expression
+    # as a lambda so that we can call it. This avoids the overhead of
+    # invoking a function on every data value - we only have one function
+    # call per row instead. The expression consists of exactly
+    # one f-string, so we build up the parts of it as segments
+    # that are concatenated together to make the full f-string inside
+    # the lambda.
+    #
+    # Things are made a little more complex because fields and tags with NaN
+    # values and empty tags are omitted from the generated line-protocol
+    # output.
+    #
+    # As an example, say we have a data frame with two value columns:
+    #    a float
+    #    b int
+    #
+    # This will generate a lambda expression to be evaluated that looks like
+    # this:
+    #
+    #    lambda p: f"""{measurement_name} {keys[0]}={p[1]},{keys[1]}={p[2]}i {p[0].value}"""
+    #
+    # This lambda is then executed for each row p.
+    #
+    # When NaNs are present, the expression looks like this:
+    #
+    #    lambda p: f"""{measurement_name} {"" if math.isnan(p[1]) else f"{keys[0]}={p[1]}"},{keys[1]}={p[2]}i {p[0].value}"""
+    #
+    # When there's a NaN value in column a, we'll end up with a comma at the start of the
+    # fields, so we run a regexp substitution after generating the line-protocol entries
+    # to remove this.
+    #
+    # We're careful to run these potentially costly extra steps only when NaN values actually
+    # exist in the data.
+
     from ...extras import pd, np
     if not isinstance(data_frame, pd.DataFrame):
         raise TypeError('Must be DataFrame, but type was: {0}.'
                         .format(type(data_frame)))
 
-    if 'data_frame_measurement_name' not in kwargs:
+    data_frame_measurement_name = kwargs.get('data_frame_measurement_name')
+    if data_frame_measurement_name is None:
         raise TypeError('"data_frame_measurement_name" is a Required Argument')
 
+    data_frame = data_frame.copy(deep=False)
     if isinstance(data_frame.index, pd.PeriodIndex):
         data_frame.index = data_frame.index.to_timestamp()
     else:
+        # TODO: this is almost certainly not what you want
+        # when the index is the default RangeIndex.
+        # Instead, it would probably be better to leave
+        # out the timestamp unless a time column is explicitly
+        # enabled.
         data_frame.index = pd.to_datetime(data_frame.index)
 
     if data_frame.index.tzinfo is None:
         data_frame.index = data_frame.index.tz_localize('UTC')
 
-    measurement_name = str(kwargs.get('data_frame_measurement_name')).translate(_ESCAPE_MEASUREMENT)
     data_frame_tag_columns = kwargs.get('data_frame_tag_columns')
     data_frame_tag_columns = set(data_frame_tag_columns or [])
 
+    # keys holds a list of string keys.
+    keys = []
+    # tags holds a list of tag f-string segments ordered alphabetically by tag key.
     tags = []
+    # fields holds a list of field f-string segments ordered alphabetically by field key.
     fields = []
-    fields_indexes = []
-    keys = []
+    # field_indexes holds the index into each row of all the fields.
+    field_indexes = []
 
     if point_settings.defaultTags:
         for key, value in point_settings.defaultTags.items():
-            data_frame[key] = value
-            data_frame_tag_columns.add(key)
-
-    for index, (key, value) in enumerate(data_frame.dtypes.items()):
+            # Avoid overwriting existing data if there's a column
+            # that already exists with the default tag's name.
+            # Note: when a new column is added, the old DataFrame
+            # that we've made a shallow copy of is unaffected.
+            # TODO: when there are NaN or empty values in
+            # the column, we could make a deep copy of the
+            # data and fill in those values with the default tag value.
+            if key not in data_frame.columns:
+                data_frame[key] = value
+                data_frame_tag_columns.add(key)
+
+    # Get a list of all the columns sorted by field/tag key.
+    # We want to iterate through the columns in sorted order
+    # so that we know when we're on the first field so we
+    # can know whether a comma is needed for that
+    # field.
+    columns = sorted(enumerate(data_frame.dtypes.items()), key=lambda col: col[1][0])
+
+    # null_columns has a bool value for each column holding
+    # whether that column contains any null (NaN or None) values.
+    null_columns = data_frame.isnull().any()
+
+    # Iterate through the columns building up the expression for each column.
+    for index, (key, value) in columns:
         key = str(key)
+        key_format = f'{{keys[{len(keys)}]}}'
         keys.append(key.translate(_ESCAPE_KEY))
-        key_format = f'{{keys[{index}]}}'
+        # The field index is one more than the column index because the
+        # time index is at column zero in the finally zipped-together
+        # result columns.
+        field_index = index + 1
+        val_format = f'p[{field_index}]'
 
-        index_value = index + 1
         if key in data_frame_tag_columns:
-            tags.append({'key': key, 'value': f"{key_format}={{str(p[{index_value}]).translate(_ESCAPE_KEY)}}"})
-        elif issubclass(value.type, np.integer):
-            fields.append(f"{key_format}={{p[{index_value}]}}i")
-            fields_indexes.append(index_value)
-        elif issubclass(value.type, (np.float, np.bool_)):
-            fields.append(f"{key_format}={{p[{index_value}]}}")
-            fields_indexes.append(index_value)
+            # This column is a tag column.
+            if null_columns[index]:
+                key_value = f"""{{
+                        '' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
+                        f',{key_format}={{str({val_format}).translate(_ESCAPE_STRING)}}'
+                    }}"""
+            else:
+                key_value = f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
+            tags.append(key_value)
+            continue
+
+        # This column is a field column.
+        # Note: no comma separator is needed for the first field.
+        # It's important to omit it because when the first
+        # field column has no nulls, we don't run the comma-removal
+        # regexp substitution step.
+        sep = '' if len(field_indexes) == 0 else ','
+        if issubclass(value.type, np.integer):
+            field_value = f"{sep}{key_format}={{{val_format}}}i"
+        elif issubclass(value.type, np.bool_):
+            field_value = f'{sep}{key_format}={{{val_format}}}'
+        elif issubclass(value.type, np.float):
+            if null_columns[index]:
+                field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
+            else:
+                field_value = f'{sep}{key_format}={{{val_format}}}'
         else:
-            fields.append(f"{key_format}=\"{{str(p[{index_value}]).translate(_ESCAPE_STRING)}}\"")
-            fields_indexes.append(index_value)
-
-    tags.sort(key=lambda x: x['key'])
-    tags = ','.join(map(lambda y: y['value'], tags))
-
-    fmt = ('{measurement_name}', f'{"," if tags else ""}', tags,
-           ' ', ','.join(fields), ' {p[0].value}')
-    f = eval("lambda p: f'{}'".format(''.join(fmt)),
-             {'measurement_name': measurement_name, '_ESCAPE_KEY': _ESCAPE_KEY, '_ESCAPE_STRING': _ESCAPE_STRING,
-              'keys': keys})
+            if null_columns[index]:
+                field_value = f"""{{
+                        '' if type({val_format}) == float and math.isnan({val_format}) else
+                        f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
+                    }}"""
+            else:
+                field_value = f'''{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'''
+        field_indexes.append(field_index)
+        fields.append(field_value)
+
+    measurement_name = str(data_frame_measurement_name).translate(_ESCAPE_MEASUREMENT)
+
+    tags = ''.join(tags)
+    fields = ''.join(fields)
+    timestamp = '{p[0].value}'
+
+    f = eval(f'lambda p: f"""{{measurement_name}}{tags} {fields} {timestamp}"""', {
+        'measurement_name': measurement_name,
+        '_ESCAPE_KEY': _ESCAPE_KEY,
+        '_ESCAPE_STRING': _ESCAPE_STRING,
+        'keys': keys,
+        'math': math,
+    })
 
     for k, v in dict(data_frame.dtypes).items():
         if k in data_frame_tag_columns:
             data_frame[k].replace('', np.nan, inplace=True)
 
-    isnull = data_frame.isnull().any(axis=1)
-
-    if isnull.any():
-        rep = _replace(data_frame)
-        lp = (reduce(lambda a, b: re.sub(*b, a), rep, f(p))
-              for p in filter(lambda x: _any_not_nan(x, fields_indexes), _itertuples(data_frame)))
+    first_field_maybe_null = null_columns[field_indexes[0] - 1]
+    if first_field_maybe_null:
+        # When the first field is null (None/NaN), we'll have
+        # a spurious leading comma which needs to be removed.
+        lp = (re.sub('^((\\ |[^ ])* ),', '\\1', f(p))
+              for p in filter(lambda x: _any_not_nan(x, field_indexes), _itertuples(data_frame)))
        return list(lp)
     else:
        return list(map(f, _itertuples(data_frame)))
```
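The comment block in the new code describes the central trick: build one f-string source per DataFrame shape, `eval` it once into a lambda, and call that lambda per row, so the hot loop costs a single function call per row. Here is a standalone, runnable sketch of that technique; the `_TS` Timestamp stand-in is illustrative and not from the commit.

```python
# Build the formatter for the two-column example from the comments above
# (a: float, b: int), then evaluate it once and reuse it for every row.
measurement_name = 'm'
keys = ['a', 'b']

src = 'lambda p: f"""{measurement_name} {keys[0]}={p[1]},{keys[1]}={p[2]}i {p[0].value}"""'
f = eval(src, {'measurement_name': measurement_name, 'keys': keys})


class _TS:
    """Stand-in for a pandas Timestamp, exposing nanoseconds via .value."""
    value = 1586044800000000000


print(f((_TS(), 1.5, 2)))
# m a=1.5,b=2i 1586044800000000000
```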

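And an end-to-end sketch of the new serializer's behaviour, assuming pandas/numpy and a client version containing this commit (`PointSettings` imported from `influxdb_client.client.write_api`). The second row shows a NaN field being dropped and the spurious leading comma removed by the regexp step.

```python
import numpy as np
import pandas as pd

from influxdb_client.client.write_api import PointSettings
from influxdb_client.client.write.dataframe_serializer import data_frame_to_list_of_points

# One float column containing a NaN and one int column.
df = pd.DataFrame(
    {'a': [1.0, np.nan], 'b': [2, 3]},
    index=pd.to_datetime(['2020-04-05', '2020-04-06']),
)

lines = data_frame_to_list_of_points(df, PointSettings(),
                                     data_frame_measurement_name='measurement')
print(lines)
# ['measurement a=1.0,b=2i 1586044800000000000',
#  'measurement b=3i 1586131200000000000']
```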
influxdb_client/client/write/point.py (+25 −3)

```diff
@@ -14,10 +14,32 @@
 from influxdb_client.domain.write_precision import WritePrecision
 
 EPOCH = UTC.localize(datetime.utcfromtimestamp(0))
+
 DEFAULT_WRITE_PRECISION = WritePrecision.NS
-_ESCAPE_MEASUREMENT = str.maketrans({'\\': '\\\\', ',': r'\,', ' ': r'\ ', '\n': '\\n', '\t': '\\t', '\r': '\\r'})
-_ESCAPE_KEY = str.maketrans({'\\': '\\\\', ',': r'\,', ' ': r'\ ', '=': r'\=', '\n': '\\n', '\t': '\\t', '\r': '\\r'})
-_ESCAPE_STRING = str.maketrans({'\"': r"\"", "\\": r"\\"})
+
+_ESCAPE_MEASUREMENT = str.maketrans({
+    '\\': r'\\',  # Note: this is wrong. Backslashes are not escaped like this in measurements.
+    ',': r'\,',
+    ' ': r'\ ',
+    '\n': r'\n',
+    '\t': r'\t',
+    '\r': r'\r',
+})
+
+_ESCAPE_KEY = str.maketrans({
+    '\\': r'\\',  # Note: this is wrong. Backslashes are not escaped like this in keys.
+    ',': r'\,',
+    '=': r'\=',
+    ' ': r'\ ',
+    '\n': r'\n',
+    '\t': r'\t',
+    '\r': r'\r',
+})
+
+_ESCAPE_STRING = str.maketrans({
+    '"': r'\"',
+    '\\': r'\\',
+})
 
 
 class Point(object):
```
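For illustration (example strings are mine, not from the commit), these tables are consumed via `str.translate`:

```python
from influxdb_client.client.write.point import _ESCAPE_KEY, _ESCAPE_STRING

# Keys: spaces, commas and '=' gain a backslash prefix.
print('my key=x'.translate(_ESCAPE_KEY))       # my\ key\=x

# String field values: only '"' and '\' are escaped.
print('say "hi"\\'.translate(_ESCAPE_STRING))  # say \"hi\"\\
```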
