Commit f4be9f4

chore: influxdb_client/client/write: fix data_frame_to_list_of_points
Fix possible data corruption by using a much simpler regular expression to clean up the results.
1 parent 91dcafb commit f4be9f4
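
For context, a minimal sketch (not part of the commit; column names invented) of the corruption the old approach risked: the previous code stripped NaN fields by regex-substituting patterns like 'col=nan' over the finished line, so a string field whose value happened to contain such text was mangled too.

import re

# Old-style clean-up: blindly delete every 'col=nan' match in the rendered line.
line = 'm col1="note: col2=nan here",col2=1.5 1586044800000000000'
pattern = 'col1=nan|col2=nani?'  # per-column patterns, as the old _replace() built them
print(re.sub(pattern, '', line))
# -> 'm col1="note:  here",col2=1.5 1586044800000000000'
#    The quoted string payload has been silently corrupted.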

File tree

4 files changed: +143 −96 lines

.gitignore (−1 line)

@@ -114,4 +114,3 @@ sandbox
 # OpenAPI-generator
 /.openapi-generator*
 **/writer.pickle
-/tests/data_frame_file.csv

influxdb_client/client/write/dataframe_serializer.py (+97 −57)

@@ -5,115 +5,155 @@
 """
 
 import re
-from functools import reduce
-from itertools import chain
+import math
 
 from influxdb_client.client.write.point import _ESCAPE_KEY, _ESCAPE_STRING, _ESCAPE_MEASUREMENT
 
 
-def _replace(data_frame):
-    from ...extras import np
-
-    # string columns
-    obj_cols = {k for k, v in dict(data_frame.dtypes).items() if v is np.dtype('O')}
-
-    # number columns
-    other_cols = set(data_frame.columns) - obj_cols
-
-    obj_nans = (f'{k}=nan' for k in obj_cols)
-    other_nans = (f'{k}=nani?' for k in other_cols)
-
-    replacements = [
-        ('|'.join(chain(obj_nans, other_nans)), ''),
-        (',{2,}', ','),
-        ('|'.join([', ,', ', ', ' ,']), ' '),
-    ]
-
-    return replacements
-
-
 def _itertuples(data_frame):
     cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
     return zip(data_frame.index, *cols)
 
 
-def _is_nan(x):
-    return x != x
+def _not_nan(x):
+    return x == x
 
 
 def _any_not_nan(p, indexes):
-    return any(map(lambda inx: not _is_nan(p[inx]), indexes))
+    return any(map(lambda x: _not_nan(p[x]), indexes))
 
 
 def data_frame_to_list_of_points(data_frame, point_settings, **kwargs):
     """Serialize DataFrame into LineProtocols."""
+    # This function is hard to understand but for good reason:
+    # the approach used here is considerably more efficient
+    # than the alternatives.
+    #
+    # We build up a Python expression that very efficiently converts a data point
+    # tuple into a line-protocol entry, and then evaluate the expression
+    # as a lambda so that we can call it. This avoids the overhead of
+    # invoking a function on every data value - we only have one function
+    # call per row instead.
     from ...extras import pd, np
     if not isinstance(data_frame, pd.DataFrame):
         raise TypeError('Must be DataFrame, but type was: {0}.'
                         .format(type(data_frame)))
 
-    if 'data_frame_measurement_name' not in kwargs:
+    data_frame_measurement_name = kwargs.get('data_frame_measurement_name')
+    if data_frame_measurement_name is None:
         raise TypeError('"data_frame_measurement_name" is a Required Argument')
 
+    data_frame = data_frame.copy(deep=False)
     if isinstance(data_frame.index, pd.PeriodIndex):
         data_frame.index = data_frame.index.to_timestamp()
     else:
+        # TODO: this is almost certainly not what you want
+        # when the index is the default RangeIndex.
+        # Instead, it would probably be better to leave
+        # out the timestamp unless a time column is explicitly
+        # enabled.
        data_frame.index = pd.to_datetime(data_frame.index)
 
     if data_frame.index.tzinfo is None:
         data_frame.index = data_frame.index.tz_localize('UTC')
 
-    measurement_name = str(kwargs.get('data_frame_measurement_name')).translate(_ESCAPE_MEASUREMENT)
     data_frame_tag_columns = kwargs.get('data_frame_tag_columns')
     data_frame_tag_columns = set(data_frame_tag_columns or [])
 
     tags = []
-    fields = []
-    fields_indexes = []
     keys = []
+    fields = []
+    field_indexes = []
 
     if point_settings.defaultTags:
         for key, value in point_settings.defaultTags.items():
+            # TODO: this overrides any values for the column
+            # which is probably not what a "default" tag value
+            # is meant to do. It might be better to add the
+            # column only when it doesn't already exist,
+            # and to fill out any NaN values with the default
+            # value otherwise.
             data_frame[key] = value
             data_frame_tag_columns.add(key)
 
-    for index, (key, value) in enumerate(data_frame.dtypes.items()):
+    # Get a list of all the columns sorted by field/tag key.
+    columns = sorted(enumerate(data_frame.dtypes.items()), key=lambda col: col[1][0])
+
+    null_columns = data_frame.isnull().any()
+    for index, (key, value) in columns:
         key = str(key)
+        key_format = f'{{keys[{len(keys)}]}}'
         keys.append(key.translate(_ESCAPE_KEY))
-        key_format = f'{{keys[{index}]}}'
+        # The field index is one more than the column index because the
+        # time index is at column zero in the finally zipped-together
+        # result columns.
+        field_index = index + 1
+        val_format = f'p[{field_index}]'
 
-        index_value = index + 1
         if key in data_frame_tag_columns:
-            tags.append({'key': key, 'value': f"{key_format}={{str(p[{index_value}]).translate(_ESCAPE_KEY)}}"})
-        elif issubclass(value.type, np.integer):
-            fields.append(f"{key_format}={{p[{index_value}]}}i")
-            fields_indexes.append(index_value)
-        elif issubclass(value.type, (np.float, np.bool_)):
-            fields.append(f"{key_format}={{p[{index_value}]}}")
-            fields_indexes.append(index_value)
+            if null_columns[index]:
+                key_value = f"""{{
+                        '' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
+                        f',{key_format}={{str({val_format}).translate(_ESCAPE_STRING)}}'
+                    }}"""
+            else:
+                key_value = f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
+            tags.append(key_value)
+            continue
+        # Note: no comma separator needed for the first field.
+        # It's important to omit it because when the first
+        # field column has no nulls, we don't run the comma-removal
+        # regexp substitution step.
+        sep = '' if len(field_indexes) == 0 else ','
+        if issubclass(value.type, np.integer):
+            field_value = f"{sep}{key_format}={{{val_format}}}i"
+        elif issubclass(value.type, np.bool_):
+            field_value = f'{sep}{key_format}={{{val_format}}}'
+        elif issubclass(value.type, np.float):
+            if null_columns[index]:
+                field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
+            else:
+                field_value = f'{sep}{key_format}={{{val_format}}}'
         else:
-            fields.append(f"{key_format}=\"{{str(p[{index_value}]).translate(_ESCAPE_STRING)}}\"")
-            fields_indexes.append(index_value)
-
-    tags.sort(key=lambda x: x['key'])
-    tags = ','.join(map(lambda y: y['value'], tags))
-
-    fmt = ('{measurement_name}', f'{"," if tags else ""}', tags,
-           ' ', ','.join(fields), ' {p[0].value}')
-    f = eval("lambda p: f'{}'".format(''.join(fmt)),
-             {'measurement_name': measurement_name, '_ESCAPE_KEY': _ESCAPE_KEY, '_ESCAPE_STRING': _ESCAPE_STRING,
-              'keys': keys})
+            if null_columns[index]:
+                field_value = f"""{{
+                        '' if type({val_format}) == float64 and math.isnan({val_format}) else
+                        f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
+                    }}"""
+            else:
+                field_value = f'''{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'''
+        field_indexes.append(field_index)
+        fields.append(field_value)
+
+    measurement_name = str(data_frame_measurement_name).translate(_ESCAPE_MEASUREMENT)
+
+    tags = ''.join(tags)
+    fields = ''.join(fields)
+    timestamp = '{p[0].value}'
+
+    print(f'measurement_name: {measurement_name}')
+    print(f'keys: {keys}')
+    print(f'tag columns: {data_frame_tag_columns}')
+    print(f'lambda p: f"""{{measurement_name}}{tags} {fields} {timestamp}"""')
+    f = eval(f'lambda p: f"""{{measurement_name}}{tags} {fields} {timestamp}"""', {
+        'measurement_name': measurement_name,
+        '_ESCAPE_KEY': _ESCAPE_KEY,
+        '_ESCAPE_STRING': _ESCAPE_STRING,
+        'keys': keys,
+        'math': math,
+    })
 
     for k, v in dict(data_frame.dtypes).items():
         if k in data_frame_tag_columns:
             data_frame[k].replace('', np.nan, inplace=True)
 
-    isnull = data_frame.isnull().any(axis=1)
-
-    if isnull.any():
-        rep = _replace(data_frame)
-        lp = (reduce(lambda a, b: re.sub(*b, a), rep, f(p))
-              for p in filter(lambda x: _any_not_nan(x, fields_indexes), _itertuples(data_frame)))
+    first_field_maybe_null = null_columns[field_indexes[0] - 1]
+    if first_field_maybe_null:
+        # When the first field is null (None/NaN), we'll have
+        # a spurious leading comma which needs to be removed.
+        lp = (re.sub('^((\\ |[^ ])* ),', '\\1', f(p))
+              for p in filter(lambda x: _any_not_nan(x, field_indexes), _itertuples(data_frame)))
         return list(lp)
     else:
         return list(map(f, _itertuples(data_frame)))
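
How the generated serializer works, as a minimal standalone sketch (mine, not from the commit; the measurement name and three-column layout are invented): the function compiles one f-string lambda per DataFrame via eval(), so formatting costs one function call per row rather than one per value.

# Sketch of the eval-compiled row formatter (hypothetical layout:
# p[0]=timestamp, p[1]=tag value, p[2]=field value).
src = 'lambda p: f"""{measurement_name},tag={p[1]} value={p[2]} {p[0]}"""'
f = eval(src, {'measurement_name': 'm'})
print(f((1586044800000000000, 'a', 1.5)))
# -> 'm,tag=a value=1.5 1586044800000000000'

And the new clean-up step: instead of the old battery of substitutions, a single anchored regex removes the spurious leading comma that appears only when the first field of a row is null. A small demonstration with an invented line:

import re

line = 'm,tag=a ,f2=1.5 1586044800000000000'  # first field was NaN, leaving ' ,'
print(re.sub('^((\\ |[^ ])* ),', '\\1', line))
# -> 'm,tag=a f2=1.5 1586044800000000000'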

influxdb_client/client/write/point.py (+25 −3)

@@ -14,10 +14,32 @@
 from influxdb_client.domain.write_precision import WritePrecision
 
 EPOCH = UTC.localize(datetime.utcfromtimestamp(0))
+
 DEFAULT_WRITE_PRECISION = WritePrecision.NS
-_ESCAPE_MEASUREMENT = str.maketrans({'\\': '\\\\', ',': r'\,', ' ': r'\ ', '\n': '\\n', '\t': '\\t', '\r': '\\r'})
-_ESCAPE_KEY = str.maketrans({'\\': '\\\\', ',': r'\,', ' ': r'\ ', '=': r'\=', '\n': '\\n', '\t': '\\t', '\r': '\\r'})
-_ESCAPE_STRING = str.maketrans({'\"': r"\"", "\\": r"\\"})
+
+_ESCAPE_MEASUREMENT = str.maketrans({
+    '\\': r'\\',  # Note: this is wrong. Backslashes are not escaped like this in measurements.
+    ',': r'\,',
+    ' ': r'\ ',
+    '\n': r'\n',
+    '\t': r'\t',
+    '\r': r'\r',
+})
+
+_ESCAPE_KEY = str.maketrans({
+    '\\': r'\\',  # Note: this is wrong. Backslashes are not escaped like this in keys.
+    ',': r'\,',
+    '=': r'\=',
+    ' ': r'\ ',
+    '\n': r'\n',
+    '\t': r'\t',
+    '\r': r'\r',
+})
+
+_ESCAPE_STRING = str.maketrans({
+    '"': r'\"',
+    '\\': r'\\',
+})
 
 
 class Point(object):
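
A quick usage sketch of the reformatted translation tables (my example; the import matches the one the serializer above already uses): str.translate applies all the escapes in a single pass over the string.

from influxdb_client.client.write.point import (
    _ESCAPE_MEASUREMENT, _ESCAPE_KEY, _ESCAPE_STRING)

print('cpu load,1'.translate(_ESCAPE_MEASUREMENT))  # -> cpu\ load\,1
print('host name'.translate(_ESCAPE_KEY))           # -> host\ name
print('say "hi"'.translate(_ESCAPE_STRING))         # -> say \"hi\"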

tests/test_WriteApiDataFrame.py (+21 −35)

@@ -23,41 +23,27 @@ def tearDown(self) -> None:
         super().tearDown()
         self._write_client.__del__()
 
-    @unittest.skip('Test big file')
-    def test_write_data_frame(self):
-        import random
+    @unittest.skip('Test big data')
+    def test_convert_data_frame(self):
         from influxdb_client.extras import pd
 
-        if not os.path.isfile("data_frame_file.csv"):
-            with open('data_frame_file.csv', mode='w+') as csv_file:
-                _writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
-                _writer.writerow(['time', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8'])
+        num_rows = 1500000
+        col_data = {
+            'time': np.arange(0, num_rows, 1, dtype=int),
+            'col1': np.random.choice(['test_a', 'test_b', 'test_c'], size=(num_rows,)),
+        }
+        for n in range(2, 9):
+            col_data[f'col{n}'] = np.random.rand(num_rows)
 
-            for i in range(1, 1500000):
-                choice = ['test_a', 'test_b', 'test_c']
-                _writer.writerow([i, random.choice(choice), 'test', random.random(), random.random(),
-                                  random.random(), random.random(), random.random(), random.random()])
+        data_frame = pd.DataFrame(data=col_data)
+        print(data_frame)
 
-            csv_file.close()
+        start = time.time()
+        data_frame_to_list_of_points(record, PointSettings(),
+                                     data_frame_measurement_name='h2o_feet',
+                                     data_frame_tag_columns=['location'])
 
-        with open('data_frame_file.csv', mode='rb') as csv_file:
-
-            data_frame = pd.read_csv(csv_file, index_col='time')
-            print(data_frame)
-
-            print('Writing...')
-
-            start = time.time()
-
-            self._write_client.write("my-bucket", "my-org", record=data_frame,
-                                     data_frame_measurement_name='h2o_feet',
-                                     data_frame_tag_columns=['location'])
-
-            self._write_client.__del__()
-
-            print("Time elapsed: ", (time.time() - start))
-
-            csv_file.close()
+        print("Time elapsed: ", (time.time() - start))
 
     def test_write_num_py(self):
         from influxdb_client.extras import pd, np

@@ -110,14 +96,14 @@ def test_write_nan(self):
                                              data_frame_measurement_name='measurement')
 
         self.assertEqual(4, len(points))
-        self.assertEqual("measurement actual_kw_price=3.1955,actual_general_use=20.514305 1586044800000000000",
+        self.assertEqual("measurement actual_general_use=20.514305,actual_kw_price=3.1955 1586044800000000000",
                          points[0])
-        self.assertEqual("measurement actual_kw_price=5.731,actual_general_use=23.32871 1586046600000000000",
+        self.assertEqual("measurement actual_general_use=23.32871,actual_kw_price=5.731 1586046600000000000",
                          points[1])
-        self.assertEqual("measurement forecast_kw_price=3.138664,forecast_general_use=20.755026 1586048400000000000",
+        self.assertEqual("measurement forecast_general_use=20.755026,forecast_kw_price=3.138664 1586048400000000000",
                          points[2])
-        self.assertEqual("measurement actual_kw_price=5.731,forecast_kw_price=5.139563,actual_general_use=23.32871,"
-                         "forecast_general_use=19.79124 1586050200000000000",
+        self.assertEqual("measurement actual_general_use=23.32871,actual_kw_price=5.731,forecast_general_use=19.79124"
+                         ",forecast_kw_price=5.139563 1586050200000000000",
                          points[3])
 
     def test_write_tag_nan(self):
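
The updated expectations reflect the new deterministic ordering: columns are now serialized sorted by key, so fields appear alphabetically regardless of DataFrame column order. A minimal sketch of that behavior (my example, run against the library at this commit; column names invented, and assuming PointSettings is exported at the package top level, as the tests use it):

import pandas as pd
from influxdb_client import PointSettings
from influxdb_client.client.write.dataframe_serializer import data_frame_to_list_of_points

# 'b_col' comes first in the DataFrame but 'a_col' sorts first in the output.
df = pd.DataFrame({'b_col': [1.0], 'a_col': [2.0]},
                  index=[pd.Timestamp('2020-04-05 00:00:00+00:00')])
print(data_frame_to_list_of_points(df, PointSettings(),
                                   data_frame_measurement_name='m'))
# Expected: ['m a_col=2.0,b_col=1.0 1586044800000000000']
# (At this commit the call also emits the leftover debug print lines.)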
