
Commit 41ee594

feat: Add possibility to specify DataFrame index (#29)
1 parent 9e96c58 commit 41ee594

5 files changed (+156 -620 lines)


influxdb_client/client/flux_csv_parser.py

+16 -8
```diff
@@ -2,6 +2,7 @@
 import codecs
 import csv as csv_parser
 from enum import Enum
+from typing import List
 
 import ciso8601
 from pandas import DataFrame
@@ -28,10 +29,12 @@ class FluxSerializationMode(Enum):
 
 class FluxCsvParser(object):
 
-    def __init__(self, response: HTTPResponse, serialization_mode: FluxSerializationMode) -> None:
+    def __init__(self, response: HTTPResponse, serialization_mode: FluxSerializationMode,
+                 data_frame_index: List[str] = None) -> None:
         self._response = response
         self.tables = []
         self._serialization_mode = serialization_mode
+        self._data_frame_index = data_frame_index
         pass
 
     def __enter__(self):
@@ -74,8 +77,8 @@ def _parse_flux_response(self):
             if "#datatype" == token:
 
                 # Return already parsed DataFrame
-                if (self._serialization_mode is FluxSerializationMode.dataFrame) & hasattr(self, '_dataFrame'):
-                    yield self._dataFrame
+                if (self._serialization_mode is FluxSerializationMode.dataFrame) & hasattr(self, '_data_frame'):
+                    yield self._prepare_data_frame()
 
                 start_new_table = True
                 table = FluxTable()
@@ -101,9 +104,9 @@ def _parse_flux_response(self):
                 start_new_table = False
                 # Create DataFrame with default values
                 if self._serialization_mode is FluxSerializationMode.dataFrame:
-                    self._dataFrame = DataFrame(data=[], columns=[], index=None)
+                    self._data_frame = DataFrame(data=[], columns=[], index=None)
                     for column in table.columns:
-                        self._dataFrame[column.label] = column.default_value
+                        self._data_frame[column.label] = column.default_value
                     pass
                 continue
 
@@ -127,15 +130,20 @@ def _parse_flux_response(self):
                 yield flux_record
 
             if self._serialization_mode is FluxSerializationMode.dataFrame:
-                self._dataFrame.loc[len(self._dataFrame.index)] = flux_record.values
+                self._data_frame.loc[len(self._data_frame.index)] = flux_record.values
                 pass
 
             # debug
             # print(flux_record)
 
         # Return latest DataFrame
-        if (self._serialization_mode is FluxSerializationMode.dataFrame) & hasattr(self, '_dataFrame'):
-            yield self._dataFrame
+        if (self._serialization_mode is FluxSerializationMode.dataFrame) & hasattr(self, '_data_frame'):
+            yield self._prepare_data_frame()
+
+    def _prepare_data_frame(self):
+        if self._data_frame_index:
+            self._data_frame = self._data_frame.set_index(self._data_frame_index)
+        return self._data_frame
 
     def parse_record(self, table_index, table, csv):
         record = FluxRecord(table_index)
```
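The new `_prepare_data_frame` hook does nothing more than apply pandas' `set_index` before the frame is yielded. A minimal standalone sketch of that behavior (the timestamps and values below are made up; only the `set_index` call mirrors the diff):

```python
import pandas as pd

# Toy frame shaped like Flux output; the data here is illustrative only.
df = pd.DataFrame({
    '_time': pd.to_datetime(['2020-01-01T00:00:00Z', '2020-01-01T00:00:05Z']),
    '_value': [12.5, 13.1],
})

data_frame_index = ['_time']  # same shape as the new constructor argument

# Mirrors _prepare_data_frame: a truthy index list re-indexes the frame;
# None (the default) leaves the default RangeIndex untouched.
if data_frame_index:
    df = df.set_index(data_frame_index)

print(df.index.name)      # _time
print(list(df.columns))   # ['_value'] -- the index columns are consumed
```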

influxdb_client/client/query_api.py

+5 -3
```diff
@@ -92,13 +92,14 @@ def query_stream(self, query: str, org=None) -> Generator['FluxRecord', Any, None]:
 
         return _parser.generator()
 
-    def query_data_frame(self, query: str, org=None):
+    def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = None):
         """
         Synchronously executes the Flux query and return Pandas DataFrame.
-        Note that if a query returns more then one table than the client generates a dataframe for each of them.
+        Note that if a query returns more then one table than the client generates a DataFrame for each of them.
 
         :param query: the Flux query
         :param org: organization name (optional if already specified in InfluxDBClient)
+        :param data_frame_index: the list of columns that are used as DataFrame index
         :return:
         """
         if org is None:
@@ -107,7 +108,8 @@ def query_data_frame(self, query: str, org=None):
         response = self._query_api.post_query(org=org, query=self._create_query(query, self.default_dialect),
                                               async_req=False, _preload_content=False, _return_http_data_only=False)
 
-        _parser = FluxCsvParser(response=response, serialization_mode=FluxSerializationMode.dataFrame)
+        _parser = FluxCsvParser(response=response, serialization_mode=FluxSerializationMode.dataFrame,
+                                data_frame_index=data_frame_index)
         _dataFrames = list(_parser.generator())
 
         if len(_dataFrames) == 1:
```
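With the parser change wired through, callers pass the index columns straight to `query_data_frame`. A usage sketch, assuming a local InfluxDB 2.0 instance and the credentials used in the bundled notebook (`http://localhost:9999`, token `my-token`, org `my-org`); the bucket name and query are illustrative:

```python
from influxdb_client import InfluxDBClient

client = InfluxDBClient(url='http://localhost:9999', token='my-token', org='my-org')

flux = 'from(bucket: "my-bucket") |> range(start: -5m)'

# Without data_frame_index the result keeps pandas' default RangeIndex;
# with it, the named columns become the index. As the docstring notes,
# a query returning several tables yields a DataFrame per table.
df = client.query_api().query_data_frame(flux, data_frame_index=['_time'])
```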

notebooks/realtime-stream.ipynb

+29 -31
```diff
@@ -35,6 +35,7 @@
    "outputs": [],
    "source": [
     "from datetime import timedelta\n",
+    "from typing import List\n",
     "\n",
     "import hvplot.streamz\n",
     "import pandas as pd\n",
@@ -52,15 +53,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def source_data(auto_refresh: int, sink: Stream):\n",
+    "def source_data(auto_refresh: int, query: str, sink: Stream):\n",
     "    rx \\\n",
     "        .interval(period=timedelta(seconds=auto_refresh)) \\\n",
     "        .pipe(ops.map(lambda start: f'from(bucket: \"my-bucket\") '\n",
     "                                    f'|> range(start: -{auto_refresh}s, stop: now()) '\n",
-    "                                    f'|> filter(fn: (r) => (r._measurement == \"cpu\") or (r._measurement == \"mem\")) ')) \\\n",
-    "        .pipe(ops.map(lambda query: client.query_api().query_stream(query))) \\\n",
-    "        .pipe(ops.flat_map(lambda records: rx.from_iterable(records))) \\\n",
-    "        .subscribe(observer=lambda record: sink.emit(record), on_error=lambda error: print(error))\n",
+    "                                    f'{query}')) \\\n",
+    "        .pipe(ops.map(lambda query: client.query_api().query_data_frame(query, data_frame_index=['_time']))) \\\n",
+    "        .pipe(ops.map(lambda data_frame: data_frame.drop(columns=['result', 'table']))) \\\n",
+    "        .subscribe(observer=lambda data_frame: sink.emit(data_frame), on_error=lambda error: print(error))\n",
     "    pass"
    ]
   },
@@ -70,10 +71,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "client = InfluxDBClient(url=\"http://localhost:9999\", token=\"my-token\", org=\"my-org\", debug=False)\n",
-    "\n",
-    "sink = Stream()\n",
-    "source_data(auto_refresh=5, sink=sink)"
+    "client = InfluxDBClient(url='http://localhost:9999', token='my-token', org='my-org')"
    ]
   },
   {
@@ -82,12 +80,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "cpu_example = pd.DataFrame({'value': []}, columns=['value'])\n",
+    "cpu_query = '|> filter(fn: (r) => r._measurement == \"cpu\") ' \\\n",
+    "            '|> filter(fn: (r) => r._field == \"usage_user\") ' \\\n",
+    "            '|> filter(fn: (r) => r.cpu == \"cpu-total\") ' \\\n",
+    "            '|> keep(columns: [\"_time\", \"_value\"])'\n",
+    "\n",
+    "\n",
+    "cpu_sink = Stream()\n",
+    "cpu_example = pd.DataFrame({'_value': []}, columns=['_value'])\n",
+    "cpu_df = DataFrame(cpu_sink, example=cpu_example)\n",
     "\n",
-    "cpu_sink = sink\\\n",
-    "    .filter(lambda record: (record[\"_measurement\"] == \"cpu\") & (record[\"_field\"] == \"usage_user\"))\\\n",
-    "    .map(lambda record: pd.DataFrame({'value': [record[\"_value\"]]}, columns=['value'], index=[record[\"_time\"]]))\n",
-    "cpu = DataFrame(cpu_sink, example=cpu_example)"
+    "source_data(auto_refresh=5, sink=cpu_sink, query=cpu_query)"
    ]
   },
   {
@@ -96,14 +99,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mem_example = pd.DataFrame({'field': [], 'value': []}, columns=['field', 'value'])\n",
+    "mem_query = '|> filter(fn: (r) => r._measurement == \"mem\") ' \\\n",
+    "            '|> filter(fn: (r) => r._field == \"available\" or r._field == \"free\" or r._field == \"total\" or r._field == \"used\") ' \\\n",
+    "            '|> map(fn: (r) => ({ r with _value: r._value / 1024 / 1024 }))' \\\n",
+    "            '|> pivot(rowKey:[\"_time\"], columnKey: [\"_field\"], valueColumn: \"_value\")' \\\n",
+    "            '|> keep(columns: [\"_time\", \"used\", \"total\", \"free\", \"available\"])'\n",
     "\n",
-    "mem_sink = sink \\\n",
-    "    .filter(lambda record: record[\"_measurement\"] == \"mem\") \\\n",
-    "    .filter(lambda record: record[\"_field\"] in [\"total\", \"used\", \"free\", \"available\"]) \\\n",
-    "    .map(lambda record: pd.DataFrame({'field': record[\"_field\"], 'value': record[\"_value\"]},\n",
-    "                                     columns=['field', 'value'], index=[record[\"_time\"], record[\"_field\"]]))\n",
-    "mem = DataFrame(mem_sink, example=mem_example)"
+    "mem_sink = Stream()\n",
+    "mem_example = pd.DataFrame({'used': [], 'total': [], 'free': [], 'available': []}, columns=['available', 'free', 'total', 'used'])\n",
+    "mem_df = DataFrame(mem_sink, example=mem_example)\n",
+    "\n",
+    "source_data(auto_refresh=5, sink=mem_sink, query=mem_query)"
    ]
   },
   {
@@ -130,16 +136,8 @@
     "    years=[\"%H:%M:%S\"],\n",
     ")\n",
     "\n",
-    "cpu.hvplot(width=700, backlog=50, title='CPU % usage', xlabel='Time', ylabel='%', xformatter=formatter)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    " mem.groupby('field').sum().hvplot.bar()"
+    "cpu_df.hvplot(width=450, backlog=50, title='CPU % usage', xlabel='Time', ylabel='%', xformatter=formatter) +\\\n",
+    "mem_df.hvplot.line(width=450, backlog=50, title='Memory', xlabel='Time', ylabel='MiB', xformatter=formatter, legend='top_left')"
    ]
   },
   {
```
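The notebook refactor moves the measurement-specific filters out of `source_data`: the function now supplies only the shared `from |> range` prefix and appends whatever per-measurement tail it receives. Stripped of the reactive plumbing, the composition is plain string concatenation; a sketch using the bucket name and refresh interval from the notebook:

```python
auto_refresh = 5  # seconds, as passed to source_data in the notebook

# Per-measurement tail, analogous to cpu_query in the notebook.
cpu_query = ('|> filter(fn: (r) => r._measurement == "cpu") '
             '|> filter(fn: (r) => r._field == "usage_user") '
             '|> keep(columns: ["_time", "_value"])')

# Shared prefix + tail, as built inside the ops.map lambda.
flux = (f'from(bucket: "my-bucket") '
        f'|> range(start: -{auto_refresh}s, stop: now()) '
        f'{cpu_query}')

print(flux)  # the complete Flux query handed to query_data_frame
```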

0 commit comments

Comments
 (0)