19
19
20
20
pp = pprint .PrettyPrinter (indent = 4 )
21
21
22
- # ['b', 'a']
23
- dtype_role = pd .api .types .CategoricalDtype (categories = ConnectionRoles , ordered = True )
24
- # df1['mptcpdest'] = pd.Categorical(np.nan, ordered=False, categories=ConnectionRoles) ;
22
+ # dtype_role = pd.api.types.CategoricalDtype(categories=ConnectionRoles, ordered=True)
23
+ dtype_role = pd .api .types .CategoricalDtype (categories = [ x .name for x in ConnectionRoles ], ordered = True )
25
24
26
25
27
26
# columns we usually display to debug dataframes
@@ -64,6 +63,7 @@ def _convert_flags(x):
64
63
65
64
def _convert_to_list (x , field = "pass a field to debug" ):
66
65
"""
66
+ Loads x of the form "1,2,5" or None
67
67
for instance functools.partial(_convert_to_list, field="reinjectionOf"),
68
68
returns np.nan instead of [] to allow for faster filtering
69
69
"""
@@ -235,10 +235,11 @@ def load_merged_streams_into_pandas(
235
235
# columns=columns,
236
236
index = False ,
237
237
header = True ,
238
-
239
238
sep = tshark_config .delimiter ,
240
239
)
241
240
241
+ print ("MATT=" , dict (merged_df .dtypes ))
242
+
242
243
# print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))
243
244
244
245
@@ -247,8 +248,6 @@ def load_merged_streams_into_pandas(
247
248
csv_fields = tshark_config .get_fields ("name" , "type" )
248
249
# dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}
249
250
def _gen_dtypes (fields ):
250
-
251
-
252
251
dtypes = {} # type: ignore
253
252
for suffix in [ SENDER_SUFFIX , RECEIVER_SUFFIX ]:
254
253
@@ -257,24 +256,40 @@ def _gen_dtypes(fields):
257
256
dtypes .setdefault (suffix_fields (suffix , k ), v )
258
257
259
258
dtypes .update ({
259
+ # during the merge, we join even unmapped packets so some entries
260
+ # may be empty => float64
261
+ _sender ("packetid" ): np .float64 ,
262
+ _receiver ("packetid" ): np .float64 ,
263
+ # there is a bug currently
264
+ # https://github.com/pandas-dev/pandas/pull/20826
260
265
'mptcpdest' : dtype_role ,
261
266
'tcpdest' : dtype_role ,
267
+ # '_merge':
262
268
})
263
269
return dtypes
264
270
271
def _load_list(x, field="set field to debug"):
    """
    Parse a CSV cell holding the repr of a Python literal, e.g. "[1, 2, 5]".

    Contrary to _convert_to_list, which loads values of the form "1,2,5",
    this expects a string that ast.literal_eval can evaluate (typically a
    list previously written out by pandas).

    Args:
        x: the raw cell content, or None.
        field: label identifying the column being converted; it is not used
            by the conversion itself and only exists to ease debugging.

    Returns:
        The evaluated literal, or np.nan when x is None or a blank string.

    Raises:
        ValueError, SyntaxError: propagated from ast.literal_eval when x is
            not a valid Python literal.
    """
    # Local import so the helper does not rely on an `import ast` executed
    # later in an enclosing scope.
    import ast

    if x is None:
        return np.nan
    # Empty (or whitespace-only) cells mean "no value".
    if isinstance(x, str) and not x.strip():
        return np.nan
    return ast.literal_eval(x)
277
+
265
278
with open (cachename ) as fd :
266
279
import ast
267
280
dtypes = _gen_dtypes (csv_fields )
268
- pd .set_option ('display.max_rows' , 200 )
269
- pd .set_option ('display.max_colwidth' , - 1 )
281
+
282
+ # more recent versions can do without it
283
+ # pd.set_option('display.max_rows', 200)
284
+ # pd.set_option('display.max_colwidth', -1)
270
285
print ("dtypes=" , dict (dtypes ))
271
286
merged_df = pd .read_csv (
272
287
fd ,
273
- # skip_blank_lines=True,
288
+ skip_blank_lines = True ,
274
289
# hum not needed with comment='#'
275
290
comment = '#' ,
276
291
# we don't need 'header' when metadata is with comment
277
- header = 0 , # read column names from row 2 (before, it's metadata)
292
+ # header=0, # read column names from row 2 (before, it's metadata)
278
293
# skiprows
279
294
sep = tshark_config .delimiter ,
280
295
# converters={
@@ -288,10 +303,15 @@ def _gen_dtypes(fields):
288
303
converters = {
289
304
_sender ("tcpflags" ): _convert_flags ,
290
305
# reinjections, converts to list of integers
291
- _sender ("reinjection_of" ): ast .literal_eval ,
292
- _sender ("reinjected_in" ): ast .literal_eval ,
293
- _receiver ("reinjection_of" ): ast .literal_eval ,
294
- _receiver ("reinjected_in" ): ast .literal_eval ,
306
+ _sender ("reinjection_of" ): functools .partial (_load_list , field = "reinjectedOfSender" ),
307
+ _sender ("reinjected_in" ): functools .partial (_load_list , field = "reinjectedInSender" ),
308
+ _receiver ("reinjection_of" ): functools .partial (_load_list , field = "reinjectedInReceiver" ),
309
+ _receiver ("reinjected_in" ): functools .partial (_load_list , field = "reinjectedInReceiver" ),
310
+
311
+ # there is a bug in pandas see https://github.com/pandas-dev/pandas/pull/20826
312
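+ # where the categorical dtype is presumably not applied on load (TODO confirm), hence the explicit converter below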
313
+ "mptcpdest" : lambda x : ConnectionRoles [x ] if x else np .nan ,
314
+
295
315
# "mptcp.reinjection_of": functools.partial(_convert_to_list, field="reinjectionOf"),
296
316
# "mptcp.reinjection_listing": functools.partial(_convert_to_list, field="reinjectedIn"),
297
317
# "mptcp.reinjected_in": functools.partial(_convert_to_list, field="reinjectedIn"),
@@ -374,8 +394,6 @@ def load_into_pandas(
374
394
try :
375
395
with open (csv_filename ) as fd :
376
396
377
-
378
- # TODO use packetid as Index
379
397
data = pd .read_csv (
380
398
fd ,
381
399
comment = '#' ,
@@ -388,14 +406,13 @@ def load_into_pandas(
388
406
"tcp.flags" : _convert_flags ,
389
407
# reinjections, converts to list of integers
390
408
"mptcp.reinjection_of" : functools .partial (_convert_to_list , field = "reinjectionOf" ),
391
- # "mptcp.reinjection_listing": functools.partial(_convert_to_list, field="reinjectedIn"),
392
409
"mptcp.reinjected_in" : functools .partial (_convert_to_list , field = "reinjectedIn" ),
393
- # "mptcp.duplicated_dsn": lambda x: list(map(int, x.split(','))) if x is not None else np.nan,
394
410
},
395
411
# nrows=10, # useful for debugging purpose
396
412
)
397
413
data .rename (inplace = True , columns = config .get_fields ("fullname" , "name" ))
398
414
# we want packetid column to survive merges/dataframe transformation so keepit as a column
415
+ # TODO remove ? let other functions do it ?
399
416
data .set_index ("packetid" , drop = False , inplace = True )
400
417
log .debug ("Column names: %s" , data .columns )
401
418
@@ -887,8 +904,6 @@ def map_tcp_packets_via_hash(
887
904
res = pd .merge (
888
905
sender_df , receiver_df ,
889
906
on = "hash" ,
890
- # right_index=True,
891
- # TODO en fait suffit d'inverser les suffixes, h1, h2
892
907
suffixes = (SENDER_SUFFIX , RECEIVER_SUFFIX ), # columns suffixes (sender/receiver)
893
908
how = "outer" , # we want to keep packets from both
894
909
# we want to know how many packets were not mapped correctly, adds the _merge column
0 commit comments