Commit 7e0ba04

Hacked a fix to work around a pandas limitation

The limitation involves to_csv/read_csv and pd.api.types.CategoricalDtype: with pd.api.types.CategoricalDtype(categories=ConnectionRoles, ordered=True), for instance, to_csv writes the column correctly but read_csv then fails because it expects a string (pandas-dev/pandas#20826). I ended up writing my own converter.
1 parent a3ae624 commit 7e0ba04
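To illustrate the limitation described above, here is a minimal, self-contained sketch of the CSV round trip and of the converter-based workaround this commit adopts. The ConnectionRoles enum below is only a stand-in for mptcpanalyzer's own enum; the pandas calls are the part meant to mirror the diff.

import enum
import io

import numpy as np
import pandas as pd


class ConnectionRoles(enum.Enum):
    # stand-in for mptcpanalyzer's ConnectionRoles
    Client = 0
    Server = 1


# Workaround: build the categorical over the member *names* (plain strings)
# rather than over the enum members themselves...
dtype_role = pd.api.types.CategoricalDtype(
    categories=[x.name for x in ConnectionRoles], ordered=True)

df = pd.DataFrame({"mptcpdest": [role.name for role in ConnectionRoles]})
df["mptcpdest"] = df["mptcpdest"].astype(dtype_role)

buf = io.StringIO()
df.to_csv(buf, index=False)   # writes the names, e.g. "Client"
buf.seek(0)

# ...and map the names back onto the enum with a converter on read,
# instead of handing the categorical dtype straight to read_csv.
restored = pd.read_csv(
    buf,
    converters={"mptcpdest": lambda x: ConnectionRoles[x] if x else np.nan})
print(restored["mptcpdest"].tolist())   # [ConnectionRoles.Client, ConnectionRoles.Server]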

2 files changed: +62, -43 lines


mptcpanalyzer/cli.py (+27, -23)
@@ -580,9 +580,17 @@ def do_qualify_reinjections(self, line):
# keep only those that matched both for now

df_all["redundant"] = False
+
df = df_all[ df_all._merge == "both" ]
+print("MATT %d df packets" % len(df))
+

-df.to_excel("temp.xls")
+print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
+_sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
+_receiver(["packetid", "reltime"])
+])
+# to help debug
+# df.to_excel("temp.xls")

def _print_reinjection_comparison(original_packet, reinj):
"""
@@ -607,18 +615,9 @@ def _print_reinjection_comparison(original_packet, reinj):
if getattr(row, _receiver("abstime")) > original_packet[ _receiver("abstime") ]:
print("BUG: this is not a valid reinjection after all ?")

-print("debugging ")
+# print("debugging ")
print("dataframe size = %d" % len(df))

-# print(df.columns)
-# print(df[['owd']].head())
-# print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))
-# print(df[mpdata.MPTCP_DEBUG_FIELDS].head(20))
-
-# TODO for debug
-# todo we need to add
-# res['mptcpdest'] = dest.name
-
# TODO keep only the ones with "merge_" : "both" ?

# reinjections = df[['tcpstream', "reinjection_of"]].dropna(axis=0, )
@@ -638,12 +637,16 @@ def _print_reinjection_comparison(original_packet, reinj):
for destination in ConnectionRoles:
self.poutput("looking for reinjections towards mptcp %s" % destination)

+print(df["mptcpdest"])
sender_df = df[df.mptcpdest == destination]

# print(sender_df[ sender_df.reinjected_in.notna() ][["packetid", "reinjected_in"]])
# print("successful reinjections" % len(reinjected_in))

# select only packets that have been reinjected
+
+print("%d sender_df packets" % len(sender_df))
+print(sender_df["reinjection_of"])
reinjected_packets = sender_df.dropna(axis='index', subset=[ _sender("reinjection_of") ])

print("%d reinjected packets" % len(reinjected_packets))
@@ -653,24 +656,24 @@ def _print_reinjection_comparison(original_packet, reinj):
print(reinjected_packets[["packetid", "packetid_receiver", *_receiver(["reinjected_in", "reinjection_of"])]].head())


-for row in reinjected_packets.itertuples():
+for reinjection in reinjected_packets.itertuples():
# here we look at all the reinjected packets

-# print("full row %r" % (row,))
+# print("full reinjection %r" % (reinjection,))

# if there are packets in _receiver(reinjected_in), it means the reinjections
# arrived before other similar segments and thus these segments are useless
# it should work because
-# useless_reinjections = getattr(row, _receiver("reinjected_in"), [])
+# useless_reinjections = getattr(reinjection, _receiver("reinjected_in"), [])

# if it was correctly mapped
-# row._merge doesn't exist ?
-if row._1 != "both":
+# reinjection._merge doesn't exist ?
+if reinjection._1 != "both":
# TODO count missed classifications ?
-log.debug("reinjection %d could not be mapped, giving up..." % (row.packetid))
+log.debug("reinjection %d could not be mapped, giving up..." % (reinjection.packetid))
continue

-initial_packetid = row.reinjection_of[0]
+initial_packetid = reinjection.reinjection_of[0]
# print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))

#
@@ -683,7 +686,7 @@ def _print_reinjection_comparison(original_packet, reinj):


orig_arrival = getattr(original_packet, _receiver("reltime"))
-reinj_arrival = getattr(row, _receiver("reltime"))
+reinj_arrival = getattr(reinjection, _receiver("reltime"))


# print("useless_reinjections listing %r" % (useless_reinjections,))
@@ -694,7 +697,8 @@ def _print_reinjection_comparison(original_packet, reinj):

if orig_arrival < reinj_arrival:
print("GOT A MATCH")
-sender_df.loc[ sender_df[ _sender("packetid")] == row.packetid, "redundant"] = True
+sender_df.loc[ sender_df[ _sender("packetid")] == reinjection.packetid, "redundant"] = True
+print("is this where it's wrong ?")


print("results: ", df[ df.redundant == True] )
@@ -718,15 +722,15 @@ def _print_reinjection_comparison(original_packet, reinj):
# df[ df.redundant == False] && df["reinjected_in" + RECEIVER_SUFFIX])

for row in successful_reinjections.itertuples(index=False):
-print("full row %r" % (row,))
+# print("full row %r" % (row,))

# loc ? this is an array, sort it and take the first one ?
# initial_packetid = getattr(row, _sender("reinjection_of")),
initial_packetid = row.reinjection_of[0]
-print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))
+# print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))

original_packet = df_all.loc[ df_all.packetid == initial_packetid ].iloc[0]
-print("original packet = %r %s" % (original_packet, type(original_packet)))
+# print("original packet = %r %s" % (original_packet, type(original_packet)))

_print_reinjection_comparison(original_packet, row)
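The loop above marks a reinjection as redundant whenever the original transmission reached the receiver first (orig_arrival < reinj_arrival). A condensed sketch of that itertuples-plus-.loc pattern, with invented column names standing in for the suffixed sender/receiver ones:

import pandas as pd

# toy frame: one row per reinjected packet, with arrival times of the
# original transmission and of the reinjection at the receiver
df = pd.DataFrame({
    "packetid": [1, 2, 3],
    "orig_arrival": [0.10, 0.25, 0.40],
    "reinj_arrival": [0.15, 0.20, 0.55],
    "redundant": False,
})

for row in df.itertuples():
    # the reinjection is useless if the original data arrived first
    if row.orig_arrival < row.reinj_arrival:
        df.loc[df.packetid == row.packetid, "redundant"] = True

print(df[df.redundant == True])   # packets 1 and 3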

mptcpanalyzer/data.py (+35, -20)
@@ -19,9 +19,8 @@

pp = pprint.PrettyPrinter(indent=4)

-# ['b', 'a']
-dtype_role = pd.api.types.CategoricalDtype(categories=ConnectionRoles, ordered=True)
-# df1['mptcpdest'] = pd.Categorical(np.nan, ordered=False, categories=ConnectionRoles) ;
+# dtype_role = pd.api.types.CategoricalDtype(categories=ConnectionRoles, ordered=True)
+dtype_role = pd.api.types.CategoricalDtype(categories=[ x.name for x in ConnectionRoles], ordered=True)


# columns we usually display to debug dataframes
@@ -64,6 +63,7 @@ def _convert_flags(x):

def _convert_to_list(x, field="pass a field to debug"):
"""
+Loads x of the form "1,2,5" or None
for instance functools.partial(_convert_to_list, field="reinjectionOf"),
returns np.nan instead of [] to allow for faster filtering
"""
@@ -235,10 +235,11 @@ def load_merged_streams_into_pandas(
# columns=columns,
index=False,
header=True,
-
sep=tshark_config.delimiter,
)

+print("MATT=", dict(merged_df.dtypes))
+
# print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))


@@ -247,8 +248,6 @@ def load_merged_streams_into_pandas(
csv_fields = tshark_config.get_fields("name", "type")
# dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}
def _gen_dtypes(fields):
-
-
dtypes = {} # type: ignore
for suffix in [ SENDER_SUFFIX, RECEIVER_SUFFIX]:

@@ -257,24 +256,40 @@ def _gen_dtypes(fields):
dtypes.setdefault(suffix_fields(suffix, k), v)

dtypes.update({
+# during the merge, we join even unmapped packets so some entries
+# may be empty => float64
+_sender("packetid"): np.float64,
+_receiver("packetid"): np.float64,
+# there is a bug currently
+# https://github.com/pandas-dev/pandas/pull/20826
'mptcpdest': dtype_role,
'tcpdest': dtype_role,
+# '_merge':
})
return dtypes

+def _load_list(x, field="set field to debug"):
+"""
+Contrary to _convert_to_list
+"""
+res = ast.literal_eval(x) if (x is not None and x != '') else np.nan
+return res
+
with open(cachename) as fd:
import ast
dtypes = _gen_dtypes(csv_fields)
-pd.set_option('display.max_rows', 200)
-pd.set_option('display.max_colwidth', -1)
+
+# more recent versions can do without it
+# pd.set_option('display.max_rows', 200)
+# pd.set_option('display.max_colwidth', -1)
print("dtypes=", dict(dtypes))
merged_df = pd.read_csv(
fd,
-# skip_blank_lines=True,
+skip_blank_lines=True,
# hum not needed with comment='#'
comment='#',
# we don't need 'header' when metadata is with comment
-header=0, # read column names from row 2 (before, it's metadata)
+# header=0, # read column names from row 2 (before, it's metadata)
# skiprows
sep=tshark_config.delimiter,
# converters={
@@ -288,10 +303,15 @@ def _gen_dtypes(fields):
converters={
_sender("tcpflags"): _convert_flags,
# reinjections, converts to list of integers
-_sender("reinjection_of"): ast.literal_eval,
-_sender("reinjected_in"): ast.literal_eval,
-_receiver("reinjection_of"): ast.literal_eval,
-_receiver("reinjected_in"): ast.literal_eval,
+_sender("reinjection_of"): functools.partial(_load_list, field="reinjectedOfSender"),
+_sender("reinjected_in"): functools.partial(_load_list, field="reinjectedInSender"),
+_receiver("reinjection_of"): functools.partial(_load_list, field="reinjectedInReceiver"),
+_receiver("reinjected_in"): functools.partial(_load_list, field="reinjectedInReceiver"),
+
+# there is a bug in pandas see https://github.com/pandas-dev/pandas/pull/20826
+# where the
+"mptcpdest": lambda x: ConnectionRoles[x] if x else np.nan,
+
# "mptcp.reinjection_of": functools.partial(_convert_to_list, field="reinjectionOf"),
# "mptcp.reinjection_listing": functools.partial(_convert_to_list, field="reinjectedIn"),
# "mptcp.reinjected_in": functools.partial(_convert_to_list, field="reinjectedIn"),
@@ -374,8 +394,6 @@ def load_into_pandas(
try:
with open(csv_filename) as fd:

-
-# TODO use packetid as Index
data = pd.read_csv(
fd,
comment='#',
@@ -388,14 +406,13 @@ def load_into_pandas(
"tcp.flags": _convert_flags,
# reinjections, converts to list of integers
"mptcp.reinjection_of": functools.partial(_convert_to_list, field="reinjectionOf"),
-# "mptcp.reinjection_listing": functools.partial(_convert_to_list, field="reinjectedIn"),
"mptcp.reinjected_in": functools.partial(_convert_to_list, field="reinjectedIn"),
-# "mptcp.duplicated_dsn": lambda x: list(map(int, x.split(','))) if x is not None else np.nan,
},
# nrows=10, # useful for debugging purpose
)
data.rename(inplace=True, columns=config.get_fields("fullname", "name"))
# we want packetid column to survive merges/dataframe transformation so keepit as a column
+# TODO remove ? let other functions do it ?
data.set_index("packetid", drop=False, inplace=True)
log.debug("Column names: %s", data.columns)

@@ -887,8 +904,6 @@ def map_tcp_packets_via_hash(
res = pd.merge(
sender_df, receiver_df,
on="hash",
-# right_index=True,
-# TODO en fait suffit d'inverser les suffixes, h1, h2
suffixes=(SENDER_SUFFIX, RECEIVER_SUFFIX), # columns suffixes (sender/receiver)
how="outer", # we want to keep packets from both
# we want to know how many packets were not mapped correctly, adds the _merge column
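For reference, a hedged sketch of the converter-based loading that data.py now leans on: list-valued columns are rebuilt with ast.literal_eval (as the new _load_list helper does) and the role column is mapped back onto the enum by name. The CSV content, column names and enum below are illustrative stand-ins, not mptcpanalyzer's actual capture format.

import ast
import enum
import functools
import io

import numpy as np
import pandas as pd


class ConnectionRoles(enum.Enum):
    # stand-in for mptcpanalyzer's ConnectionRoles
    Client = 0
    Server = 1


def _load_list(x, field="set field to debug"):
    # same idea as the helper added above: parse "[3, 4]" back into a list,
    # return NaN for empty cells so dropna()-style filtering stays cheap
    return ast.literal_eval(x) if (x is not None and x != '') else np.nan


csv_text = "packetid,reinjection_of,mptcpdest\n1,\"[3, 4]\",Client\n2,,Server\n"

df = pd.read_csv(
    io.StringIO(csv_text),
    converters={
        "reinjection_of": functools.partial(_load_list, field="reinjection_of"),
        "mptcpdest": lambda x: ConnectionRoles[x] if x else np.nan,
    },
)
print(df.loc[0, "reinjection_of"])   # [3, 4]
print(df.loc[1, "mptcpdest"])        # ConnectionRoles.Server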
