Skip to content

Commit 8897e7b

Browse files
committed
list_mptcp_connections chose the wrong mptcpdest
because it was comparing values of different types. For now, I encode the failing fields as str instead of UInt64 (dsnraw seems affected as well); see pandas-dev/pandas#25472 for more details
1 parent 7c82b52 commit 8897e7b

File tree

6 files changed

+51
-52
lines changed

6 files changed

+51
-52
lines changed

mptcpanalyzer/cli.py

+6-23
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,8 @@ def list_subflows(self, mptcpstreamid: MpTcpStreamId):
362362

363363
try:
364364
con = MpTcpConnection.build_from_dataframe(self.data, mptcpstreamid)
365-
self.poutput("mptcp.stream %d has %d subflow(s) (client/server): "
366-
% (mptcpstreamid, len(con.subflows())))
365+
msg = "mptcp.stream %d has %d subflow(s) (client/server): "
366+
self.poutput(msg % (mptcpstreamid, len(con.subflows())))
367367
for sf in con.subflows():
368368
self.poutput("\t%s" % sf)
369369
except mp.MpTcpException as e:
@@ -526,7 +526,6 @@ def do_tcp_summary(self, args, unknown):
526526
df = self.data
527527

528528
# args.pcapdestinations ?
529-
# print(args)
530529

531530
for dest in ConnectionRoles:
532531
# TODO do it only when needed
@@ -573,7 +572,7 @@ def do_mptcp_summary(self, args, unknown):
573572
mptcpstream = args.mptcpstream
574573

575574
# args.pcapdestinations ?
576-
print(args)
575+
# print(args)
577576
ret = mptcp_compute_throughput(
578577
self.data, args.mptcpstream, args.destination
579578
)
@@ -670,7 +669,7 @@ def do_summary_extended(self, args, unknown):
670669
For now it is naive, does not look at retransmissions ?
671670
"""
672671

673-
print("%r" % args)
672+
print("Summary extended resume %r" % args)
674673
df_pcap1 = load_into_pandas(args.pcap1, self.tshark_config)
675674

676675
# to abstract things a bit
@@ -779,17 +778,6 @@ def do_list_mptcp_connections(self, *args):
779778
self.list_subflows(mptcpstream)
780779
self.poutput("\n")
781780

782-
# def generate_namespace(self) -> argparse.Namespace:
783-
# myNamespace = Namespace()
784-
# myNamespace.toto = self.data
785-
# parser = argparse_completer.ACArgumentParser(
786-
# description="""
787-
# Mptcpanalyzer filters pcaps to keep only tcp packets.
788-
# This may explain why printed packet ids dont map
789-
# """
790-
# )
791-
792-
793781

794782
parser = MpTcpAnalyzerParser(
795783
description="Export a pcap that can be used with wireshark to debug ids"
@@ -1091,12 +1079,7 @@ def do_load_pcap(self, args):
10911079
"""
10921080
Load the file as the current one
10931081
"""
1094-
print(args)
1095-
# args = shlex.split(args)
10961082
# print(args)
1097-
# parser = self.do_load_pcap.argparser
1098-
# print(parser)
1099-
# args = parser.parse_args(args)
11001083

11011084
self.poutput("Loading %s" % args.input_file)
11021085
self.data = args._dataframes["input_file"]
@@ -1138,15 +1121,15 @@ def do_plot(self, args, unknown):
11381121
# 'converts' the namespace to for the syntax define a dict
11391122
dargs = vars(args)
11401123

1141-
print("%s" % dargs)
1124+
# print("%s" % dargs)
11421125
dataframes = dargs.pop("_dataframes")
11431126
# workaround argparse limitations to set as default both directions
11441127
# TODO replace that with an action ?
11451128
# destinations=dargs.get("destinations", list(mp.ConnectionRoles))
11461129
# dargs.update(destinations=destinations)
11471130
# log.debug("Selecting destinations %s" % (destinations,))
11481131
# dataframes = plotter.preprocess(**dargs)
1149-
print("%s" % args)
1132+
print("DO_PLOT %s" % args)
11501133
# dataframes = args._dataframes.values()
11511134
assert dataframes is not None, "Preprocess must return a list"
11521135
# pass unknown_args too ?

mptcpanalyzer/connection.py

+20-5
Original file line numberDiff line numberDiff line change
@@ -154,19 +154,23 @@ def __str__(self):
154154
# :>5d
155155
# TODO should be converted to int instead, would spare some memory
156156
line = ("tcp.stream {s.tcpstreamid:.0f}: {s.tcpclient_ip}:{s.client_port:0>5.0f} "
157-
" <-> {s.tcpserver_ip}:{s.server_port:0>5.0f} ").format(s=self,
157+
" -> {s.tcpserver_ip}:{s.server_port:0>5.0f} ").format(s=self,
158158
# tcpstreamid=self.tcpstreamid
159159
)
160160
return line
161161

162162

163163
# should it ?
164-
@dataclass
164+
# @dataclass
165165
class MpTcpSubflow(TcpConnection):
166166
"""
167167
168168
"""
169169

170+
""" to which mptcp side belongs the tcp server"""
171+
# mptcpdest: ConnectionRoles
172+
# addrid: int = None
173+
170174
def __init__(self, mptcpdest: ConnectionRoles, addrid=None, **kwargs) -> None:
171175
super().__init__(**kwargs)
172176
self.addrid = addrid
@@ -233,6 +237,7 @@ def __str__(self):
233237

234238

235239
# @dataframe
240+
# @dataclass
236241
class MpTcpConnection:
237242
"""
238243
Holds key characteristics of an MPTCP connection: keys, tokens, subflows
@@ -241,6 +246,8 @@ class MpTcpConnection:
241246
242247
subflows can be any order
243248
"""
249+
# mptcpstreamid: MpTcpStreamId
250+
244251
def __init__(self,
245252
mptcpstreamid: int,
246253
client_key: int, client_token: int, server_key: int,
@@ -294,6 +301,8 @@ def subflows(self, mptcpdest: ConnectionRoles = ConnectionRoles.Server):
294301
def build_from_dataframe(ds: pd.DataFrame, mptcpstreamid: MpTcpStreamId) -> 'MpTcpConnection':
295302
"""
296303
Instantiates a class that describes an MPTCP connection
304+
305+
Look for the first 2 packets containing "sendkey"
297306
"""
298307

299308
def get_index_of_non_null_values(serie):
@@ -345,17 +354,23 @@ def get_index_of_non_null_values(serie):
345354
receiver_token = subflow_ds["recvtok"].iloc[row]
346355

347356
# if we see the token
357+
log.debug("receiver_token %r to compare with server_token %r" % (receiver_token, server_token))
358+
log.debug("Test %s" % (receiver_token == server_token))
359+
mptcpdest = ConnectionRoles.Server if receiver_token == server_token \
360+
else ConnectionRoles.Client
361+
348362
subflow = MpTcpSubflow.create_subflow(
349-
mptcpdest = ConnectionRoles.Server if receiver_token == server_token \
350-
else ConnectionRoles.Client,
363+
mptcpdest = mptcpdest,
351364
tcpstreamid =tcpstreamid,
352365
tcpclient_ip=subflow_ds['ipsrc'].iloc[row],
353366
tcpserver_ip=subflow_ds['ipdst'].iloc[row],
354367
client_port =subflow_ds['sport'].iloc[row],
355368
server_port =subflow_ds['dport'].iloc[row],
356369
addrid =None,
357370
# rcv_token =receiver_token,
358-
)
371+
)
372+
373+
log.debug("Created subflow %s" % subflow)
359374

360375
subflows.append(subflow)
361376

mptcpanalyzer/data.py

+8-12
Original file line numberDiff line numberDiff line change
@@ -430,21 +430,22 @@ def load_into_pandas(
430430

431431
converters = {f.fullname: f.converter for _, f in config.fields.items() if f.converter}
432432
converters.update({name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
433-
# print("converters\n", converters)
434433

435434
dtypes = {field.fullname: field.type for _, field in config.fields.items() if field.converter is None}
436-
log.debug("Dtypes before load: %s" % dtypes)
437-
log.debug("Converters before load: %s" % converters)
435+
log.debug("Dtypes before load: %s" % (pp.pformat(dtypes)))
436+
log.debug("Converters before load: %s" % (pp.pformat(converters)))
438437

439438
from .pdutils import read_csv_debug
440-
fields = [f.fullname for _, f in config.fields.items()]
439+
# fields = [f.fullname for _, f in config.fields.items()]
440+
# fields =[ "tcp.options.mptcp.sendkey" ]
441441
# data = read_csv_debug(fields,
442442
data = pd.read_csv(
443443
fd,
444444
comment='#',
445445
sep=config.delimiter,
446446
dtype=dtypes,
447-
# usecols = [config.fields["ipsrc"].fullname ],
447+
# config.fields["ipsrc"].fullname
448+
# usecols = [ "tcp.options.mptcp.sendkey" ],
448449
# seems like for now we can't change the default representation apart from converting the column to
449450
# a string !!!
450451
# https://stackoverflow.com/questions/46930201/pandas-to-datetime-is-not-formatting-the-datetime-value-in-the-desired-format
@@ -456,7 +457,7 @@ def load_into_pandas(
456457
converters=converters,
457458
# float_precision="high", # might be necessary
458459
# nrows=13, # useful for debugging purpose
459-
# chunksize=5, # useful for debugging purpose
460+
# chunksize=1, # useful for debugging purpose
460461
)
461462

462463
log.debug("Finished loading CSV file")
@@ -472,7 +473,7 @@ def load_into_pandas(
472473
# we want packetid column to survive merges/dataframe transformation so keepit as a column
473474
# TODO remove ? let other functions do it ?
474475
data.set_index("packetid", drop=False, inplace=True)
475-
log.debug("Column names: %s" % data.columns)
476+
# log.debug("Column names: %s" % data.columns)
476477

477478
hashing_fields = [name for name, field in config.fields.items() if field.hash]
478479
log.debug("Hashing over fields %s" % hashing_fields)
@@ -488,14 +489,9 @@ def load_into_pandas(
488489
except Exception as e:
489490
logging.error("You may need to filter more your pcap to keep only mptcp packets")
490491
raise e
491-
# finally:
492-
# print (data)
493492

494493
log.info("Finished loading dataframe for %s. Size=%d" % (input_file, len(data)))
495494

496-
# print("FINAL_DTYPES")
497-
# log.debug(data.dtypes)
498-
# print(data.head(5))
499495
return data
500496

501497

mptcpanalyzer/pdutils.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def debug_dataframe(
5454
def read_csv_debug(fields, fd, *args, first_try=True, **kwargs):
5555
"""
5656
Help debugging dataframe loading errors (with dtypes/converters)
57-
chunksize: bool,
57+
chunksize: bool,
5858
"""
5959

6060
chunksize = kwargs.get("chunksize")
@@ -63,22 +63,23 @@ def read_csv_debug(fields, fd, *args, first_try=True, **kwargs):
6363
kwargs.pop("chunksize", None)
6464

6565
for field in fields:
66-
print("TESTING field ", field)
66+
print("TESTING field %s (first_try ? %s ) " % (field, first_try))
67+
print(kwargs.get("dtype")[field])
6768
try:
6869
res = pd.read_csv(
69-
fd,
70-
*args,
71-
usecols=[ field],
72-
**kwargs
73-
)
70+
fd,
71+
*args,
72+
usecols=[ field],
73+
**kwargs
74+
)
7475
if chunksize is not None:
7576
for i, chunk in enumerate(res):
7677
# print("chunk %d" % i)
7778
print(chunk)
7879
except TypeError as e:
7980
# TODO retry with chunksize
8081
if first_try:
81-
kwargs.update({"chunksize":chunksize or 40})
82+
kwargs.update({"chunksize": chunksize or 40})
8283
fd.seek(0)
8384
read_csv_debug([field], fd, *args, first_try=False, **kwargs)
8485
else:

mptcpanalyzer/statistics.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,10 @@ def mptcp_compute_throughput(
164164
subflow_stats: List[TcpUnidirectionalStats] = []
165165
for tcpstream, subdf in d:
166166
# subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]
167-
debug_dataframe(subdf, "subdf for stream %d" % tcpstream)
167+
# debug_dataframe(subdf, "subdf for stream %d" % tcpstream)
168168
dest = subdf.iloc[0, subdf.columns.get_loc(_sender('tcpdest'))]
169169
sf_stats = tcp_get_stats(subdf, tcpstream,
170+
# work around pandas issue
170171
ConnectionRoles(dest),
171172
True)
172173

mptcpanalyzer/tshark.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -198,9 +198,12 @@ def add_mptcp_fields(self, advanced=True):
198198
# remove this one ?
199199
self.add_field("mptcp.expected_token", "expected_token", str, False, False)
200200
self.add_field("mptcp.stream", "mptcpstream", 'UInt64', False, False)
201-
self.add_field("tcp.options.mptcp.sendkey", "sendkey", np.float64, False, True)
202-
self.add_field("tcp.options.mptcp.recvkey", "recvkey", np.float64, False, True)
203-
self.add_field("tcp.options.mptcp.recvtok", "recvtok", np.float64, False, True)
201+
202+
# TODO convert to 'UInt64'
203+
self.add_field("tcp.options.mptcp.sendkey", "sendkey", str, False, True)
204+
self.add_field("tcp.options.mptcp.recvkey", "recvkey", str, False, True)
205+
self.add_field("tcp.options.mptcp.recvtok", "recvtok", str, False, True)
206+
204207
self.add_field("tcp.options.mptcp.datafin.flag", "datafin", 'Int64', False, True)
205208
# this is a list really; can contain "2,4"
206209
self.add_field("tcp.options.mptcp.subtype", "subtype", str, False, True)

0 commit comments

Comments
 (0)