ok ?

teto · teto · commit 4e5a70804cff · 2019-02-27T16:58:06.000+09:00
diff --git a/mptcpanalyzer/cli.py b/mptcpanalyzer/cli.py
@@ -54,7 +54,7 @@
 import mptcpanalyzer.pdutils
 import dataclasses
 from colorama import Fore, Back
-
+from mptcpanalyzer.pdutils import debug_dataframe
 from stevedore import extension
 
 plugin_logger = logging.getLogger("stevedore")
diff --git a/mptcpanalyzer/data.py b/mptcpanalyzer/data.py
@@ -26,18 +26,18 @@
 
 # TODO might need a converter when saving/loading
 # TODO pandas.api.types.register_extension_dtype()
-dtype_role = pd.api.types.CategoricalDtype(categories=list(ConnectionRoles), ordered=True)
+dtype_role = pd.api.types.CategoricalDtype(categories=ConnectionRoles, ordered=True)
 
 TCP_DEBUG_FIELDS = ['hash', 'packetid', "reltime", "abstime"]
 MPTCP_DEBUG_FIELDS = TCP_DEBUG_FIELDS + ['mptcpdest']
 
 
-# def _convert_role(x):
-#     """
-#     Workaround https://github.com/pandas-dev/pandas/pull/20826
-#     """
-#     log.log(mp.TRACE, "converting [%r] into role" % x)
-#     return ConnectionRoles.from_string(x)
+def _convert_role(x):
+    """
+    Workaround https://github.com/pandas-dev/pandas/pull/20826
+    """
+    log.log(mp.TRACE, "converting [%r] into role" % x)
+    return ConnectionRoles(x)
 
 def ignore(f1, f2):
     return 0
@@ -96,6 +96,10 @@ def getrealpath(input_file):
 per_pcap_artificial_fields = {
     "mptcpdest": Field("mptcpdest", dtype_role, "MPTCP destination", False, None),
     "tcpdest": Field("tcpdest", dtype_role, "TCP destination", False, None),
+
+    # "mptcpdest": Field("mptcpdest", None, "MPTCP destination", False, _convert_role),
+    # "tcpdest": Field("tcpdest", None, "TCP destination", False, _convert_role),
+
     # TODO use int? as type
     "hash": Field("hash", str, "Hash of fields", False, None),
 
@@ -287,7 +291,7 @@ def _gen_converters() -> Dict[str, Callable]:
                     converters=converters,
                 )
                 # at this stage, destinatiosn are nan
-                debug_dataframe(merged_df, "Merged dataframe", )
+                # debug_dataframe(merged_df, "Merged dataframe", )
 
                 # log.debug("Column names after loading from cache: %s", merged_df.columns)
 
@@ -316,8 +320,8 @@ def _gen_converters() -> Dict[str, Callable]:
             # don't do it here else we might repeat it
             # data["abstime"] += clock_offset
 
-        debug_dataframe(res, "checking merge", usecols=["merge_status"])
-        print("%d nan values" % len(res[res.merge_status == np.nan]))
+        # debug_dataframe(res, "checking merge", usecols=["merge_status"])
+        # print("%d nan values" % len(res[res.merge_status == np.nan]))
 
         # log.debug("Column names: %s", res.columns)
         # log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
@@ -610,13 +614,14 @@ def _rename_column(col_name, suffixes) -> str:
             log.log(mp.TRACE, "renaming inplace")
 
             tdf.rename(columns=rename_func, inplace=True)
-            debug_dataframe(tdf, "temporary dataframe")
+            # debug_dataframe(tdf, "temporary dataframe")
             total = pd.concat([total, tdf], ignore_index=True, sort=False, )
             print("total df size = %d" % len(total))
 
         # subdf[ _first("tcpdest") == ConnectionRole.Client] .rename(columns=_rename_cols, inplace=True)
         # print(subdf.columns)
         # print(total.columns)
+    debug_dataframe(total, "total")
 
     logging.debug("Converted to sender/receiver format")
     return total
@@ -641,14 +646,14 @@ def merge_tcp_dataframes_known_streams(
     2/ identify which dataframe is server's/client's
     2/
 
-    Adds a merge_status column
 
     Args:
         con1: Tuple dataframe/tcpstream id
         con2: same
 
     Returns:
-        res
+        A dataframe with a "merge_status" column and valid tcp/mptcp destinations
+
         To ease debug we want to see packets in chronological order
 
     """
@@ -690,14 +695,15 @@ def merge_tcp_dataframes_known_streams(
         # generate_mptcp_direction_query
         if isinstance(main_connection, MpTcpSubflow):
 
-            print("THIS IS A SUBFLOW")
+            log.debug("This is a subflow, setting mptcp destinations...")
             mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
             res[_first('mptcpdest')][:] = mptcpdest
             res[_second('mptcpdest')][:] = mptcpdest
 
             log.debug("Setting mptcpdest to %s" % mptcpdest)
             # if tcpdest == main_connection.mptcpdest
 
+        debug_dataframe(total, "concanated df", usecols=["tcpdest", "mptcpdest"])
         # TODO here we should
         total = pd.concat([res, total])
 
@@ -894,6 +900,7 @@ def map_tcp_packets(
     # con1: TcpConnection, con2: TcpConnection
 ) -> pd.DataFrame:
     '''
+    Returns a dataframe (TODO: document its format).
     '''
     if mode == "hash":
         res = map_tcp_packets_via_hash(sender_df, receiver_df, explain)
@@ -923,14 +930,14 @@ def map_tcp_packets_via_hash(
     ):
     """
     Merge on hash of different fields
+    Resulting dataframe has H1_SUFFIX / H2_SUFFIX
     """
     log.info("Merging packets via hash")
     debug_cols = ["packetid", "hash", "reltime"]
 
     from .pdutils import debug_dataframe
-    debug_dataframe(sender_df, "sender_df", )
-    debug_dataframe(receiver_df, "receiver df")
-    # print(receiver_df[debug_cols].head(20))
+    # debug_dataframe(sender_df, "sender_df", )
+    # debug_dataframe(receiver_df, "receiver df")
     # print("sender_df dtype=", sender_df.dtypes.tcpdest)
     # print("receiver_df dtype=", receiver_df.dtypes.tcpdest)
 
diff --git a/mptcpanalyzer/exporter.py b/mptcpanalyzer/exporter.py
@@ -8,7 +8,7 @@
 import subprocess
 import logging
 import sys
-from mptcpanalyzer.tshark import TsharkConfig, convert_csv_to_sql
+from mptcpanalyzer.tshark import TsharkConfig
 
 log = logging.getLogger(__name__)
 
diff --git a/mptcpanalyzer/parser.py b/mptcpanalyzer/parser.py
@@ -178,6 +178,7 @@ def cmd_wrapper(instance, cmdline):
 class AppendDestination(DataframeAction):
     """
     assume convention on naming
+    TODO check if it's ok with FilterDest
     """
 
     def __init__(self, *args, **kwargs) -> None:
@@ -335,6 +336,7 @@ def __init__(self, df_name: str, **kwargs) -> None:
         # init with all destinations
         self.destinations = list(ConnectionRoles)
         self.already_called = False
+        # TODO: this could set `choices` automatically
         super().__init__(df_name, **kwargs)
 
     def __call__(self, parser, namespace, values, option_string=None):
diff --git a/mptcpanalyzer/pdutils.py b/mptcpanalyzer/pdutils.py
@@ -43,7 +43,7 @@ def debug_dataframe(
     print(df.info())
     # print(df.columns)
     # print(df.dtypes)
-    pp.pformat(df.dtypes)
+    print(pp.pformat(df.dtypes))
     with pd.option_context('float_format', '{:f}'.format):
         sdf = df
         if usecols:
diff --git a/mptcpanalyzer/statistics.py b/mptcpanalyzer/statistics.py
@@ -73,13 +73,15 @@ def tcp_get_stats(
     con = TcpConnection.build_from_dataframe(df, tcpstreamid)
 
     df2 = tcpdest_from_connections(df, con)
+
+    log.debug("df2 size = %d" % len(df2))
     # q = con.generate_direction_query(destination)
     # df = unidirectional_df = df.query(q, engine="python")
     # return (TcpUnidirectionalStats(),  TcpUnidirectionalStats() )
     # res = { }
     # debug_dataframe(df2, "before connection", )
     # for destination in ConnectionRoles:
-    log.log(mp.TRACE, "Looking at role %s" % destination)
+    log.debug("Looking at role %s" % destination)
     # print(df2["tcpdest"])
     # TODO assume it's already filtered ?
     sdf = df2[df2.tcpdest == destination]
@@ -132,13 +134,15 @@ def mptcp_compute_throughput(
     # -1 because of syn
     dsn_range = dsn_max - dsn_min - 1
 
+    # Could groupby destination as well
     d = df.groupby(_sender('tcpstream'))
     subflow_stats: List[TcpUnidirectionalStats] = []
     for tcpstream, subdf in d:
         # subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]
-        debug_dataframe(subdf, "subdf")
+        debug_dataframe(subdf, "subdf for stream %d" % tcpstream)
+        dest = subdf.iloc[0, subdf.columns.get_loc(_sender('tcpdest'))]
         sf_stats = tcp_get_stats(subdf, tcpstream,
-            subdf.iloc[0, subdf.columns.get_loc(_sender('tcpdest'))],
+            dest,
         True)
 
         # TODO drop retransmitted
@@ -173,6 +177,8 @@ def mptcp_compute_throughput_extended(
 
     Should display goodput
     """
+    assert isinstance(destination, ConnectionRoles)
+    log.debug("Looking at destination %s", destination)  # lazy %-arg needs a placeholder
     df_both = classify_reinjections(rawdf)
 
     df = df_both[df_both.mptcpdest == destination]