Commit ed34871

renamed _merge column to merge_status

it's a categorical
1 parent: 65f2876

File tree

6 files changed: +55 -49

mptcpanalyzer/__init__.py
mptcpanalyzer/cli.py
mptcpanalyzer/data.py
mptcpanalyzer/parser.py
mptcpanalyzer/pdutils.py
mptcpanalyzer/plots/owd.py

mptcpanalyzer/__init__.py (+10 -5)
@@ -106,6 +106,11 @@ class MpTcpStreamId(int):
     pass
 
 # Keep it as Enum so that it gets serialized as a string in the CSV
+# @register_extension_dtype
+# must be implemented
+# * type
+# * name
+# * construct_from_string
 class ConnectionRoles(IntEnum):
     """
     Used to filter datasets and keep packets flowing in only one direction !
@@ -114,12 +119,12 @@ class ConnectionRoles(IntEnum):
     Client = auto()
     Server = auto()
 
-    def __str__(self):
-        # Note that defining __str__ is required to get ArgumentParser's help output to include
-        # the human readable (values) of Color
-        return self.name
+    # def __str__(self):
+    #     # Note that defining __str__ is required to get ArgumentParser's help output to include
+    #     # the human readable (values) of Color
+    #     return self.name
 
-    @staticmethod
+    # @staticmethod
     def from_string(s):
         try:
             return ConnectionRoles[s]
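
The new comment block points at pandas' extension-dtype API. For orientation, a minimal sketch of the three members it lists (hypothetical names, not part of this commit; a complete dtype would also need construct_array_type()):

from pandas.api.extensions import ExtensionDtype, register_extension_dtype

@register_extension_dtype
class RoleDtype(ExtensionDtype):
    name = "role"  # * name: the string pandas resolves, e.g. astype("role")

    @property
    def type(self):  # * type: the scalar type stored in the array
        return ConnectionRoles

    @classmethod
    def construct_from_string(cls, string):  # * construct_from_string
        if string == cls.name:
            return cls()
        raise TypeError(f"Cannot construct a 'RoleDtype' from '{string}'")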

mptcpanalyzer/cli.py (+0 -1)
@@ -550,7 +550,6 @@ def do_tcp_summary(self, args, unknown):
     # TODO use filter_dest instead
     summary_parser.add_argument(
         'destination',
-        # mp.DestinationChoice,
         action="store", choices=mp.DestinationChoice, type=lambda x: mp.ConnectionRoles[x],
         help='Filter flows according to their direction'
         '(towards the client or the server)'
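
The add_argument call above pairs an IntEnum with argparse, which is also why the __str__ note was worth keeping in __init__.py. A standalone sketch of that pattern (assumed, simplified names):

import argparse
from enum import IntEnum, auto

class ConnectionRoles(IntEnum):
    Client = auto()
    Server = auto()

    def __str__(self):
        return self.name  # keeps --help output human readable

parser = argparse.ArgumentParser()
parser.add_argument("destination",
                    choices=list(ConnectionRoles),      # shown as {Client,Server}
                    type=lambda x: ConnectionRoles[x],  # "Client" -> ConnectionRoles.Client
                    help="Filter flows according to their direction")
print(parser.parse_args(["Client"]).destination)        # ConnectionRoles.Client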

mptcpanalyzer/data.py (+36 -35)
@@ -32,15 +32,12 @@
 MPTCP_DEBUG_FIELDS = TCP_DEBUG_FIELDS + ['mptcpdest']
 
 
-def _convert_role(x):
-    """
-    Workaround https://github.com/pandas-dev/pandas/pull/20826
-    """
-    log.log(mp.TRACE, "converting [%r] into role" % x)
-
-    # else throw
-    return ConnectionRoles.from_string(x)
-    # return ConnectionRoles[x] if x else np.nan
+# def _convert_role(x):
+#     """
+#     Workaround https://github.com/pandas-dev/pandas/pull/20826
+#     """
+#     log.log(mp.TRACE, "converting [%r] into role" % x)
+#     return ConnectionRoles.from_string(x)
 
 def ignore(f1, f2):
     return 0
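
For context, a per-value converter like the retired _convert_role would have been handed to pandas at load time roughly like this (hypothetical wiring and filename, reconstructed for illustration only):

import pandas as pd

df = pd.read_csv(
    "packets.csv",  # hypothetical file
    converters={"tcpdest": ConnectionRoles.from_string},
)
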
@@ -97,14 +94,14 @@ def getrealpath(input_file):
 On top of Tshark fields, we also describe fields generated by mptcpanalyzer
 """
 per_pcap_artificial_fields = {
-    # TODO use dtype_role as type
     "mptcpdest": Field("mptcpdest", dtype_role, "MPTCP destination", False, None),
     "tcpdest": Field("tcpdest", dtype_role, "TCP destination", False, None),
+    # TODO use int? as type
    "hash": Field("hash", str, "Hash of fields", False, None),
 
     # TODO rename ?
     # TODO should be a CategoryDataType !
-    "merge": Field("_merge", None, "How many packets were merged", False, None)
+    # "merge": Field("_merge", None, "How many packets were merged", False, None)
 }
 
 # merged_per_pcap_artificial_fields = {
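
The "should be a CategoryDataType" TODO matches the commit message ("it's a categorical"): the indicator column produced by pd.merge is categorical with a fixed value set. An explicit dtype for it would be, as a sketch (not in this commit):

import pandas as pd

merge_status_dtype = pd.CategoricalDtype(categories=["left_only", "right_only", "both"])
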
@@ -149,8 +146,8 @@ def load_merged_streams_into_pandas(
     """
     Arguments:
         protocol: mptcp or tcp
-
         mapping_mode: Only HASH works for now
+        clock_offset: untested
 
     Returns
         a dataframe with columns... owd ?
@@ -247,12 +244,10 @@ def _gen_dtypes(fields) -> Dict[str, Any]:
     dtypes.update({_name(f.fullname): f.type for f in per_pcap_artificial_fields.values()})
 
     # these are overrides from the generated dtypes
-    dtypes.update({
-        # during the merge, we join even unmapped packets so some entries
-        # may be empty => float64
-        _first("packetid"): tshark_config.fields["packetid"].type,
-        _second("packetid"): tshark_config.fields["packetid"].type,
-    })
+    # dtypes.update({
+    #     _first("packetid"): tshark_config.fields["packetid"].type,
+    #     _second("packetid"): tshark_config.fields["packetid"].type,
+    # })
 
     return dtypes
 
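The override being retired here existed because an outer join leaves holes, and plain int64 cannot represent NaN, so integer columns silently become float64. A sketch of the effect and of pandas' nullable alternative (assumed column names):

import pandas as pd

left = pd.DataFrame({"hash": ["a", "b"], "packetid": pd.array([1, 2], dtype="Int64")})
right = pd.DataFrame({"hash": ["b", "c"], "packetid": pd.array([3, 4], dtype="Int64")})
merged = left.merge(right, on="hash", how="outer", suffixes=("_h1", "_h2"))
print(merged.dtypes)  # both packetid columns stay Int64; the holes become <NA>
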
@@ -321,6 +316,9 @@ def _gen_converters() -> Dict[str, Callable]:
     # don't do it here else we might repeat it
     # data["abstime"] += clock_offset
 
+    debug_dataframe(res, "checking merge", usecols=["merge_status"])
+    print("%d nan values" % len(res[res.merge_status == np.nan]))
+
     # log.debug("Column names: %s", res.columns)
     # log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
     # print(res["mptcpdest"].dtype)
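
One caveat in the debug line added above: NaN never compares equal to anything, so res.merge_status == np.nan selects no rows and the print always reports 0. If the intent is to count missing entries, the conventional form is:

print("%d nan values" % res["merge_status"].isna().sum())
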
@@ -545,14 +543,10 @@ def tcpdest_from_connections(df, con: TcpConnection) -> pd.DataFrame:
 
 def convert_to_sender_receiver(
     df
-    # def tcp_compute_owd(
-    # already merged df
-    # con1: Tuple[pd.DataFrame, TcpConnection],
-    # con2: Tuple[pd.DataFrame, TcpConnection]
-    # tcp_sender_df,
-    # tcp_receiver_df
 ):
     """
+    Convert dataframe from X_HOST1 | X_HOST2 to X_SENDER | X_RECEIVER
+
     each packet has a destination marker
     Assume clocks are fine here !
     """
616610
log.log(mp.TRACE, "renaming inplace")
617611

618612
tdf.rename(columns=rename_func, inplace=True)
613+
debug_dataframe(tdf, "temporary dataframe")
619614
total = pd.concat([total, tdf], ignore_index=True, sort=False, )
615+
print("total df size = %d" % len(total))
620616

621617
# subdf[ _first("tcpdest") == ConnectionRole.Client] .rename(columns=_rename_cols, inplace=True)
622618
# print(subdf.columns)
@@ -645,6 +641,8 @@ def merge_tcp_dataframes_known_streams(
645641
2/ identify which dataframe is server's/client's
646642
2/
647643
644+
Adds a merge_status column
645+
648646
Args:
649647
con1: Tuple dataframe/tcpstream id
650648
con2: same
@@ -707,7 +705,7 @@ def merge_tcp_dataframes_known_streams(
707705
log.info("Resulting merged tcp dataframe of size {} ({} mapped packets vs {} unmapped)"
708706
"with input dataframes of size {} and {}.".format(
709707
len(total),
710-
len(total[total._merge == "both"]), len(total[total._merge != "both"]),
708+
len(total[total.merge_status == "both"]), len(total[total.merge_status != "both"]),
711709
len(h1_df), len(h2_df)
712710
))
713711

@@ -895,21 +893,23 @@ def map_tcp_packets(
895893
mode="hash"
896894
# con1: TcpConnection, con2: TcpConnection
897895
) -> pd.DataFrame:
896+
'''
897+
'''
898898
if mode == "hash":
899899
res = map_tcp_packets_via_hash(sender_df, receiver_df, explain)
900900
else:
901901
res = map_tcp_packets_score_based(sender_df, receiver_df, explain)
902902

903-
log.info("Merged packets. Resulting dataframe of size {} generated from {} and {}".format(
903+
log.info("Merged dataframe of size {} generated from {} and {} sources.".format(
904904
len(res), len(sender_df), len(receiver_df)
905905
))
906906
log.info("{} unmapped packets. ".format(
907-
len(res[res._merge == "left_only"]) + len(res[res._merge == "right_only"])
907+
len(res[res.merge_status == "left_only"]) + len(res[res.merge_status == "right_only"])
908908
))
909909

910910
def _show_unmapped_pkts():
911-
print(res[res._merge == "left_only"])
912-
print(res[res._merge == "right_only"])
911+
print(res[res.merge_status == "left_only"])
912+
print(res[res.merge_status == "right_only"])
913913

914914
_show_unmapped_pkts()
915915

@@ -942,9 +942,9 @@ def map_tcp_packets_via_hash(
         # suffixes=(SENDER_SUFFIX, RECEIVER_SUFFIX), # columns suffixes (sender/receiver)
         suffixes=(HOST1_SUFFIX, HOST2_SUFFIX), # columns suffixes (sender/receiver)
         how="outer", # we want to keep packets from both
-        # we want to know how many packets were not mapped correctly, adds the _merge column
+        # we want to know how many packets were not mapped correctly, adds the merge column
         # can take values "left_only"/ "right_only" or both
-        indicator=True,
+        indicator="merge_status",
         # TODO reestablish
         validate="one_to_one", # can slow process
     )
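
This hunk is the heart of the commit: indicator=True makes pd.merge add a column literally named _merge, while passing a string names the column; either way it is a Categorical, which is what the commit message ("it's a categorical") refers to. A minimal illustration:

import pandas as pd

a = pd.DataFrame({"hash": ["x", "y"]})
b = pd.DataFrame({"hash": ["y", "z"]})
res = a.merge(b, on="hash", how="outer", indicator="merge_status")
print(res["merge_status"].dtype)                 # category
print(list(res["merge_status"].cat.categories))  # ['left_only', 'right_only', 'both']
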
@@ -954,7 +954,8 @@
     ## print(receiver_df[['hash', 'packetid']].head(20))
 
     log.debug("Just after hash")
-    log.debug(res.columns)
+    debug_dataframe(res, "Just after hash")
+    # log.debug(res.columns)
     # print(res[TCP_DEBUG_FIELDS].head(20))
     return res
 
@@ -1153,7 +1154,7 @@ def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
     df_all["reinj_delta"] = np.nan
 
     # rename to df_both ?
-    df = df_all[df_all._merge == "both"]
+    df = df_all[df_all.merge_status == "both"]
 
     # print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
     #     _sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
@@ -1192,7 +1193,7 @@ def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
 
         # if it was correctly mapped
         # TODO why reinjection._merge doesn't exist ?
-        if reinjection._1 != "both":
+        if reinjection.merge_status != "both":
             # TODO count missed classifications ?
             log.debug("reinjection %d could not be mapped, giving up..." % (reinjection.packetid))
             continue
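
This hunk also answers the TODO above it: itertuples() builds namedtuples with rename=True, and a field name starting with an underscore is invalid for a namedtuple, so the old _merge column only ever surfaced under its positional name _1. Renaming the column to merge_status makes normal attribute access work:

import pandas as pd

df = pd.DataFrame({"_merge": ["both"], "packetid": [1]})
row = next(df.itertuples())
print(row._fields)  # ('Index', '_1', 'packetid') -- "_merge" was renamed away
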
@@ -1203,7 +1204,7 @@ def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
 
         original_packet = df_all.loc[df_all.packetid == initial_packetid].iloc[0]
 
-        if original_packet._merge != "both":
+        if original_packet.merge_status != "both":
             # TODO count missed classifications ?
             logging.debug("Original packet %d could not be mapped, giving up..." % (original_packet.packetid))
             continue

mptcpanalyzer/parser.py (+0 -1)
@@ -493,7 +493,6 @@ def _pcap(name, pcapAction="store", filterAction="store"):
     # parser.add_argument("--clock-offset" + name, action="store", type=int,
     #     help='Offset compared to epoch (in nanoseconds)')
 
-    # or merge ?
     if bitfield & (PreprocessingActions.FilterStream | PreprocessingActions.Merge):
         # difficult to change the varname here => change it everywhere
         mptcp: bool = (bitfield & PreprocessingActions.FilterMpTcpStream) != 0

mptcpanalyzer/pdutils.py (+5 -2)
@@ -24,7 +24,7 @@ def debug_dataframe(
     msg,
     # intro="Debugging dataframe",
     nrows=5,
-    use_cols=None,
+    usecols=None,
     **kwargs
 ):
     '''
@@ -45,7 +45,10 @@ def debug_dataframe(
     # print(df.dtypes)
     pp.pformat(df.dtypes)
     with pd.option_context('float_format', '{:f}'.format):
-        print(df.head(nrows, ))
+        sdf = df
+        if usecols:
+            sdf = df[usecols]
+        print(sdf.head(nrows, ))
 
 # https://stackoverflow.com/questions/52686559/read-csv-get-the-line-where-exception-occured
 def read_csv_debug(fields, fd, *args, first_try=True, **kwargs):
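
The widened signature (use_cols renamed to usecols, now actually applied) matches the call site this commit adds in data.py:

debug_dataframe(res, "checking merge", usecols=["merge_status"])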

mptcpanalyzer/plots/owd.py (+4 -5)
@@ -70,12 +70,12 @@ def default_parser(self, *args, **kwargs):
             title="Subparsers", help='sub-command help',)
         subparsers.required = True # type: ignore
 
-        actions = {
+        orig_actions = {
             "tcp": PreprocessingActions.MergeTcp | PreprocessingActions.FilterDestination,
             "mptcp": PreprocessingActions.MergeMpTcp | PreprocessingActions.FilterDestination,
         }
 
-        for protocol, actions in actions.items():
+        for protocol, actions in orig_actions.items():
 
             expected_pcaps = {
                 "pcap": actions
@@ -92,7 +92,6 @@ def default_parser(self, *args, **kwargs):
         parser.epilog = '''
         plot owd tcp examples/client_2_filtered.pcapng 0 examples/server_2_filtered.pcapng 0 --display
         '''
-
         return parser
 
     # here we recompute the OWDs
@@ -118,7 +117,7 @@ def plot(self, pcap, protocol, **kwargs):
         print("columns", pcap)
         print("columns", res.columns)
         print("info", res.info())
-        print(res.loc[res._merge == "both", debug_fields ])
+        print(res.loc[res.merge_status == "both", debug_fields ])
 
         df = res
 