Commit ed34871

renamed _merge column to merge_status

it's a categorical
1 parent: 65f2876

File tree

6 files changed: +55 -49

mptcpanalyzer/__init__.py
mptcpanalyzer/cli.py
mptcpanalyzer/data.py
mptcpanalyzer/parser.py
mptcpanalyzer/pdutils.py
mptcpanalyzer/plots/owd.py

mptcpanalyzer/__init__.py (+10 -5)
@@ -106,6 +106,11 @@ class MpTcpStreamId(int):
     pass
 
 # Keep it as Enum so that it gets serialized as a string in the CSV
+# @register_extension_dtype
+# must be implemented
+# * type
+# * name
+# * construct_from_string
 class ConnectionRoles(IntEnum):
     """
     Used to filter datasets and keep packets flowing in only one direction !
@@ -114,12 +119,12 @@ class ConnectionRoles(IntEnum):
     Client = auto()
     Server = auto()
 
-    def __str__(self):
-        # Note that defining __str__ is required to get ArgumentParser's help output to include
-        # the human readable (values) of Color
-        return self.name
+    # def __str__(self):
+    #     # Note that defining __str__ is required to get ArgumentParser's help output to include
+    #     # the human readable (values) of Color
+    #     return self.name
 
-    @staticmethod
+    # @staticmethod
     def from_string(s):
         try:
             return ConnectionRoles[s]
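
The new comment block points at pandas' extension-dtype API. For orientation, a minimal sketch of the three members it lists (hypothetical names, not part of this commit; a complete dtype would also need construct_array_type()):

from pandas.api.extensions import ExtensionDtype, register_extension_dtype

@register_extension_dtype
class RoleDtype(ExtensionDtype):
    name = "role"  # * name: the string pandas resolves, e.g. astype("role")

    @property
    def type(self):  # * type: the scalar type stored in the array
        return ConnectionRoles

    @classmethod
    def construct_from_string(cls, string):  # * construct_from_string
        if string == cls.name:
            return cls()
        raise TypeError(f"Cannot construct a 'RoleDtype' from '{string}'")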

mptcpanalyzer/cli.py (+0 -1)
@@ -550,7 +550,6 @@ def do_tcp_summary(self, args, unknown):
     # TODO use filter_dest instead
     summary_parser.add_argument(
         'destination',
-        # mp.DestinationChoice,
         action="store", choices=mp.DestinationChoice, type=lambda x: mp.ConnectionRoles[x],
         help='Filter flows according to their direction'
         '(towards the client or the server)'
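
The add_argument call above pairs an IntEnum with argparse, which is also why the __str__ note was worth keeping in __init__.py. A standalone sketch of that pattern (assumed, simplified names):

import argparse
from enum import IntEnum, auto

class ConnectionRoles(IntEnum):
    Client = auto()
    Server = auto()

    def __str__(self):
        return self.name  # keeps --help output human readable

parser = argparse.ArgumentParser()
parser.add_argument("destination",
                    choices=list(ConnectionRoles),      # shown as {Client,Server}
                    type=lambda x: ConnectionRoles[x],  # "Client" -> ConnectionRoles.Client
                    help="Filter flows according to their direction")
print(parser.parse_args(["Client"]).destination)        # ConnectionRoles.Client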

mptcpanalyzer/data.py (+36 -35)
@@ -32,15 +32,12 @@
 MPTCP_DEBUG_FIELDS = TCP_DEBUG_FIELDS + ['mptcpdest']
 
 
-def _convert_role(x):
-    """
-    Workaround https://github.com/pandas-dev/pandas/pull/20826
-    """
-    log.log(mp.TRACE, "converting [%r] into role" % x)
-
-    # else throw
-    return ConnectionRoles.from_string(x)
-    # return ConnectionRoles[x] if x else np.nan
+# def _convert_role(x):
+#     """
+#     Workaround https://github.com/pandas-dev/pandas/pull/20826
+#     """
+#     log.log(mp.TRACE, "converting [%r] into role" % x)
+#     return ConnectionRoles.from_string(x)
 
 def ignore(f1, f2):
     return 0
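
For context, a per-value converter like the retired _convert_role would have been handed to pandas at load time roughly like this (hypothetical wiring and filename, reconstructed for illustration only):

import pandas as pd

df = pd.read_csv(
    "packets.csv",  # hypothetical file
    converters={"tcpdest": ConnectionRoles.from_string},
)
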
@@ -97,14 +94,14 @@ def getrealpath(input_file):
 On top of Tshark fields, we also describe fields generated by mptcpanalyzer
 """
 per_pcap_artificial_fields = {
-    # TODO use dtype_role as type
     "mptcpdest": Field("mptcpdest", dtype_role, "MPTCP destination", False, None),
     "tcpdest": Field("tcpdest", dtype_role, "TCP destination", False, None),
+    # TODO use int? as type
    "hash": Field("hash", str, "Hash of fields", False, None),
 
     # TODO rename ?
     # TODO should be a CategoryDataType !
-    "merge": Field("_merge", None, "How many packets were merged", False, None)
+    # "merge": Field("_merge", None, "How many packets were merged", False, None)
 }
 
 # merged_per_pcap_artificial_fields = {
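
The "should be a CategoryDataType" TODO matches the commit message ("it's a categorical"): the indicator column produced by pd.merge is categorical with a fixed value set. An explicit dtype for it would be, as a sketch (not in this commit):

import pandas as pd

merge_status_dtype = pd.CategoricalDtype(categories=["left_only", "right_only", "both"])
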
@@ -149,8 +146,8 @@ def load_merged_streams_into_pandas(
     """
     Arguments:
         protocol: mptcp or tcp
-
         mapping_mode: Only HASH works for now
+        clock_offset: untested
 
     Returns
         a dataframe with columns... owd ?
@@ -247,12 +244,10 @@ def _gen_dtypes(fields) -> Dict[str, Any]:
     dtypes.update({_name(f.fullname): f.type for f in per_pcap_artificial_fields.values()})
 
     # these are overrides from the generated dtypes
-    dtypes.update({
-        # during the merge, we join even unmapped packets so some entries
-        # may be empty => float64
-        _first("packetid"): tshark_config.fields["packetid"].type,
-        _second("packetid"): tshark_config.fields["packetid"].type,
-    })
+    # dtypes.update({
+    #     _first("packetid"): tshark_config.fields["packetid"].type,
+    #     _second("packetid"): tshark_config.fields["packetid"].type,
+    # })
 
     return dtypes
 
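The override being retired here existed because an outer join leaves holes, and plain int64 cannot represent NaN, so integer columns silently become float64. A sketch of the effect and of pandas' nullable alternative (assumed column names):

import pandas as pd

left = pd.DataFrame({"hash": ["a", "b"], "packetid": pd.array([1, 2], dtype="Int64")})
right = pd.DataFrame({"hash": ["b", "c"], "packetid": pd.array([3, 4], dtype="Int64")})
merged = left.merge(right, on="hash", how="outer", suffixes=("_h1", "_h2"))
print(merged.dtypes)  # both packetid columns stay Int64; the holes become <NA>
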
@@ -321,6 +316,9 @@ def _gen_converters() -> Dict[str, Callable]:
     # don't do it here else we might repeat it
     # data["abstime"] += clock_offset
 
+    debug_dataframe(res, "checking merge", usecols=["merge_status"])
+    print("%d nan values" % len(res[res.merge_status == np.nan]))
+
     # log.debug("Column names: %s", res.columns)
     # log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
     # print(res["mptcpdest"].dtype)
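
One caveat in the debug line added above: NaN never compares equal to anything, so res.merge_status == np.nan selects no rows and the print always reports 0. If the intent is to count missing entries, the conventional form is:

print("%d nan values" % res["merge_status"].isna().sum())
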
@@ -545,14 +543,10 @@ def tcpdest_from_connections(df, con: TcpConnection) -> pd.DataFrame:
 
 def convert_to_sender_receiver(
     df
-    # def tcp_compute_owd(
-    # already merged df
-    # con1: Tuple[pd.DataFrame, TcpConnection],
-    # con2: Tuple[pd.DataFrame, TcpConnection]
-    # tcp_sender_df,
-    # tcp_receiver_df
 ):
     """
+    Convert dataframe from X_HOST1 | X_HOST2 to X_SENDER | X_RECEIVER
+
     each packet has a destination marker
     Assume clocks are fine here !
     """
616610
log.log(mp.TRACE, "renaming inplace")
617611

618612
tdf.rename(columns=rename_func, inplace=True)
613+
debug_dataframe(tdf, "temporary dataframe")
619614
total = pd.concat([total, tdf], ignore_index=True, sort=False, )
615+
print("total df size = %d" % len(total))
620616

621617
# subdf[ _first("tcpdest") == ConnectionRole.Client] .rename(columns=_rename_cols, inplace=True)
622618
# print(subdf.columns)
@@ -645,6 +641,8 @@ def merge_tcp_dataframes_known_streams(
645641
2/ identify which dataframe is server's/client's
646642
2/
647643
644+
Adds a merge_status column
645+
648646
Args:
649647
con1: Tuple dataframe/tcpstream id
650648
con2: same
@@ -707,7 +705,7 @@ def merge_tcp_dataframes_known_streams(
707705
log.info("Resulting merged tcp dataframe of size {} ({} mapped packets vs {} unmapped)"
708706
"with input dataframes of size {} and {}.".format(
709707
len(total),
710-
len(total[total._merge == "both"]), len(total[total._merge != "both"]),
708+
len(total[total.merge_status == "both"]), len(total[total.merge_status != "both"]),
711709
len(h1_df), len(h2_df)
712710
))
713711

@@ -895,21 +893,23 @@ def map_tcp_packets(
895893
mode="hash"
896894
# con1: TcpConnection, con2: TcpConnection
897895
) -> pd.DataFrame:
896+
'''
897+
'''
898898
if mode == "hash":
899899
res = map_tcp_packets_via_hash(sender_df, receiver_df, explain)
900900
else:
901901
res = map_tcp_packets_score_based(sender_df, receiver_df, explain)
902902

903-
log.info("Merged packets. Resulting dataframe of size {} generated from {} and {}".format(
903+
log.info("Merged dataframe of size {} generated from {} and {} sources.".format(
904904
len(res), len(sender_df), len(receiver_df)
905905
))
906906
log.info("{} unmapped packets. ".format(
907-
len(res[res._merge == "left_only"]) + len(res[res._merge == "right_only"])
907+
len(res[res.merge_status == "left_only"]) + len(res[res.merge_status == "right_only"])
908908
))
909909

910910
def _show_unmapped_pkts():
911-
print(res[res._merge == "left_only"])
912-
print(res[res._merge == "right_only"])
911+
print(res[res.merge_status == "left_only"])
912+
print(res[res.merge_status == "right_only"])
913913

914914
_show_unmapped_pkts()
915915

@@ -942,9 +942,9 @@ def map_tcp_packets_via_hash(
         # suffixes=(SENDER_SUFFIX, RECEIVER_SUFFIX), # columns suffixes (sender/receiver)
         suffixes=(HOST1_SUFFIX, HOST2_SUFFIX), # columns suffixes (sender/receiver)
         how="outer", # we want to keep packets from both
-        # we want to know how many packets were not mapped correctly, adds the _merge column
+        # we want to know how many packets were not mapped correctly, adds the merge column
         # can take values "left_only"/ "right_only" or both
-        indicator=True,
+        indicator="merge_status",
         # TODO reestablish
         validate="one_to_one", # can slow process
     )
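
This hunk is the heart of the commit: indicator=True makes pd.merge add a column literally named _merge, while passing a string names the column; either way it is a Categorical, which is what the commit message ("it's a categorical") refers to. A minimal illustration:

import pandas as pd

a = pd.DataFrame({"hash": ["x", "y"]})
b = pd.DataFrame({"hash": ["y", "z"]})
res = a.merge(b, on="hash", how="outer", indicator="merge_status")
print(res["merge_status"].dtype)                 # category
print(list(res["merge_status"].cat.categories))  # ['left_only', 'right_only', 'both']
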
@@ -954,7 +954,8 @@
     ## print(receiver_df[['hash', 'packetid']].head(20))
 
     log.debug("Just after hash")
-    log.debug(res.columns)
+    debug_dataframe(res, "Just after hash")
+    # log.debug(res.columns)
     # print(res[TCP_DEBUG_FIELDS].head(20))
     return res
 
@@ -1153,7 +1154,7 @@ def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
     df_all["reinj_delta"] = np.nan
 
     # rename to df_both ?
-    df = df_all[df_all._merge == "both"]
+    df = df_all[df_all.merge_status == "both"]
 
     # print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
     #     _sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
@@ -1192,7 +1193,7 @@ def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
 
         # if it was correctly mapped
         # TODO why reinjection._merge doesn't exist ?
-        if reinjection._1 != "both":
+        if reinjection.merge_status != "both":
             # TODO count missed classifications ?
             log.debug("reinjection %d could not be mapped, giving up..." % (reinjection.packetid))
             continue
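
This hunk also answers the TODO above it: itertuples() builds namedtuples with rename=True, and a field name starting with an underscore is invalid for a namedtuple, so the old _merge column only ever surfaced under its positional name _1. Renaming the column to merge_status makes normal attribute access work:

import pandas as pd

df = pd.DataFrame({"_merge": ["both"], "packetid": [1]})
row = next(df.itertuples())
print(row._fields)  # ('Index', '_1', 'packetid') -- "_merge" was renamed away
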
@@ -1203,7 +1204,7 @@ def classify_reinjections(df_all: pd.DataFrame) -> pd.DataFrame:
 
         original_packet = df_all.loc[df_all.packetid == initial_packetid].iloc[0]
 
-        if original_packet._merge != "both":
+        if original_packet.merge_status != "both":
             # TODO count missed classifications ?
             logging.debug("Original packet %d could not be mapped, giving up..." % (original_packet.packetid))
             continue

mptcpanalyzer/parser.py (+0 -1)
@@ -493,7 +493,6 @@ def _pcap(name, pcapAction="store", filterAction="store"):
     # parser.add_argument("--clock-offset" + name, action="store", type=int,
     #     help='Offset compared to epoch (in nanoseconds)')
 
-    # or merge ?
     if bitfield & (PreprocessingActions.FilterStream | PreprocessingActions.Merge):
         # difficult to change the varname here => change it everywhere
         mptcp: bool = (bitfield & PreprocessingActions.FilterMpTcpStream) != 0

mptcpanalyzer/pdutils.py (+5 -2)
@@ -24,7 +24,7 @@ def debug_dataframe(
     msg,
     # intro="Debugging dataframe",
     nrows=5,
-    use_cols=None,
+    usecols=None,
     **kwargs
 ):
     '''
@@ -45,7 +45,10 @@ def debug_dataframe(
     # print(df.dtypes)
     pp.pformat(df.dtypes)
     with pd.option_context('float_format', '{:f}'.format):
-        print(df.head(nrows, ))
+        sdf = df
+        if usecols:
+            sdf = df[usecols]
+        print(sdf.head(nrows, ))
 
 # https://stackoverflow.com/questions/52686559/read-csv-get-the-line-where-exception-occured
 def read_csv_debug(fields, fd, *args, first_try=True, **kwargs):
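
The widened signature (use_cols renamed to usecols, now actually applied) matches the call site this commit adds in data.py:

debug_dataframe(res, "checking merge", usecols=["merge_status"])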

mptcpanalyzer/plots/owd.py (+4 -5)
@@ -70,12 +70,12 @@ def default_parser(self, *args, **kwargs):
             title="Subparsers", help='sub-command help',)
         subparsers.required = True # type: ignore
 
-        actions = {
+        orig_actions = {
             "tcp": PreprocessingActions.MergeTcp | PreprocessingActions.FilterDestination,
             "mptcp": PreprocessingActions.MergeMpTcp | PreprocessingActions.FilterDestination,
         }
 
-        for protocol, actions in actions.items():
+        for protocol, actions in orig_actions.items():
 
             expected_pcaps = {
                 "pcap": actions
@@ -92,7 +92,6 @@ def default_parser(self, *args, **kwargs):
         parser.epilog = '''
         plot owd tcp examples/client_2_filtered.pcapng 0 examples/server_2_filtered.pcapng 0 --display
         '''
-
         return parser
 
     # here we recompute the OWDs
@@ -118,7 +117,7 @@ def plot(self, pcap, protocol, **kwargs):
         print("columns", pcap)
         print("columns", res.columns)
         print("info", res.info())
-        print(res.loc[res._merge == "both", debug_fields ])
+        print(res.loc[res.merge_status == "both", debug_fields ])
 
         df = res
 