From 2b595d41fed7106af0c6c1693000e7d24c8a7a24 Mon Sep 17 00:00:00 2001 From: Stephan Kleber Date: Mon, 17 Oct 2022 00:30:49 +0200 Subject: [PATCH] 3.0: Field type clustering DCDS 2022 --- .idea/dictionaries/stephan.xml | 1 + .idea/inspectionProfiles/Project_Default.xml | 16 +- .idea/misc.xml | 2 +- .idea/modules.xml | 4 +- .idea/vcs.xml | 2 + README.md | 4 +- eval-nemeftr-clustering-iterateeps.sh | 78 + eval-nemeftr-clustering.sh | 89 ++ eval-nemeftr-truefield-iterateeps.sh | 49 + eval-nemeftr-truefield.sh | 64 + eval-nemeftr-visualize.sh | 67 + eval-nemesys-refine.sh | 63 + eval-nemetyl-messagetype.sh | 45 +- eval-netzob-fms.sh | 8 +- eval-netzob-messagetype.sh | 23 +- input/Sources.md | 14 +- .../dns_ictf2010_maxdiff-100.pcap | Bin .../dns_ictf2010_maxdiff-1000.pcap | Bin input/splitcap.sh | 11 + requirements.txt | 41 +- src/Contents.md | 73 + src/check_parse-pcap.py | 12 +- src/check_pcap-info.py | 2 + src/nemeftr-prod_cluster-segments.py | 218 +++ src/nemeftr_cluster-segments.py | 326 ++++ src/nemeftr_cluster-segments_iterate-eps.py | 267 ++++ src/nemeftr_cluster-true-fields.py | 296 ++++ ...nemeftr_cluster-true-fields_iterate-eps.py | 199 +++ src/nemere/alignment/clusterMerging.py | 1 + src/nemere/inference/analyzers.py | 606 +++++--- src/nemere/inference/fieldTypes.py | 188 +++ src/nemere/inference/formatRefinement.py | 32 +- src/nemere/inference/segmentHandler.py | 160 +- src/nemere/inference/segments.py | 13 +- src/nemere/inference/series.py | 83 ++ src/nemere/inference/templates.py | 1306 ++++++++++++++++- src/nemere/inference/trackingBIDE.py | 297 ++++ src/nemere/utils/evaluationHelpers.py | 158 +- src/nemere/utils/loader.py | 28 +- src/nemere/utils/reportWriter.py | 60 +- src/nemere/validation/clusterInspector.py | 845 +++++++++++ src/nemere/validation/dissectorMatcher.py | 10 +- src/nemere/validation/messageParser.py | 225 ++- .../validation/netzobFormatMatchScore.py | 7 +- src/nemere/validation/protocols/ari.py | 47 + src/nemere/validation/protocols/autounlock.py | 90 ++ src/nemere/validation/protocols/awdl.py | 167 +++ src/nemere/validation/tsharkConnector.py | 230 +-- src/nemere/visualization/distancesPlotter.py | 64 +- src/nemere/visualization/multiPlotter.py | 45 +- src/nemere/visualization/plotter.py | 25 +- src/nemere/visualization/simplePrint.py | 223 ++- src/nemere/visualization/singlePlotter.py | 69 +- src/nemesys.py | 2 +- src/nemesys_fms.py | 23 +- src/nemesys_vd.py | 195 +++ src/nemetyl.py | 14 +- src/nemetyl_align-segments.py | 22 +- src/netzob_fms.py | 11 +- src/netzob_messagetypes.py | 22 + src/prep_filter-maxdiff-trace.py | 270 ++++ src/transform_cluster-statistics.py | 88 ++ src/visualize_fieldtype_separation.py | 106 ++ tests/messageparsing.py | 114 +- tests/netzob-support.py | 4 +- 65 files changed, 7208 insertions(+), 616 deletions(-) create mode 100755 eval-nemeftr-clustering-iterateeps.sh create mode 100755 eval-nemeftr-clustering.sh create mode 100755 eval-nemeftr-truefield-iterateeps.sh create mode 100755 eval-nemeftr-truefield.sh create mode 100755 eval-nemeftr-visualize.sh create mode 100755 eval-nemesys-refine.sh rename input/{maxdiff-fromOrig => hide}/dns_ictf2010_maxdiff-100.pcap (100%) rename input/{maxdiff-fromOrig => hide}/dns_ictf2010_maxdiff-1000.pcap (100%) create mode 100755 input/splitcap.sh create mode 100644 src/nemeftr-prod_cluster-segments.py create mode 100644 src/nemeftr_cluster-segments.py create mode 100644 src/nemeftr_cluster-segments_iterate-eps.py create mode 100644 src/nemeftr_cluster-true-fields.py create mode 100644 
src/nemeftr_cluster-true-fields_iterate-eps.py create mode 100644 src/nemere/inference/fieldTypes.py create mode 100644 src/nemere/inference/series.py create mode 100644 src/nemere/inference/trackingBIDE.py create mode 100644 src/nemere/validation/clusterInspector.py create mode 100644 src/nemere/validation/protocols/ari.py create mode 100644 src/nemere/validation/protocols/autounlock.py create mode 100644 src/nemere/validation/protocols/awdl.py create mode 100644 src/nemesys_vd.py create mode 100644 src/prep_filter-maxdiff-trace.py create mode 100644 src/transform_cluster-statistics.py create mode 100644 src/visualize_fieldtype_separation.py diff --git a/.idea/dictionaries/stephan.xml b/.idea/dictionaries/stephan.xml index a7de52ac..293b4ad0 100644 --- a/.idea/dictionaries/stephan.xml +++ b/.idea/dictionaries/stephan.xml @@ -3,6 +3,7 @@ basesegments nemere + segmenter tshark diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml index 72b1561d..8bed9fb1 100644 --- a/.idea/inspectionProfiles/Project_Default.xml +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -38,20 +38,8 @@ diff --git a/.idea/misc.xml b/.idea/misc.xml index 6a948b1a..06ca8a08 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/.idea/modules.xml b/.idea/modules.xml index c149c880..954920fa 100644 --- a/.idea/modules.xml +++ b/.idea/modules.xml @@ -2,8 +2,10 @@ + + - + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml index c5583a9a..5e4a3eec 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -2,6 +2,8 @@ + + \ No newline at end of file diff --git a/README.md b/README.md index 001153df..9b6d31b8 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ All scripts provide these command line options: ### prep_* PCAP preparation scripts: -* `prep_deduplicate-trace.py pcapfilename` +* `prep_deduplicate-trace.py pcapfilename` Detect identical payloads and de-duplicate traces, ignoring encapsulation metadata. @@ -121,7 +121,7 @@ Basic checks whether PCAPs are parseable: The tshark-dissected fields that are contained in the PCAPs need to be known to the message parser. Therefore, validation.messageParser.ParsingConstants needs to be made aware of any field occuring in the traces. -* `check_parse-pcap.py pcapfilename` +* `check_parse-pcap.py pcapfilename` Parse a PCAP file and print its dissection for testing. This helps verifying if there are any unknown fields that need to be added to validation.messageParser.ParsingConstants. Before starting to validate/use FMS with a new PCAP, first run this check and solve any errors. diff --git a/eval-nemeftr-clustering-iterateeps.sh b/eval-nemeftr-clustering-iterateeps.sh new file mode 100755 index 00000000..b09ae098 --- /dev/null +++ b/eval-nemeftr-clustering-iterateeps.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# +# NEMEFTR-full mode 1: +# Clustering of segments on similarity without ground truth. 
+ +input="input/maxdiff-fromOrig/*-100*.pcap" + + +segmenters="nemesys" + +# Nemesys options +refines="none original nemetyl" + +L2PROTOS="input/awdl-* input/wlan-beacons-*" +LEPROTOS="input/awdl-* input/wlan-beacons-* input/smb* input/*/smb*" + +prefix="cft" + +cftnpad="245" +for f in reports/${prefix}-* ; do + if [ -e "$f" ] ; then + cftnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) + cftnpad=$(printf "%03d" ${cftnext}) + fi + break +done +currcomm=$(git log -1 --format="%h") +report=reports/${prefix}-${cftnpad}-clustering-${currcomm} +mkdir ${report} + + +for seg in ${segmenters} ; do + for ref in ${refines} ; do + if [[ ${seg} == "zeros" ]] && [[ ! ${ref} =~ ^(none|PCA1|PCAmocoSF)$ ]] ; then + echo ${ref} not suited for zeros segmenter. Ignoring. + continue + fi + + pids=() + for fn in ${input} ; do + optargs="-r" + for proto in ${L2PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 2" + fi + done + for proto in ${LEPROTOS} ; do + if [[ "${fn}" == $proto ]] ; then + # append + optargs="${optargs} -e" # -e: little endian + fi + done + # fixed sigma 1.2 (nemeftr-paper: "constant σ of 1.2") + python src/nemeftr_cluster-segments_iterate-eps.py -t ${seg} -s 1.2 -p ${optargs} -f ${ref} ${fn} >> "${report}/$(basename -s .pcap ${fn}).log" & + pids+=( $! ) + done + + for pid in "${pids[@]}"; do + printf 'Waiting for %d...' "$pid" + wait $pid + echo 'done.' + done + + mkdir ${report}-${seg}-${ref} +# mv reports/*.pdf ${report}-${seg}-${ref}/ + for fn in ${input}; + do + bn=$(basename -s .pcap ${fn}) + mv reports/${bn}* ${report}-${seg}-${ref}/ + done + done +done + +python src/transform_cluster-statistics.py +mv reports/*.csv ${report}/ + +spd-say "Bin fertig!" diff --git a/eval-nemeftr-clustering.sh b/eval-nemeftr-clustering.sh new file mode 100755 index 00000000..bdd0628e --- /dev/null +++ b/eval-nemeftr-clustering.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# +# NEMEFTR-full mode 1: +# Clustering of segments on similarity without ground truth. + +input="input/maxdiff-fromOrig/*-100*.pcap" +#input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" + + +segmenters="nemesys" + +# Nemesys options +refines="original nemetyl" + + +L2PROTOS="input/awdl-* input/au-* input/wlan-beacons-*" +LEPROTOS="input/awdl-* input/au-* input/smb* input/*/smb* input/wlan-beacons-*" + +prefix="cft" + +cftnpad="352" +for f in reports/${prefix}-* ; do + if [ -e "$f" ] ; then + cftnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) + cftnpad=$(printf "%03d" ${cftnext}) + fi + break +done +currcomm=$(git log -1 --format="%h") +report=reports/${prefix}-${cftnpad}-clustering-${currcomm} +mkdir ${report} + + +for seg in ${segmenters} ; do + for ref in ${refines} ; do + if [[ ${seg} == "zeros" ]] && [[ ! ${ref} =~ ^(none|PCA1|PCAmocoSF)$ ]] ; then + echo ${ref} not suited for zeros segmenter. Ignoring. + continue + fi + + pids=() + for fn in ${input} ; do + optargs="-r" + for proto in ${L2PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 2" + fi + done + for proto in ${LEPROTOS} ; do + if [[ "${fn}" == $proto ]] ; then + # append + optargs="${optargs} -e" # -e: little endian + fi + done + bn=$(basename -- ${fn}) + strippedname="${bn%.*}" + + # fixed sigma 1.2 (nemeftr-paper: "constant σ of 1.2") ### add -p for plots + python src/nemeftr_cluster-segments.py -pt ${seg} -s 1.2 ${optargs} -f ${ref} ${fn} # >> "${report}/${strippedname}.log" & + pids+=( $! 
) + # python src/nemeftr_cluster-segments.py -t ${seg} -s 1.2 -p -e ${optargs} -f ${ref} ${fn} + + # dynamic sigma: + # python src/nemeftr_cluster-segments.py -p -f ${ref} ${fn} + done + + for pid in "${pids[@]}"; do + printf 'Waiting for %d...' "$pid" + wait $pid + echo 'done.' + done + + mkdir ${report}-${seg}-${ref} + for fn in ${input}; + do + bn=$(basename -- ${fn}) + strippedname="${bn%.*}" + mv reports/${strippedname}* ${report}-${seg}-${ref}/ + done + done +done + +python src/transform_cluster-statistics.py +mv reports/*.csv ${report}/ + +spd-say "Bin fertig!" + + diff --git a/eval-nemeftr-truefield-iterateeps.sh b/eval-nemeftr-truefield-iterateeps.sh new file mode 100755 index 00000000..d5a64b7c --- /dev/null +++ b/eval-nemeftr-truefield-iterateeps.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# +# NEMEFTR: Optimal-segmentation baseline + +input="input/maxdiff-fromOrig/*-100*.pcap" + + +L2PROTOS="input/awdl-* input/au-* input/wlan-beacons-*" + +prefix="tft" + +numpad="200" +for f in reports/${prefix}-* ; do + if [ -e "$f" ] ; then + numnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) + numpad=$(printf "%03d" ${numnext}) + fi + break +done +currcomm=$(git log -1 --format="%h") +report=reports/${prefix}-${numpad}-clustering-${currcomm} +mkdir ${report} + + +for fn in ${input} ; do + # relative to IP layer + optargs="-r" + for proto in ${L2PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 2" + fi + done + + python src/nemeftr_cluster-true-fields_iterate-eps.py ${optargs} ${fn} +done + + + + +mv reports/*.csv ${report}/ +mv reports/*.pdf ${report}/ + + + + + + +spd-say "Bin fertig!" diff --git a/eval-nemeftr-truefield.sh b/eval-nemeftr-truefield.sh new file mode 100755 index 00000000..09e5d29e --- /dev/null +++ b/eval-nemeftr-truefield.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# +# NEMEFTR: Optimal-segmentation baseline + +#input=input/*-100*.pcap +#input=input/*-1000.pcap +input="input/maxdiff-fromOrig/*-100*.pcap" +#input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" + + +L1PROTOS="input/ari_*" +L2PROTOS="input/awdl-* input/au-* input/wlan-beacons-*" + +prefix="tft" + +numpad="350" +for f in reports/${prefix}-* ; do + if [ -e "$f" ] ; then + numnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) + numpad=$(printf "%03d" ${numnext}) + fi + break +done +currcomm=$(git log -1 --format="%h") +report=reports/${prefix}-${numpad}-clustering-${currcomm} +mkdir ${report} + + +for fn in ${input} ; do + # relative to IP layer + optargs="-r" + for proto in ${L2PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 2" + fi + done + for proto in ${L1PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 1" + fi + done + + # add -p to write plots ### add -p for plots + python src/nemeftr_cluster-true-fields.py ${optargs} ${fn} +done + + + +for fn in ${input} ; do + bn=$(basename -- ${fn}) + strippedname="${bn%.*}" + mv reports/${strippedname}/ ${report}/ +done +mv reports/*.csv ${report}/ +mv reports/*.pdf ${report}/ + + + + + + +spd-say "Bin fertig!" 
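
The eval-nemeftr-*.sh scripts above all pick per-trace command line options by glob-matching the trace path against the L2PROTOS/LEPROTOS lists before calling the respective Python analysis script. The following stand-alone sketch reproduces only that option selection; the glob lists are copied from the scripts above, and the echo is an illustrative stand-in for the actual python calls, not part of the patch.

#!/usr/bin/env bash
# Sketch: how the eval scripts derive per-trace options.
# "-r" (relative to IP) is the default; traces matching L2PROTOS get "-l 2"
# instead (dissect at the link layer), and traces matching LEPROTOS
# additionally get "-e" (presume little-endian fields).
L2PROTOS="input/awdl-* input/au-* input/wlan-beacons-*"
LEPROTOS="input/awdl-* input/au-* input/smb* input/*/smb* input/wlan-beacons-*"

for fn in input/maxdiff-fromOrig/*-100*.pcap ; do
    optargs="-r"
    for proto in ${L2PROTOS} ; do
        if [[ "${fn}" == ${proto} ]] ; then
            optargs="-l 2"              # replace the default
        fi
    done
    for proto in ${LEPROTOS} ; do
        if [[ "${fn}" == ${proto} ]] ; then
            optargs="${optargs} -e"     # append the endianness switch
        fi
    done
    echo "${fn}: ${optargs}"            # stand-in for the actual python call
done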
diff --git a/eval-nemeftr-visualize.sh b/eval-nemeftr-visualize.sh new file mode 100755 index 00000000..9444693e --- /dev/null +++ b/eval-nemeftr-visualize.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# +# NEMEFTR pre +# =========== +# +# Topology plot: template centers of true field types +# Histogram: type-separation per true field type +# +# Histograms used in nemeftr-full + +#input=input/*-100.pcap +#input=input/*-1000.pcap +#input="input/*-100.pcap input/*-1000.pcap" +#input=input/maxdiff-filtered/*-1000.pcap +input="input/maxdiff-fromOrig/*-100*.pcap" +#input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" + + +L2PROTOS="input/awdl-* input/au-*" +LEPROTOS="input/awdl-* input/au-* input/smb* input/*/smb*" + +prefix="ftrvisualize" + +numpad="200" +for f in reports/${prefix}-* ; do + if [ -e "$f" ] ; then + numnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) + numpad=$(printf "%03d" ${numnext}) + fi + break +done +currcomm=$(git log -1 --format="%h") +report=reports/${prefix}-${numpad}-clustering-${currcomm} +mkdir ${report} + + +for fn in ${input} ; do + # relative to IP layer + optargs="-r" # for varying epsilon add: -e + for proto in ${L2PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 2" + fi + done + for proto in ${LEPROTOS} ; do + if [[ "${fn}" == $proto ]] ; then + # append + optargs="${optargs} -e" # -e: little endian + fi + done + python src/visualize_fieldtype_separation.py ${optargs} ${fn} +done + + +# legacy ? +# for fn in ${input} ; do python src/visualize_fieldtype_separation.py ${optargs} ${fn} ; done + +mv reports/*.csv ${report}/ +mv reports/*.pdf ${report}/ + + + + + + +spd-say "Bin fertig!" diff --git a/eval-nemesys-refine.sh b/eval-nemesys-refine.sh new file mode 100755 index 00000000..b079926a --- /dev/null +++ b/eval-nemesys-refine.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# input=input/*-100.pcap +# input=input/*-1000.pcap +input="input/maxdiff-fromOrig/*-100.pcap" +#input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" + + +# full +#sigmas="0.6 0.7 0.8 0.9 1.0 1.1 1.2 2.4" +sigmas="0.9 1.2" + +L2PROTOS="input/awdl-* input/au-*" +LEPROTOS="input/awdl-* input/au-* input/smb* input/*/smb*" + +prefix="nemesys" + +cftnpad="200" +for f in reports/${prefix}-* ; do + if [ -e "$f" ] ; then + cftnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) + cftnpad=$(printf "%03d" ${cftnext}) + fi + break +done +currcomm=$(git log -1 --format="%h") +report=reports/${prefix}-${cftnpad}-original-${currcomm} +mkdir ${report} + +#for fn in ${input} ; do python src/nemesys_fms.py -r ${fn} ; done +for fn in ${input} ; do + optargs="-r" + for proto in ${L2PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 2" + fi + done + for proto in ${LEPROTOS} ; do + if [[ "${fn}" == $proto ]] ; then + # append + optargs="${optargs} -e" # -e: little endian + fi + done + + for sig in ${sigmas} ; do + python src/nemesys_fms.py ${optargs} -s ${sig} ${fn} + done +done + + +#mv reports/*.pdf ${report}/ +for fn in ${input}; +do + bn=$(basename -s .pcap ${fn}) + mv reports/${bn}* ${report}/ +done + +python reports/combine-nemesys-fms.py ${report}/ + + + +spd-say "Bin fertig!" 
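
For orientation, eval-nemesys-refine.sh above boils down to a loop that scores every trace with src/nemesys_fms.py once per Gauss-filter sigma and then collects the per-trace reports for aggregation by reports/combine-nemesys-fms.py. A minimal sketch of that core loop follows; the option handling is the same as in the sketch above and is reduced to "-r" here, the sigma list and paths are taken from the script, and the report directory name is illustrative.

#!/usr/bin/env bash
# Reduced sketch of the core loop of eval-nemesys-refine.sh.
report="reports/nemesys-000-original-sketch"   # illustrative target directory
mkdir -p "${report}"
sigmas="0.9 1.2"                               # reduced sigma set used above
for fn in input/maxdiff-fromOrig/*-100.pcap ; do
    for sig in ${sigmas} ; do
        python src/nemesys_fms.py -r -s ${sig} ${fn}
    done
    bn=$(basename -s .pcap ${fn})
    mv reports/${bn}* ${report}/               # collect per-trace outputs
done
python reports/combine-nemesys-fms.py ${report}/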
diff --git a/eval-nemetyl-messagetype.sh b/eval-nemetyl-messagetype.sh index 1c5a4a96..744246cb 100755 --- a/eval-nemetyl-messagetype.sh +++ b/eval-nemetyl-messagetype.sh @@ -5,8 +5,8 @@ #input="input/*-100.pcap input/*-1000.pcap" #input="input/ntp_SMIA-20111010_deduped-1000.pcap input/smb_SMIA20111010-one_deduped-1000.pcap" -input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" -#input="input/maxdiff-fromOrig/*-100*.pcap" +input="input/maxdiff-fromOrig/*-100*.pcap" +#input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" #sigmas="0.6 0.8 1.0 1.2" @@ -15,7 +15,6 @@ input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" sigmas="1.2" # full -#segmenters="nemesys" segmenters="nemesys" # full @@ -23,16 +22,16 @@ segmenters="nemesys" # Nemesys options # refines="original nemetyl" -# default -# refines="original nemetyl" refines="nemetyl" L2PROTOS="input/awdl-* input/au-* input/wlan-beacons-*" +L1PROTOS="" +LEPROTOS="input/awdl-* input/au-* input/smb* input/*/smb* input/wlan-beacons-*" prefix="nemetyl" -cftnpad="229" +cftnpad="405" for f in reports/${prefix}-* ; do if [ -e "$f" ] ; then cftnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1)) @@ -55,6 +54,12 @@ for fn in ${input} ; do optargs="-l 2" fi done + for proto in ${L1PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 1" + fi + done echo -e "\n\ntshark: ${fn}" # echo "$fn -t tshark ${optargs} --with-plots" # exit @@ -68,6 +73,12 @@ for fn in ${input} ; do optargs="-l 2" fi done + for proto in ${L1PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 1" + fi + done echo -e "\n\n4bytesfixed: ${fn}" python src/nemetyl_align-segments.py $fn -t 4bytesfixed ${optargs} --with-plots done @@ -87,9 +98,31 @@ for seg in ${segmenters} ; do optargs="-l 2" fi done + for proto in ${L1PROTOS} ; do + if [[ "${fn}" == ${proto} ]] ; then + # replace + optargs="-l 1" + fi + done + # comment out for "the wrong branch" + for proto in ${LEPROTOS} ; do + if [[ "${fn}" == $proto ]] ; then + # append + optargs="${optargs} -e" # -e: little endian + echo -e "\nlitte endian" + fi + done echo -e "\n${seg}, sigma ${sig} (${refines}): ${fn}" python src/nemetyl_align-segments.py ${fn} -f ${ref} -t ${seg} -s ${sig} ${optargs} --with-plots +# # # # # +# # the wrong branch: apply LE optimization to the BE protocols +# echo -e "\nforced litte endian" +# echo -e "\n${seg}, sigma ${sig} (${refines}): ${fn}" +# python src/nemetyl_align-segments.py ${fn} -f ${ref} -t ${seg} -s ${sig} -e ${optargs} --with-plots +# # # # # done + # mkdir ${report}/sig${sig}-${ref} + # mv reports/*.pdf ${report}/sig${sig}-${ref}/ done done done diff --git a/eval-netzob-fms.sh b/eval-netzob-fms.sh index 93ae6a8a..2b80d0ee 100755 --- a/eval-netzob-fms.sh +++ b/eval-netzob-fms.sh @@ -4,16 +4,14 @@ #input=input/*-1000.pcap #input="input/*-100.pcap input/*-1000.pcap" #input=input/maxdiff-filtered/*-1000.pcap -#input=input/maxdiff-fromOrig/*-1000.pcap +input="input/maxdiff-fromOrig/*-100*.pcap" +#input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" -input=input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap -#input="input/maxdiff-fromOrig/smb_SMIA20111010-one-rigid1_maxdiff-1000.pcap" L2PROTOS="input/awdl-* input/au-*" prefix="netzob-format" -# AWDL numpad="206" for f in reports/${prefix}-* ; do if [ -e "$f" ] ; then @@ -26,7 +24,7 @@ currcomm=$(git log -1 --format="%h") report=reports/${prefix}-${numpad}-fms-${currcomm} mkdir ${report} -smin=50 +smin=57 
pids=() for fn in ${input} ; do diff --git a/eval-netzob-messagetype.sh b/eval-netzob-messagetype.sh index 9e2d2eee..0fc560cb 100755 --- a/eval-netzob-messagetype.sh +++ b/eval-netzob-messagetype.sh @@ -21,29 +21,32 @@ mkdir ${report} #python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 78 --smax 78 #python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 79 --smax 79 # -# ## dns_ictf2010_deduped-982-1000.pcap #python src/netzob_messagetypes.py input/dns_ictf2010_deduped-982-1000.pcap -r --smin 49 --smax 51 # -# ## nbns_SMIA20111010-one_deduped-1000.pcap #python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 57 --smax 59 # -# ## ntp_SMIA-20111010_deduped-1000.pcap #python src/netzob_messagetypes.py input/ntp_SMIA-20111010_deduped-1000.pcap -r --smin 56 --smax 58 # -# ntp_SMIA-20111010_deduped-100.pcap -python src/netzob_messagetypes.py input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap -r --smin 56 --smax 58 - - +## ntp_SMIA-20111010_deduped-100.pcap +#python src/netzob_messagetypes.py input/ntp_SMIA-20111010_deduped-100.pcap -r --smin 56 --smax 58 +# ## smb_SMIA20111010-one_deduped-1000.pcap #python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 54 --smax 55 #python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 56 --smax 56 - - - +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/dhcp_SMIA2011101X-filtered_maxdiff-100.pcap -r --smin 76 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/dhcp_SMIA2011101X-filtered_maxdiff-1000.pcap -r --smin 76 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/dns_ictf2010-new_maxdiff-100.pcap -r --smin 50 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/dns_ictf2010-new_maxdiff-1000.pcap -r --smin 50 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/nbns_SMIA20111010-one_maxdiff-100.pcap -r --smin 53 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/nbns_SMIA20111010-one_maxdiff-1000.pcap -r --smin 53 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap -r --smin 66 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-1000.pcap -r --smin 66 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/smb_SMIA20111010-one-rigid1_maxdiff-100.pcap -r --smin 53 +#python src/netzob_messagetypes.py input/maxdiff-fromOrig/smb_SMIA20111010-one-rigid1_maxdiff-1000.pcap -r --smin 53 mv reports/*.csv ${report}/ diff --git a/input/Sources.md b/input/Sources.md index ad603335..d6617bae 100644 --- a/input/Sources.md +++ b/input/Sources.md @@ -11,18 +11,20 @@ * Concatenate pcaps: `mergecap -F pcap -w OUTFILE INFILES` * Change encapulation: `editcap -F pcap -T ENCTYPE INFILE OUTFILE` * Deduplicate and truncate to fixed size: `prep_deduplicate-trace.py PCAP --p [N]` +* **maxdiff** `python ./src/prep_filter-maxdiff-trace.py -p [N] -r -f valcom PCAP` ## SMIA -[Netresec-Page](http://download.netresec.com/pcap/smia-2011/SMIA_2011-10-10_08%253A03%253A19_CEST_632834000_file1.pcap) +[Netresec-Page](http://download.netresec.com/pcap/smia-2011/SMIA_2011-10-10_08%253A03%253A19_CEST_632834000_file1.pcap) ### NTP * ntp_SMIA-20111010.pcap * from SMIA_2011-10-10_08-03-19_CEST_632834000_file1.pcap * filtered by `tshark -2 -R "ntp && !icmp" ... 
-F pcap` * by `python src/prep_deduplicate-trace.py ntp_SMIA-20111010.pcap --p [N]` - * filter out `ntp.flags.mode == 6` + * filter out: `!ntp.flags.mode == 6 && !ntp.flags.mode == 7` + (for mode == 7 the dissector is incomplete) ### DHCP * dhcp_SMIA-20111010_deduped-100.pcap @@ -44,6 +46,8 @@ * maxdiff-fromOrig/dhcp_SMIA2011101X-filtered_maxdiff-1100.pcap * from trace-collection/sources/A_SMIA/SMIA_2011-10-10+11/dhcp_SMIA2011101X-filtered.pcap +**For newer Wireshark versions this is now: `dhcp && !dhcp.option.user_class && !icmp`** + ### Netbios Name Server * nbns_SMIA20111010-one_deduped-100.pcap * from SMIA_2011-10-10_08_632834000_file1-splits/nbns @@ -63,7 +67,7 @@ * filtered by adapted /home/stephan/REUP-common/trace-collection/sources/filters/filter-smb.sh * smb_maccdc2012_maxdiff-1100.pcap * from /media/usb0/project-raw-files/traces/MACCDC2012/smb_maccdc2012.pcap - * packets of trace are 802.11q VLAN encapsulated, strip to IP: + * packets of trace are 802.11q VLAN encapsulated, strip to IP: python3 strip_encapsulation.py smb_maccdc2012_000*-f2.pcap * filtered by /home/stephan/REUP-common/trace-collection/sources/filters/filter-smb.sh * download command and additional infos: /media/usb0/project-raw-files/traces/MACCDC2012/source.txt @@ -71,7 +75,7 @@ ## iCTF 2010 -[UCSB](http://ictf.cs.ucsb.edu/ictfdata/2010/dumps/ictf2010pcap.tar.gz) +[UCSB](http://ictf.cs.ucsb.edu/ictfdata/2010/dumps/ictf2010pcap.tar.gz) ### IRC * irc_ictf2010-42.pcap @@ -104,7 +108,7 @@ ## Random Validation to find structure: Generated PCAPs with no structure (random byte sequences): -generate_random_pcap.py +generate_random_pcap.py with parameters: * -l 100 diff --git a/input/maxdiff-fromOrig/dns_ictf2010_maxdiff-100.pcap b/input/hide/dns_ictf2010_maxdiff-100.pcap similarity index 100% rename from input/maxdiff-fromOrig/dns_ictf2010_maxdiff-100.pcap rename to input/hide/dns_ictf2010_maxdiff-100.pcap diff --git a/input/maxdiff-fromOrig/dns_ictf2010_maxdiff-1000.pcap b/input/hide/dns_ictf2010_maxdiff-1000.pcap similarity index 100% rename from input/maxdiff-fromOrig/dns_ictf2010_maxdiff-1000.pcap rename to input/hide/dns_ictf2010_maxdiff-1000.pcap diff --git a/input/splitcap.sh b/input/splitcap.sh new file mode 100755 index 00000000..3ffe68fc --- /dev/null +++ b/input/splitcap.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# +# Split pcaps ending with "-1100.pcap" into disjunct 100 and 1000 parts. 
+ +ext="-1100.pcap" + +for fn in ${@} ; do + bn=$(basename -s "${ext}" "${fn}") + editcap -r ${fn} "${bn}-100.pcap" 1-100 + editcap -r ${fn} "${bn}-1000.pcap" 101-1100 +done diff --git a/requirements.txt b/requirements.txt index 5f0ed255..3e4f08ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,22 @@ -bitarray==0.8.1 -colorhash==1.0.3 -Cython==0.29.21 -GitPython==3.1.18 -hdbscan==0.8.26 -humanhash==0.0.1 -ipython==7.19.0 -kneed==0.7.0 -matplotlib==3.3.4 -netaddr==0.8.0 -networkx==2.5.1 -numpy==1.19.5 -pandas==1.2.1 -pcapy==0.10.10 -scapy==2.4.4 -scikit-learn==0.24.1 -scipy==1.6.0 -tabulate==0.8.7 -xlwt==1.3.0 +bitstring>=3.1.7 +colorhash>=1.0.3 +Cython>=0.29.21 +GitPython>=3.1.18 +hdbscan>=0.8.26 +humanhash>=0.0.1 +ipython>=7.31.1 +kneed>=0.7.0 +matplotlib>=3.3.4 +netaddr>=0.8.0 +networkx>=2.5.1 +numpy>=1.22.0 +pandas>=1.2.1 +pcapy>=0.10.10 +scapy>=2.4.4 +scikit-learn>=0.24.1 +scipy>=1.6.0 +tabulate>=0.8.7 +xlwt>=1.3.0 +pyitlib>=0.2.2 +# netzob==1.0.3 # we require the next version from github: https://github.com/netzob/netzob/tree/next +nose>=1.3.7 diff --git a/src/Contents.md b/src/Contents.md index 27ccf1f5..bc7a7a39 100644 --- a/src/Contents.md +++ b/src/Contents.md @@ -9,6 +9,8 @@ ### check_pcap-info.py ### prep_deduplicate-trace.py +### prep_filter-maxdiff-trace.py +### transform_cluster-statistics.py @@ -23,6 +25,26 @@ WOOT 2018 #### nemesys_field-deviation-plot.py #### nemesys_fms.py #### nemesys.py +#### analysis_message_plot.py + +### PCA +(Usenix 2019) +#### nemesys_pca-refinement.py +#### (nemesys_pca-refinement-iterate-params.py) +#### nemezero_pca-refinement.py + + + + +## Pilot-Tests: Message Type Identification (NEMETYL pre) + +### analyze_fieldtype_separation.py +### fieldtype-aware_distances.py +### mixedlength_fieldtype_offsets.py +### visualize_fieldtype_separation.py + +... + @@ -36,3 +58,54 @@ baseline +## Pilot-Tests: Field Type Recognition (NEMEFTR pre) + +### nemeftr_cluster-true-fields.py +NEMEFTR: early state and evaluation of epsilon autoconfiguration. + +optimal-segmentation **baseline** + +* eval-nemeftr-truefield.sh +* tft +* cluster statistics; precision, recall + +### visualize_fieldtype_separation.py +Topology plot: template centers of true field types +Histogram: type-separation per true field type + +* eval-nemeftr-visualize.sh +* Histograms used in nemeftr-full + +### characterize_fieldtypes.py +Very old state: legacy + + + + +## NEMEFTR + +### nemeftr_cluster-segments.py +NEMEFTR-full mode 1: + +Clustering of segments on similarity without ground truth. + +* eval-nemeftr-clustering.sh +* cft +* cluster statistics; precision, recall + +### nemeftr_generate-fieldtype-templates.py +NEMEFTR-full mode 2, step 1: + +Generate FieldTypeTemplates representing data types. + +* Run manually + +### nemeftr_field-recognition.py +NEMEFTR-full mode 2, step 2: + +Recognize field types by templates. Templates are previously generated by `generate-fieldtype-templates.py` + +* eval-nemeftr-recognize.sh +* recognize +* FMS + diff --git a/src/check_parse-pcap.py b/src/check_parse-pcap.py index 31f43c83..61facdd3 100644 --- a/src/check_parse-pcap.py +++ b/src/check_parse-pcap.py @@ -4,7 +4,7 @@ interpreted correctly to create a baseline to compare inferences to. """ -import time, logging +import time from argparse import ArgumentParser from os.path import isfile from sys import exit @@ -52,10 +52,12 @@ ########################### # # Multiple messages with ParsedMessage.parseMultiple test: Dissection ran in 1.55 seconds. 
- if args.targetlayer: - pms = ParsedMessage.parseMultiple(pkt, args.targetlayer, args.relativeToIP, linktype=specimens.getBaseLayerOfPCAP()) - else: - pms = ParsedMessage.parseMultiple(pkt, linktype=specimens.getBaseLayerOfPCAP()) + # if args.targetlayer: + # pms = ParsedMessage.parseMultiple(pkt, args.targetlayer, args.relativeToIP, linktype=specimens.getBaseLayerOfPCAP()) + # pms = ParsedMessage.parseOneshot(specimens) + # else: + # pms = ParsedMessage.parseMultiple(pkt, linktype=specimens.getBaseLayerOfPCAP()) + pms = ParsedMessage.parseOneshot(specimens) pms = list(pms.values()) print("Dissection ran in {:3.2f} seconds.".format(time.time()-st)) diff --git a/src/check_pcap-info.py b/src/check_pcap-info.py index 0ab3e159..c0593bab 100644 --- a/src/check_pcap-info.py +++ b/src/check_pcap-info.py @@ -45,6 +45,7 @@ def meanByteDiff(messages: Sequence) -> List[List[float]]: print("Filename:", basename(args.pcapfilename)) print("PCAP base layer is:", specimens.getBaseLayerOfPCAP()) print("Longest message without its encapsulation:", specimens.maximumMessageLength) + print("Sum of message payload bytes:", specimens.cumulatedMessageLength) print("Most frequent byte values:") print(tabulate( ((hex(b), o) for b, o in countByteFrequency()[:10]) @@ -52,6 +53,7 @@ def meanByteDiff(messages: Sequence) -> List[List[float]]: print("Mean difference between bytes per message:", numpy.mean(list(chain.from_iterable(meanByteDiff(specimens.messagePool.keys()))))) # print(tabulate(meanByteDiff(specimens.messagePool.keys()))) + print() if args.interactive: print('Loaded PCAP in: specimens') diff --git a/src/nemeftr-prod_cluster-segments.py b/src/nemeftr-prod_cluster-segments.py new file mode 100644 index 00000000..243c6978 --- /dev/null +++ b/src/nemeftr-prod_cluster-segments.py @@ -0,0 +1,218 @@ +""" +Reference implementation for calling NEMEFTR-full mode 1, the NEtwork MEssage Field Type Recognition, +classification of data types, with an unknown protocol. + +Clustering of segments on similarity without ground truth. +Segments are created from messages by NEMESYS and clustered with DBSCANsegmentClusterer +and refined by the method selected at the command line (-r). +Generates segment-dissimilarity topology plots of the clustering result. 
+""" +import argparse +import os + +import IPython +import matplotlib.pyplot as plt +import numpy as numpy + +from nemere.inference.segmentHandler import originalRefinements, nemetylRefinements, isExtendedCharSeq +from nemere.inference.templates import ClusterAutoconfException, FieldTypeTemplate, DBSCANadjepsClusterer +from nemere.utils.evaluationHelpers import StartupFilecheck, CachedDistances, TitleBuilderSens +from nemere.utils.reportWriter import SegmentClusterReport +from nemere.visualization.distancesPlotter import DistancesPlotter +from nemere.visualization.simplePrint import FieldClassesPrinter + +debug = False + +# fix the analysis method to VALUE +analysis_method = 'value' +# fix the distance method to canberra +distance_method = 'canberra' +# tokenizers to select from +tokenizers = ('nemesys', 'zeros') +# refinement methods +refinementMethods = [ + "none", + "original", # WOOT2018 paper + "nemetyl", # INFOCOM2020 paper: ConsecutiveChars+moco+splitfirstseg + "PCAmocoSF", # PCA+moco+SF (v2) | applicable to zeros + "zerocharPCAmocoSF" # with split fixed (v2) + ] +# Parameter for DBSCAN epsilon autoconfiguration by Kneedle +kneedleSensitivity=24.0 + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Cluster NEMESYS segments of messages according to similarity.') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + parser.add_argument('-e', '--littleendian', help='Toggle presumed endianness to little.', action="store_true") + + parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.', + choices=tokenizers, default=tokenizers[0]) + parser.add_argument('-s', '--sigma', type=float, help='Sigma for noise reduction (gauss filter) in NEMESYS,' + 'default: 0.9') + parser.add_argument('-f', '--refinement', help='Select segment refinement method.', choices=refinementMethods, + default=refinementMethods[-1]) + + parser.add_argument('-p', '--with-plots', + help='Generate plots of true field types and their distances.', + action="store_true") + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + args = parser.parse_args() + + filechecker = StartupFilecheck(args.pcapfilename) + withplots = args.with_plots + littleendian = args.littleendian == True + tokenizer = args.tokenizer + if littleendian: + tokenizer += "le" + + # # # # # # # # # # # # # # # # # # # # # # # # + # cache/load the DistanceCalculator to the filesystem + # + fromCache = CachedDistances(args.pcapfilename, analysis_method, args.layer, args.relativeToIP) + # Note! When manipulating distances calculation, deactivate caching by uncommenting the following assignment. + # fromCache.disableCache = True + fromCache.debug = debug + # As we analyze a truly unknown protocol, tell CachedDistances that it should not try to use tshark to obtain + # a dissection. The switch may be set to true for evaluating the approach with a known protocol. 
+ # see src/nemetyl_align-segments.py + fromCache.dissectGroundtruth = False + fromCache.configureTokenizer(tokenizer, args.sigma) + refinement = args.refinement + if tokenizer[:7] == "nemesys": + if args.refinement == "original": + fromCache.configureRefinement(originalRefinements) + elif args.refinement == "nemetyl": + fromCache.configureRefinement(nemetylRefinements) + elif args.refinement is None or args.refinement == "none": + print("No refinement selected. Performing raw segmentation.") + else: + print(f"The refinement {args.refinement} is not supported with this tokenizer. Abort.") + exit(2) + try: + fromCache.get() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. " + "The original exception message was:\n", e) + exit(10) + segmentedMessages = fromCache.segmentedMessages + specimens, comparator, dc = fromCache.specimens, fromCache.comparator, fromCache.dc + segmentationTime, dist_calc_segmentsTime = fromCache.segmentationTime, fromCache.dist_calc_segmentsTime + + segments2cluster = dc.segments + + # extract char sequences + charSegments = list() + nonCharSegs = list() + for seg in segments2cluster: + if isExtendedCharSeq(seg.bytes): + charSegments.append(seg) + else: + nonCharSegs.append(seg) + + # # # # # # # # # # # # # # # # # # # # # # # # + # cluster segments to determine field types on commonality + clusterer = None # just to prevent PyCharm's warnings + try: + clusterer = DBSCANadjepsClusterer(dc, nonCharSegs, S=kneedleSensitivity) + clusterer.preventLargeCluster() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. " + "The original exception message was:\n", e) + exit(10) + # # # # # # # # # # # # # # # # # # # # # # # # # + if withplots: + clusterer.kneelocator.plot_knee() # plot_knee_normalized() + plt.text(0.5, 0.2, "S = {:.1f}\neps = {:.3f}\nk = {:.0f}".format(clusterer.S, clusterer.eps, clusterer.k)) + plt.savefig(os.path.join(filechecker.reportFullPath, "knee-{}-S{:.1f}-eps{:.3f}.pdf".format( + filechecker.pcapstrippedname, kneedleSensitivity, clusterer.eps))) + + noise, *clusters = clusterer.clusterSimilarSegments(False) + # # # # # # # # # # # # # # # # # # # # # # # # # + + inferenceParams = TitleBuilderSens(tokenizer, refinement, args.sigma, clusterer) + print("{} clusters generated from {} distinct segments".format(len(clusters), len(dc.segments))) + + if len(charSegments) > 0: + clusters.append(charSegments) + + # generate labels for inferred clusters. 
+ ftclusters = {"tf{:02d}".format(cLabel): segments for cLabel, segments in enumerate(clusters)} + ftclusters["Noise"] = noise + # alternative representation of the same clusters as FieldTypeTemplate + ftTemplates = list() + for cLabel, segments in ftclusters.items(): + ftype = FieldTypeTemplate(segments) + ftype.fieldtype = cLabel + ftTemplates.append(ftype) + + # # # # # # # # # # # # # # # # # # # # # # # # + # Report: write cluster elements to csv + # # # # # # # # # # # # # # # # # # # # # # # # + elementsReport = SegmentClusterReport(filechecker, filechecker.reportFullPath) + elementsReport.write(ftclusters) + + # # # # # # # # # # # # # # # # # # # # # # # # + # distance Topology plot + # # # # # # # # # # # # # # # # # # # # # # # # + if withplots: + # show only largest clusters + clusterCutoff = 20 + print("Plot distances...") + + # look up cluster sizes, sort them by size, and select the largest clusters (if clusterCutoff > 0) + clusterStatsLookup = {cLabel: len(segments) # label, numSegsinCuster + for cLabel, segments in ftclusters.items()} + sortedClusters = sorted([cLabel for cLabel in ftclusters.keys() if cLabel != "Noise"], + key=lambda x: -clusterStatsLookup[x]) + if clusterCutoff > 0: + selectedClusters = [ftt for ftt in sortedClusters][:clusterCutoff] + inferenceParams.postProcess = "largest{}clusters".format(clusterCutoff) + else: + selectedClusters = sortedClusters + atitle = 'segment-distances_' + inferenceParams.plotTitle + + # Generate the kind of labels suited and needed for the plot + omittedClusters = [ftt for ftt in sortedClusters if ftt not in selectedClusters] + ["Noise"] + clustermask = {segid: "{}: {} seg.s".format(ftt, clusterStatsLookup[ftt]) + for ftt in selectedClusters for segid in dc.segments2index(ftclusters[ftt])} + # In the plot, label everything as noise that is not in the selected clusters (including the actual noise) + clustermask.update({segid: "Noise" for segid in dc.segments2index( + [bs for ftt in omittedClusters for bs in ftclusters[ftt]] + )}) + labels = numpy.array([clustermask[segid] for segid in range(len(dc.segments))]) + + sdp = DistancesPlotter(specimens, atitle, False) + # hand over selected subset of clusters to plot + sdp.plotManifoldDistances(dc.segments, dc.distanceMatrix, labels) + # sdp.plotSegmentDistances(dc, labels) + sdp.writeOrShowFigure(filechecker.reportFullPath) + del sdp + + # # # # # # # # # # # # # # # # # # # # # # # # + # visualization of segments from clusters in messages. + # # # # # # # # # # # # # # # # # # # # # # # # + cp = FieldClassesPrinter(ftTemplates) + msgsupto400bytes = [msg for msg in specimens.messagePool.keys() if len(msg.data) <= 400] + cp.toTikzFile(msgsupto400bytes[:100]) + + + + filechecker.writeReportMetadata(fromCache.dccachefn if fromCache.isLoaded else None) + + if args.interactive: + # noinspection PyUnresolvedReferences + from collections import Counter + # noinspection PyUnresolvedReferences + from nemere.inference.segments import MessageSegment, TypedSegment + # noinspection PyUnresolvedReferences + import numpy + + # globals().update(locals()) + IPython.embed() diff --git a/src/nemeftr_cluster-segments.py b/src/nemeftr_cluster-segments.py new file mode 100644 index 00000000..597cdf09 --- /dev/null +++ b/src/nemeftr_cluster-segments.py @@ -0,0 +1,326 @@ +""" +NEMEFTR-full mode 1: +Clustering of segments on similarity without ground truth. However, ground truth is expected and used for evaluation. 
+Segments are created from messages by NEMESYS and clustered with DBSCANsegmentClusterer +and refined by the method selected at the command line (-r). +Generates segment-dissimilarity topology plots of the clustering result. +""" + +import argparse +import math + +from nemere.inference.templates import ClusterAutoconfException, DBSCANadjepsClusterer, \ + DBSCANsegmentClusterer, OPTICSsegmentClusterer +from nemere.inference.segmentHandler import baseRefinements, originalRefinements, \ + isExtendedCharSeq, nemetylRefinements +from nemere.utils.reportWriter import IndividualClusterReport, CombinatorialClustersReport, \ + SegmentClusterGroundtruthReport, writeFieldTypesTikz +from nemere.validation.clusterInspector import SegmentClusterCauldron +from nemere.visualization.distancesPlotter import SegmentTopology +from nemere.utils.evaluationHelpers import * + +debug = False + +# fix the analysis method to VALUE +analysis_method = 'value' +# fix the distance method to canberra +distance_method = 'canberra' + +# Parameter for DBSCAN epsilon autoconfiguration by Kneedle +kneedleSensitivity=24.0 +# kneedleSensitivity=4.0 +# kneedleSensitivity=6.0 +# kneedleSensitivity=9.0 + + + +def inferred4segment(segment: MessageSegment) -> Sequence[MessageSegment]: + """ + :param segment: The input segment. + :return: All inferred segments for the message which the input segment is from. + """ + return next(msegs for msegs in segmentedMessages if msegs[0].message == segment.message) + +def inferredFEs4segment(segment: MessageSegment) -> List[int]: + return [infs.nextOffset for infs in inferred4segment(segment)] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Cluster NEMESYS segments of messages according to similarity.') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + parser.add_argument('-s', '--sigma', type=float, help='Sigma for noise reduction (gauss filter) in NEMESYS,' + 'default: 0.9') + parser.add_argument('-p', '--with-plots', + help='Generate plots of true field types and their distances.', + action="store_true") + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. 
Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.', + choices=CachedDistances.tokenizers, default=CachedDistances.tokenizers[0]) + parser.add_argument('-f', '--refinement', help='Select segment refinement method.', + choices=CachedDistances.refinementMethods, default=CachedDistances.refinementMethods[-1]) + parser.add_argument('-e', '--littleendian', help='Toggle presumed endianness to little.', action="store_true") + args = parser.parse_args() + + filechecker = StartupFilecheck(args.pcapfilename) + withplots = args.with_plots + littleendian = args.littleendian == True + analyzerType = analyses[analysis_method] + analysisArgs = None + analysisTitle = analysis_method + tokenizer = args.tokenizer + if littleendian: + tokenizer += "le" + + # # # # # # # # # # # # # # # # # # # # # # # # + # cache/load the DistanceCalculator to the filesystem + # + fromCache = CachedDistances(args.pcapfilename, analysisTitle, args.layer, args.relativeToIP) + # Note! When manipulating distances calculation, deactivate caching by uncommenting the following assignment. + # fromCache.disableCache = True + fromCache.debug = debug + if analysisArgs is not None: + # noinspection PyArgumentList + fromCache.configureAnalysis(*analysisArgs) + fromCache.configureTokenizer(tokenizer, args.sigma) + fromCache.filter = True + + refinement = args.refinement + if tokenizer[:7] == "nemesys" or tokenizer[:4] == "bide": + if args.refinement == "original": + fromCache.configureRefinement(originalRefinements) + elif args.refinement == "base": + fromCache.configureRefinement(baseRefinements) + elif args.refinement == "nemetyl": + fromCache.configureRefinement(nemetylRefinements) + elif args.refinement is None or args.refinement == "none": + print("No refinement selected. Performing raw segmentation.") + else: + print(f"The refinement {args.refinement} is not supported with this tokenizer. Abort.") + exit(2) + try: + fromCache.get() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. " + "The original exception message was:\n", e) + exit(10) + segmentedMessages = fromCache.segmentedMessages + # noinspection PyTypeChecker + specimens, comparator, dc = fromCache.specimens, fromCache.comparator, fromCache.dc # type: SpecimenLoader, MessageComparator, DistanceCalculator + segmentationTime, dist_calc_segmentsTime = fromCache.segmentationTime, fromCache.dist_calc_segmentsTime + # + # # gt for manual usage + # trueSegmentedMessages = {msgseg[0].message: msgseg + # for msgseg in annotateFieldTypes(analyzerType, analysisArgs, comparator)} + # # # # # # # # # # # # # # # # # # # # # # # # + + # Configure some clustering alternatives during evaluations: (can be removed after final decision on either way.) 
+ separateChars = True + singularsFromNoise = False + # to evaluate clustering of unique-valued segments + clusterUnique = False + if clusterUnique: + # TODO eval clustering of unique-valued segments + segments2cluster = dc.segments + else: + segments2cluster = dc.rawSegments + + # extract char sequences + if separateChars: + charSegments = list() + nonCharSegs = list() + for seg in segments2cluster: + if isExtendedCharSeq(seg.bytes): + charSegments.append(seg) + else: + nonCharSegs.append(seg) + + # # # # # # # # # # # # # # # # # # # # # # # # + # cluster segments to determine field types on commonality + clusterer = None # just to prevent PyCharm's warnings + try: + if separateChars: + if tokenizer[:7] == "nemesys": + # noinspection PyUnboundLocalVariable + clusterer = DBSCANadjepsClusterer(dc, nonCharSegs, S=kneedleSensitivity) + if args.refinement in ["emzcPCAmocoSF", "zerocharPCAmocoSF"]: + clusterer.eps *= 1.3 + else: + clusterer = DBSCANsegmentClusterer(dc, nonCharSegs, S=kneedleSensitivity) + # clusterer = OPTICSsegmentClusterer(dc, nonCharSegs) + # clusterer = HDBSCANsegmentClusterer(dc, nonCharSegs, min_cluster_size=12) + else: + if tokenizer[:7] == "nemesys": + clusterer = DBSCANadjepsClusterer(dc, segments2cluster, S=kneedleSensitivity) + else: + clusterer = DBSCANsegmentClusterer(dc, segments2cluster, S=kneedleSensitivity) + # clusterer = OPTICSsegmentClusterer(dc, segments2cluster) + # clusterer = HDBSCANsegmentClusterer(dc, segments2cluster, min_cluster_size=12) + # # # # # # # # # # # # # # # # # # # # # # # # # + if isinstance(clusterer, DBSCANsegmentClusterer): + clusterer.preventLargeCluster() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed using DBSCAN." + " The original exception message was:\n ", e, "\nFalling back to OPTICS clusterer.") + ms = round(math.sqrt(len(nonCharSegs if separateChars else segments2cluster))) + clusterer = OPTICSsegmentClusterer(dc, nonCharSegs, min_samples=ms) + # print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. 
" + # "The original exception message was:\n", e) + # exit(10) + # # # # # # # # # # # # # # # # # # # # # # # # # + + inferenceParams = TitleBuilderSens(tokenizer, refinement, args.sigma, clusterer) + + cauldron = SegmentClusterCauldron(clusterer, analysisTitle) + + if separateChars: + # noinspection PyUnboundLocalVariable + cauldron.appendCharSegments(charSegments) + # # TODO extract "large" templates from noise that should rather be its own cluster + if singularsFromNoise: + cauldron.extractSingularFromNoise() + # # # # # # # # # # # # # # # # # # # # # # # # + # # # # # # # # # # # # # # # # # # # # # # # # # + # TODO make one "enum" cluster + # see nemere.validation.clusterInspector.SegmentClusterCauldron.extractSingularFromNoise + # # # # # # # # # # # # # # # # # # # # # # # # # + + cauldron.clustersOfUniqueSegments() + + # # # # # # # # # # # # # # # # # # # # # # # # + fTypeTemplates = cauldron.exportAsTemplates() + + # # for treating singular clusters individually (in plot and reports) + # # TODO reinstate non-singular clusters for everything else besides Topo Plots + # for i in cauldron.unisegClusters.clusterIndices: + # # generate FieldTypeTemplates (padded nans) - Templates as is + # ftype = FieldTypeTemplate(cauldron.unisegClusters.clusterElements(i)) + # ftype.fieldtype = cauldron.unisegClusters.clusterLabel(i) + # fTypeTemplates.append(ftype) + + # fTypeContext = list() + # for cLabel, segments in enumerate(clusters): + # # generate FieldTypeContexts (padded values) - Templates resolved to single Segments + # resolvedSegments = resolveTemplates2Segments(segments) + # fcontext = FieldTypeContext(resolvedSegments) + # fcontext.fieldtype = ftype.fieldtype + # fTypeContext.append(fcontext) + + # print("\nCluster", cLabel, "Segments", len(segments)) + # print({seg.bytes for seg in segments}) + + # for seg in segments: + # # # sometimes raises: ValueError: On entry to DLASCL parameter number 4 had an illegal value + # # try: + # # confidence = float(ftype.confidence(numpy.array(seg.values))) if ftype.length == seg.length else 0.0 + # # except ValueError as e: + # # print(seg.values) + # # raise e + # # + # + # confidence = 0.0 + # if isinstance(seg, Template): + # for bs in seg.baseSegments: + # recog = RecognizedVariableLengthField(bs.message, ftype, bs.offset, bs.nextOffset, confidence) + # printFieldContext(trueSegmentedMessages, recog) + # else: + # recog = RecognizedVariableLengthField(seg.message, ftype, seg.offset, seg.nextOffset, confidence) + # printFieldContext(trueSegmentedMessages, recog) + # # # # # # # # # # # # # # # # # # # # # # # # + + + # # # # # # # # # # # # # # # # # # # # # # # # + # Templates resolved to single Segments + # see adjustments to make in nemere.utils.reportWriter.SegmentClusterGroundtruthReport._writeCSV + # if dc.segments != clusterer.segments: + # # print("resolve Templates") + # ftclusters = {ftc.fieldtype : ftc.baseSegments for ftc in fTypeContext} + # ftclusters["Noise"] = resolveTemplates2Segments(noise) + # else: + # print("keep Templates") + # Templates as is + ftclusters = {ftc.fieldtype: ftc.baseSegments for ftc in fTypeTemplates} + """ftclusters is a mixed list of MessageSegment and Template""" + ftclusters["Noise"] = cauldron.noise + # ftclusters["Noise"] = noise + + # # # # # # # # # # # # # # # # # # # # # # # # + # Report: write cluster elements to csv + # # # # # # # # # # # # # # # # # # # # # # # # + elementsReport = SegmentClusterGroundtruthReport(comparator, dc.segments, filechecker) + # # unknown segments + # unk = [(o, t) 
for o, t in typedMatchSegs.values() if o < 1] + # # print all unidentified segments + # for ovr, seg in unk: + # print("overlap: {:.2f}".format(ovr), seg.fieldtype if isinstance(seg, TypedSegment) else "") + # # if isinstance(seg, Template): + # # for bs in seg.baseSegments: + # # comparator.pprint2Interleaved(bs.message, mark=bs) + # # else: + # comparator.pprint2Interleaved(seg.message, mark=seg) + elementsReport.write(ftclusters) + # # # # # # # # # # # # # # # # # # # # # # # # + + # # # # # # # # # # # # # # # # # # # # # # # # + # # Report: allover clustering quality statistics + # # # # # # # # # # # # # # # # # # # # # # # # + report = CombinatorialClustersReport(elementsReport.groundtruth, filechecker) + report.write(ftclusters, inferenceParams.plotTitle) + # + # field-type-wise cluster quality statistics + report = IndividualClusterReport(elementsReport.groundtruth, filechecker) + # add column $d_max$ to report + cluDists = {lab: clusterer.distanceCalculator.distancesSubset(clu) for lab, clu in ftclusters.items()} + cluDistsMax = {lab: clu.max() for lab, clu in cluDists.items()} + report.addColumn(cluDistsMax, "$d_max$") + # + # # clusters with any internal distance > 0 + # nonZeroDmaxClusters = {lab: clu for lab, clu in ftclusters.items() if cluDistsMax[lab] > 0} + # + report.write(ftclusters, inferenceParams.plotTitle) + clusterStats = report.precisionRecallList + + # # # # # # # # # # # # # # # # # # # # # # # # + if withplots: + # distance Topology plot + topoplot = SegmentTopology(clusterStats, fTypeTemplates, cauldron.noise, dc) + topoplot.writeFigure(specimens, inferenceParams, elementsReport, filechecker) + # # # # # # # # # # # # # # # # # # # # # # # # + writeFieldTypesTikz(comparator, segmentedMessages, fTypeTemplates, filechecker) + # # # # # # # # # # # # # # # # # # # # # # # # + filechecker.writeReportMetadata(fromCache.dccachefn if fromCache.isLoaded else None) + + # # show position of each segment individually. + # for clu in clusters: + # print("# "*20) + # for seg in clu: + # markSegNearMatch(seg) + + # # # show segmentation of messages. + # for msgsegs in inferredSegmentedMessages: + # comparator.pprint2Interleaved(msgsegs[0].message, [infs.nextOffset for infs in msgsegs]) + + if args.interactive: + from collections import Counter + from nemere.inference.segments import MessageSegment, TypedSegment + # noinspection PyUnresolvedReferences + import numpy + + # globals().update(locals()) + IPython.embed() + + + + + + + + + + diff --git a/src/nemeftr_cluster-segments_iterate-eps.py b/src/nemeftr_cluster-segments_iterate-eps.py new file mode 100644 index 00000000..15dc1c95 --- /dev/null +++ b/src/nemeftr_cluster-segments_iterate-eps.py @@ -0,0 +1,267 @@ +""" +NEMEFTR-full mode 1: +Clustering of segments on similarity without ground truth. However, ground truth is expected and used for evaluation. +Segments are created from messages by NEMESYS and clustered with DBSCANsegmentClusterer +and refined by the method selected at the command line (-r). +Generates segment-dissimilarity topology plots of the clustering result. 
+""" + +import argparse +from math import log + +from nemere.inference.templates import FieldTypeTemplate, DBSCANsegmentClusterer, ClusterAutoconfException +from nemere.inference.segmentHandler import baseRefinements, originalRefinements, \ + isExtendedCharSeq, nemetylRefinements +from nemere.utils.reportWriter import IndividualClusterReport, CombinatorialClustersReport, \ + SegmentClusterGroundtruthReport +from nemere.visualization.distancesPlotter import SegmentTopology +from nemere.utils.evaluationHelpers import * + +debug = False + +# fix the analysis method to VALUE +analysis_method = 'value' +# fix the distance method to canberra +distance_method = 'canberra' +# tokenizers to select from +tokenizers = ('nemesys', 'zeros') # zeroslices + CropChars +# refinement methods +refinementMethods = [ + "none", + "original", # WOOT2018 paper + "base", # ConsecutiveChars+moco + "nemetyl", # INFOCOM2020 paper: ConsecutiveChars+moco+splitfirstseg + "PCA1", # PCA 1-pass | applicable to nemesys and zeros + "PCAmoco", # PCA+moco + "PCAmocoSF", # PCA+moco+SF (v2) | applicable to zeros + "zerocharPCAmocoSF", # with split fixed (v2) + ] + + + +def inferred4segment(segment: MessageSegment) -> Sequence[MessageSegment]: + """ + :param segment: The input segment. + :return: All inferred segments for the message which the input segment is from. + """ + return next(msegs for msegs in segmentedMessages if msegs[0].message == segment.message) + +def inferredFEs4segment(segment: MessageSegment) -> List[int]: + return [infs.nextOffset for infs in inferred4segment(segment)] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Cluster NEMESYS segments of messages according to similarity.') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + parser.add_argument('-s', '--sigma', type=float, help='Sigma for noise reduction (gauss filter) in NEMESYS,' + 'default: 0.9') + parser.add_argument('-p', '--with-plots', + help='Generate plots of true field types and their distances.', + action="store_true") + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.', + choices=tokenizers, default=tokenizers[0]) + parser.add_argument('-f', '--refinement', help='Select segment refinement method.', choices=refinementMethods, + default=refinementMethods[-1]) + parser.add_argument('-e', '--littleendian', help='Toggle presumed endianness to little.', action="store_true") + args = parser.parse_args() + + filechecker = StartupFilecheck(args.pcapfilename) + withplots = args.with_plots + littleendian = args.littleendian == True + analyzerType = analyses[analysis_method] + analysisArgs = None + analysisTitle = analysis_method + tokenizer = args.tokenizer + if littleendian: + tokenizer += "le" + + # # # # # # # # # # # # # # # # # # # # # # # # + # cache/load the DistanceCalculator to the filesystem + # + fromCache = CachedDistances(args.pcapfilename, analysisTitle, args.layer, args.relativeToIP) + # Note! 
When manipulating distances calculation, deactivate caching by uncommenting the following assignment. + # fromCache.disableCache = True + fromCache.debug = debug + if analysisArgs is not None: + # noinspection PyArgumentList + fromCache.configureAnalysis(*analysisArgs) + fromCache.configureTokenizer(tokenizer, args.sigma) + + refinement = args.refinement + if tokenizer[:7] == "nemesys": + if args.refinement == "original": + fromCache.configureRefinement(originalRefinements) + elif args.refinement == "base": + fromCache.configureRefinement(baseRefinements) + elif args.refinement == "nemetyl": + fromCache.configureRefinement(nemetylRefinements) + elif args.refinement is None or args.refinement == "none": + print("No refinement selected. Performing raw segmentation.") + else: + print(f"The refinement {args.refinement} is not supported with this tokenizer. Abort.") + exit(2) + try: + fromCache.get() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. " + "The original exception message was:\n", e) + exit(10) + segmentedMessages = fromCache.segmentedMessages + # noinspection PyTypeChecker + specimens, comparator, dc = fromCache.specimens, fromCache.comparator, fromCache.dc # type: SpecimenLoader, MessageComparator, DistanceCalculator + segmentationTime, dist_calc_segmentsTime = fromCache.segmentationTime, fromCache.dist_calc_segmentsTime + + trueSegmentedMessages = {msgseg[0].message: msgseg + for msgseg in annotateFieldTypes(analyzerType, analysisArgs, comparator)} + # # # # # # # # # # # # # # # # # # # # # # # # + + + # Configure some clustering alternatives during evaluations: (can be removed after final decision on either way.) + separateChars = True + # to evaluate clustering of unique-valued segments + clusterUnique = False + if clusterUnique: + # TODO eval clustering of unique-valued segments + segments2cluster = dc.segments + else: + segments2cluster = dc.rawSegments + + # extract char sequences + if separateChars: + charSegments = list() + nonCharSegs = list() + for seg in segments2cluster: + if isExtendedCharSeq(seg.bytes): + charSegments.append(seg) + else: + nonCharSegs.append(seg) + + for epsint in range(5,50): + eps = epsint*0.01 + # # # # # # # # # # # # # # # # # # # # # # # # + # cluster segments to determine field types on commonality + clusterer = None # just to prevent PyCharm's warnings + + print("Clustering...") + try: + if separateChars: + # noinspection PyUnboundLocalVariable + min_samples = round(log(len(nonCharSegs))) + clusterer = DBSCANsegmentClusterer(dc, nonCharSegs, min_samples=min_samples, eps=eps) + else: + min_samples = round(log(len(segments2cluster))) + clusterer = DBSCANsegmentClusterer(dc, segments2cluster, min_samples=min_samples, eps=eps) + # # # # # # # # # # # # # # # # # # # # # # # # # + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. 
" + "The original exception message was:\n", e) + exit(10) + # # # # # # # # # # # # # # # # # # # # # # # # # + + inferenceParams = TitleBuilderSens(tokenizer, refinement, args.sigma, clusterer) + + noise, *clusters = clusterer.clusterSimilarSegments(False) + print("{} clusters generated from {} distinct segments".format(len(clusters), len(dc.segments))) + # # # # # # # # # # # # # # # # # # # # # # # # + + # noinspection PyUnboundLocalVariable + if separateChars and len(charSegments) > 0: + clusters.append(charSegments) + + # The same value in different segments only represented by once + uniqueClusters = list() + for elements in clusters + [noise]: + # same template with different labels + uniqueSegments = {dc.segments[dc.segments2index([tSegment])[0]] for tSegment in elements} + uniqueClusters.append(sorted(uniqueSegments, key=lambda x: x.values)) + mixedSegments = [seg for seg, cnt in + Counter(tSegment for elements in uniqueClusters for tSegment in elements).items() + if cnt > 1] + for tSegment in mixedSegments: + mixedClusters = [elements for elements in uniqueClusters + if tSegment in elements] + assert len(mixedClusters) < 2 # that would be strange and we needed to find some solution then + toReplace = [sIdx for sIdx, mSegment in enumerate(mixedClusters[0]) if mSegment == tSegment] + for rIdx in reversed(sorted(toReplace)): + del mixedClusters[0][rIdx] + mixedClusters[0].append(("[mixed]", tSegment)) + uniqueNoise = uniqueClusters[-1] + uniqueClusters = uniqueClusters[:-1] + + + # # # # # # # # # # # # # # # # # # # # # # # # + fTypeTemplates = list() + fTypeContext = list() + for cLabel, segments in enumerate(uniqueClusters): + # generate FieldTypeTemplates (padded nans) - Templates as is + ftype = FieldTypeTemplate(segments) + ftype.fieldtype = "tf{:02d}".format(cLabel) + fTypeTemplates.append(ftype) + # # # # # # # # # # # # # # # # # # # # # # # # + + + # # # # # # # # # # # # # # # # # # # # # # # # + ftclusters = {ftc.fieldtype: ftc.baseSegments for ftc in fTypeTemplates} + """ftclusters is a mixed list of MessageSegment and Template""" + ftclusters["Noise"] = uniqueNoise + + # # # # # # # # # # # # # # # # # # # # # # # # + # Report: write cluster elements to csv + # # # # # # # # # # # # # # # # # # # # # # # # + elementsReport = SegmentClusterGroundtruthReport(comparator, dc.segments, filechecker) + elementsReport.write(ftclusters) + # # # # # # # # # # # # # # # # # # # # # # # # + + # # # # # # # # # # # # # # # # # # # # # # # # + # # Report: allover clustering quality statistics + # # # # # # # # # # # # # # # # # # # # # # # # + report = CombinatorialClustersReport(elementsReport.groundtruth, filechecker) + report.write(ftclusters, inferenceParams.plotTitle) + # + # field-type-wise cluster quality statistics + report = IndividualClusterReport(elementsReport.groundtruth, filechecker) + # add column $d_max$ to report + cluDists = {lab: clusterer.distanceCalculator.distancesSubset(clu) for lab, clu in ftclusters.items()} + cluDistsMax = {lab: clu.max() for lab, clu in cluDists.items()} + report.addColumn(cluDistsMax, "$d_max$") + # + report.write(ftclusters, inferenceParams.plotTitle) + clusterStats = report.precisionRecallList + + # # # # # # # # # # # # # # # # # # # # # # # # + if withplots: + # distance Topology plot + topoplot = SegmentTopology(clusterStats, fTypeTemplates, uniqueNoise, dc) + topoplot.writeFigure(specimens, inferenceParams, elementsReport, filechecker) + # # # # # # # # # # # # # # # # # # # # # # # # + + 
filechecker.writeReportMetadata(fromCache.dccachefn if fromCache.isLoaded else None) + + + + if args.interactive: + from collections import Counter + from nemere.inference.segments import MessageSegment, TypedSegment + # noinspection PyUnresolvedReferences + import numpy + + # globals().update(locals()) + IPython.embed() + + + + + + + + + + diff --git a/src/nemeftr_cluster-true-fields.py b/src/nemeftr_cluster-true-fields.py new file mode 100644 index 00000000..51ffa1f7 --- /dev/null +++ b/src/nemeftr_cluster-true-fields.py @@ -0,0 +1,296 @@ +""" +NEMEFTR: early state and evaluation of epsilon autoconfiguration. Optimal-segmentation baseline. + +Plot and print dissimilarities between segments. Clusters on dissimilarities and compares the results to ground truth. +Segmentations are obtained by dissectors and apply field type identification to them. +Output for evaluation are a dissimilarity topology plot and histogram, ECDF plot, clustered vector visualization plots, +and segment cluster statistics. + +Takes a PCAP trace of a known protocol, dissects each message into their fields, and yields segments from each of them. +These segments get analyzed by the given analysis method which is used as feature to determine their similarity. +The distance matrix is generated using representatives for field-type-hypothesis specific values modifications (chars). +Similar fields are then clustered by DBSCAN and for comparison plotted in groups of their real field types. +In addition, a MDS projection into a 2D plane for visualization of the relative distances of the features is plotted. +""" + +import argparse, IPython +import csv +from os.path import join, exists + +from matplotlib import pyplot as plt +import numpy, math + + +from nemere.utils.evaluationHelpers import analyses, StartupFilecheck, consolidateLabels, CachedDistances +from nemere.utils.reportWriter import plotMultiSegmentLines, CombinatorialClustersReport, reportFolder, \ + SegmentClusterGroundtruthReport +from nemere.inference.templates import DBSCANsegmentClusterer, ClusterAutoconfException +from nemere.inference.segmentHandler import segments2types, isExtendedCharSeq +from nemere.validation.clusterInspector import TypedSegmentClusterCauldron +from nemere.validation.dissectorMatcher import MessageComparator +from nemere.visualization.distancesPlotter import DistancesPlotter +from nemere.visualization.singlePlotter import SingleMessagePlotter + +debug = False + +# fix the analysis method to VALUE +analysisTitle = 'value' +# fix the distance method to canberra +distance_method = 'canberra' + +# kneedleSensitivity=9.0 +# kneedleSensitivity=8.0 +# kneedleSensitivity=4.0 +kneedleSensitivity=24.0 + +# for evaluation +besteps = { + "dhcp_SMIA2011101X_deduped-100.pcap": 0.188, + "dhcp_SMIA2011101X_deduped-1000.pcap": 0.251, + "dns_ictf2010_deduped-100.pcap": 0.483, + "dns_ictf2010_deduped-982-1000.pcap": 0.167, + "nbns_SMIA20111010-one_deduped-100.pcap": 0.346, + "nbns_SMIA20111010-one_deduped-1000.pcap": 0.400, + "ntp_SMIA-20111010_deduped-100.pcap": 0.340, + "ntp_SMIA-20111010_deduped-1000.pcap": 0.351, + "smb_SMIA20111010-one_deduped-100.pcap": 0.259, + "smb_SMIA20111010-one_deduped-1000.pcap": 0.242, +} + + + + + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Analyze fields as segments of messages and plot field type identification quality.') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of 
writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + parser.add_argument('-p', '--with-plots', + help='Write plots and statistics, e. g., about the knee detection, Topology Plot.', + action="store_true") + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + args = parser.parse_args() + + filechecker = StartupFilecheck(args.pcapfilename) + withplots = args.with_plots + analyzerType = analyses[analysisTitle] + analysisArgs = None + singularsFromNoise = False + separateChars = False + + # + # # # # # # # # # # # # # # # # # # # # # # # # + # segment messages according to true fields from the labels and filter 1-byte segments before clustering, + # respectively cache/load the DistanceCalculator to the filesystem + # + fromCache = CachedDistances(args.pcapfilename, analysisTitle, args.layer, args.relativeToIP) + # Note! When manipulating distances calculation, deactivate caching by uncommenting the following assignment. + # fromCache.disableCache = True + fromCache.debug = debug + fromCache.configureAnalysis(analysisArgs) + fromCache.configureTokenizer("tshark", filtering=True) + try: + fromCache.get() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. " + "The original exception message was:\n", e) + exit(10) + specimens, comparator, dc = fromCache.specimens, fromCache.comparator, fromCache.dc + assert isinstance(comparator, MessageComparator) + + # extract char sequences + if separateChars: + charSegments = list() + nonCharSegs = list() + for seg in dc.rawSegments: + if isExtendedCharSeq(seg.bytes): + charSegments.append(seg) + else: + nonCharSegs.append(seg) + + print("Clustering...") + # # use HDBSCAN + # segmentGroups = segments2clusteredTypes(tg, analysisTitle, min_cluster_size=15) + + # use DBSCAN + # kneedleSensitivity += (log(len(dc.segments),10) - 2) * 4 # for larger traces + # clusterer = DBSCANsegmentClusterer(dc, S=kneedleSensitivity) + if separateChars: + # noinspection PyUnboundLocalVariable + clusterer = DBSCANsegmentClusterer(dc, nonCharSegs, S=kneedleSensitivity) + else: + # clustering raw, possibly duplicate segments, not unique values, here + clusterer = DBSCANsegmentClusterer(dc, dc.rawSegments, S=kneedleSensitivity) + + # only unique + clusterer.preventLargeCluster() + clusterer.min_samples = round(math.sqrt(len(clusterer.distanceCalculator.segments))) + + titleFormat = "tshark {} S {:.1f}".format( + clusterer, kneedleSensitivity) + + if withplots: + # Statistics about the knee detection + # # # # # # # # # # # # # # # # # # # # # # # # # + autoeps = max(clusterer.kneelocator.all_knees) + adjeps = clusterer.eps + clusterer.kneelocator.plot_knee() + plt.text(0.5, 0.2, "S = {:.1f}\nautoeps = {:.3f}\nadjeps = {:.3f}\nk = {:.0f}".format( + clusterer.S, autoeps, adjeps, clusterer.k)) + plt.axvline(adjeps, linestyle="dotted", color="blue", alpha=.4) + plt.savefig(join(filechecker.reportFullPath, "knee-{}-S{:.1f}-eps{:.3f}.pdf".format( + filechecker.pcapstrippedname, kneedleSensitivity, clusterer.eps))) + + clusterer.kneelocator.plot_knee_normalized() + plt.text(0.7, 0.5, "S = {:.1f}\nautoeps = {:.3f}\nknee_y = {:.3f}\nnorm_dmax = {:.3f}".format( + clusterer.S, autoeps, clusterer.kneelocator.knee_y, 
max(clusterer.kneelocator.y_difference_maxima))) + plt.savefig(join(filechecker.reportFullPath, "knee_normalized-{}-S{:.1f}-eps{:.3f}.pdf".format( + filechecker.pcapstrippedname, kneedleSensitivity, clusterer.eps))) + + kneePath = join(reportFolder, "knee-statistics.csv") + writeHeader = not exists(kneePath) + with open(kneePath, "a") as kneeStats: + kneecsv = csv.writer(kneeStats) # type: csv.writer + if writeHeader: + kneecsv.writerow([ "run_title", "trace", "autoeps", "adjeps", "k", "polydeg", "knee_y", "y_dmax", "norm_knees" ]) + + kneecsv.writerow([titleFormat, filechecker.pcapstrippedname, autoeps, adjeps, clusterer.k, + clusterer.kneelocator.polynomial_degree, + ";".join(f"{v:.5f}" for v in clusterer.kneelocator.all_knees_y), + ";".join(f"{v:.5f}" for v in clusterer.kneelocator.y_difference_maxima), + ";".join(f"{v:.5f}" for v in clusterer.kneelocator.all_norm_knees) + ]) + # # # # # # # # # # # # # # # # # # # # # # # # # + + # Histogram of all the distances between the segments + hstplt = SingleMessagePlotter(specimens, 'histo-distance-1nn-' + titleFormat, False) + # hstplt.histogram(tril(dc.distanceMatrix), bins=[x/50 for x in range(50)]) + knn = [dc.neighbors(seg)[0][1] for seg in dc.segments] + # print(knn) + hstplt.histogram(knn, bins=[x / 50 for x in range(50)]) + if filechecker.pcapbasename in besteps: + plt.axvline(besteps[filechecker.pcapbasename], label=besteps[filechecker.pcapbasename], + color="orchid", linestyle="dotted") + plt.axvline(clusterer.eps, label="{:.3f}".format(clusterer.eps), color="darkmagenta") + hstplt.writeOrShowFigure() + plt.clf() + + # # # # # # # # # # # # # # # # # # # # # # # # # + cauldron = TypedSegmentClusterCauldron(clusterer, analysisTitle) + if singularsFromNoise: + cauldron.extractSingularFromNoise() + if separateChars: + # noinspection PyUnboundLocalVariable + cauldron.appendCharSegments(charSegments) + uniqueGroups = cauldron.clustersOfUniqueSegments() + # # only use "real" clusters: + # segmentGroups = cauldron.segments2clusteredTypes() + # # # # # # # # # # # # # # # # # # # # # # # # # + + titleFormat = "{} ({}, {}-{})".format( + cauldron.analysisLabel(), + distance_method, + dc.thresholdFunction.__name__, + "".join([str(k) + str(v) for k, v in dc.thresholdArgs.items()]) if dc.thresholdArgs else '') + + if withplots: + clusterer.kneelocator.plot_knee() # plot_knee_normalized() + plt.text(0.5, 0.2, "S = {:.1f}\neps = {:.3f}\nk = {:.0f}".format(clusterer.S, clusterer.eps, clusterer.k)) + plt.savefig(join( + reportFolder, + "knee-{}-S{:.1f}-eps{:.3f}.pdf".format(filechecker.pcapstrippedname, kneedleSensitivity, clusterer.eps))) + + # # # # # # # # # # # # # # # # # # # # # # # # # + # re-extract cluster labels for segments, templates can only be represented as one label for this distinct position + print("Plot distances...") + sdp = DistancesPlotter(specimens, 'distances-' + titleFormat, False) + # labels = numpy.array([labelForSegment(segmentGroups, seg) for seg in dc.segments]) + labels = numpy.array([cauldron.label4segment(seg) for seg in dc.segments]) + + # we need to omit some labels, if the amount amount of unique labels is greater than threshold + # (to prevent the plot from overflowing) + uniqueLabelCount = len(set(labels)) + if uniqueLabelCount > 20: + consolidateLabels(labels) + sdp.plotManifoldDistances( + dc.segments, dc.distanceMatrix, labels) + sdp.writeOrShowFigure() + del sdp + # # # # # # # # # # # # # # # # # # # # # # # # # + + print("Prepare output...") + if withplots: + analysisLabel = cauldron.analysisLabel() + 
paginatedGroups = [ + (analysisLabel + " (regular clusters)", cauldron.regularClusters.clusters), + (analysisLabel + " (singular clusters)", cauldron.singularClusters.clusters) + ] + typeDict = segments2types(dc.rawSegments) + # for pagetitle, segmentClusters in segmentGroups: + for pagetitle, segmentClusters in paginatedGroups: + plotMultiSegmentLines(segmentClusters, specimens, pagetitle, # titleFormat + True, typeDict, False) + + # # total/all segments + # ftclusters = {label: resolveTemplates2Segments(e for t, e in elements) + # for label, elements in segmentGroups[0][1]} + # + # unique segments + ftclusters = {cauldron.unisegClusters.clusterLabel(i): cauldron.unisegClusters.clusterElements(i) + for i in range(len(cauldron.unisegClusters))} + # # remove noise if present + # may not be necessary any more (due to the new SegmentClusterCauldron class), + # leave at the moment to be on the safe side + noisekeys = [ftk for ftk in ftclusters.keys() if ftk.find("Noise") >= 0] + if len(noisekeys) > 0: + ftclusters["Noise"] = ftclusters[noisekeys[0]] + del ftclusters[noisekeys[0]] + # print("\n\nFor comparison, remove the failed cluster #10(?) of smb from ftclusters...\n\n") + # IPython.embed() + # # gt from ft + groundtruth = {seg: seg.fieldtype + for l, segs in ftclusters.items() for seg in segs} + # # (non?)unique segments + # ftclusters = {ftc.fieldtype: ftc.baseSegments for ftc in fTypeContext} + # ftclusters["Noise"] = resolveTemplates2Segments(noise) + # groundtruth = {rawSeg: typSeg[1].fieldtype if typSeg[0] > 0.5 else "[unknown]" + # for rawSeg, typSeg in typedMatchSegs.items()} + report = CombinatorialClustersReport(groundtruth, filechecker) + report.write(ftclusters, titleFormat) + + elementsReport = SegmentClusterGroundtruthReport(comparator, dc.segments, filechecker) + elementsReport.write(ftclusters) + # # # # # # # # # # # # # # # # # # # # # # # # # + + filechecker.writeReportMetadata(None) + + # # # # # # # # # # # # # # # # # # # # # # # # # + + for i in range(len(cauldron.regularClusters)): cauldron.regularClusters.plotDistances(i, specimens) + # cauldron.regularClusters.plotDistributions(specimens) + + # # # # # # # # # # # # # # # # # # # # # # # # # + # + # + # + # + # # # # # # # # # # # # # # # # # # # # # # # # # + + if args.interactive: + # noinspection PyUnresolvedReferences + from tabulate import tabulate + + # globals().update(locals()) + IPython.embed() + + + + diff --git a/src/nemeftr_cluster-true-fields_iterate-eps.py b/src/nemeftr_cluster-true-fields_iterate-eps.py new file mode 100644 index 00000000..79c6f528 --- /dev/null +++ b/src/nemeftr_cluster-true-fields_iterate-eps.py @@ -0,0 +1,199 @@ +""" +NEMEFTR: Optimal-segmentation baseline. Determine the best epsilon value by iterating all epsilons between 0.05 and 0.5. + +Plot and print dissimilarities between segments. Clusters on dissimilarities and compares the results to ground truth. +Segmentations are obtained by dissectors and apply field type identification to them. +Output for evaluation are a dissimilarity topology plot, histogram and segment cluster statistics. + +Takes a PCAP trace of a known protocol, dissects each message into their fields, and yields segments from each of them. +These segments get analyzed by the given analysis method which is used as feature to determine their similarity. +The distance matrix is generated using representatives for field-type-hypothesis specific values modifications (chars). +Similar fields are then clustered by DBSCAN iterating all epsilons between 0.05 and 0.5. 
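+
+The DBSCAN min_samples parameter is derived from the number of segments; a sketch of the
+two heuristics applied further below (the segment count of 1000 is illustrative only):
+
+    import math
+    min_samples = round(math.log(1000))   # ln-based default, e.g. 7 for 1000 segments
+    min_samples = round(math.sqrt(1000))  # sqrt-based override for unique segments, e.g. 32
+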
+In addition, a MDS projection into a 2D plane for visualization of the relative distances of the features is plotted. +""" + +import argparse, IPython +from collections import Counter +from itertools import chain +from matplotlib import pyplot as plt +import math, numpy + +from nemere.utils.evaluationHelpers import analyses, annotateFieldTypes, labelForSegment, StartupFilecheck, consolidateLabels +from nemere.utils.reportWriter import CombinatorialClustersReport, reportFolder, \ + SegmentClusterGroundtruthReport +from nemere.inference.templates import DBSCANsegmentClusterer, MemmapDC, DelegatingDC +from nemere.inference.segmentHandler import segments2clusteredTypes, isExtendedCharSeq +from nemere.validation.dissectorMatcher import MessageComparator +from nemere.utils.loader import SpecimenLoader +from nemere.visualization.distancesPlotter import DistancesPlotter +from nemere.visualization.singlePlotter import SingleMessagePlotter + +debug = False + +# fix the analysis method to VALUE +analysisTitle = 'value' +# fix the distance method to canberra +distance_method = 'canberra' + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Analyze fields as segments of messages and plot field type identification quality.') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + args = parser.parse_args() + + filechecker = StartupFilecheck(args.pcapfilename) + analyzerType = analyses[analysisTitle] + analysisArgs = None + + # dissect and label messages + print("Load messages...") + specimens = SpecimenLoader(args.pcapfilename, layer=args.layer, relativeToIP = args.relativeToIP) + comparator = MessageComparator(specimens, layer=args.layer, relativeToIP=args.relativeToIP, debug=debug) + + # segment messages according to true fields from the labels + print("Segmenting messages...") + # produce TypedSegments from dissection field information + segmentedMessages = annotateFieldTypes(analyzerType, analysisArgs, comparator) + # filter 1-byte segments before clustering + segments = [seg for seg in chain.from_iterable(segmentedMessages) if seg.length > 1 and set(seg.values) != {0} ] + + print("Calculate distances...") + if len(segments) ** 2 > MemmapDC.maxMemMatrix: + dc = MemmapDC(segments) + else: + dc = DelegatingDC(segments) + + separateChars = False + + # extract char sequences + if separateChars: + charSegments = list() + nonCharSegs = list() + for seg in segments: + if isExtendedCharSeq(seg.bytes): + charSegments.append(seg) + else: + nonCharSegs.append(seg) + + print("Clustering...") + for epsint in range(5,50): + eps = epsint*0.01 + min_samples = round(math.log(len(dc.segments))) + if separateChars: + # noinspection PyUnboundLocalVariable + clusterer = DBSCANsegmentClusterer(dc, nonCharSegs, min_samples=min_samples, eps=eps) + else: + clusterer = DBSCANsegmentClusterer(dc, dc.rawSegments, min_samples=min_samples, eps=eps) + + # only unique and sqrt instead of ln + clusterer.min_samples = round(math.sqrt(len(clusterer.distanceCalculator.segments))) + + titleFormat = "{} (eps {:.3f}, ms 
{})".format(distance_method, eps, clusterer.min_samples) + + # Histogram of all the distances between the segments + hstplt = SingleMessagePlotter(specimens, 'histo-distance-1nn-' + titleFormat, False) + # hstplt.histogram(tril(dc.distanceMatrix), bins=[x/50 for x in range(50)]) + knn = [dc.neighbors(seg)[0][1] for seg in dc.segments] + # print(knn) + hstplt.histogram(knn, bins=[x / 50 for x in range(50)]) + plt.axvline(clusterer.eps, label="{:.3f}".format(clusterer.eps), color="darkmagenta") + hstplt.writeOrShowFigure() + plt.clf() + + # # # # # # # # # # # # # # # # # # # # # # # # # + # noinspection PyUnboundLocalVariable + segmentGroups = segments2clusteredTypes(clusterer, analysisTitle, False, charSegments if separateChars else None) + titleFormat = "{} ({}, eps {:.3f}, ms {})".format( + segmentGroups[0][0], distance_method, eps, clusterer.min_samples) + + # The same value in different segments only represented by once + uniqueClusters = list() + for cLabel, elements in segmentGroups[0][1]: + # same template with different labels + uniqueSegments = {(sLabel, dc.segments[dc.segments2index([tSegment])[0]]) for sLabel, tSegment in elements} + uniqueClusters.append((cLabel, sorted(uniqueSegments, key=lambda x: x[1].values))) + mixedSegments = [seg for seg, cnt in + Counter(tSegment for cLabel, elements in uniqueClusters for sLabel, tSegment in elements).items() if cnt > 1] + for tSegment in mixedSegments: + mixedClusters = [elements for cLabel, elements in uniqueClusters + if tSegment in (sElem for sLabel, sElem in elements)] + assert len(mixedClusters) < 2 # that would be strange and we needed to find some solution then + toReplace = [sIdx for sIdx,sTuple in enumerate(mixedClusters[0]) if sTuple[1] == tSegment] + for rIdx in reversed(sorted(toReplace)): + del mixedClusters[0][rIdx] + mixedClusters[0].append(("[mixed]", tSegment)) + uniqueGroups = [(segmentGroups[0][0], uniqueClusters)] + # # only use "real" clusters: + # uniqueGroups = segmentGroups + + # # # # # # # # # # # # # # # # # # # # # # # # # + # re-extract cluster labels for segments, templates can only be represented as one label for this distinct position + print("Plot distances...") + sdp = DistancesPlotter(specimens, 'distances-' + titleFormat, False) + # labels = numpy.array([labelForSegment(segmentGroups, seg) for seg in dc.segments]) + labels = numpy.array([labelForSegment(uniqueGroups, seg) for seg in dc.segments]) + + # we need to omit some labels, if the amount amount of unique labels is greater than threshold + # (to prevent the plot from overflowing) + uniqueLabelCount = len(set(labels)) + if uniqueLabelCount > 20: + consolidateLabels(labels) + uniqueLabelCount = len(set(labels)) + if uniqueLabelCount > 20: + print("Still too many cluster labels!") + + sdp.plotManifoldDistances(dc.segments, dc.distanceMatrix, labels) + sdp.writeOrShowFigure() + del sdp + # # # # # # # # # # # # # # # # # # # # # # # # # + + + print("Prepare output...") + # typeDict = segments2types(segments) + # # for pagetitle, segmentClusters in segmentGroups: + # for pagetitle, segmentClusters in uniqueGroups: + # plotMultiSegmentLines(segmentClusters, specimens, titleFormat, + # True, typeDict, False) + + # # total/all segments + # ftclusters = {label: resolveTemplates2Segments(e for t, e in elements) + # for label, elements in segmentGroups[0][1]} + # + # unique segments + ftclusters = {label: [e for t, e in elements] + for label, elements in uniqueGroups[0][1]} + noisekeys = [ftk for ftk in ftclusters.keys() if ftk.find("Noise") >= 0] + if 
len(noisekeys) > 0: + ftclusters["Noise"] = ftclusters[noisekeys[0]] + del ftclusters[noisekeys[0]] + groundtruth = {seg: seg.fieldtype + for l, segs in ftclusters.items() for seg in segs} + + # # unique segments + # ftclusters = {ftc.fieldtype: ftc.baseSegments for ftc in fTypeContext} + # ftclusters["Noise"] = resolveTemplates2Segments(noise) + # groundtruth = {rawSeg: typSeg[1].fieldtype if typSeg[0] > 0.5 else "[unknown]" + # for rawSeg, typSeg in typedMatchSegs.items()} + report = CombinatorialClustersReport(groundtruth, filechecker) + report.write(ftclusters, titleFormat) + + elementsReport = SegmentClusterGroundtruthReport(comparator, dc.segments, filechecker, reportFolder) + elementsReport.write(ftclusters) + # # # # # # # # # # # # # # # # # # # # # # # # # + + filechecker.writeReportMetadata(None) + + if args.interactive: + IPython.embed() + + + + diff --git a/src/nemere/alignment/clusterMerging.py b/src/nemere/alignment/clusterMerging.py index d1bf4ff0..5e3076f7 100644 --- a/src/nemere/alignment/clusterMerging.py +++ b/src/nemere/alignment/clusterMerging.py @@ -23,6 +23,7 @@ def __init__(self, alignedClusters: Dict[int, List], dc: DistanceCalculator): self.dc = dc def generateHirsch(self, mmg=(0, -1, 5)): + """TODO fix the non-symmetric DYN-DYN similarity.""" alignedFields = {clunu: [field for field in zip(*cluelms)] for clunu, cluelms in self.alignedClusters.items() if clunu != -1} statDynFields = dict() diff --git a/src/nemere/inference/analyzers.py b/src/nemere/inference/analyzers.py index f9ce403c..088ff42c 100644 --- a/src/nemere/inference/analyzers.py +++ b/src/nemere/inference/analyzers.py @@ -5,11 +5,11 @@ :author: Stephan Kleber """ +import IPython import numpy import pandas -from bitarray import bitarray +from bitstring import Bits from typing import Dict, List, Tuple, Union, Type -from abc import abstractmethod from scipy.ndimage.filters import gaussian_filter1d from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage @@ -88,16 +88,14 @@ def bitCongruenceBetweenTokens(tokenlist: Union[List, bytes]): "Needs at least two tokens to determine a congruence. Token list is {}".format(tokenlist)) try: # We need a type that can be casted to byte. Do it as soon as possible to fail early and completely. for tokenA, tokenB in zip(tokenlist[:-1], tokenlist[1:]): - # converting and failsafes. Ugly bytes and bitarray! + # converting and failsafes. if not isinstance(tokenA, bytes): tokenA = bytes( [ tokenA] ) - bitsA = bitarray() - bitsA.frombytes(tokenA) + bitsA = Bits(bytes=tokenA) if not isinstance(tokenB, bytes): tokenB = bytes( [tokenB] ) - bitsB = bitarray() - bitsB.frombytes(tokenB) + bitsB = Bits(bytes=tokenB) bitlength = len(bitsA) if bitlength != len(bitsB): @@ -106,8 +104,8 @@ def bitCongruenceBetweenTokens(tokenlist: Union[List, bytes]): # finally do the real work: # total number of times (bits) subsequent tokens agree. - bAgree = ~ (bitsA ^ bitsB) # type: bitarray - congruencelist.append(bAgree.count() / bitlength) + bAgree = ~ (bitsA ^ bitsB) # type: Bits + congruencelist.append(bAgree.count(1) / bitlength) except TypeError as e: raise TypeError("Tokens must be convertible to bytes, which failed because: {} ".format(e)) return congruencelist @@ -145,7 +143,7 @@ def bitcongruences(self): def messageSegmentation(self) -> List[MessageSegment]: """ - Segment message by determining local maxima of sigma-1.5-gauss-filtered bit-congruence. + Segment message by determining local minima of sigma-1.5-gauss-filtered bit-congruence. 
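+
+        For reference, the per-byte bit congruence underlying this analysis (simple matching
+        of bits) can be sketched as follows, using bitstring.Bits as in this module
+        (illustrative only):
+
+            from bitstring import Bits
+            a, b = Bits(bytes=b'\x0f'), Bits(bytes=b'\x1f')
+            congruence = (~(a ^ b)).count(1) / len(a)  # 7 of 8 bits agree -> 0.875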
>>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage >>> tstmsg = '19040aec0000027b000012850a6400c8d23d06a2535ed71ed23d09faa4673315d23d09faa1766325d23d09faa17b4b10' @@ -361,6 +359,151 @@ def bcHighPlateaus(self): hiPlat[1].append(vl) return hiPlat +class BitCongruenceLE(BitCongruence): + """ + Little Endian version of + Bitwise congruence: Simple Matching [Sokal & Michener]. + + not unit-dependant, always byte-wise + """ + @property + def values(self): + """ + :return: The analysis values for this message, possibly prepended by NaN values + in the amount of startskip (see there), + after analyze() was called. None otherwise. + """ + if self._values is None: + return None + return self._values[::-1] + [numpy.nan] * self.startskip + + def analyze(self): + """ + Bitwise congruence: Simple Matching [Sokal & Michener]. + other kinds of bit variances from http://btluke.com/binclus.html + + :return: list of congruences from index i = 1 to n between bits of i-1 and i + """ + tokenlist = self._message.data[::-1] + self._values = BitCongruence.bitCongruenceBetweenTokens(tokenlist) + super().analyze() + +class BitCongruenceDeltaLE(BitCongruenceLE, BitCongruenceDelta): + @property + def bitcongruences(self): + """ + :return: basic bit congruences + """ + if self._bcvalues is None: + return None + return self._bcvalues[::-1] + [numpy.nan] * super().startskip + +class BitCongruenceDeltaGaussLE(BitCongruenceDeltaLE, BitCongruenceDeltaGauss): + @property + def bcdeltas(self): + """ + :return: bit congruence deltas without smoothing + """ + if self._bcdvalues is None: + return None + return self._bcdvalues[::-1] + [numpy.nan] * self.startskip + + def messageSegmentation(self) -> List[MessageSegment]: + """ + Segment message by determining inflection points of sigma-s-gauss-filtered bit-congruence. + The cut position is the delta max of the unsmoothed bcd in the scope of a min/max (rising) pair. + + additionally cut at high plateaus starts in the basic bc values. + + :return: Segmentation of this message based on this analyzer's type. + """ + if not self.values: + if not self._analysisArgs: + raise ValueError('No values or analysis parameters set.') + self.analyze() + + # CAVE: all following is in reversed index order! + + # cut one byte before the inflection + inflectionPoints = self.inflectionPoints() + inflectionCuts = [ int(i)-1 for i in inflectionPoints[0]] + + # get candidates to cut segments from message + cutCandidates = [0] + inflectionCuts \ + + [len(self._message.data)] # add the message end + # cut only where a segment is of a length larger than 1 + cutPositions = [0] + [right for left, right in zip( + cutCandidates[:-1], cutCandidates[1:] + ) if right - left > 1] + # cutPositions = list(sorted(cutPositions + nansep[0])) + # add the end of the message if its not already there + if cutPositions[-1] != cutCandidates[-1]: + cutPositions[-1] = cutCandidates[-1] + + segments = list() + # zip(cutPositions[::-1][:-1], cutPositions[::-1][1:]) is in simpler terms: + for cutCurr, cutNext in zip(cutPositions[:0:-1], cutPositions[-2::-1]): + # here we reverse the index order again, to reinstate the actual byte offsets + offset = len(self.values) - cutCurr + length = cutCurr - cutNext + segments.append(MessageSegment(self, offset, length)) + return segments + + def extrema(self) -> List[Tuple[int, bool]]: + """ + in reversed index order! 
+ :return: all extrema of the smoothed bcd, each described by a tuple of its index and bool (min is False) + """ + bcdNR = self.values[::-1] # values is in message byte order and with added nans for missing values + lmin = MessageAnalyzer.localMinima(bcdNR) + lmax = MessageAnalyzer.localMaxima(bcdNR) + nrExtrema = sorted( + [(i, False) for i in lmin[0]] + [(i, True) for i in lmax[0]], key=lambda k: k[0]) + return nrExtrema + + def risingDeltas(self) -> List[Tuple[int, numpy.ndarray]]: + """ + in reversed index order! + the deltas in the original bcd (so: 2nd delta) between minima and maxima in smoothed bcd + + :return: offset of and the bcd-delta values starting at this position in rising parts of the smoothed bcd. + Thus, offset is a minimum + 1 and the array covers the indices up to the following maximum, itself included. + """ + extrema = self.extrema() + risingdeltas = [ ( i[0] + 1, numpy.ediff1d(self.bcdeltas[::-1][i[0]:j[0]+1]) ) # include index of max + for i, j in zip(extrema[:-1], extrema[1:]) + if i[1] == False and j[1] == True and j[0]+1 - i[0] > 1] + # risingdeltas[-1][0] >= len(self.bcdeltas) + return risingdeltas + + def inflectionPoints(self) -> Tuple[List[int], List[float]]: + """ + in reversed index order! + adjusted approximation of the inflection points at rising edges of the smoothed bcd. + The approximation is that we are using the maximum delta of the unsmoothed bcd + in scope of the rising part of the graph. + + :return: The indices and values of the approximated inflections. + """ + inflpt = [ offset + int(numpy.nanargmax(wd)) for offset, wd in self.risingDeltas() ] + inflvl = [ self.bcdeltas[::-1][pkt] for pkt in inflpt ] + return inflpt, inflvl + + def bcHighPlateaus(self): + """ + in reversed index order! + :return: Plateaus in the bit congruence at high level (> 0.8) + """ + plateauElevation = 0.8 + plat = MessageAnalyzer.plateouStart(self.bitcongruences[::-1]) + + # filter for plateaus of high bit congruence + hiPlat = ([], []) + for ix, vl in zip(plat[0], plat[1]): + if vl > plateauElevation: + hiPlat[0].append(ix) + hiPlat[1].append(vl) + return hiPlat class BitCongruence2ndDelta(BitCongruenceDelta): @@ -440,6 +583,77 @@ def values(self): return [0.0] * self.startskip + self._ngramMean +class BitCongruenceNgramStd(BitCongruence): + """ + Standard deviation of bit congruences for all bits within ngrams. + """ + _n = None + _ngramVar = list() + + def setAnalysisParams(self, n: Union[int, Tuple[int]]): + self._n = int(n) if not isinstance(n, tuple) else int(n[0]) + self._startskip = self._n + + + def analyze(self): + """ + not unit-dependant + + deviation of bit congruence within ngrams + + :return: + """ + if not self._n: + raise ParametersNotSet('Analysis parameter missing: N-gram size ("n").') + from ..utils.baseAlgorithms import ngrams + + super().analyze() + self._ngramVar = [float(numpy.std(bcn)) for bcn in ngrams(self._values, self._n)] + + + @property + def values(self): + if self._ngramVar is None: + return None + return [0.0] * self.startskip + self._ngramVar + + + def messageSegmentation(self): + """ + + :return: Segmentation of this message based on this analyzer's type. 
+ """ + raise NotImplementedError('Unfinished implementation.') + + if not self._ngramVar: + if not self._analysisArgs: + raise ValueError('No values or analysis parameters set.') + self.analyze() + + # TODO segmentation based on areas of similar congruence (not border detection): + # factor * std(message) < abs(std(3gram_n) - std(3gram_(n-1))) -> cut segment + # factor = 1 for now + + # find a threshold factor + min(self._ngramVar) + max(self._ngramVar) + + # prevent 1 byte segments, since they do not contain usable congruence! + cutCandidates = [0] + [int(b) for b in bclmins] + [len(self._message.data)] # add the message end + cutPositions = [0] + [right for left, right in zip( + cutCandidates[:-1], cutCandidates[1:] + ) if right - left > 1] + if cutPositions[-1] != cutCandidates[-1]: + cutPositions[-1] = cutCandidates[-1] + + segments = list() + for lmaxCurr, lmaxNext in zip(cutPositions[:-1], cutPositions[1:]): + segments.append(MessageSegment(self, lmaxCurr, lmaxNext-lmaxCurr)) + return segments + + # TODO Areas of similarity: may also be feasible for bc, hbc, sliding2means, deltaProgression + + class PivotBitCongruence(BitCongruence): """ Repeatedly cut the message(segments) in half, calculate the mean/variance of bit congruence for each half, @@ -451,13 +665,11 @@ class PivotBitCongruence(BitCongruence): that some messages get segmented arbitrarily deep, while others with clearly visible structure are not segmented at all. In this design the analysis is unsuitable. - Alternative: + Fixed Pivot Results: ============ Slide the pivot positions over the whole message (-segment) and use the one maximizing difference in the segments' congruence to recurse. - Results: - ======== Works comparatively well for DNS, is awful for ntp and dhcp. With the same parameters (different fixed and weighted threshold calculation strategies, fixed and weighted pivot selection condition), there is no correlation between fields and segment splits. Some areas of the message are too @@ -478,7 +690,7 @@ def setAnalysisParams(self, args): def analysisParams(self): return self._meanThreshold, - def _recursivePivotMean(self, segment: MessageSegment): + def _recursiveFixedPivotMean(self, segment: MessageSegment): """ Recursively split the segment in half, calculate the mean for the values of each of the two resulting sub-segments, and compare each of them to the original segments mean. 
If a sub-segment is sufficiently @@ -500,11 +712,11 @@ def _recursivePivotMean(self, segment: MessageSegment): # test for recursion conditions returnSegments = list() if abs(leftSegment.mean() - mymean) > self._meanThreshold: # still different - returnSegments.extend(self._recursivePivotMean(leftSegment)) + returnSegments.extend(self._recursiveFixedPivotMean(leftSegment)) else: returnSegments.append(leftSegment) if abs(rightSegment.mean() - mymean) > self._meanThreshold: # still different - returnSegments.extend(self._recursivePivotMean(rightSegment)) + returnSegments.extend(self._recursiveFixedPivotMean(rightSegment)) else: returnSegments.append(rightSegment) # if abs(lsm - rsm) > .1: # still different @@ -512,10 +724,8 @@ def _recursivePivotMean(self, segment: MessageSegment): else: return [segment] - def messageSegmentation(self) -> List[MessageSegment]: - - segments = self._recursivePivotVar(MessageSegment(BitCongruence(self.message), 0, len(self._message.data))) + segments = self._recursiveDynamicSlidedMean(MessageSegment(BitCongruence(self.message), 0, len(self._message.data))) sortedSegments = sorted(segments, key=lambda x: x.offset) # varPerSeg = list() # for segment in sortedSegments: @@ -532,11 +742,9 @@ def messageSegmentation(self) -> List[MessageSegment]: input('next message: ') return sortedSegments - __debug = False - - def _recursivePivotVar(self, segment: MessageSegment): + def _recursiveDynamicPivotStd(self, segment: MessageSegment): """ Recursively split the segment at positions shifting from 2 to n-2, calculate the standard deviation for the values of each of the two resulting sub-segments, and compare each of them to the original segments deviation. @@ -584,7 +792,7 @@ def _recursivePivotVar(self, segment: MessageSegment): if abs(leftSegment.stdev() - myvar) > weightedThresh: # still different if self.__debug: print('split left', leftSegment.offset) - returnSegments.extend(self._recursivePivotVar(leftSegment)) + returnSegments.extend(self._recursiveDynamicPivotStd(leftSegment)) else: if self.__debug: print('left finished', abs(rightSegment.stdev() - myvar)) @@ -592,7 +800,7 @@ def _recursivePivotVar(self, segment: MessageSegment): if abs(rightSegment.stdev() - myvar) > weightedThresh: # still different if self.__debug: print('split right', rightSegment.offset) - returnSegments.extend(self._recursivePivotVar(rightSegment)) + returnSegments.extend(self._recursiveDynamicPivotStd(rightSegment)) else: if self.__debug: print('right finished', abs(rightSegment.stdev() - myvar)) @@ -603,6 +811,73 @@ def _recursivePivotVar(self, segment: MessageSegment): else: return [segment] + def _recursiveDynamicSlidedMean(self, segment: MessageSegment): + """ + Recursively split the segment at positions shifting from 2 to n-2, calculate the mean for the + values of each of the two resulting sub-segments, and compare each of them to the original segment's mean. + If a sub-segment is sufficiently different from its parent + (meanThreshold = 0.5 parentvar * min(len(vl), len(vr))/(len(vl) + len(vr))) further split the sub-segment. + + :param segment: One message segment that should be segmented. + :return: List of segments after the splitting. 
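+
+        Example (illustrative): splitting a 10-byte segment at pivot 3 gives a length weight
+        of 2 * min(3, 7) / 10 = 0.6, so pivots near the segment borders are down-weighted
+        against central ones when selecting the split position.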
+ """ + + if not segment.values: + segment.analyzer.analyze() + parentMean = segment.mean() + + if segment.length >= 4: # we need two bytes for each segment to get a bit congruence of them + + # select a suitable pivot: find the one yielding the highest deviation-difference from parent + segmentSplit = dict() + for pivot in range(2, segment.length-1): + leftSegment = MessageSegment(segment.analyzer, segment.offset, pivot) + rightSegment = MessageSegment(segment.analyzer, segment.offset + pivot, segment.length - pivot) + # deviation needs to be higher towards the edges to be a probable splitting point + lenweight = 2 * min(leftSegment.length, rightSegment.length) / segment.length + # add splits: varDiff: (leftSegment, rightSegment) + segmentSplit[abs(leftSegment.mean() - rightSegment.mean()) * lenweight] \ + = (leftSegment, rightSegment) + + if self.__debug: + from tabulate import tabulate + print(tabulate(sorted([(wlrdiff, ls.offset, ls.mean(), rs.offset, rs.mean(), rs.offset + rs.length) + for wlrdiff, (ls, rs) in segmentSplit.items()], key=lambda x: x[0]), headers=[ + 'wlrdiff', 'l.o', 'lmean', 'r.o', 'rmean', 'r.b'])) #abs(x[3] - x[4]) + + # use the segments splitted at selected pivot: search max varDiff in splits + splitdiffmax = max(segmentSplit.keys()) + leftSegment, rightSegment = segmentSplit[splitdiffmax] + # weightedThresh = 0.5 * parentMean * min(leftSegment.length, rightSegment.length) / segment.length + weightedThresh = self._meanThreshold * parentMean + if self.__debug: + print('parent segment mean:', parentMean) + print('weighted threshold:', weightedThresh) + + # test for recursion conditions: recurse if above weightedThresh + returnSegments = list() + if abs(leftSegment.mean() - parentMean) > weightedThresh: # still different + if self.__debug: + print('split left', leftSegment.offset) + returnSegments.extend(self._recursiveDynamicSlidedMean(leftSegment)) + else: + if self.__debug: + print('left finished', abs(rightSegment.mean() - parentMean)) + returnSegments.append(leftSegment) + if abs(rightSegment.mean() - parentMean) > weightedThresh: # still different + if self.__debug: + print('split right', rightSegment.offset) + returnSegments.extend(self._recursiveDynamicSlidedMean(rightSegment)) + else: + if self.__debug: + print('right finished', abs(rightSegment.mean() - parentMean)) + returnSegments.append(rightSegment) + + # if abs(lsm - rsm) > .1: # still different + return returnSegments + else: + return [segment] + class SlidingNmeanBitCongruence(BitCongruence): """ @@ -651,7 +926,7 @@ class SlidingNbcDelta(SlidingNmeanBitCongruence): ==== A difference quotient of n > 1 (8, 6, 4) may show regularly recurring 0s for consecutive fields - of equal length ant type. + of equal length and type. 
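+
+    A sketch of such a higher-order difference quotient over the analysis values
+    (illustrative; it mirrors the commented-out variant in analyze()):
+
+        import numpy
+        values = [0.1, 0.5, 0.4, 0.4, 0.9, 0.2, 0.3, 0.3, 0.8, 0.7]
+        quotient = numpy.divide(numpy.diff(values, n=8), 8)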
""" def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): super().__init__(message, unit) @@ -668,6 +943,9 @@ def analyze(self): # self._values = numpy.ediff1d(self._values).tolist() + [numpy.nan] # self._values = numpy.divide(numpy.diff(self._values, n=8), 8).tolist() + @property + def values(self): + return super().values + [numpy.nan] class SlidingNbcDeltaGauss(SlidingNbcDelta): """ @@ -677,8 +955,8 @@ def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): super().__init__(message, unit) self._bcvalues = None self._sensitivity = 0.5 - self._startskip += 1 """Sensitivity threshold for the smoothed extrema.""" + self._startskip += 1 def setAnalysisParams(self, horizon=2, sigma=1.5): self._analysisArgs = (horizon, sigma) @@ -823,6 +1101,21 @@ def analyze(self): class HorizonBitcongruence(BitCongruence): """ This is already the DELTA between the mean of the BC of 2 bytes to the left of n and the BC at n. + + >>> from nemere.validation.dissectorMatcher import MessageComparator + >>> from nemere.utils.loader import SpecimenLoader + >>> from nemere.inference.analyzers import * + >>> specimens = SpecimenLoader("../input/maxdiff-fromOrig/smb_SMIA20111010-one-rigid1_maxdiff-100.pcap", + ... relativeToIP=True, layer=2) + >>> # input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap + >>> comparator = MessageComparator(specimens, relativeToIP=True, layer=2) + >>> l4, rm = next(iter(comparator.messages.items())) + >>> analyzer = MessageAnalyzer.findExistingAnalysis(HorizonBitcongruence, MessageAnalyzer.U_BYTE, l4, (2,)) + >>> a = [] + >>> for l4,rm in comparator.messages.items(): + ... a.append((len(l4.data), comparator.fieldEndsPerMessage(rm)[-1])) + + """ def setAnalysisParams(self, horizon): if isinstance(horizon, tuple): @@ -1035,9 +1328,9 @@ def analyze(self): """ if not self._am: raise ParametersNotSet('Analysis method missing.') - results = self._am.values + results = self._am.valuesRaw correlation = numpy.correlate(results, results, 'full') - self._values = correlation.tolist() + self._values = [numpy.nan] * self._am.startskip + correlation.tolist() super().analyze() @@ -1076,143 +1369,87 @@ def analyze(self): self._values = numpy.gradient(self._values).tolist() -class CumulatedProgressionDelta(CumulatedValueProgression): +class EntropyWithinNgrams(MessageAnalyzer): """ - Difference quotient (forward finite difference, h=1) for all values. - - Alternative Idea - ==== - - A difference quotient of n > 1 (8, 6, 4) may show regularly recurring 0s for consecutive fields - of equal length ant type. + Calculates the entropy of each message ngrams based on an alphabet of bytes or nibbles (4 bit). """ - def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): - super().__init__(message, unit) - self._startskip += 1 @property def domain(self): - return 0, 255 if self.unit == MessageAnalyzer.U_BYTE else 128 - - def analyze(self): - super().analyze() - self._values = numpy.ediff1d(self._values).tolist() - # self._values = numpy.divide(numpy.diff(self._values, n=8), 8).tolist() - - def messageSegmentation(self) -> List[MessageSegment]: - """ - produces very bad/unusable results. - - :return: - """ - if not self.values: - self.analyze() - - # sudden drop (inversion?) in progression delta steepness. 
- sc = self.steepChanges(.3) # TODO iterate best value - - cutat = numpy.add(sorted(set(sc)), self._startskip).tolist() - if len(cutat) == 0 or cutat[0] != 0: - cutat = [0] + cutat - if len(cutat) == 0 or cutat[-1] != len(self._message.data): - cutat = cutat + [len(self._message.data)] # add the message end - - - segments = list() - for cutCurr, cutNext in zip(cutat[:-1], cutat[1:]): - segments.append(MessageSegment(self, cutCurr, cutNext-cutCurr)) - return segments - - def steepChanges(self, epsfact: float=.1): - """ - From the top of the value range to the bottom of the value range directly. - - :param epsfact: value deviation towards the middle considered to be at the limits of the value range. - :return: - """ - if epsfact > 1 or epsfact <= 0: - raise ValueError('epsilon factor for allowed range deviation is below 0 or above 1.') - - vmin = min(self._values) - vmax = max(self._values) - epsilon = (vmax - vmin) * epsfact - emin = vmin + epsilon - emax = vmax - epsilon - return [ ix - for ix, (vl, vr) in enumerate(zip(self._values[:-1], self._values[1:])) - if vl > emax and vr < emin] - - -class CumulatedProgression2ndDelta(CumulatedValueProgression): - """ - 2nd order difference quotient (forward finite difference, h=1) for all values. + from math import log + return 0, log(len(self._message.data) - self._n + 1, 2) - Field boundaries have no obvious property in this 2nd order difference quotient (NTP/DNS). - """ def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): super().__init__(message, unit) - self._startskip += 1 + self._n = None - @property - def domain(self): - extrema = 255 if self.unit == MessageAnalyzer.U_BYTE else 128 - return -extrema, extrema + def setAnalysisParams(self, n: Union[int, Tuple[int]]): + self._n = int(n if not isinstance(n, tuple) else n[0]) + self._startskip = self._n def analyze(self): - super().analyze() - self._values = [x2-2*x1+x0 for x2,x1,x0 in zip(self._values[2:], self._values[1:-1], self._values[:-2])] - # self._values = numpy.divide(numpy.diff(self._values, n=8), 8).tolist() + ngramEntropies = list() + for gram in [gram for gram in self.ngrams(self._n)]: + if self._unit == MessageAnalyzer.U_NIBBLE: + tokens = MessageAnalyzer.nibblesFromBytes(gram) + else: + tokens = gram + ngramEntropies.append(MessageAnalyzer.calcEntropy(tokens)) # should work for bytes + self._values = ngramEntropies -class ValueProgression(MessageAnalyzer): - @property - @abstractmethod - def domain(self): - return super().domain - @abstractmethod - def analyze(self): - super().analyze() +class ValueVariance(MessageAnalyzer): + """ + Shows the difference between subsequent values. + The early analyzer ValueProgressionDelta, i. e., the differential value progression, is the inverse of + ValueVariance: ValueVariance == inverted (minus) ValueProgressionDelta. + We removed ValueProgressionDelta to prevent confusion. -class ValueProgressionDelta(ValueProgression): - """ - Differential value progression. Shows the difference between subsequent values. + LOL. ValueVariance == CumulatedProgression2ndDelta. :-D + + Field boundaries have no obvious property in this 2nd order difference quotient (NTP/DNS). 
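+
+    Conceptually, this is the first-order difference of the byte values (a sketch, not the
+    exact MessageAnalyzer.tokenDelta helper used in analyze()):
+
+        import numpy
+        numpy.ediff1d(list(b'\x10\x20\x15')).tolist()  # [16, -11]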
""" def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): super().__init__(message, unit) self._startskip = 1 + self._analysisArgs = (200,) @property def domain(self): extrema = 255 if self.unit == MessageAnalyzer.U_BYTE else 128 return -extrema, extrema - def analyze(self): - valueprogression = [] - if len(self._message.data) < 2: - raise ValueError("Needs at least two tokens to determine a value progression. Message is {}".format( - self._message.data)) + def setAnalysisParams(self, steepness=(200,)): + """ + 200 is a decent tradeoff. + A larger value (220) benefits only NTP. + Other protocols rather benefit from lower values (150). - if self._unit == MessageAnalyzer.U_NIBBLE: - tokens = self.nibblesFromBytes(self._message.data) + see ScoreStatistics-VD-steepness.ods + """ + if isinstance(steepness, tuple): + self._analysisArgs = steepness else: - tokens = self._message.data + self._analysisArgs = (int(steepness),) - prev = 0 - for tokenA, tokenB in zip(tokens[1:], tokens[:-1]): - tokenDiff = tokenB - tokenA - prev += tokenDiff - valueprogression.append(tokenDiff) - self._values = valueprogression + @property + def steepness(self): + return self._analysisArgs[0] + def analyze(self): + """ + Relative variance of single message bytes. + """ + self._values = MessageAnalyzer.tokenDelta(list(self._message.data), self._unit) def messageSegmentation(self) -> List[MessageSegment]: if not self.values: self.analyze() - # value drop or rise more than 200 (?) in one step, split at highest abs(value) - sc = self.steepChanges(200) # TODO iterate best value + # value drop or rise more than steepness threshold in one step, split at highest abs(value) + sc = self.steepChanges() # and value drop to or rise from 0, split at the non-zero value zb = self.zeroBorders() @@ -1228,18 +1465,15 @@ def messageSegmentation(self) -> List[MessageSegment]: segments.append(MessageSegment(self, cutCurr, cutNext-cutCurr)) return segments - - def steepChanges(self, steepness: int): + def steepChanges(self): """ value drop or rise more than steepness in one step, split at highest abs(value) - :param steepness: :return: """ return [ ix if abs(vl) > abs(vr) else ix+1 for ix, (vl, vr) in enumerate(zip(self._values[:-1], self._values[1:])) - if abs(vr-vl) > steepness] - + if abs(vr-vl) > self.steepness] def zeroBorders(self): """ @@ -1251,54 +1485,6 @@ def zeroBorders(self): for ix, (vl, vr) in enumerate(zip(self._values[:-1], self._values[1:])) if (vr == 0) != (vl == 0)] - -class EntropyWithinNgrams(MessageAnalyzer): - """ - Calculates the entropy of each message ngrams based on an alphabet of bytes or nibbles (4 bit). 
- """ - - @property - def domain(self): - from math import log - return 0, log(len(self._message.data) - self._n + 1, 2) - - def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): - super().__init__(message, unit) - self._n = None - - def setAnalysisParams(self, n: Union[int, Tuple[int]]): - self._n = int(n if not isinstance(n, tuple) else n[0]) - self._startskip = self._n - - def analyze(self): - ngramEntropies = list() - for gram in [gram for gram in self.ngrams(self._n)]: - if self._unit == MessageAnalyzer.U_NIBBLE: - tokens = MessageAnalyzer.nibblesFromBytes(gram) - else: - tokens = gram - - ngramEntropies.append(MessageAnalyzer.calcEntropy(tokens)) # should work for bytes - self._values = ngramEntropies - - -class ValueVariance(MessageAnalyzer): - @property - def domain(self): - extrema = 255 if self.unit == MessageAnalyzer.U_BYTE else 128 - return -extrema*2, extrema*2 - - def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE): - super().__init__(message, unit) - self._startskip = 1 - - def analyze(self): - """ - Relative variance of single message bytes. - """ - self._values = MessageAnalyzer.tokenDelta(list(self._message.data), self._unit) - - class VarianceAmplitude(MessageAnalyzer): @property def domain(self): @@ -1363,10 +1549,17 @@ def mostFrequent(self): class Value(MessageAnalyzer): """ Simply returns the byte values of the message. + + LOL. this is CumulatedProgressionDelta == ValueProgression == Value. :-D + + Alternative Idea + ==== + A difference quotient of n > 1 (8, 6, 4) may show regularly recurring 0s for consecutive fields + of equal length ant type. """ @property def domain(self): - return 0,255 + return 0, 255 if self.unit == MessageAnalyzer.U_BYTE else 128 def analyze(self): """ @@ -1381,10 +1574,53 @@ def values(self): else: return MessageAnalyzer.nibblesFromBytes(self.message.data) + def messageSegmentation(self) -> List[MessageSegment]: + """ + produces very bad/unusable results. + + :return: + """ + if not self.values: + self.analyze() + + # sudden drop (inversion?) in progression delta steepness. + sc = self.steepChanges(.3) # TODO iterate best value + + cutat = numpy.add(sorted(set(sc)), self._startskip).tolist() + if len(cutat) == 0 or cutat[0] != 0: + cutat = [0] + cutat + if len(cutat) == 0 or cutat[-1] != len(self._message.data): + cutat = cutat + [len(self._message.data)] # add the message end + + segments = list() + for cutCurr, cutNext in zip(cutat[:-1], cutat[1:]): + segments.append(MessageSegment(self, cutCurr, cutNext-cutCurr)) + return segments + + def steepChanges(self, epsfact: float=.1): + """ + From the top of the value range to the bottom of the value range directly. + + :param epsfact: value deviation towards the middle considered to be at the limits of the value range. + :return: + """ + if epsfact > 1 or epsfact <= 0: + raise ValueError('epsilon factor for allowed range deviation is below 0 or above 1.') + + vmin = min(self.values) + vmax = max(self.values) + epsilon = (vmax - vmin) * epsfact + emin = vmin + epsilon + emax = vmax - epsilon + return [ ix + for ix, (vl, vr) in enumerate(zip(self.values[:-1], self.values[1:])) + if vl > emax and vr < emin] + class Entropy(SegmentAnalyzer): """ - Calculates the entropy of each message ngrams based on an alphabet of bytes or nibbles (4 bit). + Calculates the entropy of each message segment based on the alphabet of bytes or nibbles (4 bit) in this segment. 
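+
+    A sketch of the Shannon entropy over a token sequence (illustrative only, not the exact
+    helper used by this class):
+
+        from collections import Counter
+        from math import log2
+        tokens = b'aab'
+        entropy = -sum(c / len(tokens) * log2(c / len(tokens))
+                       for c in Counter(tokens).values())  # about 0.918 bits
+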
+ This analyzer calculates the entropy of an already existing segment (subclass of SegmentAnalyzer) and is not a segmenter! """ def value(self, start, end): if self._unit == MessageAnalyzer.U_NIBBLE: diff --git a/src/nemere/inference/fieldTypes.py b/src/nemere/inference/fieldTypes.py new file mode 100644 index 00000000..f1fc007b --- /dev/null +++ b/src/nemere/inference/fieldTypes.py @@ -0,0 +1,188 @@ +from typing import Type, Union, Any, Tuple, Iterable + +import numpy +import scipy.spatial + +from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage + +from nemere.inference.analyzers import Value +from nemere.inference.segments import MessageAnalyzer + + + + +class BaseTypeMemento(object): + """ + Base class providing the means to identify a field type by a name and ID. + """ + def __init__(self, fieldtype: str, length = None): + # data type this field represents + self._fieldtype = fieldtype + self.__length = length + + @property + def fieldtype(self): + return self._fieldtype + + def __len__(self): + if self.__length is None: + raise ValueError("Call of len() on a BaseTypeMemento without this property.") + return self.__length + + @property + def typeID(self, short=True): + """ + :param short: Use only the last half (4 bytes) of the hash + :return: As an identifier use the hash of the fieldtype value + """ + tid = "{:02x}".format(hash(self.fieldtype)) + return tid[-8:] if short else tid + + def __repr__(self): + return "FieldTypeMemento " + self.typeID + " for " + self.fieldtype + + +class FieldTypeMemento(BaseTypeMemento): + """ + Class to help persisting field type characteristics from a FieldTypeTemplate represented by mean and covariance. + Contains methods to calculate the covariance matrix, mahalanobis distance to a given vector, and the "confidence" + of a positive match. + """ + + def __init__(self, mean: numpy.ndarray, stdev: numpy.ndarray, cov: numpy.ndarray, fieldtype: str, + analyzerClass: Type[MessageAnalyzer] = Value, analysisParams: Union[Any, Tuple] = None, + unit=MessageAnalyzer.U_BYTE): + super().__init__(fieldtype) + self._mean = mean + self._cov = cov + self._picov = None + self._stdev = stdev + # for reference: + self._analyzerClass = analyzerClass + self._analysisParams = analysisParams + self._unit = unit + + + 'from inference.templates import FieldTypeTemplate' + # noinspection PyUnresolvedReferences + @staticmethod + def fromTemplate(ftt: 'FieldTypeTemplate'): + ftm = FieldTypeMemento(ftt.mean, ftt.stdev, ftt.cov, ftt.fieldtype, + type(ftt.baseSegments[0].analyzer), ftt.baseSegments[0].analyzer.analysisParams, + ftt.baseSegments[0].analyzer.unit) + return ftm + + @property + def mean(self) -> numpy.ndarray: + return self._mean + + @property + def stdev(self) -> numpy.ndarray: + return self._stdev + + @property + def cov(self) -> numpy.ndarray: + """ + There is some rounding error so the stdev is not entierely identical to the diagonal of the covariance matrix. + + >>> from nemere.inference.templates import FieldTypeTemplate + >>> from nemere.utils.baseAlgorithms import generateTestSegments + >>> bs = generateTestSegments() + >>> ftt = FieldTypeTemplate(bs) + >>> # numpy.round(ftt.stdev, 8) == numpy.round(ftt.cov.diagonal(), 8) + >>> numpy.round(ftt.stdev, 8) + array([ 0. , 0. , 0. , 20.40067401, 31.70392545, + 0.49487166, 9.16292441]) + >>> numpy.round(ftt.cov.diagonal(), 8) + array([...e-0..., ...e-0..., ...e-0..., 5.54916667e+02, + 1.20616667e+03, 2.85714290e-01, 9.79523810e+01]) + + :return: The covariance matrix of the template. 
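# --- Editor's note, not part of this patch: the difference between stdev and
# cov.diagonal() observed in the doctest above is probably not only rounding.
# numpy.nanstd (used for _stdev) normalizes by the number N of non-NaN
# observations (ddof=0), whereas pandas DataFrame.cov (used for _cov)
# normalizes by N-1, and N differs per component because shorter segments are
# padded with NaNs. Illustrative check on a hypothetical padded component:
import numpy
col = numpy.array([1.0, 5.0, 9.0, numpy.nan])
n = numpy.count_nonzero(~numpy.isnan(col))
print(numpy.nanstd(col) ** 2)                # population variance (ddof=0)
print(numpy.nanvar(col, ddof=1))             # sample variance, as pandas cov uses
print(numpy.nanstd(col) ** 2 * n / (n - 1))  # relates the two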
+ """ + return self._cov + + @property + def picov(self) -> numpy.ndarray: + """ + Often cov is a singular matrix in our use case, so we use the approximate Moore-Penrose pseudo-inverse + from numpy. + (G. Strang, Linear Algebra and Its Applications, 2nd Ed., Orlando, FL, Academic Press, Inc., 1980, pp. 139-142.) + + :return: pseudo-inverse of the covariance matrix. + """ + if self._picov is None: + self._picov = numpy.linalg.pinv(self.cov) + return self._picov + + @property + def upper(self) -> numpy.ndarray: + return self._mean + self.stdev + + @property + def lower(self) -> numpy.ndarray: + return self._mean - self.stdev + + @property + def analyzerClass(self) -> Type[MessageAnalyzer]: + """ + :return: The type of the analyzer + """ + return self._analyzerClass + + def recreateAnalyzer(self, message: AbstractMessage) -> MessageAnalyzer: + """ + Recreate an analyzer of the type and configuration given in this memento instance. + + :param message: The message to create the analyzer for. + :return: The newly created analyzer instance. + """ + return MessageAnalyzer.findExistingAnalysis(self._analyzerClass, self._unit, message, self._analysisParams) + + def __len__(self): + return len(self._mean) + + @property + def typeID(self, short=True) -> str: + """ + :param short: Use only the last half (4 bytes) of the hash + :return: As an identifier use the hash of the mean values + """ + tid = "{:02x}".format(hash(tuple(self.mean))) + return tid[-8:] if short else tid + + @property + def codePersist(self) -> str: + """:return: Python code to persist this Memento""" + return "{}(numpy.array({}), numpy.array({}), numpy.array({}), '{}', {}, {}, {})".format( + type(self).__name__, self.mean.tolist(), self.stdev.tolist(), self.cov.tolist(), self._fieldtype, + self._analyzerClass.__name__, self._analysisParams, + "MessageAnalyzer.U_BYTE" if self._unit == MessageAnalyzer.U_BYTE else "MessageAnalyzer.U_NIBBLE") + + def mahalanobis(self, vector: Iterable[float]) -> numpy.ndarray: + """ + Compute the Mahalanobis distance between this fieldtype's mean and the given vector using the + covariance matrix contained in this object. + + Mahalanobis distance measures the distance of a vector from the mean in terms of the multivariate pendent to + the standard deviation: zotero + + :param vector: The vector of which the distance to the mean shall be calculated. + :return: The Mahalanobis distance between the field type mean and the given vector. + """ + return scipy.spatial.distance.mahalanobis(self.mean, vector, self.picov) + + def confidence(self, vector: Iterable[float]) -> numpy.ndarray: + """ + :param vector: A feature vector (e. g. byte values) + :return: The confidence that the given vector is of the field type represented by this memento. + Mostly this is equivalent to the mahalanobis distance between vector and FieldTypeMemento, but for + the fieldtype "id" the confidence is reduced by factor 2 (smaller value => higher confidence). 
+ """ + conf = self.mahalanobis(vector) + # TODO move to be a parameterizable property of the FieldTypeMemento class + # make ids twice as unconfident + if self.fieldtype == "id": + conf *= 2 + return conf + + diff --git a/src/nemere/inference/formatRefinement.py b/src/nemere/inference/formatRefinement.py index 7fd618cc..31ad5722 100644 --- a/src/nemere/inference/formatRefinement.py +++ b/src/nemere/inference/formatRefinement.py @@ -1,11 +1,13 @@ from abc import ABC, abstractmethod from typing import List +from bitstring import Bits +from pyitlib import discrete_random_variable as drv + from nemere.inference.segments import MessageSegment from nemere.inference.segmentHandler import isExtendedCharSeq - def isPrintableChar(char: int): if 0x20 <= char <= 0x7e or char in ['\t', '\n', '\r']: return True @@ -72,6 +74,21 @@ def isOverlapping(segA: MessageSegment, segB: MessageSegment) -> bool: return False +def entropyOfBytes(byteData: bytes, n=3): + bitData = Bits(bytes=byteData) + ngrams = [bitData[offset:offset+n].uint for offset in range(len(bitData)-n+1)] + return drv.entropy(ngrams)/n + + +def entropyOfXor(byteDataA: bytes, byteDataB: bytes, n=3): + bitDataA = Bits(bytes=byteDataA) + bitDataB = Bits(bytes=byteDataB) + trunc = min(len(bitDataA), len(bitDataB)) + xored = bitDataA[:trunc] ^ bitDataB[:trunc] + ngrams = [xored[offset:offset+n].uint for offset in range(len(xored)-n+1)] + return drv.entropy(ngrams)/n + + class MessageModifier(ABC): _debug = False @@ -108,10 +125,8 @@ def merge(self): mergedSegments.append(segr) return mergedSegments - - @staticmethod @abstractmethod - def condition(segl: MessageSegment, segr: MessageSegment) -> bool: + def condition(self, segl: MessageSegment, segr: MessageSegment) -> bool: """ A generic condition called to determine whether a merging is necessary. @@ -119,7 +134,7 @@ def condition(segl: MessageSegment, segr: MessageSegment) -> bool: :param segr: right segment :return: True if merging is required, False otherwise. """ - pass + raise NotImplementedError("A condition for merging needs to be defined by subclassing.") class MergeConsecutiveChars(Merger): @@ -144,8 +159,7 @@ class MergeConsecutiveChars(Merger): ... print("Mismatch!") """ - @staticmethod - def condition(segl: MessageSegment, segr: MessageSegment): + def condition(self, segl: MessageSegment, segr: MessageSegment): """ Check whether both segments consist of printable characters. 
""" @@ -217,6 +231,8 @@ def split(self): print("{} and {}".format(segc, segmentStack[-1] if segmentStack else 'Empty')) mangledSegments.append(segc) + else: + mangledSegments = self.segments return mangledSegments @staticmethod @@ -521,7 +537,7 @@ def split(self): class CumulativeCharMerger(MessageModifier): """ - Merge consecutive segments that toghether fulfill the char conditions in inference.segmentHandler.isExtendedCharSeq + Merge consecutive segments that together fulfill the char conditions in inference.segmentHandler.isExtendedCharSeq """ def merge(self): diff --git a/src/nemere/inference/segmentHandler.py b/src/nemere/inference/segmentHandler.py index 272df95b..93384ce9 100644 --- a/src/nemere/inference/segmentHandler.py +++ b/src/nemere/inference/segmentHandler.py @@ -9,8 +9,8 @@ from nemere.utils.loader import BaseLoader from nemere.inference.segments import MessageSegment, HelperSegment, TypedSegment, AbstractSegment -from nemere.inference.analyzers import MessageAnalyzer -from nemere.inference.templates import TypedTemplate +from nemere.inference.analyzers import MessageAnalyzer, Value +from nemere.inference.templates import AbstractClusterer, TypedTemplate def segmentMeans(segmentsPerMsg: List[List[MessageSegment]]): @@ -125,6 +125,24 @@ def segmentsFromLabels(analyzer, labels) -> Tuple[TypedSegment]: return tuple(segments) +def segmentsFromSymbols(symbols: List[Symbol]): + msgflds = [(msg,flds) for s in symbols for msg,flds in s.getMessageCells().items()] + segmentedMessages = [] + for msg,flds in msgflds: + analyzer = Value(msg) + msgSegs = [] + pointer = 0 + for bv in flds: + length = len(bv) + if length == 0: + continue + msgSegs.append(MessageSegment(analyzer, pointer, length)) + pointer += length + assert pointer == len(msg.data) + segmentedMessages.append(msgSegs) + return segmentedMessages + + def fixedlengthSegmenter(length: int, specimens: BaseLoader, analyzerType: type, analysisArgs: Union[Tuple, None], unit=MessageAnalyzer.U_BYTE, padded=False) \ -> List[Tuple[MessageSegment]]: @@ -137,7 +155,6 @@ def fixedlengthSegmenter(length: int, specimens: BaseLoader, >>> from nemere.inference.segmentHandler import fixedlengthSegmenter >>> specimens = SpecimenLoader("../input/deduped-orig/ntp_SMIA-20111010_deduped-100.pcap", 2, True) >>> comparator = MessageComparator(specimens, 2, True, debug=False) - Wait for tshark output (max 20s)... >>> segmentedMessages = fixedlengthSegmenter(4, specimens, Value, None) >>> areIdentical = True >>> for msgsegs in segmentedMessages: @@ -172,7 +189,7 @@ def fixedlengthSegmenter(length: int, specimens: BaseLoader, if len(l4msg.data) > lastOffset: # append the overlap if padded: # here are nasty hacks! - # TODO Better define a new subclass of MessageSegment that internally padds values + # TODO Better define a new subclass of MessageSegment that internally pads values # (and bytes? what are the guarantees?) 
to a given length that exceeds the message length residuepadd = lastOffset + length - len(l4msg.data) newMessage = copy.copy(originalAnalyzer.message) @@ -219,9 +236,10 @@ def segments2types(segments: Iterable[TypedSegment]) -> Dict[str, List[TypedSegm :return: A dict of fieldtype (str) : segments of this type (list) """ + from nemere.utils.evaluationHelpers import unknown typegroups = dict() for seg in segments: - fieldtype = seg.fieldtype if isinstance(seg, (TypedSegment, TypedTemplate)) else '[unknown]' + fieldtype = seg.fieldtype if isinstance(seg, (TypedSegment, TypedTemplate)) else unknown if fieldtype in typegroups: typegroups[fieldtype].append(seg) else: @@ -257,6 +275,38 @@ def bcDeltaGaussMessageSegmentation(specimens, sigma=0.6) -> List[List[MessageSe msgSeg.append(analyzer.messageSegmentation()) return msgSeg +def bcDeltaGaussMessageSegmentationLE(specimens, sigma=0.6) -> List[List[MessageSegment]]: + """ + Little Endian version of + Segment message by determining inflection points of gauss-filtered bit congruence deltas. + + >>> from nemere.utils.loader import SpecimenLoader + >>> sl = SpecimenLoader('../input/hide/random-100-continuous.pcap', layer=0, relativeToIP=True) + >>> segmentsPerMsg = bcDeltaGaussMessageSegmentationLE(sl) + Segmentation by inflections of sigma-0.6-gauss-filtered bit-variance for little endian. + >>> for spm in segmentsPerMsg: + ... if b''.join([seg.bytes for seg in spm]).hex() != spm[0].message.data.hex(): + ... print("Mismatch!") + + :return: Segmentation of the specimens in the pool. + """ + from nemere.inference.analyzers import BitCongruenceDeltaGaussLE + + print('Segmentation by inflections of sigma-{:.1f}-gauss-filtered bit-variance for little endian.'.format( + sigma + )) + msgSeg = list() + for l4msg, rmsg in specimens.messagePool.items(): + analyzer = BitCongruenceDeltaGaussLE(l4msg) + analyzer.setAnalysisParams(sigma) + analyzer.analyze() + msgSeg.append(analyzer.messageSegmentation()) + return msgSeg + + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # Start: Refinements # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # noinspection PyUnusedLocal def refinements(segmentsPerMsg: List[List[MessageSegment]], **kwargs) -> List[List[MessageSegment]]: @@ -293,7 +343,6 @@ def baseRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> List[ charSplited = refine.ResplitConsecutiveChars(charsMerged).split() refinedPerMsg.append(charSplited) - # for tests use test_segment-refinements.py moco = refine.CropDistinct.countCommonValues(refinedPerMsg) newstuff = list() for msg in refinedPerMsg: @@ -304,6 +353,8 @@ def baseRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> List[ return newstuff + + def nemetylRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> List[List[MessageSegment]]: """ Refine the segmentation using specific improvements for the feature: @@ -323,7 +374,6 @@ def nemetylRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> Li charSplited = refine.ResplitConsecutiveChars(charsMerged).split() refinedPerMsg.append(charSplited) - # for tests use test_segment-refinements.py moco = refine.CropDistinct.countCommonValues(refinedPerMsg) newstuff = list() for msg in refinedPerMsg: @@ -344,6 +394,8 @@ def charRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> 
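# --- Editor's note: illustrative sketch, not part of this patch. ---
# bcDeltaGaussMessageSegmentation(LE) above smooths the per-byte bit-congruence
# delta with a Gaussian filter and places boundaries at characteristic points
# of the smoothed curve. A rough stand-alone approximation that marks local
# extrema of a smoothed delta sequence (the real analyzer's boundary rules
# differ in detail):
import numpy
from scipy.ndimage import gaussian_filter1d

def smoothed_extrema(deltas, sigma=0.6):
    smoothed = gaussian_filter1d(numpy.asarray(deltas, float), sigma)
    signs = numpy.sign(numpy.diff(smoothed))
    # positions where the slope of the smoothed curve changes sign
    return (numpy.where(numpy.diff(signs) != 0)[0] + 1).tolist()

print(smoothed_extrema([0, 1, 5, 1, 0, 0, 4, 6, 2, 0]))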
List[ * frequency reinforced segments (CropDistinct) and * splitting of first segment (SplitFixed) + Note: This refinement alone actually performes considerably worse than originalRefinements! + :param segmentsPerMsg: a list of one list of segments per message. :return: refined segments in list per message """ @@ -357,12 +409,17 @@ def charRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> List[ charsMerged = refine.MergeConsecutiveChars(msg).merge() charSplited = refine.ResplitConsecutiveChars(charsMerged).split() refinedPerMsg.append(charSplited) + # assert correct result + if msg[0].offset != charSplited[0].offset or msg[-1].nextOffset != charSplited[-1].nextOffset: + raise RuntimeError("Segment bytes where lost!") - # for tests use test_segment-refinements.py newstuff = list() for msg in refinedPerMsg: charmerged = refine.CumulativeCharMerger(msg).merge() newstuff.append(charmerged) + # assert correct result + if msg[0].offset != charmerged[0].offset or msg[-1].nextOffset != charmerged[-1].nextOffset: + raise RuntimeError("Segment bytes where lost!") return newstuff @@ -433,6 +490,93 @@ def matrixFromTpairs(distances: Iterable[Tuple[T,T,float]], segmentOrder: Sequen return simtrx +def segments2clusteredTypes(clusterer: AbstractClusterer, analysisTitle: str, + singularTemplates=True, charSegments:List[AbstractSegment]=None) \ + -> List[Tuple[str, List[Tuple[str, List[Tuple[str, TypedSegment]]]]]]: + """ + Cluster segments according to the distance of their feature vectors. + Keep and label segments classified as noise. + + :param clusterer: Clusterer object that contains all the segments to be clustered + :param analysisTitle: the string to be used as label for the result + :param singularTemplates: Flag to separate singular values into their own templates. + :param charSegments: list of char segments if they should not be clustered regularly. + :return: List/Tuple structure of annotated analyses, clusters, and segments. + List [ of + Tuples ( + "analysis label", + List [ of cluster + Tuples ( + "cluster label", + List [ of segment + Tuples ( + "segment label (e. g. 
field type)", + MessageSegment object + ) + ] + ) + ] + ) + ] + """ + from math import log + from .templates import Template + print("Clustering segments...") + noise, *clusters = clusterer.clusterSimilarSegments(False) + + if charSegments is not None and len(charSegments) > 0: + clusters.append(charSegments) + + # TODO handle separately + if singularTemplates: + # extract "large" templates from noise that should rather be its own cluster + for idx, seg in reversed(list(enumerate(noise.copy()))): # type: int, MessageSegment + freqThresh = log(len(clusterer.segments)) + if isinstance(seg, Template): + if len(seg.baseSegments) > freqThresh: + clusters.append([noise.pop(idx)]) # .baseSegments + + print("{} clusters generated from {} segments".format(len(clusters), len(clusterer.segments))) + + segmentClusters = list() + segLengths = set() + numNoise = len(noise) + if numNoise > 0: + noiseSegLengths = {seg.length for seg in noise} + outputLengths = [str(slen) for slen in noiseSegLengths] + if len(outputLengths) > 5: + outputLengths = outputLengths[:2] + ["..."] + outputLengths[-2:] + segLengths.update(noiseSegLengths) + noisetypes = {t: len(s) for t, s in segments2types(noise).items()} + segmentClusters.append(('{} ({} bytes), Noise: {} Seg.s'.format( + analysisTitle, " ".join(outputLengths), numNoise), + [("{}: {} Seg.s".format(cseg.fieldtype, noisetypes[cseg.fieldtype]), cseg) + for cseg in noise] )) # '' + for cnum, segs in enumerate(clusters): + clusterDists = clusterer.distanceCalculator.distancesSubset(segs) + typegroups = segments2types(segs) + clusterSegLengths = {seg.length for seg in segs} + outputLengths = [str(slen) for slen in clusterSegLengths] + if len(outputLengths) > 5: + outputLengths = outputLengths[:2] + ["..."] + outputLengths[-2:] + segLengths.update(clusterSegLengths) + + mostFrequentTypes = sorted(((ftype, len(tsegs)) for ftype, tsegs in typegroups.items()), key=lambda x: -x[1]) + + segmentGroups = ('{} ({} bytes), Cluster #{} ({:.2f} {}): {} Seg.s ($d_{{max}}$={:.3f})'.format( + analysisTitle, " ".join(outputLengths), + cnum, mostFrequentTypes[0][1]/sum(s for t, s in mostFrequentTypes), mostFrequentTypes[0][0], len(segs), clusterDists.max()), list()) + for ftype, tsegs in typegroups.items(): # [label, segment] + segmentGroups[1].extend([("{}: {} Seg.s".format(ftype, len(tsegs)), tseg) for tseg in tsegs]) + segmentClusters.append(segmentGroups) + + segmentClusters = [ ( '{} ({} bytes) {}'.format(analysisTitle, + next(iter(segLengths)) if len(segLengths) == 1 else 'mixedamount', + clusterer if clusterer else 'n/a'), + segmentClusters) ] + return segmentClusters + + def filterSegments(segments: Iterable[MessageSegment]) -> List[MessageSegment]: """ Filter input segment for only those segments that are adding relevant information for further analysis: diff --git a/src/nemere/inference/segments.py b/src/nemere/inference/segments.py index 726cbd2f..25b0e304 100644 --- a/src/nemere/inference/segments.py +++ b/src/nemere/inference/segments.py @@ -5,8 +5,7 @@ from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage - - +MA = TypeVar('MA', bound='MessageAnalyzer') ### MessageAnalyzer base class ######################################### @@ -103,8 +102,8 @@ def analyze(self): @staticmethod - def findExistingAnalysis(analyzerclass: type, unit: int, - message: AbstractMessage, analysisArgs: Union[Any, Tuple]=None) -> 'MessageAnalyzer': + def findExistingAnalysis(analyzerclass: Type[MA], unit: int, + message: AbstractMessage, analysisArgs: Union[Any, Tuple]=None) 
-> MA: """ Efficiently obtain an analyzer by looking for an already existing identical object instance. @@ -119,7 +118,7 @@ def findExistingAnalysis(analyzerclass: type, unit: int, if keytuple in MessageAnalyzer._analyzerCache: return MessageAnalyzer._analyzerCache[keytuple] else: - ac = analyzerclass(message, unit) # type: MessageAnalyzer + ac = analyzerclass(message, unit) # type: MA if analysisArgs is None: analysisArgs = tuple() try: @@ -720,7 +719,7 @@ def mean(self): """ if self.values is None: raise ValueError('Value of MessageSegment instance must be set to calculate its mean.') - return numpy.mean(self.values) + return numpy.nanmean(self.values) def stdev(self): @@ -730,7 +729,7 @@ def stdev(self): """ if self.values is None: raise ValueError('Value of MessageSegment instance must be set to calculate its standard deviation.') - return numpy.std(self.values) + return numpy.nanstd(self.values) def fillCandidate(self, candidate: Union['MessageSegment', AbstractMessage]): diff --git a/src/nemere/inference/series.py b/src/nemere/inference/series.py new file mode 100644 index 00000000..cf12c470 --- /dev/null +++ b/src/nemere/inference/series.py @@ -0,0 +1,83 @@ +import numpy + + +class AnalysisSeries(dict): + """ + Class to hold multiple analysis results related to the same message or segment. + """ + # segments = dict() # type: List[MessageSegment] + + ID = 'id' + FEATURE = 'feature' + CANDIDATE = 'candidate' + CORRELATE = 'correlation' + + # def __init__(self, **kwargs): + # dict.__init__(self, **kwargs) + + @staticmethod + def fromlist(correlation): + """ + + :param correlation: dict with keys defined in the constants: ID, FEATURE, CANDIDATE, CORRELATE + :return: + """ + import humanhash + + return AnalysisSeries({ humanhash.humanize('{:02x}'.format(ser[AnalysisSeries.ID])): ser for ser in correlation }) + + + def cand(self, humhash): + """ + convenience method + + :param humhash: + :return: values of candidate of analysis series for human hash humhash + at the correlated position and the length of the feature + """ + position = numpy.argmax(self.corr(humhash)) + length = len(self.feat(humhash)) + return self[humhash][AnalysisSeries.CANDIDATE].values[position:position+length] + + + def candFull(self, humhash): + """ + convenience method + + :param humhash: + :return: values of feature of analysis series at human hash humhash + """ + return self[humhash][AnalysisSeries.CANDIDATE].values + + + def feat(self, humhash): + """ + convenience method + + :param humhash: + :return: values of feature of analysis series at human hash humhash + """ + return self[humhash][AnalysisSeries.FEATURE].values + + + def shiftFeature(self, humhash): + """ + values of feature of analysis series at human hash humhash + + :param humhash: + :return: + """ + fshift = numpy.mean(self.cand(humhash)) - numpy.mean(self.feat(humhash)) + return numpy.add(self.feat(humhash), fshift) + + + def corr(self, humhash): + """ + convenience method + + :param humhash: + :return: values of correlation of analysis series at human hash humhash + """ + return self[humhash][AnalysisSeries.CORRELATE].values + + diff --git a/src/nemere/inference/templates.py b/src/nemere/inference/templates.py index 97a99801..ec808c03 100644 --- a/src/nemere/inference/templates.py +++ b/src/nemere/inference/templates.py @@ -1,11 +1,20 @@ from typing import List, Dict, Union, Iterable, Sequence, Tuple, Iterator +from abc import ABC, abstractmethod from os import cpu_count -import numpy, scipy.spatial, itertools +from collections import Counter + 
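# --- Editor's note: illustrative sketch, not part of this patch. ---
# findExistingAnalysis() above memoizes analyzers so that the same analysis is
# never recomputed for an identical (class, unit, message, parameters)
# combination. The caching pattern in isolation (names hypothetical):
_analyzer_cache = {}

def find_existing_analysis(analyzer_class, unit, message, analysis_args=()):
    key = (analyzer_class, unit, message, analysis_args)
    if key not in _analyzer_cache:
        analyzer = analyzer_class(message, unit)     # construct ...
        analyzer.setAnalysisParams(*analysis_args)   # ... configure ...
        analyzer.analyze()                           # ... and analyze once
        _analyzer_cache[key] = analyzer
    return _analyzer_cache[key]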
+from pandas import DataFrame +from kneed import KneeLocator +import numpy, scipy.spatial, itertools, kneed, math +from scipy import interpolate from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage +from sklearn.cluster import OPTICS +from nemere.inference.fieldTypes import FieldTypeMemento from nemere.inference.analyzers import MessageAnalyzer, Value from nemere.inference.segments import MessageSegment, AbstractSegment, CorrelatedSegment, HelperSegment, TypedSegment +from nemere.utils.baseAlgorithms import ecdf debug = False @@ -17,6 +26,15 @@ """ + +class ClusterAutoconfException(Exception): + """ + Exception to raise in case of an failed clusterer autoconfiguration. + """ + def __init__(self, description: str): + super().__init__(description) + + class DistanceCalculator(object): """ Wrapper to calculate and look up pairwise distances between segments. @@ -1290,6 +1308,7 @@ def __init__(self, values: Union[Tuple[Union[float, int]], MessageSegment], baseSegments: Iterable[AbstractSegment], method='canberra'): from nemere.inference.segments import TypedSegment + from nemere.utils.evaluationHelpers import unknown super().__init__(values, baseSegments, method) ftypes = {bs.fieldtype for bs in baseSegments if isinstance(bs, TypedSegment)} @@ -1299,7 +1318,7 @@ def __init__(self, values: Union[Tuple[Union[float, int]], MessageSegment], elif fcount > 1: self._fieldtype = "[mixed]" else: - self._fieldtype = "[unknown]" + self._fieldtype = unknown @property def fieldtype(self) -> str: @@ -1317,6 +1336,1289 @@ def fieldtype(self, value: str): """ self._fieldtype = value + + +class FieldTypeTemplate(TypedTemplate, FieldTypeMemento): + """ + Template and Memento for collecting base segments representing a common field type. + Also serves to commonly analyze the collevtive base segments, and thus, to determine characteristics of the + field type. This way the class is the basis to verify a segment cluster's suitability as a type template in the + first place. + """ + + def __init__(self, baseSegments: Iterable[AbstractSegment], method='canberra'): + """ + A new FieldTypeTemplate for the collection of base segments given. + + Per vector component, the mean, stdev, and the covariance matrix is calculated. Therefore the collection needs + to be represented by a vector of one common vector space and thus a fixed number of dimensions. + Thus for the calculation: + * zero-only segments are ignored + * nans are used for shorter segments at don't-care positions + + CAVEAT: components with a standard deviation of 0 are "scintillated" to slightly deviate from 0. Thus we do not + fail to calculate a covariance matrix for linearly dependent entries at the price of a minor loss of + numeric precision. 
+ + :param baseSegments: + :param method: + """ + self.baseSegments = list(baseSegments) + """:type List[AbstractSegment]""" + self._baseOffsets = dict() + relevantSegs = [seg for seg in self.baseSegments if set(seg.values) != {0}] + segLens = {seg.length for seg in relevantSegs} + + if len(segLens) == 1: + # all segments have equal length, so we simply can create an array from all of them + segV = numpy.array([seg.values for seg in relevantSegs]) + # TODO overlapping offset from -1 + self._maxLen = next(iter(segLens)) + elif len(segLens) > 1: + # find the optimal shift/offset of each shorter segment to match the longest ones + # with shortest distance according to the method + # TODO overlapping offset from -1 + self._maxLen = max(segLens) + # Better than the longest would be the most common length, but that would increase complexity a lot and + # we assume that for most use cases the longest segments will be the most frequent length. + # TODO -1 shift could also be necessary for comparing two max-long segments + + # tuples of indices, lengths, and values of the segments that all are the longest in the input + maxLenSegs = [(idx, seg.length, seg.values) for idx, seg in enumerate(relevantSegs, 1) + if seg.length == self._maxLen] + segE = list() + for seg in relevantSegs: + if seg.length == self._maxLen: + # only-zero segments are irrelevant and maxLenSegs are processed afterwards. + continue + shortSeg = (0, seg.length, tuple(seg.values)) + # offsets = [DistanceCalculator.embedSegment(shortSeg, longSeg, method)[1] for longSeg in maxLenSegs] + + embeddingsStraight = [DistanceCalculator.embedSegment(shortSeg, longSeg, method) for longSeg in + maxLenSegs] + if seg.length > 2: + evenShorter = (0, seg.length - 1, tuple(seg.values[1:])) + embeddingsTrunc = [DistanceCalculator.embedSegment(evenShorter, longSeg, method) for longSeg in + maxLenSegs] + # method, shift, (shortSegment[0], longSegment[0], distance) + longStraightDistLookup = { es[2][1]: es[2][2] for es in embeddingsStraight } + longTruncDistLookup = { et[2][1]: et[2][2] for et in embeddingsTrunc } + truncMatchDists = [(longTruncDistLookup[longSeg[2][1]], + longStraightDistLookup[longSeg[2][1]]) + for longSeg in embeddingsStraight] + if all(list(map(lambda x: x[0] 1 and self._maxLen > 2: + embeddingsStraight = DistanceCalculator.calcDistances(maxLenSegs) + longStraightDistLookup = {(es[0], es[1]): es[2] for es in embeddingsStraight} + + # iterate the longest segments and determine if truncating the shorter segments further reduces the + # dissimilarity for any shift of the shorter within the longest segments. + for segIdx, segLen, segVals in maxLenSegs: + + seg = relevantSegs[segIdx - 1] + assert seg.values == segVals, "Wrong segment selected during maxLenSegs truncation." + + evenShorter = (0, segLen - 1, tuple(segVals[1:])) + embeddingsTrunc = [DistanceCalculator.embedSegment(evenShorter, longSeg, method) for longSeg in + maxLenSegs if longSeg[0] != segIdx] + longTruncDistLookup = { et[2][1]: et[2][2] for et in embeddingsTrunc } + truncMatchDists = [(longTruncDistLookup[longSeg[2][1]], + longStraightDistLookup[ + (longSeg[2][1], segIdx) if (longSeg[2][1], segIdx) in longStraightDistLookup + else (segIdx, longSeg[2][1]) + ], + longSeg[2][1]) + for longSeg in embeddingsTrunc] + + if method != 'canberra': + # TODO this "-1" is canberra specific. Other methods need different values. 
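# --- Editor's note: illustrative sketch, not part of this patch. ---
# The alignment above relies on DistanceCalculator.embedSegment(): a shorter
# segment is slid across a longer one and the offset with the smallest Canberra
# dissimilarity wins. The offset search in isolation (names hypothetical):
import scipy.spatial

def best_offset(short_vals, long_vals):
    """Offset at which short_vals matches long_vals best (Canberra)."""
    k = len(short_vals)
    dists = [scipy.spatial.distance.canberra(short_vals, long_vals[o:o + k])
             for o in range(len(long_vals) - k + 1)]
    return min(range(len(dists)), key=dists.__getitem__), min(dists)

print(best_offset((2, 3, 4), (9, 9, 2, 3, 4, 9)))   # -> (2, 0.0)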
+ raise NotImplementedError("Threshold for non-caberra dissimilarity improvement is not yet" + "defined.") + + # DEBUG + if debug: + for truncD, straightD, longIdx in truncMatchDists: + if truncD < straightD - 1: + longSeg = next(mls for mls in maxLenSegs if mls[0] == longIdx) + offset = next(et[1] for et in embeddingsTrunc if et[2][1] == longIdx) + comp = [["({})".format(segVals[0])] + list(evenShorter[2]), [""] + list(longSeg[2])] + from tabulate import tabulate + print("Offset", offset, "- truncD", truncD, "- straightD", straightD) + print(tabulate(comp)) + # import IPython + # IPython.embed() + + if all(map(lambda x: x[0] 0 else 0 + for bs in self.baseSegments: + if bs.length == self._maxLen: + self._mean = numpy.array(bs.values) + self._stdev = numpy.zeros(self._mean.shape) + self._cov = numpy.zeros((self._mean.shape[0], self._mean.shape[0])) + break + if not (isinstance(self._mean, numpy.ndarray) and isinstance(self._stdev, numpy.ndarray) + and isinstance(self._cov, numpy.ndarray)): + raise RuntimeError("This collection of base segments is not suited to generate a FieldTypeTemplate.") + # noinspection PyTypeChecker + super().__init__(self._mean, self.baseSegments, method) + return + + self._mean = numpy.nanmean(segV, 0) + self._stdev = numpy.nanstd(segV, 0) + + # for all components that have a stdev of 0 we need to scintillate the values (randomly) to derive a + # covariance matrix that shows no linear dependent entries ("don't care positions") + assert segV.shape == (len(relevantSegs), len(self._stdev)) + if any(self._stdev == 0): + segV = segV.astype(float) + for compidx, compstdev in enumerate(self._stdev): + if compstdev == 0: + segV[:,compidx] = numpy.random.random_sample((len(relevantSegs),)) * .5 + + # print(segV) + + # self._cov = numpy.cov(segV, rowvar=False) + # pandas cov allows for nans, numpy not + if segV.shape[0] > 1: + pd = DataFrame(segV) + self._cov = pd.cov().values + else: + # handle cases that result in a runtime warning of numpy, + # since a single sequence of values cannot yield a cov. + self._cov = numpy.empty((segV.shape[1],segV.shape[1])) + self._cov[:] = numpy.nan + self._picov = None # fill on demand + + assert len(self._mean) == len(self._stdev) == len(self._cov.diagonal()) + super().__init__(self._mean, self.baseSegments, method) + + + def paddedValues(self, segment: AbstractSegment=None): + """ + :param segment: The base segment to get the padded values for, + or None to return an array of all padded values of this Template + :return: The values of the given base segment padded with nans to the length of the + longest segment represented by this template. + """ + if segment is None: + return numpy.array([self.paddedValues(seg) for seg in self.baseSegments]) + + shift = self._baseOffsets[segment] if segment in self._baseOffsets else 0 + # overlapping offset from -1 + segvals = list(segment.values) if shift >= 0 else list(segment.values)[-shift:] + vals = [numpy.nan] * shift + segvals + [numpy.nan] * (self._maxLen - shift - segment.length) + return vals + + @property + def baseOffsetCounts(self): + """ + :return: The amounts of relative offset values. + """ + return Counter([o for o in self._baseOffsets.values()]) + + def paddedPosition(self, segment: MessageSegment): + """ + Only works with MessageSegments, i.e. Templates need to be resolved into their base segments + when creating the FieldTypeTemplate. + + :return: The absolute positions (analog to offset and nextOffset) of the padded values for the given segment. 
+ The values may be before the message start or after the message end! + """ + offset = segment.offset - self._baseOffsets.get(segment, 0) + # TODO overlapping offset from -1 + nextOffset = offset + self._maxLen + return offset, nextOffset + + @property + def maxLen(self): + return self._maxLen + + +class FieldTypeContext(FieldTypeTemplate): + + def __init__(self, baseSegments: Iterable[MessageSegment], method='canberra'): + """ + FieldTypeTemplate-subclass which, instead of a nan-padded offset alignment, + fills shorter segments with the values of the message at the respective position. + + :param baseSegments: Requires a List of MessageSegment not AbstractSegment! + Templates must therefore be resolved beforehand! + :param method: see :py:class:`FieldTypeTemplate` + """ + super().__init__(baseSegments, method) + + def paddedValues(self, segment: MessageSegment=None): + """ + :param segment: The base segment to get the padded values for, + or None to return an array of all padded values of this Template + :return: The values of the given base segment padded with values of the original message to the length of the + longest segment represented by this template. If a padding would exceed the message data, padd with nans + """ + if segment is None: + # noinspection PyTypeChecker + return numpy.array([self.paddedValues(seg) for seg in self.baseSegments]) + + shift = self._baseOffsets[segment] if segment in self._baseOffsets else 0 + paddedOffset = segment.offset - shift + # if padding reaches before the start of the message + if paddedOffset < 0: + toPrepend = [numpy.nan] * -paddedOffset + else: + toPrepend = [] + paddedNext = paddedOffset + self._maxLen + # if padding reaches after the end of the message + if paddedNext > len(segment.analyzer.values): + toAppend = [numpy.nan] * (paddedNext - len(segment.analyzer.values)) + else: + toAppend = [] + values = toPrepend + \ + segment.analyzer.values[max(0, paddedOffset):min(len(segment.analyzer.values), paddedNext)] + \ + toAppend + + assert len(values) == self._maxLen, "value padding failed" + # overlapping offset from -1 + if shift >= 0: + assert tuple(values[shift:shift + segment.length]) == segment.values, "value padding failed (positive shift)" + else: + assert tuple(values[0:shift + segment.length]) == segment.values[-shift:], "value padding failed (negative shift)" + return values + + def baseOffset(self, segment: MessageSegment): + """The offset of the given segment from the relative base offset of all the baseSegments in this object.""" + return self._baseOffsets[segment] if segment in self._baseOffsets else 0 + + +class TemplateGenerator(object): + """ + Generate templates for a list of segments according to their distance. + """ + + def __init__(self, dc: DistanceCalculator, clusterer: 'AbstractClusterer'): + """ + Find similar segments, group them, and return a template for each group. + + :param dc: Segment distances to base the templates on. + """ + self._dc = dc + self._clusterer = clusterer + + @property + def distanceCalculator(self): + return self._dc + + @property + def clusterer(self): + return self._clusterer + + @staticmethod + def generateTemplatesForClusters(dc: DistanceCalculator, segmentClusters: Iterable[List[MessageSegment]], medoid=True) \ + -> List[Template]: + """ + Find templates representing the message segments in the input clusters. 
+ + :param dc: Distance calculator to generate templates with + :param medoid: Use medoid as template (supports mixed-length clusters) if true (default), + use mean of segment values if false (supports only single-length clusters) + :param segmentClusters: list of input clusters + :return: list of templates for input clusters + """ + templates = list() + for cluster in segmentClusters: + if not medoid: + segValues = numpy.array([ seg.values for seg in cluster ]) + center = numpy.mean(segValues, 0) + else: + center = dc.findMedoid(cluster) + templates.append(Template(center, cluster)) + return templates + + + def generateTemplates(self) -> List[Template]: + # noinspection PyUnresolvedReferences + """ + Generate templates for all clusters. Triggers a new clustering run. + + >>> from pprint import pprint + >>> from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage + >>> from nemere.utils.loader import BaseLoader + >>> from nemere.inference.analyzers import Value + >>> + >>> bytedata = [ + ... bytes([1, 2, 3, 4]), + ... bytes([ 2, 3, 4]), + ... bytes([ 1, 3, 4]), + ... bytes([ 2, 4 ]), + ... bytes([ 2, 3 ]), + ... bytes([20, 30, 37, 50, 69, 2, 30]), + ... bytes([ 37, 5, 69 ]), + ... bytes([70, 2, 3, 4]), + ... bytes([3, 2, 3, 4]) + ... ] + >>> messages = [RawMessage(bd) for bd in bytedata] + >>> analyzers = [Value(message) for message in messages] + >>> segments = [MessageSegment(analyzer, 0, len(analyzer.message.data)) for analyzer in analyzers] + >>> specimens = BaseLoader(messages) + >>> DistanceCalculator.debug = False + >>> dc = DistanceCalculator(segments, thresholdFunction=DistanceCalculator.neutralThreshold, thresholdArgs=None) + Calculated distances for 37 segment pairs in ... seconds. + >>> clusterer = DBSCANsegmentClusterer(dc, eps=1.0, min_samples=3) + >>> tg = TemplateGenerator(dc, clusterer) + >>> templates = tg.generateTemplates() + DBSCAN epsilon: 1.000, minpts: 3 + >>> pprint([t.baseSegments for t in templates]) + [[MessageSegment 4 bytes at (0, 4): 01020304 | values: (1, 2, 3..., + MessageSegment 3 bytes at (0, 3): 020304 | values: (2, 3, 4), + MessageSegment 3 bytes at (0, 3): 010304 | values: (1, 3, 4), + MessageSegment 2 bytes at (0, 2): 0204 | values: (2, 4), + MessageSegment 2 bytes at (0, 2): 0203 | values: (2, 3), + MessageSegment 7 bytes at (0, 7): 141e253245021e | values: (20, 30, 37..., + MessageSegment 3 bytes at (0, 3): 250545 | values: (37, 5, 69), + MessageSegment 4 bytes at (0, 4): 46020304 | values: (70, 2, 3..., + MessageSegment 4 bytes at (0, 4): 03020304 | values: (3, 2, 3...]] + >>> pprint(clusterer.getClusters()) + {0: [MessageSegment 4 bytes at (0, 4): 01020304 | values: (1, 2, 3..., + MessageSegment 3 bytes at (0, 3): 020304 | values: (2, 3, 4), + MessageSegment 3 bytes at (0, 3): 010304 | values: (1, 3, 4), + MessageSegment 2 bytes at (0, 2): 0204 | values: (2, 4), + MessageSegment 2 bytes at (0, 2): 0203 | values: (2, 3), + MessageSegment 7 bytes at (0, 7): 141e253245021e | values: (20, 30, 37..., + MessageSegment 3 bytes at (0, 3): 250545 | values: (37, 5, 69), + MessageSegment 4 bytes at (0, 4): 46020304 | values: (70, 2, 3..., + MessageSegment 4 bytes at (0, 4): 03020304 | values: (3, 2, 3...]} + + # Example, not to run by doctest: + # + labels = [-1]*len(segments) + for i, t in enumerate(templates): + for s in t.baseSegments: + labels[segments.index(s)] = i + labels[segments.index(t.medoid)] = "({})".format(i) + from visualization.distancesPlotter import DistancesPlotter + sdp = DistancesPlotter(specimens, 'distances-testcase', True) + 
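# --- Editor's note: illustrative sketch, not part of this patch. ---
# generateTemplatesForClusters() above uses the cluster medoid as template for
# mixed-length clusters: the member with the smallest summed dissimilarity to
# all other members. Medoid selection in isolation, on a plain distance matrix
# (names hypothetical; the real code uses dc.findMedoid() on the shared
# DistanceCalculator):
import numpy

def medoid_index(distance_matrix: numpy.ndarray) -> int:
    return int(distance_matrix.sum(axis=1).argmin())

dists = numpy.array([[0.0, 0.2, 0.3],
                     [0.2, 0.0, 0.4],
                     [0.3, 0.4, 0.0]])
print(medoid_index(dists))   # 0: smallest summed distance to the others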
sdp.plotSegmentDistances(tg, numpy.array(labels)) + sdp.writeOrShowFigure() + + + :return: A list of Templates for all clusters. + """ + # retrieve all clusters and omit noise for template generation. + allClusters = [cluster for label, cluster in self._clusterer.getClusters().items() if label > -1] + return TemplateGenerator.generateTemplatesForClusters(self._dc, allClusters) + + + + + + + + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # # # Clusterer classes # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + +class AbstractClusterer(ABC): + """ + Wrapper for any clustering implementation to select and adapt the autoconfiguration of the parameters. + """ + def __init__(self, dc: DistanceCalculator, segments: Sequence[MessageSegment] = None): + """ + + :param dc: + :param segments: subset of segments from dc to cluster, use all segments in dc if None + """ + self._dc = dc # type: DistanceCalculator + if segments is None: + self._distances = dc.distanceMatrix + self._segments = dc.segments + else: + self._segments = segments + self._distances = self._dc.distancesSubset(segments) + + @property + def segments(self): + return self._segments + + @property + def distanceCalculator(self): + return self._dc + + def clusterSimilarSegments(self, filterNoise=True) -> List[List[MessageSegment]]: + """ + Find suitable discrimination between dissimilar segments. + + Works on representatives for segments of identical features. + + :param filterNoise: if False, the first element in the returned list of clusters + always is the (possibly empty) noise. + :return: clusters of similar segments + """ + clusters = self.getClusters() + if filterNoise and -1 in clusters: # omit noise + del clusters[-1] + clusterlist = [clusters[l] for l in sorted(clusters.keys())] + return clusterlist + + def getClusters(self) -> Dict[int, List[MessageSegment]]: + """ + Do the initialization of the clusterer and perform the clustering of the list of segments contained in the + distance calculator. + + Works on representatives for segments of identical features. + + :return: A dict of labels to lists of segments with that label. + """ + try: + labels = self.getClusterLabels() + except ValueError as e: + print(self._segments) + # import tabulate + # print(tabulate.tabulate(similarities)) + raise e + assert isinstance(labels, numpy.ndarray) + ulab = set(labels) + + segmentClusters = dict() + for l in ulab: + class_member_mask = (labels == l) + segmentClusters[l] = [seg for seg in itertools.compress(self._segments, class_member_mask)] + return segmentClusters + + @abstractmethod + def getClusterLabels(self) -> numpy.ndarray: + """ + Cluster the entries in the similarities parameter + and return the resulting labels. 
+ + :return: (numbered) cluster labels for each segment in the order given in the (symmetric) distance matrix + """ + raise NotImplementedError("This method needs to be implemented using a cluster algorithm.") + + def lowertriangle(self): + """ + Distances is a symmetric matrix, and we often only need one triangle: + :return: the lower triangle of the matrix, all other elements of the matrix are set to nan + """ + mask = numpy.tril(numpy.ones(self._distances.shape)) != 0 + dist = self._distances.copy() + dist[~mask] = numpy.nan + return dist + + def _nearestPerNeigbor(self) -> List[Tuple[int, float]]: + # noinspection PyUnresolvedReferences,PyProtectedMember + """ + see also DistanceCalculator.neighbors() + In comparison to the general implementation in DistanceCalculator, this one does not return a sorted list, + but just the closest neighbor and its index for all segments. + + numpy.array([[0.0, 0.5, 0.8],[0.1, 0.0, 0.9],[0.7,0.3,0.0]] + + This test uses a non-symmetric matrix to detect bugs if any. This is NOT a use case example! + >>> from pprint import pprint + >>> from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage + >>> from nemere.utils.loader import BaseLoader + >>> from nemere.inference.analyzers import Value + >>> + >>> bytedata = [ + ... bytes([1, 2, 3, 4]), + ... bytes([1, 2 ]), + ... bytes([3, 2, 3, 4]) + ... ] + >>> messages = [RawMessage(bd) for bd in bytedata] + >>> analyzers = [Value(message) for message in messages] + >>> segments = [MessageSegment(analyzer, 0, len(analyzer.message.data)) for analyzer in analyzers] + >>> specimens = BaseLoader(messages) + >>> DistanceCalculator.debug = False + >>> dc = DistanceCalculator(segments) + Calculated distances for 4 segment pairs in ... seconds. + >>> clusterer = DBSCANsegmentClusterer(dc, eps=1, min_samples=2) + >>> print(clusterer._distances) + [[0. 0.3975 0.125 ] + [0.3975 0. 0.5483] + [0.125 0.5483 0. ]] + >>> clusterer._nearestPerNeigbor() + [(2, 0.125), (0, 0.3975), (0, 0.125)] + + :return: a list of the nearest neighbors of each segment in this clusterer object, omitting self identity. + The position in the list is the index of the segment in the distance matrix. 
+ The result is a list of tuples with + * the index of the neigbor (from the distance matrix) and + * the distance to this neighbor + """ + neibrNearest = list() + for homeidx in range(self._distances.shape[0]): + # mask self identity by "None"-value + mask = list(range(homeidx)) + [None] + list(range(homeidx + 1, self._distances.shape[0])) + candNeigbors = self._distances[homeidx] + minNidx = mask[0] + for nidx in mask[1:]: + if nidx is not None and (minNidx is None or candNeigbors[nidx] < candNeigbors[minNidx]): + minNidx = nidx + minNdst = candNeigbors[minNidx] + neibrNearest.append((minNidx, minNdst)) + return neibrNearest + + + def steepestSlope(self): + from math import log + + lnN = round(log(self._distances.shape[0])) + + # simple and (too) generic heuristic: MinPts ≈ ln(n) + minpts = lnN + + # find the first increase, in the mean of the first 2*lnN nearest neighbor distances for all ks, + # which is larger than the mean of those increases + # Inspired by Fatma Ozge Ozkok, Mete Celik: "A New Approach to Determine Eps Parameter of DBSCAN Algorithm" + npn = [self._dc.neighbors(seg) for seg in self._dc.segments] + # iterate all the k-th neighbors up to 2 * log(#neighbors) + dpnmln = list() + for k in range(0, len(npn) - 1): + kthNeigbors4is = [idn[k][1] for idn in npn if idn[k][1] > 0][:2 * lnN] + if len(kthNeigbors4is) > 0: + dpnmln.append(numpy.mean(kthNeigbors4is)) + else: + dpnmln.append(numpy.nan) + + # enumerate the means of deltas starting from an offset of log(#neighbors) + deltamln = numpy.ediff1d(dpnmln) + deltamlnmean = deltamln.mean() # + deltamln.std() + for k, a in enumerate(deltamln[lnN:], lnN): + if a > deltamlnmean: + minpts = k + 1 + break + + steepslopeK = minpts + # add standard deviation to mean-threshold for eps (see authors above) + deltamlnmean = deltamln.mean() + 2 * deltamln.std() + for k, a in enumerate(deltamln[minpts-1:], minpts-1): + if a > deltamlnmean: + steepslopeK = k + 1 + break + + return minpts, steepslopeK + + @abstractmethod + def __repr__(self): + raise NotImplementedError("This method needs to be implemented giving the configuration of this clusterer.") + + + +class HDBSCANsegmentClusterer(AbstractClusterer): + """ + Hierarchical Density-Based Spatial Clustering of Applications with Noise + + https://github.com/scikit-learn-contrib/hdbscan + """ + + def __init__(self, dc: DistanceCalculator, segments: Sequence[MessageSegment] = None, **kwargs): + """ + + :param dc: + :param kwargs: e. g. epsilon: The DBSCAN epsilon value, if it should be fixed. + If not given (None), it is autoconfigured. + """ + super().__init__(dc, segments) + + if len(kwargs) == 0: + # from math import log + # lnN = round(log(self.distances.shape[0])) + self.min_cluster_size = self.steepestSlope()[0] # round(lnN * 1.5) + elif 'min_cluster_size' in kwargs: + self.min_cluster_size = kwargs['min_cluster_size'] + else: + raise ValueError("Parameters for HDBSCAN without autoconfiguration missing. " + "Requires min_cluster_size.") + self.min_samples = round(math.sqrt(len(dc.segments))) + + def getClusterLabels(self) -> numpy.ndarray: + """ + Cluster the entries in the similarities parameter by DBSCAN + and return the resulting labels. 
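# --- Editor's note: illustrative sketch, not part of this patch. ---
# _nearestPerNeigbor() above determines, for every segment, its closest other
# segment from the distance matrix while ignoring the self-distance on the
# diagonal. A compact numpy variant of the same lookup (names hypothetical),
# reproducing the values from the doctest above:
import numpy

def nearest_per_neighbor(distance_matrix: numpy.ndarray):
    masked = distance_matrix.astype(float).copy()
    numpy.fill_diagonal(masked, numpy.inf)      # mask self identity
    nearest = masked.argmin(axis=1)
    return [(int(j), float(masked[i, j])) for i, j in enumerate(nearest)]

dists = numpy.array([[0.0, 0.3975, 0.125],
                     [0.3975, 0.0, 0.5483],
                     [0.125, 0.5483, 0.0]])
print(nearest_per_neighbor(dists))   # [(2, 0.125), (0, 0.3975), (0, 0.125)]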
+ + :return: (numbered) cluster labels for each segment in the order given in the (symmetric) distance matrix + """ + from hdbscan import HDBSCAN + + if numpy.count_nonzero(self._distances) == 0: # the distance matrix contains only identical segments + return numpy.zeros_like(self._distances[0], int) + + dbscan = HDBSCAN(metric='precomputed', allow_single_cluster=True, cluster_selection_method='leaf', + min_cluster_size=self.min_cluster_size, + min_samples=self.min_samples + ) + print("HDBSCAN min cluster size:", self.min_cluster_size, "min samples:", self.min_samples) + dbscan.fit(self._distances.astype(float)) + return dbscan.labels_ + + def __repr__(self): + return 'HDBSCAN mcs {} ms {}'.format(self.min_cluster_size, self.min_samples) + + +class OPTICSsegmentClusterer(AbstractClusterer): + """ + Ordering Points To Identify the Clustering Structure + + https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS + """ + + def __init__(self, dc: DistanceCalculator, segments: Sequence[MessageSegment] = None, **kwargs): + """ + + :param dc: + :param kwargs: e. g. epsilon: The DBSCAN epsilon value, if it should be fixed. + If not given (None), it is autoconfigured. + """ + super().__init__(dc, segments) + + self.min_samples = round(math.sqrt(len(dc.segments))) + self.max_eps = .4 + if 'min_samples' in kwargs: + self.min_samples = kwargs['min_samples'] + if 'max_eps' in kwargs: + self.max_eps = kwargs['max_eps'] + + def getClusterLabels(self) -> numpy.ndarray: + """ + Cluster the entries in the similarities parameter by OPTICS + and return the resulting labels. + + :return: (numbered) cluster labels for each segment in the order given in the (symmetric) distance matrix + """ + if numpy.count_nonzero(self._distances) == 0: # the distance matrix contains only identical segments + return numpy.zeros_like(self._distances[0], int) + + optics = OPTICS(metric='precomputed', min_samples=self.min_samples, max_eps=self.max_eps) + print("OPTICS min samples:", self.min_samples, "max eps:", self.max_eps) + optics.fit(self._distances) #.astype(float) + return optics.labels_ + + def __repr__(self): + return 'OPTICS ms {} maxeps {}'.format(self.min_samples, self.max_eps) + + +class DBSCANsegmentClusterer(AbstractClusterer): + """ + Wrapper for DBSCAN from the sklearn.cluster module including autoconfiguration of the parameters. + """ + + def __init__(self, dc: DistanceCalculator, segments: Sequence[MessageSegment] = None, + interp_method="spline", **kwargs): + """ + :param dc: + :param segments: subset of segments from dc to cluster, use all segments in dc if None + :param kwargs: e. g. epsilon: The DBSCAN epsilon value, if it should be fixed. + If not given (None), it is autoconfigured. + For autoconfiguration with Kneedle applied to the ECDF of dissimilarities, + S is Kneedle's sensitivity parameter with a default of 0.8. + """ + super().__init__(dc, segments) + + self._clusterlabelcache = None + self.kneelocator = None # type: Union[None, KneeLocator] + + self.S = kwargs["S"] if "S" in kwargs else 0.8 + self.k = kwargs["k"] if "k" in kwargs else 0 + if len(kwargs) == 0 or "S" in kwargs or "k" in kwargs: + self.min_samples, self.eps = self._autoconfigure(interp_method=interp_method) + else: # eps and min_samples given, preventing autoconfiguration + if not 'eps' in kwargs or not 'min_samples' in kwargs: + raise ValueError("Parameters for DBSCAN without autoconfiguration missing. 
" + "Requires eps and min_samples.") + self.min_samples, self.eps = kwargs['min_samples'], kwargs['eps'] + + def getClusterLabels(self, noCache=False) -> numpy.ndarray: + """ + Cluster the entries in the similarities parameter by DBSCAN + and return the resulting labels. + + :return: (numbered) cluster labels for each segment in the order given in the (symmetric) distance matrix + """ + if self._clusterlabelcache is not None and noCache == False: + return self._clusterlabelcache + + import sklearn.cluster + + if numpy.count_nonzero(self._distances) == 0: # the distance matrix contains only identical segments + return numpy.zeros_like(self._distances[0], int) + + dbscan = sklearn.cluster.DBSCAN(eps=self.eps, min_samples=self.min_samples, metric='precomputed') + print("DBSCAN epsilon: {:0.3f}, minpts: {}".format(self.eps, int(self.min_samples))) + dbscan.fit(self._distances) + self._clusterlabelcache = dbscan.labels_ + return dbscan.labels_ + + def __repr__(self): + return 'DBSCAN eps {:0.3f} mpt {:0.0f}'.format(self.eps, self.min_samples) \ + if self.eps and self.min_samples \ + else 'DBSCAN unconfigured (need to set epsilon and min_samples)' + + def _autoconfigure(self, **kwargs): + """ + Auto configure the clustering parameters epsilon and minPts regarding the input data + + :return: min_samples, epsilon + """ + # return self._autoconfigureKneedle(**kwargs) + return self._autoconfigureECDFKneedle(**kwargs) + + def _autoconfigureMPC(self): + """ + Auto configure the clustering parameters epsilon and minPts regarding the input data + Maximum Positive Curvature + + :return: min_samples, epsilon + """ + from nemere.utils.baseAlgorithms import autoconfigureDBSCAN + neighbors = [self.distanceCalculator.neighbors(seg) for seg in self.distanceCalculator.segments] + epsilon, min_samples, k = autoconfigureDBSCAN(neighbors) + print("eps {:0.3f} autoconfigured (MPC) from k {}".format(epsilon, k)) + return min_samples, epsilon + + def _maximumPositiveCurvature(self): + """ + Use implementation of utils.baseAlgorithms to determine the maximum positive curvature + :return: k, min_samples + """ + from nemere.utils.baseAlgorithms import autoconfigureDBSCAN + e, min_samples, k = autoconfigureDBSCAN( + [self.distanceCalculator.neighbors(seg) for seg in self.distanceCalculator.segments]) + return k, min_samples + + def _autoconfigureKneedle(self): + """ + Auto configure the clustering parameters epsilon and minPts regarding the input data + + knee is too far right/value too small to be useful: + the clusters are small/zero size and few, perhaps density function too uneven in this use case? + So we added a margin. Here selecting + low factors resulted in only few small clusters. The density function seems too uneven for DBSCAN/Kneedle. 
+ + :return: minpts, epsilon + """ + # min_samples, k = self.steepestSlope() + k, min_samples = self._maximumPositiveCurvature() + print("KneeLocator: dists of", self._distances.shape[0], "neighbors, k", k, "min_samples", min_samples) + + # get k-nearest-neighbor distances: + neighdists = self._knearestdistance(k) + # # add a margin relative to the remaining interval to the number of neighbors + # round(k + (self._distances.shape[0] - 1 - k) * .2)) + # # round(minpts + 0.5 * (self.distances.shape[0] - 1 - min_samples)) + + # # knee by Kneedle alogithm: https://ieeexplore.ieee.org/document/5961514 + kneel = KneeLocator(range(len(neighdists)), neighdists, curve='convex', direction='increasing') + kneeX = kneel.knee + + # import matplotlib.pyplot as plt + # kneel.plot_knee_normalized() + # plt.show() + + if isinstance(kneeX, int): + epsilon = neighdists[kneeX] + else: + print("Warning: Kneedle could not find a knee in {}-nearest distribution.".format(min_samples)) + epsilon = 0.0 + + if not epsilon > 0.0: # fallback if epsilon becomes zero + lt = self.lowertriangle() + epsilon = numpy.nanmean(lt) + numpy.nanstd(lt) + + return min_samples, epsilon + + kneeyThreshold = 0.1 + splineSmooth = 0.03 + + def _autoconfigureECDFKneedle(self, interp_method="spline", recurse=True, trim=None): + """ + + >>> from kneed import KneeLocator + >>> from scipy import interpolate + >>> from itertools import chain + >>> from tabulate import tabulate + >>> import matplotlib.pyplot as plt + >>> + >>> from nemere.utils.loader import SpecimenLoader + >>> from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation + >>> from nemere.utils.baseAlgorithms import ecdf + >>> from nemere.inference.templates import DistanceCalculator, DBSCANsegmentClusterer + >>> from nemere.inference.analyzers import MessageAnalyzer, Value + >>> + >>> specimens = SpecimenLoader("../input/deduped-orig/ntp_SMIA-20111010_deduped-100.pcap", 2, True) + >>> segmentsPerMsg = MessageAnalyzer.convertAnalyzers(bcDeltaGaussMessageSegmentation(specimens, 1.2), Value) + Segmentation by inflections of sigma-1.2-gauss-filtered bit-variance. + >>> segments = list(chain.from_iterable(segmentsPerMsg)) + >>> dc = DistanceCalculator(segments) + Calculated distances for 448879 segment pairs in ... seconds. + >>> clusterer = DBSCANsegmentClusterer(dc, segments, S=24) + DBSCANsegmentClusterer: eps 0.200 autoconfigured (Kneedle on ECDF with S 24) from k 2 + >>> print(clusterer.k) + 2 + >>> kneels = list() + >>> for k in range(1,10): + ... neighdists = clusterer._knearestdistance(k, True) + ... knncdf = ecdf(neighdists, True) + ... tck = interpolate.splrep(knncdf[0], knncdf[1], s=DBSCANsegmentClusterer.splineSmooth) + ... Ds_y = interpolate.splev(knncdf[0], tck, der=0) + ... kneel = KneeLocator(knncdf[0], Ds_y, S=clusterer.S, curve='concave', direction='increasing') + ... 
kneels.append(kneel) + >>> plt.plot(knncdf[0], knncdf[1], label=f"k = {k}") # doctest: +SKIP + >>> plt.plot(knncdf[0], Ds_y, label=f"k = {k} (smoothed)") # doctest: +SKIP + >>> kneelist = [(k, locator.all_knees) for k, locator in enumerate(kneels,1)] + >>> print(tabulate(kneelist)) + - --------------------- + 1 {0.18154389003537735} + 2 {0.19987050867290748} + 3 {0.22025666655513917} + 4 {0.237042781964657} + 5 {0.24426345351319875} + 6 {0.25393409495926683} + 7 {0.2658486707566462} + 8 {0.2716385690789474} + 9 {0.27354003906249996} + - --------------------- + >>> plt.legend() # doctest: +SKIP + >>> plt.show() # doctest: +SKIP + + + kneel = KneeLocator(knncdf[0], knncdf[1], S=clusterer.S, curve='concave', direction='increasing', + interp_method='polynomial', polynomial_degree=5) + + knee_index = sum(knncdf[0] 1: + epsilon = max(kneel.all_knees) + else: + epsilon = kneel.knee + self.kneelocator = kneel + + print("DBSCANsegmentClusterer: eps {:0.3f} autoconfigured (Kneedle on ECDF with S {}) from k {}".format(epsilon, self.S, self.k)) + return min_samples, epsilon + + + def autoconfigureEvaluation(self, filename: str, markeps: float = False): + """ + Auto configure the clustering parameters epsilon and minPts regarding the input data + + :return: minpts, epsilon + """ + import numpy + import matplotlib.pyplot as plt + from math import ceil, log + from scipy.ndimage.filters import gaussian_filter1d + from kneed import KneeLocator + + from nemere.utils.baseAlgorithms import ecdf + + sigma = log(len(self.segments))/2 + # k, min_samples = self._maximumPositiveCurvature() + # smoothknearest = dict() + # seconddiffMax = dict() + # maxD = 0 + # maxK = 0 + minD = 1 + minK = None + minX = None + # just the first... int(len(self.segments) * 0.7) + for curvK in range(0, ceil(log(len(self.segments) ** 2))): + neighdists = self._knearestdistance(curvK) + knncdf = ecdf(neighdists, True) + smoothknn = gaussian_filter1d(knncdf[1], sigma) + diff2smooth = numpy.diff(smoothknn, 2) / numpy.diff(knncdf[0])[1:] + mX = diff2smooth.argmin() + if minD > diff2smooth[mX]: + print(curvK, minD) + minD = diff2smooth[mX] + minK = curvK + minX = knncdf[0][mX+1] + + # seconddiffMax[curvK] = seconddiff.max() + # if seconddiffMax[curvK] > maxD: + # maxD = seconddiffMax[curvK] + # maxK = curvK + + k = minK + # noinspection PyStatementEffect + minX + # epsilon = minX + min_samples = sigma*2 + + neighdists = self._knearestdistance(k) + knncdf = ecdf(neighdists, True) + # smoothknn = gaussian_filter1d(knncdf[1], sigma) + kneel = KneeLocator(knncdf[0], knncdf[1], curve='concave', direction='increasing') + epsilon = kneel.knee * 0.8 + + print("selected k = {}; epsilon = {:.3f}; min_samples = {:.0f}".format(k, epsilon, min_samples)) + + + # # # # # # # # # # # # # # # # # # # # # # # # # # # # + # plots of k-nearest-neighbor distance histogram and "knee" + plt.rc('legend', frameon=False) + fig = plt.gcf() + fig.set_size_inches(16, 9) + + numK = 100 # ceil(sigma**2)) + minK = max(0, k - numK//2) + maxK = min( len(self.segments) - 1, k + numK//2) + for curvK in range(minK, maxK): + alpha = 1 if curvK == k else .4 + neighdists = self._knearestdistance(curvK) + knncdf = ecdf(neighdists, True) + + if curvK == k: + smoothknn = gaussian_filter1d(knncdf[1], sigma) + + # diff1smooth = numpy.gradient(smoothknn) + diff2smooth = numpy.diff(smoothknn, 2) + # diff3smooth = numpy.gradient(numpy.gradient(numpy.gradient(smoothknn))) + + diffknn = numpy.diff(smoothknn) + mX = diffknn.argmax() + mQ = 0.1 * numpy.diff(knncdf[0]).mean() * sum(diffknn) # 
diffknn[mX] * 0.15 + a = next(xA for xA in range(mX, -1, -1) if diffknn[xA] < mQ or xA == 0) + b = next(xB for xB in range(mX, len(knncdf[0])-1) if diffknn[xB] < mQ or xB == len(knncdf[0]) - 2) + # tozerone = cdist(numpy.array(knncdf).transpose(), numpy.array([[0, 1]])).argmin() + + diffshrink = 0.05 + + plt.plot(knncdf[0], knncdf[1], alpha=alpha, label=curvK, color='lime') + plt.plot(knncdf[0], smoothknn, label="$g(\cdot)$", color='red') + plt.plot(knncdf[0][1:], diffshrink * diffknn / numpy.diff(knncdf[0]), linestyle="dashed", color='blue', + label="$\Delta(g(\cdot))$") + plt.plot(knncdf[0][2:], 20 * diff2smooth, linestyle="-.", color='violet', label="$\Delta^2(g(\cdot))$") + plt.scatter(knncdf[0][a+1], diffshrink * diffknn[a] / (knncdf[0][a+1] - knncdf[0][a]), + label="a = {:.3f}".format(knncdf[0][a+1])) + plt.scatter(knncdf[0][b+1], diffshrink * diffknn[b] / (knncdf[0][b+1] - knncdf[0][b]), + label="b = {:.3f}".format(knncdf[0][b+1]) ) + # plt.plot(knncdf[0], diff3smooth * 100, linestyle="dotted", + # label="$\Delta^3(g(\cdot))$") + plt.axvline(knncdf[0][diff2smooth.argmin()], color='indigo', label="$min(\Delta^2(\cdot))$") + + else: + plt.plot(knncdf[0], knncdf[1], alpha=alpha) + plt.axvline(epsilon, label="alt eps {:.3f}".format(epsilon), linestyle="dashed", color='lawngreen') + # plt.axvline(sqrt(epsilon), label="sqrt(neps) {:.3f}".format(sqrt(epsilon)), + # linestyle="dashed", color='green') + if markeps: + plt.axvline(markeps, label="applied eps {:.3f}".format(markeps), linestyle="-.", color='orchid') + + plt.tight_layout(rect=[0,0,1,.95]) + plt.legend() + plt.savefig(filename) + + # fig, ax = plt.subplots(1, 2) + # axl, axr = ax.flat + # + # # farthest + # # plt.plot([max([dpn[k] for nid, dpn in npn]) for k in range(0, len(npn)-1)], alpha=.4) + # # axl.plot(dpnmln, alpha=.4) + # # plt.plot([self._knearestdistance(k) for k in range( round(0.5 * (self.distances.shape[0]-1)) )]) + # disttril = numpy.tril(self._distances) + # alldist = [e for e in disttril.flat if e > 0] + # axr.hist(alldist, 50) + # # plt.plot(smoothdists, alpha=.8) + # # axl.axvline(minpts, linestyle='dashed', color='red', alpha=.4) + # axl.axvline(steepslopeK, linestyle='dotted', color='blue', alpha=.4) + # left = axl.get_xlim()[0] + # bottom = axl.get_ylim()[1] + # # axl.text(left, bottom,"mpt={}, eps={:0.3f}".format(minpts, epsilon)) + # # plt.axhline(neighdists[int(round(kneeX))], alpha=.4) + # # plt.plot(range(len(numpy.ediff1d(smoothdists))), numpy.ediff1d(smoothdists), linestyle='dotted') + # # plt.plot(range(len(numpy.ediff1d(neighdists))), numpy.ediff1d(neighdists), linestyle='dotted') + # axl.legend() + + plt.close('all') + plt.clf() + + return min_samples, epsilon + + + def _knearestmean(self, k: int): + """ + it gives no significantly better results than the direct k-nearest distance, + but requires more computation. + + :param k: range of neighbors to be selected + :return: The mean of the k-nearest neighbors for all the distances of this clusterer instance. 
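+
+        A minimal sketch of the computation on a made-up symmetric distance matrix (illustration only):
+
+            import numpy
+            distances = numpy.array([[0., .1, .4], [.1, 0., .2], [.4, .2, 0.]])
+            k = 1
+            # per segment: mean of the k nearest neighbors, ignoring the zero self-distance
+            means = sorted(numpy.mean(sorted(col)[1:k + 1]) for col in distances.T)
+            # means -> [0.1, 0.1, 0.2]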
+ """ + neighdistmeans = list() + for neighid in range(self._distances.shape[0]): + ndmean = numpy.mean( + sorted(self._distances[:, neighid])[1:k + 1]) # shift by one: ignore self identity + neighdistmeans.append((neighid, ndmean)) + neighdistmeans = sorted(neighdistmeans, key=lambda x: x[1]) + return [e[1] for e in neighdistmeans] + + + def _knearestdistance(self, k: int, onlyUnique=False): + """ + :param k: neighbor to be selected + :param onlyUnique: if set to true, and dc.segments contains Templates that pool some of the self.segments, + use the pooled distances only between unique segments, + :return: The distances of the k-nearest neighbors for all distances of this clusterer instance. + """ + if onlyUnique and isinstance(self.distanceCalculator, DelegatingDC): + segments = self.distanceCalculator.segments + else: + segments = self.segments + + if not k < len(segments) - 1: + raise IndexError("k={} exeeds the number of neighbors.".format(k)) + neighbordistances = [self._dc.neighbors(seg)[k][1] for seg in segments] + return sorted(neighbordistances) + + + @staticmethod + def _kneebyruleofthumb(neighdists): + """ + according to the paper + (!? this is the wrong reference ?!) https://www.sciencedirect.com/science/article/pii/S0169023X06000218 + + result is far (too far) left of the actual knee + + :param neighdists: k-nearest-neighbor distances + :return: x coordinate of knee point on the distance distribution of the parameter neighdists + """ + # smooth distances to prevent ambiguities about what a "knee" in the L-curve is + from scipy.ndimage.filters import gaussian_filter1d + smoothdists = gaussian_filter1d(neighdists, numpy.log(len([n for n in neighdists if n > 0.001 * max(neighdists)]))) + + # approximate 2nd derivative and get its max + # kneeX = numpy.ediff1d(numpy.ediff1d(smoothdists)).argmax() # alternative for 2nd derivative + kneeX = numpy.array( + [smoothdists[i+1] + smoothdists[i-1] - 2 * smoothdists[i] for i in range(1, len(smoothdists)-1)] + ).argmax() + return int(round(kneeX)) + + # def _selectK(self, maxK) -> int: + # """ + # The k (as in in k-NN ECDF) to use. + # + # :return: Passes through the maxK parameter unchanged. + # """ + # return maxK + + def _selectK(self, maxK) -> int: + """Searching for the sharpest knee in the ks up to maxK.""" + return self._sharpestKnee(maxK) + + def _sharpestKnee(self, maxK) -> int: + """ + Determine the sharpest of the knees for the k-NN ECDFs for ks between 2 and self.k + We define sharpness by the value of the y_difference at the maximum. + + :return: The k (as in in k-NN ECDF) with the sharpest knee. + """ + sharpest = (maxK,0) + for k in range(2,maxK+1): + neighdists = self._knearestdistance(k, True) + knncdf = ecdf(neighdists, True) + tck = interpolate.splrep(knncdf[0], knncdf[1], s=type(self).splineSmooth) + Ds_y = interpolate.splev(knncdf[0], tck, der=0) + kneel = KneeLocator(knncdf[0], Ds_y, S=self.S, curve='concave', direction='increasing') + ydmax = kneel.y_difference[kneel.maxima_indices[-1]] + # print(f"Knee for k={k} has ydmax={ydmax:.5f}") + if ydmax > sharpest[1]: + sharpest = (k, ydmax) + return sharpest[0] + + def preventLargeCluster(self): + """Prevent a large cluster > 60 % of non-noise by searching for a smaller epsilon.""" + if self.kneelocator is None: + # TODO this probably is worth raising an exception + return + clusterLabels = self.getClusterLabels() + # if one cluster is larger than 60% of all non-noise segments... + if type(self).largestClusterExceeds(clusterLabels, 0.6): + print("Cluster larger 60% found. 
Trim the knncdf to the knee.") + self.min_samples, self.eps = self._autoconfigureECDFKneedle(trim=self.kneelocator.knee) + # force re-clustering without cache + clusterLabels = self.getClusterLabels(True) + + @staticmethod + def largestClusterExceeds(clusterLabels, threshold): + clusterSizes = Counter(clusterLabels) + return max(clusterSizes.values()) > sum(cs for cl, cs in clusterSizes.items() if cl != -1) * threshold + + +class DBSCANadjepsClusterer(DBSCANsegmentClusterer): + """ + DBSCANsegmentClusterer with adjustment of the auto-configured epsilon to fix systematic deviations of the optimal + epsilon value for heuristically determined segments with NEMESYS and zero-segmenter. + + The adjustment is aware of changes in the implementation of the kneed module between versions + lower than and from version 0.7. + """ + epsfrac = 3 # done: 5, 4, + epspivot = 0.15 + + def __init__(self, dc: DistanceCalculator, segments: Sequence[MessageSegment] = None, **kwargs): + super().__init__(dc, segments, **kwargs) + + def _autoconfigureECDFKneedle(self, **kwargs): + min_samples, autoeps = super()._autoconfigureECDFKneedle(**kwargs) + if kneed.__version__ < '0.7.0': + # reduce k if no realistic eps is detected + if autoeps < 0.05: + self.k //= 2 + self.min_samples, autoeps = self._autoconfigure(**kwargs) + # adjust epsilon + adjeps = autoeps + autoeps / type(self).epsfrac * (1 if autoeps < type(self).epspivot else -1) + else: + # adjust epsilon, depending on the sharpness of the knee: + # the sharper (higher ydmax), the farther to the "right" + ydmax = self.kneelocator.y_difference[self.kneelocator.maxima_indices[-1]] + assert ydmax in self.kneelocator.y_difference_maxima + # epsfact = 7*ydmax**2 + -3*ydmax + 0.8 + # epsfact = 17 * ydmax**2 + -12 * ydmax + 2.5 + # epsfact = 21 * ydmax ** 2 + -14.7 * ydmax + 2.7 # (instead of 3, just to improve ntp-1000) + # + # polyfit after iterated best epsilon + epsfact = 16 * ydmax**2 - 10 * ydmax + 1.8 + adjeps = epsfact * autoeps + print(f"DBSCANadjepsClusterer: eps adjusted to {adjeps:.3f} by {epsfact:.2f} based on y_dmax {ydmax:.2f}") + return min_samples, adjeps + + def _selectK(self, maxK) -> int: + """Searching sharpest knee in the ks up to maxK.""" + return self._sharpestKnee(maxK) + + + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # END # # Clusterer classes # # # # # # # # # END # # # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + + + + """ Methods/properties (including super's) diff --git a/src/nemere/inference/trackingBIDE.py b/src/nemere/inference/trackingBIDE.py new file mode 100644 index 00000000..91eb9b04 --- /dev/null +++ b/src/nemere/inference/trackingBIDE.py @@ -0,0 +1,297 @@ +from typing import Hashable, Iterable, Sequence, Dict, List, Tuple +import numpy, uuid, logging +from tabulate import tabulate + +from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage + + +class HashableByteSequence(Hashable): + """ + Make a byte sequence hashable and recognizable if ohash is self defined. + + **Note** that the __hash__() and __eq__ functions are somewhat **abused** and may not behave pythonic! 
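+
+    Illustrative usage with made-up values: two different byte values constructed with the same ohash
+    compare equal, since equality and hashing depend on the hash alone.
+
+        seqA = HashableByteSequence(b"\x01\x02", ohash=42)
+        seqB = HashableByteSequence(b"\xff\xee", ohash=42)
+        assert seqA == seqB and hash(seqA) == 42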
+ """ + + def __init__(self, sequence:bytes, ohash:int=None): + self._sequence = sequence + if ohash is None: + self._hash = uuid.uuid4().int + else: + self._hash = ohash + + def __hash__(self) -> int: + return self._hash + + def __eq__(self, other): + """We define HashableByteSequence objects having the same hash value as equal.""" + return hash(other) == hash(self) + + @property + def sequence(self): + return self._sequence + + +class BIDEracker(object): + """ + Occurrence tracking BIDE-based closed sequence mining. + """ + + MIN_SUPPORT = 0.6 + + def __init__(self, sequences: Sequence[HashableByteSequence]): + """ + import inference.trackingBIDE as bide + from tabulate import tabulate + subsfreq = bide.BIDEracker(bide.HashableByteSequence(msg.data[:6], hash(msg)) for msg in specimens.messagePool.keys()) + print(tabulate(sorted([(cnt, bv.hex()) for bv, (cnt, occ) in subsfreq.mostFrequent(1).items()]))) + """ + self._occurranceDrop = 60 + # length: subsequence: (count, message: offset) (last dict is "support" of length - 1) + self._subsequenceLookup = dict() + self._sequenceCount = len(sequences) + self.fillL1(sequences) + self.printExtensions() + self.iterateExtensions() + + @property + def minSupport(self): + """ + :return: Min support threshold: fixed ratio of the number of sequences + """ + return self._sequenceCount * type(self).MIN_SUPPORT + + def countAllOccurences(self): + allcounts = {byteValue: count for freqSeq in self._subsequenceLookup.values() + for byteValue, (count, occurrences) in freqSeq.items()} + return sorted(allcounts.items(), key=lambda x: x[1]) + + def fillL1(self, sequences: Iterable[HashableByteSequence]): + # fill L1 (all one-byte sequences and their positions) + self._subsequenceLookup[1] = dict() + for sequence in sequences: + for offset, intValue in enumerate(sequence.sequence): + byteValue = bytes([intValue]) + if not byteValue in self._subsequenceLookup[1]: + self._subsequenceLookup[1][byteValue] = [0, dict()] + if not sequence in self._subsequenceLookup[1][byteValue][1]: + self._subsequenceLookup[1][byteValue][1][sequence] = list() + self._subsequenceLookup[1][byteValue][0] += 1 # count + self._subsequenceLookup[1][byteValue][1][sequence].append(offset) # location + + # # initial pruning + # allocc = countAllOccurences(subsequenceLookup) + # # print([count for byteValue, count in allocc]) + # knee = numpy.percentile([count for byteValue, count in allocc], 66) + # print("k", 1, "knee", knee) + # for byteValue in list(subsequenceLookup[1]): # since we change the dict, iter over copy of keys + # count, occurrences = subsequenceLookup[1][byteValue] + # if knee > count: # threshold for being frequent + # # remove subsequence just added in k if it is not frequent + # del subsequenceLookup[1][byteValue] + + def iterateExtensions(self): + logger = logging.getLogger(__name__) + k = 2 + # iterate ks as long as there are common sequences with length k + while len(self._subsequenceLookup[k - 1]) > 0: + self._subsequenceLookup[k] = dict() + minSupport = self.minSupport + # debug + if isinstance(self, DynamicBIDEracker): + logger.debug(f"k {k:4d} | knee {minSupport:.2f}") + + # if 8 < k < 12: + # print("next iteration") + # print(tabulate([(bv.hex(), co, lok) for lok, bco in self._subsequenceLookup.items() + # for bv, (co, oc) in bco.items() if bv[:2] == b"\x81\x82"])) + # + everythingfrequent = {(sequence, o): locK for locK, freqSeq in self._subsequenceLookup.items() for + count, occurrences in freqSeq.values() + for sequence, offsets in occurrences.items() for o in 
offsets if + minSupport <= count} # threshold for being frequent + # sortedoccurences = sorted(subsequenceLookup[k - 1].items(), key=lambda x: x[1][0], + # reverse=True) # key: bytevalue's count + + # extend frequent prefixes known from k-1 + # for byteValue, (count, occurrences) in sortedoccurences: # type: bytes, (int, Dict[AbstractMessage, List[int]]) + for sequence, offset in everythingfrequent.keys(): # search for all the frequent strings in k's supporters + # for message, offsets in occurrences.items(): # search for all the frequent strings in k's supporters + # for o in sorted(offset): + if len(sequence.sequence) < offset + k \ + or (sequence, offset) not in everythingfrequent \ + or not any((sequence, k_1o) in everythingfrequent and everythingfrequent[(sequence, k_1o)] >= k_1l + for k_1o, k_1l in zip(range(offset+1, offset + k + 1), range(k-1, 0, -1))): + # message does not contain an extension of prefix at position o for sequence length k + # ... or no frequent extension + continue + + byteValue = sequence.sequence[offset:offset + k] + if not byteValue in self._subsequenceLookup[k]: + self._subsequenceLookup[k][byteValue] = [0, dict()] + if not sequence in self._subsequenceLookup[k][byteValue][1]: + self._subsequenceLookup[k][byteValue][1][sequence] = list() + # add all frequent occurrences in k's supporters + self._subsequenceLookup[k][byteValue][0] += 1 # count + self._subsequenceLookup[k][byteValue][1][sequence].append(offset) # location + + # print(tabulate(sorted([(cn, bv.hex()) for bv, (cn, sp) in subsequenceLookup[k].items()], key=lambda x: x[0]))) + + # pruning + # all of the new sequences that are frequent will cause the occurrences in their supporters to be removed + if len(self._subsequenceLookup[k]) > 0: + for byteValue, (count, occurrences) in self._subsequenceLookup[k].items(): + # print(count, byteValue) + for sequence, offsets in occurrences.items(): + for ofs in offsets: + assert sequence.sequence[ofs:ofs + k] == byteValue + + # newlyfrequent = {(sequence, o): k for count, occurrences in self._subsequenceLookup[k].values() + # for sequence, offsets in occurrences.items() for o in offsets if + # minSupport <= count} # threshold for being frequent + + # iterate all frequent sequences newly found in k + for byteValue in list(self._subsequenceLookup[k].keys()): # since we change the dict, iter over copy of keys + count, occurrences = self._subsequenceLookup[k][byteValue] + # threshold for being absolutely frequent... + if minSupport > count: + # remove subsequence just added in k if it is not frequent + del self._subsequenceLookup[k][byteValue] + continue # k's sequence to be infrequent causes its support in k-1 to remain valid (and potentially frequent) + # if byteValue[:2] == b"\x63\x82\x53\x63": + # print(count, byteValue.hex()) + + # ... 
and for being locally frequent (prefix + extension) + keepPrefix = False + for ext in range(1, k): + # print(byteValue[:-ext], ext) + if byteValue[:-ext] in self._subsequenceLookup[k-ext]: + if count < self._subsequenceLookup[k-ext][byteValue[:-ext]][0] * self._occurranceDrop / 100: + # remove subsequence just added in k if it is not frequent + if byteValue in self._subsequenceLookup[k]: + del self._subsequenceLookup[k][byteValue] + keepPrefix = True + # if byteValue[:2] == b"\x81\x82": bytes.fromhex("d23d15") + # print("remove", byteValue) + continue + break + else: + pass + # print(byteValue[:-1].hex(), "not in", k, "-1") + if keepPrefix: + continue # k's sequence to be locally infrequent causes its support in k-1 to remain valid (and potentially frequent) + + # prefix = byteValue[:-1] + # find all occurrences in all supporters in k-1 for k's frequent string and remove from k-1 + for sequence, offsets in occurrences.items(): + for o in sorted(offsets): + # print("newly:", sequence.sequence[o:o+k]) + # remove only if frequent from supporter is completely contained in new frequent + # meaning: remove all occurrences in any supporters for any k_1o >= o and k_1l < k + for k_1o, k_1l in zip(range(o, o + k + 1), range(k, 0, -1)): + if (sequence, k_1o) in everythingfrequent and \ + everythingfrequent[(sequence, k_1o)] <= k_1l: + locK = everythingfrequent[(sequence, k_1o)] + byVa = sequence.sequence[k_1o:k_1o + locK] + # print("rem sup occ:", k_1o, " - ", message.data[k_1o:k_1o+locK]) + # print(subsequenceLookup[locK][byVa][1][message]) + if k_1o in self._subsequenceLookup[locK][byVa][1][sequence]: + self._subsequenceLookup[locK][byVa][1][sequence].remove(k_1o) + self._subsequenceLookup[locK][byVa][0] -= 1 + # + # if 8 < k < 12: + # print("after pruning") + # print(tabulate([(bv.hex(), co, lok) for lok, bco in self._subsequenceLookup.items() + # for bv, (co, oc) in bco.items() if bv[:2] == b"\x81\x82"])) + # + # prune k-1's supporters whose occurrence count have fallen to zero as they have been accounted for in k (in the previous loop) + for locK, bco in self._subsequenceLookup.items(): + if locK == k: + continue + for byteValue in list(bco): # since we change the dict, iter over copy of keys + count, occurrences = self._subsequenceLookup[locK][byteValue] + if count <= 0: # may be negative is multiple longer frequent strings have had this one as supporting occurence + del self._subsequenceLookup[locK][byteValue] + else: + for sequence in list(occurrences): + if len(occurrences[sequence]) == 0: + del self._subsequenceLookup[locK][byteValue][1][sequence] + k += 1 + + def printExtensions(self, start=1): + for kLoc in range(start, max(self._subsequenceLookup.keys())): + print("### k:", kLoc) + print(tabulate( + sorted([(cn, bv.hex()) for bv, (cn, sp) in self._subsequenceLookup[kLoc].items()], key=lambda x: x[0]))) + + + def mostFrequent(self, start=1) -> Dict[bytes, Tuple[int, Dict[HashableByteSequence, List[int]]]]: + """ + :param start: minimum substring length (k) to include + :return: dict of {byte value: (occurrence count, occurrences in terms of {byte sequence: list of offsets})} + """ + # bytevalue: (count, occurrences) + retVal = dict() + for kLoc in range(start, max(self._subsequenceLookup.keys())): + for bv in self._subsequenceLookup[kLoc].keys(): + if bv in retVal: + print("collision!") + retVal.update(self._subsequenceLookup[kLoc].items()) + return retVal + + +class DynamicBIDEracker(BIDEracker): + """ + Dynamic occurrance tracking BIDE-based closed sequence mining. 
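+
+    The minimum support threshold is re-derived in every extension iteration from the current occurrence
+    counts instead of the fixed ratio of BIDEracker. A minimal standalone sketch of that thresholding
+    (made-up counts; the percentile parameter defaults to 80):
+
+        import numpy
+        counts = [1, 1, 2, 2, 3, 5, 8, 40, 95]            # occurrence counts of all current subsequences
+        minSupport = numpy.percentile(counts, 80)          # dynamic threshold, here 20.8
+        frequent = [c for c in counts if c >= minSupport]  # -> [40, 95]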
+ """ + def __init__(self, sequences: Sequence[HashableByteSequence], percentile=80): + """ + import inference.trackingBIDE as bide + from tabulate import tabulate + subsfreq = bide.DynamicBIDEracker(bide.HashableByteSequence(msg.data[:6], hash(msg)) for msg in specimens.messagePool.keys()) + print(tabulate(sorted([(cnt, bv.hex()) for bv, (cnt, occ) in subsfreq.mostFrequent(1).items()]))) + + :param percentile: The percentile of occurrences per prefix-extension iteration as threshold for being frequent. + """ + self._percentile = percentile + super().__init__(sequences) + + @property + def minSupport(self): + """ + Dynamically determine min support. + :return: Min support dynamically determined from the kee of the occurrences distribution. + """ + # determine k's min support by counting all occurrences + allocc = self.countAllOccurences() + knee = numpy.percentile([count for byteValue, count in allocc], self._percentile) + return knee + + +class DynamicMessageBIDE(DynamicBIDEracker): + """ + Apply BIDEracker to netzob messages. + + import inference.trackingBIDE as bide + from tabulate import tabulate + subsfreq = bide.DynamicMessageBIDE(specimens.messagePool.keys()) + print(tabulate(sorted([(cnt, bv.hex()) for bv, (cnt, occ) in subsfreq.mostFrequent(1).items()]))) + """ + def __init__(self, messages: Iterable[AbstractMessage], percentile=80): + sequences = [HashableByteSequence(msg.data, hash(msg)) for msg in messages] + super().__init__(sequences, percentile) + + +class MessageBIDE(BIDEracker): + """ + Apply BIDEracker to netzob messages. + + import inference.trackingBIDE as bide + from tabulate import tabulate + subsfreq = bide.MessageBIDE(specimens.messagePool.keys()) + print(tabulate(sorted([(cnt, bv.hex()) for bv, (cnt, occ) in subsfreq.mostFrequent(1).items()]))) + """ + def __init__(self, messages: Iterable[AbstractMessage]): + sequences = [HashableByteSequence(msg.data, hash(msg)) for msg in messages] + super().__init__(sequences) + + diff --git a/src/nemere/utils/evaluationHelpers.py b/src/nemere/utils/evaluationHelpers.py index d0ab7328..5a8e0d6f 100644 --- a/src/nemere/utils/evaluationHelpers.py +++ b/src/nemere/utils/evaluationHelpers.py @@ -8,6 +8,7 @@ from itertools import chain import os, csv, pickle, time from os.path import join, splitext, isfile, isdir, basename, exists, abspath + from tabulate import tabulate from nemere.utils.loader import SpecimenLoader @@ -16,9 +17,9 @@ from nemere.inference.analyzers import * from nemere.inference.formatRefinement import isOverlapping from nemere.inference.segmentHandler import segmentsFromLabels, bcDeltaGaussMessageSegmentation, refinements, \ - fixedlengthSegmenter + fixedlengthSegmenter, bcDeltaGaussMessageSegmentationLE from nemere.inference.segments import MessageAnalyzer, TypedSegment, MessageSegment, AbstractSegment -from nemere.inference.templates import DistanceCalculator, DelegatingDC, Template, MemmapDC +from nemere.inference.templates import DistanceCalculator, DelegatingDC, Template, MemmapDC, TypedTemplate Element = TypeVar('Element') @@ -26,48 +27,70 @@ # available analysis methods analyses = { 'bcpnm': BitCongruenceNgramMean, - 'bc': BitCongruence, - 'bcd': BitCongruenceDelta, - 'bcdg': BitCongruenceDeltaGauss, - 'mbhbv': HorizonBitcongruence, + 'bc': BitCongruence, + 'bcg': BitCongruenceGauss, + 'bcd': BitCongruenceDelta, + 'bcdg': BitCongruenceDeltaGauss, + 'hbcg': HorizonBitcongruenceGauss, + 'sbcdg': SlidingNbcDeltaGauss, + 'pivot': PivotBitCongruence, 'variance': ValueVariance, # Note: VARIANCE is the inverse of 
PROGDIFF - 'progdiff': ValueProgressionDelta, - 'progcumudelta': CumulatedProgressionDelta, 'value': Value, 'ntropy': EntropyWithinNgrams, 'stropy': Entropy, # TODO check applicability of (cosine) distance calculation to this feature } -# raw nemesys - cft-121 "withoutrefinement" +# # raw nemesys - cft-121 "withoutrefinement" +# sigmapertrace = { +# "dhcp_SMIA2011101X_deduped-100.pcap" : 0.6, +# "dns_ictf2010_deduped-100.pcap" : 0.6, +# "dns_ictf2010-new-deduped-100.pcap" : 0.6, +# "nbns_SMIA20111010-one_deduped-100.pcap" : 1.0, +# "ntp_SMIA-20111010_deduped-100.pcap" : 1.2, +# "smb_SMIA20111010-one_deduped-100.pcap" : 0.6, +# "dhcp_SMIA2011101X_deduped-1000.pcap" : 0.6, +# "dns_ictf2010_deduped-982-1000.pcap" : 0.6, +# "dns_ictf2010-new-deduped-1000.pcap" : 1.0, +# "nbns_SMIA20111010-one_deduped-1000.pcap" : 1.0, +# "ntp_SMIA-20111010_deduped-1000.pcap" : 1.2, +# "smb_SMIA20111010-one_deduped-1000.pcap" : 0.6, +# +# # assumptions derived from first traces +# "dhcp_SMIA2011101X-filtered_maxdiff-100.pcap": 0.6, +# "dns_ictf2010_maxdiff-100.pcap": 0.6, +# "dns_ictf2010-new_maxdiff-100.pcap": 0.6, +# "nbns_SMIA20111010-one_maxdiff-100.pcap": 1.0, +# "ntp_SMIA-20111010_maxdiff-100.pcap": 1.2, +# "smb_SMIA20111010-one-rigid1_maxdiff-100.pcap": 0.6, +# "dhcp_SMIA2011101X-filtered_maxdiff-1000.pcap": 0.6, +# "dns_ictf2010_maxdiff-1000.pcap": 0.6, +# "dns_ictf2010-new_maxdiff-1000.pcap": 0.6, +# "nbns_SMIA20111010-one_maxdiff-1000.pcap": 1.0, +# "ntp_SMIA-20111010_maxdiff-1000.pcap": 1.2, +# "smb_SMIA20111010-one-rigid1_maxdiff-1000.pcap": 0.6, +# } + sigmapertrace = { - "dhcp_SMIA2011101X_deduped-100.pcap" : 0.6, - "dns_ictf2010_deduped-100.pcap" : 0.6, - "dns_ictf2010-new-deduped-100.pcap" : 0.6, - "nbns_SMIA20111010-one_deduped-100.pcap" : 1.0, - "ntp_SMIA-20111010_deduped-100.pcap" : 1.2, - "smb_SMIA20111010-one_deduped-100.pcap" : 0.6, - "dhcp_SMIA2011101X_deduped-1000.pcap" : 0.6, - "dns_ictf2010_deduped-982-1000.pcap" : 0.6, - "dns_ictf2010-new-deduped-1000.pcap" : 1.0, - "nbns_SMIA20111010-one_deduped-1000.pcap" : 1.0, - "ntp_SMIA-20111010_deduped-1000.pcap" : 1.2, - "smb_SMIA20111010-one_deduped-1000.pcap" : 0.6, - - # assumptions derived from first traces - "dhcp_SMIA2011101X-filtered_maxdiff-100.pcap": 0.6, - "dns_ictf2010_maxdiff-100.pcap": 0.6, - "dns_ictf2010-new_maxdiff-100.pcap": 0.6, - "nbns_SMIA20111010-one_maxdiff-100.pcap": 1.0, - "ntp_SMIA-20111010_maxdiff-100.pcap": 1.2, - "smb_SMIA20111010-one-rigid1_maxdiff-100.pcap": 0.6, - "dhcp_SMIA2011101X-filtered_maxdiff-1000.pcap": 0.6, - "dns_ictf2010_maxdiff-1000.pcap": 0.6, - "dns_ictf2010-new_maxdiff-1000.pcap": 0.6, - "nbns_SMIA20111010-one_maxdiff-1000.pcap": 1.0, - "ntp_SMIA-20111010_maxdiff-1000.pcap": 1.2, - "smb_SMIA20111010-one-rigid1_maxdiff-1000.pcap": 0.6, + "dhcp_SMIA2011101X-filtered_maxdiff-1000.pcap": 0.4, + "dns_ictf2010-new_maxdiff-1000.pcap": 0.9, + "nbns_SMIA20111010-one_maxdiff-1000.pcap": 0.4, + "ntp_SMIA-20111010_maxdiff-1000.pcap": 1.3, + "smb_SMIA20111010-one-rigid1_maxdiff-1000.pcap": 0.4, + + # assumptions derived from 1000s traces + "dhcp_SMIA2011101X-filtered_maxdiff-100.pcap": 0.4, + "dns_ictf2010-new_maxdiff-100.pcap": 0.9, + "nbns_SMIA20111010-one_maxdiff-100.pcap": 0.4, + "ntp_SMIA-20111010_maxdiff-100.pcap": 1.3, + "smb_SMIA20111010-one-rigid1_maxdiff-100.pcap": 0.4, + + "dhcp_SMIA2011101X_deduped-10000.pcap": 0.4, + "dns_ictf2010-new-deduped-10000.pcap": 0.9, + "nbns_SMIA20111010-one_deduped-10000.pcap": 0.4, + "ntp_SMIA-20111010_deduped-9995-10000.pcap": 1.3, + 
"smb_SMIA20111010-one_deduped-10000.pcap": 0.4, } epspertrace = { @@ -151,7 +174,7 @@ def labelForSegment(segGrpHier: List[Tuple[str, List[Tuple[str, List[Tuple[str, if inGroup is not None: return inGroup.split(", ", 2)[-1] else: - return "[unknown]" + return unknown return None @@ -305,6 +328,18 @@ def calcHexDist(hexA, hexB): class CachedDistances(object): + + # tokenizers to select from + tokenizers = ('nemesys',) # zeroslices + CropChars + + # refinement methods + refinementMethods = [ + "none", + "original", # WOOT2018 paper + "base", # ConsecutiveChars+moco + "nemetyl", # INFOCOM2020 paper: ConsecutiveChars+moco+splitfirstseg + ] + def __init__(self, pcapfilename: str, analysisTitle: str, layer=2, relativeToIP=True): """ Cache or load the DistanceCalculator to or from the filesystem @@ -427,6 +462,7 @@ def _calc(self): self.specimens, layer=self.layer, relativeToIP=self.relativeToIP, debug=self.debug) print("Segmenting messages...", end=' ') + littleEndian = self.tokenizer[-2:] == "le" segmentationTime = time.time() # select tokenizer by command line parameter if self.tokenizer == "tshark": @@ -441,12 +477,17 @@ def _calc(self): self.segmentedMessages = fixedlengthSegmenter(4, self.specimens, self.analyzerType, self.analysisArgs) elif self.tokenizer in ["nemesys", "nemesysle"]: # 3. segment messages by NEMESYS - segmentsPerMsg = bcDeltaGaussMessageSegmentation(self.specimens, self.sigma) + if self.tokenizer == "nemesysle": # little endian version + segmentsPerMsg = bcDeltaGaussMessageSegmentationLE(self.specimens, self.sigma) + else: + segmentsPerMsg = bcDeltaGaussMessageSegmentation(self.specimens, self.sigma) # get analyzer requested by analyzerType/analysisArgs self.segmentedMessages = MessageAnalyzer.convertAnalyzers( segmentsPerMsg, self.analyzerType, self.analysisArgs) self._callRefinement() + else: + raise ValueError(f"tokenizer {self.tokenizer} is unknown") self.segmentationTime = time.time() - segmentationTime print("done.") @@ -597,13 +638,14 @@ def __init__(self, pcapfilename: str, reportFullPath: str=None): else: self.reportFullPath = reportFullPath """A path name that is inside the report folder and reflects the pcap base name without extension.""" - if not exists(self.reportFullPath): + try: os.makedirs(self.reportFullPath) - elif isdir(self.reportFullPath): - print("Using existing ", self.reportFullPath, " as report folder.") - else: - print("Path that should be used as report folder is an existing file. Aborting.") - exit(1) + except FileExistsError: + if isdir(self.reportFullPath): + print("Using existing ", self.reportFullPath, " as report folder.") + else: + print("Path that should be used as report folder is an existing file. Aborting.") + exit(1) self.timestamp = time.time() self.timeformated = time.strftime("%Y%m%d-%H%M%S", time.gmtime(self.timestamp)) @@ -641,7 +683,7 @@ def writeReportMetadata(self, dcCacheFile: str=None, scriptRuntime: float=None): class TrueOverlays(object): """ - Count and the amount of (falsely) inferred boundaries in the scope of each true field. + Count and the amount of (falsely) inferred boundaries in the scope of each true field. 
""" def __init__(self, trueSegments: Dict[str, Sequence[MessageSegment]], inferredSegments: List[Sequence[MessageSegment]], comparator: MessageComparator, minLen=3): @@ -819,12 +861,15 @@ def clusterer(self, val): @property def clusterParams(self): - from sklearn.cluster import DBSCAN + from nemere.inference.templates import DBSCANsegmentClusterer, HDBSCANsegmentClusterer, OPTICSsegmentClusterer + from sklearn.cluster import DBSCAN, OPTICS from hdbscan import HDBSCAN - if isinstance(self._clusterer, (DBSCAN)): + if isinstance(self._clusterer, (DBSCANsegmentClusterer, DBSCAN)): return f"eps {self._clusterer.eps:.3f} ms {self._clusterer.min_samples}" - elif isinstance(self._clusterer, (HDBSCAN)): + elif isinstance(self._clusterer, (HDBSCANsegmentClusterer, HDBSCAN)): return f"mcs {self._clusterer.min_cluster_size} ms {self._clusterer.min_samples}" + elif isinstance(self._clusterer, (OPTICSsegmentClusterer, OPTICS)): + return f"ms {self._clusterer.min_samples} maxeps {self._clusterer.max_eps}" @property def plotTitle(self): @@ -851,5 +896,22 @@ class TitleBuilderSens(TitleBuilder): """include Sensitivitiy from clusterer in title""" @property def clusterParams(self): - return super().clusterParams + from nemere.inference.templates import DBSCANsegmentClusterer + if isinstance(self._clusterer, DBSCANsegmentClusterer): + return f"S {self._clusterer.S:.1f} eps {self._clusterer.eps:.3f} ms {self._clusterer.min_samples}" + else: + return super().clusterParams + +def segIsTyped(someSegment): + return isinstance(someSegment, (TypedTemplate, TypedSegment)) + + +uulmColors = { + "uulm" : "#7D9AAA", # blue-gray + "uulm-akzent": "#A9A28D", # beige + "uulm-in" : "#A32638", # magenta + "uulm-med" : "#26247C", # bluish + "uulm-mawi" : "#56AA1C", # green + "uulm-nawi" : "#BD6005" # orange +} diff --git a/src/nemere/utils/loader.py b/src/nemere/utils/loader.py index 50259668..7817f073 100644 --- a/src/nemere/utils/loader.py +++ b/src/nemere/utils/loader.py @@ -80,6 +80,22 @@ def getBaseLayerOfPCAP(self): def __repr__(self): return type(self).__name__ + ": " + self.pcapFileName + f" on layer {self.getBaseLayerOfPCAP()}" + @property + def maximumMessageLength(self): + """ + :return: The maximum message length in bytes of the relevant network layer without its encapsulation for + the messages in this specimen's pool. + """ + return max(len(line.data) for line in self.messagePool.keys()) + + @property + def cumulatedMessageLength(self): + """ + :return: The sum of all message lengths in bytes of the relevant network layer without its encapsulation for + the messages in this specimen's pool. I. e., the cumulated size of all payload in the trace. + """ + return sum(len(line.data) for line in self.messagePool.keys()) + class SpecimenLoader(BaseLoader): """ Wrapper for loading messages from a PCAP as specimens. @@ -106,6 +122,8 @@ def __init__(self, pcap: str, layer:int=-1, relativeToIP:bool=False): if not isfile(pcap): raise FileNotFoundError('File not found:', pcap) self.pcapFileName = pcap + self.layer = layer + self.relativeToIP = relativeToIP absLayer = 2 + layer if relativeToIP else layer # prevent Netzob from producing debug output in certain cases. 
@@ -139,14 +157,6 @@ def getBaseLayerOfPCAP(self): # Translates pcapy linktype values to tcpdump ones if in dict, otherwise the value is used unchanged return dl if dl not in SpecimenLoader.pcapyDatalinkTranslation else SpecimenLoader.pcapyDatalinkTranslation[dl] - @property - def maximumMessageLength(self): - """ - :return: The maximum message length in bytes of the relevant network layer without its encapsulation for - the messages in this specimen's pool. - """ - return max(len(line.data) for line in self.messagePool.keys()) - class ScaPyCAPimporter(object): def __init__(self, pcapfilename, importLayer=5): @@ -205,4 +215,4 @@ def __decodeLayer2(self, packet: Packet): raise NetzobImportException("NEMERE_PCAP", "Unsupported layer 2 protocol " + l2Proto, PCAPImporter.INVALID_LAYER2) l2Payload = bytes(packet.payload) - return l2Proto, l2SrcAddr, l2DstAddr, l2Payload \ No newline at end of file + return l2Proto, l2SrcAddr, l2DstAddr, l2Payload diff --git a/src/nemere/utils/reportWriter.py b/src/nemere/utils/reportWriter.py index 1d678703..c15ea113 100644 --- a/src/nemere/utils/reportWriter.py +++ b/src/nemere/utils/reportWriter.py @@ -8,17 +8,18 @@ import numpy from typing import Dict, Tuple, Iterable, TypeVar, Hashable, List, Union, Any, Sequence -from os.path import isdir, splitext, basename, join +from os.path import isdir, splitext, basename, join, exists from itertools import chain from collections import Counter, defaultdict, OrderedDict from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage from nemere.inference.segments import AbstractSegment, TypedSegment, MessageSegment -from nemere.inference.templates import Template, TypedTemplate -from nemere.utils.evaluationHelpers import StartupFilecheck, reportFolder +from nemere.inference.templates import Template, TypedTemplate, FieldTypeTemplate +from nemere.utils.evaluationHelpers import StartupFilecheck, reportFolder, segIsTyped, unknown from nemere.utils.loader import SpecimenLoader -from nemere.validation.dissectorMatcher import FormatMatchScore, MessageComparator +from nemere.validation.dissectorMatcher import FormatMatchScore, MessageComparator, BaseDissectorMatcher +from nemere.visualization.simplePrint import FieldtypeComparingPrinter def calcScoreStats(scores: Iterable[float]) -> Tuple[float, float, float, float, float]: @@ -67,9 +68,8 @@ def countMatches(quality: Iterable[FormatMatchScore]): def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore], - runtime: float, - specimens: SpecimenLoader, comparator: MessageComparator, - inferenceTitle: str, folder="reports"): + runtime: float, comparator: MessageComparator, + inferenceTitle: str, folder="reports", withTitle=False): if not isdir(folder): raise NotADirectoryError("The reports folder {} is not a directory. 
Reports cannot be written there.".format( @@ -77,7 +77,8 @@ def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore], print('Write report to ' + folder) # write Format Match Score and Metrics to csv - with open(os.path.join(folder, 'FormatMatchMetrics.csv'), 'w') as csvfile: + fn = f'FormatMatchMetrics_{inferenceTitle}.csv' if withTitle else 'FormatMatchMetrics.csv' + with open(os.path.join(folder, fn), 'w') as csvfile: fmmcsv = csv.writer(csvfile) fmmcsv.writerow(["Message", "Score", 'I', 'M', 'N', 'S', 'MG', 'SP']) fmmcsv.writerows( [ @@ -88,10 +89,13 @@ def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore], scoreStats = calcScoreStats([q.score for q in formatmatchmetrics.values()]) matchCounts = countMatches(formatmatchmetrics.values()) - with open(os.path.join(folder, 'ScoreStatistics.csv'), 'w') as csvfile: + fn = os.path.join(folder, 'ScoreStatistics.csv') + writeHeader = not exists(fn) + with open(fn, 'a') as csvfile: fmmcsv = csv.writer(csvfile) - fmmcsv.writerow(["inference", "min", "mean", "max", "median", "std", - "exactcount", "offbyonecount", "offbymorecount", "runtime"]) + if writeHeader: + fmmcsv.writerow(["inference", "min", "mean", "max", "median", "std", + "exactcount", "offbyonecount", "offbymorecount", "runtime"]) fmmcsv.writerow( [ inferenceTitle, *scoreStats, *matchCounts, runtime] ) @@ -109,7 +113,7 @@ def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore], symbolcsv.writerow([field.name for field in symbol.fields]) symbolcsv.writerows([val.hex() for val in msg] for msg in symbol.getCells()) else: - fileNameS = 'Symbols' + fileNameS = f'Symbols_{inferenceTitle}' if withTitle else 'Symbols' with open(os.path.join(folder, fileNameS + '.csv'), 'w') as csvfile: symbolcsv = csv.writer(csvfile) msgcells = chain.from_iterable([sym.getCells() for sym in # unique symbols by set @@ -139,7 +143,8 @@ def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore], tikzcode = comparator.tprintInterleaved(symsMinMeanMax) # write Format Match Score and Metrics to csv - with open(join(folder, 'example-inference-minmeanmax.tikz'), 'w') as tikzfile: + fn = f'example-inference-minmeanmax_{inferenceTitle}.tikz' if withTitle else 'example-inference-minmeanmax.tikz' + with open(join(folder, fn), 'w') as tikzfile: tikzfile.write(tikzcode) @@ -153,6 +158,26 @@ def writeSegmentedMessages2CSV(segmentsPerMsg: Sequence[Sequence[MessageSegment] ) +def writeFieldTypesTikz(comparator: MessageComparator, segmentedMessages: List[Tuple[MessageSegment]], + fTypeTemplates: List[FieldTypeTemplate], filechecker: StartupFilecheck): + # select the messages to print by quality: three of around fmsmin, fmsmean, fmsmax each + fmslist = [BaseDissectorMatcher(comparator, msg).calcFMS() for msg in segmentedMessages] + fmsdict = {fms.score: fms for fms in fmslist} # type: Dict[float, FormatMatchScore] + scoreSorted = sorted(fmsdict.keys()) + fmsmin, fmsmean, fmsmax, fmsmedian, fmsstd = calcScoreStats(scoreSorted) + meanI = scoreSorted.index(fmsmean) + scoreSelect = scoreSorted[-3:] + scoreSorted[meanI - 1:meanI + 2] + scoreSorted[:3] + symsMinMeanMax = [fmsdict[mmm].message for mmm in scoreSelect] + + # visualization of segments from clusters in messages. 
+ cp = FieldtypeComparingPrinter(comparator, fTypeTemplates) + tikzcode = cp.fieldtypes(symsMinMeanMax) + # write tikz code to file + with open(join(filechecker.reportFullPath, 'example-messages-fieldtypes.tikz'), 'w') as tikzfile: + tikzfile.write(tikzcode) + + + Element = TypeVar('Element', AbstractMessage, AbstractSegment) class Report(ABC): statsFile = "statistics" @@ -366,7 +391,7 @@ def write(self, clusters: Dict[Hashable, List[Element]], runtitle: Union[str, Di self.segUniqu = sum(len(cl) for cl in clusters.values()) if ignoreUnknown: - unknownKeys = ["[unknown]", "[mixed]"] + unknownKeys = [unknown, "[mixed]"] self.numUnknown = len([gt for gt in self.groundtruth.values() if gt in unknownKeys]) clustersTemp = {lab: [el for el in clu if self.groundtruth[el] not in unknownKeys] for lab, clu in clusters.items()} @@ -566,7 +591,7 @@ def __init__(self, comparator: MessageComparator, segments: List[AbstractSegment reportPath = reportPath if reportPath is not None else pcap.reportFullPath \ if isinstance(pcap, StartupFilecheck) else reportFolder super().__init__(pcap, reportPath) - self.groundtruth = {rawSeg: typSeg[1].fieldtype if typSeg[0] > 0.5 else "[unknown]" + self.groundtruth = {rawSeg: typSeg[1].fieldtype if typSeg[0] > 0.5 else unknown for rawSeg, typSeg in self.typedMatchTemplates.items()} def write(self, clusters: Dict[str, Union[MessageSegment, Template]], runtitle: Union[str, Dict]=None): @@ -595,8 +620,7 @@ def _writeCSV(self, clusters: Dict[str, Union[MessageSegment, Template]], runtit ( cLabel, seg.bytes.hex(), seg.bytes, len(seg.baseSegments) if isinstance(seg, Template) else 1, - typedMatchTemplates[seg][1].fieldtype if SegmentClusterGroundtruthReport.segIsTyped( - typedMatchTemplates[seg][1]) else "[unknown]", + typedMatchTemplates[seg][1].fieldtype if segIsTyped(typedMatchTemplates[seg][1]) else unknown, typedMatchTemplates[seg][0], self._comparator.lookupField( typedMatchTemplates[seg][1].baseSegments[0] if isinstance(typedMatchTemplates[seg][1], @@ -608,7 +632,7 @@ def _writeCSV(self, clusters: Dict[str, Union[MessageSegment, Template]], runtit @staticmethod def segIsTyped(someSegment): - return isinstance(someSegment, (TypedTemplate, TypedSegment)) + return segIsTyped(someSegment) def relativeOffsets(self, infSegment): """(Matched templates have offsets and lengths identical to seg (inferred) and not the true one.)""" diff --git a/src/nemere/validation/clusterInspector.py b/src/nemere/validation/clusterInspector.py new file mode 100644 index 00000000..ade0af65 --- /dev/null +++ b/src/nemere/validation/clusterInspector.py @@ -0,0 +1,845 @@ +import math +from collections import Counter, MutableSequence +from itertools import combinations +from typing import List, Iterable, Sequence, Tuple, Union + +import scipy.stats +import numpy, re +from networkx import Graph +from networkx.algorithms.components.connected import connected_components +from tabulate import tabulate + +from nemere.inference.segmentHandler import segments2types, filterChars +from nemere.inference.segments import MessageSegment, TypedSegment, AbstractSegment +from nemere.inference.templates import DelegatingDC, AbstractClusterer, Template, TypedTemplate, FieldTypeTemplate +from nemere.utils.loader import SpecimenLoader +from nemere.utils.baseAlgorithms import tril +from nemere.visualization.distancesPlotter import DistancesPlotter + + +class ClusterLabel(object): + def __init__(self, clusterNumber: Union[None, str, int] = None): + self.clusterNumber = None # type: Union[None, str] + if clusterNumber is 
None: + self.isNoise = True # type: bool + else: + self.isNoise = False + if isinstance(clusterNumber, int): + self.clusterNumber = f"{clusterNumber:02d}" + else: + self.clusterNumber = clusterNumber + + self.analysisTitle = None # type: Union[None, str] + self.lengthsString = None # type: Union[None, str] + self.mostFrequentTypes = None # type: Union[None, Sequence[Tuple[str, int]]] + self.clusterSize = None # type: Union[None, str] + self.maxDist = None # type: Union[None, float] + + def __repr__(self): + """ + Generates a string representation of this label, based on the information present in the instance. + Options are: + * if its noise (isNoise): lengthsString and clusterSize + * non-noise: lengthsString, clusterSize, clusterNumber, and maxDist + * optionally with analysisTitle being "split", "merged", or "singular" + * optionally with mostFrequentTypes + * otherwise just "tf" followed by clusterNumber + """ + if self.isNoise: + if self.lengthsString is not None and self.clusterSize is not None: + return 'Noise: {} Seg.s ({} bytes)'.format(self.clusterSize, self.lengthsString) + else: + return "Noise" + if self.lengthsString is not None and self.clusterSize is not None \ + and self.clusterNumber is not None and self.maxDist is not None: + if self.analysisTitle in ("split", "merged", "singular"): + prepend = self.analysisTitle + " " + else: + prepend = "" + # prevent overflowing plots + clusterNumber = self.clusterNumber + if len(self.clusterNumber) > 17: + clusterSplit = self.clusterNumber.split("+") + if len(clusterSplit) > 4: + clusterNumber = "+".join(clusterSplit[:2] + ["..."] + clusterSplit[-2:]) + if self.mostFrequentTypes is not None: + return prepend + 'Cluster #{} ({:.2f} {}): {} Seg.s ($d_{{max}}$={:.3f}, {} bytes)'.format( + clusterNumber, + self.mostFrequentRatio, + self.mostFrequentTypes[0][0], + self.clusterSize, + self.maxDist, + self.lengthsString) + return prepend + 'Cluster #{}: {} Seg.s ($d_{{max}}$={:.3f}, {} bytes)'.format( + clusterNumber, + self.clusterSize, + self.maxDist, + self.lengthsString) + else: + return f"tf{self.clusterNumber}" + + def toString(self): + """More classical string representation than repr""" + if self.isNoise: + if self.analysisTitle and self.lengthsString and self.clusterSize: + return '{} ({} bytes), Noise: {} Seg.s'.format(self.analysisTitle, self.lengthsString, self.clusterSize) + else: + return "Noise" + if self.analysisTitle and self.lengthsString and self.clusterSize and self.clusterNumber \ + and self.clusterSize and self.maxDist: + if self.mostFrequentTypes: + return '{} ({} bytes), Cluster #{} ({:.2f} {}): {} Seg.s ($d_{{max}}$={:.3f})'.format( + self.analysisTitle, + self.lengthsString, + self.clusterNumber, + self.mostFrequentRatio, + self.mostFrequentTypes[0][0], + self.clusterSize, + self.maxDist) + return '{} ({} bytes), Cluster #{}: {} Seg.s ($d_{{max}}$={:.3f})'.format( + self.analysisTitle, + self.lengthsString, + self.clusterNumber, + self.clusterSize, + self.maxDist) + else: + return f"tf{self.clusterNumber}" + + @property + def mostFrequentRatio(self) -> Union[None, float]: + if isinstance(self.mostFrequentTypes, Sequence): + return self.mostFrequentTypes[0][1] / sum(s for t, s in self.mostFrequentTypes) + return None + + +class SegmentClusterCauldron(object): + """ + Container class for results of the clustering of segments + """ + noise: List[AbstractSegment] + clusters: List[List[AbstractSegment]] + + def __init__(self, clusterer: AbstractClusterer, analysisTitle: str): + """ + Cluster segments according to the 
distance of their feature vectors. + Keep and label segments classified as noise. + + Start post processing of clusters (splitting, merging, singular/regular clusters, ...) after the desired + preparation of clusters (e.g., by extractSingularFromNoise, appendCharSegments, ...) by calling + **clustersOfUniqueSegments()** + before advanced function are available. + + :param clusterer: Clusterer object that contains all the segments to be clustered + :type analysisTitle: the string to be used as label for the result + """ + self._analysisTitle = analysisTitle + self.clusterer = clusterer + + print("Clustering segments...") + self.noise, *self.clusters = clusterer.clusterSimilarSegments(False) + if any(isinstance(seg, Template) for seg in clusterer.segments): + distinct = "distinct " + else: + dc = self.clusterer.distanceCalculator + self.noise = list({ dc.segments[dc.segments2index([tSegment])[0]] for tSegment in self.noise }) + distinct = "" + print("{} clusters generated from {} {}segments".format(len(self.clusters), len(clusterer.segments), distinct)) + self.unisegClusters = None # type: Union[None, SegmentClusters] + self.regularClusters = None # type: Union[None, SegmentClusters] + self.singularClusters = None # type: Union[None, SegmentClusters] + self.originalRegularClusters = None # type: Union[None, SegmentClusters] + + def extractSingularFromNoise(self): + """ + Extract "large" templates from noise that should rather be its own cluster. + Works in place, i.e. changing the contained cluster containers. + """ + from nemere.inference.templates import Template + + for idx, seg in reversed(list(enumerate(self.noise.copy()))): # type: int, MessageSegment + freqThresh = math.log(len(self.clusterer.segments)) + if isinstance(seg, Template): + if len(seg.baseSegments) > freqThresh: + self.clusters.append([self.noise.pop(idx)]) # .baseSegments + + def appendCharSegments(self, charSegments: List[AbstractSegment]): + """Append the given char segments to the cluster container. Needs """ + if len(charSegments) > 0: + self.clusters.append(charSegments) + + @staticmethod + def truncateList(values: Iterable, maxLen=5): + """Truncate a list of values if its longer than maxLen by adding ellipsis.""" + output = [str(v) for v in values] + if len(output) > maxLen: + return output[:2] + ["..."] + output[-2:] + return output + + # # # # # # # # # # # # # # # # # # # # # # # # # # # + + def clustersOfUniqueSegments(self): + """ + Consolidate cluster contents so that the same value in different segments is only represented once per cluster. + The clusters are also stored in the instance as property self.unisegClusters + + :return: structure of segments2clusteredTypes + """ + segmentClusters = [ # identical to the second iteration of segments2clusteredTypes() + ( self._clusterLabel(segs), type(self)._segmentsLabels(segs) ) + for segs in self.clusters + ] + dc = self.clusterer.distanceCalculator + self.unisegClusters = SegmentClusters(dc) + for cLabel, elements in segmentClusters: + # consolidates multiple (raw) segments in their respective templates, + # while using a set to distinguish multiple identical templates with different labels + # TODO this is somewhat strange, since it assumes that the same segment + # can appear multiple times across clusters. I don't remember, why I thought this is necessary. 
+ uniqueSegments = {(sLabel, dc.segments[dc.segments2index([tSegment])[0]]) + for sLabel, tSegment in elements} + self.unisegClusters.append((cLabel, sorted(uniqueSegments, key=lambda x: x[1].values))) + self.__mixed() + self._regularAndSingularClusters() + return [(self.analysisLabel(), self.unisegClusters.clusters)] + + def __mixed(self): + """Replace duplicate clusters of different types with one cluster of segments with a [mixed] label.""" + # TODO This becomes obsolete + # if the duplicate checks in SegmentClusters would be implemented as stated in the TODOs there. + mixedSegments = [seg for seg, cnt in Counter( + tSegment for cLabel, elements in self.unisegClusters for sLabel, tSegment in elements).items() + if cnt > 1] + for tSegment in mixedSegments: + mixedClusters = [elements for cLabel, elements in self.unisegClusters + if tSegment in (sElem for sLabel, sElem in elements)] + # if len(mixedClusters) >= 2: + # print("len(mixedClusters) >= 2 # that would be strange and we needed to find some solution then") + # IPython.embed() + assert len(mixedClusters) < 2 # that would be strange and we needed to find some solution then + toReplace = [sIdx for sIdx, sTuple in enumerate(mixedClusters[0]) if sTuple[1] == tSegment] + for rIdx in reversed(sorted(toReplace)): + del mixedClusters[0][rIdx] + mixedClusters[0].append(("[mixed]", tSegment)) + # TODO fix function: mixedClusters contains only the elements of the cluster + # and thus the change is never written to the actual list + + def _regularAndSingularClusters(self): + """ + Fill lists with clusters that contain at least three distinct values (regular) and less (singular). + see also nemere.inference.segmentHandler.extractEnumClusters() + """ + self.regularClusters = SegmentClusters(self.clusterer.distanceCalculator) + self.singularClusters = SegmentClusters(self.clusterer.distanceCalculator) + for uc in self.unisegClusters: + if len({seg[1].bytes for seg in uc[1]}) > 3: + self.regularClusters.append(uc) + else: + # TODO evaluate nemere.inference.segmentHandler.extractEnumClusters + self.singularClusters.append(uc) + # redundantly store regular clusters before any mering and splitting + self.originalRegularClusters = SegmentClusters(self.clusterer.distanceCalculator) + self.originalRegularClusters.extend(self.regularClusters) + self.regularClusters.mergeOnDensity() + self.regularClusters.splitOnOccurrence() + self.unisegClusters = SegmentClusters(self.clusterer.distanceCalculator) + self.unisegClusters.extend(self.regularClusters) + self.unisegClusters.extend(self.singularClusters) + + def analysisLabel(self): + """Generate a label for the analysis the clusters in self are the result of.""" + segLengths = set() + if self.noise: + segLengths.update({seg.length for seg in self.noise}) + for segs in self.clusters: + segLengths.update({seg.length for seg in segs}) + + return '{} ({} bytes) {}'.format( + self._analysisTitle, + next(iter(segLengths)) if len(segLengths) == 1 else 'mixedamount', + self.clusterer if self.clusterer else 'n/a') + + @staticmethod + def _segmentsLabels(cluster: List[AbstractSegment]): + """Generate an empty label for each segment in the given cluster. Used by subclasses. + The order of segments is NOT retained!""" + labeledSegments = [(None, seg) for seg in cluster] + return labeledSegments + + def _clusterLabel(self, cluster: List[AbstractSegment]): + """ + Generate a label for the given cluster, containing its index number some statistics. 
+ The method recognizes any known cluster and the noise per identity of the list of Segments. + """ + segLenStr = " ".join(SegmentClusterCauldron.truncateList({seg.length for seg in cluster})) + # the label for noise is a bit different than for the others + if cluster == self.noise: + cLabel = ClusterLabel() + cLabel.analysisTitle = self._analysisTitle + cLabel.lengthsString = segLenStr + cLabel.clusterSize = len(self.noise) + return cLabel + else: + # raises a ValueError if cluster is not known + cLabel = ClusterLabel(self.clusters.index(cluster)) + cLabel.analysisTitle = self._analysisTitle + cLabel.lengthsString = " ".join(SegmentClusterCauldron.truncateList({seg.length for seg in cluster})) + cLabel.clusterSize = len(cluster) + cLabel.maxDist = self.clusterer.distanceCalculator.distancesSubset(cluster).max() + return cLabel + + def exportAsTemplates(self): + fTypeTemplates = list() + for i in self.regularClusters.clusterIndices: + # generate FieldTypeTemplates (padded nans) - Templates as is + ftype = FieldTypeTemplate(self.unisegClusters.clusterElements(i)) + ftype.fieldtype = self.unisegClusters.clusterLabel(i) + fTypeTemplates.append(ftype) + # treat all singular clusters as one + singularElements = [element for i in self.singularClusters.clusterIndices + for element in self.singularClusters.clusterElements(i)] + if len(singularElements) > 0: + singularLabel = ClusterLabel( + "+".join(self.singularClusters[i][0].clusterNumber for i in self.singularClusters.clusterIndices)) + singularLabel.analysisTitle = "singular" + singularLabel.clusterSize = sum( + len(e.baseSegments) if isinstance(e, Template) else 1 for e in singularElements) + # noinspection PyArgumentList + singularLabel.maxDist = self.clusterer.distanceCalculator.distancesSubset(singularElements).max() + singularLabel.lengthsString = " ".join(SegmentClusterCauldron.truncateList({ + seg.length for seg in singularElements})) + ftype = FieldTypeTemplate(singularElements) + ftype.fieldtype = str(singularLabel) + fTypeTemplates.append(ftype) + return fTypeTemplates + + +class TypedSegmentClusterCauldron(SegmentClusterCauldron): + """ + Container class for results of the clustering of segments and the evaluation of the clustering result. + """ + noise: List[TypedSegment] + clusters: List[List[TypedSegment]] + + def __init__(self, clusterer: AbstractClusterer, analysisTitle: str): + """ + Cluster segments according to the distance of their feature vectors. + Keep and label segments classified as noise. + + :param clusterer: Clusterer object that contains all the segments to be clustered + :type analysisTitle: the string to be used as label for the result + """ + assert all(isinstance(seg, TypedSegment) for seg in clusterer.segments), \ + "This class is used for evaluating the result quality. Thus, its necessary to use segments that are " \ + "annotated with there true data type. See annotateFieldTypes()" + super().__init__(clusterer,analysisTitle) + + def segments2clusteredTypes(self): + """ + TODO replace nemere.inference.segmentHandler.segments2clusteredTypes in callers + + :return: List/Tuple structure of annotated analyses, clusters, and segments. + List [ of + Tuples ( + "analysis label", + List [ of cluster + Tuples ( + "cluster label", + List [ of segment + Tuples ( + "segment label (e. g. 
field type)", + MessageSegment object + ) + ] + ) + ] + ) + ] + """ + segmentClusters = list() + if self.noise: + segmentClusters.append(( + self._clusterLabel(self.noise), + type(self)._segmentsLabels(self.noise) + )) + for segs in self.clusters: + segmentClusters.append(( + self._clusterLabel(segs), + type(self)._segmentsLabels(segs) + )) + return [(self.analysisLabel(), segmentClusters)] + + @staticmethod + def _segmentsLabels(cluster: List[TypedSegment]): + """Generate a label for each segment in the given cluster. The order of segments is NOT retained!""" + typeGroups = segments2types(cluster) + labeledSegments = list() + for ftype, tsegs in typeGroups.items(): # [label, segments] + occurence = len(tsegs) + labeledSegments.extend([( + "{}: {} Seg.s".format(ftype, occurence), + tseg + ) for tseg in tsegs]) + return labeledSegments + + def _clusterLabel(self, cluster: List[TypedSegment]): + """ + Generate a label for the given cluster, containing its index number some statistics. + The method recognizes any known cluster and the noise per identity of the list of Segments. + """ + cLabel = super()._clusterLabel(cluster) + if cluster != self.noise: + cLabel.mostFrequentTypes = TypedSegmentClusterCauldron.getMostFrequentTypes(cluster) + return cLabel + + @staticmethod + def getMostFrequentTypes(cluster: List[TypedSegment]): + typeGroups = segments2types(cluster) + return sorted(((ftype, len(tsegs)) for ftype, tsegs in typeGroups.items()), + key=lambda x: -x[1]) + + def label4segment(self, seg: AbstractSegment) -> Union[bool, str]: + """ + Prepare string label of seg for usage in plot legend. Returns False if no information about + the segment is present in this instance. + """ + # simple case: directly in one cluster + if seg in self.noise: + return str(self._clusterLabel(self.noise)) + for i in self.unisegClusters.clusterIndices: + if seg in self.unisegClusters.clusterElements(i): + return self.unisegClusters.clusterLabel(i) + # complex case: seg is a template (that was not in a cluster directly) + # and we need to check all groups/clusters for the basesegments of the template (seg). + if isinstance(seg, Template): + inGroup = None # type: Union[None, str] + for bs in seg.baseSegments: + if bs in self.noise: + inGroup = str(self._clusterLabel(self.noise)) + for i in self.unisegClusters.clusterIndices: + if bs in self.unisegClusters.clusterElements(i): + name = self.unisegClusters.clusterLabel(i) + if inGroup is None or inGroup == name: + inGroup = name + else: + return "[mixed]" + if inGroup is not None: + return inGroup + else: + # not anywhere to be found + return False + # not anywhere to be found + return False + +class SegmentClusterContainer(MutableSequence): + """Container for Clusters of unique segments.""" + + _il3q = 99 # parameter: percentile q to remove extreme outliers from distances (for std calc) + _il3t = .4 # parameter: threshold for the ratio of stds to indicate a linear chain + _il4t = 3. # parameter: threshold for the ratio of the increase of the matrix traces of the sorted distance matrix + # in direction of the largest extent of the cluster + + def __init__(self, dc: DelegatingDC): + self._clusters = list() # type: List[Tuple[ClusterLabel, List[Tuple[str, TypedSegment]]]] + self._distanceCalculator = dc + + def insert(self, index: int, o: Tuple[ClusterLabel, List[Tuple[str, TypedSegment]]]) -> None: + # TODO check if o[1] is already in cluster #i and do not do anything (raise an Error?) 
except updating the element labels to [mixed] if they were not identical in o[1] and cluster[i][1] + self._clusters.insert(index, o) + + def __getitem__(self, i: int) -> Tuple[ClusterLabel, List[Tuple[str, TypedSegment]]]: + return self._clusters.__getitem__(i) + + def __setitem__(self, i: int, o: Tuple[ClusterLabel, List[Tuple[str, TypedSegment]]]) -> None: + # TODO check if o[1] is already in cluster #i and do not do anything (raise an Error?) except updating the element labels to [mixed] if they were not identical in o[1] and cluster[i][1] + self._clusters.__setitem__(i, o) + + def __delitem__(self, i: int) -> None: + self._clusters.__delitem__(i) + + def __len__(self) -> int: + return self._clusters.__len__() + + def __contains__(self, o: Tuple[ClusterLabel, List[Tuple[str, TypedSegment]]]): + # TODO check if o[1] is already in cluster #i and do not do anything (raise an Error?) except updating the element labels to [mixed] if they were not identical in o[1] and cluster[i][1] + return self._clusters.__contains__(o) + + def clusterContains(self, i: int, o: Tuple[str, TypedSegment]): + # TODO check if o[1] is already in cluster #i and do not do anything (raise an Error?) except updating the element labels to [mixed] if they were not identical in o[1] and cluster[i][1] + for lab, ele in self._clusters[i][1]: + if lab == o[0] and ele.values == o[1].values: + return True + return False + + def clusterIndexOfSegment(self, segment: TypedSegment): + for ci in self.clusterIndices: + if segment in self.clusterElements(ci): + return ci + return None + + def clusterLabel(self, i: int) -> str: + return str(self._clusters[i][0]) + + def clusterElements(self, i: int) -> List[TypedSegment]: + return [b for a,b in self._clusters[i][1]] + + @property + def clusters(self): + return self._clusters + + def dcSubMatrix(self, i: int): + return self._distanceCalculator.distancesSubset(self.clusterElements(i)) + + def __repr__(self): + return "\n".join(self.clusterLabel(i) for i in self.clusterIndices) + + # # # # # # # # # # # # # # # # # # # # # # # # # # # + + def _modClusterLabel(self, subc: List[Tuple[str, Union[AbstractSegment, Template]]], + hasGT: bool, clusterNumber: str, analysisTitle: str): + cLabel = ClusterLabel(clusterNumber) + cLabel.analysisTitle = analysisTitle + cLabel.lengthsString = " ".join(SegmentClusterCauldron.truncateList({seg.length for l, seg in subc})) + cLabel.clusterSize = sum(len(e.baseSegments) if isinstance(e, Template) else 1 for e in subc) + # noinspection PyArgumentList + cLabel.maxDist = self._distanceCalculator.distancesSubset([seg for l, seg in subc]).max() + if hasGT: + # noinspection PyTypeChecker + typedSegs = [seg for l, seg in subc] # type: List[TypedSegment] + cLabel.mostFrequentTypes = TypedSegmentClusterCauldron.getMostFrequentTypes(typedSegs) + return cLabel + + def splitOnOccurrence(self): + """ + Split clusters if they have extremely polarized occurrences + (e.g., many unique values, very few very high occurring values). + As pivot use ln(occSum). (Determination of a knee has some false positives + and is way too complex for its benefit). 
+ """ + splitClusters = list() + rankThreshold = 95 + for i in self.clusterIndices: + # if rank > 95 and std > ln(occSum) split at ln(occSum) + pivot = math.log(self.occurrenceSum(i)) + if self.occurrenceLnPercentRank(i) > rankThreshold \ + and numpy.std(self.occurrences(i)) > pivot: + hasGT = False + if all(isinstance(seg, (TypedSegment, TypedTemplate)) for seg in self.clusterElements(i)): + hasGT = True + # perform the splitting + subcA = list() # type: List[Tuple[str, Union[AbstractSegment, Template]]] + subcB = list() # type: List[Tuple[str, Union[AbstractSegment, Template]]] + for l,e in self[i][1]: + if isinstance(e, Template) and len(e.baseSegments) > pivot: + subcA.append((l,e)) + else: + subcB.append((l,e)) + + cLabel = self._modClusterLabel(subcA, hasGT, f"{self[i][0].clusterNumber}s0", "split") + splitClusters.append( (cLabel, subcA) ) + + cLabel = self._modClusterLabel(subcB, hasGT, f"{self[i][0].clusterNumber}s1", "split") + splitClusters.append( (cLabel, subcB ) ) + else: + splitClusters.append(self[i]) + self._clusters = splitClusters + return splitClusters + + def mergeOnDensity(self): + """Merge nearby (single-linked) clusters with very similar densities.""" + import warnings + + epsilonDensityThreshold = 0.01 + neighborDensityThreshold = 0.002 + + # median values for the 1-nearest neighbor ("minimum" distance). + minmedians = [numpy.median([self._distanceCalculator.neighbors(ce, self.clusterElements(i))[1][1] + for ce in self.clusterElements(i)]) + for i in self.clusterIndices] + maxdists = [self.maxDist(i) for i in self.clusterIndices] + + trils = [self.trilFlat(i) for i in self.clusterIndices] + cpDists = { (i, j): self._distanceCalculator.distancesSubset(self.clusterElements(i), self.clusterElements(j)) + for i, j in combinations(self.clusterIndices, 2) } + + # in case of empty distances, the median may be requested from an empty list. This is no problem, thus ignore. 
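        # Editor's sketch (not part of the patch): the pairwise merge candidates collected in the loop
        # below ("toMerge") are combined transitively through graph connectivity, so chains of mutually
        # close clusters collapse into a single merged cluster. A minimal illustration with the same
        # networkx calls used further down, for six hypothetical clusters and two candidate pairs:
        #
        #     from networkx import Graph, connected_components
        #     g = Graph()
        #     g.add_nodes_from(range(6))            # cluster indices 0..5
        #     g.add_edges_from([(0, 2), (2, 5)])    # hypothetical merge candidates (toMerge)
        #     list(connected_components(g))         # -> [{0, 2, 5}, {1}, {3}, {4}]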
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + vals = list() + for i, j in combinations(self.clusterIndices, 2): + # density in $\epsilon$-neighborhood around nearest points between similar clusters + # $\epsilon$-neighborhood: link segments (closest in two clusters) s_lc_ic_j + # d(s_lc_ic_j, s_k) <= $\epsilon$ for all (s_k in c_i) != s_lc_ic_j + # the nearest points ("link segments") between the clusters + coordmin = numpy.unravel_index(cpDists[(i, j)].argmin(), cpDists[(i, j)].shape) + # index of the smaller cluster + smallCluster = i if maxdists[i] < maxdists[j] else j + # extent of the smaller cluster + smallClusterExtent = maxdists[smallCluster] + # density as median distances in $\epsilon$-neighborhood with smallClusterExtent as $\epsilon$ + dists2linki = numpy.delete(self.dcSubMatrix(i)[coordmin[0]], coordmin[0]) + dists2linkj = numpy.delete(self.dcSubMatrix(j)[coordmin[1]], coordmin[1]) + densityi = numpy.median(dists2linki[dists2linki <= smallClusterExtent / 2]) + densityj = numpy.median(dists2linkj[dists2linkj <= smallClusterExtent / 2]) + + vals.append(( + (i,j), # 0: indices tuple + trils[i].mean(), None, minmedians[i], # 1*, 2, 3*: of distances in cluster i + None, # 4 + trils[j].mean(), None, minmedians[j], # 5*, 6, 7*: of distances in cluster j + cpDists[(i, j)].min(), # 8*: min of distances between i and j + None, # 9 + densityi, # 10*: density within epsilon around link segment in i + densityj # 11*: density within epsilon around link segment in j + )) + + # merge cluster conditions: areVeryCloseBy and linkHasSimilarEpsilonDensity or areSomewhatCloseBy and haveSimilarDensity + areVeryCloseBy = [bool(v[8] < v[1] or v[8] < v[5]) for v in vals] + linkHasSimilarEpsilonDensity = [bool(abs(v[10] - v[11]) < epsilonDensityThreshold) for v in vals] + # closer as the mean between both cluster's "densities" normalized to the extent of the cluster + areSomewhatCloseBy = [bool(v[8] < numpy.mean([v[3] / v[1], v[7] / v[5]])) for v in vals] + haveSimilarDensity = [bool(abs(v[3] - v[7]) < neighborDensityThreshold) for v in vals] + + # filter pairs of clusters to merge that + # areVeryCloseBy and linkHasSimilarEpsilonDensity + # or areSomewhatCloseBy and haveSimilarDensity + toMerge = [ + ij[0] for ij, ca1, ca2, cb1, cb2 in + zip(vals, areVeryCloseBy, linkHasSimilarEpsilonDensity, areSomewhatCloseBy, haveSimilarDensity) + if ca1 and ca2 or cb1 and cb2 + ] + + # determine chains of merging candidates by graph analysis + dracula = Graph() + dracula.add_nodes_from(self.clusterIndices) + dracula.add_edges_from(toMerge) + connectedDracula = list(connected_components(dracula)) + + # now actually merge + mergedClusters = list() # type: List[Tuple[ClusterLabel, List[Tuple[str, AbstractSegment]]]] + for connected in connectedDracula: + if len(connected) == 1: + mergedClusters.append(self[next(iter(connected))]) + else: + mc = list() # type: List[Tuple[str, TypedSegment]] + cnums = list() + for cid in connected: + cnums.append(self[cid][0].clusterNumber) + mc.extend(self[cid][1]) + + cLabel = self._modClusterLabel(mc, + all(isinstance(seg, (TypedSegment, TypedTemplate)) for seg in mc), + "+".join(cnums), "merged") + mergedClusters.append( ( cLabel, mc ) ) + self._clusters = mergedClusters + return mergedClusters + + # # # # # # # # # # # # # # # # # # # # # # # # # # # + + def trilFlat(self, i: int) -> numpy.ndarray: + """ + :param i: cluster index + :return: The values of the lower triangle of the distance matrix of the given cluster omitting the 
diagonal + as a list. + """ + return tril(self.dcSubMatrix(i)) + + def occurrences(self, i: int) -> List[int]: + """ + + You may put this list in a Counter: + >>> from collections import Counter + >>> from nemere.utils.baseAlgorithms import generateTestSegments + >>> from nemere.inference.templates import DelegatingDC + >>> from nemere.validation.clusterInspector import SegmentClusterContainer + >>> segments = generateTestSegments() + >>> dc = DelegatingDC(segments) + Calculated distances for 37 segment pairs in ... seconds. + >>> someclusters = SegmentClusterContainer(dc) + >>> someclusters.append(("Sample Cluster", list(("Some Type", s) for s in segments))) + >>> cnt = Counter(someclusters.occurrences(0)) + >>> # noinspection PyUnresolvedReferences + >>> sum(a*b for a,b in cnt.items()) == someclusters.occurrenceSum(0) + True + + :param i: Cluster index + :return: The numbers of occurrences of distinct values (not the values themselves!) + """ + return [len(e.baseSegments) if isinstance(e, Template) else 1 for e in self.clusterElements(i)] + + def occurrenceSum(self, i: int): + """ + The sum of occurrences equals the true number of segments including value duplicates. + + :param i: Cluster index + :return: The sum of occurrences of all values. + """ + return sum(self.occurrences(i)) + + def occurrenceLnPercentRank(self, i: int): + """ + "%rank of ln(sum(occ))": is a measure of the occurrence and value diversity. + The percent-rank (80% means that 80% of the scores in a are below the given score) for the occurrences + with the score ln(#elements). + + :param i: + :return: percent rank + """ + lnsumocc = math.log(self.occurrenceSum(i)) # ln of amount of all values (also identical ones) + return scipy.stats.percentileofscore(self.occurrences(i), lnsumocc) + + def distinctValues(self, i: int): + """ + :param i: Cluster index + :return: Number of differing values + """ + return len(self.clusterElements(i)) + + def maxDist(self, i: int): + """ + :param i: Cluster index + :return: The maximum distance between any two segments in cluster i. + """ + if self.distinctValues(i) < 2: + # If cluster contains only one template, we define the (maximum) distance to itself to be zero. + # (Less than 1 template/segment should not happen, but we handle it equally and do not fail, + # should it happen.) + return 0 + dist = self.trilFlat(i) # type: numpy.ndarray + # noinspection PyArgumentList + return dist.max() + + def remotestSegments(self, i: int): + """ + Determine the segment with the maximum sum of distances to all other segments (A) + and the segment farthest away from this (C). + + >>> from pprint import pprint + >>> from tabulate import tabulate + >>> from nemere.utils.baseAlgorithms import generateTestSegments + >>> from nemere.inference.templates import DistanceCalculator + >>> from nemere.validation.clusterInspector import SegmentClusterCauldron + >>> from nemere.inference.templates import DBSCANsegmentClusterer + >>> + >>> segments = generateTestSegments() + >>> dc = DelegatingDC(segments) + Calculated distances for 37 segment pairs in ... seconds. + >>> clusterer = DBSCANsegmentClusterer(dc, segments) + DBSCANsegmentClusterer: eps 0.476 autoconfigured (Kneedle on ECDF with S 0.8) from k 2 + >>> cauldron = SegmentClusterCauldron(clusterer, "DocTest") + Clustering segments... 
+ DBSCAN epsilon: 0.476, minpts: 2 + 1 clusters generated from 9 segments + >>> listOfClusters = cauldron.clustersOfUniqueSegments() + >>> pprint(listOfClusters) + [('DocTest (mixedamount bytes) DBSCAN eps 0.476 mpt 2', + [(Cluster #00: 6 Seg.s ($d_{max}$=0.440, 2 3 4 bytes), + [(None, MessageSegment 4 bytes at (0, 4): 01020304 | values: (1, 2, 3...), + (None, MessageSegment 3 bytes at (0, 3): 010304 | values: (1, 3, 4)), + (None, MessageSegment 2 bytes at (0, 2): 0203 | values: (2, 3)), + (None, MessageSegment 3 bytes at (0, 3): 020304 | values: (2, 3, 4)), + (None, MessageSegment 2 bytes at (0, 2): 0204 | values: (2, 4)), + (None, + MessageSegment 4 bytes at (0, 4): 03020304 | values: (3, 2, 3...)])])] + >>> vals = list() + >>> for i in range(len(cauldron.regularClusters)): + ... idxA, idxC, segA, segC = cauldron.regularClusters.remotestSegments(i) + ... cxDistances = cauldron.regularClusters.dcSubMatrix(i) + ... directAC = cxDistances[idxA, idxC] + ... vals.append(( + ... cauldron.regularClusters.clusterLabel(i), + ... cauldron.regularClusters.maxDist(i), + ... directAC, + ... cauldron.regularClusters.maxDist(i) == directAC # mostly but not always True + ... )) + >>> print(tabulate(vals)) + --------------------------------------------------- ------- ------- - + Cluster #00: 6 Seg.s ($d_{max}$=0.440, 2 3 4 bytes) 0.44043 0.44043 1 + --------------------------------------------------- ------- ------- - + + :param i: Cluster index + :return: The two segments as MessageSegment and the index in the cluster elements list self.clusterElements(i). + """ + distances = self.dcSubMatrix(i) # type: numpy.ndarray + idxA = distances.sum(0).argmax() + idxC = distances[idxA].argmax() + segA = self.clusterElements(i)[idxA] + segC = self.clusterElements(i)[idxC] + return idxA, idxC, segA, segC + + def elementLengths(self, i: int) -> numpy.ndarray: + segLens = numpy.array(list({e.length for e in self.clusterElements(i)})) + return segLens + + def charSegmentCount(self, i: int): + return len(filterChars(self.clusterElements(i))) + + @staticmethod + def mostFrequentTypes(cluster: List[Tuple[str, TypedSegment]]): + segLabelExtractor = re.compile(r"(\w*): (\d*) Seg.s") + allLabels = {l for l,e in cluster} + typeStats = [next(segLabelExtractor.finditer(l)).groups() for l in allLabels] + mostFrequentTypes = sorted(((ftype, int(tsegs)) for ftype, tsegs in typeStats), + key=lambda x: -x[1]) + return mostFrequentTypes + + def distancesSortedByLargestExtent(self, i: int): + smi = self.dcSubMatrix(i) + dfari = smi[self.remotestSegments(i)[0]] + cmi = self.clusterElements(i) + idxi = sorted(range(len(cmi)), key=lambda x: dfari[x]) + sfari = [cmi[e] for e in idxi] + return self._distanceCalculator.distancesSubset(sfari) + + def traceMeanDiff(self, i: int): + """ + Difference of the first and the mean of all other diffs of all $k$-traces (sums of the + $k$th superdiagonals) of the sorted distance matrix for the segments in this cluster. + It is sorted by the distance from the remotest segment in the cluster. 
+ + :param i: + :return: + """ + # + # + sortedSMi = self.distancesSortedByLargestExtent(i) + trMeans = [sortedSMi.trace(k) / (sortedSMi.shape[0] - k) for k in range(sortedSMi.shape[0])] + return trMeans[1] / numpy.diff(trMeans[1:]).mean() + # plt.pcolormesh(sortedSMi) + # plt.title(cauldron.regularClusters.clusterLabel(i)) + # plt.show() + # print(cauldron.regularClusters.clusterLabel(i)) + + # # # # # # # # # # # # # # # # # # # # # # # # # # # + + + + @property + def clusterIndices(self): + return range(len(self)) + + +class SegmentClusters(SegmentClusterContainer): + + def plotDistances(self, i: int, specimens: SpecimenLoader, comparator=None): + if len(self.clusterElements(i)) < 2: + print("Too few elements to plot in", self.clusterLabel(i), "Ignoring it.") + return + dists = self.dcSubMatrix(i) + postfix = "" + fnLabels = None + if comparator: + fnLabels = [set([comparator.lookupField(bs)[1] for bs in seg.baseSegments] + if isinstance(seg, Template) else [comparator.lookupField(seg)[1]]) for seg in dists] + fnLabels = [next(iter(fl)) if len(fl) == 1 else repr(fl)[1:-1].replace("'", "") for fl in fnLabels] + postfix = "-fnLabeled" + sdp = DistancesPlotter(specimens, f"distances-cluster{i}" + postfix, False) + if comparator: + sdp.plotManifoldDistances(self.clusterElements(i), dists, numpy.array(fnLabels)) + else: + idxA, idxC, segA, segC = self.remotestSegments(i) + labels = [None]*len(self.clusterElements(i)) + labels[idxA] = "A" + labels[idxC] = "C" + sdp.plotManifoldDistances(self.clusterElements(i), dists, numpy.array(labels)) + sdp.axesFlat[1].set_title(self.clusterLabel(i)) + sdp.writeOrShowFigure() diff --git a/src/nemere/validation/dissectorMatcher.py b/src/nemere/validation/dissectorMatcher.py index a968a916..e133ea50 100644 --- a/src/nemere/validation/dissectorMatcher.py +++ b/src/nemere/validation/dissectorMatcher.py @@ -83,14 +83,17 @@ class MessageComparator(BaseComparator): __messageCellCache = dict() # type: Dict[(netzob.Symbol, AbstractMessage), List] def __init__(self, specimens: sl.SpecimenLoader, layer: int = -1, relativeToIP: bool = False, - failOnUndissectable=True, debug = False): + failOnUndissectable=True, debug = False, dissectOneshot=True): super().__init__(specimens, layer, relativeToIP, debug) self._failOnUndissectable = failOnUndissectable # Cache messages that already have been parsed and labeled self._messageCache = dict() # type: Dict[netzob.RawMessage, ] - self._dissections = self._dissectAndLabel(self.messages.values()) + if dissectOneshot: + self._dissections = ParsedMessage.parseOneshot(specimens, failOnUndissectable) + else: + self._dissections = self._dissectAndLabel(self.messages.values()) def _dissectAndLabel(self, messages: Iterable[netzob.RawMessage]) \ @@ -246,6 +249,8 @@ def pprint2Interleaved(self, message: AbstractMessage, segmentsPerMsg: Sequence[ mark: Union[Tuple[int,int], MessageSegment]=None, messageSlice: Tuple[Union[int,None],Union[int,None]]=None): """ + TODO deprecated: use ComparingPrinter directly! + :param message: The message from which to print the byte hex values. Also used to look up the true field boundaries to mark by spaces between in the printed byte hex values. :param segmentsPerMsg: The segments that should be visualized by color changes. 
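Editor's note (illustrative sketch, not part of the patch): the dissectOneshot switch added to MessageComparator above selects between the new single-pass dissection, where ParsedMessage.parseOneshot() runs tshark once over the whole PCAP, and the previous per-message live tshark dissection. A minimal usage sketch mirroring the doctest setup of this module; the PCAP path is only an example:

    specimens = SpecimenLoader("../input/deduped-orig/ntp_SMIA-20111010_deduped-100.pcap", 2, True)
    oneshot = MessageComparator(specimens, 2, True, debug=False)                        # dissectOneshot=True is the default
    legacy  = MessageComparator(specimens, 2, True, debug=False, dissectOneshot=False)  # per-message live tshark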
@@ -570,7 +575,6 @@ def lookupValues4FieldName(self, fieldName: str): >>> from collections import Counter >>> specimens = SpecimenLoader("../input/deduped-orig/ntp_SMIA-20111010_deduped-100.pcap", 2, True) >>> comparator = MessageComparator(specimens, 2, True, debug=False) - Wait for tshark output (max 20s)... >>> lv = comparator.lookupValues4FieldName("ntp.ppoll") >>> Counter(lv).most_common() [('0a', 43), ('06', 41), ('09', 6), ('0e', 4), ('08', 2), ('0f', 2), ('0d', 2)] diff --git a/src/nemere/validation/messageParser.py b/src/nemere/validation/messageParser.py index 712dd3fa..8aec7f0f 100644 --- a/src/nemere/validation/messageParser.py +++ b/src/nemere/validation/messageParser.py @@ -14,7 +14,7 @@ from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage, AbstractMessage from nemere.validation import protocols -from nemere.validation.tsharkConnector import TsharkConnector +from nemere.validation.tsharkConnector import TsharkConnector, TsharkOneshot # TODO make more central @@ -32,6 +32,24 @@ def __init__(self, compatibleProtocols: Sequence[Type['ParsingConstants']]): self.importProtocol(p.MESSAGE_TYPE_IDS) FOR_PROTCOL = dict() + """ + A message type discriminator may be a field name string (from the tshark dissector) or a dict: + + ``` + { + 'field': 'fieldname', + 'filter': lambda v: v != 'ff', + 'select': lambda w: w + } + ``` + + field: is the field name string to apply the rule to, + filter: is a function with one parameter containing the value of a single message's field + and it needs to return a Boolean: True if the field is a discriminator, False if it should be ignored + select: is a function with one parameter containing the value of a single message's field + and it needs to return a transformation of the value, e.g. a single bit of it, that should be used as + message type discriminating value. + """ NAMED_TYPES = dict() def _collect_superclasses(self): @@ -47,7 +65,9 @@ def __resolveTypeName(self, fieldname: str, fieldvalue: str): else "{}={}".format(fieldname, fieldvalue) def typeOfMessage(self, message: 'ParsedMessage'): - """Retrieve the type of the given message""" + """ + Retrieve the type of the given message from the FOR_PROTCOL dict. 
+ """ if message.protocolname in self.FOR_PROTCOL: idFields = self.FOR_PROTCOL[message.protocolname] resolvedTypeName = [] @@ -129,7 +149,14 @@ class MessageTypeIdentifiers226(MessageTypeIdentifiers): 'filter': lambda v: True, 'select': lambda w: (int.from_bytes(bytes.fromhex(w), "big") & 128) != 0 # first bit denotes request/response }], - 'ntp' : ['ntp.flags', 'ntp.stratum'] + # 'ntp' : ['ntp.flags', 'ntp.stratum'] + 'ntp': [ { + 'field': 'ntp.flags', + 'filter': lambda v: True, + 'select': lambda w: int.from_bytes(bytes.fromhex(w), "big") & 0xc7 # mask out the version + # select for only leap indicator - gt_liandmodev1: + # int.from_bytes(bytes.fromhex(w), "big") >>6 # The first two bits is the leap indicator we use + } ] } NAMED_TYPES = { # assumes hex bytes are lower-case @@ -187,21 +214,21 @@ class MessageTypeIdentifiers226(MessageTypeIdentifiers): '3010': 'Release', '8500': 'Response', }, - 'ntp.flags': { - '13': 'v2 client', - '19': 'v3 symmetric active', - '1b': 'v3 client', - '1c': 'v3 server', - '23': 'v4 client', - '24': 'v4 server', - '25': 'v4 broadcast', - 'd9': 'v3 symmetric active (unsynchronized, MAC)', - 'db': 'v3 client (unsynchronized)', - 'dc': 'v3 server (unsynchronized)', - 'e3': 'v4 client (unsynchronized, MAC)', - 'e4': 'v4 server (unsynchronized)', - 'e5': 'v4 broadcast (unsynchronized)', - }, + # 'ntp.flags': { + # '13': 'v2 client', + # '19': 'v3 symmetric active', + # '1b': 'v3 client', + # '1c': 'v3 server', + # '23': 'v4 client', + # '24': 'v4 server', + # '25': 'v4 broadcast', + # 'd9': 'v3 symmetric active (unsynchronized, MAC)', + # 'db': 'v3 client (unsynchronized)', + # 'dc': 'v3 server (unsynchronized)', + # 'e3': 'v4 client (unsynchronized, MAC)', + # 'e4': 'v4 server (unsynchronized)', + # 'e5': 'v4 broadcast (unsynchronized)', + # }, # 'ntp.stratum': { # '00': '', # '03': '', @@ -209,6 +236,20 @@ class MessageTypeIdentifiers226(MessageTypeIdentifiers): # '05': '', # '06': '', # } + # 'ntp.flags': { # only leap indicator - gt_liandmodev1 + # 0: 'synchronized', + # 3: 'unsynchronized', + # }, + 'ntp.flags': { # leap indicator and mode + 3: 'client synchronized', + 1: 'client synchronized', # 'symmetric active synchronized' + 4: 'server synchronized', + 5: 'broadcast synchronized', + 193: 'client unsynchronized', # 'symmetric active unsynchronized' + 195: 'client unsynchronized', + 196: 'server unsynchronized', + 197: 'broadcast unsynchronized' + }, } # type: Dict[str, Dict[str, str]] @@ -448,11 +489,11 @@ class ParsingConstants226(ParsingConstants): 'nbdgm.first_raw', 'nbdgm.node_type_raw', 'smb.security_blob_raw', 'gss-api_raw', 'spnego_raw', 'spnego.negTokenInit_element_raw', 'spnego.mechTypes_raw', 'ntlmssp_raw', 'ntlmssp.version_raw', 'ntlmssp.challenge.target_name_raw', - 'ntlmssp.challenge.target_info_raw' + 'ntlmssp.challenge.target_info_raw', 'browser.windows_version_raw' ] EXCLUDE_SUB_FIELDS = [ - 'dns.flags_tree', 'ntp.flags_tree', + 'dns.flags_tree', 'dns.id_tree', 'ntp.flags_tree', 'bootp.flags_tree', 'bootp.fqdn.flags_tree', 'bootp.secs_tree', 'smb.flags_tree', 'smb.flags2_tree', 'smb.sm_tree', 'smb.server_cap_tree', 'nbns.flags_tree', 'nbns.nb_flags_tree', @@ -465,7 +506,7 @@ class ParsingConstants226(ParsingConstants): 'smb.fs_attr_tree', 'smb.nt.notify.completion_filter_tree', 'smb2.ioctl.function_tree', 'smb.nt.ioctl.completion_filter_tree', 'smb2.ioctl.function_tree', - 'smb.nt.ioctl.completion_filter_tree', 'smb.lock.type_tree' + 'smb.nt.ioctl.completion_filter_tree', 'smb.lock.type_tree', 'smb.nt_qsd_tree' ] # names of field nodes 
in the json which should be descended into. @@ -495,13 +536,19 @@ class ParsingConstants226(ParsingConstants): 'NT Trans Response (0xa0)', 'Trans2 Response (0x32)', 'NT IOCTL Setup', 'NT IOCTL Data', 'Range', 'Write AndX Request (0x2f)', 'Write AndX Response (0x2f)', 'Locking AndX Request (0x24)', 'Locking AndX Response (0x24)', 'Echo Request (0x2b)', 'Echo Response (0x2b)', - 'Unlocks', 'Unlock', 'Locks', 'Lock', 'SET_FILE_INFO Parameters', 'SET_FILE_INFO Data' + 'Unlocks', 'Unlock', 'Locks', 'Lock', 'SET_FILE_INFO Parameters', 'SET_FILE_INFO Data', + 'NT QUERY SECURITY DESC Parameters', 'NT QUERY SECURITY DESC Data', 'NT Security Descriptor', + 'NT User (DACL) ACL' # 'dcerpc.cn_ctx_item', 'dcerpc.cn_bind_abstract_syntax', 'dcerpc.cn_bind_trans', # 'smb.security_blob_tree', 'gss-api', # 'spnego', 'spnego.negTokenInit_element', 'spnego.mechTypes_tree', 'spnego.negHints_element', # 'ntlmssp', 'ntlmssp.version', 'ntlmssp.challenge.target_name_tree', 'ntlmssp.challenge.target_info', # 'Servers', 'lanman.server_tree' - ] + ] + + INCLUDE_SUBFIELDS_RE = [re.compile(pattern) for pattern in [ + 'NT ACE: .*' + ]] # names of field nodes in the json that have a record structure (list[list[tuples], not list[tuples[str, tuple]]). RECORD_STRUCTURE = ['Queries', 'Answers', # in dns, nbns @@ -532,6 +579,13 @@ class ParsingConstants226(ParsingConstants): TYPELOOKUP['ntp.priv.auth_seq'] = 'int' # has value: 97 TYPELOOKUP['ntp.priv.impl'] = 'int' # has value: 00 TYPELOOKUP['ntp.priv.reqcode'] = 'int' # has value: 00 + TYPELOOKUP['ntp.ctrl.flags2'] = 'flags' # has value: 82 + TYPELOOKUP['ntp.ctrl.sequence'] = 'int' # has value: 0001 + TYPELOOKUP['ntp.ctrl.status'] = 'flags' # has value: 0615 + TYPELOOKUP['ntp.ctrl.associd'] = 'id' # has value: 0000 + TYPELOOKUP['ntp.ctrl.offset'] = 'int' # has value: 0000 + TYPELOOKUP['ntp.ctrl.count'] = 'int' # has value: 0178 + TYPELOOKUP['ntp.ctrl.data'] = 'chars' # has value: 7665...0d0a # dhcp TYPELOOKUP['bootp.type'] = 'flags' # or enum @@ -620,6 +674,12 @@ class ParsingConstants226(ParsingConstants): TYPELOOKUP['dns.aaaa'] = 'ipv6' TYPELOOKUP['dns.cname'] = 'chars' + # eth + TYPELOOKUP['eth.addr'] = 'macaddr' + TYPELOOKUP['eth.dst'] = 'macaddr' + TYPELOOKUP['eth.src'] = 'macaddr' + TYPELOOKUP['eth.type'] = 'int' # unsigned integer, 2 bytes + # irc TYPELOOKUP['irc.request.prefix'] = 'chars' TYPELOOKUP['irc.request.command'] = 'chars' @@ -637,20 +697,6 @@ class ParsingConstants226(ParsingConstants): TYPELOOKUP['smtp.response.code'] = 'chars' # has value: 323230 TYPELOOKUP['smtp.rsp.parameter'] = 'chars' - # smb - TYPELOOKUP['smb.server_component'] = 'id' # has value: ff534d42 = ".SMB" - TYPELOOKUP['smb.cmd'] = 'int' # has value: 73 - TYPELOOKUP['smb.nt_status'] = 'int' # has value: 00000000 - TYPELOOKUP['smb.flags'] = 'flags' # has value: 18 - TYPELOOKUP['smb.flags2'] = 'flags' # has value: 07c8 - TYPELOOKUP['smb.pid.high'] = 'id' # has value: 0000 - TYPELOOKUP['smb.signature'] = 'crypto' # has value: 4253525350594c20 - TYPELOOKUP['smb.reserved'] = 'int' # has value: 0000 - TYPELOOKUP['smb.tid'] = 'id' # 'id' behaves like flags # has value: 0000 - TYPELOOKUP['smb.pid'] = 'id' # 'id' behaves like flags # has value: fffe - TYPELOOKUP['smb.uid'] = 'id' # 'id' behaves like flags # has value: 0000 - TYPELOOKUP['smb.mid'] = 'id' # 'id' behaves like flags # has value: 4000 - # nbns TYPELOOKUP['nbns.id'] = 'id' TYPELOOKUP['nbns.flags'] = 'flags' # has value: 0110 @@ -669,11 +715,23 @@ class ParsingConstants226(ParsingConstants): # smb - mostly little endian numbers 
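    # Editor's sketch (not part of the patch): the new INCLUDE_SUBFIELDS_RE above complements the plain
    # INCLUDE_SUBFIELDS name list with regular expressions, so dissector subtrees with variable titles
    # such as the per-entry 'NT ACE: ...' nodes can also be descended into. Assuming the patterns are
    # applied with re.Pattern.match() against the node name:
    #
    #     import re
    #     patterns = [re.compile('NT ACE: .*')]
    #     any(p.match('NT ACE: World (Access Allowed)') for p in patterns)   # -> True
    #     any(p.match('NT User (DACL) ACL') for p in patterns)               # -> False (covered by the name list instead)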
TYPELOOKUP['nbss.type'] = 'id' # has value: 00 TYPELOOKUP['nbss.length'] = 'int' # has value: 000038 + TYPELOOKUP['smb.server_component'] = 'addr' # has value: ff534d42 = ".SMB" # somewhat similar to a addr + TYPELOOKUP['smb.cmd'] = 'int' # has value: 73 + TYPELOOKUP['smb.nt_status'] = 'int' # has value: 00000000 + TYPELOOKUP['smb.flags'] = 'flags' # has value: 18 + TYPELOOKUP['smb.flags2'] = 'flags' # has value: 07c8 + TYPELOOKUP['smb.pid.high'] = 'id' # has value: 0000 + TYPELOOKUP['smb.signature'] = 'crypto' # has value: 4253525350594c20 + TYPELOOKUP['smb.reserved'] = 'int' # has value: 0000 + TYPELOOKUP['smb.tid'] = 'id' # 'id' behaves like flags # has value: 0000 + TYPELOOKUP['smb.pid'] = 'id' # 'id' behaves like flags # has value: fffe + TYPELOOKUP['smb.uid'] = 'id' # 'id' behaves like flags # has value: 0000 + TYPELOOKUP['smb.mid'] = 'id' # 'id' behaves like flags # has value: 4000 TYPELOOKUP['smb.wct'] = 'int' # has value: 07 TYPELOOKUP['smb.andxoffset'] = 'int_le' # has value: 3800 - little endian TYPELOOKUP['smb.connect.support'] = 'int_le' # has value: 0100 TYPELOOKUP['smb.bcc'] = 'int_le' # has value: 0700 (Byte count) - TYPELOOKUP['smb.service'] = 'enum' # its coded as 8 bit ASCII 'chars', e.g: 49504300 - http://ubiqx.org/cifs/Book.html p. 311 + TYPELOOKUP['smb.service'] = 'chars' # its coded as 8 bit ASCII 'chars', e.g: 49504300 - http://ubiqx.org/cifs/Book.html p. 311 TYPELOOKUP['smb.native_fs'] = 'chars' # has value: 0000 TYPELOOKUP['smb.tpc'] = 'int_le' # has value: 1a00 TYPELOOKUP['smb.tdc'] = 'int_le' # has value: 0000 @@ -709,7 +767,7 @@ class ParsingConstants226(ParsingConstants): TYPELOOKUP['smb.unknown_data'] = 'unknown' # has value: 00000000 TYPELOOKUP['smb.max_buf'] = 'int' # has value: 0411 TYPELOOKUP['smb.max_mpx_count'] = 'int_le' # has value: 3200 - TYPELOOKUP['smb.vc'] = 'int' # has value: 0000 + TYPELOOKUP['smb.vc'] = 'id' # has value: 0000 # virtual circuits (VCs) are often identical to the pid TYPELOOKUP['smb.session_key'] = 'bytes' # has value: 00000000 TYPELOOKUP['smb.security_blob_len'] = 'int_le' # has value: 6b00 TYPELOOKUP['smb.server_cap'] = 'flags' # has value: d4000080 @@ -817,6 +875,25 @@ class ParsingConstants226(ParsingConstants): TYPELOOKUP['smb2.ioctl.enumerate_snapshots.num_snapshots'] = 'int_le' # has value: 00000000 TYPELOOKUP['smb2.ioctl.enumerate_snapshots.num_snapshots_returned'] = 'int_le' # has value: 00000000 TYPELOOKUP['smb2.ioctl.enumerate_snapshots.array_size'] = 'int_le' # has value: 02000000 + TYPELOOKUP['smb.trans_data.parameters'] = 'bytes' # has value: 000093ff04000400 + TYPELOOKUP['smb.nt_qsd'] = 'int_le' + TYPELOOKUP['smb.sec_desc_len'] = 'int_le' # has value: bc000000 + TYPELOOKUP['nt.sec_desc.revision'] = 'int_le' # has value: 0100 + TYPELOOKUP['nt.sec_desc.type'] = 'enum' # has value: 0484 + TYPELOOKUP['nt.offset_to_owner_sid'] = 'id' # has value: 00000000 + TYPELOOKUP['nt.offset_to_group_sid'] = 'id' # has value: 00000000 + TYPELOOKUP['nt.offset_to_sacl'] = 'int_le' # has value: 00000000 + TYPELOOKUP['nt.offset_to_dacl'] = 'int_le' # has value: 14000000 + TYPELOOKUP['nt.acl.revision'] = 'int_le' # has value: 0200 + TYPELOOKUP['nt.acl.size'] = 'int_le' # has value: a800 + TYPELOOKUP['nt.acl.num_aces'] = 'int_le' # has value: 06000000 + TYPELOOKUP['nt.ace.type'] = 'enum' # has value: 00 + TYPELOOKUP['nt.ace.flags'] = 'flags' # has value: 10 + TYPELOOKUP['nt.ace.size'] = 'int_le' # has value: 1400 + TYPELOOKUP['nt.access_mask'] = 'flags' # has value: ff011f00 + TYPELOOKUP['nt.sid'] = 'id' # has value: 
010500000000000515000000ff424cbf49cbe5ae01ea0c4af4010000 + TYPELOOKUP['nt.access_mask'] = '???' # has value: ff011f00 + # TODO enable reuse by providing the original field name to each hook @@ -1161,6 +1238,7 @@ class ParsingConstants325(ParsingConstants263): TYPELOOKUP['dhcp.option.vendor_class_id'] = 'chars' # has value: 4d53465420352e30 TYPELOOKUP['dhcp.option.vendor.value'] = 'bytes' # has value: 5e00 TYPELOOKUP['dhcp.option.request_list_item'] = 'enum' # uint; has value: 01 + TYPELOOKUP['dhcp.option.request_list'] = 'bytes' # has value: 010f03062c2e2f1f2179f92b TYPELOOKUP['dhcp.option.broadcast_address'] = 'ipv4' # has value: ac1203ff TYPELOOKUP['dhcp.option.dhcp_server_id'] = 'ipv4' # has value: ac120301 TYPELOOKUP['dhcp.option.ip_address_lease_time'] = 'int' # uint; has value: 00000e10 @@ -1245,7 +1323,7 @@ class ParsedMessage(object): RK = '_raw' - __tshark = None # type: TsharkConnector + __tshark = None # type: Union[TsharkConnector, TsharkOneshot] """Cache the last used tsharkConnector for reuse.""" __constants = None @@ -1487,6 +1565,29 @@ def _parseMultiple(messages: List[RawMessage], target = None, layer=-1, relative return prsdmsgs # type: dict[AbstractMessage: ParsedMessage] + @classmethod + def parseOneshot(cls, specimens, failOnUndissectable=True): + cls.__tshark = TsharkOneshot() + jsontext = cls.__tshark.readfile(specimens.pcapFileName) + dissectjson = json.loads(jsontext, object_pairs_hook=list) + prsdmsgs = {} + + for paketjson, msg in zip(dissectjson, specimens.messagePool.values()): + # Prevent individual tshark call for parsing by creating a + # ParsedMessage with message set to None... + pm = ParsedMessage(None, layernumber=specimens.layer, relativeToIP=specimens.relativeToIP, + failOnUndissectable=failOnUndissectable) + # ... and set the message afterwards + pm.message = msg + pm._parseJSON([paketjson]) + # assert "".join(pm.getFieldValues()) == msg.data.hex(), \ + # f"msg data and dissector mismatch:\n{msg.data.hex()}\n{''.join(pm.getFieldValues())}" + # TODO validate correct msg to pm association (via byte data) + prsdmsgs[msg] = pm + + return prsdmsgs # type: dict[AbstractMessage: ParsedMessage] + + def _parseJSON(self, dissectjson: List[Tuple[str, any]]): """ Read the structure of dissectjson and from this populate: @@ -1684,6 +1785,13 @@ def _reassemblePostProcessing(self): field was not parsed correctly. This needs manual postprosessing of the dissector output. Therefore see :func:`_prehooks` and :func:`_posthooks`. 
""" + # fix ari field seuquence - TODO find better location for that + for ix, fk in enumerate([fk for fk, fv in self._fieldsflat]): + if fk == 'ari.length' and ix > 0 and self._fieldsflat[ix-1][0] == 'ari.message_name': + arilen = self._fieldsflat[ix] + self._fieldsflat[ix] = self._fieldsflat[ix-1] + self._fieldsflat[ix-1] = arilen + rest = str(self.protocolbytes) toInsert = list() # iterate all fields and search their value at the next position in the raw message @@ -1730,6 +1838,9 @@ def _reassemblePostProcessing(self): if len(rest) <= 4: # 2 bytes in hex notation self._fieldsflat.append(('delimiter', rest)) + # the dissector failed for this packet + elif len(self._fieldsflat) == 0: + self._fieldsflat.append(('data.data', rest)) # for strange smb trails (perhaps some kind of undissected checksum): elif self._fieldsflat[-1][0] in ['smb2.ioctl.shadow_copy.count', 'smb2.ioctl.enumerate_snapshots.array_size', 'smb.trans_name']: @@ -1740,6 +1851,37 @@ def _reassemblePostProcessing(self): else: # a two byte delimiter is probably still reasonable raise DissectionIncomplete("Unparsed trailing field found. Value: {:s}".format(rest), rest=rest) + # make some final adjustments if necessary # TODO move to ParsingConstants325 + needsMerging = { # merge all adjacent fields named like to one field named + "dhcp.option.request_list_item": "dhcp.option.request_list" + } + needsSplitting = { # split all fields named into chunks of length [0] bytes named [1] + "awdl_pd.mfbuf.chunk_data": (4, "awdl_pd.mfbuf.sample_component") + } + for mergeFrom, mergeTo in needsMerging.items(): + # prevent needless list copying + if not mergeFrom in self.getFieldNames(): + continue + newFieldsflat = list() + for field in self._fieldsflat: + if field[0] == mergeFrom and newFieldsflat[-1][0] in [mergeFrom, mergeTo]: + newFieldsflat[-1] = (mergeTo, newFieldsflat[-1][1] + field[1]) + else: + newFieldsflat.append(field) + self._fieldsflat = newFieldsflat + for splitFrom, splitTo in needsSplitting.items(): + # prevent needless list copying + if not splitFrom in self.getFieldNames(): + continue + newFieldsflat = list() + for field in self._fieldsflat: + if field[0] == splitFrom: + n = splitTo[0]*2 # chunk length (cave HEX string! Thus 2 times) + chunks = [(splitTo[1], field[1][i:i + n]) for i in range(0, len(field[1]), n)] + newFieldsflat.extend(chunks) + else: + newFieldsflat.append(field) + self._fieldsflat = newFieldsflat @staticmethod def _nodeValue(node) -> Tuple[int, Union[str, List]]: @@ -1768,6 +1910,7 @@ def _nodeValue(node) -> Tuple[int, Union[str, List]]: @staticmethod def walkSubTree(root: List[Tuple[str, any]], allSubFields=False) -> List[Tuple[str, str]]: + # noinspection PyUnresolvedReferences """ Walk the tree structure of the tshark-json, starting from ``root`` and generate a flat representation of the field sequence as it is in the message. 
@@ -1902,7 +2045,7 @@ def __getCompatibleConstants(cls) -> ParsingConstants: @staticmethod def closetshark(): - if ParsedMessage.__tshark: + if isinstance(ParsedMessage.__tshark,TsharkConnector): ParsedMessage.__tshark.terminate(2) diff --git a/src/nemere/validation/netzobFormatMatchScore.py b/src/nemere/validation/netzobFormatMatchScore.py index 253bcd83..44dcc5e4 100644 --- a/src/nemere/validation/netzobFormatMatchScore.py +++ b/src/nemere/validation/netzobFormatMatchScore.py @@ -99,15 +99,14 @@ def minMaxMean(formatmatchmetrics: Dict[Tuple[int, AbstractMessage], FormatMatch print("Empty inferences ignored:", countEmpty) return {th: (numpy.min(sc), numpy.max(sc), numpy.mean(sc)) for th, sc in thrScores.items()} - - def printMinMax(self, - formatmatchmetrics: Dict[Tuple[int, AbstractMessage], FormatMatchScore]): + @classmethod + def printMinMax(cls, formatmatchmetrics: Dict[Tuple[int, AbstractMessage], FormatMatchScore]): """ Print the Format Match Score min/max per threshold. :param formatmatchmetrics: Dict[Threshold, Message], FormatMatchScore] """ - mmm = self.minMaxMean(formatmatchmetrics) + mmm = cls.minMaxMean(formatmatchmetrics) qualmatrix = [["Thresh"], ["min"], ["max"], ["mean"]] for th, (minft, maxft, meanft) in mmm.items(): diff --git a/src/nemere/validation/protocols/ari.py b/src/nemere/validation/protocols/ari.py new file mode 100644 index 00000000..a35e58f1 --- /dev/null +++ b/src/nemere/validation/protocols/ari.py @@ -0,0 +1,47 @@ +from ..messageParser import ParsingConstants, MessageTypeIdentifiers + +class MTID_ARI(MessageTypeIdentifiers): + FOR_PROTCOL = { + 'ari': [ 'ari.message_name' ] + } + NAMED_TYPES = {} + +class ARI(ParsingConstants): + COMPATIBLE_TO = b'3.2.5' + MESSAGE_TYPE_IDS = MTID_ARI + + IGNORE_FIELDS = ['_ws.lua.text_raw', 'ari.message_id_raw', 'ari.gmid_raw', 'ari.seq_num_raw', 'ari.ack_opt_raw', + 'ari.unknown_4_raw', 'ari.unknown_8_raw', 'ari.unknown_10_raw', + 'ari.tlv.mandatory_raw', 'ari.tlv.codec.name_raw', 'ari.tlv.type_desc_raw', + 'ari.tlv.unknown_0_raw', 'ari.tlv.unknown_2_raw', 'ari.tlv.data_uint_raw', + 'ari.ibiuint8.value_raw', 'ari.ibiuint16.value_raw', 'ari.ibiuint32.value_raw', + 'ari.utauint8.value_raw', 'ari.utauint16.value_raw', 'ari.utauint32.value_raw', + 'ari.tlv.version_raw', 'ari.tlv.data_asstring_uint_value_raw', + 'gsm_sms_raw', 'ari.ibibool.value_raw', 'ari.utabool.value_raw', + 'ari.ibiltecellinfot.index_raw', 'ari.ibiltecellinfot.mcc_raw', 'ari.ibiltecellinfot.mnc_raw', + 'ari.ibiltecellinfot.band_info_raw', 'ari.ibiltecellinfot.area_code_raw', + 'ari.ibiltecellinfot.cell_id_raw', 'ari.ibiltecellinfot.earfcn_raw', 'ari.ibiltecellinfot.pid_raw', + 'ari.ibiltecellinfot.latitude_raw', 'ari.ibiltecellinfot.longitude_raw', + 'ari.ibiltecellinfot.bandwidth_raw', 'ari.ibiltecellinfot.deployment_type_raw', + ] + EXCLUDE_SUB_FIELDS = ['ari.gmid', 'ari.seq_num', 'gsm_sms'] + INCLUDE_SUBFIELDS = ['_ws.lua.text'] + RECORD_STRUCTURE = [] + + # mapping of field names to general value types. 
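    # Editor's note (illustration, not part of the patch): MTID_ARI above names ari.message_name as the
    # only discriminator for 'ari' and leaves NAMED_TYPES empty, so the message type falls back to the
    # generic "fieldname=value" label produced by MessageTypeIdentifiers.__resolveTypeName(), e.g.:
    #
    #     fieldname, fieldvalue = 'ari.message_name', '07e7'     # hypothetical dissected value
    #     "{}={}".format(fieldname, fieldvalue)                  # -> 'ari.message_name=07e7'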
+ TYPELOOKUP = dict() + """:type: Dict[str, str]""" + + # {'int', 'chars', 'bytes', 'checksum', 'crypto', 'flags', '???', 'macaddr', 'addr', 'ipv6', 'int_le', + # 'timestamp', 'enum', 'unknown', 'pad', 'id', 'ipv4', 'timestamp_le'} + TYPELOOKUP['ari.proto_flag'] = 'flags' # has value: dec07eab + TYPELOOKUP['ari.gid'] = 'id' # has value: 98c3 + TYPELOOKUP['ari.length'] = 'int_le' # has value: 0d04 + TYPELOOKUP['ari.message_name'] = 'id' # has value: 07e7 + TYPELOOKUP['ari.transaction'] = 'id' # has value: 0000 + TYPELOOKUP['ari.tlv.id'] = 'id' # has value: 0200 + TYPELOOKUP['ari.tlv.length'] = 'int_le' # has value: 1000 + TYPELOOKUP['ari.tlv.data'] = 'bytes' # has value: 01000000 + + prehooks = dict() + posthooks = dict() \ No newline at end of file diff --git a/src/nemere/validation/protocols/autounlock.py b/src/nemere/validation/protocols/autounlock.py new file mode 100644 index 00000000..2da751cc --- /dev/null +++ b/src/nemere/validation/protocols/autounlock.py @@ -0,0 +1,90 @@ +from ..messageParser import ParsingConstants, MessageTypeIdentifiers + +class MTID_AWDL_PD(MessageTypeIdentifiers): + FOR_PROTCOL = dict() + FOR_PROTCOL['wlan.mgt'] = [ 'wlan.fixed.category_code', 'awdl_pd.tag.id', # use sequence of tag ids as type + 'wlan.fixed.ftm.param.delim1' ] + + NAMED_TYPES = { + # 'wlan.fixed.category_code': {'09': 'Protected Dual of Public Action'}, + # 'awdl_pd.tag.id': { + # '01': 'Additional Request Parameters (req)', + # '02': 'Measurement Information (meas)', + # '03': 'Security Parameters (sec)', + # '04': 'Toast Parameters (seq)', + # '05': 'Multi Frame Buffer (mf_buf)', + # } + 'wlan.fixed.category_code': {'09': 'PDPA'}, + 'awdl_pd.tag.id': { + '01': 'req', + '02': 'meas', + '03': 'sec', + '04': 'seq', + '05': 'mf_buf', + }, + 'wlan.fixed.ftm.param.delim1': { + '00b2': 'si0', + '01b2': 'si1' + } + } + +class AWDL_PD(ParsingConstants): + COMPATIBLE_TO = b'3.2.5' + MESSAGE_TYPE_IDS = MTID_AWDL_PD + + IGNORE_FIELDS = ['wlan.tagged.all_raw', 'wlan.tag_raw', 'awdl_pd_raw', + 'wlan.fixed.ftm.param.delim1_tree', 'wlan.fixed.ftm.param.delim2_tree', + 'wlan.fixed.ftm.param.delim3_tree', 'awdl_pd.samples_raw', 'awdl_pd.mfbuf.fragments_raw' ] + # 'awdl_pd.mfbuf.reassembled.data_raw' + EXCLUDE_SUB_FIELDS = ['awdl_pd.samples_tree', 'awdl_pd.mfbuf.fragments', 'wlan.fixed.ftm.param.delim1_tree', + 'wlan.fixed.ftm.param.delim2_tree', 'wlan.fixed.ftm.param.delim3_tree'] + INCLUDE_SUBFIELDS = ['wlan.tagged.all', 'awdl_pd', # 'wlan.tag' : handled by WLAN module + 'Additional Request Parameters (req)', 'Toast Parameters (seq)', + 'Security Parameters (sec)', 'Measurement Information (meas)', + 'Multi Frame Buffer (mf_buf)', + ] + RECORD_STRUCTURE = [] + + # mapping of field names to general value types. 
+ TYPELOOKUP = dict() + """:type: Dict[str, str]""" + + TYPELOOKUP['wlan.fixed.publicact'] = 'enum' # has value: 21 + TYPELOOKUP['wlan.fixed.followup_dialog_token'] = 'int' # has value: 08 + TYPELOOKUP['wlan.fixed.ftm_tod'] = 'int_le' # has value: 000000000000 + TYPELOOKUP['wlan.fixed.ftm_toa'] = 'int_le' # has value: b00400000000 + TYPELOOKUP['wlan.fixed.ftm_tod_err'] = 'enum' # has value: 0000 + TYPELOOKUP['wlan.fixed.ftm_toa_err'] = 'enum' # has value: 0000 + TYPELOOKUP['wlan.fixed.trigger'] = 'enum' # has value: 01 + TYPELOOKUP['wlan.fixed.ftm.param.delim1'] = 'flags' # has value: 01b2 + TYPELOOKUP['wlan.fixed.ftm.param.delim2'] = 'flags' # has value: 3239011e + TYPELOOKUP['wlan.fixed.ftm.param.delim3'] = 'flags' # has value: 240100 + + TYPELOOKUP['awdl_pd.version'] = 'int' # has value: 01 + TYPELOOKUP['awdl_pd.tag.id'] = 'int' # has value: 05 + TYPELOOKUP['awdl_pd.tag.length'] = 'int' # has value: 26 + TYPELOOKUP['awdl_pd.mfbuf.reserved'] = 'flags' # has value: 0100 + TYPELOOKUP['awdl_pd.mfbuf.chunk_offset'] = 'int_le' # has value: e001 + TYPELOOKUP['awdl_pd.mfbuf.total_len'] = 'int_le' # has value: 0002 + TYPELOOKUP['awdl_pd.mfbuf.chunk_data'] = 'bytes' # has value: 8c0000003efe... + TYPELOOKUP['awdl_pd.mfbuf.sample_component'] = 'int_le' # single I/I signal measurement value + TYPELOOKUP['awdl_pd.meas.reserved1'] = 'int_le' # has value: 0300 + TYPELOOKUP['awdl_pd.meas.phy_error'] = 'enum' # has value: 00000000 + TYPELOOKUP['awdl_pd.meas.reserved2'] = 'pad' # has value: 0000 + TYPELOOKUP['awdl_pd.sec.reserved1'] = 'int_le' # has value: 0100 + TYPELOOKUP['awdl_pd.sec.reserved2'] = 'pad' # has value: 00 + TYPELOOKUP['awdl_pd.sec.phy_ri_rr_len'] = 'int' # has value: 08 + TYPELOOKUP['awdl_pd.sec.phy_ri'] = 'bytes' # has value: 7e62931b35647313 + TYPELOOKUP['awdl_pd.sec.phy_rr'] = 'bytes' # has value: 49128de496b25b55 + TYPELOOKUP['awdl_pd.unknown'] = 'pad' # has value: 0000000000000000000000000000000000 + TYPELOOKUP['awdl_pd.req.length'] = 'int_le' # has value: 1b00 + TYPELOOKUP['awdl_pd.req.unk1'] = 'int' # has value: 00 + TYPELOOKUP['awdl_pd.req.unk2'] = 'int' # has value: 06 + TYPELOOKUP['awdl_pd.req.unk3'] = 'int_le' # has value: 06100000 + TYPELOOKUP['awdl_pd.seq.length'] = 'int_le' # has value: 0100 + TYPELOOKUP['awdl_pd.seq.data1'] = 'bytes' # has value: 01150a + TYPELOOKUP['awdl_pd.seq.data2'] = 'pad' # has value: 00 + + prehooks = dict() + posthooks = dict() + diff --git a/src/nemere/validation/protocols/awdl.py b/src/nemere/validation/protocols/awdl.py new file mode 100644 index 00000000..f001ab9e --- /dev/null +++ b/src/nemere/validation/protocols/awdl.py @@ -0,0 +1,167 @@ +from typing import List, Tuple, Union + +from ..messageParser import ParsingConstants, MessageTypeIdentifiers + + +class MessageTypeIdentifiers_AWDL(MessageTypeIdentifiers): + FOR_PROTCOL = dict() + # AWDL + FOR_PROTCOL['wlan.mgt'] = [ 'wlan.fixed.category_code', 'awdl.type', 'awdl.subtype', + 'awdl.datastate.extflags'] # see nemesys-reports/protocol-awdl/discriminator-candidates.ods + + + NAMED_TYPES = { + 'wlan.fixed.category_code': {'7f': 'Vendor Specific'}, + 'awdl.type': {'08': 'AWDL'}, + 'awdl.subtype': { + '00': 'Periodic Synchronization Frame (PSF) (0)', + '03': 'Master Indication Frame (MIF) (3)' } + } + +class AWDL(ParsingConstants): + COMPATIBLE_TO = b'3.2.5' + MESSAGE_TYPE_IDS = MessageTypeIdentifiers_AWDL + + IGNORE_FIELDS = [ 'awdl.fixed.all_raw', 'awdl.tagged.all_raw', 'awdl.tag_raw' ] + EXCLUDE_SUB_FIELDS = [ + 'awdl.version_tree', 'awdl.dns.name', 'awdl.dns.target', 'awdl.dns.ptr', 
'awdl.arpa', + 'awdl.datastate.flags_tree', 'awdl.datastate.social_channel_map_tree', 'awdl.datastate.extflags_tree', + 'awdl.serviceparams.valuess', 'awdl.ht.capabilities_tree', 'awdl.ht.ampduparam_tree', 'awdl.ht.mcsset', + + # TODO actually, these should be walked + 'awdl.channelseq.channel_list' + ] + INCLUDE_SUBFIELDS = [ 'awdl.fixed.all', 'awdl.tagged.all', 'awdl.tag' ] + # names of field nodes in the json that have a record structure (list[list[tuples], not list[tuples[str, tuple]]). + RECORD_STRUCTURE = [ ] + + # mapping of field names to general value types. + TYPELOOKUP = dict() + """:type: Dict[str, str]""" + + # awdl + TYPELOOKUP['awdl.type'] = 'enum' # has value: 08 + TYPELOOKUP['awdl.version'] = 'int' # has value: 10 + TYPELOOKUP['awdl.subtype'] = 'enum' # has value: 00 + TYPELOOKUP['awdl.reserved'] = 'unknown' # has value: 00 + TYPELOOKUP['awdl.phytime'] = 'timestamp_le' # has value: 7392fa8b + TYPELOOKUP['awdl.targettime'] = 'timestamp_le' # has value: f891fa8b + TYPELOOKUP['awdl.unknown'] = 'unknown' # has value: 00 + TYPELOOKUP['awdl.tag.number'] = 'int' # has value: 02 + TYPELOOKUP['awdl.tag.length'] = 'int_le' # has value: 2000 + TYPELOOKUP['awdl.tag.padding'] = 'pad' # has value: 0000 + + TYPELOOKUP['awdl.syncparams.txchannel'] = 'int' # has value: 95 + TYPELOOKUP['awdl.syncparams.txcounter'] = 'int_le' # has value: 3000 + TYPELOOKUP['awdl.syncparams.masterchan'] = 'int' # has value: 95 + TYPELOOKUP['awdl.syncparams.guardtime'] = 'int' # has value: 00 + TYPELOOKUP['awdl.syncparams.awperiod'] = 'int_le' # has value: 1000 + TYPELOOKUP['awdl.syncparams.afperiod'] = 'int_le' # has value: 6e00 + TYPELOOKUP['awdl.syncparams.awdlflags'] = 'flags' # has value: 0018 + TYPELOOKUP['awdl.syncparams.aw.ext_len'] = 'int_le' # has value: 1000 + TYPELOOKUP['awdl.syncparams.aw.common_len'] = 'int_le' # has value: 1000 + TYPELOOKUP['awdl.syncparams.aw.remaining'] = 'int_le' # has value: 0000 + TYPELOOKUP['awdl.syncparams.ext.min'] = 'int' # has value: 03 + TYPELOOKUP['awdl.syncparams.ext.max_multicast'] = 'int' # has value: 03 + TYPELOOKUP['awdl.syncparams.ext.max_unicast'] = 'int' # has value: 03 + TYPELOOKUP['awdl.syncparams.ext.max_af'] = 'int' # has value: 03 + TYPELOOKUP['awdl.syncparams.master'] = 'macaddr' # has value: eea1c937585c + TYPELOOKUP['awdl.syncparams.presencemode'] = 'enum' # has value: 04 + TYPELOOKUP['awdl.syncparams.awseqcounter'] = 'int_le' # has value: a153 + TYPELOOKUP['awdl.syncparams.apbeaconalignment'] = 'int_le' # has value: 0000 + + TYPELOOKUP['awdl.electionparams.flags'] = 'flags' # has value: 00 + TYPELOOKUP['awdl.electionparams.id'] = 'int' # has value: 0000 + TYPELOOKUP['awdl.electionparams.distance'] = 'int' # has value: 00 + TYPELOOKUP['awdl.electionparams.unknown'] = 'unknown' # has value: 00 + TYPELOOKUP['awdl.electionparams.master'] = 'macaddr' # has value: 126adc00a260 + TYPELOOKUP['awdl.electionparams.mastermetric'] = 'int_le' # has value: 09020000 + TYPELOOKUP['awdl.electionparams.selfmetric'] = 'int_le' # has value: 09020000 + + TYPELOOKUP['awdl.electionparams2.master'] = 'macaddr' # has value: 126adc00a260 + TYPELOOKUP['awdl.electionparams2.other'] = 'macaddr' # has value: 126adc00a260 + TYPELOOKUP['awdl.electionparams2.mastercounter'] = 'int_le' # has value: f9030000 + TYPELOOKUP['awdl.electionparams2.disstance'] = 'int_le' # has value: 00000000 + TYPELOOKUP['awdl.electionparams2.mastermetric'] = 'int_le' # has value: 09020000 + TYPELOOKUP['awdl.electionparams2.selfmetric'] = 'int_le' # has value: 09020000 + 
TYPELOOKUP['awdl.electionparams2.unknown'] = 'unknown' # has value: 00000000 + TYPELOOKUP['awdl.electionparams2.reserved'] = 'unknown' # has value: 00000000 + TYPELOOKUP['awdl.electionparams2.selfcounter'] = 'int_le' # has value: f9030000 + + TYPELOOKUP['awdl.channelseq.channels'] = 'int' # has value: 0f + TYPELOOKUP['awdl.channelseq.encoding'] = 'enum' # has value: 01 + TYPELOOKUP['awdl.channelseq.duplicate'] = 'flags' # has value: 00 + TYPELOOKUP['awdl.channelseq.step_count'] = 'int' # has value: 03 + TYPELOOKUP['awdl.channelseq.fill_channel'] = 'enum' # has value: ffff + TYPELOOKUP[ + 'awdl.channelseq.channel_list'] = 'int_le' # has value: 1d9700000000000000000000000000002b061d971d9700000000000000000000 + # TODO actually a list of int_le + + TYPELOOKUP['awdl.datastate.flags'] = 'flags' # has value: 239f + TYPELOOKUP['awdl.datastate.countrycode'] = 'chars' # has value: 555300 + TYPELOOKUP['awdl.datastate.social_channel_map'] = 'flags' # has value: 0700 + TYPELOOKUP['awdl.datastate.social_channel'] = 'int' # has value: 0000 + TYPELOOKUP['awdl.datastate.infra_bssid'] = 'macaddr' # has value: 703a0e888052 + TYPELOOKUP['awdl.datastate.infra_channel'] = 'int_le' # has value: 6800 + TYPELOOKUP['awdl.datastate.infra_addr'] = 'macaddr' # has value: 126adc00a260 + TYPELOOKUP['awdl.datastate.own_awdladdr'] = 'macaddr' # has value: 42915dbee89b + TYPELOOKUP['awdl.datastate.unicast_options_length'] = 'int_le' # has value: 0400 + TYPELOOKUP['awdl.datastate.unicast_options'] = 'flags' # has value: 00000000 + TYPELOOKUP['awdl.datastate.extflags'] = 'flags' # has value: 2d00 + TYPELOOKUP['awdl.datastate.logtrigger'] = 'int' # has value: 0000f903 + TYPELOOKUP['awdl.datastate.undecoded'] = 'unknown' # has value: 000014400300c0320000e0040000 + + TYPELOOKUP['awdl.serviceparams.sui'] = 'int_le' # has value: c800 + TYPELOOKUP['awdl.serviceparams.valuess'] = 'flags' # has value: 101088804001408008 + + TYPELOOKUP['awdl.ht.unknown'] = 'unknown' # has value: 0000 + TYPELOOKUP['awdl.ht.capabilities'] = 'flags' # has value: 6f01 + TYPELOOKUP['awdl.ht.ampduparam'] = 'flags' # has value: 17 + TYPELOOKUP['awdl.ht.mcsset'] = 'flags' # has value: ffff + + TYPELOOKUP['awdl.synctree.addr'] = 'macaddr' # has value: 126adc00a260 + TYPELOOKUP['awdl.arpa.flags'] = 'flags' # has value: 03 + TYPELOOKUP['awdl.arpa'] = 'chars' # has value: 0c4e6f6168732d4970686f6e65c00c + TYPELOOKUP['awdl.version.device_class'] = 'int' # has value: 02 + + TYPELOOKUP['awdl.dns.name.len'] = 'int_le' # has value: 1000 + TYPELOOKUP['awdl.dns.name'] = 'chars' # has value: 0c393666303861646338313632c007 + TYPELOOKUP['awdl.dns.type'] = 'enum' # has value: 10 + TYPELOOKUP['awdl.dns.data_len'] = 'int_le' # has value: 0a00 + TYPELOOKUP['awdl.dns.unknown'] = 'unknown' # has value: 0000 + TYPELOOKUP['awdl.dns.txt'] = 'chars' # has value: 09666c6167733d353033 + TYPELOOKUP['awdl.dns.ptr'] = 'chars' # has value: 0d313939707036696472747a3473c000 + TYPELOOKUP['awdl.dns.priority'] = 'int_le' # has value: 0000 + TYPELOOKUP['awdl.dns.weight'] = 'int_le' # has value: 0000 + TYPELOOKUP['awdl.dns.port'] = 'int' # has value: 2242 (not little endian here!) + TYPELOOKUP['awdl.dns.target'] = 'chars' # has value: 0c4e6f6168732d4970686f6e65c00c + + + # noinspection PyUnusedLocal + @staticmethod + def _hookAWDLtag(value: list, siblings: List[Tuple[str, str]]) -> Union[List[Tuple[str, str]], None]: + """ + Hook to parse the Service Response (2) of an awdl.tag. 
+ + :param value: hex value of the field we are working on + :param siblings: subfields that we know of by now + :return: tuple of field name and value to add as new field + """ + from ..messageParser import ParsedMessage + + # retrieve the tag type ("number"), we are interested only in "Service Response (2)" + tagnumbers = [tag[1] for tag in value if tag[0] == 'awdl.tag.number'] + # print(tagnumbers[0]) + if len(tagnumbers) != 1 or tagnumbers[0] != '2': + return None + if not value[-1][1][0][0].startswith('awdl.dns'): + # unexpected format + raise RuntimeWarning("Unexpected format of 'Service Response' AWDL tag. Ignoring.") + fields = ParsedMessage.walkSubTree(value[-1][1]) + return fields + + + prehooks = dict() + posthooks = dict() + # noinspection PyUnresolvedReferences + posthooks['awdl.tag'] = _hookAWDLtag.__func__ diff --git a/src/nemere/validation/tsharkConnector.py b/src/nemere/validation/tsharkConnector.py index c78fbe6a..97a2e866 100644 --- a/src/nemere/validation/tsharkConnector.py +++ b/src/nemere/validation/tsharkConnector.py @@ -2,9 +2,80 @@ from queue import Queue from tempfile import NamedTemporaryFile from typing import Dict, Union +import os -class TsharkConnector(object): + +def _binary(): + pathlist = ["/usr/bin/tshark", "/usr/local/bin/tshark"] # v3.6 - v3.2 + for tp in pathlist: + if os.path.isfile(tp) and os.access(tp, os.X_OK): + logging.getLogger(__name__).debug(f"Selecting tshark from {tp}") + return tp + return None + + +class TsharkBase(object): + # __tsharkline = [_binary()] + + def __init__(self): + self._linktype = None + self._tshark = None # type: Union[subprocess.Popen, None] + self._tsharkqueue = Queue() + self._tempfile = None # type: Union[io.BufferedRandom, None] + self._tempreader = None # type: Union[io.BufferedReader, None] + self._version = None + + @property + def linktype(self): + return self._linktype + + @property + def version(self): + return self._version + + @staticmethod + def checkTsharkCompatibility(): + versionstring = subprocess.check_output(("tshark", "-v")) + versionlist = versionstring.split(maxsplit=4) + if versionlist[2] < b'2.1.1': + raise Exception('ERROR: The installed tshark does not support JSON output, which is required for ' + 'dissection parsing. Found tshark version {}. ' + 'Upgrade!\”'.format(versionlist[2].decode())) + if versionlist[2] not in (b'2.2.6', b'2.6.3', b'2.6.5', b'2.6.8', b'3.2.3', b'3.2.5'): + print("WARNING: Unchecked version {} of tshark in use! Dissections may be misfunctioning or faulty. " + "Check compatibility of JSON output!\n".format(versionlist[2].decode())) + return versionlist[2], False + return versionlist[2], True + + def __getstate__(self): + """ + Handling of runtime specific object attributes for pickling. This basically omits all instances of + io.BufferedReader, io.BufferedRandom, and subprocess.Popen + that need to be freshly instanciated after pickle.load() anyway. + + :return: The dict of this object for use in pickle.dump() + """ + return { + '_TsharkConnector_linktype': self._linktype, + '_TsharkConnector_version': self._version, + } + + def __setstate__(self, state: Dict): + """ + Handling of runtime specific object attributes for pickling. 
+ + :param state: The dict of this object got from pickle.load() + :return: + """ + self._linktype = state['_TsharkConnector_linktype'] + self._version = state['_TsharkConnector_version'] + self._tsharkqueue = Queue() + self._tempfile = None + self._tempreader = None + self._tshark = None + +class TsharkConnector(TsharkBase): """ Class to manage a tshark process and encapsulate the communication with the process' input and output. @@ -31,28 +102,12 @@ class TsharkConnector(object): # -o tcp.analyze_sequence_numbers:FALSE : # prevent error messages associated with the circumstance that it is no true trace tshark gets to dissect # here. Spares the necessity of restarting the tshark process after every packet. - __tsharkline = ["/usr/bin/tshark", "-Q", "-a", "duration:600", "-l", "-n", "-i", "-", "-T", "json", "-x", + __tsharkline = [_binary(), "-Q", "-a", "duration:600", "-l", "-n", "-i", "-", "-T", "json", "-x", "-o", "tcp.analyze_sequence_numbers:FALSE"] def __init__(self, linktype : int): - self.__linktype = linktype - self.__tshark = None # type: Union[subprocess.Popen, None] - self.__tsharkqueue = Queue() - self.__tempfile = None # type: Union[io.BufferedRandom, None] - self.__tempreader = None # type: Union[io.BufferedReader, None] - self.__version = None - logging.getLogger(__name__).setLevel(logging.DEBUG) - - - @property - def linktype(self): - return self.__linktype - - - @property - def version(self): - return self.__version - + super().__init__() + self._linktype = linktype def writePacket(self, paketdata: bytes): """ @@ -70,7 +125,6 @@ def writePacket(self, paketdata: bytes): cmd.stdin.write(paketdata) cmd.stdin.flush() - @staticmethod def __readlines(pipe: io.BufferedReader, queue: Queue): """ @@ -115,7 +169,6 @@ def __readlines(pipe: io.BufferedReader, queue: Queue): time.sleep(.01) break - def readPacket(self): """ Read a dissected packet definition from the queue. @@ -124,14 +177,15 @@ def readPacket(self): :raises ValueError: A ValueError if the JSON was incomplete. :return: A JSON string, trimmed and superficially validated. """ - assert self.__tempreader is not None and not self.__tempreader.closed, "Call writePacket() first" + logger = logging.getLogger(__name__) + assert self._tempreader is not None and not self._tempreader.closed, "Call writePacket() first" import threading - readThread = threading.Thread(target=TsharkConnector.__readlines, args=(self.__tempreader, self.__tsharkqueue)) + readThread = threading.Thread(target=TsharkConnector.__readlines, args=(self._tempreader, self._tsharkqueue)) readThread.start() logging.getLogger(__name__).info("Wait for queue to fill from the tshark-pipe...") for timeout in range(20): - if self.__tsharkqueue.empty(): + if self._tsharkqueue.empty(): time.sleep(.05) logging.getLogger(__name__).debug(f"Wait a little for queue to fill... {timeout:02d}") else: @@ -139,14 +193,19 @@ def readPacket(self): print("Wait for tshark output (max 20s)...") readThread.join(20.0) - if readThread.is_alive() or self.__tsharkqueue.empty(): + if readThread.is_alive() or self._tsharkqueue.empty(): raise TimeoutError("tshark timed out with no result.") logging.getLogger(__name__).info("Queue filled. 
Capture tshark JSON output.") tjson = "" - while not self.__tsharkqueue.empty(): - tjson += self.__tsharkqueue.get_nowait().decode("utf-8") + while not self._tsharkqueue.empty(): + queueEntry = self._tsharkqueue.get_nowait() + try: + tjson += queueEntry.decode("utf-8") + except UnicodeDecodeError as e: + logger.info(f"Ignored and replaced offending character due to {e}") + tjson += queueEntry.decode("utf-8", "replace") if tjson == ']\n': return None @@ -165,7 +224,6 @@ def readPacket(self): return tjsonS - def _retrieveProcess(self) -> subprocess.Popen: """ Retrieve the running tshark process or start a new one if none is open. @@ -173,33 +231,32 @@ def _retrieveProcess(self) -> subprocess.Popen: :return: A running tshark process, to await packets written to it via :func:`_tsharkWritePacket()`. """ # if there is a tshark process running... - if self.__tshark is not None and self.__tshark.poll() is None \ - and (self.__tempfile is None or self.__tempreader is None - or self.__tempfile.closed or self.__tempreader.closed): - # ... there must also be a open self.__tempfile and self.__tempreader + if self._tshark is not None and self._tshark.poll() is None \ + and (self._tempfile is None or self._tempreader is None + or self._tempfile.closed or self._tempreader.closed): + # ... there must also be a open self._tempfile and self._tempreader self.terminate(2) - # print("Terminated tshark", self.__tshark.poll()) + # print("Terminated tshark", self._tshark.poll()) - if self.__tshark is None or self.__tshark.poll() is not None: - self.__version = TsharkConnector.checkTsharkCompatibility()[0] + if self._tshark is None or self._tshark.poll() is not None: + self._version = TsharkConnector.checkTsharkCompatibility()[0] - header = struct.pack("IHHIIII", 0xa1b2c3d4, 2, 4, 0, 0, 0x7fff, self.__linktype) + header = struct.pack("IHHIIII", 0xa1b2c3d4, 2, 4, 0, 0, 0x7fff, self._linktype) # create tempfile # print("create tempfile") - self.__tempfile = NamedTemporaryFile() - self.__tempreader = open(self.__tempfile.name, "rb") - self.__tshark = subprocess.Popen(TsharkConnector.__tsharkline, - stdout=self.__tempfile, stdin=subprocess.PIPE) - self.__tshark.stdin.write(header) + self._tempfile = NamedTemporaryFile() + self._tempreader = open(self._tempfile.name, "rb") + self._tshark = subprocess.Popen(TsharkConnector.__tsharkline, + stdout=self._tempfile, stdin=subprocess.PIPE) + self._tshark.stdin.write(header) time.sleep(.3) - assert self.__tshark is not None and self.__tshark.poll() is None \ - and self.__tempfile is not None and self.__tempreader is not None \ - and not self.__tempfile.closed and not self.__tempreader.closed - - return self.__tshark + assert self._tshark is not None and self._tshark.poll() is None \ + and self._tempfile is not None and self._tempreader is not None \ + and not self._tempfile.closed and not self._tempreader.closed + return self._tshark def terminate(self, wait=2): """ @@ -208,72 +265,39 @@ def terminate(self, wait=2): :param wait: Wait for the process with timeout (see Popen.wait) """ - if self.__tshark is not None and self.__tshark.poll() is None: # poll returns None if tshark running - self.__tshark.terminate() + if self._tshark is not None and self._tshark.poll() is None: # poll returns None if tshark running + self._tshark.terminate() if wait: - self.__tshark.wait(wait) - if self.__tshark.poll() is None: # still running - print("kill", self.__tshark.pid) - self.__tshark.kill() - if self.__tshark.poll() is None: # still running + self._tshark.wait(wait) + if 
self._tshark.poll() is None: # still running + print("kill", self._tshark.pid) + self._tshark.kill() + if self._tshark.poll() is None: # still running raise ChildProcessError("tshark process could not be terminated.") - if self.__tempreader: - self.__tempreader.close() - if self.__tempfile: - self.__tempfile.close() - - assert self.__tshark is None or self.__tshark.poll() is not None + if self._tempreader: + self._tempreader.close() + if self._tempfile: + self._tempfile.close() + assert self._tshark is None or self._tshark.poll() is not None def isRunning(self): """ :return: whether a tshark process is running. """ - return self.__tshark.poll() is None if self.__tshark else False - - - @staticmethod - def checkTsharkCompatibility(): - versionstring = subprocess.check_output(("tshark", "-v")) - versionlist = versionstring.split(maxsplit=4) - if versionlist[2] < b'2.1.1': - raise Exception('ERROR: The installed tshark does not support JSON output, which is required for ' - 'dissection parsing. Found tshark version {}. ' - 'Upgrade!\”'.format(versionlist[2].decode())) - if versionlist[2] not in (b'2.2.6', b'2.6.3', b'2.6.5', b'2.6.8', b'3.2.3', b'3.2.5'): - print("WARNING: Unchecked version {} of tshark in use! Dissections may be misfunctioning or faulty. " - "Check compatibility of JSON output!\n".format(versionlist[2].decode())) - return versionlist[2], False - return versionlist[2], True - - - def __getstate__(self): - """ - Handling of runtime specific object attributes for pickling. This basically omits all instances of - io.BufferedReader, io.BufferedRandom, and subprocess.Popen - that need to be freshly instanciated after pickle.load() anyway. + return self._tshark.poll() is None if self._tshark else False - :return: The dict of this object for use in pickle.dump() - """ - return { - '_TsharkConnector__linktype': self.__linktype, - '_TsharkConnector__version': self.__version, - } +class TsharkOneshot(TsharkBase): + __tsharkline = [_binary(), "-Q", "-l", "-n", "-T", "json", "-x", + "-o", "tcp.analyze_sequence_numbers:FALSE", "-r"] - def __setstate__(self, state: Dict): - """ - Handling of runtime specific object attributes for pickling. 
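The terminate() method above escalates from a polite terminate() to kill() if the tshark process does not exit in time. The same escalation in isolation, with a generic child process as a stand-in:

# Generic sketch of the terminate-then-kill escalation used by terminate() above.
import subprocess

proc = subprocess.Popen(["sleep", "60"])    # any long-running child; assumes a POSIX `sleep`
if proc.poll() is None:                     # poll() returns None while it is still running
    proc.terminate()                        # ask politely (SIGTERM)
    try:
        proc.wait(timeout=2)
    except subprocess.TimeoutExpired:
        proc.kill()                         # escalate (SIGKILL)
        proc.wait()
assert proc.poll() is not None              # the child has exited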
- - :param state: The dict of this object got from pickle.load() - :return: - """ - self.__linktype = state['_TsharkConnector__linktype'] - self.__version = state['_TsharkConnector__version'] - self.__tsharkqueue = Queue() - self.__tempfile = None - self.__tempreader = None - self.__tshark = None - + def __init__(self): + super().__init__() + self._version = TsharkBase.checkTsharkCompatibility()[0] + def readfile(self, pcapfilename): + tstdout = subprocess.check_output(type(self).__tsharkline + [pcapfilename]) + jsontext = tstdout.decode("utf-8", "replace") + return jsontext diff --git a/src/nemere/visualization/distancesPlotter.py b/src/nemere/visualization/distancesPlotter.py index 4b98db81..e21c0763 100644 --- a/src/nemere/visualization/distancesPlotter.py +++ b/src/nemere/visualization/distancesPlotter.py @@ -2,18 +2,18 @@ import matplotlib.pyplot as plt from matplotlib import cm, colors -from typing import List, Any, Union, Sequence +from typing import List, Any, Union, Sequence, Tuple, Hashable from itertools import compress + from sklearn import manifold from sklearn.decomposition import PCA from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage -from nemere.visualization.plotter import MessagePlotter from nemere.utils.loader import BaseLoader -from nemere.inference.segments import MessageSegment, TypedSegment -from nemere.inference.templates import Template, TypedTemplate, DistanceCalculator - +from nemere.inference.segments import MessageSegment, TypedSegment, AbstractSegment +from nemere.inference.templates import Template, TypedTemplate, DistanceCalculator, FieldTypeTemplate +from nemere.visualization.plotter import MessagePlotter class DistancesPlotter(MessagePlotter): @@ -205,6 +205,8 @@ def plotManifoldDistances(self, assert isinstance(labels, numpy.ndarray) assert len(segments) == distances.shape[0] == distances.shape[1] + from nemere.utils.evaluationHelpers import unknown + axMDS, axSeg = self._axes # type: plt.Axes, plt.Axes axMDS.set_aspect('equal', adjustable='datalim') @@ -216,6 +218,7 @@ def plotManifoldDistances(self, botlef = (0, -5) else: botlef = (0.1, 0.1) + # noinspection PyTypeChecker axSeg.text(*botlef, 'Subsampled: {} of {} segments'.format(len(segments), originalSegmentCount)) # without subsampling, existing values need not to be overwritten @@ -254,10 +257,10 @@ def plotManifoldDistances(self, if any(isinstance(seg, (TypedSegment, TypedTemplate, RawMessage)) for seg in segments): if any(isinstance(seg, (TypedSegment, TypedTemplate)) for seg in segments): ftypes = numpy.array([seg.fieldtype if isinstance(seg, (TypedSegment, TypedTemplate)) - else "[unknown]" for seg in segments]) # PP + else unknown for seg in segments]) # PP elif any(isinstance(seg, RawMessage) and seg.messageType != 'Raw' for seg in segments): ftypes = numpy.array([msg.messageType if isinstance(msg, RawMessage) and msg.messageType != 'Raw' - else "[unknown]" for msg in segments]) # PP + else unknown for msg in segments]) # PP else: ftypes = set() # identify unique types @@ -440,3 +443,50 @@ def plotSegmentDistanceDistribution(dc: DistanceCalculator): plt.show() +class SegmentTopology(object): + """Create a distance Topology plot for the given Segment cluster data.""" + + from nemere.utils.evaluationHelpers import TitleBuilder, StartupFilecheck, unknown + # leads to import clash: + # from nemere.utils.reportWriter import SegmentClusterGroundtruthReport + + # show only largest clusters + clusterCutoff = 15 + + def __init__(self, clusterStats: List[Tuple[Hashable, str, float, 
float, int]], + fTypeTemplates: List[FieldTypeTemplate], noise: List[AbstractSegment], + dc: DistanceCalculator): + # look up inferred data types for the segments in the selected subset of clusters and generate labels for them. + clusterStatsLookup = {stats[0]: (stats[4], stats[2], stats[1]) # label, mostFreqentType, precision, recall, numSegsinCuster + for stats in clusterStats if stats is not None} + sortedClusters = sorted(fTypeTemplates, key=lambda x: -len(x.baseSegments)) + if type(self).clusterCutoff > 0: + selectedClusters = [ftt for ftt in sortedClusters + if clusterStatsLookup[ftt.fieldtype][2] != type(self).unknown][:type(self).clusterCutoff] + else: + selectedClusters = sortedClusters + omittedClusters = [ftt for ftt in sortedClusters if ftt not in selectedClusters] + clustermask = {segid: "{}: {} seg.s ({:.2f} {})".format(ftt.fieldtype, *clusterStatsLookup[ftt.fieldtype]) + for ftt in selectedClusters for segid in dc.segments2index(ftt.baseSegments)} + clustermask.update({segid: "Noise" for segid in dc.segments2index( + noise + [bs for ftt in omittedClusters for bs in ftt.baseSegments] + )}) + self.labels = numpy.array([clustermask[segid] for segid in range(len(dc.segments))]) + self.dc = dc + + def writeFigure(self, specimens: BaseLoader, inferenceParams: TitleBuilder, + elementsReport: "SegmentClusterGroundtruthReport", filechecker: StartupFilecheck): + print("Plot distances...") + if type(self).clusterCutoff > 0: + inferenceParams.postProcess = "largest{}clusters".format(type(self).clusterCutoff) + atitle = 'segment-distances_' + inferenceParams.plotTitle + + sdp = DistancesPlotter(specimens, atitle, False) + # hand over selected subset of clusters to plot + sdp.plotManifoldDistances( + [elementsReport.typedMatchTemplates[seg][1] if elementsReport.typedMatchTemplates[seg][0] > 0.5 + else seg for seg in self.dc.segments], + self.dc.distanceMatrix, self.labels) + # sdp.plotSegmentDistances(dc, labels) + sdp.writeOrShowFigure(filechecker.reportFullPath) + del sdp diff --git a/src/nemere/visualization/multiPlotter.py b/src/nemere/visualization/multiPlotter.py index 084a3b92..f98fa46e 100644 --- a/src/nemere/visualization/multiPlotter.py +++ b/src/nemere/visualization/multiPlotter.py @@ -8,6 +8,7 @@ from nemere.visualization.plotter import MessagePlotter from nemere.inference.segments import MessageSegment, TypedSegment, MessageAnalyzer +from nemere.utils.evaluationHelpers import uulmColors class MultiMessagePlotter(MessagePlotter): @@ -18,7 +19,7 @@ class MultiMessagePlotter(MessagePlotter): def __init__(self, specimens: SpecimenLoader, analysisTitle: str, nrows: int, ncols: int=None, - isInteractive: bool=False): + isInteractive: bool=False, sameYscale=True): """ :param nrows: The number of rows the sheet should have. If ncols is not set, this is interpreted as the expected count of plots and the number of rows and cols are determined automatically. 
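The docstring above states that the number of rows and columns is derived automatically when only the expected plot count is given. The actual _autoconfRowsCols body is not part of this hunk; the sketch below shows one plausible way to derive a near-square grid.

# One plausible derivation of a near-square subplot grid from a plot count
# (the real _autoconfRowsCols implementation is not shown in this hunk).
import math

def autoconf_rows_cols(plot_count: int):
    """Derive a (rows, cols) grid large enough for the given number of subplots."""
    cols = math.ceil(math.sqrt(plot_count))
    rows = math.ceil(plot_count / cols)
    return rows, cols

assert autoconf_rows_cols(4) == (2, 2)
assert autoconf_rows_cols(7) == (3, 3)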
@@ -32,12 +33,16 @@ def __init__(self, specimens: SpecimenLoader, analysisTitle: str, if not isinstance(self._axes, numpy.ndarray): self._axes = numpy.array(self._axes) self._fig.set_size_inches(16, 9) - + self._sameYscale = sameYscale @property def axes(self) -> List[plt.Axes]: return self._axes.flat + @property + def fig(self) -> plt.Figure: + """Convenience property for future change of the class to use something else then pyplot.""" + return self._fig @staticmethod def _autoconfRowsCols(plotCount): @@ -87,15 +92,16 @@ def textInEachAx(self, textList: List[str]): ax.text(left + marginH, top + marginV, text) - def scatterInEachAx(self, valuesList: List[Tuple[List, List]], marker='_'): + def scatterInEachAx(self, valuesList: List[Tuple[List, List]], marker='_', color=uulmColors["uulm"]): """ Scatter plot of the given values. Each pair of value-sequences is plotted into a separate subplot. :param valuesList: List of value-sequence pairs. First sequence are x-values, second y-values. :param marker: The marker to use in the plot. + :param color: color of the scatter points. """ for ax, values in zip(self._axes.flat, valuesList): - ax.scatter(values[0], values[1], marker=marker, s=5) + ax.scatter(values[0], values[1], marker=marker, s=5, c=color) def fieldmarkersInEachAx(self, fieldEnds: List[List[int]]): @@ -104,10 +110,11 @@ def fieldmarkersInEachAx(self, fieldEnds: List[List[int]]): :param fieldEnds: Values to mark in each subplot. """ - for ax, fends in zip(self._axes.flat, fieldEnds): + for ax, fends in zip(self._axes.flat, fieldEnds): # type: plt.Axes, List[int] for fe in fends: ax.axvline(x=fe, **MessagePlotter.STYLE_FIELDENDLINE) ax.set_xticks(sorted(fends)) + ax.tick_params(axis='x', labelrotation=90) def nameEachAx(self, labels: List[str]): @@ -202,9 +209,15 @@ def plotSubfigs(self, analysisResults: List[List[float]], subfigName: List[str]= self._fig.legend() # noinspection PyDefaultArgument - def printMessageBytes(self, messages: List[AbstractMessage], fontdict={'size': 2}): + def printMessageBytes(self, messages: List[AbstractMessage], fontdict={'size': 2, 'family': 'monospace'}): + minY, maxY = None, None + if self._sameYscale: + minY, maxY = self._commonY for ax, message in zip(self._axes.flat, messages): # type: plt.Axes, AbstractMessage - ymin, ymax = ax.get_ylim() + if self._sameYscale: + ymin, ymax = minY, maxY + else: + ymin, ymax = ax.get_ylim() ypos = ymin + (ymax-ymin) * .05 for idx, byt in enumerate(message.data): # type: bytes ax.text(float(idx)+.2, ypos, "{:02x}".format(byt), fontdict=fontdict) @@ -230,7 +243,8 @@ def plotCompareFill(self, analysisResults: List[List[float]], compareValues: Lis :param compareValues: The second list of another analysis result """ for ax, analysisResult, compareValue in zip(self._axes.flat, analysisResults, compareValues): - MessagePlotter.fillDiffToCompare(ax, analysisResult, compareValue) + if analysisResult is not None and compareValue is not None: + MessagePlotter.fillDiffToCompare(ax, analysisResult, compareValue) from nemere.inference.segments import CorrelatedSegment @@ -317,7 +331,7 @@ def plotToSubfig(self, subfigid: Union[int, plt.Axes], values: Union[List, numpy >>> import nemere.visualization.multiPlotter >>> import nemere.utils.loader >>> import numpy - >>> loader = nemere.utils.loader.SpecimenLoader("../input/maxdiff-fromOrig/dns_ictf2010_maxdiff-100.pcap") + >>> loader = nemere.utils.loader.SpecimenLoader("../input/maxdiff-fromOrig/dns_ictf2010-new_maxdiff-100.pcap") >>> mmp = 
nemere.visualization.multiPlotter.MultiMessagePlotter(loader, "test", 4) >>> mmp.plotToSubfig(2, numpy.random.poisson(5,100)) @@ -335,6 +349,13 @@ def histoToSubfig(self, subfigid: int, data, **kwargs): return ret + @property + def _commonY(self): + ylims = [sf.get_ylim() for sf in self.axes] + minY, maxY = zip(*ylims) + return min(minY), max(maxY) + + def writeOrShowFigure(self, plotfolder: str = None): for sf in self.axes: # deduplicate labels @@ -345,6 +366,10 @@ def writeOrShowFigure(self, plotfolder: str = None): newLabels.append(label) newHandles.append(handle) sf.legend(newHandles, newLabels) + if self._sameYscale: + minY, maxY = self._commonY + for sf in self.axes: + sf.set_ylim(minY, maxY) super().writeOrShowFigure(plotfolder) @@ -420,3 +445,5 @@ def appendPlot(self, cid: int, title: str, def appendSegment(self, cid: int, pid: int, title: str, segment: TypedSegment): self.plotGroups[cid][1][pid][1].append((title, segment)) return len(self.plotGroups[cid][1][pid][1]) - 1 + + diff --git a/src/nemere/visualization/plotter.py b/src/nemere/visualization/plotter.py index 2d34cbbb..0e7c422f 100644 --- a/src/nemere/visualization/plotter.py +++ b/src/nemere/visualization/plotter.py @@ -3,7 +3,7 @@ import matplotlib.pyplot as plt -from nemere.utils.evaluationHelpers import reportFolder +from nemere.utils.evaluationHelpers import reportFolder, uulmColors class MessagePlotter(object): @@ -12,12 +12,12 @@ class MessagePlotter(object): """ from nemere.utils.loader import SpecimenLoader - STYLE_MAINLINE = { 'linewidth': .6, 'alpha': .6, 'c': 'red' } - STYLE_BLUMAINLINE = { 'linewidth': .6, 'alpha': .6, 'c': 'blue'} - STYLE_ALTMAINLINE = { 'linewidth': .6, 'alpha': 1, 'c': 'red' } + STYLE_MAINLINE = { 'linewidth': .6, 'alpha': .6, 'c': uulmColors['uulm-in'] } + STYLE_BLUMAINLINE = { 'linewidth': .6, 'alpha': .6, 'c': uulmColors['uulm-med']} + STYLE_ALTMAINLINE = { 'linewidth': .6, 'alpha': 1, 'c': uulmColors['uulm-in'] } STYLE_COMPARELINE = { 'linewidth': .2, 'alpha': .6, 'c': 'black'} - STYLE_FIELDENDLINE = { 'linewidth': .5, 'linestyle': '--', 'alpha': .6 } - STYLE_CORRELATION = dict(linewidth=.4, alpha=.6, c='green') + STYLE_FIELDENDLINE = { 'linewidth': .5, 'linestyle': '--', 'alpha': .6, 'c': uulmColors['uulm'] } + STYLE_CORRELATION = dict(linewidth=.4, alpha=.6, c=uulmColors['uulm-mawi']) def __init__(self, specimens: SpecimenLoader, analysisTitle: str, isInteractive: bool=False): """ @@ -37,6 +37,7 @@ def __init__(self, specimens: SpecimenLoader, analysisTitle: str, isInteractive: self._interactive = isInteractive self._autoLegend = True + @property def title(self) -> str: return self._title @@ -60,7 +61,7 @@ def writeOrShowFigure(self, plotfolder: str=None): if self._autoLegend: plt.legend() plt.suptitle('{} | {}'.format(pcapName, self._title)) - plt.tight_layout(rect=[0,0,1,.95]) + plt.tight_layout(rect=[0,0,1,1]) if not self._interactive and not exists(plotfile): plt.savefig(plotfile) @@ -83,7 +84,6 @@ def color_y_axis(ax, color): t.set_color(color) return None - @staticmethod def fillDiffToCompare(ax: plt.Axes, analysisResult: List[float], compareValue: List[float]): """ @@ -97,4 +97,13 @@ def fillDiffToCompare(ax: plt.Axes, analysisResult: List[float], compareValue: L """ ax.fill_between(range(len(analysisResult)), analysisResult, compareValue, color='b', alpha=.4) + @property + def ax(self) -> plt.Axes: + """Convenience property for future change of the class to use something else then pyplot.""" + return plt.gca() + + @property + def fig(self) -> plt.Figure: + """Convenience 
property for future change of the class to use something else then pyplot.""" + return plt.gcf() diff --git a/src/nemere/visualization/simplePrint.py b/src/nemere/visualization/simplePrint.py index 0911e711..1b6600a4 100644 --- a/src/nemere/visualization/simplePrint.py +++ b/src/nemere/visualization/simplePrint.py @@ -1,14 +1,16 @@ +from collections import defaultdict from itertools import chain from time import strftime from typing import Tuple, Iterable, Sequence, Dict, List, Union from tabulate import tabulate +from colorhash import ColorHash from netzob.Common.Utils.MatrixList import MatrixList from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage from nemere.inference.segments import MessageSegment -from nemere.inference.templates import DistanceCalculator, Template +from nemere.inference.templates import DistanceCalculator, Template, FieldTypeTemplate from nemere.validation.dissectorMatcher import MessageComparator from nemere.visualization import bcolors as bcolors @@ -415,3 +417,222 @@ def _trueFieldEnds(self, message: AbstractMessage): """ +class FieldtypeHelper(object): + """Common functions to handle field types.""" + def __init__(self, ftclusters: List[FieldTypeTemplate]): + self.ftclusters = ftclusters # type: List[FieldTypeTemplate] + self.segmentedMessages = defaultdict(list) # type: Dict[AbstractMessage, List[MessageSegment]] + self._mapMessages2Segments(ftclusters) + # this is FieldtypeComparingPrinter specific: + self._segments2labels = type(self).segments2Labels(ftclusters) # type: Dict[MessageSegment, str] + + @classmethod + def segments2Labels(cls, templates: List[FieldTypeTemplate]): + """Used by classes like FieldtypeComparingPrinter""" + return {bs: templ.fieldtype for templ in templates for bs in cls._recurseSegments2Labels(templ)} + + @classmethod + def _recurseSegments2Labels(cls, template: Template): + """Used by classes like FieldtypeComparingPrinter""" + for bs in template.baseSegments: + if isinstance(bs, MessageSegment): + yield bs + else: + yield from cls._recurseSegments2Labels(bs) + + def _mapMessages2Segments(self, ftclusters: List[Template]): + """Recover list of segments per message""" + for ftc in ftclusters: + self._recurseTemplates(ftc) + sortedSegments = dict() + for msg, segs in self.segmentedMessages.items(): + sortedSegments[msg] = sorted(set(segs), key=lambda s: s.offset) + self.segmentedMessages = sortedSegments + + def _recurseTemplates(self, template: Template): + for bs in template.baseSegments: + if isinstance(bs, MessageSegment): + self.segmentedMessages[bs.message].append(bs) + else: + # recursively add base segments of Templates among segments in the cluster + self._recurseTemplates(bs) + + def offset2colorlabel(self, message): + """ + style label for the inferred segment cluster as color + """ + segs = self.segmentedMessages[message] if message in self.segmentedMessages else [] + offlab = list() + for seg in segs: + offdiff = seg.offset - len(offlab) + if offdiff > 0: + offlab += [None] * offdiff + offlab += [self._segments2labels[seg]] * len(seg) + offdiff = len(message.data) - len(offlab) + if offdiff > 0: + offlab += [None] * offdiff + return offlab + + def colorHashStyles(self, selectMessages: Iterable[AbstractMessage]): + """ + Returns two lists and two dicts: + :return: List of tikz style definitions; map of label to style name; map of color name to style name; list of color definitions + """ + # define labels, colors, styles + ftstyles = {lab: "fts" + lab.replace("_", "").replace(" ", "") + for lab in 
self._colorlabels(selectMessages)} # field-type label to style name + ftcolornames = {tag: "col" + tag[3:] for lab, tag in ftstyles.items() } # style name to color name + ftcolors = list() # color definition + for tag in ftcolornames.values(): + red, green, blue = [tint/256 for tint in ColorHash(tag[3:], lightness=(0.5, 0.6, 0.7, 0.8)).rgb] + ftcolors.append( f"\definecolor{{{tag}}}{{rgb}}{{{red},{green},{blue}}}" ) + styles = [f"{sty}/.style={{fill={ftcolornames[sty]}}}" for sty in ftstyles.values() ] + return styles, ftstyles, ftcolornames, ftcolors + + def _colorlabels(self, selectMessages: Iterable[AbstractMessage]): + """ + Return a set of all possible color labels to be returned by _offset2colorlabel. + TODO Currently ignores selectMessages. Should return only those fieldtypes that are present in + selectMessages' segments. + """ + # TODO use all messages if selectMessages is None + return {ftc.fieldtype for ftc in self.ftclusters} + + + +class FieldtypeComparingPrinter(ComparingPrinter): + + def __init__(self, comparator: MessageComparator, ftclusters: List[FieldTypeTemplate]): + # We populate the inferred segments from ftclusters ourselves right afterwards by _mapMessages2Segments(). + super().__init__(comparator, ()) + self._ftHelper = FieldtypeHelper(ftclusters) + self._segmentedMessages = self._ftHelper.segmentedMessages + self._ftclusters = ftclusters # type: List[FieldTypeTemplate] + # transiently used during fieldtypes() + self._offset2color = dict() + self._offset2text = dict() + self.__ftstyles = dict() + + def _offset2colorlabel(self, message): + return self._ftHelper.offset2colorlabel(message) + + def _offset2textlabel(self, message): + """ + true data types as labels + """ + pm = self._comparator.parsedMessages[self._comparator.messages[message]] + trueDatatypeMap = dict() + offset = 0 + for name, lgt in pm.getTypeSequence(): + trueDatatypeMap[offset] = name.replace("_", "\\_") + offset += lgt + return trueDatatypeMap + + def _msgoffs2label(self, msg, po): + # place offset labels in caches + if msg not in self._offset2color: + self._offset2color[msg] = self._offset2colorlabel(msg) + if msg not in self._offset2text: + self._offset2text[msg] = self._offset2textlabel(msg) + + labels = list() + # style for the color label + labels.append(self.__ftstyles[self._offset2color[msg][po - 1]] + if self._offset2color[msg][po - 1] is not None else "nonelabel"), + # text label below the offset + if po - 1 in self._offset2text[msg]: + labels.append(f"label={{[tfelabel]below:\\sffamily\\tiny {self._offset2text[msg][po - 1]}}}") + + return ", ".join(labels) + + def toConsole(self, selectMessages: Iterable[AbstractMessage] = None, + mark: Union[Tuple[int, int], MessageSegment] = None, + messageSlice: Tuple[Union[int, None], Union[int, None]] = None): + # TODO make this a subclass of SegmentPrinter (toConsole needs to be adjusted to use colors for type marking + # and a way to discern two subsequent inferred segments of the same type/color on the terminal. + raise NotImplementedError() + + def fieldtypes(self, selectMessages: List[AbstractMessage]): + """ + Generate tikz source code visualizing the the byte values of selected messages overlaid with interleaved + representation of true and inferred field types. + + requires \\usetikzlibrary{positioning, fit} in the including latex document header. + + Adapted from nemere.validation.dissectorMatcher.MessageComparator.tprintInterleaved. 
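offset2colorlabel above expands a message's segments into one label per byte, padding gaps and trailing bytes with None. A condensed, dependency-free sketch of that expansion, with (offset, length, label) tuples standing in for MessageSegment objects:

# Condensed sketch of the offset2colorlabel expansion: (offset, length, label)
# tuples stand in for MessageSegment objects; unlabeled bytes become None.
def offsets_to_labels(segments, message_length):
    """segments: iterable of (offset, length, label); returns one label per byte."""
    labels = []
    for offset, length, label in sorted(segments):
        labels += [None] * (offset - len(labels))        # pad any gap before the segment
        labels += [label] * length                       # one label per segment byte
    labels += [None] * (message_length - len(labels))    # pad trailing unlabeled bytes
    return labels

assert offsets_to_labels([(0, 2, 'id'), (4, 2, 'flags')], 8) == \
       ['id', 'id', None, None, 'flags', 'flags', None, None]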
+ + :return LaTeX/tikz code + """ + assert all(msg in self._comparator.messages for msg in selectMessages), \ + "At least one of the selected messages is not present in the comparator." + + # define labels, colors, styles + styles, self.__ftstyles, ftcolornames, ftcolors = self._ftHelper.colorHashStyles(selectMessages) + + # start filling the texcode variable + texcode = "\n ".join(ftcolors) + "\n" + texcode += self.toTikz(selectMessages, styles) + + # color legend + texcode += "Field type colors:\\\\\n" + for lab, tag in self.__ftstyles.items(): + texlab = lab.replace("_", "\\_") + texcode += f"\\colorbox{{{ftcolornames[tag]}}}{{{texlab}}}\\\\\n" + + return texcode + "\n" + + def toTikz(self, selectMessages: Iterable[AbstractMessage] = None, styles = None): + # define labels, colors, styles + autostyles, self.__ftstyles, ftcolornames, ftcolors = self._ftHelper.colorHashStyles(selectMessages) + if styles is not None: + autostyles += styles + return super().toTikz(selectMessages, autostyles) + + +class FieldClassesPrinter(SegmentPrinter): + def __init__(self, ftclusters: List[FieldTypeTemplate]): + super().__init__(()) + self._ftHelper = FieldtypeHelper(ftclusters) + self._segmentedMessages = self._ftHelper.segmentedMessages + self._ftclusters = ftclusters # type: List[FieldTypeTemplate] + self._offset2color = dict() + # set at the beginning of toTikz() and used in _msgoffs2label() during callback of super().toTikz() + self.__ftstyles = dict() + + def _msgoffs2label(self, msg, po): + # place offset labels in caches + if msg not in self._offset2color: + self._offset2color[msg] = self._offset2colorlabel(msg) + # style for the color label + return self.__ftstyles[self._offset2color[msg][po - 1]] \ + if self._offset2color[msg][po - 1] is not None else "nonelabel" + + def _offset2colorlabel(self, message): + return self._ftHelper.offset2colorlabel(message) + + def toTikz(self, selectMessages: Iterable[AbstractMessage] = None, styles = None): + """ + Generate tikz source code visualizing the the byte values of selected messages and inferred field types. + + requires \\usetikzlibrary{positioning, fit} in the including latex document header. + + :return LaTeX/tikz code + """ + selectMessagesList = list(selectMessages) if selectMessages is not None else None + assert selectMessages is None or all(msg in self._segmentedMessages for msg in selectMessagesList), \ + "At least one of the selected messages has no representative in any of the segments in the ftclusters." 
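colorHashStyles derives a deterministic color per field-type label via colorhash and emits matching LaTeX color definitions, which the legends above reuse in \colorbox commands. A minimal sketch of the color generation; the label set below is made up, and the normalization to [0,1] is an assumption of this sketch:

# Minimal sketch of the ColorHash-based color generation used by colorHashStyles:
# one deterministic LaTeX definecolor line per label.
from colorhash import ColorHash

def latex_colors(labels):
    lines = []
    for label in labels:
        name = "col" + label.replace("_", "")
        red, green, blue = (channel / 255 for channel in
                            ColorHash(label, lightness=(0.5, 0.6, 0.7, 0.8)).rgb)
        lines.append(f"\\definecolor{{{name}}}{{rgb}}{{{red:.3f},{green:.3f},{blue:.3f}}}")
    return lines

for line in latex_colors(["int_le", "macaddr", "chars"]):
    print(line)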
+ + # define labels, colors, styles + styles, self.__ftstyles, ftcolornames, ftcolors = self._ftHelper.colorHashStyles(selectMessagesList) + + # start filling the texcode variable + texcode = "\n ".join(ftcolors) + "\n" + texcode += super().toTikz(selectMessagesList, styles) + + # color legend + texcode += "Field type (from classification) colors:\\\\\n" + for lab, tag in self.__ftstyles.items(): + texlab = lab.replace("_", "\\_") + texcode += f"\\colorbox{{{ftcolornames[tag]}}}{{{texlab}}}\\\\\n" + + return texcode + "\n" diff --git a/src/nemere/visualization/singlePlotter.py b/src/nemere/visualization/singlePlotter.py index 0aa06500..5f6cab39 100644 --- a/src/nemere/visualization/singlePlotter.py +++ b/src/nemere/visualization/singlePlotter.py @@ -1,5 +1,6 @@ from typing import List, Union, Tuple, Dict, Sequence import numpy +import matplotlib as mpl import matplotlib.pyplot as plt from netzob.Model.Vocabulary.Symbol import Symbol @@ -7,6 +8,7 @@ from nemere.visualization.plotter import MessagePlotter from nemere.validation.dissectorMatcher import MessageComparator from nemere.utils.loader import SpecimenLoader +from nemere.utils.evaluationHelpers import uulmColors # noinspection PyMethodMayBeStatic @@ -21,6 +23,8 @@ def __init__(self, specimens: SpecimenLoader, analysisTitle: str, isInteractive: super().__init__(specimens, analysisTitle, isInteractive) plt.rc('xtick', labelsize=8) # fontsize of the tick labels plt.rc('ytick', labelsize=8) # fontsize of the tick labels + self._ax = plt.gca() + self._fig = plt.gcf() def plotAnalysis(self, analysisResults, compareValue = None, fieldEnds = None, labels = None): @@ -44,7 +48,21 @@ def plotAnalysis(self, analysisResults, compareValue = None, fieldEnds = None, l raise NotImplementedError("Plotting fieldEnds and labels is not implemented.") - def plotColormesh(self, analysisResults: Union[List[List[float]], numpy.ndarray], fieldEnds: List[List[int]]=None): + def plotColormesh(self, + analysisResults: Union[List[List[float]], numpy.ndarray], + fieldEnds: List[List[int]]=None, + valueDomain=(0,255), ylabel="byte value"): + """ + + :param analysisResults: + :param fieldEnds: + :param valueDomain: Use a tuple of two float values to change the normed color scale + to be continuous instead of discrete classes. + :param ylabel: Label for the colorbar legend. 
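plotColormesh (continued below) picks between a discrete BoundaryNorm and a continuous Normalize depending on the value domain. A stand-alone sketch of that selection under the same assumptions (plasma colormap, inclusive integer domains):

# Stand-alone sketch of the norm selection in plotColormesh: values are quantized
# into discrete classes (BoundaryNorm) as long as the classes fit the colormap;
# larger integer domains fall back to a continuous Normalize.
import numpy
import matplotlib as mpl

def choose_norm(value_domain, cmap=mpl.cm.plasma):
    if isinstance(value_domain[0], float) or isinstance(value_domain[1], float):
        interval_num = cmap.N                       # float domains use the full colormap
    else:
        interval_num = int(value_domain[1] - value_domain[0] + 1)
    boundaries = numpy.linspace(value_domain[0], value_domain[1], interval_num)
    if interval_num <= cmap.N:
        return mpl.colors.BoundaryNorm(boundaries, cmap.N)
    return mpl.colors.Normalize(*value_domain)

byte_norm = choose_norm((0, 255))      # 256 classes, one per byte value
wide_norm = choose_norm((0, 1000))     # more classes than colors -> Normalize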
+ :return: + """ + from mpl_toolkits.axes_grid1 import make_axes_locatable + if isinstance(analysisResults, numpy.ndarray): paddedar = analysisResults else: @@ -53,11 +71,43 @@ def plotColormesh(self, analysisResults: Union[List[List[float]], numpy.ndarray] paddedar = numpy.array( [line + [numpy.nan]*(mslen - len(line)) for line in analysisResults] ) + paperheight = 8 + paperwidth = max(16, paddedar.shape[1] * (paperheight * 0.90 / paddedar.shape[0])) + self.fig.set_size_inches(paperwidth,paperheight) + # plt.figure(figsize=(paperwidth,paperheight)) + + # optimize boundaries, colors and ticks for byte value range + cmap = mpl.cm.plasma # cubehelix, jet + if isinstance(valueDomain[0], float) or isinstance(valueDomain[1], float): + intervalNum = cmap.N + else: + intervalNum = int(valueDomain[1] - valueDomain[0] + 1) + boundaries = numpy.linspace(*valueDomain, intervalNum) + if intervalNum <= cmap.N: + norm = mpl.colors.BoundaryNorm(boundaries, cmap.N) + else: + norm = mpl.colors.Normalize(*valueDomain) + # pcm = plt.pcolormesh(paddedar, norm=norm, cmap=cmap) + # ensure square mesh elements/2D regular raster by using imshow + pcm = plt.imshow(paddedar, norm=norm, cmap=cmap) - plt.pcolormesh(paddedar) if fieldEnds: for msgnr, fe in enumerate(fieldEnds): - plt.scatter(fe, [msgnr + 0.5] * len(fe), color='black', marker='.', s=2) + # for pcolormesh, the coordinate needs to be (fe, [msgnr + 0.5]) + plt.scatter(numpy.array(fe) - 0.5, [msgnr] * len(fe), color='white', marker='.', s=10) + + tickSep = intervalNum//8 + ticks = numpy.append(boundaries[::tickSep], boundaries[-1]) + divider = make_axes_locatable(self.ax) + cax1 = divider.append_axes("right", size=0.15, pad=0.05) + self.fig.colorbar(pcm, ticks=ticks, boundaries=boundaries, cax=cax1) # , values=list(range(255)) + cax1.set_ylabel(ylabel) + + self.ax.tick_params(axis='both', which='major', labelsize=8) + # xlim = plt.xlim() + # ylim = plt.ylim() + # plt.xlim(xlim[0] - 1, xlim[1] + 1) + # plt.ylim(ylim[0] - 1, ylim[1] + 1) plt.autoscale(tight=True) @@ -179,7 +229,7 @@ def heatMapFieldComparison(self, comparator: MessageComparator, symbols: List[Sy # # ax.plot([dcenter], [0.5], color='black', marker='.', markersize=6) # ax.axvline(x=[0], **MessagePlotter.STYLE_FIELDENDLINE) # TODO change to one continuous bar plot - plt.bar(combinedDistances[0], combinedDistances[1], width=1.0, color="green") + plt.bar(combinedDistances[0], combinedDistances[1], width=1.0, color=uulmColors['uulm-mawi']) maxtrueticks = list() mintickdist = combinedDistances[0,-1] * 0.04 # the x coordinate of the last of the plot columns offset = maxTrueLens[0] @@ -236,3 +286,14 @@ def text(self, text: str): plt.text(left + marginH, top + marginV, text) + @property + def ax(self) -> plt.Axes: + """Convenience property for future change of the class to use something else then pyplot.""" + return self._ax + + + @property + def fig(self) -> plt.Figure: + """Convenience property for future change of the class to use something else then pyplot.""" + return self._fig + diff --git a/src/nemesys.py b/src/nemesys.py index 721a7a43..c6186cd2 100644 --- a/src/nemesys.py +++ b/src/nemesys.py @@ -30,7 +30,7 @@ parser.add_argument('-l', '--layer', type=int, default=2, help='Protocol layer to consider. Default is layer 2. 
Use --relativeToIP ' 'to use a layer relative to IP layer.') - parser.add_argument('-r', '--relativeToIP', default=False, action='store_true', \ + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true', help='Consider a layer relative to the IP layer (see also --layer flag)') args = parser.parse_args() if not isfile(args.pcapfilename): diff --git a/src/nemesys_fms.py b/src/nemesys_fms.py index 21aa7cdd..2e9984cc 100644 --- a/src/nemesys_fms.py +++ b/src/nemesys_fms.py @@ -10,14 +10,14 @@ from os import makedirs import matplotlib.pyplot as plt -import IPython from nemere.validation.dissectorMatcher import MessageComparator, FormatMatchScore, DissectorMatcher from nemere.utils.loader import SpecimenLoader from nemere.inference.analyzers import * -from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation, \ - baseRefinements, symbolsFromSegments +from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation, baseRefinements, originalRefinements, \ + symbolsFromSegments from nemere.utils import reportWriter +from nemere.utils.evaluationHelpers import sigmapertrace debug = False """Some modules and methods contain debug output that can be activated by this flag.""" @@ -125,7 +125,14 @@ def bcDeltaPlot(bcdg_mmm: List[BitCongruenceDeltaGauss]): print('File not found: ' + args.pcapfilename) exit(1) - sigma = 0.6 if not args.sigma else args.sigma + # sigma = 0.6 if not args.sigma else args.sigma + if not args.sigma: + # input="input/maxdiff-fromOrig/*-100.pcap input/deduped-orig/dhcp_SMIA2011101X_deduped-10000.pcap input/deduped-orig/dns_ictf2010-new-deduped-10000.pcap input/deduped-orig/nbns_SMIA20111010-one_deduped-10000.pcap input/deduped-orig/ntp_SMIA-20111010_deduped-9995-10000.pcap input/deduped-orig/smb_SMIA20111010-one_deduped-10000.pcap" + # for fn in $input ; do python src/nemesys_fms.py -rl2 ${fn} ; done + pcapBasename = basename(args.pcapfilename) + sigma = sigmapertrace[pcapBasename] if pcapBasename in sigmapertrace else 0.6 + else: + sigma = args.sigma print("Load messages...") specimens = SpecimenLoader(args.pcapfilename, layer=args.layer, @@ -142,8 +149,8 @@ def bcDeltaPlot(bcdg_mmm: List[BitCongruenceDeltaGauss]): startsegmentation = time.time() segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma) runtimeSegmentation = time.time() - startsegmentation - # refinedPerMsg = originalRefinements(segmentsPerMsg) - refinedPerMsg = baseRefinements(segmentsPerMsg) + refinedPerMsg = originalRefinements(segmentsPerMsg) + # refinedPerMsg = baseRefinements(segmentsPerMsg) runtimeRefinement = time.time() - startsegmentation print('Segmented and refined in {:.3f}s'.format(time.time() - startsegmentation)) @@ -180,6 +187,6 @@ def bcDeltaPlot(bcdg_mmm: List[BitCongruenceDeltaGauss]): IPython.embed() else: reportWriter.writeReport(message2quality, runtimeRefinement, - specimens, comparator, inferenceTitle) + comparator, inferenceTitle) reportWriter.writeReport(DissectorMatcher.symbolListFMS(comparator, symbols), runtimeSegmentation, - specimens, comparator, inferenceTitle + '_withoutRefinement') + comparator, inferenceTitle + '_withoutRefinement') diff --git a/src/nemesys_vd.py b/src/nemesys_vd.py new file mode 100644 index 00000000..976c72ec --- /dev/null +++ b/src/nemesys_vd.py @@ -0,0 +1,195 @@ +""" +Infer messages from PCAPs by Value Delta +and write FMS and other evaluation data to report. + +Based on NEMESYS, Usenix WOOT 2018. 
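nemesys_fms.py now falls back to a per-trace sigma from the sigmapertrace table when --sigma is not given. The same lookup expressed with dict.get; the table entries below are assumptions for illustration only.

# Equivalent formulation of the per-trace sigma fallback, using dict.get;
# the example table entries are assumptions, not values from the patch.
from os.path import basename

sigmapertrace = {"dhcp_SMIA2011101X_deduped-10000.pcap": 0.6,
                 "ntp_SMIA-20111010_deduped-9995-10000.pcap": 1.2}

def sigma_for(pcapfilename: str, default: float = 0.6) -> float:
    return sigmapertrace.get(basename(pcapfilename), default)

assert sigma_for("input/deduped-orig/ntp_SMIA-20111010_deduped-9995-10000.pcap") == 1.2
assert sigma_for("input/unknown.pcap") == 0.6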
+""" + +import argparse, time +from os.path import isfile, join, splitext, basename, abspath, isdir +from os import makedirs + +import matplotlib.pyplot as plt + +from nemere.validation.dissectorMatcher import MessageComparator, FormatMatchScore, DissectorMatcher +from nemere.utils.loader import SpecimenLoader +from nemere.inference.analyzers import * +from nemere.inference.segmentHandler import originalRefinements, baseRefinements, symbolsFromSegments, charRefinements +from nemere.utils import reportWriter + +debug = False +"""Some modules and methods contain debug output that can be activated by this flag.""" + + +def mapQualities2Messages(m2q: Dict[AbstractMessage, FormatMatchScore]) \ + -> Dict[float, List[AbstractMessage]]: + """ + Create a mapping from FMS values to messages of this quality aspects. + + :param m2q: A mapping of Messages to their quality aspects. + :return: A mapping of FMS (rounded to 3 positions) to a list of messages with that quality. + """ + q2m = dict() + for q in m2q.values(): + qkey = round(q.score, 3) + if qkey not in q2m: # for messages with identical scores, have a list + q2m[qkey] = list() + q2m[qkey].append(q.message) + return q2m + + +# noinspection PyShadowingNames,PyShadowingNames +def writeResults(tikzcode: str, specimens: SpecimenLoader, inferenceTitle: str, folder="reports"): + """ + Write NEMESYS inference evaluation results to a report + + :param tikzcode: tikz code of inference examples (e. g. worst, average, best result) + :param specimens: The input data encasulated in a SpecimenLoader object + :param inferenceTitle: A title for this inference report + :param folder: The folder to safe the report to + :return: + """ + + absFolder = abspath(folder) + if not isdir(absFolder): + raise NotADirectoryError("The reports folder {} is not a directory. Reports cannot be written there.".format( + absFolder)) + + pcapName = splitext(basename(specimens.pcapFileName))[0] + reportFolder = join(absFolder, pcapName + "_{}_fms_{}".format( + inferenceTitle, time.strftime("%Y%m%d-%H%M%S", time.localtime()))) + makedirs(reportFolder) + + print('Write report to ' + reportFolder) + + # write Format Match Score and Metrics to csv + with open(join(reportFolder, 'example-inference.tikz'), 'w') as tikzfile: + tikzfile.write(tikzcode) + + +# noinspection PyShadowingNames +def bcDeltaPlot(bcdg_mmm: List[ValueVariance]): + """ + Plot BCD(G) values for the messages with the best, average, and worst FMS. + + :param bcdg_mmm: Example message analysis results to plot. Expects three elements in the list. 
+ """ + from nemere.visualization.multiPlotter import MultiMessagePlotter + + fieldEnds = [comparator.fieldEndsPerMessage(bcdg.message) for bcdg in bcdg_mmm] + + # mark the byte right before the max delta + # inflectionXs = [[offset + int(numpy.nanargmax(wd)) - 1 for offset, wd in a.risingDeltas()] for a in bcdg_mmm] + # inflections = [(pwp, [bcdg.values[p] for p in pwp]) for pwp, bcdg in zip(inflectionXs, bcdg_mmm)] + + # pinpointedInflections = [a.inflectionPoints() for a in bcdg_mmm] + # preInflectionXs = [[i - 1 for i in xs] for xs,ys in pinpointedInflections] + # preInflectionPoints = [ (pwp, [bcdg.bcdeltas[p] for p in pwp]) for pwp, bcdg in zip(preInflectionXs, bcdg_mmm)] + + mmp = MultiMessagePlotter(specimens, 'valueDelta', 3, 1, args.interactive) + # noinspection PyProtectedMember + for ax in mmp._axes.flat: # type: plt.Axes + ax.tick_params(labelsize=7) + ax.set_xlabel('Byte Position', fontdict={'fontsize':7}) + # aspect ratio 2:3 + mmp.setFigureSize(3.136, 3 * (.667 * 3.136 + 0.14) ) # 10 pt = 0.139 in + mmp.plotSubfigs([a.values for a in bcdg_mmm], + # compareValue=[a.bcdeltas for a in bcdg_mmm], + fieldEnds=fieldEnds, fieldEndMarks=False) + # mmp.scatterInEachAx(preInflectionPoints, 'v') + # mmp.scatterInEachAx(inflections, 'o') + mmp.printMessageBytes([a.message for a in bcdg_mmm], {'size': 4}) # set to 4 for DNS, 2.5 for NTP + mmp.writeOrShowFigure() + + + + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate segments of messages using the NEMESYS method and evaluate against tshark dissectors: ' + 'Write a report containing the FMS for each message and other evaluation data.') + parser.add_argument('pcapfilename', help='pcapfilename') + parser.add_argument('-i', '--interactive', help='open ipython prompt after finishing the analysis.', + action="store_true") + parser.add_argument('-s', '--sigma', type=float, help='sigma for noise reduction (gauss filter)') + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + args = parser.parse_args() + if not isfile(args.pcapfilename): + print('File not found: ' + args.pcapfilename) + exit(1) + + sigma = 0.6 if not args.sigma else args.sigma + + print("Load messages...") + specimens = SpecimenLoader(args.pcapfilename, layer=args.layer, + relativeToIP = args.relativeToIP) + comparator = MessageComparator(specimens, layer=args.layer, + relativeToIP=args.relativeToIP, + failOnUndissectable=False, debug=debug) + + ######################## + + print("Segment messages...") + BCDG = True + if BCDG: # for comparison reasons. 
+ inferenceTitle = 'bcDeltaGauss{:.1f}'.format(sigma) + else: + inferenceTitle = 'ValueVariance{:.1f}'.format(sigma) + + startsegmentation = time.time() + segmentsPerMsg = list() + for l4msg, rmsg in specimens.messagePool.items(): + if BCDG: + analyzer = BitCongruenceDeltaGauss(l4msg) + analyzer.setAnalysisParams(0.9) + else: + analyzer = ValueVariance(l4msg) + analyzer.analyze() + segmentsPerMsg.append(analyzer.messageSegmentation()) + runtimeSegmentation = time.time() - startsegmentation + # refinedPerMsg = originalRefinements(segmentsPerMsg) + refinedPerMsg = charRefinements(segmentsPerMsg) + runtimeRefinement = time.time() - startsegmentation + + print('Segmented and refined in {:.3f}s'.format(time.time() - startsegmentation)) + + symbols = symbolsFromSegments(segmentsPerMsg) + refinedSymbols = symbolsFromSegments(refinedPerMsg) + + ######################## + + # Without refinement: + # comparator.pprintInterleaved(symbols) + comparator.pprintInterleaved(refinedSymbols) + + # calc FMS per message + print("Calculate FMS...") + message2quality = DissectorMatcher.symbolListFMS(comparator, refinedSymbols) + + # have a mapping from quality to messages + quality2messages = mapQualities2Messages(message2quality) + msg2analyzer = {segs[0].message: segs[0].analyzer for segs in refinedPerMsg} + minmeanmax = reportWriter.getMinMeanMaxFMS([round(q.score, 3) for q in message2quality.values()]) + + # here we only use one message as example of a quality! There may be more messages with the same quality. + bcdg_mmm = [msg2analyzer[quality2messages[q][0]] for q in minmeanmax] # type: List[ValueVariance] + bcDeltaPlot(bcdg_mmm) + + ######################## + + # writeResults(tikzcode, specimens, inferenceTitle) + if args.interactive: + print('Loaded PCAP in: specimens, comparator') + print('Inferred messages in: symbols, refinedSymbols') + print('FMS of messages in: message2quality, quality2messages, minmeanmax') + IPython.embed() + else: + reportWriter.writeReport(message2quality, runtimeRefinement, + comparator, inferenceTitle) + reportWriter.writeReport(DissectorMatcher.symbolListFMS(comparator, symbols), runtimeSegmentation, + comparator, inferenceTitle + '_withoutRefinement') diff --git a/src/nemetyl.py b/src/nemetyl.py index e1d6db65..bf9088c3 100644 --- a/src/nemetyl.py +++ b/src/nemetyl.py @@ -18,6 +18,7 @@ from nemere.alignment.alignMessages import TypeIdentificationByAlignment from nemere.inference.segmentHandler import originalRefinements, nemetylRefinements +from nemere.inference.templates import ClusterAutoconfException from nemere.utils.evaluationHelpers import StartupFilecheck, CachedDistances, TitleBuilder, writePerformanceStatistics # https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy @@ -49,10 +50,11 @@ parser.add_argument('-l', '--layer', type=int, default=2, help='Protocol layer to consider. Default is layer 2. 
Use --relativeToIP ' 'to use a layer relative to IP layer.') - parser.add_argument('-r', '--relativeToIP', default=False, action='store_true', \ + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true', help='Consider a layer relative to the IP layer (see also --layer flag)') parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.', choices=tokenizers, default="nemesys") + parser.add_argument('-e', '--littleendian', help='Toggle presumed endianness to little.', action="store_true") parser.add_argument('-s', '--sigma', type=float, help='Only NEMESYS: sigma for noise reduction (gauss filter), default: 0.9') parser.add_argument('-f', '--refinement', help='Select segment refinement method.', choices=refinementMethods, @@ -62,7 +64,10 @@ filechecker = StartupFilecheck(args.pcapfilename) withplots = args.with_plots + littleendian = args.littleendian == True tokenizer = args.tokenizer + if littleendian: + tokenizer += "le" # # # # # # # # # # # # # # # # # # # # # # # # # Cache/load the segmentation and segment dissimilarities @@ -87,7 +92,12 @@ else: print(f"The refinement {args.refinement} is not supported with this tokenizer. Abort.") exit(2) - fromCache.get() + try: + fromCache.get() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. " + "The original exception message was:\n", e) + exit(10) segmentedMessages = fromCache.segmentedMessages specimens, _, dc = fromCache.specimens, fromCache.comparator, fromCache.dc segments = dc.rawSegments diff --git a/src/nemetyl_align-segments.py b/src/nemetyl_align-segments.py index ed7ebd95..af4b6ca7 100644 --- a/src/nemetyl_align-segments.py +++ b/src/nemetyl_align-segments.py @@ -12,13 +12,12 @@ which is used as feature to determine their similarity. Similar fields are then aligned. 
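Both nemetyl entry points now wrap the cache retrieval in a guard that aborts with exit code 10 when epsilon autoconfiguration fails. A stripped-down sketch of that guard; load_distances below is a placeholder for CachedDistances.get() and the exception message is invented:

# Stripped-down sketch of the ClusterAutoconfException guard added in both
# nemetyl entry points; load_distances is a placeholder, not repository code.
import sys
from nemere.inference.templates import ClusterAutoconfException

def load_distances():
    """Placeholder for CachedDistances.get(); raises here only to demonstrate the guard."""
    raise ClusterAutoconfException("epsilon could not be determined automatically")

try:
    load_distances()
except ClusterAutoconfException as e:
    print("Initial clustering of the segments in the trace failed. "
          "The protocol in this trace cannot be inferred:", e)
    sys.exit(10)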
""" -import argparse, IPython - +import argparse from nemere.alignment.alignMessages import TypeIdentificationByAlignment -from nemere.inference.segmentHandler import originalRefinements, baseRefinements, \ - nemetylRefinements +from nemere.inference.segmentHandler import originalRefinements, baseRefinements, nemetylRefinements from nemere.alignment.hirschbergAlignSegments import HirschbergOnSegmentSimilarity +from nemere.inference.templates import ClusterAutoconfException from nemere.utils.evaluationHelpers import * from nemere.utils.reportWriter import IndividualClusterReport, CombinatorialClustersReport from nemere.visualization.multiPlotter import MultiMessagePlotter @@ -350,6 +349,7 @@ def discriminators(single, selectcount): parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.', choices=tokenizers, default="tshark") + parser.add_argument('-e', '--littleendian', help='Toggle presumed endianness to little.', action="store_true") parser.add_argument('-s', '--sigma', type=float, help='Only NEMESYS: sigma for noise reduction (gauss filter),' 'default: 0.9') parser.add_argument('-f', '--refinement', help='Select segment refinement method.', choices=refinementMethods, @@ -361,10 +361,13 @@ def discriminators(single, selectcount): filechecker = StartupFilecheck(args.pcapfilename) withplots = args.with_plots + littleendian = args.littleendian == True analyzerType = analyses[analysis_method] analysisArgs = None analysisTitle = analysis_method tokenizer = args.tokenizer + if littleendian: + tokenizer += "le" # # # # # # # # # # # # # # # # # # # # # # # # # cache/load the DistanceCalculator to the filesystem @@ -387,7 +390,12 @@ def discriminators(single, selectcount): fromCache.configureRefinement(nemetylRefinements) else: print("No refinement selected. Performing raw segmentation.") - fromCache.get() + try: + fromCache.get() + except ClusterAutoconfException as e: + print("Initial clustering of the segments in the trace failed. The protocol in this trace cannot be inferred. 
" + "The original exception message was:\n", e) + exit(10) segmentedMessages = fromCache.segmentedMessages specimens, comparator, dc = fromCache.specimens, fromCache.comparator, fromCache.dc segmentationTime, dist_calc_segmentsTime = fromCache.segmentationTime, fromCache.dist_calc_segmentsTime @@ -436,8 +444,8 @@ def discriminators(single, selectcount): # epsilon = message_epspertrace[filechecker.pcapbasename] # if filechecker.pcapbasename in message_epspertrace else 0.15 if withplots: - epsConfirm = epsautoconfeval(tyl.eps, tokenizer + f"-s{args.sigma}-{args.refinement}" - if tokenizer[:7] == "nemesys" else "") + epsConfirm = epsautoconfeval(tyl.eps, tokenizer + + (f"-s{args.sigma}-{args.refinement}" if tokenizer[:7] == "nemesys" else "") ) # # # # # # # # # # # # # # # # # # # # # # # # # DEBUG and TESTING # # # # # # # # # # # # # # # # # # # # # # # # diff --git a/src/netzob_fms.py b/src/netzob_fms.py index 29e7ecd9..4e913c69 100644 --- a/src/netzob_fms.py +++ b/src/netzob_fms.py @@ -45,6 +45,8 @@ "nbns_SMIA20111010-one_maxdiff-" : 53, "ntp_SMIA-20111010_maxdiff-" : 66, "smb_SMIA20111010-one-rigid1_maxdiff-" : 53, + "awdl-filtered" : 57, + "au-wifi-filtered" : 51, } @@ -154,8 +156,8 @@ def reduceBitsToBytes(formatdescbit): if __name__ == '__main__': parser = argparse.ArgumentParser( - description='Compare netzob inference and a scapy dissector for a set of messages (pcap).') - parser.add_argument('pcapfilename', help='pcapfilename') + description='Compare netzob inference and a protocol dissector for a set of messages (PCAP).') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') parser.add_argument('--smin', type=int, help='minimum similarity threshold to iterate.') parser.add_argument('--smax', type=int, help='maximum similarity threshold to iterate. 
Omit to only infer at the threshold of smin') parser.add_argument('-p', '--profile', help='profile the netzob run.', @@ -240,13 +242,12 @@ def reduceBitsToBytes(formatdescbit): qpfSimilarity[metrics.trueFormat].append(thresh) # TODO biggest/most correct cluster per threshold - # TODO format correctness, consiseness, (coverage) of each symbol + # TODO format correctness, conciseness, (coverage) of each symbol # ## Output # FMS.printFMS(formatmatchmetrics, False) # plot_scatter3d(underOverSpecific, formatMatchScore, similarityThreshold) - scoreStats = FMS.MessageScoreStatistics(comparator) - scoreStats.printMinMax(formatmatchmetrics) + FMS.MessageScoreStatistics.printMinMax(formatmatchmetrics) # experimental # plt.ion() diff --git a/src/netzob_messagetypes.py b/src/netzob_messagetypes.py index 81686c5c..c49c786f 100644 --- a/src/netzob_messagetypes.py +++ b/src/netzob_messagetypes.py @@ -81,6 +81,26 @@ def iterSimilarities(minSimilarity=40, maxSimilarity=60) \ symFmt[similaritythreshold][0][symbol] = tformats return symFmt +def writeNetzobPerformanceStatistics(specimens, threshTime): + import os, csv + from nemere.utils.evaluationHelpers import reportFolder + + fileNameS = "Netzob-performance-statistics" + csvpath = os.path.join(reportFolder, fileNameS + '.csv') + csvWriteHead = False if os.path.exists(csvpath) else True + + print('Write performance statistics to {}...'.format(csvpath)) + with open(csvpath, 'a') as csvfile: + statisticscsv = csv.writer(csvfile) + if csvWriteHead: + statisticscsv.writerow([ + 'script', 'pcap', 'threshold', 'runtime' + ]) + # noinspection PyUnresolvedReferences,PyPackageRequirements + import __main__ as main + statisticscsv.writerows([ + [os.path.basename(main.__file__), os.path.basename(specimens.pcapFileName), threshold, runtime] + for threshold, runtime in threshTime.items() ]) @@ -119,7 +139,9 @@ def iterSimilarities(minSimilarity=40, maxSimilarity=60) \ maxThresh = args.smax if args.smax else args.smin threshSymbTfmtTime = iterSimilarities(minThresh, maxThresh) threshSymbTfmt = {t: s for t, (s, r) in threshSymbTfmtTime.items()} + threshTime = {t: r for t, (s, r) in threshSymbTfmtTime.items()} + writeNetzobPerformanceStatistics(specimens, threshTime) print('\nCalculate Cluster Statistics...') swstart = time.time() diff --git a/src/prep_filter-maxdiff-trace.py b/src/prep_filter-maxdiff-trace.py new file mode 100644 index 00000000..14942ec6 --- /dev/null +++ b/src/prep_filter-maxdiff-trace.py @@ -0,0 +1,270 @@ +""" +Filter a PCAP for the subset of packets that have the maximum difference to all other messages. +For the maximum difference, multiple approaches are conceivable. Here we implement three for comparison and apply +the metric of the average least common segment values per message. 
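The metric named above, the average least common segment values per message, is computed in valueCommonalityFilter further below as the median of the global counts of a message's segment values. A toy computation of that metric on plain byte strings:

# Toy computation of the commonality metric: count every segment value across
# all messages, then score each message by the median count of its segments.
from collections import Counter
from itertools import chain
import numpy

messages = [[b"\x00\x01", b"abc"], [b"\x00\x01", b"xyz"], [b"\x00\x01", b"abc"]]
value_counts = Counter(chain.from_iterable(messages))        # global segment-value counts

commonality = {idx: float(numpy.median([value_counts[seg] for seg in msg]))
               for idx, msg in enumerate(messages)}
rarest_first = sorted(commonality, key=commonality.get)      # message 1 is the most uncommon
print(commonality, rarest_first)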
+
+For some evaluation results see: nemesys-reports/NEMEPrep/prep_filter-maxdiff-trace.terminal.txt
+"""
+
+import logging # hide warnings of scapy: https://stackoverflow.com/questions/24812604/hide-scapy-warning-message-ipv6
+logging.getLogger("scapy.runtime").setLevel(logging.ERROR)
+
+
+import argparse, bisect, csv, IPython, numpy, time
+import scapy.all as sy
+from dataclasses import dataclass
+from os.path import isfile, splitext, exists, join
+from itertools import chain
+from collections import Counter, OrderedDict
+from typing import List
+from tabulate import tabulate
+
+from netzob.Model.Vocabulary.Messages.RawMessage import AbstractMessage
+
+from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation, baseRefinements
+from nemere.inference.segments import MessageSegment
+from nemere.inference.templates import DelegatingDC, Template
+from nemere.inference.analyzers import Value
+from nemere.utils.loader import SpecimenLoader, BaseLoader
+from nemere.utils.evaluationHelpers import reportFolder
+from nemere.validation.dissectorMatcher import MessageComparator
+
+PACKET_LIMIT = 100
+sigma = 1.2
+
+
+@dataclass
+class MessageValueCommonality:
+    commonality : float
+    message : AbstractMessage
+
+    def __typecheck(self, other):
+        if not isinstance(other, type(self)):
+            raise TypeError("Non-comparable objects.")
+
+    def __lt__(self, other):
+        self.__typecheck(other)
+        return self.commonality < other.commonality
+
+    def __le__(self, other):
+        self.__typecheck(other)
+        return self.commonality <= other.commonality
+
+    def __ge__(self, other):
+        self.__typecheck(other)
+        return self.commonality >= other.commonality
+
+    def __gt__(self, other):
+        self.__typecheck(other)
+        return self.commonality > other.commonality
+
+
+# noinspection PyShadowingNames
+def canberraDissimFilter(specimens: BaseLoader, packetcount: int):
+    """
+    Interpret each message as one segment, calculate their Canberra dissimilarity and filter for the largest mean
+    dissimilarities in the resulting DC matrix.
+
+    The result mostly looks alright at first glance, but this filter is computationally expensive.
+    """
+    oneSegPerMsg = [MessageSegment(Value(msg), 0, len(msg.data)) for msg in specimens.messagePool.keys()]
+    dc = DelegatingDC(oneSegPerMsg)
+    # get packetcount largest mean dissimilarities from matrix -> msg-indices.
+    preFilteredMsgs = [msg for msg, meanDiss in
+                       sorted(((msg, meanDiss) for msg, meanDiss in zip(dc.segments, dc.distanceMatrix.mean(axis=0))),
+                              key=lambda x: x[1])[-packetcount:]]
+    # replace all templates by one of their base segments. The other messages are duplicates and should be removed.
+    filteredMsgs = [msg.baseSegments[0] if isinstance(msg, Template) else msg for msg in preFilteredMsgs]
+    return filteredMsgs
+
+# noinspection PyShadowingNames
+def cosineCommonalityFilter(refinedPerMsg: List[List[MessageSegment]], packetcount: int):
+    """
+    Unfinished filter, originally intended to validate the simpler filters.
+
+    Calculates the similarity of messages as the cosine similarity of their segment-commonality feature
+    vectors and filters for the most dissimilar messages.
+
+    It is comparably computationally expensive and, from the looks of it, seems to give a skewed result.
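
    A minimal sketch of the feature construction this filter uses, with made-up numbers
    (scikit-learn's cosine_similarity, which is imported below):

        import numpy
        from sklearn.metrics.pairwise import cosine_similarity

        # rows: messages; columns: distinct segment values;
        # entries: the value's global count if the message contains it, else 0
        vectors = numpy.array([[3., 0., 1.],
                               [3., 2., 0.],
                               [0., 2., 1.]])
        cosim = cosine_similarity(vectors)   # (3, 3) matrix with 1.0 on the diagonal
        meanSimi = cosim.mean(axis=0)        # low mean similarity = dissimilar to the rest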
+ """ + from sklearn.metrics.pairwise import cosine_similarity + + # all the non-zero-only segment + valCounter = Counter(s.bytes for s in chain.from_iterable(refinedPerMsg) if set(s.bytes) != b"\x00") + # mapping from segment values to an index for it and its global count + valIdx = {val: (idx,cnt) for idx,(val,cnt) in enumerate(valCounter.most_common())} + + # represent global counts of values in a feature matrix + vectors = numpy.zeros((len(refinedPerMsg), len(valCounter))) + for mid,msg in enumerate(refinedPerMsg): + for seg in msg: + sid,cnt = valIdx[seg.bytes] + vectors[mid,sid] = cnt + + cosim = cosine_similarity(vectors) + + # get packetcount lowest mean cosine similarities from matrix -> msg-indices. + filteredMsgs = [MessageValueCommonality(meanDiss, msg) for msg, meanDiss in + sorted(((msgSegs[0].message, meanSimi) for msgSegs, meanSimi in zip(refinedPerMsg, cosim.mean(axis=0))), + key=lambda x: x[1])[-packetcount:]] + + return filteredMsgs + +# noinspection PyShadowingNames +def valueCommonalityFilter(refinedPerMsg: List[List[MessageSegment]], packetcount: int): + """ + Filter for the messages with the least common segments on average. The average is calculated by the median since it + is sensitive to the extremes of singular segment values. + """ + print("Count Segment values...") + segments = chain.from_iterable(refinedPerMsg) + valCounter = Counter(s.bytes for s in segments) + + # # number of messages supporting the 100 most common segment values + # msgCounter = dict() + # for b, c in valCounter.most_common(100): + # for msg in refinedPerMsg: + # if b in (r.bytes for r in msg): + # if b not in msgCounter: + # msgCounter[b] = 0 + # msgCounter[b] += 1 + + # # # # # # # # # # # # # # # # # # # # # # # # # # # # + # sorted mean commonality of each messages' segments + print("Determine messages' commonalities...") + valueCommonalityPerMsg = list() # type: List[MessageValueCommonality] + for msg in refinedPerMsg: + if len(msg) < 1: + print("Message ignored, since empty?!") # TODO investigate cause of error + continue + valCom = float(numpy.median([valCounter[seg.bytes] for seg in msg])) + mvc = MessageValueCommonality(valCom, msg[0].message) + bisect.insort(valueCommonalityPerMsg, mvc) + + # Deduplicate + uniqueMsgs = OrderedDict() + for valCom in valueCommonalityPerMsg: + if valCom.message.data in uniqueMsgs: + continue # skip the existing packet + uniqueMsgs[valCom.message.data] = valCom + + # the selected messages of most "uncommon messages" + filteredMsgs = list(uniqueMsgs.values())[:packetcount] + # filteredMsgs = list(uniqueMsgs.values())[-packetcount:] # TODO + return filteredMsgs + + +filterOptions = ["candis", "coscom", "valcom"] + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description= + 'Filter a PCAP for the subset of packets that have the maximum difference to all ' + 'and write these packets to a new file with name: ' + '[pcapfilename]_maxdiff-[packetcount].pcap') + parser.add_argument('pcapfilename', help='pcapfilename') + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. 
Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true', + help="Interpret --layer relative to the IP layer.") + parser.add_argument('-f', '--filter', choices=filterOptions, default=filterOptions[-1], + help="Filter to apply for optimizing the trace.") + parser.add_argument('-p', '--packetcount', nargs='?', type= int, + help='packet count (default: {:d})'.format(PACKET_LIMIT), default=PACKET_LIMIT) + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + args = parser.parse_args() + + pcapfilename = args.pcapfilename + packetcount = args.packetcount + + if not isfile(pcapfilename): + print('File not found: ' + pcapfilename) + exit(1) + + infile,ext = splitext(pcapfilename) + outfile = infile + "_maxdiff-" + args.filter + "-{:d}".format(packetcount) + ext # TODO _maxdiff _mindiff + if exists(outfile): + print('Output file exists: ' + outfile) + exit(1) + + print("Loading", pcapfilename) + # get segments from messages and their common values + specimens = SpecimenLoader(pcapfilename, args.layer, args.relativeToIP) + if args.filter in filterOptions[1:]: # only the second two filters need segments. + segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma) + refinedPerMsg = baseRefinements(segmentsPerMsg) + + print("Filter messages...") + filterduration = time.time() + if args.filter == filterOptions[0]: + filteredMsgs = canberraDissimFilter(specimens, packetcount) + elif args.filter == filterOptions[1]: + filteredMsgs = cosineCommonalityFilter(refinedPerMsg, packetcount) + elif args.filter == filterOptions[2]: + filteredMsgs = valueCommonalityFilter(refinedPerMsg, packetcount) + else: + raise RuntimeError(f"Unknown filter {args.filter} selected.") + filterduration = time.time() - filterduration + print(f"Filtered in {filterduration:.2f} s") + filteredSpecimens = BaseLoader( # also used for resolving messages from l4 to raw + (fm.message for fm in filteredMsgs), (specimens.messagePool[fm.message] for fm in filteredMsgs), + baselayer=specimens.getBaseLayerOfPCAP() + ) + + # + # # # # # # # # # # # # # # # # # # # # # # # # # # # # + # Statistics about selected message types + try: + print("Get groundtruth from tshark...") + comparator = MessageComparator(specimens, args.layer, args.relativeToIP) + + print("\nMessage types found in trace vs. 
filter result:") + originalMsgtypes = Counter( + (pm.protocolname, pm.messagetype) for pm in [pm for pm in comparator.parsedMessages.values()]) + filteredMsgtypes = Counter( + (comparator.parsedMessages[fm].protocolname, comparator.parsedMessages[fm].messagetype) + for fm in filteredSpecimens.messagePool.values()) + stats = [(*pm,c,filteredMsgtypes[pm]) for pm,c in originalMsgtypes.most_common()] + headers = ["Protocol", "Message Type", "Original Count", "Filtered Count"] + # write print and write statistic into csv + print(tabulate(stats, headers=headers) + "\n") + # outbase, ext = splitext(outfile) + csvfile = join(reportFolder, "prep_filter-maxdiff_" + args.filter + ".csv") + writeHead = True + if exists(csvfile): + print('CSV file exists: ' + csvfile, "\nAppending data.") + writeHead = False + with open(csvfile, "a") as cf: + cw = csv.writer(cf) + if writeHead: + cw.writerow(headers) + cw.writerows(stats) + # + # msgRlookup = {v:k for k,v in specimens.messagePool.items()} + # for pmsg in pms: + # filteredComparator.pprint2Interleaved(msgRlookup[pmsg.message]) # RawMessage -> L4Message + # print("\n") + except NotImplementedError as e: + print("Groundtruth not available for unknown protocol, comparison aborted.\n" + "Original exception was: ", e) + # # # # # # # # # # # # # # # # # # # # # # # # # # # # + # + + + + # write back the packets + print("Re-read trace with scapy...") + packetList = sy.rdpcap(pcapfilename) + # # The order of packets is not constent for netzob's PCAPImporter and scapy's rdpcap, despite the following is true + # sorted([a.date for a in specimens.messagePool.values()]) == sorted([a.time for a in packetList]) + packetMap = {bytes(packet): packet for packet in packetList} + filteredPackets = [packetMap[rawmsg.data] for rawmsg in filteredSpecimens.messagePool.values()] + sortedPackets = sorted(filteredPackets, key=lambda x: x.time) + print("Write filtered trace to", outfile) + sy.wrpcap(outfile, sortedPackets, linktype=specimens.getBaseLayerOfPCAP()) + + if args.interactive: + # globals().update(locals()) + IPython.embed() diff --git a/src/transform_cluster-statistics.py b/src/transform_cluster-statistics.py new file mode 100644 index 00000000..f34b5c8b --- /dev/null +++ b/src/transform_cluster-statistics.py @@ -0,0 +1,88 @@ +""" +Evaluation of field type clustering quality: + +Transform the output of the clustering process performed by characterize_fieldtypes.py +into a table of cluster quality scores. It expects the input to be named segment-cluster-statistics.csv +as defined in utils.evaluationHelpers.scStatsFile +and outputs scoreTable.csv +""" + +import csv, os +from tabulate import tabulate +from typing import Dict, List + +from nemere.utils.evaluationHelpers import reportFolder + +cols = [ + # 0 1 2 3 4 5 6 7 + 'run_title', 'trace', 'conciseness', 'cluster_label', 'most_freq_type', 'precision', 'recall', 'cluster_size' +] + +def typedrecallsums(clusterlist: List) -> Dict[str, float]: + """ + TODO set typedrecallsum to explicit "0.0" for types that are present in trace but not the majority of any cluster. 
+ + :param clusterlist: + :return: + """ + # recall for clusters and their most frequent type + typedrecall = [(e[cols[4]].split(':')[0] if e[cols[3]] != "NOISE" else "NOISE", float(e[cols[6]])) + for e in clusterlist] + + # recall sums per group of field type in all clusters + trsums = dict() + for t, r in typedrecall: + if t not in trsums: + trsums[t] = 0 + trsums[t] += r + return trsums + + + +if __name__ == '__main__': + scStatsFile = os.path.join(reportFolder, 'segment-cluster-statistics.csv') + cstat = dict() + with open(scStatsFile, 'r') as csvfile: + cstatr = csv.DictReader(csvfile) + + for colheader in cols: + if colheader not in cstatr.fieldnames: + print("incompatible csv format!", colheader, "missing.") + print(cstatr.fieldnames) + exit(1) + + for row in cstatr: + analysis = (row[cols[0]], row[cols[1]]) + if analysis not in cstat: + cstat[analysis] = list() + cstat[analysis].append(row) + + # min precision per run - cols[5]: precision, cols[3]: cluster_label + mppr = {k: min([float(e[cols[5]]) for e in v if e[cols[3]] != "NOISE"]) for k, v in cstat.items() + if len([c[cols[5]] for c in v if c[cols[3]] != "NOISE"]) > 0} + smppr = sorted(list(mppr.items()), key=lambda x: x[1]) + print(tabulate(smppr, headers=['Analysis', 'Min precision per run'], tablefmt="pipe")) + + # # recall sums per most frequent type of all clusters in the run with the least type-mixed cluster results + # leastmixedrecallsums = typedrecallsums(cstat[smppr[-1][0]]) + + # scores per run (sorted by min precision per run) + mcstat = list() + ftypes = set() + for k in [l[0] for l in smppr]: + trs = typedrecallsums(cstat[k]) + mcstat.append({'analysis': k[0], 'atrace': k[1], 'mppr': mppr[k], **trs}) + ftypes.update(trs.keys()) + + # make score table from list of dicts + # scoreheaders = sorted({t for e in mcstat for t in list(e.keys())}, key=lambda x: 'az_' if x in ('mppr', 'NOISE') else x) + scoreheaders = ['analysis', 'atrace', 'mppr'] + sorted(ftypes, key=lambda x: 'a_' + x if x in ('NOISE', '[unknown]') else x) + scoretable = [ [ e[h] if h in e else None for h in scoreheaders ] for e in mcstat] + print(tabulate(scoretable, scoreheaders, tablefmt="pipe")) + + with open(os.path.join(reportFolder, 'scoreTable.csv'), 'w') as scorefile: + sfw = csv.writer(scorefile) + sfw.writerow(scoreheaders) + for line in scoretable: + sfw.writerow(line) + diff --git a/src/visualize_fieldtype_separation.py b/src/visualize_fieldtype_separation.py new file mode 100644 index 00000000..2c61b285 --- /dev/null +++ b/src/visualize_fieldtype_separation.py @@ -0,0 +1,106 @@ +""" +Write a topology plot to visualize where centers of true field types are and an type-separation histogram per true field type. +Helps in determining which field types may be distinct enough to be later recognized from a template generated from this ground truth. +Use groundtruth about field segmentation by dissectors and determine the medoid of all segments of one data type. + +Takes a PCAP trace of a known protocol, dissects each message into their fields, and yields segments from each of them. +These segments get analyzed by the "value" analysis method which is used as feature to determine their similarity. +Real field types are separated using ground truth and the quality of this separation is visualized. 
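
The medoid of a type group is, in the usual sense, the segment with the smallest summed
dissimilarity to all other segments of that group; a minimal sketch over a toy square
dissimilarity matrix (assuming numpy):

    import numpy
    dists = numpy.array([[0.0, 0.2, 0.4],
                         [0.2, 0.0, 0.3],
                         [0.4, 0.3, 0.0]])
    medoidIndex = int(dists.sum(axis=0).argmin())   # -> 1: the segment closest to all others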
+""" + +import argparse +from itertools import chain +from os.path import isfile + +from nemere.inference.analyzers import * +from nemere.inference.segmentHandler import segments2types +from nemere.inference.templates import Template, TemplateGenerator, MemmapDC +from nemere.utils.evaluationHelpers import annotateFieldTypes +from nemere.utils.loader import SpecimenLoader +from nemere.validation.dissectorMatcher import MessageComparator +from nemere.visualization.distancesPlotter import DistancesPlotter +from nemere.visualization.multiPlotter import MultiMessagePlotter + +debug = False + +# fix the analysis method to VALUE +analysisTitle = 'value' +analyzerType = Value +analysisArgs = None +# fix the distance method to canberra +distance_method = 'canberra' + + + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Analyze fields as segments of messages and plot field type identification quality.') + parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') + parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and ' + 'open ipython prompt after finishing the analysis.', + action="store_true") + parser.add_argument('-l', '--layer', type=int, default=2, + help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' + '(typically the payload of a transport protocol).') + parser.add_argument('-r', '--relativeToIP', default=False, action='store_true') + args = parser.parse_args() + if not isfile(args.pcapfilename): + print('File not found: ' + args.pcapfilename) + exit(1) + + # dissect and label messages + print("Load messages...") + specimens = SpecimenLoader(args.pcapfilename, layer=args.layer, relativeToIP=args.relativeToIP) + comparator = MessageComparator(specimens, layer=args.layer, relativeToIP=args.relativeToIP, debug=debug) + + # segment messages according to true fields from the labels + print("Segmenting messages...") + segmentedMessages = annotateFieldTypes(analyzerType, analysisArgs, comparator) + # # filter segments + # filteredSegments = filterSegments(chain.from_iterable(segmentedMessages)) # type: List[TypedSegment] + # # all segments + filteredSegments = list(chain.from_iterable(segmentedMessages)) + + print("Calculating dissimilarities...") + # dc = DistanceCalculator(filteredSegments) + # dc = DelegatingDC(filteredSegments) + dc = MemmapDC(filteredSegments) + + print("Generate type groups and templates...") + typegroups = segments2types(filteredSegments) + typelabels = list(typegroups.keys()) + templates = TemplateGenerator.generateTemplatesForClusters(dc, [typegroups[ft] for ft in typelabels]) + + # labels of templates + labels = ['Noise'] * len(dc.segments) # TODO check: list of segment indices (from raw segment list) per message + # ^ here the question is, whether we like to print resolved segements or representatives + for l, t in zip(typelabels, templates): + labels[dc.segments2index([t.medoid])[0]] = l + + print("Plot dissimilarities...") + sdp = DistancesPlotter(specimens, 'distances-templatecenters', args.interactive) + sdp.plotSegmentDistances(dc, numpy.array(labels)) + sdp.writeOrShowFigure() + + print("Plot histograms...") + # import matplotlib.pyplot as plt + mmp = MultiMessagePlotter(specimens, 'histo-templatecenters', len(templates)) + for figIdx, (typlabl, typlate) in enumerate(zip(typelabels, templates)): + # h_match histogram of distances to medoid for segments of typlate's type + match = [di for di, of in typlate.distancesToMixedLength(dc)] + 
# abuse template to get distances to non-matching field types + filtermismatch = [typegroups[tl] for tl in typelabels if tl != typlabl] + mismatchtemplate = Template(typlate.medoid, list(chain.from_iterable(filtermismatch))) + # h_mimatch histogram of distances to medoid for segments that are not of typlate's type + mismatch = [di for di, of in mismatchtemplate.distancesToMixedLength(dc)] + # plot both histograms h overlapping (i.e. for each "bin" have two bars). + # the bins denote ranges of distances from the medoid + mmp.histoToSubfig(figIdx, [match, mismatch], bins=numpy.linspace(0, 1, 20), label=[typlabl, 'not ' + typlabl]) + # plot in subfigures on one page + mmp.writeOrShowFigure() + + + if args.interactive: + IPython.embed() diff --git a/tests/messageparsing.py b/tests/messageparsing.py index e85e45a0..d5399886 100644 --- a/tests/messageparsing.py +++ b/tests/messageparsing.py @@ -12,62 +12,62 @@ TESTDNS = "../input/deduped-orig/dns_ictf2010_deduped-100.pcap" HUNDRED_COOKIES = "['63825363']\n"*99 + "['63825363']" -HUNDRED_REQUEST_LIST_ITEMS = """['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '2c', '2e', '2f', '06'] -['01', '0f', '03', '2c', '2e', '2f', '06'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 
'fc'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '2b', '4d'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '79', 'f9', '2b', 'fc'] -['01', '0f', '03', '2c', '2e', '2f', '06'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', 'f9', '2b', 'fc'] -['01', '0f', '03', '06', '2c', '2e', '2f', '1f', '21', '2b'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a'] -['01', '1c', '02', '03', '0f', '06', '77', '0c', '2c', '2f', '1a', '79', '2a']""" +HUNDRED_REQUEST_LIST_ITEMS = """['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f2179f92b'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92bfc'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92b'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92bfc'] +['010f032c2e2f06'] +['010f032c2e2f06'] +['010f03062c2e2f1f2179f92bfc'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f2179f92bfc'] +['010f03062c2e2f1f2179f92b'] +['010f03062c2e2f1f21f92bfc'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92bfc'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a'] +['010f03062c2e2f1f2179f92b'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f212b4d'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f2179f92b'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f21f92b'] +['010f03062c2e2f1f2179f92bfc'] +['010f032c2e2f06'] +['010f03062c2e2f1f21f92bfc'] +['010f03062c2e2f1f212b'] +['011c02030f06770c2c2f1a792a'] +['011c02030f06770c2c2f1a792a']""" HUNDRED_MESSAGE_TYPES = """Request ACK Request @@ -220,7 +220,7 @@ def test_getValuesByName(self): """Test retrieving field values by name.""" with captured_output() as (out, err): for pms in self.dhcpPms.values(): - elements = pms.getValuesByName("dhcp.option.request_list_item") # cookie 
+ elements = pms.getValuesByName("dhcp.option.request_list") # cookie if not isinstance(elements,bool) and elements: print(elements) output = out.getvalue().strip() diff --git a/tests/netzob-support.py b/tests/netzob-support.py index c491d467..d445dbdc 100644 --- a/tests/netzob-support.py +++ b/tests/netzob-support.py @@ -7,9 +7,7 @@ """ import logging import unittest, itertools -from os import path -import netzob from netzob.Import.PCAPImporter.PCAPImporter import PCAPImporter from netzob.Model.Vocabulary.Field import Field from netzob.Model.Vocabulary.Symbol import Symbol @@ -120,4 +118,4 @@ def test_layer4plus(self): if "__main__" == __name__: - unittest.main() \ No newline at end of file + unittest.main()