#!/usr/bin/env python3
# coding=UTF-8
#
# Cross Drive Analysis tool for bulk extractor.
#
# Features of this program:
# --netmap -- makes a map of which computers exchanged packets (ethernet maps)
# --makestop -- Creates a stoplist of features that are on more than a fraction of the drives
# --threshold -- sets the fraction of drives necessary for a feature to be ignored
# --idfeatures -- specifies which feature files are used for identity operations
#
# reads multiple bulk_extractor histogram files and outputs:
# stoplist.txt - list of email addresses on more than 1/3 of the disks
# targets.txt - list of email addresses not on stoplist and the # of drives on which they appear.
#
# Version 1.3 - Complete rewrite; eliminates driveids and featureids, since strings
# in Python are hashable (and so can stand in for integer IDs). Also uses bulk_extractor_reader
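#
# Example invocations (the report paths below are illustrative):
#   python3 cda_tool.py --idcor report1/ report2/ report3.zip
#   python3 cda_tool.py --makestop stoplist.txt --threshold 0.5 reports/*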
__version__ = "1.4.0"
import os.path
import os
import sys
import collections
import argparse
import glob
from bitcurator_python_tools import bulk_extractor_reader
class Correlator:
"""The main correlator class.
Correlates features on different disks.
Python does not automatically uniquify all strings, so we do.
@property features - a dictionary of all features found. Each value is a dictionary of drives and counts.
"""
def __init__(self, name):
self.name = name # what we are correlating
self.drives = set() # the drives that we have seen
        self.features = collections.defaultdict(
            dict
        )  # maps each feature to a dict of {drivename: count}
def longest_drive_name(self):
return max(len(s) for s in self.drives)
def longest_feature_name(self):
return max(len(s) for s in self.features.keys())
def ingest_feature_file(self, f, context_stop_list):
"""Read the lines in a feature file; returns how many lines were procesed"""
drivename = None
count = 0
for line in f:
if isinstance(line, bytes):
line = line.decode("utf-8")
m = bulk_extractor_reader.get_property_line(line)
if m:
if m[0] == "Filename":
drivename = m[1]
self.drives.add(drivename)
print(f"Scanning {drivename} for {self.name}")
if bulk_extractor_reader.is_comment_line(line):
continue
count += 1
if context_stop_list is not None:
                (_, feature, context) = line.rstrip("\n").split("\t")
context_stop_list.add((feature, context))
continue
feature = line.split("\t")[1]
featuredict = self.features[feature]
featuredict[drivename] = featuredict.get(drivename, 0) + 1
print(f" processed {count} features")
return count
def ingest_histogram_file(self, f):
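        # Histogram lines look like "n=COUNT\tfeature"; the "[2:]" below strips the "n=" prefix.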
drivename = None
for line in f:
if isinstance(line, bytes):
line = line.decode("utf-8")
m = bulk_extractor_reader.get_property_line(line)
if m:
if m[0] == "Filename":
drivename = m[1]
self.drives.add(drivename)
print(f"Scanning {drivename} for {self.name}")
continue
if bulk_extractor_reader.is_comment_line(line):
continue
fields = line.split("\t")
count = int(fields[0][2:])
feature = fields[1].strip()
featuredict = self.features[feature]
featuredict[drivename] = featuredict.get(drivename, 0) + count
def dump_stats(self, f):
f.write(f"Total Drives: {len(self.drives)}\n")
f.write(f"Distinct {self.name} features: {len(self.features)}\n")
        lfn = self.longest_feature_name()
f.write(f"{'Feature':{lfn}} Count Drives\n")
def keysortfun(k):
return (-len(self.features[k]), k)
for d in sorted(self.features.keys(), key=keysortfun):
f.write(f"{d:{lfn}} {len(self.features[d])} {self.features[d]}\n")
def main():
parser = argparse.ArgumentParser(
description="Cross Drive Analysis with bulk_extractor output"
)
parser.add_argument(
"--netmap",
help="General GraphViz data for network correlation map",
action="store_true",
)
parser.add_argument(
"--idcor", help="Perform identity-based correlation", action="store_true"
)
parser.add_argument(
"--makestop",
help="Make a stop list of identity features on more than THRESHOLD (0..1) drives",
type=str,
)
parser.add_argument(
"--threshold",
help="Specify the faction of drives for the threshold",
type=float,
default=0.667,
)
parser.add_argument(
"--makecombined",
help="Combine multiple feature files into a single context stop list with no offests",
action="store_true",
)
parser.add_argument(
"--idfeatures",
help="Specifies feature files used for identity operations",
type=str,
default="email,ccn,telephone",
)
parser.add_argument("--dump", help="Dump the CDA database", action="store_true")
parser.add_argument(
"reports",
type=str,
nargs="+",
help="bulk_extractor report directories or ZIP files",
)
parser.add_argument("-v", "--version", action='version', version=f"%(prog)s {__version__}")
args = parser.parse_args()
if args.makestop:
if os.path.exists(args.makestop):
raise IOError(f"{args.makestop}: file exists")
if args.threshold < 0 or args.threshold > 1:
        raise RuntimeError(
            f"threshold should be between 0 and 1; you supplied {args.threshold}"
        )
# Create the correlators, one for each feature file
correlators = set()
for name in args.idfeatures.split(","):
correlators.add(Correlator(name))
# Create the br readers, one for each report
br_readers = set()
for fname in args.reports:
# On windows the '*' may not be expanded....
if "*" in fname:
fns = glob.glob(fname)
else:
fns = [fname]
for fn in fns:
try:
br_readers.add(bulk_extractor_reader.BulkReport(fn))
except IOError:
print(
f"{fn} is an invalid bulk_extractor report. Cannot continue. STOP.\n"
)
sys.exit(1)
# Now read each feature file from each reader
# Either ingest (in the case of cda) or create the context stop list (if making combined)
for c in correlators:
context_stop_list = set()
for br in br_readers:
b = br.open(f"{c.name}.txt", mode="r")
if args.makecombined:
count = c.ingest_feature_file(b, context_stop_list)
else:
count = c.ingest_feature_file(b, None)
if args.makecombined:
fn = f"combined-{c.name}.txt"
with open(fn, mode="w") as f:
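                # Each stop-list line carries an empty offset field: "\tfeature\tcontext"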
for feature, context in context_stop_list:
f.write("".join(["", "\t", feature, "\t", context, "\n"]))
print(f"Created {fn} with {len(context_stop_list)} lines\n")
print("DONE")
sys.exit(0)
if args.dump:
for c in correlators:
c.dump_stats(sys.stdout)
# Does the user want to make a stoplist?
if args.makestop:
stoplist = set()
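        # A feature is stop-listed when it appears on at least threshold * len(reports) drives.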
        drive_threshold = int(len(args.reports) * args.threshold)
drives_per_feature = collections.defaultdict(int)
for c in correlators:
for feature, drives in c.features.items():
drivecount = len(drives)
drives_per_feature[drivecount] += 1
if drivecount >= drive_threshold:
stoplist.add(feature)
with open(args.makestop, "w") as f:
for feature in sorted(stoplist):
f.write(f"{feature}\n")
print(f"Stoplist {args.makestop} created with {len(stoplist)} features")
print(" DPF Feature Count")
for i in sorted(drives_per_feature.keys()):
print(f"{i:6} {drives_per_feature[i]:8}")
print("--------------------")
print("DPF = Drives per Feature")
print(f"Only features on {drive_threshold} or more drives were written.")
    # Perhaps the user wants to perform identity-based correlation?
    # This computes an affinity score for each pair of drives.
if args.idcor:
print("Identity-based correlation: computes drive affinity using TF-IDF")
# First get a list of all the drives
drives_all = set()
for c in correlators:
drives_all = drives_all.union(c.drives)
# Now compute the affinity between all the drives
# ((driveA,driveB),score) added
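        # The score for a pair is the sum, over features shared by both drives,
        # of 1/(number of drives containing the feature), so rarer shared features weigh more.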
scores = []
for driveA in drives_all:
for driveB in drives_all:
                if driveA >= driveB:
                    continue  # skip self-pairs and visit each unordered pair only once
score = 0.0
                factors = []  # track the most important shared features for this pair
for c in correlators:
for feature, drives in c.features.items():
if driveA in drives and driveB in drives:
factor = 1.0 / len(drives)
score += factor
factors.append((factor, feature))
                factors.sort(key=lambda a: -a[0])
scores.append(((driveA, driveB), score, factors[0:5]))
scores.sort(key=lambda a: -a[1])
for (a, b), score, factors in scores:
if score == 0:
continue
print(f"Drive A: {a}")
print(f"Drive B: {b}")
print(f"Score : {score}")
print("Top factors:")
            for factor, feature in factors:
                print(f" {factor:1.4} {feature}")
print("")
# A network map would show all of the MAC addresses and all of the packets that were carved,
# restricted to IP addresses. (--netmap is parsed above but not yet implemented here.)
if __name__ == "__main__":
main()