1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 import hashlib
20
21 from error import error, warning, debug, set_debug, get_debug
22 from similarity import *
23 from analysis import *
24
25 import bytecode
26
27
28
31
34
# Constructor body of the per-method checksum object; its `def` line is not
# part of this listing.  It reads `m1` (the method) and `sim` (an object
# exposing entropy()) and fills self.buff / self.entropy.
35 self.buff = ""
36 self.entropy = 0.0
37
# A method without code keeps the empty signature and an entropy of 0.0.
38 code = m1.get_code()
39 if code != None :
40
41 bc = code.get_bc()
42
43 buff = ""
44
# Signature = every instruction name; for instructions whose type_ins_tag
# is 0, operands whose first element contains "#" are appended as well.
45 for i in bc.get() :
46 buff += "%s" % i.get_name()
47 if i.type_ins_tag == 0 :
48 for op in i.get_operands() :
49 if "#" in op[0] :
50 buff += "%s" % op
51
52 self.buff = buff
53 self.entropy = sim.entropy( self.buff )
54
57
60
63
# Body of a method-similarity callback; the `def` line is not visible.
# NOTE(review): presumably invoked as func_sim(m1, m2, sim, name_attribute),
# judging by the func_sim call sites in this file -- confirm.
# Looks up the "checksum_<name_attribute>" object on both methods.
65 a1 = getattr( m1, "checksum_" + name_attribute )
66 a2 = getattr( m2, "checksum_" + name_attribute )
67
68 e1 = a1.get_entropy()
69 e2 = a2.get_entropy()
70
# Absolute difference between the two entropies (0 when they are equal).
71 return (max(e1, e2) - min(e1, e2))
72
85
94
98
# Constructor body of the per-basic-block checksum object; the `def` line is
# not visible.  Reads `basic_block` and fills self.buff / self.hash.
101 self.basic_block = basic_block
102 self.buff = ""
# The signature is the concatenation of the instruction names only
# (operands are not included here).
103 for i in self.basic_block.ins :
104 self.buff += i.get_name()
105
106
# SHA-256 hex digest of the signature.
107 self.hash = hashlib.sha256( self.buff ).hexdigest()
108
111
114
117
def LCS(X, Y):
    """Build the longest-common-subsequence length table for X and Y.

    NOTE(review): the ``def`` line was missing from this listing; the
    signature is rebuilt from the call ``C = LCS( X, Y )`` -- confirm.

    X, Y -- indexable sequences whose elements are compared with ``==``

    Returns a (len(X) + 1) x (len(Y) + 1) list of lists ``C`` where
    ``C[i][j]`` is the length of the longest common subsequence of
    ``X[:i]`` and ``Y[:j]``.  Row 0 and column 0 are all zeros.
    """
    m = len(X)
    n = len(Y)

    C = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                # Matching elements extend the subsequence of both prefixes.
                C[i][j] = C[i - 1][j - 1] + 1
            else:
                # Otherwise keep the better result obtained by dropping one
                # element from either side.
                C[i][j] = max(C[i][j - 1], C[i - 1][j])
    return C
130
def getDiff(C, X, Y, i, j, a, r):
    """Backtrack through an LCS table and record how X turns into Y.

    NOTE(review): the ``def`` line was missing from this listing; the
    signature is rebuilt from the call ``getDiff(C, X, Y, m, n, a, r)`` and
    from the recursive calls of the original body -- confirm.

    C    -- table returned by LCS(X, Y)
    X, Y -- the two compared sequences of single characters
    i, j -- prefix lengths to diff (len(X) and len(Y) for a full diff)
    a    -- output list; receives (index_in_Y, element) for every addition
    r    -- output list; receives (index_in_X, element) for every removal

    Nothing is returned.  ``a`` and ``r`` are extended in front-to-back
    order and every step is reported through debug().
    """
    # The table is walked from (i, j) back to (0, 0).  The walk is recorded
    # and replayed in reverse so results come out front-to-back without one
    # recursive call per element, which would exceed the interpreter's
    # recursion limit on long sequences.
    steps = []
    while i > 0 or j > 0:
        if i > 0 and j > 0 and X[i - 1] == Y[j - 1]:
            i -= 1
            j -= 1
            steps.append(("=", i))
        elif j > 0 and (i == 0 or C[i][j - 1] >= C[i - 1][j]):
            j -= 1
            steps.append(("+", j))
        else:
            # Remaining case: i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]).
            i -= 1
            steps.append(("-", i))

    for kind, idx in reversed(steps):
        if kind == "=":
            debug(" " + "%02X" % ord(X[idx]))
        elif kind == "+":
            a.append((idx, Y[idx]))
            debug(" + " + "%02X" % ord(Y[idx]))
        else:
            r.append((idx, X[idx]))
            debug(" - " + "%02X" % ord(X[idx]))
144
def toString(bb, hS, rS):
    """Encode a basic block as a string with one character per instruction.

    NOTE(review): the ``def`` line was missing from this listing; the
    signature is rebuilt from the call ``toString( dbb.bb1, hS, rS )`` --
    confirm.

    bb -- object whose ``ins`` attribute lists the instructions
    hS -- dict mapping an instruction identifier to a small integer; new
          identifiers are added in place, numbered in order of appearance
    rS -- dict mapping the encoded character back to the identifier;
          updated in place

    Passing the same ``hS``/``rS`` for two blocks gives equal instructions
    the same character, so the resulting strings can be compared directly.

    Returns the encoded string.
    """
    encoded = []
    for ins in bb.ins:
        ident = ins.get_name()
        # Only instructions tagged 0 contribute operands, and only those
        # operands whose first element contains "#".
        if ins.type_ins_tag == 0:
            for op in ins.get_operands():
                if "#" in op[0]:
                    ident += "%s" % op

        if ident not in hS:
            hS[ident] = len(hS)
            # NOTE(review): on Python 2 chr() only accepts 0..255, so more
            # than 256 distinct identifiers would raise ValueError here.
            rS[chr(hS[ident])] = ident

        encoded.append(chr(hS[ident]))

    return "".join(encoded)
163
# Values stored in an instruction's "diff_tag" attribute when instruction
# listings are merged: untouched, added or removed.
DIFF_INS_TAG = {
    "ORIG" : 0,
    "ADD" : 1,
    "REMOVE" : 2
}
169
# Constructor body of the "differing basic block" wrapper; the `def` line is
# not visible.  Stores the two paired blocks (bb1, bb2) and `info`.
172 self.bb1 = bb1
173 self.bb2 = bb2
174 self.info = info
175
# Position and name are taken from bb1.
176 self.start = self.bb1.start
177 self.end = self.bb1.end
178 self.name = self.bb1.name
179
# Start empty: no instruction diff attached and no merged listing yet.
180 self.di = None
181 self.ins = []
182
# Body of the instruction-merge step; the `def` line is not visible (a call
# `dbb.diff_ins( DiffINS(...) )` appears elsewhere in this file).
# Builds self.ins: bb1's instructions interleaved with the added/removed
# ones, each tagged through a "diff_tag" attribute.
184 self.di = di
185
# Index the add/remove records by their first field (instruction position).
186 off_add = {}
187 off_rm = {}
188 for i in self.di.add_ins :
189 off_add[ i[0] ] = i
190
191 for i in self.di.remove_ins :
192 off_rm[ i[0] ] = i
193
# Walk bb1 by position: emit a pending ADD record first, then either the
# REMOVE record for that position or the untouched original instruction.
194 nb = 0
195 for i in self.bb1.ins :
196 ok = False
197 if nb in off_add :
198 debug("%d ADD %s %s" % (nb, off_add[ nb ][2].get_name(), off_add[ nb ][2].get_operands()))
199 self.ins.append( off_add[ nb ][2] )
200 setattr( off_add[ nb ][2], "diff_tag", DIFF_INS_TAG["ADD"] )
201 del off_add[ nb ]
202
203 if nb in off_rm :
204 debug("%d RM %s %s" % (nb, off_rm[ nb ][2].get_name(), off_rm[ nb ][2].get_operands()))
205 self.ins.append( off_rm[ nb ][2] )
206 setattr( off_rm[ nb ][2], "diff_tag", DIFF_INS_TAG["REMOVE"] )
207 del off_rm[ nb ]
208 ok = True
209
# `ok` is only set by the REMOVE branch, so an ADD at this position does
# not suppress the original instruction.
210 if ok == False :
211 self.ins.append( i )
212 debug("%d %s %s" % (nb, i.get_name(), i.get_operands()))
213 setattr( i, "diff_tag", DIFF_INS_TAG["ORIG"] )
214
215 nb += 1
216
217
218
# Flush the records whose position lies at or beyond the end of bb1.
219 nbmax = nb
220 if off_add != {} :
221 nbmax = sorted(off_add)[-1]
222 if off_rm != {} :
223 nbmax = max(nbmax, sorted(off_rm)[-1])
224
225 while nb <= nbmax :
226 if nb in off_add :
227 debug("%d ADD %s %s" % (nb, off_add[ nb ][2].get_name(), off_add[ nb ][2].get_operands()))
228 self.ins.append( off_add[ nb ][2] )
229 setattr( off_add[ nb ][2], "diff_tag", DIFF_INS_TAG["ADD"] )
230 del off_add[ nb ]
231
232 if nb in off_rm :
233 debug("%d RM %s %s" % (nb, off_rm[ nb ][2].get_name(), off_rm[ nb ][2].get_operands()))
234 self.ins.append( off_rm[ nb ][2] )
235 setattr( off_rm[ nb ][2], "diff_tag", DIFF_INS_TAG["REMOVE"] )
236 del off_rm[ nb ]
237
238 nb += 1
239
240
241
# Body of the child-rewiring step of a differing block; the `def` line is
# not visible.  `abb` is used as a mapping keyed by basic-block name.
# The last instruction of bb1 receives bb1's children unchanged.
243 setattr( self.bb1.ins[-1], "childs", self.bb1.childs )
244
# Locate bb2's last instruction inside the merged listing and attach bb2's
# children to it; a child whose name is in abb is replaced by abb's entry,
# keeping the first two fields of the child tuple.
245 for i in self.ins :
246 if i == self.bb2.ins[-1] :
247
248 childs = []
249 for c in self.bb2.childs :
250 if c[2].name in abb :
251 debug("SET %s %s" % (c[2], abb[ c[2].name ]))
252 childs.append( (c[0], c[1], abb[ c[2].name ]) )
253 else :
254 debug("SET ORIG %s" % str(c))
255 childs.append( c )
256
257 setattr( i, "childs", childs )
258
# Body of a display routine; the `def` line is not visible.  Prints the
# added and removed instruction records held by self.di, one per line:
# the record's first two fields, then the instruction name and operands.
260 print "\tADD INSTRUCTIONS :"
261 for i in self.di.add_ins :
262 print "\t\t", i[0], i[1], i[2].get_name(), i[2].get_operands()
263
264 print "\tREMOVE INSTRUCTIONS :"
265 for i in self.di.remove_ins :
266 print "\t\t", i[0], i[1], i[2].get_name(), i[2].get_operands()
267
# Constructor body of the "new basic block" wrapper; the `def` line is not
# visible.  Mirrors the wrapped block's position, name and instructions.
270 self.bb = bb
271
272 self.start = self.bb.start
273 self.end = self.bb.end
274 self.name = self.bb.name
# Same list object as the wrapped block's `ins` (no copy is made).
275 self.ins = self.bb.ins
276
# Body of the child-rewiring step of a new block; the `def` line is not
# visible.  `abb` is used as a mapping keyed by basic-block name.
# Each child of the wrapped block whose name is in abb is replaced by abb's
# entry (first two tuple fields kept); other children are kept as they are.
278 childs = []
279 for c in self.bb.childs :
280 if c[2].name in abb :
281 debug("SET %s %s " % (c[2], abb[ c[2].name ]))
282 childs.append( (c[0], c[1], abb[ c[2].name ]) )
283 else :
284 debug("SET ORIG %s" % str(c))
285 childs.append( c )
286
# The result is stored on the wrapper itself, not on an instruction.
287 setattr( self, "childs", childs )
288
def __init__(self, add_ins, remove_ins):
    """Store the two halves of an instruction diff.

    add_ins    -- records of added instructions
    remove_ins -- records of removed instructions

    Elsewhere in this file the records are built as
    ``(index, 0, instruction)`` tuples.
    """
    self.add_ins = add_ins
    self.remove_ins = remove_ins
293
# Body of an instruction-level diff callback; the `def` line is not visible.
# NOTE(review): presumably called as func_diff_ins(dbb, sim, name_attribute)
# -- confirm.  `dbb` is a differing-block pair exposing bb1, bb2 and
# diff_ins().
295 final_add = []
296 final_rm = []
297
# Both blocks are encoded with the same maps so equal instructions get the
# same character.
298 hS = {}
299 rS = {}
300
301 X = toString( dbb.bb1, hS, rS )
302 Y = toString( dbb.bb2, hS, rS )
303
304
305 debug("%s %d" % (repr(X), len(X)))
306 debug("%s %d" % (repr(Y), len(Y)))
307
308 m = len(X)
309 n = len(Y)
310
# LCS table, then backtrack: `a` collects (index, char) entries present only
# in Y (bb2), `r` those present only in X (bb1).
311 C = LCS( X, Y )
312 a = []
313 r = []
314
315 getDiff(C, X, Y, m, n, a, r)
316 debug(a)
317 debug(r)
318
# Turn the index lists into (index, 0, instruction) records.
319 debug("DEBUG ADD")
320 for i in a :
321 debug(" \t %s %s %s" % (i[0], dbb.bb2.ins[ i[0] ].get_name(), dbb.bb2.ins[ i[0] ].get_operands()))
322 final_add.append( (i[0], 0, dbb.bb2.ins[ i[0] ]) )
323
324 debug("DEBUG REMOVE")
325 for i in r :
326 debug(" \t %s %s %s" % (i[0], dbb.bb1.ins[ i[0] ].get_name(), dbb.bb1.ins[ i[0] ].get_operands()))
327 final_rm.append( (i[0], 0, dbb.bb1.ins[ i[0] ]) )
328
# Hand the result to the block pair, which merges it into its listing.
329 dbb.diff_ins( DiffINS( final_add, final_rm ) )
330
331
# Values stored in a basic block's "bb_tag" attribute when the final block
# list of a method is assembled: unchanged, differing or new.
DIFF_BB_TAG = {
    "ORIG" : 0,
    "DIFF" : 1,
    "NEW" : 2
}
337
# Constructor body of the per-method wrapper; the `def` line is not visible.
# Keeps the method `m`, the objects `vm` and `vmx`, and `sim`.
340 self.m = m
341 self.vm = vm
342 self.vmx = vmx
# Result of vmx.get_method(m); elsewhere in this file it is used through
# its `basic_blocks` attribute.
343 self.mx = vmx.get_method( m )
344
345 self.sim = sim
346
347
348
349
350
351
352
353
354
355
# Body that computes and stores the checksums for one filter `name`; the
# `def` line is not visible.  Uses two callbacks: func_meth(method, sim) and
# func_checksum_bb(basic_block).
357 bb = {}
358 bbhash = {}
359
360 fm = func_meth( self.m, self.sim )
361
# One checksum per basic block, keyed by block name, and additionally
# grouped by hash so blocks with an identical hash share one bucket.
362 for i in self.mx.basic_blocks.get() :
363 bb[ i.name ] = func_checksum_bb( i )
364
365 try :
366 bbhash[ bb[ i.name ].get_hash() ].append( bb[ i.name ] )
367 except KeyError :
368 bbhash[ bb[ i.name ].get_hash() ] = []
369 bbhash[ bb[ i.name ].get_hash() ].append( bb[ i.name ] )
370
# Everything is stored as dynamic attributes suffixed with the filter name.
371 setattr(self, "checksum_" + name, fm)
372
373 setattr(self, "bb_" + name, bb)
374 setattr(self, "bb_sha256_" + name, bbhash)
# SHA-256 hex digest of the method checksum's buffer.
375 setattr(self, "sha256_" + name, hashlib.sha256( fm.get_buff() ).hexdigest())
376
# One-line body; the `def` line is not visible.  Calls func_sim with this
# object, new_method, self.sim and name_attribute and returns the result
# directly, without storing it.
378 return func_sim( self, new_method, self.sim, name_attribute )
379
def similarity(self, name_attribute, new_method, func_sim):
    """Compute and remember the similarity score against ``new_method``.

    name_attribute -- filter name; scores live in the dynamic attribute
                      ``hash_<name_attribute>``, created on first use
    new_method     -- the method to compare with; used as dictionary key
    func_sim       -- callback called as
                      ``func_sim(self, new_method, self.sim, name_attribute)``

    Returns nothing; the callback's result is stored in the score dict.
    """
    attr = "hash_" + name_attribute
    try:
        scores = getattr(self, attr)
    except AttributeError:
        scores = {}
        setattr(self, attr, scores)

    scores[new_method] = func_sim(self, new_method, self.sim, name_attribute)
389
def sort(self, name_attribute, func_sort):
    """Rank the stored similarity scores with ``func_sort``.

    name_attribute -- filter name; reads ``hash_<name_attribute>`` and
                      writes ``sort_<name_attribute>``
    func_sort      -- callback receiving the score dict and returning the
                      ranked result

    Returns False when the ranked result equals the empty list, True
    otherwise.  Raises AttributeError if no score was stored beforehand.
    """
    ranked = func_sort(getattr(self, "hash_" + name_attribute))
    setattr(self, "sort_" + name_attribute, ranked)

    if ranked == []:
        return False
    return True
398
def checksort(self, name_attribute, method):
    """Tell whether ``method`` appears in the ranked candidates.

    name_attribute -- filter name; reads ``sort_<name_attribute>``
    method         -- compared with the first field of every ranked entry

    Returns True on the first match, False when there is none (including
    when the ranking is empty).
    """
    ranked = getattr(self, "sort_" + name_attribute)
    return any(method == entry[0] for entry in ranked)
405
# Body of an accessor; the `def` line is not visible.  Returns the first
# field of the first entry of "sort_<name_attribute>", or 1.0 when that
# list is empty.
407 z = getattr( self, "sort_" + name_attribute )
408 if z == [] :
409 return 1.0
410
411 return z[0][0]
412
# Body of an accessor; the `def` line is not visible.  Returns the second
# field of the first entry of "sort_<name_attribute>", or 1.0 when that
# list is empty.
414 z = getattr( self, "sort_" + name_attribute )
415 if z == [] :
416 return 1.0
417
418 return z[0][1]
419
# Compare this method's basic blocks with those of its ranked candidate
# method(s) and store the outcome in "dbb_<name_attribute>" (differing
# blocks) and "nbb_<name_attribute>" (new blocks).
#   name_attribute -- filter name used as suffix of the dynamic attributes
#   func_sim_bb    -- callback(checksum1, checksum2, sim) returning a score;
#                     0.0 is the score given to blocks with equal hashes
#   func_diff_ins  -- callback(diff_block, sim, name_attribute)
# NOTE(review): indentation is lost in this listing, so the exact nesting of
# the ranking section (once per candidate or once per block) cannot be read
# from here -- check against the real file before changing it.
420 - def diff(self, name_attribute, func_sim_bb, func_diff_ins):
421 z = getattr( self, "sort_" + name_attribute )
# No candidate at all: record empty results and stop.
422 if z == [] :
423 setattr(self, "dbb_" + name_attribute, {})
424 setattr(self, "nbb_" + name_attribute, {})
425 return
426
427 bb1 = getattr( self, "bb_" + name_attribute )
428
429
430
# Block checksum of this method -> scores per candidate block name, later
# replaced by a (best candidate checksum, score) tuple.
431 diff_bb = {}
432
433
# Names of candidate blocks considered matched.
434 direct_diff_bb = []
435
436
# Candidate blocks that were never matched (name -> checksum).
437 new_bb = {}
438
439
# Candidate block name -> basic block of this method it was scored against.
440 associated_bb = {}
441
442 for b1 in bb1 :
443 diff_bb[ bb1[ b1 ] ] = {}
444
445 debug("%s 0x%x" % (b1, bb1[ b1 ].basic_block.end))
446 for i in z :
447 bb2 = getattr( i[0], "bb_" + name_attribute )
448 b_z = diff_bb[ bb1[ b1 ] ]
449
450 bb2hash = getattr( i[0], "bb_sha256_" + name_attribute )
451
452
453
# Equal hash: every candidate block sharing it gets the score 0.0.
454 if bb1[ b1 ].get_hash() in bb2hash :
455 for equal_bb in bb2hash[ bb1[ b1 ].get_hash() ] :
456 b_z[ equal_bb.basic_block.name ] = 0.0
457
458
459
# Otherwise score this block against every block of the candidate.
460 else :
461 for b2 in bb2 :
462 b_z[ b2 ] = func_sim_bb( bb1[ b1 ], bb2[ b2 ], self.sim )
463
# Rank by score first, then by name.
464 sorted_bb = sorted(b_z.iteritems(), key=lambda (k,v): (v,k))
465
466 debug("\t\t%s" % sorted_bb[:2])
467
468 for new_diff in sorted_bb :
469 associated_bb[ new_diff[0] ] = bb1[ b1 ].basic_block
470
471 if new_diff[1] == 0.0 :
472 direct_diff_bb.append( new_diff[0] )
473
# NOTE(review): sorted_bb[0] raises IndexError when b_z is empty (e.g. a
# candidate without basic blocks) -- confirm whether that can happen.
# A best score other than 0.0 marks the block as differing; a 0.0 score
# removes it from diff_bb.
474 if sorted_bb[0][1] != 0.0 :
475 diff_bb[ bb1[ b1 ] ] = (bb2[ sorted_bb[0][0] ], sorted_bb[0][1])
476 direct_diff_bb.append( sorted_bb[0][0] )
477 else :
478 del diff_bb[ bb1[ b1 ] ]
479
# Every candidate block that was not matched above is a new block.
480 for i in z :
481 bb2 = getattr( i[0], "bb_" + name_attribute )
482 for b2 in bb2 :
483 if b2 not in direct_diff_bb :
484 new_bb[ b2 ] = bb2[ b2 ]
485
486 dbb = {}
487 nbb = {}
488
# Wrap the results: differing blocks are keyed by this method's block name.
489 for d in diff_bb :
490 dbb[ d.basic_block.name ] = DiffBB( d.basic_block, diff_bb[ d ][0].basic_block, diff_bb[ d ] )
491
492
# New blocks are keyed by the basic-block object itself.
# NOTE(review): `del associated_bb[ n ]` raises KeyError if block n was
# never scored (associated_bb only receives names that went through
# sorted_bb) -- confirm.
493 for n in new_bb :
494 nbb[ new_bb[ n ].basic_block ] = NewBB( new_bb[ n ].basic_block )
495 del associated_bb[ n ]
496
497 setattr(self, "dbb_" + name_attribute, dbb)
498 setattr(self, "nbb_" + name_attribute, nbb)
499
500
# Instruction-level diff for every differing block.
501 for d in dbb :
502 func_diff_ins( dbb[d], self.sim, name_attribute )
503
504
505
# Rewire the children of differing and new blocks through associated_bb.
506 for d in dbb :
507 dbb[ d ].set_childs( associated_bb )
508
509
510 for d in nbb :
511 nbb[ d ].set_childs( associated_bb )
512
513
# Assemble the final ordered block list.
514 self.create_bbs( name_attribute )
515
# Body that assembles the final block list; the `def` line is not visible
# (a call `self.create_bbs( name_attribute )` appears elsewhere in this
# file).  Reads "dbb_<name_attribute>" / "nbb_<name_attribute>" and writes
# "bbs_<name_attribute>".
517 dbb = getattr(self, "dbb_" + name_attribute)
518 nbb = getattr(self, "nbb_" + name_attribute)
519
520
521
522
523
524
525
526 l = []
# Blocks of this method: the original block when its name has no entry in
# dbb, otherwise the differing wrapper; each is tagged through "bb_tag".
527 for bb in self.mx.basic_blocks.get() :
528 if bb.name not in dbb :
529
530 setattr( bb, "bb_tag", DIFF_BB_TAG["ORIG"] )
531 l.append( bb )
532 else :
533
534 setattr( dbb[ bb.name ], "bb_tag", DIFF_BB_TAG["DIFF"] )
535 l.append( dbb[ bb.name ] )
536
# New blocks are appended and tagged NEW.
537 for i in nbb :
538
539 setattr( nbb[ i ], "bb_tag", DIFF_BB_TAG["NEW"] )
540 l.append( nbb[ i ] )
541
542
# The combined list is ordered by each block's `start` attribute.
543 l = sorted(l, key = lambda x : x.start)
544 setattr( self, "bbs_" + name_attribute, l )
545
# One-line body; the `def` line is not visible.  Returns the dynamic
# attribute "sha256_<name_attribute>"; raises AttributeError if it was
# never set.
547 return getattr(self, "sha256_" + name_attribute)
548
# Print a report for this method: the method itself, the ranked candidates
# with their score, then the differing and new basic blocks.
#   name_attribute -- filter name selecting the "sort_", "dbb_", "nbb_" and
#                     "bbs_" attributes to read
#   details        -- when true, also calls show() on each differing block
#                     and passes the "bbs_" list to bytecode.PrettyShow2
549 - def show(self, name_attribute, details=False) :
# The trailing commas keep the output on the same line (Python 2 print).
550 print self.m.get_class_name(), self.m.get_name(), self.m.get_descriptor(),
551 print "with",
552
553 z = getattr( self, "sort_" + name_attribute )
554 for i in z :
555 print i[0].m.get_class_name(), i[0].m.get_name(), i[0].m.get_descriptor(), i[1]
556
557 dbb = getattr(self, "dbb_" + name_attribute)
558 nbb = getattr(self, "nbb_" + name_attribute)
559
560 print "\tDIFF BASIC BLOCKS :"
561 for d in dbb :
# info[1] is the second field of the tuple stored with the differing block.
562 print "\t\t", dbb[d].bb1.name, " --->", dbb[d].bb2.name, ":", dbb[d].info[1]
563 if details :
564 dbb[d].show()
565
566 print "\tNEW BASIC BLOCKS :"
567 for b in nbb :
568 print "\t\t", nbb[b].name
569
570 if details :
571
572 bytecode.PrettyShow2( getattr(self, "bbs_" +name_attribute) )
573
574 - def show2(self, details=False) :
578
# Slot names used as keys inside a filter definition (see the FILTERS_*
# dictionaries): each names one callback or setting of a filter.
FILTER_NAME = "FILTER_NAME"
FILTER_CHECKSUM_METH = "FILTER_CHECKSUM_METH"
FILTER_SIM_METH = "FILTER_SIM_METH"
FILTER_SORT_METH = "FILTER_SORT_METH"
FILTER_SKIP_METH = "FILTER_SKIP_METH"
FILTER_MARK_METH = "FILTER_MARK_METH"
FILTER_MARK_VM = "FILTER_MARK_VM"
FILTER_CHECKSUM_BB = "FILTER_CHECKSUM_BB"
FILTER_SIM_BB = "FILTER_SIM_BB"
FILTER_DIFF_INS = "FILTER_DIFF_INS"
FILTER_CHECKSUM_VM = "FILTER_CHECKSUM_VM"
FILTER_SIM_VM = "FILTER_SIM_VM"
591
# Filter definitions for diffing: each entry maps FILTER_* slots to the
# callbacks used for the method, basic-block and instruction stages.
FILTERS_DIFF = {
    "FILTER_BASIC" : { FILTER_CHECKSUM_METH : filter_checksum_meth_basic,
                       FILTER_SIM_METH : filter_sim_meth_basic,
                       FILTER_SORT_METH : filter_sort_meth_basic,
                       FILTER_SKIP_METH : filter_skip_meth_basic,

                       FILTER_CHECKSUM_BB : filter_checksum_bb_basic,
                       FILTER_SIM_BB : filter_sim_bb_basic,

                       FILTER_DIFF_INS : filter_diff_ins_basic,
                     },
}
604
# Keys of the per-filter state dictionary (self.filters[<filter>][<key>]).
BASE = "base"
METHODS = "methods"
HASHSUM = "hashsum"
DIFFMETHODS = "diffmethods"
NEWMETHODS = "newmethods"
DELETEMETHODS = "deletemethods"
MATCHMETHODS = "matchmethods"
DIFFVMS = "diffvms"
613 -class Diff(object) :
634
635
647
665
689
704
711
713
# Fragment of a method whose `def` line is not visible: fills NEWMETHODS
# for every filter.
# A method j of the second VM is recorded as new when all of these hold:
#   - j is not itself listed in DIFFMETHODS,
#   - j's sha256 for this filter is absent from the first VM's HASHSUM,
#   - no method in DIFFMETHODS has j among its ranked candidates.
714 for fil in self.filters :
715 for j in self.filters[fil][METHODS][self.vm2[0]] :
716
717
718 if j not in self.filters[fil][DIFFMETHODS] :
719
720 if j.getsha256( self.filters[fil][BASE][FILTER_NAME] ) not in self.filters[fil][HASHSUM][self.vm1[0]] :
721 ok = True
722
723 for diff_method in self.filters[fil][DIFFMETHODS] :
724
725 if diff_method.checksort( self.filters[fil][BASE][FILTER_NAME], j ) :
726 ok = False
727 break
728
729 if ok :
730 self.filters[fil][NEWMETHODS].append( j )
731
734
737
740
743
# Fragment of a method whose `def` line is not visible.  Returns a dict
# mapping each filter name to a new list holding the elements of
# self.filters[<filter>][attr].
745 d = {}
746 for fil in self.filters :
747 d[ fil ] = [ x for x in self.filters[fil][attr] ]
748 return d
749
750
751
759
763
766
# Coefficients keyed by VM-level similarity element.
# NOTE(review): no use of this table is visible in this listing --
# presumably weights applied to the values returned by the VM similarity
# filter; confirm.
COEFF_SIM_VM = {
    "STRING" : 1,
    "CONSTANT_FLOAT" : 1,
    "CLINIT" : 1
}
772
# Body of a VM-level similarity callback; the `def` line is not visible.
# Joins the strings returned by vm[0].get_strings() for each side and
# returns {"STRING": sim.ncd(joined1, joined2)}.
774 svm1 = ''.join( vm1.vm[0].get_strings() )
775 svm2 = ''.join( vm2.vm[0].get_strings() )
776
777 return { "STRING" : sim.ncd( svm1, svm2 ) }
778
# Body of a predicate on a method `m`; the `def` line is not visible.
# Returns True when the method has code and code.get_length() is below 100;
# returns False otherwise, including when the method has no code.
780 code = m.get_code()
781 if code != None :
782 if code.get_length() < 100 :
783 return True
784
785 return False
786
# One-line body; the `def` line is not visible.  Returns the values of the
# mapping `values` (a list under Python 2).
788 return values.values()
789
# Body of a score-clamping helper; the `def` line is not visible.
# A value of 0.2 or more is replaced by 1.0; smaller values pass through.
791 if v >= 0.2 :
792 return 1.0
793
794 return v
795
# Filter definitions for similarity scoring: each entry maps FILTER_* slots
# to the callbacks used for the VM, method and basic-block stages.
FILTERS_SIM = {
    "FILTER_SIM" : {
        FILTER_CHECKSUM_VM : filter_checksum_vm_sim,
        FILTER_SIM_VM : filter_sim_vm_sim,
        FILTER_MARK_VM : filter_mark_vm,

        FILTER_CHECKSUM_METH : filter_checksum_meth_basic,
        FILTER_SIM_METH : filter_sim_meth_sim,
        FILTER_SORT_METH : filter_sort_meth_basic,
        FILTER_SKIP_METH : filter_skip_meth_sim,
        FILTER_MARK_METH : filter_mark_meth,

        FILTER_CHECKSUM_BB : filter_checksum_bb_basic,
    },
}
811
812
813
814
815
816
817
818
819
820
821
822
823
826
# Constructor body of the Sim class (the name comes from the super() call);
# the `def` line is not visible.
# self.marks is created before the base constructor runs.
827 self.marks = {}
828 super(Sim, self).__init__(vm1, vm2, F)
829
830 self._init_diff_vms()
831 self._init_mark_methods()
832
# Report: the raw marks, then for each filter the mean of (1.0 - mark)
# expressed as a percentage.
# NOTE(review): the division raises ZeroDivisionError when a filter has an
# empty mark list -- confirm whether that can happen.
833 print self.marks
834 for fil in self.marks :
835 s = 0.0
836 for i in self.marks[fil] :
837 s += (1.0 - i)
838 print "\t", (s/len(self.marks[fil])) * 100
839
845
854
858
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915