Module diff
[hide private]
[frames | no frames]

Source Code for Module diff

  1  # This file is part of Androguard. 
  2  # 
  3  # Copyright (C) 2010, Anthony Desnos <desnos at t0t0.org> 
  4  # All rights reserved. 
  5  # 
  6  # Androguard is free software: you can redistribute it and/or modify 
  7  # it under the terms of the GNU Lesser General Public License as published by 
  8  # the Free Software Foundation, either version 3 of the License, or 
  9  # (at your option) any later version. 
 10  # 
 11  # Androguard is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU Lesser General Public License for more details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License 
 17  # along with Androguard.  If not, see <http://www.gnu.org/licenses/>. 
 18   
 19  import hashlib 
 20   
 21  from error import error, warning, debug, set_debug, get_debug 
 22  from similarity import * 
 23  from analysis import * 
 24   
 25  import bytecode 
 26   
 27  ######################### DIFF ############################### 
 28   
def filter_skip_meth_basic(m):
    """Skip filter of the basic diff: no method is ever skipped.

    m -- the method under consideration (not inspected).

    Always returns False.
    """
    return False
31
class CheckSumMeth:
    """Checksum material of one method: a text buffer and its entropy.

    The buffer is the concatenation, over every instruction returned by
    ``m1.get_code().get_bc().get()``, of the instruction name followed, for
    instructions whose ``type_ins_tag`` is 0, by the ``%s`` rendering of each
    operand whose first element contains ``"#"``.
    (presumably ``"#"`` marks literal operands -- TODO confirm)

    The entropy is ``sim.entropy(buffer)``.  A method without code keeps an
    empty buffer and an entropy of 0.0.
    """

    def __init__(self, m1, sim):
        # FIXME (marker kept from the original source, which gives no detail)
        self.buff = ""
        self.entropy = 0.0

        code = m1.get_code()
        if code is None:
            # Nothing to summarise: keep the empty defaults set above.
            return

        # Collect the pieces and join once instead of growing a string.
        parts = []
        for ins in code.get_bc().get():
            parts.append("%s" % ins.get_name())
            if ins.type_ins_tag == 0:
                for op in ins.get_operands():
                    if "#" in op[0]:
                        parts.append("%s" % op)

        self.buff = "".join(parts)
        self.entropy = sim.entropy(self.buff)

    def get_entropy(self):
        """Return the entropy computed for the buffer (0.0 without code)."""
        return self.entropy

    def get_buff(self):
        """Return the text buffer built from the method's instructions."""
        return self.buff
60
def filter_checksum_meth_basic(m1, sim):
    """Build the method checksum used by the basic filter.

    m1  -- the method to summarise
    sim -- similarity helper, handed to CheckSumMeth for the entropy

    Returns a new CheckSumMeth instance.
    """
    checksum = CheckSumMeth(m1, sim)
    return checksum
63
def filter_sim_meth_old(m1, m2, sim, name_attribute):
    """Distance between two methods as the gap between their entropies.

    m1, m2         -- objects carrying a "checksum_<name_attribute>" attribute
                      whose value offers get_entropy()
    sim            -- unused
    name_attribute -- suffix selecting the checksum attribute

    Returns the larger entropy minus the smaller one.
    """
    attribute = "checksum_" + name_attribute
    e1 = getattr(m1, attribute).get_entropy()
    e2 = getattr(m2, attribute).get_entropy()

    highest = e2 if e2 > e1 else e1
    lowest = e2 if e2 < e1 else e1
    return highest - lowest
72
def filter_sim_meth_basic(m1, m2, sim, name_attribute):
    """Distance between two methods: mean of two NCD values.

    The first NCD compares the SIGNATURE_L0_0 signature strings of both
    methods, the second compares the buffers of their
    "checksum_<name_attribute>" attributes.

    m1, m2         -- objects exposing .vmx, .m and the checksum attribute
    sim            -- similarity helper providing ncd(a, b)
    name_attribute -- suffix selecting the checksum attribute

    Returns (ncd_signatures + ncd_buffers) / 2.0.
    """
    def signature_text(meth):
        # Signature string of the wrapped method, as computed by its analysis
        signature = meth.vmx.get_method_signature(meth.m, predef_sign=SIGNATURE_L0_0)
        return signature.get_string()

    ncd_signatures = sim.ncd(signature_text(m1), signature_text(m2))

    attribute = "checksum_" + name_attribute
    checksum1 = getattr(m1, attribute)
    checksum2 = getattr(m2, attribute)
    ncd_buffers = sim.ncd(checksum1.get_buff(), checksum2.get_buff())

    return (ncd_signatures + ncd_buffers) / 2.0
85
def filter_sort_meth_basic(x):
    """Rank candidate methods and keep only the best one.

    x -- dict mapping a candidate to its similarity value

    The items are ordered by ascending value (ties broken by the key).  When
    debugging is enabled every ranked candidate is logged.

    Returns a list holding the first (candidate, value) pair, or an empty
    list when x is empty.
    """
    ranked = sorted(x.iteritems(), key=lambda item: (item[1], item[0]))

    if get_debug():
        for candidate, value in ranked:
            meth = candidate.m
            debug("\t %s %s %s %d %f" % (meth.get_class_name(), meth.get_name(), meth.get_descriptor(), meth.get_length(), value))

    return ranked[:1]
94
def filter_sim_bb_basic(bb1, bb2, sim):
    """Distance between two basic-block checksums.

    bb1, bb2 -- objects offering get_buff()
    sim      -- similarity helper providing ncd(a, b)

    Returns sim.ncd() of the two buffers.
    """
    first = bb1.get_buff()
    second = bb2.get_buff()
    return sim.ncd(first, second)
98
class CheckSumBB:
    """Checksum of a basic block: joined instruction names and their SHA-256.

    basic_block -- object whose ``ins`` yields instructions with get_name()
    """

    def __init__(self, basic_block):
        self.basic_block = basic_block

        names = ""
        for ins in basic_block.ins:
            names += ins.get_name()
        self.buff = names

        # Only the instruction names are hashed; a variant that also mixed in
        # the number of childs and fathers of the block is disabled:
        #self.hash = hashlib.sha256( self.buff + "%d%d" % (len(basic_block.childs), len(basic_block.fathers)) ).hexdigest()
        self.hash = hashlib.sha256(names).hexdigest()

    def get_buff(self):
        """Return the concatenated instruction names."""
        return self.buff

    def get_hash(self):
        """Return the hexadecimal SHA-256 digest of the buffer."""
        return self.hash
114
def filter_checksum_bb_basic(basic_block):
    """Build the basic-block checksum used by the basic filter.

    basic_block -- the block to summarise

    Returns a new CheckSumBB instance.
    """
    checksum = CheckSumBB(basic_block)
    return checksum
117
def LCS(X, Y):
    """Compute the longest-common-subsequence length table of X and Y.

    X, Y -- sequences whose elements can be compared with ==

    Returns a (len(X)+1) x (len(Y)+1) list of lists C where C[i][j] is the
    LCS length of X[:i] and Y[:j]; row 0 and column 0 are all zeros.
    """
    width = len(Y) + 1
    table = [[0] * width for _ in range(len(X) + 1)]

    for i, x in enumerate(X, 1):
        above = table[i - 1]
        current = table[i]
        for j, y in enumerate(Y, 1):
            if x == y:
                current[j] = above[j - 1] + 1
            else:
                current[j] = max(current[j - 1], above[j])

    return table
130
def getDiff(C, X, Y, i, j, a, r):
    """Derive the additions and removals that turn X[:i] into Y[:j].

    C    -- LCS table of X and Y, as returned by LCS(X, Y)
    X, Y -- the two sequences (elements must be accepted by ord())
    i, j -- lengths of the prefixes to compare; callers start with
            len(X), len(Y)
    a    -- list extended in place with (index in Y, element) for additions
    r    -- list extended in place with (index in X, element) for removals

    Entries are appended in increasing position, and one debug line is
    emitted per element in that same order.  Returns None.

    The table is walked iteratively: the number of steps is up to i + j, so
    a recursive walk can exceed the interpreter recursion limit on long
    basic blocks.
    """
    # Walk back from (i, j) towards (0, 0), remembering each decision.
    steps = []
    while i > 0 or j > 0:
        if i > 0 and j > 0 and X[i-1] == Y[j-1]:
            steps.append((" ", i-1))
            i -= 1
            j -= 1
        elif j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
            steps.append(("+", j-1))
            j -= 1
        elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
            steps.append(("-", i-1))
            i -= 1
        else:
            # Only reachable if the table values cannot be ordered.
            break

    # Replay the decisions front to back so that a, r and the debug output
    # are filled in increasing position.
    for kind, pos in reversed(steps):
        if kind == " ":
            debug(" " + "%02X" % ord(X[pos]))
        elif kind == "+":
            a.append( (pos, Y[pos]) )
            debug(" + " + "%02X" % ord(Y[pos]))
        else:
            r.append( (pos, X[pos]) )
            debug(" - " + "%02X" % ord(X[pos]))
144
def toString(bb, hS, rS):
    """Encode the instructions of a basic block as a string of symbols.

    Every instruction is reduced to an identifier: its name, followed (when
    its type_ins_tag is 0) by the "%s" rendering of each operand whose first
    element contains "#".  Each distinct identifier gets the next free
    number, and the instruction is written as chr(number).

    bb -- basic block whose ``ins`` are encoded
    hS -- dict identifier -> number, updated in place (share it between the
          blocks that must use the same alphabet)
    rS -- dict chr(number) -> identifier, updated in place

    Returns one character per instruction, "" for an empty block.
    NOTE(review): chr() bounds the number of distinct identifiers (256 on
    Python 2) -- confirm this is acceptable for large blocks.
    """
    symbols = []

    for ins in bb.ins:
        ident = ins.get_name()
        for op in ins.get_operands():
            if ins.type_ins_tag == 0 and "#" in op[0]:
                ident += "%s" % op

        if ident not in hS:
            number = len(hS)
            hS[ident] = number
            rS[chr(number)] = ident

        symbols.append(chr(hS[ident]))

    return "".join(symbols)
# Value of the "diff_tag" attribute put on instructions of a diffed block:
# kept as is (ORIG), added (ADD) or removed (REMOVE).
DIFF_INS_TAG = dict(ORIG=0, ADD=1, REMOVE=2)

170 -class DiffBB :
171 - def __init__(self, bb1, bb2, info) :
172 self.bb1 = bb1 173 self.bb2 = bb2 174 self.info = info 175 176 self.start = self.bb1.start 177 self.end = self.bb1.end 178 self.name = self.bb1.name 179 180 self.di = None 181 self.ins = []
182
183 - def diff_ins(self, di) :
184 self.di = di 185 186 off_add = {} 187 off_rm = {} 188 for i in self.di.add_ins : 189 off_add[ i[0] ] = i 190 191 for i in self.di.remove_ins : 192 off_rm[ i[0] ] = i 193 194 nb = 0 195 for i in self.bb1.ins : 196 ok = False 197 if nb in off_add : 198 debug("%d ADD %s %s" % (nb, off_add[ nb ][2].get_name(), off_add[ nb ][2].get_operands())) 199 self.ins.append( off_add[ nb ][2] ) 200 setattr( off_add[ nb ][2], "diff_tag", DIFF_INS_TAG["ADD"] ) 201 del off_add[ nb ] 202 203 if nb in off_rm : 204 debug("%d RM %s %s" % (nb, off_rm[ nb ][2].get_name(), off_rm[ nb ][2].get_operands())) 205 self.ins.append( off_rm[ nb ][2] ) 206 setattr( off_rm[ nb ][2], "diff_tag", DIFF_INS_TAG["REMOVE"] ) 207 del off_rm[ nb ] 208 ok = True 209 210 if ok == False : 211 self.ins.append( i ) 212 debug("%d %s %s" % (nb, i.get_name(), i.get_operands())) 213 setattr( i, "diff_tag", DIFF_INS_TAG["ORIG"] ) 214 215 nb += 1 216 217 #print nb, off_add, off_rm 218 219 nbmax = nb 220 if off_add != {} : 221 nbmax = sorted(off_add)[-1] 222 if off_rm != {} : 223 nbmax = max(nbmax, sorted(off_rm)[-1]) 224 225 while nb <= nbmax : 226 if nb in off_add : 227 debug("%d ADD %s %s" % (nb, off_add[ nb ][2].get_name(), off_add[ nb ][2].get_operands())) 228 self.ins.append( off_add[ nb ][2] ) 229 setattr( off_add[ nb ][2], "diff_tag", DIFF_INS_TAG["ADD"] ) 230 del off_add[ nb ] 231 232 if nb in off_rm : 233 debug("%d RM %s %s" % (nb, off_rm[ nb ][2].get_name(), off_rm[ nb ][2].get_operands())) 234 self.ins.append( off_rm[ nb ][2] ) 235 setattr( off_rm[ nb ][2], "diff_tag", DIFF_INS_TAG["REMOVE"] ) 236 del off_rm[ nb ] 237 238 nb += 1
239 240 #print off_add, off_rm 241
242 - def set_childs(self, abb) :
243 setattr( self.bb1.ins[-1], "childs", self.bb1.childs ) 244 245 for i in self.ins : 246 if i == self.bb2.ins[-1] : 247 248 childs = [] 249 for c in self.bb2.childs : 250 if c[2].name in abb : 251 debug("SET %s %s" % (c[2], abb[ c[2].name ])) 252 childs.append( (c[0], c[1], abb[ c[2].name ]) ) 253 else : 254 debug("SET ORIG %s" % str(c)) 255 childs.append( c ) 256 257 setattr( i, "childs", childs )
258
259 - def show(self) :
260 print "\tADD INSTRUCTIONS :" 261 for i in self.di.add_ins : 262 print "\t\t", i[0], i[1], i[2].get_name(), i[2].get_operands() 263 264 print "\tREMOVE INSTRUCTIONS :" 265 for i in self.di.remove_ins : 266 print "\t\t", i[0], i[1], i[2].get_name(), i[2].get_operands()
267
class NewBB:
    """Wrapper around a basic block reported as new.

    start, end, name and ins are taken directly from the wrapped block.
    """

    def __init__(self, bb):
        self.bb = bb

        self.start = bb.start
        self.end = bb.end
        self.name = bb.name
        self.ins = bb.ins

    def set_childs(self, abb):
        """Compute self.childs from the wrapped block's childs.

        abb -- dict mapping a basic-block name to a replacement basic block

        A child whose target name is in abb has its target replaced by
        abb[name]; other childs are kept as they are.
        """
        remapped = []
        for child in self.bb.childs:
            target = child[2]
            if target.name in abb:
                debug("SET %s %s " % (target, abb[ target.name ]))
                remapped.append( (child[0], child[1], abb[ target.name ]) )
            else:
                debug("SET ORIG %s" % str(child))
                remapped.append( child )

        self.childs = remapped
288
class DiffINS:
    """Result of an instruction diff: what was added and what was removed.

    add_ins    -- list of added entries
    remove_ins -- list of removed entries

    Both lists are stored as given (not copied).
    """

    def __init__(self, add_ins, remove_ins):
        self.add_ins, self.remove_ins = add_ins, remove_ins
293
def filter_diff_ins_basic(dbb, sim, name_attribute):
    """Diff the instructions of the two blocks of dbb with an LCS.

    dbb            -- DiffBB-like object exposing bb1, bb2 and diff_ins()
    sim            -- unused
    name_attribute -- unused

    Both blocks are encoded with toString() over a shared alphabet, the LCS
    table is computed and getDiff() extracts the added / removed positions.
    The outcome is handed to dbb.diff_ins() as a DiffINS whose entries are
    (position, 0, instruction) tuples.  Returns None.
    """
    alphabet = {}
    reverse_alphabet = {}

    X = toString(dbb.bb1, alphabet, reverse_alphabet)
    Y = toString(dbb.bb2, alphabet, reverse_alphabet)

    debug("%s %d" % (repr(X), len(X)))
    debug("%s %d" % (repr(Y), len(Y)))

    added = []
    removed = []
    getDiff(LCS(X, Y), X, Y, len(X), len(Y), added, removed)
    debug(added)
    debug(removed)

    def collect(title, entries, instructions):
        # Log every entry and turn it into a (position, 0, instruction) tuple
        debug(title)
        collected = []
        for entry in entries:
            position = entry[0]
            debug(" \t %s %s %s" % (position, instructions[ position ].get_name(), instructions[ position ].get_operands()))
            collected.append( (position, 0, instructions[ position ]) )
        return collected

    final_add = collect("DEBUG ADD", added, dbb.bb2.ins)
    final_rm = collect("DEBUG REMOVE", removed, dbb.bb1.ins)

    dbb.diff_ins( DiffINS( final_add, final_rm ) )


# Value of the "bb_tag" attribute put on basic blocks of a diffed method:
# unchanged (ORIG), modified (DIFF) or new (NEW).
DIFF_BB_TAG = dict(ORIG=0, DIFF=1, NEW=2)

338 -class Method :
339 - def __init__(self, vm, vmx, m, sim) :
340 self.m = m 341 self.vm = vm 342 self.vmx = vmx 343 self.mx = vmx.get_method( m ) 344 345 self.sim = sim
346 347 ####### 348 # Attribute : 349 # Method <-> sorted Methods 350 # 351 # Method <-> Methods[0] : 352 # 353 # 354 # 355 #
356 - def add_attribute(self, name, func_meth, func_checksum_bb) :
357 bb = {} 358 bbhash = {} 359 360 fm = func_meth( self.m, self.sim ) 361 362 for i in self.mx.basic_blocks.get() : 363 bb[ i.name ] = func_checksum_bb( i ) 364 365 try : 366 bbhash[ bb[ i.name ].get_hash() ].append( bb[ i.name ] ) 367 except KeyError : 368 bbhash[ bb[ i.name ].get_hash() ] = [] 369 bbhash[ bb[ i.name ].get_hash() ].append( bb[ i.name ] ) 370 371 setattr(self, "checksum_" + name, fm) 372 373 setattr(self, "bb_" + name, bb) 374 setattr(self, "bb_sha256_" + name, bbhash) 375 setattr(self, "sha256_" + name, hashlib.sha256( fm.get_buff() ).hexdigest())
376
377 - def quick_similarity(self, name_attribute, new_method, func_sim) :
378 return func_sim( self, new_method, self.sim, name_attribute )
379
380 - def similarity(self, name_attribute, new_method, func_sim) :
381 x = None 382 try : 383 x = getattr( self, "hash_" + name_attribute ) 384 except AttributeError : 385 setattr( self, "hash_" + name_attribute, {} ) 386 x = getattr( self, "hash_" + name_attribute ) 387 388 x[ new_method ] = func_sim( self, new_method, self.sim, name_attribute )
389
390 - def sort(self, name_attribute, func_sort) :
391 x = getattr( self, "hash_" + name_attribute ) 392 z = func_sort( x ) 393 setattr( self, "sort_" + name_attribute, z ) 394 395 if z == [] : 396 return False 397 return True
398
399 - def checksort(self, name_attribute, method) :
400 z = getattr( self, "sort_" + name_attribute ) 401 for i in z : 402 if method == i[0] : 403 return True 404 return False
405
406 - def get_meth_first_sort(self, name_attribute) :
407 z = getattr( self, "sort_" + name_attribute ) 408 if z == [] : 409 return 1.0 410 411 return z[0][0]
412
413 - def get_value_first_sort(self, name_attribute) :
414 z = getattr( self, "sort_" + name_attribute ) 415 if z == [] : 416 return 1.0 417 418 return z[0][1]
419
420 - def diff(self, name_attribute, func_sim_bb, func_diff_ins):
421 z = getattr( self, "sort_" + name_attribute ) 422 if z == [] : 423 setattr(self, "dbb_" + name_attribute, {}) 424 setattr(self, "nbb_" + name_attribute, {}) 425 return 426 427 bb1 = getattr( self, "bb_" + name_attribute ) 428 429 ### Dict for diff basic blocks 430 ### vm1 basic block : vm2 basic blocks -> value (0.0 to 1.0) 431 diff_bb = {} 432 433 ### List to get directly all diff basic blocks 434 direct_diff_bb = [] 435 436 ### Dict for new basic blocks 437 new_bb = {} 438 439 ### Reverse Dict with matches diff basic blocks 440 associated_bb = {} 441 442 for b1 in bb1 : 443 diff_bb[ bb1[ b1 ] ] = {} 444 445 debug("%s 0x%x" % (b1, bb1[ b1 ].basic_block.end)) 446 for i in z : 447 bb2 = getattr( i[0], "bb_" + name_attribute ) 448 b_z = diff_bb[ bb1[ b1 ] ] 449 450 bb2hash = getattr( i[0], "bb_sha256_" + name_attribute ) 451 452 # If b1 is in bb2 : 453 # we can have one or more identical basic blocks to b1, we must add them 454 if bb1[ b1 ].get_hash() in bb2hash : 455 for equal_bb in bb2hash[ bb1[ b1 ].get_hash() ] : 456 b_z[ equal_bb.basic_block.name ] = 0.0 457 458 # If b1 is not in bb2 : 459 # we must check similarities between all bb2 460 else : 461 for b2 in bb2 : 462 b_z[ b2 ] = func_sim_bb( bb1[ b1 ], bb2[ b2 ], self.sim ) 463 464 sorted_bb = sorted(b_z.iteritems(), key=lambda (k,v): (v,k)) 465 466 debug("\t\t%s" % sorted_bb[:2]) 467 468 for new_diff in sorted_bb : 469 associated_bb[ new_diff[0] ] = bb1[ b1 ].basic_block 470 471 if new_diff[1] == 0.0 : 472 direct_diff_bb.append( new_diff[0] ) 473 474 if sorted_bb[0][1] != 0.0 : 475 diff_bb[ bb1[ b1 ] ] = (bb2[ sorted_bb[0][0] ], sorted_bb[0][1]) 476 direct_diff_bb.append( sorted_bb[0][0] ) 477 else : 478 del diff_bb[ bb1[ b1 ] ] 479 480 for i in z : 481 bb2 = getattr( i[0], "bb_" + name_attribute ) 482 for b2 in bb2 : 483 if b2 not in direct_diff_bb : 484 new_bb[ b2 ] = bb2[ b2 ] 485 486 dbb = {} 487 nbb = {} 488 # Add all different basic blocks 489 for d in diff_bb : 490 dbb[ d.basic_block.name ] = 
DiffBB( d.basic_block, diff_bb[ d ][0].basic_block, diff_bb[ d ] ) 491 492 # Add all new basic blocks 493 for n in new_bb : 494 nbb[ new_bb[ n ].basic_block ] = NewBB( new_bb[ n ].basic_block ) 495 del associated_bb[ n ] 496 497 setattr(self, "dbb_" + name_attribute, dbb) 498 setattr(self, "nbb_" + name_attribute, nbb) 499 500 # Found diff instructions 501 for d in dbb : 502 func_diff_ins( dbb[d], self.sim, name_attribute ) 503 504 # Set new childs for diff basic blocks 505 # The instructions will be tag with a new flag "childs" 506 for d in dbb : 507 dbb[ d ].set_childs( associated_bb ) 508 509 # Set new childs for new basic blocks 510 for d in nbb : 511 nbb[ d ].set_childs( associated_bb ) 512 513 # Create and tag all (orig/diff/new) basic blocks 514 self.create_bbs( name_attribute )
515
516 - def create_bbs(self, name_attribute) :
517 dbb = getattr(self, "dbb_" + name_attribute) 518 nbb = getattr(self, "nbb_" + name_attribute) 519 520 # For same block : 521 # tag = 0 522 # For diff block : 523 # tag = 1 524 # For new block : 525 # tag = 2 526 l = [] 527 for bb in self.mx.basic_blocks.get() : 528 if bb.name not in dbb : 529 # add the original basic block 530 setattr( bb, "bb_tag", DIFF_BB_TAG["ORIG"] ) 531 l.append( bb ) 532 else : 533 # add the diff basic block 534 setattr( dbb[ bb.name ], "bb_tag", DIFF_BB_TAG["DIFF"] ) 535 l.append( dbb[ bb.name ] ) 536 537 for i in nbb : 538 # add the new basic block 539 setattr( nbb[ i ], "bb_tag", DIFF_BB_TAG["NEW"] ) 540 l.append( nbb[ i ] ) 541 542 # Sorted basic blocks by addr (orig, new, diff) 543 l = sorted(l, key = lambda x : x.start) 544 setattr( self, "bbs_" + name_attribute, l )
545
546 - def getsha256(self, name_attribute) :
547 return getattr(self, "sha256_" + name_attribute)
548
549 - def show(self, name_attribute, details=False) :
550 print self.m.get_class_name(), self.m.get_name(), self.m.get_descriptor(), 551 print "with", 552 553 z = getattr( self, "sort_" + name_attribute ) 554 for i in z : 555 print i[0].m.get_class_name(), i[0].m.get_name(), i[0].m.get_descriptor(), i[1] 556 557 dbb = getattr(self, "dbb_" + name_attribute) 558 nbb = getattr(self, "nbb_" + name_attribute) 559 560 print "\tDIFF BASIC BLOCKS :" 561 for d in dbb : 562 print "\t\t", dbb[d].bb1.name, " --->", dbb[d].bb2.name, ":", dbb[d].info[1] 563 if details : 564 dbb[d].show() 565 566 print "\tNEW BASIC BLOCKS :" 567 for b in nbb : 568 print "\t\t", nbb[b].name 569 570 if details : 571 # show diff ! 572 bytecode.PrettyShow2( getattr(self, "bbs_" +name_attribute) )
573
574 - def show2(self, details=False) :
575 print self.m.get_class_name(), self.m.get_name(), self.m.get_descriptor() 576 if details : 577 bytecode.PrettyShow1( self.mx.basic_blocks.get() )

# Keys of a filter description (the inner dicts of FILTERS_DIFF)
FILTER_NAME = "FILTER_NAME" # filter attribute name
FILTER_CHECKSUM_METH = "FILTER_CHECKSUM_METH" # function to checksum a method
FILTER_SIM_METH = "FILTER_SIM_METH" # function to calculate the similarity between two methods
FILTER_SORT_METH = "FILTER_SORT_METH" # function to sort all diffing methods
FILTER_SKIP_METH = "FILTER_SKIP_METH" # function to skip methods
FILTER_MARK_METH = "FILTER_MARK_METH" # function to mark all diffing methods
FILTER_MARK_VM = "FILTER_MARK_VM" # function to mark vms
FILTER_CHECKSUM_BB = "FILTER_CHECKSUM_BB" # function to checksum a basic block
FILTER_SIM_BB = "FILTER_SIM_BB" # function to calculate the similarity between two basic blocks
FILTER_DIFF_INS = "FILTER_DIFF_INS" # function to diff two basic blocks
FILTER_CHECKSUM_VM = "FILTER_CHECKSUM_VM" # function to checksum a vm
FILTER_SIM_VM = "FILTER_SIM_VM" # function to calculate the similarity between two vms

# Default filters of Diff : filter name -> filter description
FILTERS_DIFF = {
    "FILTER_BASIC" : { FILTER_CHECKSUM_METH : filter_checksum_meth_basic,
                       FILTER_SIM_METH : filter_sim_meth_basic,
                       FILTER_SORT_METH : filter_sort_meth_basic,
                       FILTER_SKIP_METH : filter_skip_meth_basic,

                       FILTER_CHECKSUM_BB : filter_checksum_bb_basic,
                       FILTER_SIM_BB : filter_sim_bb_basic,

                       FILTER_DIFF_INS : filter_diff_ins_basic,
                     },
}

# Keys of the per-filter working dict built by Diff._init_filters
BASE = "base"
METHODS = "methods"
HASHSUM = "hashsum"
DIFFMETHODS = "diffmethods"
NEWMETHODS = "newmethods"
DELETEMETHODS = "deletemethods"
MATCHMETHODS = "matchmethods"
DIFFVMS = "diffvms"

class Diff(object):
    """Diff the methods of two VMs with a set of filters.

    vm1, vm2 -- indexable pairs: element 0 is the VM (must offer
                get_methods()), element 1 is handed to Method as its
                analysis object
    F        -- dict filter name -> filter description (see FILTERS_DIFF)

    All the work is done by the constructor.  For every filter the methods
    end up in the buckets returned by get_diff_methods(), get_new_methods(),
    get_delete_methods() and get_match_methods().
    """

    def __init__(self, vm1, vm2, F=FILTERS_DIFF):
        # Debug output is switched on unconditionally here
        set_debug()

        self.vms = [ vm1, vm2 ]
        self.vm1 = vm1
        self.vm2 = vm2

        self.sim = SIMILARITY( "classification/libsimilarity/libsimilarity.so" )
        self.sim.set_compress_type( SNAPPY_COMPRESS )

        self.F = F
        self.filters = {}

        self._init_filters()
        self._init_index_methods()
        self._init_similarity()
        self._init_sort_methods()
        self._init_diff_methods()
        self._init_new_methods()

    def _init_filters(self):
        """Create the working dict of every filter described in self.F."""
        for i in self.F:
            base = { FILTER_NAME : i }
            base.update( self.F[ i ] )

            self.filters[ i ] = {
                BASE : base,
                METHODS : {},
                HASHSUM : {},
                DIFFMETHODS : [],
                NEWMETHODS : [],
                DELETEMETHODS : [],
                MATCHMETHODS : [],
            }

    def _init_index_methods(self):
        """Wrap every non-skipped method in a Method and checksum it.

        Each VM is registered in every filter before its methods are walked,
        so a VM without any method (or whose methods are all skipped) has
        empty lists instead of missing keys; the later steps index
        METHODS / HASHSUM by VM and would otherwise raise KeyError.
        """
        for i in self.vms:
            vm = i[0]

            for fil in self.filters:
                self.filters[fil][METHODS].setdefault( vm, [] )
                self.filters[fil][HASHSUM].setdefault( vm, [] )

            for method in vm.get_methods():
                m = Method( vm, i[1], method, self.sim )

                for fil in self.filters:
                    base = self.filters[fil][BASE]

                    # Skip the method ?
                    if base[FILTER_SKIP_METH]( method ):
                        continue

                    self.filters[fil][METHODS][ vm ].append( m )
                    m.add_attribute( base[FILTER_NAME], base[FILTER_CHECKSUM_METH], base[FILTER_CHECKSUM_BB] )
                    self.filters[fil][HASHSUM][ vm ].append( m.getsha256( base[FILTER_NAME] ) )

    def _init_similarity(self):
        """Sort the methods of the first VM into diff / match candidates."""
        # Check if some methods in the first file has been modified
        first_vm = self.vm1[0]
        for fil in self.filters:
            f = self.filters[fil]
            name = f[BASE][FILTER_NAME]

            for j in f[METHODS][first_vm]:
                debug("SIM FOR %s %s %s" % (j.m.get_class_name(), j.m.get_name(), j.m.get_descriptor()))
                for i1 in f[METHODS]:
                    if i1 != first_vm:
                        # B1 not at 0.0 in BB2
                        if j.getsha256( name ) not in f[HASHSUM][i1]:
                            for k in f[METHODS][i1]:
                                # B2 not at 0.0 in BB1
                                if k.getsha256( name ) not in f[HASHSUM][first_vm]:
                                    j.similarity( name, k, f[BASE][FILTER_SIM_METH] )
                                    if j not in f[DIFFMETHODS]:
                                        f[DIFFMETHODS].append(j)
                                # B2 matched perfectly
                                else:
                                    if k not in f[MATCHMETHODS]:
                                        f[MATCHMETHODS].append( k )
                        # B1 matched perfectly
                        else:
                            if j not in f[MATCHMETHODS]:
                                f[MATCHMETHODS].append( j )

    def _init_sort_methods(self):
        """Rank the candidates of every diff method.

        Methods whose ranking is empty move from DIFFMETHODS to
        DELETEMETHODS.
        """
        # print "DEBUG DIFF METHODS"
        for fil in self.filters:
            f = self.filters[fil]

            delete_methods = []
            for j in f[DIFFMETHODS]:
                debug("%s %s %s %d" % (j.m.get_class_name(), j.m.get_name(), j.m.get_descriptor(), j.m.get_length()))
                ret = j.sort( f[BASE][FILTER_NAME], f[BASE][FILTER_SORT_METH] )
                if ret == False:
                    delete_methods.append( j )

            # Moved afterwards so that the list is not changed while walked
            for j in delete_methods:
                f[DELETEMETHODS].append( j )
                f[DIFFMETHODS].remove( j )

    def _init_diff_methods(self):
        """Diff the basic blocks of every diff method."""
        # print "DEBUG DIFF METHODS"
        for fil in self.filters:
            base = self.filters[fil][BASE]
            for j in self.filters[fil][DIFFMETHODS]:
                # print "DEBUG", j, j.m.get_class_name(), j.m.get_name(), j.m.get_descriptor()
                j.diff( base[FILTER_NAME], base[FILTER_SIM_BB], base[FILTER_DIFF_INS] )

    def _init_new_methods(self):
        """Collect the methods of the second VM that are totally new."""
        # Check if some methods in the second file are totally new !
        for fil in self.filters:
            f = self.filters[fil]
            name = f[BASE][FILTER_NAME]

            for j in f[METHODS][self.vm2[0]]:
                # new methods can't be in diff methods
                if j in f[DIFFMETHODS]:
                    continue

                # new methods hashs can't be in first file
                if j.getsha256( name ) in f[HASHSUM][self.vm1[0]]:
                    continue

                # new methods can't be compared to another one
                #print diff_method, "--->", j
                if not any( diff_method.checksort( name, j ) for diff_method in f[DIFFMETHODS] ):
                    f[NEWMETHODS].append( j )

    def get_diff_methods(self):
        """Return {filter name: list of methods that differ}."""
        return self.get_elem( DIFFMETHODS )

    def get_new_methods(self):
        """Return {filter name: list of methods only found in the second VM}."""
        return self.get_elem( NEWMETHODS )

    def get_delete_methods(self):
        """Return {filter name: list of methods left without any candidate}."""
        return self.get_elem( DELETEMETHODS )

    def get_match_methods(self):
        """Return {filter name: list of methods that matched perfectly}."""
        return self.get_elem( MATCHMETHODS )

    def get_elem(self, attr):
        """Return {filter name: copy of the bucket ``attr`` of that filter}."""
        d = {}
        for fil in self.filters:
            d[ fil ] = list( self.filters[fil][attr] )
        return d


######################### SIM ###############################

def filter_sim_meth_sim(m1, m2, sim, name_attribute):
    """Distance between two methods: NCD of their SIGNATURE_L0_4 signatures.

    m1, m2         -- objects exposing .vmx and .m
    sim            -- similarity helper providing ncd(a, b)
    name_attribute -- unused

    Returns sim.ncd() of the two signature strings.
    """
    def signature_text(meth):
        # Signature string of the wrapped method, as computed by its analysis
        return meth.vmx.get_method_signature( meth.m, predef_sign=SIGNATURE_L0_4 ).get_string()

    first = signature_text( m1 )
    second = signature_text( m2 )
    return sim.ncd( first, second )
759
class CheckSumVM:
    """Checksum wrapper of a VM; it only keeps the VM it was given.

    vm -- stored as is in the ``vm`` attribute
    """

    def __init__(self, vm):
        setattr(self, "vm", vm)
763
def filter_checksum_vm_sim(vm):
    """Build the VM checksum used by the similarity filter.

    vm -- the VM to wrap

    Returns a new CheckSumVM instance.
    """
    checksum = CheckSumVM( vm )
    return checksum

# Coefficient of each kind of VM-level data (all set to 1)
COEFF_SIM_VM = dict(STRING=1, CONSTANT_FLOAT=1, CLINIT=1)

def filter_sim_vm_sim(vm1, vm2, name_attribute, sim):
    """Compare two VMs through the strings they contain.

    vm1, vm2       -- checksum objects whose ``vm[0]`` offers get_strings()
    name_attribute -- unused
    sim            -- similarity helper providing ncd(a, b)

    Returns {"STRING": ncd} where ncd compares the concatenated strings of
    both VMs.
    """
    def all_strings(checksum):
        # Every string of the wrapped VM, glued together
        return ''.join( checksum.vm[0].get_strings() )

    first = all_strings( vm1 )
    second = all_strings( vm2 )

    result = {}
    result[ "STRING" ] = sim.ncd( first, second )
    return result
778
def filter_skip_meth_sim(m):
    """Skip filter of the similarity mode: ignore short methods.

    m -- method offering get_code(); the code object, when present, offers
         get_length()

    Returns True when the method has code whose length is below 100, False
    otherwise (including when the method has no code at all).
    """
    code = m.get_code()
    # Identity test: None is a singleton and must not go through __ne__
    return code is not None and code.get_length() < 100
786
def filter_mark_vm(values):
    """Turn the VM similarity results into marks.

    values -- dict kind of data -> similarity value

    Returns the values of the dict, whatever their keys.
    """
    marks = values.values()
    return marks
789
def filter_mark_meth(v):
    """Turn a method similarity value into a mark.

    v -- similarity value of a method

    Returns 1.0 for any value of 0.2 or more, the value itself otherwise.
    """
    return 1.0 if v >= 0.2 else v

# Default filters of Sim : filter name -> filter description.
# Uses the same keys as the diff filters, plus the VM-level and marking
# functions.
FILTERS_SIM = {
    "FILTER_SIM" : {
                       FILTER_CHECKSUM_VM : filter_checksum_vm_sim,
                       FILTER_SIM_VM : filter_sim_vm_sim,
                       FILTER_MARK_VM : filter_mark_vm,

                       FILTER_CHECKSUM_METH : filter_checksum_meth_basic,
                       FILTER_SIM_METH : filter_sim_meth_sim,
                       FILTER_SORT_METH : filter_sort_meth_basic,
                       FILTER_SKIP_METH : filter_skip_meth_sim,
                       FILTER_MARK_METH : filter_mark_meth,

                       FILTER_CHECKSUM_BB : filter_checksum_bb_basic,
                   },
}

### SIM :
#   DATA
#       string
#       constant (int, float ...)
#       clinit
#   CODE
#       Instructions : module Diff
#       Exceptions
#       API
#       CFG method
#       Fill array data
#       Format
824 -class Sim(Diff) :
825 - def __init__(self, vm1, vm2, F=FILTERS_SIM) :
826 #set_debug() 827 self.marks = {} 828 super(Sim, self).__init__(vm1, vm2, F) 829 830 self._init_diff_vms() 831 self._init_mark_methods() 832 833 print self.marks 834 for fil in self.marks : 835 s = 0.0 836 for i in self.marks[fil] : 837 s += (1.0 - i) 838 print "\t", (s/len(self.marks[fil])) * 100
839
840 - def _init_filters(self) :
841 super(Sim, self)._init_filters() 842 843 for i in self.F : 844 self.marks[ i ] = []
845
846 - def _init_diff_vms(self) :
847 self.sim.set_compress_type( XZ_COMPRESS ) 848 for fil in self.filters : 849 x1 = self.filters[fil][BASE][FILTER_CHECKSUM_VM]( self.vm1 ) 850 x2 = self.filters[fil][BASE][FILTER_CHECKSUM_VM]( self.vm2 ) 851 852 val = self.filters[fil][BASE][FILTER_SIM_VM]( x1, x2, self.filters[fil][BASE][FILTER_NAME], self.sim ) 853 self.marks[fil].extend( self.filters[fil][BASE][FILTER_MARK_VM]( val ) )
854
855 - def _init_diff_methods(self) :
856 # we don't want to diff instructions of basic blocks 857 pass
858
859 - def _init_mark_methods(self) :
860 # Change the compression to have a better result for a one <-> one comparison 861 self.sim.set_compress_type( XZ_COMPRESS ) 862 863 for fil in self.filters : 864 # mark diff methods 865 for j in self.filters[fil][DIFFMETHODS] : 866 debug("%s %s %s" % (j.m.get_class_name(), j.m.get_name(), j.m.get_descriptor())) 867 868 # get the first method which match 869 k = j.get_meth_first_sort( self.filters[fil][BASE][FILTER_NAME] ) 870 871 # recalculate the similarity to have better percentage with a better algorithm 872 v1 = j.quick_similarity( self.filters[fil][BASE][FILTER_NAME], k, self.filters[fil][BASE][FILTER_SIM_METH] ) 873 874 # filter the mark to eliminate totaly diff method 875 v2 = self.filters[fil][BASE][FILTER_MARK_METH]( v1 ) 876 self.marks[fil].append( v2 ) 877 878 # mark match methods 879 for m in self.filters[ fil ][ MATCHMETHODS ] : 880 v = self.filters[fil][BASE][FILTER_MARK_METH]( 0.0 ) 881 self.marks[fil].append( v )

        # Check if some methods in the second file are totally new !
        #for fil in self.filters :
        #    for j in self.filters[fil][METHODS][vm2[0]] :

        # new methods can't be in diff methods
        #        if j not in self.filters[fil][DIFFMETHODS] :
        # new methods hashes can't be in first file
        #            if j.getsha256( self.filters[fil][BASE][FILTER_NAME] ) not in self.filters[fil][HASHSUM][vm1[0]] :
        #                ok = True
        # new methods can't be compared to another one
        #                for diff_method in self.filters[fil][DIFFMETHODS] :
        #                    #print diff_method, "--->", j
        #                    if diff_method.checksort( self.filters[fil][BASE][FILTER_NAME], j ) :
        #                        ok = False

        #                        break

        # It's a new method in VM2, compare them to VM1
        #                if ok :
        #                    for k in self.filters[fil][METHODS][vm1[0]] :
        # k must have an invalid hash in vm2 to be compared
        #                        if k.getsha256( self.filters[fil][BASE][FILTER_NAME] ) not in self.filters[fil][HASHSUM][vm2[0]] :
        #                            j.similarity( k, self.filters[fil][BASE][FILTER_SIM_METH], self.filters[fil][BASE][FILTER_NAME] )
        #                    self.filters[fil][NEWMETHODS].append( j )

        #print "DEBUG NEW METHODS"
        #for fil in self.filters :
            #print "\tDEBUG", self.filters[fil][NEWMETHODS]
        #    for method in self.filters[fil][NEWMETHODS] :
                #print "DEBUG", method.m.get_class_name(), method.m.get_name(), method.m.get_descriptor()
        #        method.sort( self.filters[fil][BASE][FILTER_NAME] )
        #        self.scoring.append( method.getclosesort( self.filters[fil][BASE][FILTER_NAME] ) )
