LSTM-Based Binary Code Similarity Detection
This is a featured article from the Kanxue Forum.
Kanxue Forum author ID: 会飞的鱼油
1
Preface
2
Overall Framework
3
Function Embedding
Both W and b are parameters that the LSTM learns during training; see the official PyTorch documentation for the details of these parameters.
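For reference, a PyTorch nn.LSTM exposes exactly these learned weights and biases: one W/b pair for the input and one for the hidden state, per layer and per direction. A minimal sketch that prints them, with the layer sizes chosen arbitrarily for illustration:

import torch.nn as nn

lstm = nn.LSTM(input_size=300, hidden_size=60, num_layers=2, batch_first=True)
# Each layer l has weight_ih_l{l}, weight_hh_l{l}, bias_ih_l{l}, bias_hh_l{l}:
# the W and b of the input/forget/cell/output gates stacked along dim 0 (4*hidden_size rows).
for name, param in lstm.named_parameters():
    print(name, tuple(param.shape))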
4
Instruction Embedding
[0xXXXXXXXX] -> [mem]
[0xXXXXXXXX + index*scale + base] -> [disp + index*scale + base]
0xXXXXXXXX -> imm
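The rules above replace concrete addresses and constants with the generic tokens mem, disp and imm before the instructions are tokenized, so that arbitrary literal values do not blow up the word2vec vocabulary. The RefineAsmCode helper used later in the code is not shown in the article; a hypothetical sketch of such a normalizer (the regex patterns are my own assumption) could look like this:

import re

def RefineAsmCode(insnStr: str) -> str:
    # Hypothetical re-implementation; the article's own helper is not shown.
    insnStr = re.sub(r'\[0x[0-9a-fA-F]+\]', '[mem]', insnStr)        # [0xXXXXXXXX] -> [mem]
    insnStr = re.sub(r'\[0x[0-9a-fA-F]+\s*\+', '[disp +', insnStr)   # [0xXXXXXXXX + ...] -> [disp + ...]
    insnStr = re.sub(r'\b0x[0-9a-fA-F]+\b', 'imm', insnStr)          # bare constants -> imm
    return insnStr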
5
Code Implementation
from gensim.models import Word2Vec

# Train a skip-gram (sg=1) word2vec model with negative sampling over the instruction tokens
model = Word2Vec(tokensList, vector_size=wordDim, negative=15, window=5, min_count=1, workers=1, epochs=10, sg=1)
model.save('insn2vec.model')
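tokensList and wordDim are not defined in this snippet; presumably tokensList is a list of token sequences, one per normalized instruction, produced with the same tokenizer that InsnStr2Tensor uses below. A small sketch under that assumption:

import re

wordDim = 100   # assumed embedding size; the article does not state the value used
insns = ["mov eax, [mem]", "add eax, imm"]   # already-normalized instructions
tokensList = [re.findall(r'\w+|[\+\-\*\:\[\]\,]', s) for s in insns]
# tokensList[0] == ['mov', 'eax', ',', '[', 'mem', ']']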
The instruction embedding is implemented in the file lstm.py, as follows:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from gensim.models import Word2Vec

class instruction2vec(nn.Module):
    def __init__(self, word2vec_model_path:str):
        super(instruction2vec, self).__init__()
        word2vec = Word2Vec.load(word2vec_model_path)
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(word2vec.wv.vectors))
        self.token_size = word2vec.wv.vector_size            # embedding dimension of a single token
        self.key_to_index = word2vec.wv.key_to_index.copy()  # dict: token -> index
        self.index_to_key = word2vec.wv.index_to_key.copy()  # list: index -> token
        del word2vec

    def keylist_to_tensor(self, keyList:list):
        # look up the pretrained embedding of every token
        indexList = [self.key_to_index[token] for token in keyList]
        return self.embedding(torch.LongTensor(indexList))

    def InsnStr2Tensor(self, insnStr:str) -> torch.Tensor:
        insnStr = RefineAsmCode(insnStr)   # normalize addresses/immediates (mem/disp/imm)
        tokenList = re.findall(r'\w+|[\+\-\*\:\[\]\,]', insnStr)
        opcode_tensor = self.keylist_to_tensor(tokenList[0:1])[0]
        op_zero_tensor = torch.zeros(self.token_size)
        insn_tensor = None
        if 1 == len(tokenList):
            # no operand
            insn_tensor = torch.cat((opcode_tensor, op_zero_tensor, op_zero_tensor), dim=0)
        else:
            op_token_list = tokenList[1:]
            if op_token_list.count(',') == 0:
                # one operand: mean over its token embeddings gives a single 1-D vector
                op1_tensor = self.keylist_to_tensor(op_token_list)
                insn_tensor = torch.cat((opcode_tensor, op1_tensor.mean(dim=0), op_zero_tensor), dim=0)
            elif op_token_list.count(',') == 1:
                # two operands
                dot_index = op_token_list.index(',')
                op1_tensor = self.keylist_to_tensor(op_token_list[0:dot_index])
                op2_tensor = self.keylist_to_tensor(op_token_list[dot_index+1:])
                insn_tensor = torch.cat((opcode_tensor, op1_tensor.mean(dim=0), op2_tensor.mean(dim=0)), dim=0)
            elif op_token_list.count(',') == 2:
                # three operands: fold the 2nd and 3rd operands into one slot
                dot1_index = op_token_list.index(',')
                dot2_index = op_token_list.index(',', dot1_index+1)
                op1_tensor = self.keylist_to_tensor(op_token_list[0:dot1_index])
                op2_tensor = self.keylist_to_tensor(op_token_list[dot1_index+1:dot2_index])
                op3_tensor = self.keylist_to_tensor(op_token_list[dot2_index+1:])
                op2_tensor = (op2_tensor.mean(dim=0) + op3_tensor.mean(dim=0)) / 2
                insn_tensor = torch.cat((opcode_tensor, op1_tensor.mean(dim=0), op2_tensor), dim=0)
        if insn_tensor is None:
            raise ValueError("insn_tensor is None")
        insn_size = insn_tensor.shape[0]
        if self.token_size * 3 != insn_size:
            raise ValueError("insn_size %d != token_size*3 (%d)" % (insn_size, self.token_size * 3))
        return insn_tensor   # [token_size * 3]

    def forward(self, insnStrList:list) -> torch.Tensor:
        insnTensorList = [self.InsnStr2Tensor(insnStr) for insnStr in insnStrList]
        return torch.stack(insnTensorList)   # [insn_count, token_size * 3]
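A quick usage sketch of the class above (it assumes every token of the example instructions is present in the trained word2vec vocabulary):

i2v = instruction2vec("./insn2vec.model")
insn_tensor = i2v.InsnStr2Tensor("mov eax, [mem]")
print(insn_tensor.shape)   # torch.Size([token_size * 3]): opcode + operand1 + operand2 slots
batch = i2v(["mov eax, [mem]", "add eax, imm"])
print(batch.shape)         # torch.Size([2, token_size * 3])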
The function embedding is implemented as follows:
class SiameseNet(nn.Module):
    def __init__(self, hidden_size=60, n_layers=2, bidirectional=False):
        super(SiameseNet, self).__init__()
        self.insn_embedding = instruction2vec("./insn2vec.model")
        input_size = self.insn_embedding.token_size * 3
        # input_size is the dimension of one instruction, hidden_size the dimension of the whole instruction sequence
        self.lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True, bidirectional=bidirectional)
        self.D = int(bidirectional) + 1
        # self-attention parameters over the LSTM hidden states
        self.w_omega = nn.Parameter(torch.Tensor(hidden_size * self.D, hidden_size * self.D))
        self.b_omega = nn.Parameter(torch.Tensor(hidden_size * self.D))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_size * self.D, 1))
        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.b_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)

    def attention_score(self, x):
        # x: [batch_size, seq_len, hidden_size*D]
        u = torch.tanh(torch.matmul(x, self.w_omega) + self.b_omega)
        # u: [batch_size, seq_len, hidden_size*D]
        att = torch.matmul(u, self.u_omega)
        # att: [batch_size, seq_len, 1]
        att_score = F.softmax(att, dim=1)   # attention weight of each step's hidden state
        # att_score: [batch_size, seq_len, 1]
        scored_x = x * att_score            # broadcasted element-wise scaling
        return torch.sum(scored_x, dim=1)   # weighted sum over the sequence dimension

    def forward_once(self, input:list) -> torch.Tensor:
        lengths = []   # length of each instruction sequence
        out = []
        for insnStrList in input:
            insnVecTensor = self.insn_embedding(insnStrList)   # turn the instructions into vectors
            out.append(insnVecTensor)
            lengths.append(len(insnStrList))
        pad_out = pad_sequence(out, batch_first=True)   # zero-pad so every handler has the same seq_len
        pack_padded_out = pack_padded_sequence(pad_out, lengths, batch_first=True, enforce_sorted=False)
        packed_out, (hn, _) = self.lstm(pack_padded_out)   # input shape: [batch_size, seq_len, input_size]
        # hn:  [D*num_layers, batch_size, hidden_size]
        # out: [batch_size, seq_len, hidden_size*D], still containing the zero padding
        out, lengths = pad_packed_sequence(packed_out, batch_first=True)
        out = self.attention_score(out)
        return out

    def forward(self, input1, input2):
        out1 = self.forward_once(input1)   # out1: [batch_size, hidden_size*D]
        out2 = self.forward_once(input2)
        out = F.cosine_similarity(out1, out2, dim=1)
        return out
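A usage sketch for the Siamese network: each input is a batch of functions, each function a list of instruction strings, and the output is their pairwise cosine similarity. The article's training loop is not shown in this excerpt, so the label convention and the MSE loss below are only an assumption (any similarity-regression or contrastive loss fits the same interface); it also assumes all tokens appear in the word2vec vocabulary:

net = SiameseNet(hidden_size=60, n_layers=2, bidirectional=False)
funcs1 = [["push ebp", "mov ebp, esp", "ret"]]           # batch of one function
funcs2 = [["push ebp", "mov ebp, esp", "leave", "ret"]]
label = torch.tensor([1.0])          # 1.0 for a similar pair (assumed labeling scheme)

sim = net(funcs1, funcs2)            # sim: [batch_size], cosine similarity in [-1, 1]
loss = F.mse_loss(sim, label)        # assumed loss; not shown in the article
loss.backward()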
6
Model Evaluation
ntdll_7600_x64.dll
ntoskrnl_7600_x64.exe
win32kfull_17134_x64.sys
ntdll_7600_x32.dll
ntoskrnl_7600_x32.exe
win32kfull_17134_x32.sys
The first two figures above show how the loss on the training set and on the validation set decreases over the training iterations. The validation loss has already converged and stops decreasing while the training loss has not yet converged; training any further would very likely lead to overfitting. The main reason is that there is too little data.
7
Summary
Sequence A:              Sequence B:
mov eax, [0x12345678]    add eax, [0x12345678]
shl eax, 2               shl eax, 2
ret                      inc eax
References
A Simple Function Embedding Approach for Binary Similarity Detection
Understanding LSTM Networks
Kanxue ID: 会飞的鱼油
https://bbs.pediy.com/user-home-742617.htm