aboutsummaryrefslogtreecommitdiff
path: root/utils/intel-x86.py
diff options
context:
space:
mode:
Diffstat (limited to 'utils/intel-x86.py')
-rw-r--r--utils/intel-x86.py283
1 files changed, 283 insertions, 0 deletions
diff --git a/utils/intel-x86.py b/utils/intel-x86.py
new file mode 100644
index 0000000..a9a3a66
--- /dev/null
+++ b/utils/intel-x86.py
@@ -0,0 +1,283 @@
+from idc import *
+import sys
+import binascii
+from sets import Set
+import sys
+import re
+
def replace_farptr(matchobj):
    """Drop the ``far ptr `` qualifier from a matched operand.

    Intended as a ``re.sub`` callback: every occurrence of the literal text
    ``'far ptr '`` inside the matched string is removed and the remainder is
    returned unchanged.
    """
    pieces = matchobj.group().split('far ptr ')
    return ''.join(pieces)
+
def replace_ptr(matchobj):
    """Put square brackets around the token after the last space of the match.

    Example: ``'dword ptr 12345'`` becomes ``'dword ptr [12345]'``.
    The match is asserted to contain at least one space.
    """
    head, space, tail = matchobj.group().rpartition(' ')
    assert space != ''
    return '%s [%s]' % (head, tail)
+
def replace_ptr2(matchobj):
    """Put square brackets around everything after the last colon of the match.

    Example: ``'dword ptr es:12345'`` becomes ``'dword ptr es:[12345]'``.
    The match is asserted to contain at least one colon.
    """
    head, colon, tail = matchobj.group().rpartition(':')
    assert colon != ''
    return head + colon + '[' + tail + ']'
+
def replace_hex(matchobj):
    """Strip the hexadecimal suffix from a matched ``<hex digits>h`` constant.

    Intended as a ``re.sub`` callback for a pattern that accepts both ``h``
    and ``H`` as the suffix, so both spellings are removed here (previously
    only the lowercase one was, leaving ``0FFH`` untouched).

    Returns the matched text with every ``h``/``H`` removed.  Hex digits
    never contain that letter, so for a well-formed match only the suffix
    is affected.
    """
    nstr = matchobj.group()
    # Guard against being handed a byte-register name instead of a number.
    assert nstr not in ['ah', 'bh', 'ch', 'dh']
    return nstr.replace('h', '').replace('H', '')
+
def repl(matchobj):
    """Fold a leading hex displacement into the bracketed operand that follows.

    Intended as a ``re.sub`` callback for text such as ``'6050403h[eax]'``,
    which is rewritten to ``'[eax+06050403]'``: spaces are removed, the
    displacement is parsed as hexadecimal and re-emitted as eight lowercase
    hex digits appended inside the brackets.

    Both ``h`` and ``H`` suffixes are accepted; previously an uppercase
    suffix reached ``int(..., 16)`` and raised ``ValueError``.

    Returns the original matched text unchanged when it has no ``[`` or ``]``.
    """
    matched = matchobj.group()
    compact = matched.replace(' ', '')
    open_pos = compact.find('[')
    close_pos = compact.find(']')
    if open_pos == -1 or close_pos == -1:
        return matched
    # Everything before '[' is the displacement with its hex suffix.
    digits = compact[:open_pos].replace('h', '').replace('H', '')
    displacement = "%08x" % int(digits, 16)
    # compact[open_pos:close_pos] keeps '[' but drops ']', which is re-added
    # after the displacement.
    return compact[open_pos:close_pos] + '+' + displacement + ']'
+
# Pre-compiled patterns shared by the disassembly clean-up helpers.

# A bare segment override such as 'es:'.
p_seg = re.compile(r'(es:|ds:|cs:|fs:|gs:|ss:)')
# A segment override followed by an absolute hex address with 'h' suffix.
p_seg_abs = re.compile(r'(es:|ds:|cs:|fs:|gs:|ss:)[0-9][0-9a-fA-F]*h')

# 'far ptr SEG:OFF' operands.
p_farptr = re.compile(r'far\sptr\s[0-9a-fA-F]+:[0-9a-fA-F]+')
# '<size> ptr <number>' without and with a segment override.
p_ptr = re.compile(r'(byte|word|dword|qword|oword|fword)\sptr\s[0-9][0-9a-fA-F]*')
p_ptr2 = re.compile(r'(byte|word|dword|qword|oword|fword)\sptr\s(es:|ds:|cs:|fs:|gs:|ss:)[0-9][0-9a-fA-F]*')

# A hex constant that starts with a decimal digit and ends in 'h' or 'H'.
p_hex = re.compile(r'([0-9][0-9a-fA-F]*)(h|H)')
# Any run of whitespace.
p_spaces = re.compile(r'(\s+)')
# A hex displacement immediately followed by a bracketed operand.
p_repl = re.compile(r'[0-9][0-9a-fA-F]*(h|H)\s*\[[^\]]+\]')
+
# Rewrites applied by exact string equality in misc_replacements(); each
# pair is (text to look for, text to emit).  Every source string appears
# once (a duplicated 'enterw' entry was dropped).  'int 3' contains a space,
# so it can only match a complete instruction string, never a bare mnemonic.
replacements = [
    ('retn', 'ret'), ('retnw', 'ret'), ('iretw', 'iret'), ('retfw', 'retf'),
    ('pushfw', 'pushf'), ('popfw', 'popf'), ('pushaw', 'pusha'),
    ('popaw', 'popa'), ('enterw', 'enter'),
    ('cmova', 'cmovnbe'), ('cmovg', 'cmovnle'), ('cmovge', 'cmovnl'),
    ('leavew', 'leave'),
    ('int 3', 'int 03'),
]
+
def is_invalid_insn(insn_binary):
    """Return True when the encoded instruction must be left out of the output.

    insn_binary is the raw instruction encoding, either a ``str`` of
    one-character items or a ``bytes``/``bytearray`` of integers.

    Leading segment-override prefix bytes (26h, 2Eh, 36h, 3Eh, 64h, 65h) are
    skipped, then the first two remaining bytes are compared against a small
    blacklist.  True is also returned whenever fewer than two bytes remain
    after the prefixes, and for a ``0F 0D`` encoding that has no third byte.

    Previously empty input, input consisting only of prefixes, and a
    two-byte ``0F 0D`` raised ``IndexError``; they now yield True.
    """
    codes = [b if isinstance(b, int) else ord(b) for b in insn_binary]
    size = len(codes)
    segment_prefixes = (0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65)

    k = 0
    while k < size and codes[k] in segment_prefixes:
        k += 1

    # Two bytes are needed for the checks below.
    # NOTE(review): because this returns True, an opcode that is the last
    # byte of the encoding is always reported invalid, and the 0xd6 check
    # is only reachable when another byte follows - confirm this is intended.
    if k >= size - 1:
        return True

    b = codes[k]
    b2 = codes[k + 1]
    if b == 0x0f and b2 in (0x19, 0x24, 0x26, 0xa6, 0xa7):
        return True
    if b == 0xcd and b2 == 0x20:  # vxdcall
        return True
    if b == 0xd6:  # setalc
        return True
    if b == 0x0f and b2 == 0x0d:
        # A third byte is required here; treat its absence as invalid
        # instead of indexing past the end.
        if k + 2 >= size:
            return True
        if codes[k + 2] == 0x13:
            return True
    return False
+
# Miscellaneous replacements
def misc_replacements(opcode_str):
    """Map opcode_str through the module-level ``replacements`` table.

    The table is walked in order and compared by exact equality; a hit
    replaces the working value, which is then compared against the
    remaining entries.  Text with no matching entry is returned unchanged.
    """
    result = opcode_str
    for old_text, new_text in replacements:
        result = new_text if result == old_text else result
    return result
+
def remove_ds_prefix(insn_binary, rest_str):
    """Decide whether ``ds:`` overrides in rest_str should be rewritten.

    Returns True when the first 3Eh byte of insn_binary is preceded only by
    bytes from {66h, 67h, F0h, F2h, F3h} (including the case where it is the
    very first byte).  In every other case, including no 3Eh byte at all,
    the result is whether rest_str contains the text ``'ds:'``.
    """
    mentions_ds = 'ds:' in rest_str
    first_3e = insn_binary.find('\x3e')
    if first_3e == -1:
        # No 3Eh byte anywhere in the encoding.
        return mentions_ds
    other_prefixes = (0x66, 0x67, 0xF0, 0xF2, 0xF3)
    if all(ord(ch) in other_prefixes for ch in insn_binary[:first_3e]):
        return True
    # The 3Eh byte follows a byte outside the list above, so fall back to
    # the textual check.
    return mentions_ds
+
def replace_ds_seg(matchobj):
    """Bracket the address part of a matched ``<seg>:<address>`` operand.

    The first three characters are taken as the segment override.  A ``ds``
    override (any letter case) is dropped, giving ``[address]``; any other
    override is kept, giving ``xs:[address]``.
    """
    text = matchobj.group()
    segment, address = text[:3], text[3:]
    bracketed = '[%s]' % address
    if segment[:2].lower() == 'ds':
        return bracketed
    return segment + bracketed
+
def replace_seg(matchobj):
    """Bracket the address part of a matched ``<seg>:<address>`` operand.

    Text after the three-character override is wrapped in square brackets.
    The override itself is emitted in front unless it is ``ds`` (any letter
    case), in which case only the bracketed address is returned.
    """
    matched = matchobj.group()
    kept_override = '' if matched[0:2].lower() == 'ds' else matched[0:3]
    return kept_override + '[' + matched[3:] + ']'
+
def replace_lea_seg(matchobj):
    """Drop the three-character segment override and bracket the rest.

    Example: ``'es:1234h'`` becomes ``'[1234h]'``.
    """
    address = matchobj.group()[3:]
    return '[%s]' % address
+
def replace_segments(insn_binary, opcode_str, rest_str):
    """Rewrite segment-override operands in rest_str and return the result.

    For ``lea`` (any letter case): absolute ``seg:address`` operands become
    ``[address]``; if that changes nothing, every bare ``seg:`` override is
    removed instead.

    For all other opcodes: absolute ``seg:address`` operands are rewritten by
    replace_ds_seg when remove_ds_prefix() is true for this instruction, and
    by replace_seg otherwise.
    """
    if opcode_str.lower() != 'lea':
        if remove_ds_prefix(insn_binary, rest_str):
            handler = replace_ds_seg
        else:
            handler = replace_seg
        return p_seg_abs.sub(handler, rest_str)

    # LEA: strip segments from absolute offsets first, relative ones second.
    rewritten = p_seg_abs.sub(replace_lea_seg, rest_str)
    if rewritten != rest_str:
        return rewritten
    return p_seg.sub('', rest_str)
+
# Apply fixes to IDA opcode
def ida_disasm_fix(insn_binary, insn_str):
    """Normalise one line of IDA disassembly text and return it.

    insn_binary is the raw encoding of the instruction (consulted for the
    3Eh prefix checks); insn_str is the disassembly text with comments
    already removed.

    The mnemonic (text before the first space) is only passed through
    misc_replacements(); every other rewrite is applied to the operand text
    alone, so the mnemonic cannot be altered by the operand patterns.

    The replacement table is additionally applied to the reassembled
    instruction, because it contains a whole-instruction entry ('int 3')
    that can never match the mnemonic once the operand has been split off.
    """
    # Remove extra spaces and tabs. Replace tabs with spaces
    insn_str = p_spaces.sub(r' ', insn_str)
    # Avoid opcode changing
    pos = insn_str.find(' ')
    if pos == -1:
        return misc_replacements(insn_str)  # This is opcode like 'cli'
    opcode_str = insn_str[0:pos]
    rest_str = insn_str[pos + 1:]
    # remove 'small'
    rest_str = rest_str.replace('small ', '')
    # Transform '6050403h[eax], al' to '[eax+06050403], al'
    rest_str = p_repl.sub(repl, rest_str)
    rest_str = replace_segments(insn_binary, opcode_str, rest_str)
    # Remove any 'ds:' still present when remove_ds_prefix() says so
    if remove_ds_prefix(insn_binary, rest_str):
        rest_str = rest_str.replace('ds:', '')
    # Replace 'xmmword' with 'oword'
    rest_str = rest_str.replace('xmmword', 'oword')
    # Remove 'h' after hex constants
    rest_str = p_hex.sub(replace_hex, rest_str)
    # Transform 'ptr 012345' -> 'ptr [012345]'
    rest_str = p_ptr.sub(replace_ptr, rest_str)
    rest_str = p_ptr2.sub(replace_ptr2, rest_str)
    # Transform 'call far ptr 1817:16151413' -> 'call 1817:16151413'
    rest_str = p_farptr.sub(replace_farptr, rest_str)
    opcode_str = misc_replacements(opcode_str)
    # Second pass over the full text so whole-instruction table entries
    # (those containing a space) get a chance to match.
    return misc_replacements(opcode_str + ' ' + rest_str)
+
def get_insn(ea, len):
    """Read ``len`` bytes starting at address ``ea`` and return them as a string.

    Each byte is fetched with ``Byte()`` and converted with ``chr()``; a
    ``len`` of zero yields the empty string.  The parameter name shadows the
    ``len`` builtin inside this function.
    """
    return ''.join(chr(Byte(ea + offset)) for offset in range(len))
+
def insn_write(f, insn_binary, insn_str, header):
    """Write one instruction record to ``f`` and flush it.

    With ``header`` true the record is a C initialiser line,
    ``{<size>, "\\xNN...", "<insn_str>"},``.  Otherwise it is the hex dump
    of insn_binary, one space, and insn_str.  Both forms end with a newline.
    insn_binary is asserted to be non-empty.
    """
    assert len(insn_binary) != 0
    if header:
        escaped = ''.join('\\x%02x' % ord(ch) for ch in insn_binary)
        record = '{%d, "%s", "%s"},\n' % (len(insn_binary), escaped, insn_str)
    else:
        record = binascii.hexlify(insn_binary) + (' %s\n' % insn_str)
    f.write(record)
    f.flush()
+
# Normalize operand (replace numeric operands with -1)
def normalize_operand(op_type, op_str):
    """Return ``"-1"`` for operand types carrying a number, else op_str.

    The types replaced are o_mem, o_displ, o_imm, o_near and o_far; any
    other type leaves the operand text unchanged.
    """
    numeric_kinds = [o_mem, o_displ, o_imm, o_near, o_far]
    return "-1" if op_type in numeric_kinds else op_str
+
def is_unique(set, ea, insn_str, mnem):
    """Report whether the instruction at ``ea`` has a not-yet-seen shape.

    ``set`` is the caller's collection of shapes already seen (the parameter
    name shadows the builtin inside this function).  A shape is the mnemonic
    plus the first three operands after normalize_operand().  A new shape is
    added to ``set`` and True is returned; a known shape returns False.

    Text starting with ``db`` (any letter case) is always reported unique
    and is not recorded.
    """
    # Consider undisassemblable opcodes as unique
    if insn_str[0:2].lower() == 'db':
        return True
    op_types = [GetOpType(ea, index) for index in range(3)]
    op_texts = [GetOpnd(ea, index) for index in range(3)]
    normalized = [normalize_operand(kind, text)
                  for kind, text in zip(op_types, op_texts)]
    key = "%s|%s|%s|%s" % tuple([mnem] + normalized)
    if key not in set:
        set.add(key)
        return True
    return False
+
def generate_x86(filename):
    """Enumerate three-byte encodings inside IDA and dump their disassembly.

    The first three bytes at the database entry point are patched with every
    combination of byte values (each visited in the order 10h..FFh, then
    00h..0Fh), the bytes are turned into code and the resulting text is
    written to ``filename``, one record per line.  A verbose trace goes to
    ``filename + '.log'``.

    Undecodable byte sequences are written as ``db`` records.  Decodable ones
    are written only when is_unique() reports a new shape and
    is_invalid_insn() does not reject the encoding.  The uniqueness cache is
    reset whenever the first byte changes.

    Both files are closed even if an IDA call or a write raises.
    """
    seen = set()
    ea = GetEntryPoint(GetEntryOrdinal(0))
    # Seed 20 bytes with 10h, 11h, ... so the bytes following the three
    # enumerated ones are fixed, known values.
    for i in range(0, 20):
        PatchByte(ea + i, i + 0x10)

    n = 0
    insn_len = 0
    flog = open(filename + '.log', 'wt')
    try:
        f = open(filename, 'wt')
        try:
            for p0 in range(0x10, 0x110):
                seen.clear()  # Cached opcodes become irrelevant for a new first byte
                PatchByte(ea, p0 & 0xff)
                for p1 in range(0x10, 0x110):
                    PatchByte(ea + 1, p1 & 0xff)
                    for p2 in range(0x10, 0x110):
                        PatchByte(ea + 2, p2 & 0xff)
                        # The value returned by MakeCode is used as the
                        # instruction length; 0 is handled as "not decoded".
                        insn_len = MakeCode(ea)
                        disasm = GetDisasm(ea)
                        mnem = GetMnem(ea)
                        # Remove comments
                        pos = disasm.find(';')
                        if pos != -1:
                            disasm = disasm[0:pos]
                        # Remove spaces at start and end
                        disasm = disasm.strip(' ')
                        if disasm[0:2] == 'db' or disasm == '' or insn_len == 0:
                            insn_binary = get_insn(ea, 10)
                            flog.write('INPUT hex: %20s; disasm: "%s"\n' % (binascii.hexlify(insn_binary), disasm))
                            insn_write(f, insn_binary, 'db', False)
                            flog.write('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tOUTPUT disasm: "db"\n')
                            continue
                        if not is_unique(seen, ea, disasm, mnem):
                            continue
                        insn_binary = get_insn(ea, insn_len)
                        flog.write('INPUT hex: %20s; disasm: "%s"\n' % (binascii.hexlify(insn_binary), disasm))
                        if is_invalid_insn(insn_binary):
                            # Newline added so the next record starts on its
                            # own line in the log.
                            flog.write('*** Skipping invalid opcode ***\n')
                            continue
                        # Add unique disasms to file
                        disasm = ida_disasm_fix(insn_binary, disasm)
                        flog.write('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tOUTPUT disasm: "%s"\n' % disasm)
                        flog.flush()
                        insn_write(f, insn_binary, disasm, False)
                        n += 1
                        if n % 1000 == 0:
                            print('%d opcodes processed' % n)
                        if insn_len == 2 or insn_len == 1:
                            break  # Optimization: break if third byte does not matter
                    if insn_len == 1:
                        break  # Optimization: break if second byte does not matter
        finally:
            f.close()
        flog.write('Finished\n')
    finally:
        flog.close()
+
def main():
    """Generate the opcode listing at ``./intel-x86-opcodes.txt``."""
    output_path = "./intel-x86-opcodes.txt"
    generate_x86(output_path)


if __name__ == "__main__":
    main()
+
+