aboutsummaryrefslogtreecommitdiff
path: root/utils/intel-x86.py
blob: a9a3a667cbf5e17977e83cf6bafd6a8e8300ea20 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
from idc import *
import sys
import binascii
from sets import Set
import sys
import re

def replace_farptr(matchobj):
	"""Return the matched text with every 'far ptr ' qualifier removed.

	Intended as a replacement callback for a regex sub() call.
	"""
	matched = matchobj.group(0)
	return matched.replace('far ptr ', '')

def replace_ptr(matchobj):
	"""Wrap the text after the last space of the match in square brackets.

	Example: 'dword ptr 012345' becomes 'dword ptr [012345]'.
	"""
	text = matchobj.group()
	cut = text.rfind(' ')
	# The calling pattern always contains a space before the address
	assert cut != -1
	return '%s [%s]' % (text[:cut], text[cut + 1:])

def replace_ptr2(matchobj):
	"""Wrap the text after the last colon of the match in square brackets.

	Example: 'byte ptr es:1234' becomes 'byte ptr es:[1234]'.
	"""
	text = matchobj.group()
	cut = text.rfind(':')
	# The calling pattern always contains a segment override with a colon
	assert cut != -1
	return '%s[%s]' % (text[:cut + 1], text[cut + 1:])

def replace_hex(matchobj):
	"""Strip the hexadecimal suffix marker from a matched constant.

	Intended as a replacement callback for p_hex, which accepts both an
	'h' and an 'H' suffix; both spellings are removed here so that the
	callback agrees with the pattern.

	Returns the matched text without any 'h'/'H' characters.
	"""
	nstr = matchobj.group()
	# Byte-register names must never reach this callback
	assert nstr.lower() not in ('ah', 'bh', 'ch', 'dh')
	return nstr.replace('h', '').replace('H', '')

def repl(matchobj):
	"""Move a displacement written before a bracket inside the bracket.

	Example: '6050403h[eax]' becomes '[eax+06050403]'. The displacement
	is re-emitted as eight lowercase hex digits.

	Both 'h' and 'H' suffixes are accepted, matching what p_repl allows;
	previously an uppercase suffix made int() raise ValueError.

	Returns the match unchanged when it contains no '[' or no ']'.
	"""
	s1 = matchobj.group()
	# Spaces are dropped everywhere, including inside the brackets
	s2 = s1.replace(' ', '')
	pos1 = s2.find('[')
	pos2 = s2.find(']')
	if pos1 == -1 or pos2 == -1:
		return s1
	digits = s2[0:pos1].replace('h', '').replace('H', '')
	displacement = int(digits, 16)
	# s2[pos1:pos2] keeps the opening '[' and drops the closing ']'
	return '%s+%08x]' % (s2[pos1:pos2], displacement)

# Pre-compiled patterns used by the disassembly fix-up routines below.

# A segment-override prefix: es:, ds:, cs:, fs:, gs: or ss:
p_seg  = re.compile(r'(es:|ds:|cs:|fs:|gs:|ss:)')
# A segment override directly followed by a number that starts with a
# decimal digit and ends with a lowercase 'h'
p_seg_abs = re.compile(r'(es:|ds:|cs:|fs:|gs:|ss:)[0-9][0-9a-fA-F]*h')
# 'far ptr SEG:OFF' where both parts are hex digits
p_farptr = re.compile(r'far\sptr\s[0-9a-fA-F]+:[0-9a-fA-F]+')
# '<size> ptr <number>' where the number starts with a decimal digit
p_ptr = re.compile(r'(byte|word|dword|qword|oword|fword)\sptr\s[0-9][0-9a-fA-F]*')
# '<size> ptr <seg>:<number>' (same as p_ptr with a segment override)
p_ptr2 = re.compile(r'(byte|word|dword|qword|oword|fword)\sptr\s(es:|ds:|cs:|fs:|gs:|ss:)[0-9][0-9a-fA-F]*')
# A number starting with a decimal digit and suffixed with 'h' or 'H'
p_hex = re.compile(r'([0-9][0-9a-fA-F]*)(h|H)')
# Any run of whitespace
p_spaces = re.compile(r'(\s+)')
# An 'h'/'H'-suffixed number, optional whitespace, then a bracketed expression
p_repl = re.compile(r'[0-9][0-9a-fA-F]*(h|H)\s*\[[^\]]+\]')

# (from, to) rewrite rules applied by exact string comparison.
# The ('enterw', 'enter') rule used to be listed twice; one copy is enough.
replacements = [
	('retn', 'ret'),
	('retnw', 'ret'),
	('iretw', 'iret'),
	('retfw', 'retf'),
	('pushfw', 'pushf'),
	('popfw', 'popf'),
	('pushaw', 'pusha'),
	('popaw', 'popa'),
	('enterw', 'enter'),
	('cmova', 'cmovnbe'),
	('cmovg', 'cmovnle'),
	('cmovge', 'cmovnl'),
	('leavew', 'leave'),
	('int 3', 'int 03'),
]

def is_invalid_insn(insn_binary):
	"""Tell whether an encoded instruction should be skipped.

	insn_binary -- the instruction bytes as a string of characters

	Leading segment-override prefix bytes (26, 2e, 36, 3e, 64, 65 hex)
	are skipped first. The instruction is then reported as invalid when
	fewer than two bytes remain, or when the remaining bytes start with
	one of the rejected encodings listed below.

	Input that is empty or consists only of prefix bytes, and a '0f 0d'
	opcode with no following byte, are reported as invalid instead of
	raising IndexError.
	"""
	size = len(insn_binary)
	k = 0
	# Bounded scan: stop at the first non-prefix byte or at the end
	while k < size and ord(insn_binary[k]) in (0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65):
		k += 1
	# NOTE(review): this also rejects every one-byte opcode; kept as in
	# the original - confirm that is intended.
	if k >= size - 1:
		return True
	b = ord(insn_binary[k])
	b2 = ord(insn_binary[k + 1])
	if b == 0x0f and b2 in (0x19, 0x24, 0x26, 0xa6, 0xa7):
		return True
	if b == 0xcd and b2 == 0x20: #vxdcall
		return True
	if b == 0xd6: #setalc
		return True
	if b == 0x0f and b2 == 0x0d:
		# The byte after '0f 0d' is inspected, so it has to exist
		if k + 2 >= size:
			return True
		if ord(insn_binary[k + 2]) == 0x13:
			return True
	return False

# Miscellaneous replacements
def misc_replacements(opcode_str):
	"""Run opcode_str through the replacements table.

	Rules are tried in table order and compared by exact equality; a
	rule that fires changes the value seen by the rules after it.
	"""
	current = opcode_str
	for rule in replacements:
		if rule[0] == current:
			current = rule[1]
	return current

def remove_ds_prefix(insn_binary, rest_str):
	"""Decide whether 'ds:' should be stripped from the operand text.

	insn_binary -- the instruction bytes as a string of characters
	rest_str    -- operand text of the disassembled instruction

	Returns True when a 3Eh byte is found that is preceded only by
	66/67/F0/F2/F3 bytes (or by nothing). In every other case the result
	is whether rest_str contains 'ds:'.
	"""
	idx = insn_binary.find('\x3e')
	if idx != -1:
		only_prefixes_before = True
		for ch in insn_binary[:idx]:
			if ord(ch) not in (0x66, 0x67, 0xF0, 0xF2, 0xF3):
				# Some other byte precedes 3Eh, so it is not a leading prefix
				only_prefixes_before = False
				break
		if only_prefixes_before:
			return True
	# No usable 3Eh prefix: the answer depends on the operand text alone
	return rest_str.find('ds:') != -1

def replace_ds_seg(matchobj):
	"""Bracket the address of a '<seg>:<address>' match.

	A 'ds' override is dropped ('ds:10h' -> '[10h]'); any other
	override is kept ('es:10h' -> 'es:[10h]').
	"""
	text = matchobj.group()
	bracketed = '[%s]' % text[3:]
	if text[:2].lower() == 'ds':
		return bracketed
	return text[:3] + bracketed

def replace_seg(matchobj):
	"""Bracket the address of a '<seg>:<address>' match.

	A 'ds' override is dropped, any other override is kept in front of
	the bracket.

	NOTE(review): this behaves exactly like replace_ds_seg; confirm
	whether the 'ds:' override was meant to be preserved here.
	"""
	matched = matchobj.group()
	kept_prefix = '' if matched[0:2].lower() == 'ds' else matched[0:3]
	return kept_prefix + '[' + matched[3:] + ']'

def replace_lea_seg(matchobj):
	"""Drop the first three characters of the match and bracket the rest.

	Example: 'es:10h' becomes '[10h]'.
	"""
	return '[%s]' % matchobj.group()[3:]

def replace_segments(insn_binary, opcode_str, rest_str):
	"""Rewrite segment-prefixed absolute addresses in the operand text.

	insn_binary -- the instruction bytes (used for the ds-prefix decision)
	opcode_str  -- instruction mnemonic
	rest_str    -- operand text

	Returns the rewritten operand text.
	"""
	if opcode_str.lower() == 'lea':
		# LEA: absolute addresses lose their segment and gain brackets;
		# if there were none, plain segment prefixes are simply removed
		bracketed = p_seg_abs.sub(replace_lea_seg, rest_str)
		if bracketed != rest_str:
			return bracketed
		return p_seg.sub('', rest_str)
	# Everything else: ?s:01020304 -> ?s:[01020304], ds:01020304 -> [01020304]
	if remove_ds_prefix(insn_binary, rest_str):
		callback = replace_ds_seg
	else:
		callback = replace_seg
	return p_seg_abs.sub(callback, rest_str)

# Apply fixes to IDA opcode
def ida_disasm_fix(insn_binary, insn_str):
	"""Rewrite one IDA disassembly line into the normalized output syntax.

	insn_binary -- the instruction bytes (consulted for ds-prefix handling)
	insn_str    -- disassembly text of the instruction

	Returns the rewritten instruction text.

	The replacements table is applied to the mnemonic and, in addition,
	to the assembled instruction: rules whose source contains a space
	(such as 'int 3') could otherwise never match.
	"""
	# Remove extra spaces and tabs. Replace tabs with spaces
	insn_str = p_spaces.sub(r' ', insn_str)
	# Split off the mnemonic so that the operand fixes cannot alter it
	pos = insn_str.find(' ')
	if pos == -1:
		return misc_replacements(insn_str) # This is opcode like 'cli'
	opcode_str = insn_str[0:pos]
	rest_str = insn_str[pos+1:]
	# remove 'small'
	rest_str = rest_str.replace('small ', '')
	# Transform '6050403[eax], al' to '[eax+6050403], al'
	rest_str = p_repl.sub(repl, rest_str)
	rest_str = replace_segments(insn_binary, opcode_str, rest_str)
	# Remove 'ds:' if no 3Eh prefix found
	if remove_ds_prefix(insn_binary, rest_str):
		rest_str = rest_str.replace('ds:', '')
	# Replace 'xmmword' to 'oword'
	rest_str = rest_str.replace('xmmword', 'oword')
	# Remove 'h' after hex constants
	rest_str = p_hex.sub(replace_hex, rest_str)
	# Transform 'ptr 012345' -> 'ptr [012345]'
	rest_str = p_ptr.sub(replace_ptr, rest_str)
	rest_str = p_ptr2.sub(replace_ptr2, rest_str)
	# Transform 'call far ptr 1817:16151413' -> 'call 1817:16151413'
	rest_str = p_farptr.sub(replace_farptr, rest_str)
	opcode_str = misc_replacements(opcode_str)
	# Second pass over the complete text for whole-instruction rules
	return misc_replacements(opcode_str + ' ' + rest_str)

def get_insn(ea, len):
	"""Read len bytes starting at address ea and return them as a string.

	Each byte is fetched with Byte() and converted with chr().
	"""
	return ''.join([chr(Byte(ea + offset)) for offset in range(0, len)])

def insn_write(f, insn_binary, insn_str, header):
	"""Write one instruction record to f and flush it.

	f           -- writable file-like object
	insn_binary -- the instruction bytes; must not be empty
	insn_str    -- disassembly text to store next to the bytes
	header      -- when true, emit a C initializer line of the form
	               {size, "\\x..", "text"},  otherwise emit the hex dump
	               of the bytes followed by a space and the text
	"""
	assert len(insn_binary) != 0
	if header:
		escaped = ''.join(['\\x%02x' % ord(ch) for ch in insn_binary])
		line = '{%d, "' % len(insn_binary) + escaped + '", "' + insn_str + '"},\n'
	else:
		line = binascii.hexlify(insn_binary) + (' %s\n' % insn_str)
	f.write(line)
	f.flush()

# Normalize operand (replace numeric operands with -1)
def normalize_operand(op_type, op_str):
	"""Return "-1" for memory, displacement, immediate, near and far
	operand types; return op_str unchanged for every other type."""
	numeric_kinds = [o_mem, o_displ, o_imm, o_near, o_far]
	if op_type not in numeric_kinds:
		return op_str
	return "-1"

def is_unique(set, ea, insn_str, mnem):
	"""Report whether the instruction at ea has a not-yet-seen signature.

	set      -- collection of signatures seen so far; updated in place
	ea       -- address of the instruction
	insn_str -- disassembly text of the instruction
	mnem     -- instruction mnemonic

	The signature is the mnemonic plus the first three operands, each
	passed through normalize_operand. Text starting with 'db' (any case)
	is always reported as unique and is not recorded.
	"""
	# Consider undisassemblable opcodes as unique
	if insn_str[:2].lower() == 'db':
		return True
	kinds = [GetOpType(ea, 0), GetOpType(ea, 1), GetOpType(ea, 2)]
	texts = [GetOpnd(ea, 0), GetOpnd(ea, 1), GetOpnd(ea, 2)]
	fields = [mnem]
	for kind, text in zip(kinds, texts):
		fields.append(normalize_operand(kind, text))
	signature = "%s|%s|%s|%s" % tuple(fields)
	if signature in set:
		return False
	set.add(signature)
	return True

def _scan_opcodes(ea, f, flog):
	"""Patch every three-byte pattern at ea and record each new decoding.

	Helper for generate_x86: f receives the opcode records, flog the trace.
	"""
	seen = Set()
	n = 0
	insn_len = 0
	for p0 in range(0x10, 0x110):
		# Byte values run 0x10..0xff and then wrap around to 0x00..0x0f
		q0 = p0 & 0xff
		seen.clear() # Signatures are only tracked per first byte
		PatchByte(ea, q0)
		for p1 in range(0x10, 0x110):
			q1 = p1 & 0xff
			PatchByte(ea + 1, q1)
			for p2 in range(0x10, 0x110):
				q2 = p2 & 0xff
				PatchByte(ea + 2, q2)
				insn_len = MakeCode(ea)
				disasm = GetDisasm(ea)
				mnem = GetMnem(ea)
				# Remove comments
				pos = disasm.find(';')
				if pos != -1:
					disasm = disasm[0:pos]
				# Remove spaces at start and end
				disasm = disasm.strip(' ')
				if disasm[0:2] == 'db' or disasm == '' or insn_len == 0:
					# Not decodable: record the next ten bytes as 'db'
					insn_binary = get_insn(ea, 10)
					flog.write('INPUT hex: %20s; disasm: "%s"\n' % (binascii.hexlify(insn_binary), disasm))
					insn_write(f, insn_binary, 'db', False)
					flog.write('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tOUTPUT disasm: "db"\n')
					continue
				if not is_unique(seen, ea, disasm, mnem):
					continue
				insn_binary = get_insn(ea, insn_len)
				flog.write('INPUT hex: %20s; disasm: "%s"\n' % (binascii.hexlify(insn_binary), disasm))
				if is_invalid_insn(insn_binary):
					# Newline added so the next log entry starts on its own line
					flog.write('*** Skipping invalid opcode ***\n')
					continue
				# Add unique disasms to file
				disasm = ida_disasm_fix(insn_binary, disasm)
				flog.write('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tOUTPUT disasm: "%s"\n' % disasm)
				flog.flush()
				insn_write(f, insn_binary, disasm, False)
				n += 1
				if n % 1000 == 0:
					print('%d opcodes processed' % n)
				if insn_len == 2 or insn_len == 1:
					break # Optimization: break if third byte does not matter
			if insn_len == 1:
				break # Optimization: break if second byte does not matter

def generate_x86(filename):
	"""Enumerate byte patterns at the entry point and dump their disassembly.

	filename -- path of the opcode list to create; a trace of every
	            processed pattern is written to filename + '.log'

	The first 20 bytes at the entry point are overwritten through
	PatchByte before the scan starts. Both output files are closed even
	if the scan raises; 'Finished' is logged only after a complete scan.
	"""
	ea = GetEntryPoint(GetEntryOrdinal(0))
	# Seed the scratch area with the values 0x10, 0x11, ...
	for i in range(0, 20):
		PatchByte(ea + i, i + 0x10)
	flog = open(filename + '.log', 'wt')
	try:
		f = open(filename, 'wt')
		try:
			_scan_opcodes(ea, f, flog)
		finally:
			f.close()
		flog.write('Finished\n')
	finally:
		flog.close()

def main():
	"""Generate the opcode list into ./intel-x86-opcodes.txt."""
	output_path = "./intel-x86-opcodes.txt"
	generate_x86(output_path)

# Run the generator only when this file is executed as a script
if __name__ == "__main__":
	main()