1.基础知识
Python中的字节码(bytecode) 是一种数据类型, Python代码的编译结果就是bytecode对象。bytecode对象可以由Python加载后直接运行,而pyc文件就是bytecode在硬盘上的保存形式。 Python常常会把py文件编译成字节码文件,存放在__pycache__子目录内,用.pyc结尾。
为加快执行速度, Pyinstaller, py2exe等库会把编译生成的bytecode打包进exe中。掌握了字节码的知识, 离提取exe的源码, 以及反提取也就不远了!
为什么要使用字节码？如果py文件不经过编译, 每次都直接解释执行, 只会降低Python的执行速度。如果把py文件直接编译成机器码(类似C语言), 由于不同平台使用的机器码不同, 就会失去Python跨平台的特性。所以, 使用字节码是Python的最佳选择。
字节码的结构
>>> def f():print("hello world")
>>> code=f.__code__
>>> print(dir(code))
打印出来的就是字节码的属性。 (在 Python 3.8及以后的版本中, 增加了一个属性 co_posonlyargcount) 参考: Python官方文档。
2.包装字节码
在python中, bytecode对象的属性是不可修改的。如:
>>> def f():pass
>>> f.__code__.co_code = b''
Traceback (most recent call last): ... ...
AttributeError: readonly attribute
为了使bytecode对象更易用, 我编写了Code类, 用于包装 (wrap)字节码对象, 使字节码对象变得更易操作。
import sys
try:
from importlib._bootstrap_external import MAGIC_NUMBER
except ImportError:
from importlib._bootstrap import MAGIC_NUMBER
from types import CodeType, FunctionType
from collections import OrderedDict
import marshal
import dis
import pickle
# True when this interpreter's code objects carry ``co_posonlyargcount``
# (added in Python 3.8).
_py38 = hasattr(compile('', '', 'exec'), 'co_posonlyargcount')


class Code:
    """Mutable wrapper around a built-in (read-only) code object.

    Every ``co_*`` field listed in ``_default_args`` can be read and assigned
    as an ordinary attribute.  The values live in the ordered mapping
    ``_args``; the real code object ``_code`` is rebuilt from that mapping
    after every assignment (``auto_update=True``) or lazily right before the
    code object is used (``auto_update=False``).

    NOTE(review): ``_update_code`` passes the fields positionally to
    ``types.CodeType``.  The field order used here matches the constructor of
    CPython 3.7 - 3.10; newer interpreters changed that signature, so
    rebuilding is expected to fail there -- confirm before relying on it.

    # Used by doctest
    >>> def f():print("Hello")
    >>> c=Code.fromfunc(f)
    >>> c.co_consts
    (None, 'Hello')
    >>> c.co_consts=(None, 'Hello World!')
    >>> c.exec()
    Hello World!
    >>>
    >>> import os,pickle,tempfile
    >>> temp=tempfile.gettempdir()
    >>> with open(os.path.join(temp,"temp.pkl"),'wb') as f:
    ...     pickle.dump(c,f)
    ...
    >>>
    >>> with open(os.path.join(temp,"temp.pkl"),'rb') as f:
    ...     pickle.load(f).to_func()()
    ...
    Hello World!
    >>>
    >>> c.to_pycfile(os.path.join(temp,"temppyc.pyc"))
    >>> sys.path.append(temp)
    >>> import temppyc
    Hello World!
    >>> Code.from_pycfile(os.path.join(temp,"temppyc.pyc")).exec()
    Hello World!
    """

    # Field name -> default value, kept in the positional order that
    # ``CodeType`` is called with in ``_update_code``.
    _default_args = OrderedDict([
        ('co_argcount', 0),
        ('co_kwonlyargcount', 0),
        ('co_nlocals', 0),
        ('co_stacksize', 1),
        ('co_flags', 64),
        ('co_code', b'd\x00S\x00'),
        ('co_consts', (None,)),
        ('co_names', ()),
        ('co_varnames', ()),
        ('co_filename', ''),
        ('co_name', ''),
        ('co_firstlineno', 1),
        ('co_lnotab', b''),
        ('co_freevars', ()),
        ('co_cellvars', ()),
    ])
    if _py38:
        # Insert co_posonlyargcount as the second field, right after
        # co_argcount.
        _default_args['co_posonlyargcount'] = 0
        _default_args.move_to_end('co_posonlyargcount', last=False)
        _default_args.move_to_end('co_argcount', last=False)
    # Expected type of each field, derived from its default value.
    _arg_types = {key: type(value) for key, value in _default_args.items()}

    def __init__(self, code=None, auto_update=True):
        """Wrap *code*.

        *code* may be ``None`` (start from the defaults), another ``Code``
        (copy its fields) or a built-in code object.  *auto_update* selects
        whether the real code object is rebuilt on every field assignment.
        """
        # Bypass our own __setattr__, which needs _args to exist already.
        super().__setattr__('_args', self._default_args.copy())
        if code is None:
            self._update_code()
        elif isinstance(code, Code):
            # Copy the values instead of sharing the mapping, so that later
            # edits to one wrapper never show up in the other.
            self._args.update(code._args)
            self._update_code()
        else:
            self._code = code
            for key in self._args:
                self._args[key] = getattr(code, key)
        self.auto_update = auto_update

    def __getattr__(self, name):
        # Only reached when normal lookup fails: serve co_* fields from _args.
        _args = object.__getattribute__(self, '_args')
        if name in _args:
            return _args[name]
        return object.__getattribute__(self, name)

    def __setattr__(self, name, value):
        """Store a ``co_*`` field (type-checked) or a plain attribute.

        Raises ``TypeError`` when a field value is not an instance of the
        type of that field's default.
        """
        if name not in self._args:
            return object.__setattr__(self, name, value)
        if not isinstance(value, self._arg_types[name]):
            raise TypeError(name, value)
        self._args[name] = value
        if self.auto_update:
            self._update_code()

    def _update_code(self):
        """Rebuild the real code object from the current field values."""
        self._code = CodeType(*self._args.values())

    def _fresh_code(self):
        """Return the code object, rebuilding it first if edits are pending."""
        if not self.auto_update:
            self._update_code()
        return self._code

    def exec(self, globals_=None, locals_=None):
        """Run the code with ``exec``.

        A falsy *globals_* is replaced by a minimal ``__main__``-like
        namespace; a falsy *locals_* is replaced by a copy of that same
        default namespace.
        """
        code = self._fresh_code()
        default = {"__builtins__": __builtins__, "__doc__": None,
                   "__loader__": __loader__, "__name__": "__main__"}
        globals_ = globals_ or default
        if not locals_:
            locals_ = default.copy()
        return exec(code, globals_, locals_)

    def eval(self, globals_=None, locals_=None):
        """Evaluate the code with ``eval`` and return the result."""
        return eval(self._fresh_code(), globals_, locals_)

    def __getstate__(self):
        # Only the field values are pickled; the code object is rebuilt
        # from them on load.
        return self._args

    def __setstate__(self, state):
        """Restore from the field mapping produced by ``__getstate__``."""
        super().__setattr__('_args', self._default_args.copy())
        self._args.update(state)
        if not _py38:
            # A pickle written by Python 3.8+ carries a field this
            # interpreter's CodeType does not accept.
            self._args.pop('co_posonlyargcount', None)
        # __init__ is not run on unpickling, so auto_update must be set here;
        # otherwise exec() and field assignment raise AttributeError.
        self.auto_update = True
        self._update_code()

    def __dir__(self):
        return object.__dir__(self) + list(self._args.keys())

    @classmethod
    def fromfunc(cls, function):
        """Wrap the code object of *function*."""
        c = function.__code__
        return cls(c)

    @classmethod
    def fromstring(cls, string, mode='exec', filename=''):
        """Compile source text *string* and wrap the result."""
        return cls(compile(string, filename, mode))

    @classmethod
    def from_pycfile(cls, filename):
        """Load and wrap the code object stored in the .pyc file *filename*."""
        with open(filename, 'rb') as f:
            data = f.read()
        # The header is 16 bytes when byte 16 is the marshal type code for a
        # code object ('c', optionally with the 0x80 reference flag);
        # otherwise it is treated as the older 12-byte header.
        if len(data) > 16 and (data[16] & 0x7F) == 0x63:
            header_size = 16
        else:
            header_size = 12
        return cls(marshal.loads(data[header_size:]))

    def to_code(self):
        """Return the wrapped built-in code object."""
        return self._fresh_code()

    def to_func(self, globals_=None, name=''):
        """Build a function from the code; globals default to ``builtins``."""
        if globals_ is None:
            import builtins
            globals_ = vars(builtins)
        return FunctionType(self._fresh_code(), globals_, name)

    def to_pycfile(self, filename):
        """Write the code to *filename* as a .pyc with a zero-filled header."""
        # 16-byte header (magic + 12 bytes) since Python 3.7, 12 bytes before.
        padding = 12 if sys.version_info >= (3, 7) else 8
        header = MAGIC_NUMBER + b'\x00' * padding
        payload = marshal.dumps(self._fresh_code())
        with open(filename, 'wb') as f:
            f.write(header)
            f.write(payload)

    def pickle(self, filename):
        """Pickle this wrapper into the file *filename*."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    def show(self, *args, **kw):
        # NOTE(review): ``desc`` is not defined in the visible part of this
        # file -- confirm where it comes from.
        desc(self._fresh_code(), *args, **kw)

    view = show

    def info(self):
        """Print a summary of the code object via ``dis.show_code``."""
        dis.show_code(self._fresh_code())

    def dis(self, *args, **kw):
        """Disassemble the code object via ``dis.dis``."""
        dis.dis(self._fresh_code(), *args, **kw)
3.压缩字节码
压缩字节码的原理是构造一个新的bytecode, 也就是压缩壳, 然后把原先的bytecode用marshal.dumps() 转为bytes 类型, 然后压缩bytes , 再放入压缩壳中。类似EXE文件的加壳。 程序运行时, 先解压这个bytes 数据, 再使用marshal.loads() 重新转换为bytecode, 并执行。
import sys,marshal,zlib
try:
from importlib._bootstrap_external import MAGIC_NUMBER
except ImportError:
from importlib._bootstrap import MAGIC_NUMBER
def dump_to_pyc(pycfilename, code, pycheader=None):
    """Write *code* to *pycfilename* wrapped in a zlib "compression shell".

    *code* is a ``Code`` wrapper.  Its code object is marshalled, compressed
    with zlib and stored as a constant of a small loader code object; the
    loader is what actually gets written to the .pyc file.

    *pycheader* is the raw .pyc header to emit.  When omitted, a header made
    of the interpreter's magic number followed by zero bytes is used.
    """
    c = Code()
    # Hand-assembled loader, equivalent to:
    #     import zlib, marshal
    #     exec(marshal.loads(zlib.decompress(<co_consts[2]>)))
    # NOTE(review): it uses LOAD_METHOD/CALL_METHOD opcodes, so it presumably
    # only runs on CPython 3.7 - 3.10 -- confirm for other versions.
    c.co_code = (
        b'd\x00d\x01l\x00Z\x00d\x00d\x01l\x01Z\x01e\x02'
        b'e\x01\xa0\x03e\x00\xa0\x04d\x02\xa1\x01\xa1\x01\x83\x01\x01\x00d\x01S\x00'
    )
    c.co_names = ('zlib', 'marshal', 'exec', 'loads', 'decompress')
    c.co_consts = (0, None, zlib.compress(marshal.dumps(code._code),
                                          zlib.Z_BEST_COMPRESSION))
    c.co_flags = 64
    c.co_stacksize = 6
    if pycheader is None:
        # sys.version_info works on every platform and compares numerically
        # (sys.winver is Windows-only, and '3.10' < '3.7' as strings).
        padding = 12 if sys.version_info >= (3, 7) else 8
        pycheader = MAGIC_NUMBER + b'\x00' * padding
    # Everything is prepared before the file is opened, so a failure above
    # cannot leave a truncated target file behind.
    with open(pycfilename, 'wb') as f:
        f.write(pycheader)
        marshal.dump(c._code, f)
# Command-line driver: replace every .pyc file named on the command line
# with its compressed equivalent, keeping the file's original header.
if len(sys.argv) == 1:
    print('Usage: %s [filename]' % sys.argv[0])
for file in sys.argv[1:]:
    # Read through a context manager so the handle is closed before the same
    # path is reopened for writing by dump_to_pyc.
    with open(file, 'rb') as f:
        data = f.read()
    # 0xe3 is the marshal type byte that starts a code object; finding it at
    # offset 16 means a 16-byte header, otherwise a 12-byte one is assumed.
    if data[16] == 0xe3:
        old_header = data[:16]
        data = data[16:]
    else:
        old_header = data[:12]
        data = data[12:]
    co = Code(marshal.loads(data))
    dump_to_pyc(file, co, pycheader=old_header)
    print('Processed:', file)
4.加壳字节码
加壳字节码与压缩不同, 加壳字节码会阻止字节码被uncompyle6 之类的反编译器反编译。 程序在每个bytecode的co_code 末尾加上多余的S\x00 。 但co_consts 里依然有bytecode, 而这些bytecode又有co_consts , 所以需要递归处理。
import sys,marshal
from inspect import iscode
try:
from importlib._bootstrap_external import MAGIC_NUMBER
except ImportError:
from importlib._bootstrap import MAGIC_NUMBER
def process_code(co):
    """Obfuscate the ``Code`` wrapper *co* in place and return it.

    Line-number information and the file name are blanked, and a redundant
    ``S\\x00`` instruction pair is appended to the bytecode.  Code objects
    found among the constants are processed recursively and replaced by
    their processed versions.
    """
    co.co_lnotab = b''
    co.co_code += b'S\x00'
    co.co_filename = ''
    # Build the new constants tuple in a single pass instead of re-slicing
    # the whole tuple for every nested code object.
    co.co_consts = tuple(
        process_code(Code(const))._code if iscode(const) else const
        for const in co.co_consts
    )
    return co
def dump_to_pyc(pycfilename, code, pycheader=None):
    """Write the code object held by *code* to the .pyc file *pycfilename*.

    *code* is any object exposing the real code object as ``_code``.
    *pycheader* is the raw .pyc header to emit; when omitted, a header made
    of the interpreter's magic number followed by zero bytes is used.
    """
    if pycheader is None:
        # sys.version_info works on every platform and compares numerically
        # (sys.winver is Windows-only, and '3.10' < '3.7' as strings).
        padding = 12 if sys.version_info >= (3, 7) else 8
        pycheader = MAGIC_NUMBER + b'\x00' * padding
    with open(pycfilename, 'wb') as f:
        f.write(pycheader)
        marshal.dump(code._code, f)
# Command-line driver: obfuscate every .pyc file named on the command line
# in place, keeping the file's original header.
for file in sys.argv[1:]:
    with open(file, 'rb') as f:
        data = f.read()
    # 0xe3 is the marshal type byte that starts a code object; finding it at
    # offset 16 means a 16-byte header, otherwise a 12-byte one is assumed.
    if data[16] == 0xe3:
        old_header = data[:16]
        data = data[16:]
    else:
        old_header = data[:12]
        data = data[12:]
    # Load the code object first: process_code and dump_to_pyc below need
    # both ``co`` and ``old_header`` to be defined.
    co = Code(marshal.loads(data))
    process_code(co)
    dump_to_pyc(file, co, pycheader=old_header)
    print('Processed:', file)
尝试反编译加壳后的pyc文件, 意外发现:
L. 2 0 LOAD_CONST 0
2 LOAD_CONST None
4 IMPORT_NAME sys
6 STORE_NAME sys
8 LOAD_CONST 0
10 LOAD_CONST None
12 IMPORT_NAME marshal
14 STORE_NAME marshal
... ...
294 LOAD_CONST None
296 RETURN_VALUE
298 RETURN_VALUE
-1 RETURN_LAST
Parse error at or near `None' instruction at offset -1
说明加壳后的字节码的确能阻止uncompyle6 等反编译器对其进行反编译。
5.解压缩, 脱壳字节码
解压缩, 脱壳字节码, 也就是解压原先压缩壳中的bytes 数据, 再使用marshal.loads() 重新转换为bytecode, 并写入pyc文件。
import sys,marshal,traceback
try:
from importlib._bootstrap_external import MAGIC_NUMBER
except ImportError:
from importlib._bootstrap import MAGIC_NUMBER
def dump_to_pyc(pycfilename, data, pycheader=None):
    """Write already-marshalled bytes *data* to the .pyc file *pycfilename*.

    *data* is written verbatim after the header.  *pycheader* is the raw
    .pyc header to emit; when omitted, a header made of the interpreter's
    magic number followed by zero bytes is used.
    """
    if pycheader is None:
        # 16-byte header (magic + 12 bytes) since Python 3.7, 12 bytes before.
        padding = 12 if sys.version_info >= (3, 7) else 8
        pycheader = MAGIC_NUMBER + b'\x00' * padding
    with open(pycfilename, 'wb') as f:
        f.write(pycheader)
        f.write(data)
# Command-line driver: for every .pyc file named on the command line, pull
# the compressed payload out of its loader shell and write it back as a
# plain .pyc, keeping the file's original header.  A failure on one file is
# reported with a traceback and does not stop the remaining files.
for file in sys.argv[1:]:
    try:
        with open(file, 'rb') as stream:
            raw = stream.read()
        # 227 (0xe3) is the marshal type byte that starts a code object;
        # finding it at offset 16 means a 16-byte header, else 12 bytes.
        header_len = 16 if raw[16] == 227 else 12
        old_header, body = raw[:header_len], raw[header_len:]
        shell = marshal.loads(body)
        names = shell.co_names
        # The loader shell imports its compression module first.
        modname = names[0] if len(names) >= 1 else ''
        if modname not in ('bz2', 'lzma', 'zlib'):
            raise TypeError('不是压缩的pyc文件: ' + file)
        codec = __import__(modname)
        data = codec.decompress(shell.co_consts[2])
        # Result is discarded: this only checks that the payload unmarshals.
        marshal.loads(data)
        dump_to_pyc(file, data, old_header)
        print('Processed:', file)
    except Exception:
        traceback.print_exc()
总结
前面介绍了Python字节码的压缩, 加壳和脱壳, 主要途径是修改字节码。 Python 字节码这一特性有广泛的用途, 例如pyc文件加密、结构优化, 防止反编译pyc文件等。
|