Detection Requirement
- Detect the DGA domains from the corpus
What is the DGA
- Domain Generation Algorithm
- uses a seed (e.g. the current date) to generate pseudo-random domains in order to evade
detection
Training Data
- train.txt
- format : domain name,label
Test Data
- test.txt
- format : domain name
Output
- result.txt
- format : domain name,label
test.py reads train.txt and test.txt, then labels the domains in test.txt and writes them to result.txt
Hint
- Python3 required
- domain name length
- numbers in the domain name
- entropy of letters
- segmentation
安装scikit-learn
pip install scikit-learn
train.txt
ywrbxg.info,dga
ikpjfwkcyzo.info,dga
lncyajfvi.info,dga
...
14g0m9ifidf44qixi2qd1icc.com,dga
1witl72kyutqa1gti1iqjiyjpg.biz,dga
3lwl0q1jnzthz1orq3601k5tgbo.biz,dga
...
google.com,notdga
youtube.com,notdga
tmall.com,notdga
...
课上给出了一个Demo,我放在这里:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Module-level accumulator: initData() appends parsed Domain objects here and main() iterates over it.
domainlist = []
class Domain:
    """One labelled domain record from the class demo.

    Stores the fields parsed from a data line and exposes them as a feature
    vector and an integer class label for the classifier.
    """

    def __init__(self, _name, _label, _min, _max, _numip, _ipset):
        """Store the parsed fields unchanged on the instance."""
        self.name = _name
        self.label = _label
        self.ttlmin = _min
        self.ttlmax = _max
        self.numip = _numip
        self.ipset = _ipset

    def returnData(self):
        """Return the feature vector ``[ttlmin, ttlmax, numip]``."""
        return [self.ttlmin, self.ttlmax, self.numip]

    def returnLabel(self):
        """Return 0 when the label is ``"good"``, otherwise 1."""
        return 0 if self.label == "good" else 1
def initData(filename):
    """Parse *filename* and append one ``Domain`` per record to ``domainlist``.

    Blank lines and lines starting with ``#`` are skipped. Every other line
    is split on commas and read as
    ``name,label,ttlmin,ttlmax,numIP,ip_1,...,ip_numIP``.

    Raises:
        ValueError: if ttlmin, ttlmax or numIP is not an integer.
        IndexError: if a line has fewer fields than it declares.
    """
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # Fixed: the method is str.startswith; "startswitch" raised
            # AttributeError on the first line read.
            if line == "" or line.startswith("#"):
                continue
            tokens = line.split(",")
            name = tokens[0]
            label = tokens[1]
            ttlmin = int(tokens[2])
            ttlmax = int(tokens[3])
            numIP = int(tokens[4])
            # Indexing (rather than slicing) keeps the IndexError when a line
            # lists fewer addresses than numIP claims.
            ipset = {tokens[5 + i] for i in range(numIP)}
            # Fixed: a module-level function has no "self"; passing it raised
            # NameError and did not match Domain.__init__'s parameters.
            domainlist.append(Domain(name, label, ttlmin, ttlmax, numIP, ipset))
def main():
    """Train a random forest on the demo data and print four sample predictions."""
    initData("baddomaininfo")
    initData("gooddomaininfo")

    featureMatrix = [item.returnData() for item in domainlist]
    labelList = [item.returnLabel() for item in domainlist]

    # random_state is fixed so repeated runs build the same forest.
    clf = RandomForestClassifier(random_state=0)
    clf.fit(featureMatrix, labelList)

    # Each sample is [ttlmin, ttlmax, numip], printed one prediction per line.
    for sample in ([3600, 10000, 3], [3600, 3600, 1], [100, 100, 3], [100, 100, 1]):
        print(clf.predict([sample]))


if __name__ == '__main__':
    main()
接下来我们就以Demo为基础 按照题目给出的提示来尝试一下吧
domain name length
这个特征比较简单
def initData(filename):
    """Parse ``name,label`` lines from *filename* into ``domainlist``.

    Blank lines and lines starting with ``#`` are skipped. The only feature
    extracted at this step is the length of the domain name.

    Raises:
        IndexError: if a line has no comma-separated label field.
    """
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # Fixed: the method is str.startswith; "startswitch" raised
            # AttributeError on the first line read.
            if line == "" or line.startswith("#"):
                continue
            tokens = line.split(",")
            name = tokens[0]
            length = len(name)
            label = tokens[1]
            domainlist.append(Domain(name, label, length))
numbers in the domain name
我们可以设定一个专门的函数 用于计算name中的数字个数
def cal_num(str):
num = 0
for i in str:
if i.isdigit():
num += 1
return num
# Usage: count the digits of a parsed domain name (`name` is assumed to be set by the surrounding parsing code).
num = cal_num(name)
entropy of letters
我们可以参考这篇blog中的代码
import math
def cal_entropy(str):
    """Return the Shannon entropy, in bits, of the letters a-z in *str*.

    The string is lower-cased first, so the count is case-insensitive.
    Digits, dots and every other character outside a-z are ignored.

    Returns:
        The entropy as a float, or 0.0 when *str* contains no a-z letter.
    """
    letter = [0] * 26
    for ch in str.lower():
        # Only a-z have a bucket. The previous isalpha() test also accepted
        # non-ASCII letters, whose index fell outside the 26-slot table.
        if "a" <= ch <= "z":
            letter[ord(ch) - ord("a")] += 1

    sumLetter = sum(letter)
    if sumLetter == 0:
        # No letters at all (e.g. "" or "123"): avoid dividing by zero.
        return 0.0

    h = 0.0
    for count in letter:
        if count:
            p = 1.0 * count / sumLetter
            h += -(p * math.log(p, 2))
    return h
segmentation
def cal_seg(str):
    """Return the number of ``.`` separators in *str*.

    The parameter name shadows the builtin ``str``; it is kept so existing
    callers are unaffected.
    """
    return str.count(".")
输出ML结果
我们需要从train.txt文件中,读出domain,将特征提取出来存放到列表中,用于训练模型 我们需要从test.txt文件中,读出domain,将特征提取出来存放到列表中,根据特征预测domain的分类 特征提取我们可以复用initData initData需要按照我们的需求稍微修改一下
def initData(filename, domainlist):
    """Parse *filename* and append one ``Domain`` per record to *domainlist*.

    Handles both ``name,label`` lines and bare ``name`` lines; a line with no
    label field gets the placeholder label ``"?"``. Blank lines and lines
    starting with ``#`` are skipped.

    Args:
        filename: path of the text file to read.
        domainlist: list that receives the parsed ``Domain`` objects.
    """
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            tokens = line.split(",")
            name = tokens[0]
            # Fixed: the original "else" had no colon, which is a SyntaxError.
            if len(tokens) > 1:
                label = tokens[1]
            else:
                label = "?"
            length = len(name)
            num = cal_num(name)
            entropy = cal_entropy(name)
            domainlist.append(Domain(name, label, length, num, entropy))
def main():
    """Outline of the train / predict / write-results flow (parts elided with ``...``)."""
    domainlist1 = []
    # Fixed: the training file is train.txt; the bare name "train" does not
    # match the data file described for this task.
    initData("train.txt", domainlist1)
    # Elided step; the code below expects it to have bound a fitted
    # classifier to ``clf``.
    ...
    domainlist2 = []
    initData("test.txt", domainlist2)
    with open("result.txt", "w") as f:
        for i in domainlist2:
            f.write(i.name)
            f.write(",")
            # A prediction of 0 is written as "notdga", anything else as "dga".
            if clf.predict([i.returnData()])[0] == 0:
                f.write("notdga")
            else:
                f.write("dga")
            f.write("\n")
    ...
完整代码
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import math
class Domain:
    """One domain name together with its label and extracted features."""

    def __init__(self, _name, _label, _length, _num, _entropy):
        """Store the name, label and the three precomputed features."""
        self.name = _name
        self.label = _label
        self.length = _length
        self.num = _num
        self.entropy = _entropy

    def returnData(self):
        """Return the feature vector ``[length, num, entropy]``."""
        return [self.length, self.num, self.entropy]

    def returnLabel(self):
        """Return 1 when the label is ``"dga"``, otherwise 0."""
        return 1 if self.label == "dga" else 0
def cal_num(str):
num = 0
for i in str:
if i.isdigit():
num += 1
return num
def cal_seg(str):
    """Return the number of ``.`` separators in *str*.

    The parameter name shadows the builtin ``str``; it is kept so existing
    callers are unaffected.
    """
    return str.count(".")
def cal_entropy(str):
    """Return the Shannon entropy, in bits, of the letters a-z in *str*.

    The string is lower-cased first, so the count is case-insensitive.
    Digits, dots and every other character outside a-z are ignored.

    Returns:
        The entropy as a float, or 0.0 when *str* contains no a-z letter.
    """
    letter = [0] * 26
    for ch in str.lower():
        # Only a-z have a bucket. The previous isalpha() test also accepted
        # non-ASCII letters, whose index fell outside the 26-slot table.
        if "a" <= ch <= "z":
            letter[ord(ch) - ord("a")] += 1

    sumLetter = sum(letter)
    if sumLetter == 0:
        # No letters at all (e.g. "" or "123"): avoid dividing by zero.
        return 0.0

    h = 0.0
    for count in letter:
        if count:
            p = 1.0 * count / sumLetter
            h += -(p * math.log(p, 2))
    return h
def initData(filename, domainlist):
    """Parse *filename* and append one ``Domain`` per record to *domainlist*.

    Handles both ``name,label`` lines and bare ``name`` lines; a line with no
    label field gets the placeholder label ``"?"``. Blank lines and lines
    starting with ``#`` are skipped. For every name the length, digit count
    and letter entropy are computed and stored on the ``Domain``.

    Args:
        filename: path of the text file to read.
        domainlist: list that receives the parsed ``Domain`` objects.
    """
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            tokens = line.split(",")
            name = tokens[0]
            if len(tokens) > 1:
                label = tokens[1]
            else:
                label = "?"
            length = len(name)
            num = cal_num(name)
            entropy = cal_entropy(name)
            domainlist.append(Domain(name, label, length, num, entropy))
def main():
    """Train on train.txt, classify every domain in test.txt, write result.txt.

    Each output line is ``domain name,label`` where the label is ``notdga``
    for a prediction of 0 and ``dga`` otherwise.
    """
    domainlist1 = []
    initData("train.txt", domainlist1)
    featureMatrix = [item.returnData() for item in domainlist1]
    labelList = [item.returnLabel() for item in domainlist1]

    # random_state is fixed so repeated runs build the same forest.
    clf = RandomForestClassifier(random_state=0)
    clf.fit(featureMatrix, labelList)

    domainlist2 = []
    initData("test.txt", domainlist2)

    # Predict the whole test set in one call instead of once per domain.
    # The call is skipped for an empty test set because predict() rejects
    # input with zero samples.
    testMatrix = [item.returnData() for item in domainlist2]
    predictions = clf.predict(testMatrix) if testMatrix else []

    with open("result.txt", "w") as f:
        for item, predicted in zip(domainlist2, predictions):
            verdict = "notdga" if predicted == 0 else "dga"
            f.write(item.name + "," + verdict + "\n")


if __name__ == '__main__':
    main()
测试方法
python3 test.py
|