随机森林算法及其实现
算法实现
- 先实现随机化,有放回抽取样本,以及随机抽取属性(无放回)
// Draws a bootstrap-style sample of training rows.
//
// Performs getRows() random draws (with replacement) over the row indices of
// trainingSet, then returns the *distinct* indices that were hit, in ascending
// order. Duplicate draws are collapsed, so the result usually holds fewer than
// getRows() entries.
//
// The sample is also printed to std::cout.
//
// Returns a heap-allocated IntArray; the caller is responsible for deleting it.
IntArray* RandomForestClassifier::bootStrap()
{
    const int numRows = trainingSet->getRows();

    // picked[r] is set to 1 once row r has been drawn at least once.
    int* picked = new int[numRows];
    memset(picked, 0, numRows * sizeof(int));

    int numDistinct = 0;
    int drawsLeft = numRows;
    while (drawsLeft-- > 0)
    {
        const int row = rand() % numRows;
        if (picked[row] != 0)
        {
            continue; // already in the sample; a repeated draw adds nothing
        }
        picked[row] = 1;
        ++numDistinct;
    }

    // Compact the flag array into a list of the selected row indices.
    IntArray* sample = new IntArray(numDistinct);
    int slot = 0;
    for (int row = 0; row < numRows; ++row)
    {
        if (picked[row] == 1)
        {
            sample->setValue(slot, row);
            ++slot;
        }
    }

    std::cout << sample->toString() << std::endl;
    delete[] picked;
    return sample;
}
// Selects a random subset of attribute indices without replacement.
//
// The subset size is sqrt(numAttributes) truncated to an int. Indices are
// drawn uniformly via rand(); a draw that hits an already-selected attribute
// is repeated until a fresh one is found. The selected indices are returned
// in ascending order.
//
// Returns a heap-allocated IntArray; the caller is responsible for deleting it.
IntArray* RandomForestClassifier::getAttributes()
{
    const int subsetSize = static_cast<int>(sqrt(numAttributes));

    // chosen[a] is 1 when attribute a is part of the subset.
    int* chosen = new int[numAttributes];
    memset(chosen, 0, numAttributes * sizeof(int));

    IntArray* subset = new IntArray(subsetSize);

    for (int pickedSoFar = 0; pickedSoFar < subsetSize; ++pickedSoFar)
    {
        int candidate;
        do
        {
            candidate = rand() % numAttributes;
        } while (chosen[candidate] == 1); // redraw on collision
        chosen[candidate] = 1;
    }

    // Compact the flag array into the result, lowest index first.
    int slot = 0;
    for (int attr = 0; attr < numAttributes; ++attr)
    {
        if (chosen[attr] != 1)
        {
            continue;
        }
        subset->setValue(slot++, attr);
    }

    delete[] chosen;
    return subset;
}
- 构造有穷棵树
// Builds and trains a single tree of the forest.
//
// A fresh row sample (bootStrap) and a fresh attribute subset (getAttributes)
// are drawn, handed to a new Tree together with the training data, labels and
// class count, and the tree is trained.
//
// Returns the trained, heap-allocated Tree; the caller owns it.
Tree* RandomForestClassifier::buildTree()
{
    // Rows are sampled before attributes; both consume rand(), so the order
    // determines which random numbers each one sees.
    IntArray* sampledRows = bootStrap();
    IntArray* sampledAttributes = getAttributes();

    Tree* result = new Tree(trainingSet, trainingLables, sampledRows, sampledAttributes, numClasses);
    result->train();

    // The index arrays are released as soon as training is done.
    // NOTE(review): this presumes Tree does not keep these pointers past
    // train() — confirm against the Tree implementation.
    delete sampledRows;
    delete sampledAttributes;

    return result;
}
// Trains the forest: allocates an array of numTrees tree pointers in `trees`
// and fills each slot with an independently built tree (see buildTree()).
void RandomForestClassifier::train()
{
    trees = new Tree*[numTrees];

    int built = 0;
    while (built < numTrees)
    {
        trees[built] = buildTree();
        ++built;
    }
}
- 预测以及投票
// Majority vote over a list of predicted class labels.
//
// paraLabels: one predicted class index per voter.
//
// Returns the class index with the most votes. On a tie, the class that
// reached the top count first (in iteration order) wins, because the leader
// is replaced only on a strictly greater count. Returns 0 when there are no
// valid votes.
//
// Labels outside [0, numClasses) are ignored so that they cannot index past
// the end of the tally array.
int RandomForestClassifier::vote(IntArray* paraLabels)
{
    int* classCounts = new int[numClasses];
    memset(classCounts, 0, numClasses * sizeof(int));

    int winner = 0;
    const int numVotes = paraLabels->getLength();
    for (int i = 0; i < numVotes; i++)
    {
        const int label = paraLabels->getValue(i);
        if (label < 0 || label >= numClasses)
        {
            continue; // not a valid class index; skip rather than corrupt memory
        }

        classCounts[label]++;
        if (classCounts[winner] < classCounts[label])
        {
            winner = label;
        }
    }

    delete[] classCounts;
    // The original returned the function's own name (`vote`) here instead of
    // the winning class index.
    return winner;
}
// Predicts the class of one instance with the whole forest.
//
// Every tree in `trees` predicts a label for paraInstance; the labels are
// collected and the final answer is decided by vote().
//
// Returns the label chosen by vote().
int RandomForestClassifier::predict(DoubleMatrix* paraInstance)
{
    // One ballot per tree.
    IntArray* ballots = new IntArray(numTrees);

    int treeIdx = 0;
    while (treeIdx < numTrees)
    {
        ballots->setValue(treeIdx, trees[treeIdx]->predict(paraInstance));
        ++treeIdx;
    }

    const int winner = vote(ballots);
    delete ballots;
    return winner;
}
实现过程出现的问题以及解决方式
实现之后发现准确率不太理想,一方面可能是算法实现中还存在一些小问题,另一方面我猜测也和数据集的规模有关:这里用于测试的数据集只有 17 个样本,划分成训练集和测试集之后就更少了。
数据集:
0, 0, 0, 0, 0, 0, 1
1, 0, 1, 0, 0, 0, 1
1, 0, 0, 0, 0, 0, 1
0, 0, 1, 0, 0, 0, 1
2, 0, 0, 0, 0, 0, 1
0, 1, 0, 0, 1, 1, 1
1, 1, 0, 1, 1, 1, 1
1, 1, 0, 0, 1, 0, 1
1, 1, 1, 1, 1, 0, 0
0, 2, 2, 0, 2, 1, 0
2, 2, 2, 2, 2, 0, 0
2, 0, 0, 2, 2, 1, 0
0, 1, 0, 1, 0, 0, 0
2, 1, 1, 1, 0, 0, 0
1, 1, 0, 0, 1, 1, 0
2, 0, 0, 2, 2, 0, 0
0, 0, 1, 1, 1, 0, 0
然后又测试了那个weather数据集,结果也是不太理想,如下图: 数据集:
0, 0, 0, 0, 0
0, 0, 0, 1, 0
1, 0, 0, 0, 1
2, 1, 0, 0, 1
2, 2, 1, 0, 1
2, 2, 1, 1, 0
1, 2, 1, 1, 1
0, 1, 0, 0, 0
0, 2, 1, 0, 1
2, 1, 1, 0, 1
0, 1, 1, 1, 1
1, 1, 0, 1, 1
1, 0, 1, 0, 1
2, 1, 0, 1, 0
应该是决策树里哪个部分有偏差,当然也参考了我导师的文章,日撸 Java 三百行(61-70天,决策树与集成学习),估计是在转 C++ 的过程里有些细节问题有疏忽,因此还得继续找 bug 。。。
|