随机森林算法及其实现

算法实现

先实现随机化：有放回地抽取样本，以及无放回地随机抽取属性。

/*
* Draw a bootstrap sample of the training set: as many row indices as there are
* training rows are drawn with rand() (with replacement), and every row that was
* drawn at least once is kept exactly once.
* The selected indices are also printed to std::cout.
* Return: The distinct drawn row indices in ascending order (allocated with new).
*/
IntArray* RandomForestClassifier::bootStrap()
{
	const int numRows = trainingSet->getRows();

	// picked[r] becomes 1 once row r has been drawn at least once.
	int* picked = new int[numRows];
	memset(picked, 0, numRows * sizeof(int));

	// One draw per training row; repeated draws of the same row are counted once.
	int numDistinct = 0;
	for (int draw = 0; draw < numRows; draw++)
	{
		const int row = rand() % numRows;
		if (picked[row] != 0)
		{
			continue;
		}// Of if
		picked[row] = 1;
		numDistinct++;
	}// Of for

	// Copy the marked rows out in ascending index order.
	IntArray* resInstances = new IntArray(numDistinct);
	int writePos = 0;
	for (int row = 0; row < numRows; row++)
	{
		if (picked[row] == 1)
		{
			resInstances->setValue(writePos, row);
			writePos++;
		}// Of if
	}// Of for

	std::cout << resInstances->toString() << std::endl;

	delete[] picked;

	return resInstances;
}// Of bootStrap


/*
* Pick a random subset of distinct attribute indices (sampling without replacement).
* The subset size is sqrt(numAttributes) truncated to an integer.
* Return: The chosen attribute indices in ascending order (allocated with new).
*/
IntArray* RandomForestClassifier::getAttributes()
{
	const int subsetSize = static_cast<int>(sqrt(numAttributes));

	// chosen[a] is 1 for every attribute that is already part of the subset.
	int* chosen = new int[numAttributes];
	memset(chosen, 0, numAttributes * sizeof(int));
	IntArray* resAttributes = new IntArray(subsetSize);

	for (int picked = 0; picked < subsetSize; picked++)
	{
		// Redraw until an attribute that has not been taken yet comes up.
		int candidate;
		do
		{
			candidate = rand() % numAttributes;
		} while (chosen[candidate] == 1);
		chosen[candidate] = 1;
	}// Of for

	// Emit the marked attributes in ascending index order.
	int writePos = 0;
	for (int attribute = 0; attribute < numAttributes; attribute++)
	{
		if (chosen[attribute] != 1)
		{
			continue;
		}// Of if
		resAttributes->setValue(writePos, attribute);
		writePos++;
	}// Of for

	delete[] chosen;
	return resAttributes;
}// Of getAttributes

构造有限棵树

/*
* Build and train a single tree on a random subset of rows and attributes.
* Return: The trained tree (allocated with new).
*/
Tree* RandomForestClassifier::buildTree()
{
	// Rows are sampled before attributes; both helpers consume rand(),
	// so this order determines which random numbers each of them sees.
	IntArray* sampledRows = bootStrap();
	IntArray* sampledAttributes = getAttributes();

	Tree* resTree = new Tree(trainingSet, trainingLables, sampledRows, sampledAttributes, numClasses);
	resTree->train();

	// NOTE(review): the index arrays are freed right after training. This is only
	// safe if Tree does not keep these pointers beyond train() - confirm in Tree.
	delete sampledRows;
	delete sampledAttributes;
	return resTree;
}// Of buildTree

/*
* Train the forest: allocate the tree array and fill it with numTrees
* independently built trees.
*/
void RandomForestClassifier::train()
{
	trees = new Tree * [numTrees];

	int built = 0;
	while (built < numTrees)
	{
		trees[built] = buildTree();
		built++;
	}// Of while
}// Of train

预测以及投票

/*
* Majority vote: count how often each class label occurs and return the most
* frequent one.
* paraLabels: The labels to count (predict() passes one predicted label per tree).
* Return: The label with the highest count. If several labels share the highest
*         count, the one that reached that count first while scanning paraLabels
*         wins. Returns 0 when paraLabels contains no valid label.
*/
int RandomForestClassifier::vote(IntArray* paraLabels)
{
	int* tempCountClasses = new int[numClasses];
	memset(tempCountClasses, 0, numClasses * sizeof(int));

	int best = 0;
	const int numLabels = paraLabels->getLength();
	for (int i = 0; i < numLabels; i++)
	{
		const int label = paraLabels->getValue(i);

		// The label is used as an array index, so anything outside
		// [0, numClasses) must be skipped instead of corrupting the heap.
		if (label < 0 || label >= numClasses)
		{
			continue;
		}// Of if

		tempCountClasses[label]++;
		// Strict comparison: a label only takes the lead once it is strictly ahead.
		if (tempCountClasses[best] < tempCountClasses[label])
		{
			best = label;
		}// Of if
	}// Of for

	delete[] tempCountClasses;
	// Return the winning label (the original returned the function name `vote`).
	return best;
}// Of vote

/*
* Predict the label of one instance: every tree of the forest predicts a label
* and the result is determined by vote().
* paraInstance: The instance to classify; it is passed unchanged to each tree.
* Return: The label chosen by vote().
*/
int RandomForestClassifier::predict(DoubleMatrix* paraInstance)
{
	// One slot per tree for its individual prediction.
	IntArray* treeVotes = new IntArray(numTrees);

	int treeIndex = 0;
	while (treeIndex < numTrees)
	{
		const int treeLabel = trees[treeIndex]->predict(paraInstance);
		treeVotes->setValue(treeIndex, treeLabel);
		treeIndex++;
	}// Of while

	const int resLabel = vote(treeVotes);

	delete treeVotes;

	return resLabel;
}// Of predict

实现过程出现的问题以及解决方式

实现之后发现准确率不太理想，有可能是算法实现中还有一些小问题；同时我在想会不会和数据集也有关系：这里测试用的数据集只有 17 个样本，划分成训练集和测试集后就更少了。

数据集：

0, 0, 0, 0, 0, 0, 1
1, 0, 1, 0, 0, 0, 1
1, 0, 0, 0, 0, 0, 1
0, 0, 1, 0, 0, 0, 1
2, 0, 0, 0, 0, 0, 1
0, 1, 0, 0, 1, 1, 1
1, 1, 0, 1, 1, 1, 1
1, 1, 0, 0, 1, 0, 1
1, 1, 1, 1, 1, 0, 0
0, 2, 2, 0, 2, 1, 0
2, 2, 2, 2, 2, 0, 0
2, 0, 0, 2, 2, 1, 0
0, 1, 0, 1, 0, 0, 0
2, 1, 1, 1, 0, 0, 0
1, 1, 0, 0, 1, 1, 0
2, 0, 0, 2, 2, 0, 0
0, 0, 1, 1, 1, 0, 0

本文采用 C++ 实现。目前已完成 ID3 决策树算法的大部分代码，不过测试还有点小问题，正在修改。同时需要考虑数据的特性，目前只实现了基于离散属性的决策树。在这个过程中需要结合该框架下的基础类（比如 DoubleMatrix 类）进行编码，因此需要为它们添加很多原本不存在的方法，感觉会使代码比较臃肿，所以有些方法还是放在了当前实现的类中单独使用。
在这里插入图片描述

然后又测试了那个weather数据集，结果也是不太理想，如下图：
在这里插入图片描述
数据集：

0, 0, 0, 0, 0
0, 0, 0, 1, 0
1, 0, 0, 0, 1
2, 1, 0, 0, 1
2, 2, 1, 0, 1
2, 2, 1, 1, 0
1, 2, 1, 1, 1
0, 1, 0, 0, 0
0, 2, 1, 0, 1
2, 1, 1, 0, 1
0, 1, 1, 1, 1
1, 1, 0, 1, 1
1, 0, 1, 0, 1
2, 1, 0, 1, 0

应该是决策树里某个部分有偏差。实现时参考了我导师的文章《日撸 Java 三百行（61-70天，决策树与集成学习）》，估计是在转成 C++ 的过程中疏忽了一些细节，因此还得继续找 bug……