   随机森林算法及其实现(2)




  1. 先实现随机化,有放回抽取样本,以及随机抽取属性(无放回)
/**
 * Random instances are obtained through sampling with replacement.
 *
 * As many random row indices are drawn as the training set has rows; every
 * index that is hit at least once is kept exactly once, so the result holds
 * the distinct sampled indices in ascending order. The result is also
 * printed to standard output.
 *
 * Return: Available instances, as a newly allocated IntArray. The caller
 *         is responsible for deleting it.
 */
IntArray* RandomForestClassifier::bootStrap() {
	int count = 0;
	int tempIndex;
	IntArray* resInstances;
	int length = trainingSet->getRows();

	// tempIndices[i] == 1 marks row i as drawn at least once.
	int* tempIndices = new int[length];
	memset(tempIndices, 0, length * sizeof(int));

	for (int i = 0; i < length; i++) {
		tempIndex = rand() % length;
		if (tempIndices[tempIndex] == 0) {
			tempIndices[tempIndex] = 1;
			// Count each distinct row once so the result array is sized
			// correctly; without this the array would have length 0.
			count++;
		}// Of if
	}// Of for

	resInstances = new IntArray(count);

	// Copy the marked row indices into the result in ascending order.
	for (int i = 0, j = 0; i < length; i++) {
		if (tempIndices[i] == 1) {
			resInstances->setValue(j++, i);
		}// Of if
	}// Of for

	std::cout << resInstances->toString() << std::endl;

	delete[] tempIndices;

	return resInstances;
}// Of bootStrap

/**
 * Random attributes are obtained through sampling without replacement.
 *
 * floor(sqrt(numAttributes)) distinct attribute indices are chosen at
 * random and returned in ascending order.
 *
 * Return: Available attributes, as a newly allocated IntArray. The caller
 *         is responsible for deleting it.
 */
IntArray* RandomForestClassifier::getAttributes() {
	int tempIndex;
	// Truncation toward zero is intended: the subset size is floor(sqrt(n)).
	int numAvailableAttributes = static_cast<int>(sqrt(static_cast<double>(numAttributes)));

	// tempAttributes[i] == 1 marks attribute i as selected.
	int* tempAttributes = new int[numAttributes];
	memset(tempAttributes, 0, numAttributes * sizeof(int));
	IntArray* resAttributes = new IntArray(numAvailableAttributes);

	for (int i = 0; i < numAvailableAttributes; i++) {
		tempIndex = rand() % numAttributes;
		// Redraw until an attribute that has not been selected yet is hit.
		// This terminates because fewer attributes are requested than exist.
		while (tempAttributes[tempIndex] == 1) {
			tempIndex = rand() % numAttributes;
		}// Of while
		tempAttributes[tempIndex] = 1;
	}// Of for

	// Copy the selected attribute indices into the result in ascending order.
	tempIndex = 0;
	for (int i = 0; i < numAttributes; i++) {
		if (tempAttributes[i] == 1) {
			resAttributes->setValue(tempIndex++, i);
		}// Of if
	}// Of for

	delete[] tempAttributes;
	return resAttributes;
}// Of getAttributes
  2. 构造有限棵树
/**
 * Build a tree.
 *
 * A bootstrap sample of instances and a random subset of attributes are
 * drawn, and a Tree is constructed from the training data restricted to them.
 *
 * Return: A tree, newly allocated. The caller is responsible for deleting it.
 */
Tree* RandomForestClassifier::buildTree() {
	IntArray* availableInstances = bootStrap();
	IntArray* availableAttributes = getAttributes();

	Tree* tree = new Tree(trainingSet, trainingLables, availableInstances, availableAttributes, numClasses);

	// NOTE(review): both index arrays are freed here, so this assumes the Tree
	// constructor copies (or fully consumes) them rather than keeping the
	// pointers - confirm against the Tree implementation.
	delete availableInstances;
	delete availableAttributes;
	return tree;
}// Of buildTree

* Build a given number of trees.
void RandomForestClassifier::train()
	trees = new Tree * [numTrees];
	for (int i = 0; i < numTrees; i++)
		trees[i] = buildTree();
	}// Of for
}// Of train
  3. 预测以及投票
* Return the most frequent label by counting the number of various labels
* Return: Label.
int RandomForestClassifier::vote(IntArray* paraLabels)
	int* tempCountClasses = new int[numClasses];
	memset(tempCountClasses, 0, numClasses * sizeof(int));
	int max = 0;
	for (int i = 0; i < paraLabels->getLength(); i++)
		if (tempCountClasses[max] < tempCountClasses[paraLabels->getValue(i)])
			max = paraLabels->getValue(i);
		}// Of if
	}// Of for
	delete[] tempCountClasses;
	return vote;
}// Of if

* Predict.
int RandomForestClassifier::predict(DoubleMatrix* paraInstance)
	IntArray* tempLabels = new IntArray(numTrees);

	for (int i = 0; i < numTrees; i++)
		tempLabels->setValue(i, trees[i]->predict(paraInstance));
	}// Of for

	int resLable = vote(tempLabels);

	delete tempLabels;

	return resLable;
}// Of predict




0, 0, 0, 0, 0, 0, 1
1, 0, 1, 0, 0, 0, 1
1, 0, 0, 0, 0, 0, 1
0, 0, 1, 0, 0, 0, 1
2, 0, 0, 0, 0, 0, 1
0, 1, 0, 0, 1, 1, 1
1, 1, 0, 1, 1, 1, 1
1, 1, 0, 0, 1, 0, 1
1, 1, 1, 1, 1, 0, 0
0, 2, 2, 0, 2, 1, 0
2, 2, 2, 2, 2, 0, 0
2, 0, 0, 2, 2, 1, 0
0, 1, 0, 1, 0, 0, 0
2, 1, 1, 1, 0, 0, 0
1, 1, 0, 0, 1, 1, 0
2, 0, 0, 2, 2, 0, 0
0, 0, 1, 1, 1, 0, 0

本文使用 C++ 实现。目前已完成 ID3 决策树算法的大部分代码,不过测试还有点小问题,正在修改。同时需要考虑数据的特性:目前只实现了基于离散属性的决策树。在这个过程中需要结合该框架下的基础类(比如 DoubleMatrix 类)进行编码,因此需要添加很多原本不存在的方法,感觉会使代码比较臃肿,所以有些方法还是放在了当前实现的类中单独使用。


0, 0, 0, 0, 0
0, 0, 0, 1, 0
1, 0, 0, 0, 1
2, 1, 0, 0, 1
2, 2, 1, 0, 1
2, 2, 1, 1, 0
1, 2, 1, 1, 1
0, 1, 0, 0, 0
0, 2, 1, 0, 1
2, 1, 1, 0, 1
0, 1, 1, 1, 1
1, 1, 0, 1, 1
1, 0, 1, 0, 1
2, 1, 0, 1, 0

应该是决策树里某个部分有偏差。实现时参考了我导师的文章《日撸 Java 三百行》(61-70 天,决策树与集成学习),估计是在转写为 C++ 的过程中疏忽了一些细节,因此还得继续找 bug……

加:2021-09-27 14:21:50  更:2021-09-27 14:23:48 
