K-均值(K-means)算法 C++:如何在集群之间交换点
K-means algorithm C++, swapping out Points?
对于我的任务,我必须对600个数据点应用K-means算法,这些数据点都有60个维度(或者属性,如果你愿意的话)。我将600个数据点存储到6个集群中(因此K=6),这就是我目前所拥有的:
#include <array> //std::array
#include <cmath> //std::cmath
#include <cstdlib> //std::rand, std::srand, std::exit, EXIT_SUCCESS, EXIT_FAILURE
#include <ctime> //std::time
#include <fstream> //std::ifstream
#include <iostream> //std::cout
#include <sstream> //std::istringstream
#include <string> //std::string
#include <vector> //std::vector
#define K 6
#define MAX_ITERATIONS 10000000
#define NUM_ATTRIBUTES 60
// One data sample: NUM_ATTRIBUTES numeric attributes plus a class label.
struct Point{
    // Attribute values. Value-initialised so that a default-constructed Point
    // holds zeros instead of indeterminate doubles (a line with fewer than
    // NUM_ATTRIBUTES numbers would otherwise leave garbage behind).
    std::array<double, NUM_ATTRIBUTES> point{};
    // Class label of the sample; empty until something assigns it.
    std::string classType;
};
// A cluster: the points currently assigned to it together with its centroid.
struct Cluster{
// Points assigned to this cluster, stored by value.
std::vector<Point> points;
// Centre of the cluster, represented as a Point.
Point centroid;
};
// Returns rand() reduced modulo max, i.e. a value in [0, max) for positive max.
// max must not be zero.
int randNumGenerator(int max){
    return rand() % max;
}
// Fills p from one text line, labels it, and appends it to data.
//
// p      working copy to fill; attributes for which the line supplies no
//        number keep whatever value p arrived with.
// line   whitespace-separated numbers; at most NUM_ATTRIBUTES are consumed.
// data   receives the finished point.
// index  1-based row number. Rows 1-100 get the first label, 101-200 the
//        second, and so on up to 600; any other index leaves classType as is.
void setData(Point p, std::string line, std::vector<Point> &data, int index){
    const char *const labels[] = {"Normal", "Cyclic", "Increasing trend",
                                  "Decreasing trend", "Upward shift", "Downward shift"};

    // Once one extraction fails the stream stays failed, so stopping at the
    // first failure is the same as attempting every remaining slot.
    std::stringstream s(line);
    int slot = 0;
    double value;
    while(slot < NUM_ATTRIBUTES && s >> value){
        p.point[slot] = value;
        ++slot;
    }

    // Each block of 100 consecutive rows shares one label.
    if(index >= 1 && index <= 600){
        p.classType = labels[(index - 1) / 100];
    }

    data.push_back(p);
}
void initializeCentroids(std::vector<Point> &points, int num_clusters,
std::vector<Point> ¢roids){
Point p;
std::vector<bool> numsUsedAlready(points.size());
for(int i = 0; i < num_clusters; i++){
int randNum = randNumGenerator(points.size());
while(numsUsedAlready[randNum]){
randNum = randNumGenerator(points.size());
}
numsUsedAlready[randNum] = true;
p = points[randNum];
centroids.push_back(p);
}
}
double calculateDistance(Point p, Point centroid){
double ret = 0;
for(int i = 0; i < p.point.size(); i++){
double distance = p.point[i] - centroid.point[i];
ret += distance * distance;
}
return sqrt(ret);
}
void setCentroids(std::vector<Cluster> &clusters, std::vector<Point> centroids){
for(int i = 0; i < centroids.size(); i++){
Cluster c;
c.points.push_back(centroids[i]);
c.centroid = centroids[i];
clusters.push_back(c);
}
}
void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
int sendToCluster = 99999999;
for(int index = 0; index < points.size(); index++){
double minDist = 99999999999;
Point p = points[index];
for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
//std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
if(tempDist < minDist){
minDist = tempDist;
sendToCluster = clusterNum;
}
}
//std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl;
clusters[sendToCluster].points.push_back(p);
}
}
void updateCentroid(std::vector<Cluster> &clusters){
for(int i = 0; i < clusters.size(); i++){
Cluster c = clusters[i];
for(int j = 0; j < NUM_ATTRIBUTES; j++){
double avg = 0;
for(int h = 0; h < c.points.size(); h++){
Point p = c.points[h];
avg += p.point[j];
}
double oldCentroidValue = c.centroid.point[j];
c.centroid.point[j] = avg / c.points.size();
std::cout << "old: " << oldCentroidValue << " new: " << c.centroid.point[j] << std::endl;
}
}
}
void k_clustering(std::vector<Point> &points, int num_clusters){
std::vector<Point> centroids;
initializeCentroids(points, num_clusters, centroids);
std::vector<Cluster> clusters;
setCentroids(clusters, centroids);
/*for(int i = 0; i < clusters.size(); i++){
for(int j = 0; j < NUM_ATTRIBUTES; j++){
std::cout << clusters[i].centroid.point[j] << " ";
}
std::cout << std::endl;
}
*/
setClusters(clusters, points);
updateCentroid(clusters);
for(int i = 0; i < clusters.size(); i++){
std::cout << i << " " << clusters[i].points.size() << std::endl;
}
}
void readInputFile(std::ifstream &file, std::vector<Point> &data){
std::string line;
int counter = 0;
while(getline(file,line)){
counter++;
Point p;
setData(p, line, data, counter);
}
}
// Prints the command-line usage hint to standard output.
void usageString(){
    const char *const usage = "Usage: output <input_file>";
    std::cout << usage << std::endl;
}
// Returns the input file name given on the command line (argv[1]).
// If no file name was supplied, prints the usage hint and terminates the
// program with a failure status so that callers and scripts can detect the
// error.
char* checkFile(int argc, char *argv[]){
    if(argc < 2){
        usageString();
        std::exit(EXIT_FAILURE);
    }
    return argv[1];
}
// Entry point: loads the data file named on the command line and clusters it
// into K groups. Returns EXIT_FAILURE if the file cannot be opened and
// EXIT_SUCCESS otherwise.
int main(int argc, char *argv[]){
    char *inputfile = checkFile(argc, argv);
    std::ifstream input(inputfile);
    if(!input.is_open()){
        std::cerr << "Error: Data file doesn't exist" << std::endl;
        return EXIT_FAILURE;
    }
    // Seed rand() from the clock so each run picks different starting centroids.
    std::srand(static_cast<unsigned int>(std::time(nullptr)));
    std::vector<Point> data;
    readInputFile(input, data);
    k_clustering(data, K);
    //printData(data);
    //Attribute closest cluster to each data point
    //For each point, calculate distance to each centroid
    //Assign point to cluster (and update centroid value, by finding mean values of all points)
    //Repeat until nothing changed or after a certain number of iterations
    return EXIT_SUCCESS;
}
然而,我注意到我的代码只能完成一次迭代。在第一次迭代中,我的 setClusters 函数会把每个 Point 压入(push_back)某个集群;在之后的迭代里,我必须把点移动到另一个集群(如果需要的话)。但我不清楚除了先从原集群中删除该点、再把它压入另一个集群之外,还能怎么做。我相信有一种更高效的方法可以把 Point 交换到另一个集群。
我还没有时间通读全部代码,但如果您想优化 setClusters(),也许可以尝试 C++ 的移动语义和 vector 的 emplace_back()。
void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
int sendToCluster = 99999999;
for(int index = 0; index < points.size(); index++){
double minDist = 99999999999;
Point& p = points[index]; // use a reference to avoid copying, unless you don't want to alter the original "points" vector
for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
//std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
if(tempDist < minDist){
minDist = tempDist;
sendToCluster = clusterNum;
}
}
//std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl;
clusters[sendToCluster].points
.emplace_back( // construct the new item in place
std::move(p) // by "moving" from p
);
}
points.clear(); // clear the input vector, unless you don't want to alter it. cf. the comment on using references
}
另一个想法是让 Cluster 保存点的"索引"而不是实际的"点"。在这种情况下,您可以将该结构声明为:
// Cluster variant that stores indices of its member points instead of copies
// of the points themselves.
struct Cluster{
// Indices of the member points (the accompanying setClusters pushes positions
// within its points argument).
std::vector<int> pointIndices;
// Centre of the cluster, represented as a Point.
Point centroid;
};
然后,在 setClusters() 中,您可以直接压入索引:
void setClusters(std::vector<Cluster> &clusters, const std::vector<Point> points){ // points can be const now
int sendToCluster = 99999999;
for(int index = 0; index < points.size(); index++){
double minDist = 99999999999;
const Point& p = points[index]; // use a reference to avoid copying
for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
//std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
if(tempDist < minDist){
minDist = tempDist;
sendToCluster = clusterNum;
}
}
//std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl;
clusters[sendToCluster].pointIndices
.push_back(index); // cheap compared to pushing a "Point"
}
}
我希望它能有所帮助。
相关文章:
- 在编译时,C++项目抛出错误 C2228,这是预期的,因为控件在运行时未达到该点
- 我无法在某个点(从用户那里)获得输入,即使没有错误弹出编译
- 如何找出浮点异常的位置
- 绘制的 OpenGL 点消失,绘制调用和交换缓冲区问题
- 为什么将浮点变量与另一个浮点变量的缩减为int给出了完全不同的答案
- OpenCV不失真点没有给出失真模型的精确逆
- Qt - 在课堂上连接 - 出了点问题 - 这个
- 我如何限制计算出的浮点值的小数位数
- 当INT初始化为浮动点值时,为什么填充初始化不会给出错误
- 为什么trace()抛出浮点底流异常
- 既然C++知道类型,它能推断出点和箭头吗?
- 快速排序中的交换函数给出错误的输出
- 为什么在浮点比较中,最后一个小数位数为5的浮点值能给出正确的输出,而在其他情况下却不是
- 粗制滥造的蒙特卡洛整合出了更多点的问题
- 在该点打印回溯,在捕获块中抛出了一个例外
- 在使用交换和弹出进行迭代时擦除矢量中的元素
- 找出数组中有多少不同的浮点值
- 为什么表达式 a = a + b - ( b = a ) 在 c++ 中给出序列点警告?
- K-表示算法C++,交换出点
- 在小于0 (N)的时间内找出点是否在N个(可能重叠)矩形中的一个内