K-表示算法C++,交换出点

K-means algorithm C++, swapping out Points?

本文关键字:交换 出点 C++ 表示 算法      更新时间:2023-10-16

对于我的任务,我必须对600个数据点应用K-means算法,这些数据点都有60个维度(或者属性,如果你愿意的话)。我将600个数据点存储到6个集群中(因此K=6),这就是我目前所拥有的:

#include <iostream> //std::cout
#include <fstream> //std::ifstream
#include <string> //std::string
#include <sstream> //std::istringstream
#include <vector> //std::vector
#include <cmath> //std::cmath
#include <array> //std::array
#define K 6
#define MAX_ITERATIONS 10000000
#define NUM_ATTRIBUTES 60
struct Point{
        std::array<double, NUM_ATTRIBUTES> point;
        std::string classType;
};
struct Cluster{
        std::vector<Point> points;
        Point centroid;
};
int randNumGenerator(int max){
        int num = (rand() % max);
        return num;
}
void setData(Point p, std::string line, std::vector<Point> &data, int index){
        std::stringstream s(line);
        std::string classes[] = {"Normal", "Cyclic", "Increasing trend", 
                                "Decreasing trend", "Upward shift", "Downward shift"};
        for(int i = 0; i < NUM_ATTRIBUTES; i++){
                double num;
                if(s >> num){
                        p.point[i] = num;
                }
        }
        if(index > 0 && index <= 100){
                p.classType = classes[0];
        }
        else if(index > 100 && index <= 200){
                p.classType = classes[1];
        }
        else if(index > 200 && index <= 300){
                p.classType = classes[2];
        }
        else if(index > 300 && index <= 400){
                p.classType = classes[3];
        }
        else if(index > 400 && index <= 500){
                p.classType = classes[4];
        }
        else if(index > 500 && index <= 600){
                p.classType = classes[5];
        }
        data.push_back(p);
}
void initializeCentroids(std::vector<Point> &points, int num_clusters,
                                std::vector<Point> &centroids){
        Point p;
        std::vector<bool> numsUsedAlready(points.size());
        for(int i = 0; i < num_clusters; i++){
                int randNum = randNumGenerator(points.size());
                while(numsUsedAlready[randNum]){
                        randNum = randNumGenerator(points.size());
                }
                numsUsedAlready[randNum] = true;
                p = points[randNum];
                centroids.push_back(p);
        }
}
double calculateDistance(Point p, Point centroid){
        double ret = 0;
        for(int i = 0; i < p.point.size(); i++){
                double distance = p.point[i] - centroid.point[i];
                ret += distance * distance;
        }
        return sqrt(ret);
}
void setCentroids(std::vector<Cluster> &clusters, std::vector<Point> centroids){
        for(int i = 0; i < centroids.size(); i++){
                Cluster c;
                c.points.push_back(centroids[i]);
                c.centroid = centroids[i];
                clusters.push_back(c);
        }
}
void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                Point p = points[index];
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].points.push_back(p);
        }
}
void updateCentroid(std::vector<Cluster> &clusters){
        for(int i = 0; i < clusters.size(); i++){
                Cluster c = clusters[i];
                for(int j = 0; j < NUM_ATTRIBUTES; j++){
                        double avg = 0;
                        for(int h = 0; h < c.points.size(); h++){
                                Point p = c.points[h];
                                avg += p.point[j];
                        }
                        double oldCentroidValue = c.centroid.point[j];
                        c.centroid.point[j] = avg / c.points.size();
                        std::cout << "old: " << oldCentroidValue << " new: " << c.centroid.point[j] << std::endl;
                }
        }
}
void k_clustering(std::vector<Point> &points, int num_clusters){
        std::vector<Point> centroids;
        initializeCentroids(points, num_clusters, centroids);
        std::vector<Cluster> clusters;
        setCentroids(clusters, centroids);
        /*for(int i = 0; i < clusters.size(); i++){
                for(int j = 0; j < NUM_ATTRIBUTES; j++){
                        std::cout << clusters[i].centroid.point[j] << " ";
                }
                std::cout << std::endl;
        }
        */
        setClusters(clusters, points);
        updateCentroid(clusters);
        for(int i = 0; i < clusters.size(); i++){
                std::cout << i << " " << clusters[i].points.size() << std::endl;
        }
}
void readInputFile(std::ifstream &file, std::vector<Point> &data){
        std::string line;
        int counter = 0;
        while(getline(file,line)){
                counter++;
                Point p;
                setData(p, line, data, counter);
        }
}
void usageString(){
        std::cout << "Usage: output <input_file>" << std::endl;
}
char* checkFile(int argc, char *argv[]){
        if(argc < 2){
                usageString();
                exit(0);
        }
        else{
                char *inputfile = argv[1];
                return inputfile;
        }
}
int main(int argc, char *argv[]){
        char *inputfile;
        inputfile = checkFile(argc, argv);
        std::ifstream input(inputfile);
        if(!input.is_open()){
                std::cerr << "Error: Data file doesn't exist" << std::endl;
                return EXIT_FAILURE;
        }
        srand(time(NULL));
        std::vector<Point> data;
        readInputFile(input, data);
        k_clustering(data, K);
        //printData(data);
        //Attribute closest cluster to each data point
          //For each point, calculate distance to each centroid
            //Assign point to cluster (and update centroid value, by finding mean values of all points)
          //Repeat until nothing changed or after a certain number of iterations
        return 1;
}

然而,我注意到我的代码只适用于一次迭代。但在第一次迭代之后,在我的"setClusters"函数中,我正在推进一个Point。我必须将该点移动到另一个集群(如果需要的话),但我很困惑如何做到这一点,而不需要从该集群中删除该点,然后将其推送到另一个。我相信有一种更有效的方法可以将Point交换到另一个集群。

我还没有时间浏览整个代码,但如果您对优化setClusters()感兴趣,那么也许您可以尝试C++的移动语义和vectoremplace_back()

void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                Point& p = points[index];  // use a reference to avoid copying, unless you don't want to alter the original "points" vector
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].points
                                       .emplace_back(    // construct the new item in place
                                           std::move(p)  // by "moving" from p
                                        );
        }
        points.clear();  // clear the input vector, unless you don't want to alter it. cf. the comment on using references
}

另一个想法是Cluster保持"索引"而不是实际的"点"。在这种情况下,您将声明该结构类似于:

struct Cluster{
        std::vector<int> pointIndices;
        Point centroid;
};

然后,在setClusters()中,您可以直接推送索引:

void setClusters(std::vector<Cluster> &clusters, const std::vector<Point> points){  // points can be const now
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                const Point& p = points[index];  // use a reference to avoid copying
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].pointIndices
                                       .push_back(index);  // cheap compared to pushing a "Point"
        }
}

我希望它能有所帮助。