Signature | Description | Parameters |
---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<size_t K, typename T, typename I = unsigned long> struct KMeansVisitor; |
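This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This functor class finds the K means in the data. It can also cluster the data around those means. The constructor takes two parameters: the number of iterations to run and, optionally, a function that computes the distance between two values of type T (the example below passes 1000 and point_distance).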
There is also a get_clusters() method that returns an array of K VectorPtrViews, which contain the data clustered around the K means. The first element in each VectorPtrView is the mean and the rest are the data points belonging to that cluster. |
K: Number of means to find; T: Column data type; I: Index type |
struct Point { double x { 0.0 }; double y { 0.0 }; Point() = default; Point(double xx, double yy) : x(xx), y(yy) { } Point(const Point &) = default; Point &operator = (const Point &) = default; friend Point operator + (const Point &lhs, const Point &rhs) { return (Point(lhs.x + rhs.x, lhs.y + rhs.y)); } friend Point operator / (const Point &lhs, double rhs) { return (Point(lhs.x / rhs, lhs.y / rhs)); } template<typename S> friend S &operator << (S &s, const Point &rhs) { return (s << rhs.x << ", " << rhs.y); } }; static double point_distance(const Point &lhs, const Point &rhs) { return ((lhs.x - rhs.x) * (lhs.x - rhs.x) + (lhs.y - rhs.y) * (lhs.y - rhs.y)); } static void test_k_means() { std::cout << "\nTesting k-means visitor ..." << std::endl; const size_t item_cnt = 1024; MyDataFrame df; RandGenParams<double> p; p.mean = 1.0; // Default p.std = 0.005; p.seed = 10; df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt, 1), std::make_pair("col1", gen_lognormal_dist<double>(item_cnt, p))); KMeansVisitor<5, double> km_visitor(1000); df.single_act_visit<double>("col1", km_visitor); // Using the calculated means, separate the given column into clusters const auto clusters = km_visitor.get_clusters(df.get_index(), df.get_column<double>("col1")); bool found = false; /* for (auto iter : clusters) { if (::fabs(iter[0] - 1.89348) < 0.00001) { if (::fabs(iter[6] - 1.44231) < 0.00001) { found = true; break; } } } assert(found); found = false; for (auto iter : clusters) { if (::fabs(iter[0] - 0.593126) < 0.00001) { if (::fabs(iter[2] - 0.950026) < 0.00001) { found = true; break; } } } assert(found); found = false; for (auto iter : clusters) { if (::fabs(iter[0] - 14.2245) < 0.0001) { found = true; break; } } assert(found); found = false; for (auto iter : clusters) { if (::fabs(iter[0] - 6.90427) < 0.00001) { found = true; break; } } assert(found); found = false; for (auto iter : clusters) { if (::fabs(iter[0] - 3.8146) < 0.00001) { found = true; break; } } assert(found); 
*/ // Now try with Points // p.seed = 200; auto x_vec = gen_lognormal_dist<double>(item_cnt, p); p.seed = 4356; auto y_vec = gen_lognormal_dist<double>(item_cnt, p); std::vector<Point> points; points.reserve(item_cnt); for (size_t i = 0; i < item_cnt; ++i) points.push_back(Point(x_vec[i], y_vec[i])); df.load_column("point_col", std::move(points)); KMeansVisitor<5, Point> km_visitor2(1000, point_distance); df.single_act_visit<Point>("point_col", km_visitor2); // Using the calculated means, separate the given column into clusters const auto clusters2 = km_visitor2.get_clusters(df.get_index(), df.get_column<Point>("point_col")); for (auto iter : clusters2) { for (auto iter2 : iter) { std::cout << iter2.x << " | " << iter2.y << ", "; } std::cout << "\n\n" << std::endl; } /* found = false; for (auto iter : clusters2) { if (::fabs(iter[0].x - 18.9556) < 0.1 && ::fabs(iter[0].y - 2.17537) < 0.1) { if (::fabs(iter[6].x - 16.7309) < 0.1 && ::fabs(iter[6].y - 0.872376) < 0.1) { found = true; break; } } } assert(found); */ /* found = false; for (auto iter : clusters2) { if (::fabs(iter[0].x - 0.943977) < 0.1 && ::fabs(iter[0].y - 0.910989) < 0.1) { if (::fabs(iter[2].x - 0.30509) < 0.1 && ::fabs(iter[2].y - 1.69017) < 0.1) { found = true; break; } } } assert(found); found = false; for (auto iter : clusters2) { if (::fabs(iter[0].x - 4.31973) < 0.1 && ::fabs(iter[0].y - 1.24214) < 0.1) { if (::fabs(iter[3].x - 4.68381) < 0.1 && ::fabs(iter[3].y - 0.453632) < 0.1) { found = true; break; } } } assert(found); found = false; for (auto iter : clusters2) { if (::fabs(iter[0].x - 1.5694) < 0.1 && ::fabs(iter[0].y - 15.3338) < 0.1) { found = true; break; } } assert(found); found = false; for (auto iter : clusters2) { if (::fabs(iter[0].x - 1.29624) < 0.1 && ::fabs(iter[0].y - 4.13919) < 0.1) { found = true; break; } } assert(found); */ }