Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

template<size_t K, typename T, typename I = unsigned long>
struct KMeansVisitor;
        
This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This functor class finds the K means in the data. It can also cluster the data around the means.
The constructor takes two parameters
  1. Number of iterations
  2. A function to calculate the distance between two data points of type T, with a default value
    KMeansVisitor(std::size_t num_of_iter,
                  distance_func f =
                      [](const T &x, const T &y) -> double {
                          return ((x - y) * (x - y));
                      })
            
The result type is an array of K means of type T.
There is also a get_clusters() method that returns an array of K VectorPtrViews which contain the data clustered around the K-Means. The first element in each VectorPtrView is the mean and the rest are the data points belonging to that cluster.
K: Number of means to find
T: Column data type
I: Index type
// A simple 2-D point with public coordinates, both defaulting to 0.0.
// It supports component-wise addition, division by a scalar and streaming
// as "x, y".
//
struct  Point  {

    double  x { 0.0 };
    double  y { 0.0 };

    Point() = default;
    Point(double xx, double yy) : x(xx), y(yy)  {   }
    Point(const Point &) = default;
    Point &operator = (const Point &) = default;

    // Component-wise sum of the two points
    //
    friend Point operator + (const Point &lhs, const Point &rhs)  {

        return (Point(lhs.x + rhs.x, lhs.y + rhs.y));
    }

    // Divides both coordinates by rhs. There is no guard against rhs == 0;
    // the result then follows normal floating-point division rules.
    //
    friend Point operator / (const Point &lhs, double rhs)  {

        return (Point(lhs.x / rhs, lhs.y / rhs));
    }

    // Writes the point as "x, y" and returns the stream that was passed in.
    //
    // The chained insertions yield a reference to the stream's base class
    // (e.g. std::ostream &), which cannot be returned as S & when S is a
    // derived stream such as std::ostringstream or std::ofstream. Returning
    // s itself keeps the operator usable with any stream type.
    //
    template<typename S>
    friend S &operator << (S &s, const Point &rhs)  {

        s << rhs.x << ", " << rhs.y;
        return (s);
    }
};

// Returns the squared Euclidean distance between two points, i.e. the sum of
// the squared coordinate differences. No square root is taken.
//
static double point_distance(const Point &lhs, const Point &rhs)  {

    const double    x_diff = lhs.x - rhs.x;
    const double    y_diff = lhs.y - rhs.y;

    return (x_diff * x_diff + y_diff * y_diff);
}

static void test_k_means()  {

    std::cout << "\nTesting k-means visitor ..." << std::endl;

    const size_t            item_cnt = 1024;
    MyDataFrame             df;
    RandGenParams<double>   p;

    p.mean = 1.0;  // Default
    p.std = 0.005;
    p.seed = 10;

    df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt, 1),
                 std::make_pair("col1", gen_lognormal_dist<double>(item_cnt, p)));

    KMeansVisitor<5, double>    km_visitor(1000);

    df.single_act_visit<double>("col1", km_visitor);

    // Using the calculated means, separate the given column into clusters
    const auto  clusters = km_visitor.get_clusters(df.get_index(), df.get_column<double>("col1"));
    bool        found = false;

/*
    for (auto iter : clusters)  {
        if (::fabs(iter[0] - 1.89348) < 0.00001)  {
            if (::fabs(iter[6] - 1.44231) < 0.00001)  {
                found = true;
                break;
            }
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters)  {
        if (::fabs(iter[0] - 0.593126) < 0.00001)  {
            if (::fabs(iter[2] - 0.950026) < 0.00001)  {
                found = true;
                break;
            }
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters)  {
        if (::fabs(iter[0] - 14.2245) < 0.0001)  {
            found = true;
            break;
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters)  {
        if (::fabs(iter[0] - 6.90427) < 0.00001)  {
            found = true;
            break;
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters)  {
        if (::fabs(iter[0] - 3.8146) < 0.00001)  {
            found = true;
            break;
        }
    }
    assert(found);
*/

    // Now try with Points
    //
    p.seed = 200;

    auto    x_vec = gen_lognormal_dist<double>(item_cnt, p);

    p.seed = 4356;

    auto                y_vec = gen_lognormal_dist<double>(item_cnt, p);
    std::vector<Point>  points;

    points.reserve(item_cnt);
    for (size_t i = 0; i < item_cnt; ++i)
        points.push_back(Point(x_vec[i], y_vec[i]));
    df.load_column("point_col", std::move(points));

    KMeansVisitor<5, Point> km_visitor2(1000, point_distance);

    df.single_act_visit<Point>("point_col", km_visitor2);

    // Using the calculated means, separate the given column into clusters
    const auto  clusters2 =
        km_visitor2.get_clusters(df.get_index(), df.get_column<Point>("point_col"));

    for (auto iter : clusters2)  {
        for (auto iter2 : iter)  {
            std::cout << iter2.x << " | " << iter2.y << ", ";
        }
        std::cout << "\n\n" << std::endl;
    }

/*
    found = false;
    for (auto iter : clusters2)  {
        if (::fabs(iter[0].x - 18.9556) < 0.1 &&
            ::fabs(iter[0].y - 2.17537) < 0.1)  {
            if (::fabs(iter[6].x - 16.7309) < 0.1 &&
                ::fabs(iter[6].y - 0.872376) < 0.1)  {
                found = true;
                break;
            }
        }
    }
    assert(found);
*/
/*
    found = false;
    for (auto iter : clusters2)  {
        if (::fabs(iter[0].x - 0.943977) < 0.1 &&
            ::fabs(iter[0].y - 0.910989) < 0.1)  {
            if (::fabs(iter[2].x - 0.30509) < 0.1 &&
                ::fabs(iter[2].y - 1.69017) < 0.1)  {
                found = true;
                break;
            }
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters2)  {
        if (::fabs(iter[0].x - 4.31973) < 0.1 &&
            ::fabs(iter[0].y - 1.24214) < 0.1)  {
            if (::fabs(iter[3].x - 4.68381) < 0.1 &&
                ::fabs(iter[3].y - 0.453632) < 0.1)  {
                found = true;
                break;
            }
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters2)  {
        if (::fabs(iter[0].x - 1.5694) < 0.1 &&
            ::fabs(iter[0].y - 15.3338) < 0.1)  {
            found = true;
            break;
        }
    }
    assert(found);
    found = false;
    for (auto iter : clusters2)  {
        if (::fabs(iter[0].x - 1.29624) < 0.1 &&
            ::fabs(iter[0].y - 4.13919) < 0.1)  {
            found = true;
            break;
        }
    }
    assert(found);
*/
}