Signature Description

// Selects how a vector of values is converted to a probability distribution
// (all outputs are between 0 and 1 and sum up to 1).
// Some methods do not work with negative values or zero; see the per-method
// notes below.
//
enum class  prob_dist_type : unsigned char  {
    arithmetic = 1,  // yi = xi / ∑ xi
                     // All values must be >= 0
    log = 2,         // yi = loge(xi) / ∑ loge(xi)
                     // All values must be >= 1
    softmax = 3,     // yi = e^xi / ∑ e^xi
    pow2 = 4,        // yi = 2^xi / ∑ 2^xi
    pow10 = 5,       // yi = 10^xi / ∑ 10^xi
};
This specifies how to convert a vector of values to a probability distribution.
Values in a probability distribution are between 0 and 1 and they all sum up to 1.
Note that some methods do not work with negative values or zero.

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

// Single action visitor that converts a column of values into a probability
// distribution, using the method selected by a prob_dist_type value.
//   T: Column data type
//   I: Index type
//   A: Memory alignment boundary for vectors. Default is system default
//      alignment
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct ProbabilityDistVisitor;

// -------------------------------------

// Shorthand alias for ProbabilityDistVisitor; takes the same template
// parameters with the same defaults.
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using pd_v = ProbabilityDistVisitor<T, I, A>;
        
This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This class converts the given column of values into a probability distribution based on one of the above methods. Values in a probability distribution are between 0 and 1 and they all add up to 1.
Please note that some of these methods require that values be non-negative (>= 0) or at least 1 (>= 1). If you don’t adhere to the requirement, you will get garbage results.
    // pdtype: The prob_dist_type method used to convert the column into a
    //         probability distribution
    explicit
    ProbabilityDistVisitor(prob_dist_type pdtype);
        
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
#include <DataFrame/DataFrameStatsVisitors.h>

// Single action visitor that normalizes column data by doing
// y = (x - min) / (max - min).
//   T: Column data type
//   I: Index type
//   A: Memory alignment boundary for vectors. Default is system default
//      alignment
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct NormalizeVisitor;

// -------------------------------------

// Shorthand alias for NormalizeVisitor; takes the same template parameters
// with the same defaults.
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using norm_v = NormalizeVisitor<T, I, A>;
        
This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This functor class normalizes column data by doing y = (x - min) / (max - min).
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
#include <DataFrame/DataFrameStatsVisitors.h>

// Single action visitor that standardizes column data by doing
// y = (x - mean) / std.
//   T: Column data type
//   I: Index type
//   A: Memory alignment boundary for vectors. Default is system default
//      alignment
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct StandardizeVisitor;

// -------------------------------------

// Shorthand alias for StandardizeVisitor; takes the same template parameters
// with the same defaults.
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using stand_v = StandardizeVisitor<T, I, A>;
        
This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This functor class standardizes column data by doing y = (x - mean) / std.
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_ProbabilityDistVisitor()  {

    std::cout << "\nTesting ProbabilityDistVisitor{  } ..." << std::endl;

    MyDataFrame                df;
    StlVecType<unsigned long>  idxvec = { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
    StlVecType<double>         dblvec = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
    StlVecType<double>         dblvec2 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    StlVecType<double>         dblvec3 = { 0, 1, -2, 3, 4, 5, 6, 7, -8, 9, 10, -11, 12, -13, 14};

    df.load_data(std::move(idxvec),
                 std::make_pair("dbl_col", dblvec),
                 std::make_pair("dbl_col_2", dblvec2),
                 std::make_pair("dbl_col_3", dblvec3));

    pd_v<double, unsigned long, 256>    pd { prob_dist_type::arithmetic };
    double                              sum { 0 };

    df.single_act_visit<double>("dbl_col", pd);
    assert(pd.get_result().size() == 15);
    for (const auto val : pd.get_result())  {
        assert(val >= 0 && val <= 1.0);
        sum += val;
    }
    assert(std::abs(sum - 1.0) < 0.0001);

    pd_v<double, unsigned long, 256>    pd2 { prob_dist_type::log };

    df.single_act_visit<double>("dbl_col_2", pd2);
    assert(pd2.get_result().size() == 15);
    sum = 0;
    for (const auto val : pd2.get_result())  {
        assert(val >= 0 && val <= 1.0);
        sum += val;
    }
    assert(std::abs(sum - 1.0) < 0.0001);

    pd_v<double, unsigned long, 256>    pd3 { prob_dist_type::softmax };

    df.single_act_visit<double>("dbl_col_3", pd3);
    assert(pd3.get_result().size() == 15);
    sum = 0;
    for (const auto val : pd3.get_result())  {
        assert(val >= 0 && val <= 1.0);
        sum += val;
    }
    assert(std::abs(sum - 1.0) < 0.0001);

    pd_v<double, unsigned long, 256>    pd4 { prob_dist_type::pow2 };

    df.single_act_visit<double>("dbl_col_3", pd4);
    assert(pd4.get_result().size() == 15);
    sum = 0;
    for (const auto val : pd4.get_result())  {
        assert(val >= 0 && val <= 1.0);
        sum += val;
    }
    assert(std::abs(sum - 1.0) < 0.0001);

    pd_v<double, unsigned long, 256>    pd5 { prob_dist_type::pow10 };

    df.single_act_visit<double>("dbl_col_3", pd5);
    assert(pd5.get_result().size() == 15);
    sum = 0;
    for (const auto val : pd5.get_result())  {
        assert(val >= 0 && val <= 1.0);
        sum += val;
    }
    assert(std::abs(sum - 1.0) < 0.0001);
}

// -----------------------------------------------------------------------------
static void test_NormalizeVisitor()  {

    std::cout << "\nTesting NormalizeVisitor{ } ..." << std::endl;

    std::vector<unsigned long>  ulgvec2 =
        { 123450, 123451, 123452, 123450, 123455, 123450, 123449, 123450, 123451, 123450, 123452, 123450, 123455, 123450,
          123454, 123450, 123450, 123457, 123458, 123459, 123450, 123441, 123442, 123432, 123450, 123450, 123435, 123450 };
    std::vector<double>         dblvec =
        { 1.2345, 2.2345, 3.2345, 4.2345, 5.2345, 3.0, 0.9999, 10.0, 4.25, 0.009, 8.0, 2.2222, 3.3333, 15.6,
          11.0, 5.25, 1.009, 2.111, 9.0, 3.2222, 4.3333, 12.0, 6.25, 2.009, 3.111, 10.0, 4.2222, 5.3333 };

    MyDataFrame df;

    df.load_data(std::move(ulgvec2), std::make_pair("dbl_col", dblvec));

    NormalizeVisitor<double>    norm_v;
    StandardizeVisitor<double>  stand_v;
    auto                        result = df.single_act_visit<double>("dbl_col", norm_v).get_result();
    std::vector<double>         norm_result = {
        0.078603, 0.142743, 0.206882, 0.271022, 0.335161, 0.191841, 0.0635559,
        0.640818, 0.272016, 0, 0.512539, 0.141954, 0.213219, 1, 0.704958,
        0.336155, 0.0641396, 0.134821, 0.576679, 0.206093, 0.277359, 0.769098,
        0.400295, 0.128279, 0.198961, 0.640818, 0.270233, 0.341498,
    };
    std::vector<double>         stand_result = {
        -1.00542, -0.744444, -0.48347, -0.222497, 0.0384758, -0.544669, -1.06664,
        1.28214, -0.218452, -1.32524, 0.760197, -0.747654, -0.457686, 2.74359,
        1.54312, 0.0425209, -1.06427, -0.776674, 1.02117, -0.48668, -0.196713,
        1.80409, 0.303494, -0.803293, -0.515701, 1.28214, -0.225707, 0.06426
    };

    for (size_t idx = 0; idx < result.size(); ++idx)
       assert(fabs(result[idx] - norm_result[idx]) < 0.00001);
    result = df.single_act_visit<double>("dbl_col", stand_v).get_result();
    for (size_t idx = 0; idx < result.size(); ++idx)
       assert(fabs(result[idx] - stand_result[idx]) < 0.00001);
}
C++ DataFrame