Signature Description

// Selects which formula is used to measure the impurity of a set of values.
//
enum class  impurity_type : unsigned char
{
    // Gini index: how often a randomly chosen element from the set would
    // be labeled incorrectly.
    //     Gini = 1 - ∑(Pi ^ 2)
    //
    gini_index = 1,

    // Information entropy: the average level of "information", "surprise",
    // or "uncertainty" inherent to the variable's possible outcomes.
    //     Entropy = -∑(Pi * log2(Pi))
    //
    info_entropy = 2,
};
      
Impurity type

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Declaration of the rolling information-entropy visitor.
//   T: column data type
//   I: index type
//   A: memory alignment boundary for vectors (0 selects the default)
//
template<class T, class I = unsigned long, std::size_t A = 0>
struct EntropyVisitor;

// -------------------------------------

// Short alias for EntropyVisitor; it takes the same template parameters
// with the same defaults.
//
template<class T, class I = unsigned long, std::size_t A = 0>
using ent_v = EntropyVisitor<T, I, A>;
        
This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor calculates the rolling values of information entropy.
In information theory, the entropy of a random variable is the average level of "information", "surprise", or "uncertainty" inherent in the variable's possible outcomes. The concept of information entropy was introduced by Claude Shannon in his 1948 paper.
For example, a fair die (p=1/6 per outcome) has higher entropy than a fair coin (p=1/2 per outcome).
This method is an approximation and is not as computationally intensive as ImpurityVisitor, described below.
    // roll_count: size of the rolling period over which entropy is
    //             calculated (presumably a number of observations -- confirm).
    // log_base:   presumably the base of the logarithm used in the entropy
    //             calculation -- confirm against the implementation.
    //             Defaults to 2.
    explicit
    EntropyVisitor(std::size_t roll_count, T log_base = 2);
        
T: Column data type.
I: Index type.
A: Memory alignment boundary for vectors. Default is system default alignment.
#include <DataFrame/DataFrameMLVisitors.h>

// Declaration of the rolling impurity visitor.
//   T: column data type
//   I: index type
//   A: memory alignment boundary for vectors (0 selects the default)
//
template<class T, class I = unsigned long, std::size_t A = 0>
struct ImpurityVisitor;

// -------------------------------------

// Short alias for ImpurityVisitor; it takes the same template parameters
// with the same defaults.
//
template<class T, class I = unsigned long, std::size_t A = 0>
using impu_v = ImpurityVisitor<T, I, A>;
        
This is a “single action visitor”, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor calculates the rolling impurity of the data column.
Decision trees recursively split features with regard to their target variable’s purity. The algorithm is designed to find the optimal point of the most predictive feature in order to split 1 dataset into 2. These 2 new datasets’ target variable will be more pure than the original dataset’s.
“Pure” is the key word here, however. What does that word mean, exactly? In a general sense “purity” can be thought of as how homogenized a group is. But homogeneity can mean different things depending on which mathematical backbone your decision tree runs on. The 2 most popular backbones for a decision tree’s decisions are Gini Index and Information Entropy.

This method is computationally intensive. It dynamically finds the probability of each feature during each rolling period and applies the formula for the selected method.
Type T must be hashable.
    // roll_count: length of each rolling period over which impurity is
    //             calculated (presumably a number of consecutive values --
    //             confirm).
    // it:         which impurity formula to apply: impurity_type::gini_index
    //             or impurity_type::info_entropy.
    ImpurityVisitor(size_type roll_count, impurity_type it);
        
T: Column data type.
I: Index type.
A: Memory alignment boundary for vectors. Default is system default alignment.
static void test_EntropyVisitor()  {

    std::cout << "\nTesting EntropyVisitor{  } ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466, 123467, 123468,
          123469, 123470, 123471, 123472, 123473, 22, 23, 24, 25, 26, 27, 28
        };
    std::vector<double>         close =
        { 1.80, 2.80, 1.90, 14.00, 1.10, 6.00, 13.00, 8.00, 9.00, 2.80, 1.90, 4.30, 20.00, 1.85, 3.00, 34.00, 67.00, 23.00, 87.00, 9.00, 45.00,
          1.00, 11.00, 456.00, 34.00, 7.00, 7778.00, 5.00
        };
    MyDataFrame                 df;

    df.load_data(std::move(idx), std::make_pair("close", close));

    EntropyVisitor<double>  e_v (5);

    df.single_act_visit<double>("close", e_v);

    assert(e_v.get_result().size() == 28);
    assert(std::isnan(e_v.get_result()[0]));
    assert(std::isnan(e_v.get_result()[7]));
    assert(std::abs(e_v.get_result()[8] - 2.18974) < 0.00001);
    assert(std::abs(e_v.get_result()[10] - 1.98477) < 0.00001);
    assert(std::abs(e_v.get_result()[14] - 1.7154) < 0.0001);
    assert(std::abs(e_v.get_result()[27] - 0.596666) < 0.00001);
    assert(std::abs(e_v.get_result()[25] - 0.822228) < 0.00001);
    assert(std::abs(e_v.get_result()[22] - 1.49397) < 0.0001);
}
// -----------------------------------------------------------------------------

// Exercises ImpurityVisitor (through the impu_v alias) with both impurity
// types on two string columns and one numeric column.
//
// Floating-point comparisons call std::fabs rather than unqualified fabs:
// <cmath> is only required to declare the std:: overloads, so the
// unqualified name is not guaranteed to be visible on every toolchain.
//
static void test_ImpurityVisitor()  {

    std::cout << "\nTesting ImpurityVisitor{  } ..." << std::endl;

    StlVecType<unsigned long>   idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466,
          123467, 123468, 123469, 123470, 123471, 123472, 123473 };
    StlVecType<std::string>     metal = { "Gold", "Gold", "Gold", "Gold" };
    StlVecType<std::string>     metal2 = { "Gold", "Silver", "Silver", "Gold" };
    StlVecType<double>          numbers =
        { 2.5, 2.5, 2.5, -0.1, -1.1, -0.1, -1.1, -1.1, -0.1, 34.5, -1.1,
          34.5, 34.5, 34.5, 0.123, 0.123, 0.123, 0.5, 0.4, 2.5, 0.5 };
    MyDataFrame                 df;

    df.load_data(std::move(idx), std::make_pair("Numbers", numbers));
    // The metal columns hold 4 values while the index holds 21; they are
    // loaded without NaN padding.
    df.load_column("Metals", std::move(metal), nan_policy::dont_pad_with_nans);
    df.load_column("Metals2", std::move(metal2), nan_policy::dont_pad_with_nans);

    // "Metals" holds a single distinct value, so one period of 4 is
    // expected to have zero impurity under both formulas.
    impu_v<std::string> impu (4, impurity_type::gini_index);
    const auto          result = df.single_act_visit<std::string>("Metals", impu).get_result();

    assert(result.size() == 1);
    assert(result[0] == 0);

    impu_v<std::string> impu2 (4, impurity_type::info_entropy);
    const auto          result2 = df.single_act_visit<std::string>("Metals", impu2).get_result();

    assert(result2.size() == 1);
    assert(result2[0] == 0);

    // "Metals2" is an even two-way split: Gini 0.5 and entropy 1.0 are
    // expected.
    impu_v<std::string> impu3 (4, impurity_type::gini_index);
    const auto          result3 = df.single_act_visit<std::string>("Metals2", impu3).get_result();

    assert(result3.size() == 1);
    assert(result3[0] == 0.5);

    impu_v<std::string> impu4 (4, impurity_type::info_entropy);
    const auto          result4 = df.single_act_visit<std::string>("Metals2", impu4).get_result();

    assert(result4.size() == 1);
    assert(result4[0] == 1.0);

    // Numeric column, rolling count of 3 over 21 values: 19 results are
    // expected.
    impu_v<double>  impu5 (3, impurity_type::gini_index);
    const auto      result5 = df.single_act_visit<double>("Numbers", impu5).get_result();

    assert(result5.size() == 19);
    assert(result5[0] == 0);
    assert(std::fabs(result5[1] - 0.4444) < 0.0001);
    assert(std::fabs(result5[2] - 0.6667) < 0.0001);
    assert(std::fabs(result5[3] - 0.4444) < 0.0001);
    assert(std::fabs(result5[4] - 0.4444) < 0.0001);
    assert(std::fabs(result5[18] - 0.6667) < 0.0001);
    assert(std::fabs(result5[17] - 0.6667) < 0.0001);
    assert(std::fabs(result5[16] - 0.6667) < 0.0001);
    assert(std::fabs(result5[15] - 0.4444) < 0.0001);
    assert(result5[14] == 0);
    assert(std::fabs(result5[13] - 0.4444) < 0.0001);

    impu_v<double>  impu6 (3, impurity_type::info_entropy);
    const auto      result6 = df.single_act_visit<double>("Numbers", impu6).get_result();

    assert(result6.size() == 19);
    assert(result6[0] == 0);
    assert(std::fabs(result6[1] - 0.9183) < 0.0001);
    assert(std::fabs(result6[2] - 1.585) < 0.0001);
    assert(std::fabs(result6[3] - 0.9183) < 0.0001);
    assert(std::fabs(result6[4] - 0.9183) < 0.0001);
    assert(std::fabs(result6[18] - 1.585) < 0.0001);
    assert(std::fabs(result6[17] - 1.585) < 0.0001);
    assert(std::fabs(result6[16] - 1.585) < 0.0001);
    assert(std::fabs(result6[15] - 0.9183) < 0.0001);
    assert(result6[14] == 0);
    assert(std::fabs(result6[13] - 0.9183) < 0.0001);
}
C++ DataFrame