Signature Description

// Selects how two DataFrames are joined.
// Stored in a single unsigned char; the numeric values are explicit.
enum class join_policy : unsigned char
{
    inner_join      = 1,  // inner join
    left_join       = 2,  // left join
    right_join      = 3,  // right join
    left_right_join = 4   // This is merge
};
        
Enumerated type that specifies how two DataFrames are joined

Signature Description Parameters

// Joins self (lhs) with rhs on the column called `name`, as directed by jp.
// The result is a StdDataFrame indexed by unsigned integers.
template<typename RHS_T, typename T, typename ... Ts>
StdDataFrame<unsigned int>
join_by_column(const RHS_T &rhs,
               const char *name,
               join_policy jp) const;
        
It joins the data between self (lhs) and rhs and returns the joined data in a StdDataFrame, based on specification in join_policy.
The returned DataFrame is indexed by a sequence of unsigned integers from 0 to N. The returned DataFrame will have at least two columns, named lhs.INDEX and rhs.INDEX, containing the lhs and rhs indices based on the join policy.
The following conditions must be met for this method
to compile and work properly:
  1. Ordering (< > != ==) must be well defined for the type of the named column.
  2. Both lhs and rhs must contain the named column
  3. In both lhs and rhs, columns with the same name must have the same type
RHS_T: Type of DataFrame rhs
T: Type of the named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
rhs: The rhs DataFrame
name: Name of the column which the join will be based on
jp: Specifies how to join. For example inner join, or left join, etc. (See join_policy definition)

// Joins self (lhs) with rhs on their indices, as directed by jp.
// The result uses the same index type I as the inputs.
template<typename RHS_T, typename ... Ts>
StdDataFrame<I>
join_by_index(const RHS_T &rhs, join_policy jp) const;
        
It joins the data between self (lhs) and rhs and returns the joined data
in a StdDataFrame, based on specification in join_policy.
The following conditions must be met for this method
to compile and work properly:
  1. I type must be the same between lhs and rhs.
  2. Ordering (< > != ==) must be well defined for type I
  3. In both lhs and rhs, columns with the same name must have the same type
RHS_T: Type of DataFrame rhs
Ts: List all the types of all data columns. A type should be specified in the list only once.
rhs: The rhs DataFrame
jp: Specifies how to join. For example inner join, or left join, etc. (See join_policy definition)
static void test_index_inner_join()  {

    std::cout << "\nTesting Index Inner Join ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123456, 123451, 123452, 123453, 123454, 123455, 123450, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double> d1 = { 7, 2, 3, 4, 5, 6, 1, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double> d2 = { 14, 9, 10, 11, 12, 13, 8, 20, 22, 23, 30, 31, 32, 1.89};
    std::vector<double> d3 = { 21, 16, 15, 18, 19, 16, 15, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>  idx2 =
        { 123452, 123453, 123455, 123458, 123466, 223450, 223451, 223454, 223456, 223457, 223459, 223460, 223462, 223461 };
    std::vector<double> d12 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 114, 113 };
    std::vector<double> d22 = { 18, 19, 110, 111, 112, 113, 114, 120, 122, 123, 130, 131, 11.89, 132 };
    std::vector<double> d32 = { 115, 116, 115, 118, 119, 116, 121, 10.34, 11.56, 10.34, 12.3, 119.0, 10.34 };
    std::vector<int>    i12 = { 122, 123, 124, 125, 199 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout);

    MyDataFrame join_df =
        df.join_by_index<decltype(df2), double, int>(df2, join_policy::inner_join);

    std::cout << "Now The joined DF:" << std::endl;
    join_df.write<std::ostream, double, int>(std::cout);
}

// -----------------------------------------------------------------------------

static void test_index_left_join()  {

    std::cout << "\nTesting Index Left Join ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89};
    std::vector<double> d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>  idx2 =
        { 123452, 123453, 123455, 123458, 123466, 223450, 223451, 223454, 223456, 223457, 223459, 223460, 223461, 223462 };
    std::vector<double> d12 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 113, 114 };
    std::vector<double> d22 = { 18, 19, 110, 111, 112, 113, 114, 120, 122, 123, 130, 131, 132, 11.89 };
    std::vector<double> d32 = { 115, 116, 115, 118, 119, 116, 121, 10.34, 11.56, 10.34, 12.3, 10.34, 119.0 };
    std::vector<int>    i12 = { 122, 123, 124, 125, 199 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout);

    MyDataFrame join_df =
        df.join_by_index<decltype(df2), double, int>(df2, join_policy::left_join);

    std::cout << "Now The joined DF:" << std::endl;
    join_df.write<std::ostream, double, int>(std::cout);
}

// -----------------------------------------------------------------------------

static void test_index_right_join()  {

    std::cout << "\nTesting Index Right Join ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89};
    std::vector<double> d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>  idx2 =
        { 123452, 123453, 123455, 123458, 123466, 223450, 223451, 223454, 223456, 223457, 223459, 223460, 223461, 223462 };
    std::vector<double> d12 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 113, 114 };
    std::vector<double> d22 = { 18, 19, 110, 111, 112, 113, 114, 120, 122, 123, 130, 131, 132, 11.89 };
    std::vector<double> d32 = { 115, 116, 115, 118, 119, 116, 121, 10.34, 11.56, 10.34, 12.3, 10.34, 119.0 };
    std::vector<int>    i12 = { 122, 123, 124, 125, 199 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout);

    MyDataFrame join_df =
        df.join_by_index<decltype(df2), double, int>(df2, join_policy::right_join);

    std::cout << "Now The joined DF:" << std::endl;
    join_df.write<std::ostream, double, int>(std::cout);
}

// -----------------------------------------------------------------------------

static void test_index_left_right_join()  {

    std::cout << "\nTesting Index Left Right Join ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123466, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123450 };
    std::vector<double> d1 = { 14, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1 };
    std::vector<double> d2 = { 1.89, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 8 };
    std::vector<double> d3 = { 19.0, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 15.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>  idx2 =
        { 123452, 123453, 123455, 123458, 123466, 223450, 223451, 223454, 223456, 223457, 223459, 223461, 223460, 223462 };
    std::vector<double> d12 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 113, 112, 114 };
    std::vector<double> d22 = { 18, 19, 110, 111, 112, 113, 114, 120, 122, 123, 130, 132, 131, 11.89 };
    std::vector<double> d32 = { 115, 116, 115, 118, 119, 116, 121, 10.34, 11.56, 10.34, 10.34, 12.3, 119.0 };
    std::vector<int>    i12 = { 122, 123, 124, 125, 199 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout);

    MyDataFrame join_df =
        df.join_by_index<decltype(df2), double, int>(df2, join_policy::left_right_join);

    std::cout << "Now The joined DF:" << std::endl;
    join_df.write<std::ostream, double, int>(std::cout);
}

// -----------------------------------------------------------------------------

static void test_join_by_column()  {

    std::cout << "\nTesting join by column ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89};
    std::vector<double> d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>  idx2 =
        { 123452, 123453, 123455, 123458, 123466, 223450, 223451, 223454, 223456, 223457, 223459, 223460, 223461, 223462 };
    std::vector<double> d12 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 113, 114 };
    std::vector<double> d22 = { 8, 19, 110, 111, 9, 113, 114, 99, 122, 123, 130, 131, 20, 11.89 };
    std::vector<double> d32 = { 115, 116, 115, 118, 119, 116, 121, 10.34, 11.56, 10.34, 12.3, 10.34, 119.0 };
    std::vector<int>    i12 = { 122, 123, 124, 125, 199 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    StdDataFrame<unsigned int>  inner_result =
        df.join_by_column<decltype(df2), double, double, int>(df2, "col_2", join_policy::inner_join);

    assert(inner_result.get_index().size() == 3);
    assert(inner_result.get_column<double>("xcol_1")[2] == 113.0);
    assert(inner_result.get_column<double>("xcol_3")[1] == 119.0);
    assert(inner_result.get_column<double>("col_1")[2] == 8.0);
    assert(inner_result.get_column<double>("col_3")[0] == 15.0);
    assert(inner_result.get_column<int>("rhs.col_4")[2] == 0);
    assert(inner_result.get_column<int>("lhs.col_4")[0] == 22);
    assert(inner_result.get_column<unsigned long>("rhs.INDEX")[1] == 123466);
    assert(inner_result.get_column<unsigned long>("lhs.INDEX")[2] == 123457);

    StdDataFrame<unsigned int>  left_result =
        df.join_by_column<decltype(df2), double, double, int>(df2, "col_2", join_policy::left_join);

    assert(left_result.get_index().size() == 14);
    assert(std::isnan(left_result.get_column<double>("xcol_1")[5]));
    assert(left_result.get_column<double>("xcol_3")[8] == 119.0);
    assert(left_result.get_column<double>("col_1")[13] == 13.0);
    assert(left_result.get_column<double>("col_3")[9] == 1.56);
    assert(left_result.get_column<int>("rhs.col_4")[2] == 199);
    assert(left_result.get_column<int>("lhs.col_4")[5] == 99);
    assert(left_result.get_column<unsigned long>("rhs.INDEX")[3] == 0);
    assert(left_result.get_column<unsigned long>("lhs.INDEX")[11] == 123460);

    StdDataFrame<unsigned int>  right_result =
        df.join_by_column<decltype(df2), double, double, int>(df2, "col_2", join_policy::right_join);

    assert(right_result.get_index().size() == 14);
    assert(right_result.get_column<double>("xcol_1")[5] == 18.0);
    assert(std::isnan(right_result.get_column<double>("xcol_3")[2]));
    assert(right_result.get_column<double>("col_1")[4] == 8.0);
    assert(std::isnan(right_result.get_column<double>("col_3")[5]));
    assert(right_result.get_column<int>("rhs.col_4")[2] == 0);
    assert(right_result.get_column<int>("lhs.col_4")[5] == 0);
    assert(right_result.get_column<unsigned long>("rhs.INDEX")[3] == 123453);
    assert(right_result.get_column<unsigned long>("lhs.INDEX")[11] == 0);

    StdDataFrame<unsigned int>  left_right_result =
        df.join_by_column<decltype(df2), double, double, int>(df2, "col_2", join_policy::left_right_join);

    assert(left_right_result.get_index().size() == 25);
    assert(left_right_result.get_column<double>("xcol_1")[2] == 15.0);
    assert(left_right_result.get_column<double>("xcol_3")[1] == 115.0);
    assert(left_right_result.get_column<double>("col_1")[2] == 2.0);
    assert(std::isnan(left_right_result.get_column<double>("col_3")[0]));
    assert(left_right_result.get_column<int>("rhs.col_4")[2] == 199);
    assert(left_right_result.get_column<int>("lhs.col_4")[0] == 0);
    assert(left_right_result.get_column<unsigned long>("rhs.INDEX")[1] == 123452);
    assert(left_right_result.get_column<unsigned long>("lhs.INDEX")[2] == 123451);
}
C++ DataFrame