Signature Description

// Policy passed to remove_duplicates(): selects which row of each set of
// duplicated rows survives in the returned DataFrame.
enum class remove_dup_spec : unsigned char  {
    // Retain the first of the duplicated rows
    keep_first = 1,
    // Retain the last of the duplicated rows
    keep_last = 2,
    // Retain none of the duplicated rows
    keep_none = 3
};
Specifies which duplicated row to keep

Signature Description Parameters

// One-column overload: returns a new DataFrame with duplicate rows removed.
// Rows are duplicates when their values in column `name` (and the index,
// if include_index is true) are equal; rds selects which duplicate to keep.
template<typename T, typename ... Ts>
[[nodiscard]] DataFrame
remove_duplicates(const char *name,
                  bool include_index,
                  remove_dup_spec rds) const;
        
It removes duplicate rows and returns a new DataFrame. Duplication is determined by the given column. remove_dup_spec determines which of the duplicated rows to keep.

NOTE: The given column type must be hash-able and must have equality (==) operator well defined.
T: Type of the named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
name: Name of the data column
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep

// Two-column overload: returns a new DataFrame with duplicate rows removed.
// Rows are duplicates when their values in both named columns (and the
// index, if include_index is true) are equal; rds selects which to keep.
template<typename T1, typename T2, typename ... Ts>
[[nodiscard]] DataFrame
remove_duplicates(const char *name1, const char *name2,
                  bool include_index,
                  remove_dup_spec rds) const;
        
It removes duplicate rows and returns a new DataFrame. Duplication is determined by the two given columns. remove_dup_spec determines which of the duplicated rows to keep.

NOTE: The two given column types must be hash-able and must have equality (==) operator well defined.
T1: Type of the first named column
T2: Type of the second named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
name1: Name of the first data column
name2: Name of the second data column
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep

// Three-column overload: same as the one- and two-column versions, but
// duplication is determined by the three named columns (and the index,
// if include_index is true).
template<typename T1, typename T2, typename T3,
         typename ... Ts>
[[nodiscard]] DataFrame
remove_duplicates(const char *name1, const char *name2,
                  const char *name3,
                  bool include_index,
                  remove_dup_spec rds) const;
        
Same as above, but there are 3 columns involved.
T1: Type of the first named column
T2: Type of the second named column
T3: Type of the third named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
name1: Name of the first data column
name2: Name of the second data column
name3: Name of the third data column
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep

// Four-column overload: same as the versions with fewer columns, but
// duplication is determined by the four named columns (and the index,
// if include_index is true).
template<typename T1, typename T2, typename T3, typename T4,
         typename ... Ts>
[[nodiscard]] DataFrame
remove_duplicates(const char *name1, const char *name2,
                  const char *name3, const char *name4,
                  bool include_index,
                  remove_dup_spec rds) const;
        
Same as above, but there are 4 columns involved.
T1: Type of the first named column
T2: Type of the second named column
T3: Type of the third named column
T4: Type of the fourth named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
name1: Name of the first data column
name2: Name of the second data column
name3: Name of the third data column
name4: Name of the fourth data column
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep

// Five-column overload: same as the versions with fewer columns, but
// duplication is determined by the five named columns (and the index,
// if include_index is true).
template<typename T1, typename T2, typename T3, typename T4,
         typename T5, typename ... Ts>
[[nodiscard]] DataFrame
remove_duplicates(const char *name1, const char *name2,
                  const char *name3, const char *name4,
                  const char *name5,
                  bool include_index,
                  remove_dup_spec rds) const;
        
Same as above, but there are 5 columns involved.
T1: Type of the first named column
T2: Type of the second named column
T3: Type of the third named column
T4: Type of the fourth named column
T5: Type of the fifth named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
name1: Name of the first data column
name2: Name of the second data column
name3: Name of the third data column
name4: Name of the fourth data column
name5: Name of the fifth data column
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep

// Six-column overload: same as the versions with fewer columns, but
// duplication is determined by the six named columns (and the index,
// if include_index is true).
template<typename T1, typename T2, typename T3, typename T4,
         typename T5, typename T6, typename ... Ts>
[[nodiscard]] DataFrame
remove_duplicates(const char *name1, const char *name2,
                  const char *name3, const char *name4,
                  const char *name5, const char *name6,
                  bool include_index,
                  remove_dup_spec rds) const;
        
Same as above, but there are 6 columns involved.
T1: Type of the first named column
T2: Type of the second named column
T3: Type of the third named column
T4: Type of the fourth named column
T5: Type of the fifth named column
T6: Type of the sixth named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
name1: Name of the first data column
name2: Name of the second data column
name3: Name of the third data column
name4: Name of the fourth data column
name5: Name of the fifth data column
name6: Name of the sixth data column
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep
// Exercises remove_duplicates() with two- and four-column keys under every
// remove_dup_spec policy, with and without the index as part of the key.
//
// Fixture: rows 1 and 4 (0-based) hold identical values in all four data
// columns (15.0, 101.0, 2, "bb") but different index values (2 vs. 5).
// dbl_col has no other repeated value, so these two rows are the only
// duplicates for any key that contains dbl_col and excludes the index.
// int_col is shorter (11 values) than the other columns (15 values) and is
// loaded without NaN padding.
static void test_remove_duplicates()  {

    std::cout << "\nTesting remove_duplicates( ) ..." << std::endl;

    MyDataFrame df;

    std::vector<unsigned long>  idxvec =
        { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, 10UL, 15UL, 14UL };
    std::vector<double>         dblvec =
        { 0.0, 15.0, 14.0, 2.0, 15.0, 12.0, 11.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 9.0, 10.0};
    std::vector<double>         dblvec2 =
        { 100.0, 101.0, 102.0, 103.0, 101.0, 105.0, 106.55, 107.34, 1.8, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0};
    std::vector<int>            intvec = { 1, 2, 3, 4, 2, 8, 6, 7, 11, 14, 9 };
    std::vector<std::string>    strvec =
        { "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };

    df.load_data(std::move(idxvec),
                 std::make_pair("dbl_col", dblvec),
                 std::make_pair("dbl_col_2", dblvec2),
                 std::make_pair("str_col", strvec));
    df.load_column("int_col", std::move(intvec), nan_policy::dont_pad_with_nans);

    // keep_first, index excluded: row 4 repeats row 1 and is dropped.
    // Expected column contents are the same for the 2- and 4-column keys.
    const std::vector<double>       expected_d_first {
        100, 101, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
    const std::vector<std::string>  expected_s_first {
        "zz", "bb", "cc", "ww", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };

    // Two-column key (dbl_col, int_col).
    auto    result1 =
        df.remove_duplicates<double, int, double, std::string, int>
        ("dbl_col", "int_col", false, remove_dup_spec::keep_first);

    assert(result1.get_index().size() == 14);
    assert(result1.get_column<double>("dbl_col_2") == expected_d_first);
    assert(result1.get_column<std::string>("str_col") == expected_s_first);

    // Four-column key (dbl_col, dbl_col_2, int_col, str_col).
    auto    result2 =
        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
        ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_first);

    assert(result2.get_index().size() == 14);
    assert(result2.get_column<double>("dbl_col_2") == expected_d_first);
    assert(result2.get_column<std::string>("str_col") == expected_s_first);

    // keep_last, index excluded: row 1 is dropped and row 4 survives, so the
    // (101, "bb") entry moves after (103, "ww").
    auto    result3 =
        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
        ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_last);

    const std::vector<double>       expected_d_last {
        100, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
    const std::vector<std::string>  expected_s_last {
        "zz", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };

    assert(result3.get_index().size() == 14);
    assert(result3.get_column<double>("dbl_col_2") == expected_d_last);
    assert(result3.get_column<std::string>("str_col") == expected_s_last);

    // keep_none, index excluded: both duplicated rows (1 and 4) are dropped.
    auto    result4 =
        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
        ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_none);

    const std::vector<double>       expected_d_none {
        100, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
    const std::vector<std::string>  expected_s_none {
        "zz", "cc", "ww", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };

    assert(result4.get_index().size() == 13);
    assert(result4.get_column<double>("dbl_col_2") == expected_d_none);
    assert(result4.get_column<std::string>("str_col") == expected_s_none);

    // keep_none, index included: rows 1 and 4 differ in their index value,
    // so nothing counts as a duplicate and every row is retained.
    auto    result5 =
        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
        ("dbl_col", "dbl_col_2", "int_col", "str_col", true, remove_dup_spec::keep_none);

    const std::vector<double>       expected_d_all {
        100, 101, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
    const std::vector<std::string>  expected_s_all {
        "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };

    assert(result5.get_index().size() == 15);
    assert(result5.get_column<double>("dbl_col_2") == expected_d_all);
    assert(result5.get_column<std::string>("str_col") == expected_s_all);
}