template<typename V, typename IT = int> std::size_t load_indicators(const char *cat_col_name, const char *numeric_cols_prefix = nullptr); |
Given a categorical (nominal) column, it generates a series of numerical columns (0 or 1) for each category in the given column. These numeric indicators encode categorical information. In machine learning, this is also sometimes referred to as “one-hot” encoding of categorical data. NOTE: The values of the categorical column must be converted to string to generate names for indicator columns. NOTE: The values of the categorical column must be hashable. |
V: Type of the named categorical column. IT: Type of the generated numerical indicator columns. cat_col_name: Name of the categorical column. numeric_cols_prefix: Optional prefix for generated column names. |
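template<typename V = int, typename CT = std::string> std::size_t from_indicators(const std::vector<const char *> &ind_col_names, const char *cat_col_name, const char *numeric_cols_prefix = nullptr); |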
This does the opposite of load_indicators(). Given a set of one-hot indicator columns, it reconstructs the category column. |
V: Type of the indicator columns. CT: Type of the new categorical column. ind_col_names: Names of the indicator columns. cat_col_name: Name of the new categorical column. numeric_cols_prefix: Prefix of indicator column names. |
static void test_load_indicators() { std::cout << "\nTesting load_indicators( ) ..." << std::endl; MyDataFrame df; std::vector<unsigned long> idxvec = { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, 10UL, 15UL, 14UL }; std::vector<double> dblvec = { 0.0, 15.0, 14.0, 0.0, 1.0, 14.0, 11.5, 11.5, 7.25, 7.25, 7.25, 14.0, 7.25, 15.0, 0.0}; std::vector<double> dblvec2 = { 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.55, 107.34, 1.8, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0}; std::vector<int> intvec = { 1, 2, 3, 4, 5, 8, 6, 7, 11, 14, 9 }; std::vector<std::string> strvec = { "blue", "blue", "red", "green", "black", "green", "white", "black", "black", "white", "red", "yellow", "green", "green", "green" }; df.load_data(std::move(idxvec), std::make_pair("levels", dblvec), std::make_pair("dbl_col_2", dblvec2), std::make_pair("colors", strvec)); df.load_column("int_col", std::move(intvec), nan_policy::dont_pad_with_nans); df.write<std::ostream, std::string, double, int>(std::cout, io_format::csv2); auto count = df.load_indicators<std::string>("colors"); df.write<std::ostream, std::string, double, int>(std::cout, io_format::csv2); std::cout << "Load count: " << count << std::endl; count = df.load_indicators<std::string, bool>("colors", "bool_"); df.write<std::ostream, std::string, double, int, bool>(std::cout, io_format::csv2); std::cout << "Load count: " << count << std::endl; count = df.load_indicators<double, double>("levels", "level_"); df.write<std::ostream, std::string, double, int, bool>(std::cout, io_format::csv2); std::cout << "Load count: " << count << std::endl; } // ----------------------------------------------------------------------------- static void test_from_indicators() { std::cout << "\nTesting from_indicators( ) ..." 
<< std::endl; MyDataFrame df; std::vector<unsigned long> idxvec = { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, 10UL, 15UL, 14UL }; std::vector<double> dblvec = { 0.0, 15.0, 14.0, 0.0, 1.0, 14.0, 11.5, 11.5, 7.25, 7.25, 7.25, 14.0, 7.25, 15.0, 0.0}; std::vector<double> dblvec2 = { 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.55, 107.34, 1.8, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0}; std::vector<int> intvec = { 1, 2, 3, 4, 5, 8, 6, 7, 11, 14, 9 }; std::vector<std::string> strvec = { "blue", "blue", "red", "green", "black", "green", "white", "black", "black", "white", "red", "yellow", "green", "green", "green" }; df.load_data(std::move(idxvec), std::make_pair("levels", dblvec), std::make_pair("dbl_col_2", dblvec2), std::make_pair("colors", strvec)); df.load_column("int_col", std::move(intvec), nan_policy::dont_pad_with_nans); df.load_indicators<std::string>("colors"); df.load_indicators<std::string, bool>("colors", "bool_"); df.load_indicators<double, double>("levels", "level_"); auto count = df.from_indicators({ "blue", "green", "white", "black", "red", "yellow" }, "colors_copy"); assert(count == 15); assert((df.get_column<std::string>("colors") == df.get_column<std::string>("colors_copy"))); count = df.from_indicators<double, double>({ "level_0", "level_15", "level_14", "level_1", "level_11.5", "level_7.25" }, "levels_copy", "level_"); assert(count == 15); assert((df.get_column<std::string>("levels") == df.get_column<std::string>("levels_copy"))); }