From 6b4e298504349be024f83a5f90f3d98f8aba761b Mon Sep 17 00:00:00 2001 From: aholovenko Date: Thu, 23 Feb 2023 10:44:13 +0200 Subject: [PATCH 1/3] pymfe features extractor script --- SynRD/features_extractor/__init__.py | 0 SynRD/features_extractor/features.json | 1 + .../features_extractor/features_extractor.py | 92 +++++++++++++++++++ setup.py | 27 +++--- 4 files changed, 108 insertions(+), 12 deletions(-) create mode 100644 SynRD/features_extractor/__init__.py create mode 100644 SynRD/features_extractor/features.json create mode 100644 SynRD/features_extractor/features_extractor.py diff --git a/SynRD/features_extractor/__init__.py b/SynRD/features_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SynRD/features_extractor/features.json b/SynRD/features_extractor/features.json new file mode 100644 index 0000000..a2c7dc0 --- /dev/null +++ b/SynRD/features_extractor/features.json @@ -0,0 +1 @@ +{"fairman2019marijuana": {"general": {"attr_to_inst": 1.703107489926119e-05, "cat_to_num": 0.0, "freq_class.mean": 0.16666666666666666, "freq_class.sd": 0.14470247838587796, "inst_to_attr": 58716.2, "nr_attr": 5, "nr_bin": 1, "nr_cat": 0, "nr_class": 6, "nr_inst": 293581, "nr_num": 5, "num_to_cat": null}, "statistical": {"can_cor.mean": 0.27323666640127386, "can_cor.sd": 0.37956564708808893, "cor.mean": 0.07406275721610567, "cor.sd": 0.1771369598863154, "cov.mean": 1.304644717701642, "cov.sd": 3.524904804376082, "eigenvalues.mean": 14.621120087611876, "eigenvalues.sd": 21.564255640650842, "g_mean.mean": null, "g_mean.sd": null, "gravity": 13.76146670705108, "h_mean.mean": 0.0, "h_mean.sd": 0.0, "iq_range.mean": 5.8, "iq_range.sd": 4.868264577855234, "kurtosis.mean": -1.316388250277453, "kurtosis.sd": 0.5530853223474288, "lh_trace": 8.64405359804705, "mad.mean": 2.9652000000000003, "mad.sd": 3.1450695413615257, "max.mean": 9.4, "max.sd": 7.368853370776216, "mean.mean": 3.862251984971779, "mean.sd": 2.992971847542306, "median.mean": 4.0, "median.sd": 4.527692569068709, "min.mean": 0.0, "min.sd": 0.0, "nr_cor_attr": 0.1, "nr_disc": 5, "nr_norm": 1.0, "nr_outliers": 0, "p_trace": 0.9495717011280036, "range.mean": 9.4, "range.sd": 7.368853370776216, "roy_root": 8.589078392532363, "sd.mean": 3.1637659684598214, "sd.sd": 2.400964645831791, "sd_ratio": 1.110047598819871, "skewness.mean": 0.23240652488599678, "skewness.sd": 0.49992378018674405, "sparsity.mean": 0.17584134858261163, "sparsity.sd": 0.1844815935639719, "t_mean.mean": 3.639869655802758, "t_mean.sd": 3.142390782055264, "var.mean": 14.621120087611851, "var.sd": 20.01485297669193, "w_lambda": 0.0987625612414452}, "info-theory": {"attr_conc.mean": 0.008614358887037692, "attr_conc.sd": 0.024789208781955833, "attr_ent.mean": 2.5171242952379385, "attr_ent.sd": 1.086706290676518, "class_conc.mean": 0.07817078162837117, "class_conc.sd": 0.14361375404460913, "class_ent": 2.1567101571814913, "eq_num_attr": 8.473202572599119, "joint_ent.mean": 4.419301394971244, "joint_ent.sd": 1.0027439889770196, "mut_inf.mean": 0.2545330574481863, "mut_inf.sd": 0.4321564861621098, "ns_ratio": 8.88918422020815}, "itemset": {"one_itemset.mean": 0.11111111111111109, "one_itemset.sd": 0.12959929569442002, "two_itemset.mean": 0.21220159151193635, "two_itemset.sd": 0.15211769593415295}}, "adult": {"general": {"attr_to_inst": 0.00042996222474739717, "cat_to_num": 1.3333333333333333, "freq_class.mean": 0.5, "freq_class.sd": 0.3665506390973169, "inst_to_attr": 2325.785714285714, "nr_attr": 14, "nr_bin": 1, "nr_cat": 8, "nr_class": 2, "nr_inst": 32561, "nr_num": 6, "num_to_cat": 0.75}, "statistical": {"can_cor.mean": 0.6075382184616811, "can_cor.sd": null, "cor.mean": 0.020575612632741463, "cor.sd": 0.04535153811741261, "cov.mean": 211.1074182124332, "cov.sd": 8025.120485282461, "eigenvalues.mean": 110846564.91990046, "eigenvalues.sd": 1108509665.939067, "g_mean.mean": 1586.32115786632, "g_mean.sd": 15933.907433185732, "gravity": 4511.757083547598, "h_mean.mean": 1251.6757800002574, "h_mean.sd": 12571.548183529578, "iq_range.mean": 1180.7722772277227, "iq_range.sd": 11863.197745870368, "kurtosis.mean": 834.6502878290377, "kurtosis.sd": 3300.235280828971, "lh_trace": 0.5850439987975836, "mad.mean": 879.4019881188119, "mad.sd": 8835.794743563976, "max.mean": 15736.237623762376, "max.sd": 147964.76551477704, "mean.mean": 1891.4795428899488, "mean.sd": 18882.70462422157, "median.mean": 1766.8019801980197, "median.sd": 17746.995603774634, "min.mean": 121.82178217821782, "min.sd": 1222.3854579962863, "nr_cor_attr": 0.0011881188118811883, "nr_disc": 1, "nr_norm": 3.0, "nr_outliers": 95, "p_trace": 0.36910268689159337, "range.mean": 15614.415841584158, "range.sd": 146745.1749871366, "roy_root": 0.5850439987975836, "sd.mean": 1122.5829209841374, "sd.sd": 10520.562543441738, "sd_ratio": null, "skewness.mean": 17.616097299843247, "skewness.sd": 23.04905261843532, "sparsity.mean": 0.09852711143124308, "sparsity.sd": 0.13239694804959984, "t_mean.mean": 1758.7537427080476, "t_mean.sd": 17665.95817440692, "var.mean": 110846564.91990048, "var.sd": 1108509663.0036533, "w_lambda": 0.6308973131084066}, "info-theory": {"attr_conc.mean": 0.03834148783690674, "attr_conc.sd": 0.13791611041574148, "attr_ent.mean": 2.2197528515718576, "attr_ent.sd": 1.5139967849269165, "class_conc.mean": 0.02703239097275378, "class_conc.sd": 0.03320210419262275, "class_ent": 0.7963839552022132, "eq_num_attr": 12.006187880013178, "joint_ent.mean": 2.9498056812610374, "joint_ent.sd": 1.5033030990636995, "mut_inf.mean": 0.0663311255130333, "mut_inf.sd": 0.05319818541302973, "ns_ratio": 32.464724658346135}, "itemset": {"one_itemset.mean": 0.07106598984771574, "one_itemset.sd": 0.155956043719432, "two_itemset.mean": 0.13962906444418358, "two_itemset.sd": 0.19991073911332466}}, "mushrooms": {"general": {"attr_to_inst": 0.0027080256031511572, "cat_to_num": 0.0, "freq_class.mean": 0.00012309207287050715, "freq_class.sd": 0.0, "inst_to_attr": 369.27272727272725, "nr_attr": 22, "nr_bin": 0, "nr_cat": 0, "nr_class": 8124, "nr_inst": 8124, "nr_num": 22, "num_to_cat": null}, "statistical": {"can_cor.mean": null, "can_cor.sd": null, "cor.mean": null, "cor.sd": null, "cov.mean": null, "cov.sd": null, "eigenvalues.mean": null, "eigenvalues.sd": null, "g_mean.mean": null, "g_mean.sd": null, "gravity": null, "h_mean.mean": null, "h_mean.sd": null, "iq_range.mean": null, "iq_range.sd": null, "kurtosis.mean": null, "kurtosis.sd": null, "lh_trace": null, "mad.mean": null, "mad.sd": null, "max.mean": null, "max.sd": null, "mean.mean": null, "mean.sd": null, "median.mean": null, "median.sd": null, "min.mean": null, "min.sd": null, "nr_cor_attr": 0.0, "nr_disc": null, "nr_norm": 22.0, "nr_outliers": 0, "p_trace": null, "range.mean": null, "range.sd": null, "roy_root": null, "sd.mean": null, "sd.sd": null, "sd_ratio": null, "skewness.mean": null, "skewness.sd": null, "sparsity.mean": 1.0, "sparsity.sd": 0.0, "t_mean.mean": null, "t_mean.sd": null, "var.mean": null, "var.sd": null, "w_lambda": null}, "info-theory": {"attr_conc.mean": 0.5, "attr_conc.sd": 0.0, "attr_ent.mean": 0.0, "attr_ent.sd": 0.0, "class_conc.mean": 0.5000203091753722, "class_conc.sd": 1.136349493877995e-16, "class_ent": 12.98797452429615, "eq_num_attr": -13847.43282808942, "joint_ent.mean": 12.98891245804066, "joint_ent.sd": 0.0, "mut_inf.mean": -0.000937933744509678, "mut_inf.sd": 0.0, "ns_ratio": -1.0}, "itemset": {"one_itemset.mean": 1.0, "one_itemset.sd": 0.0, "two_itemset.mean": 0.0, "two_itemset.sd": 0.0}}} \ No newline at end of file diff --git a/SynRD/features_extractor/features_extractor.py b/SynRD/features_extractor/features_extractor.py new file mode 100644 index 0000000..2d2578a --- /dev/null +++ b/SynRD/features_extractor/features_extractor.py @@ -0,0 +1,92 @@ +import json +import os +import math +import numpy as np +import pandas as pd + +from pymfe.mfe import MFE + +# FEATURE_GROUPS = ["general", "statistical", "info-theory", "concept", "itemset", "complexity"] +FEATURE_GROUPS = ["general", "statistical", "info-theory", "itemset"] + +MAPPINGS = { + # "saw2018cross_dataframe.tsv": {"name": "saw2018cross", "target": ""}, + # "lee2021ability_dataframe.tsv": {"name": "lee2021ability", "target": ""}, + # "jeong2021math_dataframe.tsv": {"name": "jeong2021math", "target": "X1TXMSCR"}, + # "iverson22football_dataframe.tsv": {"name": "iverson22football", "target": ""}, + # "fruiht2018naturally_dataframe.tsv": {"name": "fruiht2018naturally", "target": ""}, + "29621-0001-Data.tsv": {"name": "fairman2019marijuana", "target": "CLASS"}, + "adult.data": {"name": "adult", "target": "income", "columns": ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]}, + "agaricus-lepiota.data": {"name": "mushrooms", "target": "class", "columns": ["class", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]}, +} + + +def load_data(input_file_path, column_names=None): + if input_file_path.endswith('.tsv'): + return pd.read_csv(input_file_path, sep='\t') + if input_file_path.endswith('.data'): + return pd.read_csv(input_file_path, sep=',\s', names=column_names, index_col=None) + if input_file_path.endswith('.csv'): + return pd.read_csv(input_file_path) + if input_file_path.endswith('.json'): + return pd.read_json(input_file_path) + raise ValueError(f'file {input_file_path} is not supported') + + +def load_dataframes(input_dir='data/papers'): + dataframes = dict() + for input_file_path in MAPPINGS: + input_file_dict = MAPPINGS[input_file_path] + dataframe = load_data(os.path.join(input_dir, input_file_path), column_names=input_file_dict.get('columns')) + dataframes[input_file_dict["name"]] = dataframe + return dataframes + + +def get_target(dataframe, target, features=None): + if features is not None: + features = features + [target] + dataframe = dataframe[features] + X = dataframe.drop(columns=[target], errors='ignore').to_numpy() + y = dataframe[[target]].to_numpy() + return X, y + + +def __replace_nans(vals): + return [None if math.isnan(v) else v for v in vals] + + +def get_features(dataframes): + features = dict() + name2target = {d['name']: d['target'] for d in MAPPINGS.values()} + name2features = {d['name']: d.get('features') for d in MAPPINGS.values()} + for dataframe_name, dataframe in dataframes.items(): + X, y = get_target(dataframe, features=name2features[dataframe_name], target=name2target[dataframe_name]) + dataframe_features = dict() + for group_name in FEATURE_GROUPS: + mfe = MFE(groups=group_name) + mfe.fit(X, y) + features_names, features_vals = mfe.extract() + dataframe_features[group_name] = dict(zip(features_names, __replace_nans(features_vals))) + features[dataframe_name] = dataframe_features + return features + + +class NpEncoder(json.JSONEncoder): + """ + based on: https://stackoverflow.com/a/57915246 + """ + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + super(NpEncoder, self).default(obj) + + +def main(): + dataframes = load_dataframes() + features = get_features(dataframes) + with open('features.json', 'w') as output_file: + json.dump(features, output_file, cls=NpEncoder) + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index ceecab4..c3f7d23 100644 --- a/setup.py +++ b/setup.py @@ -24,18 +24,21 @@ "SynRD.datasets"], package_data={'SynRD': ['papers/process.R']}, # setup_requires=['wheel'], - install_requires=["DataSynthesizer", - "smartnoise-synth", - "pandas", - "numpy", - "tqdm", - "requests", - "scikit-learn", - "disjoint-set", - "networkx", - "diffprivlib", - "pathlib", - "statsmodels"], + install_requires=[ + "DataSynthesizer", + "smartnoise-synth", + "pandas", + "numpy", + "tqdm", + "requests", + "scikit-learn", + "disjoint-set", + "networkx", + "diffprivlib", + "pathlib", + "statsmodels", + "pymfe", + ], ) # NOTE: Independent installation of mbi required with: From 1b41e83859a113a949e85ccb57ed0dc00472dd44 Mon Sep 17 00:00:00 2001 From: aholovenko Date: Thu, 23 Feb 2023 14:29:57 +0200 Subject: [PATCH 2/3] add features for jeong2021math --- SynRD/features_extractor/features.json | 351 +++++++++++++++++- .../features_extractor/features_extractor.py | 2 +- SynRD/publication.py | 9 +- 3 files changed, 358 insertions(+), 4 deletions(-) diff --git a/SynRD/features_extractor/features.json b/SynRD/features_extractor/features.json index a2c7dc0..ea77f0c 100644 --- a/SynRD/features_extractor/features.json +++ b/SynRD/features_extractor/features.json @@ -1 +1,350 @@ -{"fairman2019marijuana": {"general": {"attr_to_inst": 1.703107489926119e-05, "cat_to_num": 0.0, "freq_class.mean": 0.16666666666666666, "freq_class.sd": 0.14470247838587796, "inst_to_attr": 58716.2, "nr_attr": 5, "nr_bin": 1, "nr_cat": 0, "nr_class": 6, "nr_inst": 293581, "nr_num": 5, "num_to_cat": null}, "statistical": {"can_cor.mean": 0.27323666640127386, "can_cor.sd": 0.37956564708808893, "cor.mean": 0.07406275721610567, "cor.sd": 0.1771369598863154, "cov.mean": 1.304644717701642, "cov.sd": 3.524904804376082, "eigenvalues.mean": 14.621120087611876, "eigenvalues.sd": 21.564255640650842, "g_mean.mean": null, "g_mean.sd": null, "gravity": 13.76146670705108, "h_mean.mean": 0.0, "h_mean.sd": 0.0, "iq_range.mean": 5.8, "iq_range.sd": 4.868264577855234, "kurtosis.mean": -1.316388250277453, "kurtosis.sd": 0.5530853223474288, "lh_trace": 8.64405359804705, "mad.mean": 2.9652000000000003, "mad.sd": 3.1450695413615257, "max.mean": 9.4, "max.sd": 7.368853370776216, "mean.mean": 3.862251984971779, "mean.sd": 2.992971847542306, "median.mean": 4.0, "median.sd": 4.527692569068709, "min.mean": 0.0, "min.sd": 0.0, "nr_cor_attr": 0.1, "nr_disc": 5, "nr_norm": 1.0, "nr_outliers": 0, "p_trace": 0.9495717011280036, "range.mean": 9.4, "range.sd": 7.368853370776216, "roy_root": 8.589078392532363, "sd.mean": 3.1637659684598214, "sd.sd": 2.400964645831791, "sd_ratio": 1.110047598819871, "skewness.mean": 0.23240652488599678, "skewness.sd": 0.49992378018674405, "sparsity.mean": 0.17584134858261163, "sparsity.sd": 0.1844815935639719, "t_mean.mean": 3.639869655802758, "t_mean.sd": 3.142390782055264, "var.mean": 14.621120087611851, "var.sd": 20.01485297669193, "w_lambda": 0.0987625612414452}, "info-theory": {"attr_conc.mean": 0.008614358887037692, "attr_conc.sd": 0.024789208781955833, "attr_ent.mean": 2.5171242952379385, "attr_ent.sd": 1.086706290676518, "class_conc.mean": 0.07817078162837117, "class_conc.sd": 0.14361375404460913, "class_ent": 2.1567101571814913, "eq_num_attr": 8.473202572599119, "joint_ent.mean": 4.419301394971244, "joint_ent.sd": 1.0027439889770196, "mut_inf.mean": 0.2545330574481863, "mut_inf.sd": 0.4321564861621098, "ns_ratio": 8.88918422020815}, "itemset": {"one_itemset.mean": 0.11111111111111109, "one_itemset.sd": 0.12959929569442002, "two_itemset.mean": 0.21220159151193635, "two_itemset.sd": 0.15211769593415295}}, "adult": {"general": {"attr_to_inst": 0.00042996222474739717, "cat_to_num": 1.3333333333333333, "freq_class.mean": 0.5, "freq_class.sd": 0.3665506390973169, "inst_to_attr": 2325.785714285714, "nr_attr": 14, "nr_bin": 1, "nr_cat": 8, "nr_class": 2, "nr_inst": 32561, "nr_num": 6, "num_to_cat": 0.75}, "statistical": {"can_cor.mean": 0.6075382184616811, "can_cor.sd": null, "cor.mean": 0.020575612632741463, "cor.sd": 0.04535153811741261, "cov.mean": 211.1074182124332, "cov.sd": 8025.120485282461, "eigenvalues.mean": 110846564.91990046, "eigenvalues.sd": 1108509665.939067, "g_mean.mean": 1586.32115786632, "g_mean.sd": 15933.907433185732, "gravity": 4511.757083547598, "h_mean.mean": 1251.6757800002574, "h_mean.sd": 12571.548183529578, "iq_range.mean": 1180.7722772277227, "iq_range.sd": 11863.197745870368, "kurtosis.mean": 834.6502878290377, "kurtosis.sd": 3300.235280828971, "lh_trace": 0.5850439987975836, "mad.mean": 879.4019881188119, "mad.sd": 8835.794743563976, "max.mean": 15736.237623762376, "max.sd": 147964.76551477704, "mean.mean": 1891.4795428899488, "mean.sd": 18882.70462422157, "median.mean": 1766.8019801980197, "median.sd": 17746.995603774634, "min.mean": 121.82178217821782, "min.sd": 1222.3854579962863, "nr_cor_attr": 0.0011881188118811883, "nr_disc": 1, "nr_norm": 3.0, "nr_outliers": 95, "p_trace": 0.36910268689159337, "range.mean": 15614.415841584158, "range.sd": 146745.1749871366, "roy_root": 0.5850439987975836, "sd.mean": 1122.5829209841374, "sd.sd": 10520.562543441738, "sd_ratio": null, "skewness.mean": 17.616097299843247, "skewness.sd": 23.04905261843532, "sparsity.mean": 0.09852711143124308, "sparsity.sd": 0.13239694804959984, "t_mean.mean": 1758.7537427080476, "t_mean.sd": 17665.95817440692, "var.mean": 110846564.91990048, "var.sd": 1108509663.0036533, "w_lambda": 0.6308973131084066}, "info-theory": {"attr_conc.mean": 0.03834148783690674, "attr_conc.sd": 0.13791611041574148, "attr_ent.mean": 2.2197528515718576, "attr_ent.sd": 1.5139967849269165, "class_conc.mean": 0.02703239097275378, "class_conc.sd": 0.03320210419262275, "class_ent": 0.7963839552022132, "eq_num_attr": 12.006187880013178, "joint_ent.mean": 2.9498056812610374, "joint_ent.sd": 1.5033030990636995, "mut_inf.mean": 0.0663311255130333, "mut_inf.sd": 0.05319818541302973, "ns_ratio": 32.464724658346135}, "itemset": {"one_itemset.mean": 0.07106598984771574, "one_itemset.sd": 0.155956043719432, "two_itemset.mean": 0.13962906444418358, "two_itemset.sd": 0.19991073911332466}}, "mushrooms": {"general": {"attr_to_inst": 0.0027080256031511572, "cat_to_num": 0.0, "freq_class.mean": 0.00012309207287050715, "freq_class.sd": 0.0, "inst_to_attr": 369.27272727272725, "nr_attr": 22, "nr_bin": 0, "nr_cat": 0, "nr_class": 8124, "nr_inst": 8124, "nr_num": 22, "num_to_cat": null}, "statistical": {"can_cor.mean": null, "can_cor.sd": null, "cor.mean": null, "cor.sd": null, "cov.mean": null, "cov.sd": null, "eigenvalues.mean": null, "eigenvalues.sd": null, "g_mean.mean": null, "g_mean.sd": null, "gravity": null, "h_mean.mean": null, "h_mean.sd": null, "iq_range.mean": null, "iq_range.sd": null, "kurtosis.mean": null, "kurtosis.sd": null, "lh_trace": null, "mad.mean": null, "mad.sd": null, "max.mean": null, "max.sd": null, "mean.mean": null, "mean.sd": null, "median.mean": null, "median.sd": null, "min.mean": null, "min.sd": null, "nr_cor_attr": 0.0, "nr_disc": null, "nr_norm": 22.0, "nr_outliers": 0, "p_trace": null, "range.mean": null, "range.sd": null, "roy_root": null, "sd.mean": null, "sd.sd": null, "sd_ratio": null, "skewness.mean": null, "skewness.sd": null, "sparsity.mean": 1.0, "sparsity.sd": 0.0, "t_mean.mean": null, "t_mean.sd": null, "var.mean": null, "var.sd": null, "w_lambda": null}, "info-theory": {"attr_conc.mean": 0.5, "attr_conc.sd": 0.0, "attr_ent.mean": 0.0, "attr_ent.sd": 0.0, "class_conc.mean": 0.5000203091753722, "class_conc.sd": 1.136349493877995e-16, "class_ent": 12.98797452429615, "eq_num_attr": -13847.43282808942, "joint_ent.mean": 12.98891245804066, "joint_ent.sd": 0.0, "mut_inf.mean": -0.000937933744509678, "mut_inf.sd": 0.0, "ns_ratio": -1.0}, "itemset": {"one_itemset.mean": 1.0, "one_itemset.sd": 0.0, "two_itemset.mean": 0.0, "two_itemset.sd": 0.0}}} \ No newline at end of file +{ + "adult": { + "general": { + "attr_to_inst": 0.00042996222474739717, + "cat_to_num": 1.3333333333333333, + "freq_class.mean": 0.5, + "freq_class.sd": 0.3665506390973169, + "inst_to_attr": 2325.785714285714, + "nr_attr": 14, + "nr_bin": 1, + "nr_cat": 8, + "nr_class": 2, + "nr_inst": 32561, + "nr_num": 6, + "num_to_cat": 0.75 + }, + "info-theory": { + "attr_conc.mean": 0.03990996507418588, + "attr_conc.sd": 0.1387983393422685, + "attr_ent.mean": 2.2197528515718576, + "attr_ent.sd": 1.5139967849269165, + "class_conc.mean": 0.02703239097275378, + "class_conc.sd": 0.03320210419262275, + "class_ent": 0.7963839552022132, + "eq_num_attr": 12.006187880013178, + "joint_ent.mean": 2.9498056812610374, + "joint_ent.sd": 1.5033030990636995, + "mut_inf.mean": 0.0663311255130333, + "mut_inf.sd": 0.05319818541302973, + "ns_ratio": 32.464724658346135 + }, + "itemset": { + "one_itemset.mean": 0.07106598984771574, + "one_itemset.sd": 0.155956043719432, + "two_itemset.mean": 0.13962906444418358, + "two_itemset.sd": 0.19991073911332466 + }, + "statistical": { + "can_cor.mean": 0.6075382184616811, + "can_cor.sd": null, + "cor.mean": 0.020575612632741463, + "cor.sd": 0.04535153811741261, + "cov.mean": 211.1074182124332, + "cov.sd": 8025.120485282461, + "eigenvalues.mean": 110846564.91990046, + "eigenvalues.sd": 1108509665.939067, + "g_mean.mean": 1586.32115786632, + "g_mean.sd": 15933.907433185732, + "gravity": 4511.757083547598, + "h_mean.mean": 1251.6757800002574, + "h_mean.sd": 12571.548183529578, + "iq_range.mean": 1180.7722772277227, + "iq_range.sd": 11863.197745870368, + "kurtosis.mean": 834.6502878290377, + "kurtosis.sd": 3300.235280828971, + "lh_trace": 0.5850439987975836, + "mad.mean": 879.4019881188119, + "mad.sd": 8835.794743563976, + "max.mean": 15736.237623762376, + "max.sd": 147964.76551477704, + "mean.mean": 1891.4795428899488, + "mean.sd": 18882.70462422157, + "median.mean": 1766.8019801980197, + "median.sd": 17746.995603774634, + "min.mean": 121.82178217821782, + "min.sd": 1222.3854579962863, + "nr_cor_attr": 0.0011881188118811883, + "nr_disc": 1, + "nr_norm": 3.0, + "nr_outliers": 95, + "p_trace": 0.36910268689159337, + "range.mean": 15614.415841584158, + "range.sd": 146745.1749871366, + "roy_root": 0.5850439987975836, + "sd.mean": 1122.5829209841374, + "sd.sd": 10520.562543441738, + "sd_ratio": null, + "skewness.mean": 17.616097299843247, + "skewness.sd": 23.04905261843532, + "sparsity.mean": 0.09852711143124308, + "sparsity.sd": 0.13239694804959984, + "t_mean.mean": 1758.7537427080476, + "t_mean.sd": 17665.95817440692, + "var.mean": 110846564.91990048, + "var.sd": 1108509663.0036533, + "w_lambda": 0.6308973131084066 + } + }, + "fairman2019marijuana": { + "general": { + "attr_to_inst": 0.00001703107489926119, + "cat_to_num": 0.0, + "freq_class.mean": 0.16666666666666666, + "freq_class.sd": 0.14470247838587796, + "inst_to_attr": 58716.2, + "nr_attr": 5, + "nr_bin": 1, + "nr_cat": 0, + "nr_class": 6, + "nr_inst": 293581, + "nr_num": 5, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.008614358887037692, + "attr_conc.sd": 0.024789208781955833, + "attr_ent.mean": 2.5171242952379385, + "attr_ent.sd": 1.086706290676518, + "class_conc.mean": 0.07817078162837117, + "class_conc.sd": 0.14361375404460913, + "class_ent": 2.1567101571814913, + "eq_num_attr": 8.473202572599119, + "joint_ent.mean": 4.419301394971244, + "joint_ent.sd": 1.0027439889770196, + "mut_inf.mean": 0.2545330574481863, + "mut_inf.sd": 0.4321564861621098, + "ns_ratio": 8.88918422020815 + }, + "itemset": { + "one_itemset.mean": 0.11111111111111109, + "one_itemset.sd": 0.12959929569442002, + "two_itemset.mean": 0.21220159151193635, + "two_itemset.sd": 0.15211769593415295 + }, + "statistical": { + "can_cor.mean": 0.27323666640127386, + "can_cor.sd": 0.37956564708808893, + "cor.mean": 0.07406275721610567, + "cor.sd": 0.1771369598863154, + "cov.mean": 1.304644717701642, + "cov.sd": 3.524904804376082, + "eigenvalues.mean": 14.621120087611876, + "eigenvalues.sd": 21.564255640650842, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": 13.76146670705108, + "h_mean.mean": 0.0, + "h_mean.sd": 0.0, + "iq_range.mean": 5.8, + "iq_range.sd": 4.868264577855234, + "kurtosis.mean": -1.316388250277453, + "kurtosis.sd": 0.5530853223474288, + "lh_trace": 8.64405359804705, + "mad.mean": 2.9652000000000003, + "mad.sd": 3.1450695413615257, + "max.mean": 9.4, + "max.sd": 7.368853370776216, + "mean.mean": 3.862251984971779, + "mean.sd": 2.992971847542306, + "median.mean": 4.0, + "median.sd": 4.527692569068709, + "min.mean": 0.0, + "min.sd": 0.0, + "nr_cor_attr": 0.1, + "nr_disc": 5, + "nr_norm": 1.0, + "nr_outliers": 0, + "p_trace": 0.9495717011280036, + "range.mean": 9.4, + "range.sd": 7.368853370776216, + "roy_root": 8.589078392532363, + "sd.mean": 3.1637659684598214, + "sd.sd": 2.400964645831791, + "sd_ratio": 1.110047598819871, + "skewness.mean": 0.23240652488599678, + "skewness.sd": 0.49992378018674405, + "sparsity.mean": 0.17584134858261163, + "sparsity.sd": 0.1844815935639719, + "t_mean.mean": 3.639869655802758, + "t_mean.sd": 3.142390782055264, + "var.mean": 14.621120087611851, + "var.sd": 20.01485297669193, + "w_lambda": 0.0987625612414452 + } + }, + "jeong2021math": { + "general": { + "attr_to_inst": 0.0037199415437757406, + "cat_to_num": 0.0, + "freq_class.mean": 0.5, + "freq_class.sd": 0.01324592216650769, + "inst_to_attr": 268.82142857142856, + "nr_attr": 56, + "nr_bin": 14, + "nr_cat": 0, + "nr_class": 2, + "nr_inst": 15054, + "nr_num": 56, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.013708609620212937, + "attr_conc.sd": 0.04805804686514912, + "attr_ent.mean": 1.6774033809497375, + "attr_ent.sd": 1.1130854889416772, + "class_conc.mean": 0.016853385542393653, + "class_conc.sd": 0.06683286177402585, + "class_ent": 0.9997468579230723, + "eq_num_attr": 50.597586033055094, + "joint_ent.mean": 2.657391453201297, + "joint_ent.sd": 1.10145806941284, + "mut_inf.mean": 0.01975878567151294, + "mut_inf.sd": 0.025945288082994112, + "ns_ratio": 83.89405213641845 + }, + "itemset": { + "one_itemset.mean": 0.1728395061728395, + "one_itemset.sd": 0.21827694955900606, + "two_itemset.mean": 0.2904204594711747, + "two_itemset.sd": 0.22445489887183492 + }, + "statistical": { + "can_cor.mean": 0.5403579611152178, + "can_cor.sd": null, + "cor.mean": 0.06554819579321111, + "cor.sd": 0.09786276332742616, + "cov.mean": 0.16468940993285874, + "cov.sd": 1.0329496620057688, + "eigenvalues.mean": 5.269255637375979, + "eigenvalues.sd": 16.19876176989202, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": 5.282793071239359, + "h_mean.mean": 0.0, + "h_mean.sd": 0.0, + "iq_range.mean": 1.9116517857142858, + "iq_range.sd": 3.201587258220015, + "kurtosis.mean": 7.720150421216947, + "kurtosis.sd": 39.111865301024565, + "lh_trace": 0.41240289825212684, + "mad.mean": 1.1516625, + "mad.sd": 2.303620744110588, + "max.mean": 5.61625, + "max.sd": 7.8327054567835805, + "mean.mean": 2.5655758792157752, + "mean.sd": 4.365244583529911, + "median.mean": 2.514464285714286, + "median.sd": 4.392893401286324, + "min.mean": 0.0, + "min.sd": 0.0, + "nr_cor_attr": 0.016883116883116882, + "nr_disc": 1, + "nr_norm": 0.0, + "nr_outliers": 32, + "p_trace": 0.2919867261405953, + "range.mean": 5.61625, + "range.sd": 7.8327054567835805, + "roy_root": 0.41240289825212684, + "sd.mean": 1.3640819995400726, + "sd.sd": 1.8629303037310834, + "sd_ratio": 1.0200179969901564, + "skewness.mean": 0.3431678774719039, + "skewness.sd": 2.8750837173281245, + "sparsity.mean": 0.25647443354243105, + "sparsity.sd": 0.16495250052343224, + "t_mean.mean": 2.5380055306935705, + "t_mean.sd": 4.466928721617069, + "var.mean": 5.269255637375979, + "var.sd": 14.183393684074005, + "w_lambda": 0.7080132738594047 + } + }, + "mushrooms": { + "general": { + "attr_to_inst": 0.0027080256031511572, + "cat_to_num": 0.0, + "freq_class.mean": 0.00012309207287050715, + "freq_class.sd": 0.0, + "inst_to_attr": 369.27272727272725, + "nr_attr": 22, + "nr_bin": 0, + "nr_cat": 0, + "nr_class": 8124, + "nr_inst": 8124, + "nr_num": 22, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.5, + "attr_conc.sd": 0.0, + "attr_ent.mean": 0.0, + "attr_ent.sd": 0.0, + "class_conc.mean": 0.5000203091753722, + "class_conc.sd": 1.136349493877995E-16, + "class_ent": 12.98797452429615, + "eq_num_attr": -13847.43282808942, + "joint_ent.mean": 12.98891245804066, + "joint_ent.sd": 0.0, + "mut_inf.mean": -0.000937933744509678, + "mut_inf.sd": 0.0, + "ns_ratio": -1.0 + }, + "itemset": { + "one_itemset.mean": 1.0, + "one_itemset.sd": 0.0, + "two_itemset.mean": 0.0, + "two_itemset.sd": 0.0 + }, + "statistical": { + "can_cor.mean": null, + "can_cor.sd": null, + "cor.mean": null, + "cor.sd": null, + "cov.mean": null, + "cov.sd": null, + "eigenvalues.mean": null, + "eigenvalues.sd": null, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": null, + "h_mean.mean": null, + "h_mean.sd": null, + "iq_range.mean": null, + "iq_range.sd": null, + "kurtosis.mean": null, + "kurtosis.sd": null, + "lh_trace": null, + "mad.mean": null, + "mad.sd": null, + "max.mean": null, + "max.sd": null, + "mean.mean": null, + "mean.sd": null, + "median.mean": null, + "median.sd": null, + "min.mean": null, + "min.sd": null, + "nr_cor_attr": 0.0, + "nr_disc": null, + "nr_norm": 22.0, + "nr_outliers": 0, + "p_trace": null, + "range.mean": null, + "range.sd": null, + "roy_root": null, + "sd.mean": null, + "sd.sd": null, + "sd_ratio": null, + "skewness.mean": null, + "skewness.sd": null, + "sparsity.mean": 1.0, + "sparsity.sd": 0.0, + "t_mean.mean": null, + "t_mean.sd": null, + "var.mean": null, + "var.sd": null, + "w_lambda": null + } + } +} \ No newline at end of file diff --git a/SynRD/features_extractor/features_extractor.py b/SynRD/features_extractor/features_extractor.py index 2d2578a..94e521f 100644 --- a/SynRD/features_extractor/features_extractor.py +++ b/SynRD/features_extractor/features_extractor.py @@ -12,7 +12,7 @@ MAPPINGS = { # "saw2018cross_dataframe.tsv": {"name": "saw2018cross", "target": ""}, # "lee2021ability_dataframe.tsv": {"name": "lee2021ability", "target": ""}, - # "jeong2021math_dataframe.tsv": {"name": "jeong2021math", "target": "X1TXMSCR"}, + "jeong2021math_dataframe.tsv": {"name": "jeong2021math", "target": "TARGET"}, # "iverson22football_dataframe.tsv": {"name": "iverson22football", "target": ""}, # "fruiht2018naturally_dataframe.tsv": {"name": "fruiht2018naturally", "target": ""}, "29621-0001-Data.tsv": {"name": "fairman2019marijuana", "target": "CLASS"}, diff --git a/SynRD/publication.py b/SynRD/publication.py index 6bfee50..a22aa12 100644 --- a/SynRD/publication.py +++ b/SynRD/publication.py @@ -82,8 +82,13 @@ def __init__(self, dataframe=None, description=None): self.dataframe = dataframe self.real_dataframe = dataframe else: - raise ValueError("Must set dataframe to initialize a paper class.") - + try: + self.dataframe = self._recreate_dataframe() + except NotImplementedError: + raise ValueError("Must set dataframe to initialize a paper class or implement _recreate_dataframe().") + except Exception as e: + raise ValueError(f"Couldn't initialize dataframe for paper.\nCaught {e}.") + self._description = description self.columns = self.real_dataframe.columns From e9988d172ec1058dde29a6693449f06fa5dc7336 Mon Sep 17 00:00:00 2001 From: aholovenko Date: Sat, 25 Feb 2023 18:39:18 +0200 Subject: [PATCH 3/3] add final features --- SynRD/features_extractor/features.json | 425 +++++++++++++++--- .../features_extractor/features_extractor.py | 111 +++-- 2 files changed, 443 insertions(+), 93 deletions(-) diff --git a/SynRD/features_extractor/features.json b/SynRD/features_extractor/features.json index ea77f0c..555464c 100644 --- a/SynRD/features_extractor/features.json +++ b/SynRD/features_extractor/features.json @@ -15,8 +15,8 @@ "num_to_cat": 0.75 }, "info-theory": { - "attr_conc.mean": 0.03990996507418588, - "attr_conc.sd": 0.1387983393422685, + "attr_conc.mean": 0.0458436925407551, + "attr_conc.sd": 0.13948556412198465, "attr_ent.mean": 2.2197528515718576, "attr_ent.sd": 1.5139967849269165, "class_conc.mean": 0.02703239097275378, @@ -89,88 +89,262 @@ "fairman2019marijuana": { "general": { "attr_to_inst": 0.00001703107489926119, - "cat_to_num": 0.0, + "cat_to_num": 4.0, "freq_class.mean": 0.16666666666666666, "freq_class.sd": 0.14470247838587796, "inst_to_attr": 58716.2, "nr_attr": 5, "nr_bin": 1, - "nr_cat": 0, + "nr_cat": 4, "nr_class": 6, "nr_inst": 293581, - "nr_num": 5, - "num_to_cat": null + "nr_num": 1, + "num_to_cat": 0.25 }, "info-theory": { - "attr_conc.mean": 0.008614358887037692, - "attr_conc.sd": 0.024789208781955833, - "attr_ent.mean": 2.5171242952379385, - "attr_ent.sd": 1.086706290676518, - "class_conc.mean": 0.07817078162837117, - "class_conc.sd": 0.14361375404460913, + "attr_conc.mean": 0.00862284316796299, + "attr_conc.sd": 0.024786672418147204, + "attr_ent.mean": 2.521293122340193, + "attr_ent.sd": 1.083027203766085, + "class_conc.mean": 0.07817273846496639, + "class_conc.sd": 0.14361253242900696, "class_ent": 2.1567101571814913, - "eq_num_attr": 8.473202572599119, - "joint_ent.mean": 4.419301394971244, - "joint_ent.sd": 1.0027439889770196, - "mut_inf.mean": 0.2545330574481863, - "mut_inf.sd": 0.4321564861621098, - "ns_ratio": 8.88918422020815 + "eq_num_attr": 8.469380238328613, + "joint_ent.mean": 4.423355348210112, + "joint_ent.sd": 1.0000881548804132, + "mut_inf.mean": 0.25464793131157215, + "mut_inf.sd": 0.4320763698298892, + "ns_ratio": 8.90109406879606 }, "itemset": { - "one_itemset.mean": 0.11111111111111109, - "one_itemset.sd": 0.12959929569442002, - "two_itemset.mean": 0.21220159151193635, - "two_itemset.sd": 0.15211769593415295 + "one_itemset.mean": 0.10869565217391304, + "one_itemset.sd": 0.1291662893565054, + "two_itemset.mean": 0.2068095838587642, + "two_itemset.sd": 0.15217973907789384 }, "statistical": { - "can_cor.mean": 0.27323666640127386, - "can_cor.sd": 0.37956564708808893, - "cor.mean": 0.07406275721610567, - "cor.sd": 0.1771369598863154, - "cov.mean": 1.304644717701642, - "cov.sd": 3.524904804376082, - "eigenvalues.mean": 14.621120087611876, - "eigenvalues.sd": 21.564255640650842, + "can_cor.mean": 0.28979750344903177, + "can_cor.sd": 0.36940043439051734, + "cor.mean": 0.033916475598542994, + "cor.sd": 0.04940358423991378, + "cov.mean": 0.01143781554280118, + "cov.sd": 0.05534029193569672, + "eigenvalues.mean": 1.86415005899953, + "eigenvalues.sd": 9.414023090683303, "g_mean.mean": null, "g_mean.sd": null, - "gravity": 13.76146670705108, + "gravity": 13.446017977834678, "h_mean.mean": 0.0, "h_mean.sd": 0.0, - "iq_range.mean": 5.8, - "iq_range.sd": 4.868264577855234, - "kurtosis.mean": -1.316388250277453, - "kurtosis.sd": 0.5530853223474288, - "lh_trace": 8.64405359804705, - "mad.mean": 2.9652000000000003, - "mad.sd": 3.1450695413615257, - "max.mean": 9.4, - "max.sd": 7.368853370776216, - "mean.mean": 3.862251984971779, - "mean.sd": 2.992971847542306, - "median.mean": 4.0, - "median.sd": 4.527692569068709, + "iq_range.mean": 0.5357142857142857, + "iq_range.sd": 2.645501322750992, + "kurtosis.mean": 15.126693058873796, + "kurtosis.sd": 38.79828388336963, + "lh_trace": 8.852862245584198, + "mad.mean": 0.26475, + "mad.sd": 1.4009253192087006, + "max.mean": 1.7142857142857142, + "max.sd": 3.7796447300922718, + "mean.mean": 0.3903238921164127, + "mean.sd": 1.518058035058907, + "median.mean": 0.39285714285714285, + "median.sd": 2.0788046015507495, "min.mean": 0.0, "min.sd": 0.0, - "nr_cor_attr": 0.1, + "nr_cor_attr": 0.0, "nr_disc": 5, - "nr_norm": 1.0, - "nr_outliers": 0, - "p_trace": 0.9495717011280036, - "range.mean": 9.4, - "range.sd": 7.368853370776216, - "roy_root": 8.589078392532363, - "sd.mean": 3.1637659684598214, - "sd.sd": 2.400964645831791, - "sd_ratio": 1.110047598819871, - "skewness.mean": 0.23240652488599678, - "skewness.sd": 0.49992378018674405, + "nr_norm": 11.0, + "nr_outliers": 26, + "p_trace": 0.9657396887380695, + "range.mean": 1.7142857142857142, + "range.sd": 3.7796447300922718, + "roy_root": 8.783409166902468, + "sd.mean": 0.5252950734977363, + "sd.sd": 1.2833697549409835, + "sd_ratio": null, + "skewness.mean": 3.2629118443516076, + "skewness.sd": 2.590241657401533, "sparsity.mean": 0.17584134858261163, "sparsity.sd": 0.1844815935639719, - "t_mean.mean": 3.639869655802758, - "t_mean.sd": 3.142390782055264, - "var.mean": 14.621120087611851, - "var.sd": 20.01485297669193, - "w_lambda": 0.0987625612414452 + "t_mean.mean": 0.30357294919966293, + "t_mean.sd": 1.512107557228859, + "var.mean": 1.8641500589990838, + "var.sd": 9.409284946491422, + "w_lambda": 0.09542781732881896 + } + }, + "fruiht2018naturally": { + "general": { + "attr_to_inst": 0.0023963575365444525, + "cat_to_num": 0.0, + "freq_class.mean": 0.07692307692307693, + "freq_class.sd": 0.09732973624554023, + "inst_to_attr": 417.3, + "nr_attr": 10, + "nr_bin": 8, + "nr_cat": 0, + "nr_class": 13, + "nr_inst": 4173, + "nr_num": 10, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.08443177796724899, + "attr_conc.sd": 0.22978046900859486, + "attr_ent.mean": 1.0156816369401709, + "attr_ent.sd": 0.6593692099153022, + "class_conc.mean": 0.12440914830592446, + "class_conc.sd": 0.3109366084954623, + "class_ent": 2.8175661528428, + "eq_num_attr": 27.168989187285476, + "joint_ent.mean": 3.72954256418844, + "joint_ent.sd": 0.7277575442618792, + "mut_inf.mean": 0.1037052255945305, + "mut_inf.sd": 0.2561992452064771, + "ns_ratio": 8.793929198045529 + }, + "itemset": { + "one_itemset.mean": 0.37037037037037035, + "one_itemset.sd": 0.28483736758582634, + "two_itemset.mean": 0.48417721518987344, + "two_itemset.sd": 0.20428261038074555 + }, + "statistical": { + "can_cor.mean": 0.16798613010999006, + "can_cor.sd": 0.30500122840629146, + "cor.mean": 0.1274799626397623, + "cor.sd": 0.18247284301061095, + "cov.mean": 0.030737729912916505, + "cov.sd": 0.04508699250430289, + "eigenvalues.mean": 0.5260493369349912, + "eigenvalues.sd": 0.9527701055836804, + "g_mean.mean": 1.5355053816343125, + "g_mean.sd": 4.855694365410508, + "gravity": 1.3251618277297676, + "h_mean.mean": 1.5251679969670537, + "h_mean.sd": 4.823004684812668, + "iq_range.mean": 0.8, + "iq_range.sd": 0.9189365834726815, + "kurtosis.mean": 1.7466195684725043, + "kurtosis.sd": 5.476645027785524, + "lh_trace": 175703465.65490502, + "mad.mean": 0.14826, + "mad.sd": 0.4688392858965639, + "max.mean": 3.4, + "max.sd": 6.310485101972924, + "mean.mean": 2.0397555715312725, + "mean.sd": 4.7284946820689155, + "median.mean": 2.1, + "median.sd": 4.557045826702499, + "min.mean": 1.1, + "min.sd": 3.478505426185217, + "nr_cor_attr": 0.06666666666666667, + "nr_disc": 10, + "nr_norm": 0.0, + "nr_outliers": 5, + "p_trace": 1.119425143057426, + "range.mean": 2.3, + "range.sd": 2.9832867780352594, + "roy_root": 175703465.5260015, + "sd.mean": 0.5881618650527289, + "sd.sd": 0.44735637972511794, + "sd_ratio": null, + "skewness.mean": 0.6322671598235938, + "skewness.sd": 1.7834962915526746, + "sparsity.mean": 0.4256199337575176, + "sparsity.sd": 0.1575698045779039, + "t_mean.mean": 2.0454690618762474, + "t_mean.sd": 4.73431893634828, + "var.mean": 0.5260493369349921, + "var.sd": 0.9372545899022917, + "w_lambda": 5.027594768321065E-9 + } + }, + "iverson22football": { + "general": { + "attr_to_inst": 0.014755959137343927, + "cat_to_num": 0.0, + "freq_class.mean": 0.5, + "freq_class.sd": 0.46070294256649064, + "inst_to_attr": 67.76923076923077, + "nr_attr": 26, + "nr_bin": 1, + "nr_cat": 0, + "nr_class": 2, + "nr_inst": 1762, + "nr_num": 26, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.25323533267430237, + "attr_conc.sd": 0.24824770519055797, + "attr_ent.mean": 0.4985664656944789, + "attr_ent.sd": 0.9924918415361424, + "class_conc.mean": 0.35137502737347687, + "class_conc.sd": 0.22793497984259123, + "class_ent": 0.6672989356393069, + "eq_num_attr": 189.4880525349412, + "joint_ent.mean": 1.1623438128921282, + "joint_ent.sd": 0.9914437226702077, + "mut_inf.mean": 0.0035215884416578626, + "mut_inf.sd": 0.00989859442438207, + "ns_ratio": 140.57431339698746 + }, + "itemset": { + "one_itemset.mean": 0.4406779661016949, + "one_itemset.sd": 0.42328118527056224, + "two_itemset.mean": 0.5191944619257395, + "two_itemset.sd": 0.3621615952024312 + }, + "statistical": { + "can_cor.mean": null, + "can_cor.sd": null, + "cor.mean": null, + "cor.sd": null, + "cov.mean": null, + "cov.sd": null, + "eigenvalues.mean": null, + "eigenvalues.sd": null, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": null, + "h_mean.mean": null, + "h_mean.sd": null, + "iq_range.mean": null, + "iq_range.sd": null, + "kurtosis.mean": null, + "kurtosis.sd": null, + "lh_trace": null, + "mad.mean": null, + "mad.sd": null, + "max.mean": null, + "max.sd": null, + "mean.mean": null, + "mean.sd": null, + "median.mean": null, + "median.sd": null, + "min.mean": null, + "min.sd": null, + "nr_cor_attr": 0.0, + "nr_disc": null, + "nr_norm": 18.0, + "nr_outliers": 4, + "p_trace": null, + "range.mean": null, + "range.sd": null, + "roy_root": null, + "sd.mean": null, + "sd.sd": null, + "sd_ratio": null, + "skewness.mean": null, + "skewness.sd": null, + "sparsity.mean": 0.2998697442692332, + "sparsity.sd": 0.17959195161730515, + "t_mean.mean": null, + "t_mean.sd": null, + "var.mean": null, + "var.sd": null, + "w_lambda": null } }, "jeong2021math": { @@ -189,8 +363,8 @@ "num_to_cat": null }, "info-theory": { - "attr_conc.mean": 0.013708609620212937, - "attr_conc.sd": 0.04805804686514912, + "attr_conc.mean": 0.010090896851239785, + "attr_conc.sd": 0.033703906974763255, "attr_ent.mean": 1.6774033809497375, "attr_ent.sd": 1.1130854889416772, "class_conc.mean": 0.016853385542393653, @@ -260,6 +434,44 @@ "w_lambda": 0.7080132738594047 } }, + "lee2021ability": { + "general": { + "attr_to_inst": 0.000548885077186964, + "cat_to_num": 0.0, + "freq_class.mean": 0.00006961849067112225, + "freq_class.sd": 0.00000837287816198932, + "inst_to_attr": 1821.875, + "nr_attr": 8, + "nr_bin": 1, + "nr_cat": 0, + "nr_class": 14364, + "nr_inst": 14575, + "nr_num": 8, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.0067981219400851745, + "attr_conc.sd": 0.015967117588566067, + "attr_ent.mean": 2.9264965465129515, + "attr_ent.sd": 1.2768155520476885, + "class_conc.mean": 0.9847774711734155, + "class_conc.sd": 0.0015393240740786877, + "class_ent": 13.802099195221773, + "eq_num_attr": 4.821933447480678, + "joint_ent.mean": 13.866237862773751, + "joint_ent.sd": 0.035495585619732886, + "mut_inf.mean": 2.8623578789609745, + "mut_inf.sd": 1.2421986714275994, + "ns_ratio": 0.022407633938233872 + }, + "itemset": { + "one_itemset.mean": 0.08602150537634405, + "one_itemset.sd": 0.10389229939062207, + "two_itemset.mean": 0.1674641148325359, + "two_itemset.sd": 0.1286196232436834 + }, + "statistical": null + }, "mushrooms": { "general": { "attr_to_inst": 0.0027080256031511572, @@ -346,5 +558,92 @@ "var.sd": null, "w_lambda": null } + }, + "saw2018cross": { + "general": { + "attr_to_inst": 0.0003952178638474459, + "cat_to_num": null, + "freq_class.mean": 0.5, + "freq_class.sd": 0.5121127068567723, + "inst_to_attr": 2530.25, + "nr_attr": 8, + "nr_bin": 4, + "nr_cat": 8, + "nr_class": 2, + "nr_inst": 20242, + "nr_num": 0, + "num_to_cat": 0.0 + }, + "info-theory": { + "attr_conc.mean": 0.0841495924145353, + "attr_conc.sd": 0.227670095417584, + "attr_ent.mean": 1.270132629600944, + "attr_ent.sd": 0.6556396727229159, + "class_conc.mean": 0.17201296378306205, + "class_conc.sd": 0.24549176852151852, + "class_ent": 0.5786639974253174, + "eq_num_attr": 4.06014146306263, + "joint_ent.mean": 1.7062735147924277, + "joint_ent.sd": 0.7180867078110545, + "mut_inf.mean": 0.14252311223383382, + "mut_inf.sd": 0.14517552019309857, + "ns_ratio": 7.911766026530994 + }, + "itemset": { + "one_itemset.mean": 0.24242424242424243, + "one_itemset.sd": 0.2623268785252508, + "two_itemset.mean": 0.3854625550660793, + "two_itemset.sd": 0.23647531117015527 + }, + "statistical": { + "can_cor.mean": 0.9415954799514438, + "can_cor.sd": null, + "cor.mean": 0.05235378612739712, + "cor.sd": 0.12169954685116255, + "cov.mean": 0.006570606870586722, + "cov.sd": 0.0210517529628225, + "eigenvalues.mean": 0.09794872814709431, + "eigenvalues.sd": 0.13949616361808648, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": 1.2696149183358842, + "h_mean.mean": 0.0, + "h_mean.sd": 0.0, + "iq_range.mean": 0.14423076923076922, + "iq_range.sd": 0.3476569927606586, + "kurtosis.mean": 42.944514865439814, + "kurtosis.sd": 90.49556593722505, + "lh_trace": 7.818501403000752, + "mad.mean": 0.02851153846153846, + "mad.sd": 0.1453808909779358, + "max.mean": 1.0, + "max.sd": 0.0, + "mean.mean": 0.14086096691570457, + "mean.sd": 0.15491377046678032, + "median.mean": 0.057692307692307696, + "median.sd": 0.21572774865200242, + "min.mean": 0.0, + "min.sd": 0.0, + "nr_cor_attr": 0.01846153846153846, + "nr_disc": 1, + "nr_norm": 0.0, + "nr_outliers": 22, + "p_trace": 0.8866020478649898, + "range.mean": 1.0, + "range.sd": 0.0, + "roy_root": 7.818501403000752, + "sd.mean": 0.27888261379095614, + "sd.sd": 0.14484524330162923, + "sd_ratio": null, + "skewness.mean": 4.6893540450473745, + "skewness.sd": 4.885739045540896, + "sparsity.mean": 0.3355326500196442, + "sparsity.sd": 0.17948327036390738, + "t_mean.mean": 0.06796159546036049, + "t_mean.sd": 0.16136020097951118, + "var.mean": 0.09794872814709499, + "var.sd": 0.08276203933293032, + "w_lambda": 0.11339795213501022 + } } } \ No newline at end of file diff --git a/SynRD/features_extractor/features_extractor.py b/SynRD/features_extractor/features_extractor.py index 94e521f..5a4da06 100644 --- a/SynRD/features_extractor/features_extractor.py +++ b/SynRD/features_extractor/features_extractor.py @@ -3,6 +3,7 @@ import math import numpy as np import pandas as pd +from typing import Tuple, Union, List from pymfe.mfe import MFE @@ -10,16 +11,32 @@ FEATURE_GROUPS = ["general", "statistical", "info-theory", "itemset"] MAPPINGS = { - # "saw2018cross_dataframe.tsv": {"name": "saw2018cross", "target": ""}, - # "lee2021ability_dataframe.tsv": {"name": "lee2021ability", "target": ""}, - "jeong2021math_dataframe.tsv": {"name": "jeong2021math", "target": "TARGET"}, - # "iverson22football_dataframe.tsv": {"name": "iverson22football", "target": ""}, - # "fruiht2018naturally_dataframe.tsv": {"name": "fruiht2018naturally", "target": ""}, - "29621-0001-Data.tsv": {"name": "fairman2019marijuana", "target": "CLASS"}, - "adult.data": {"name": "adult", "target": "income", "columns": ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]}, - "agaricus-lepiota.data": {"name": "mushrooms", "target": "class", "columns": ["class", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]}, + "29621-0001-Data.tsv": {"name": "fairman2019marijuana", "target": "CLASS", + "cat_cols": ["YEAR", "CLASS", "SEX", "AGE", "RACE"]}, + "adult.data": {"name": "adult", "target": "income", + "columns": ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", + "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", + "hours-per-week", "native-country", "income"], + "cat_cols": ["workclass", "education", "marital-status", "occupation", "relationship", "race", + "sex", "native-country", "income"]}, + "agaricus-lepiota.data": {"name": "mushrooms", "target": "class", + "columns": ["class", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor", + "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", + "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", + "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", + "ring-number", "ring-type", "spore-print-color", "population", "habitat"], + "cat_cols": "all"}, + "saw2018cross_dataframe.tsv": {"name": "saw2018cross", "target": "stem_career_aspirations", "cat_cols": "all"}, + "lee2021ability_dataframe.tsv": {"name": "lee2021ability", "target": "math", "cat_cols": "auto"}, + "jeong2021math_dataframe.tsv": {"name": "jeong2021math", "target": "TARGET", "cat_cols": "auto"}, + "iverson22football_dataframe.tsv": {"name": "iverson22football", "target": "H5ID6G"}, + "fruiht2018naturally_dataframe.tsv": {"name": "fruiht2018naturally", "target": "EDU_ATTAINED"}, } +PAPER2TARGET = {d['name']: d['target'] for d in MAPPINGS.values()} +PAPER2FEATURES = {d['name']: d.get('features') for d in MAPPINGS.values()} +PAPER2CAT_COLS = {d['name']: d.get('cat_cols', 'auto') for d in MAPPINGS.values()} + def load_data(input_file_path, column_names=None): if input_file_path.endswith('.tsv'): @@ -42,33 +59,26 @@ def load_dataframes(input_dir='data/papers'): return dataframes -def get_target(dataframe, target, features=None): +def features_target_split(dataframe, target, features=None) -> Tuple[pd.DataFrame, pd.DataFrame]: if features is not None: features = features + [target] dataframe = dataframe[features] - X = dataframe.drop(columns=[target], errors='ignore').to_numpy() - y = dataframe[[target]].to_numpy() + X = dataframe.drop(columns=[target], errors='ignore') + y = dataframe[[target]] return X, y -def __replace_nans(vals): - return [None if math.isnan(v) else v for v in vals] +def get_categorical_columns(dataframe_name: str, X: pd.DataFrame) -> Union[str, List[int]]: + cat_cols = PAPER2CAT_COLS[dataframe_name] + if cat_cols == 'all': + return [i for i in range(len(X.columns))] + if cat_cols == 'auto': + return cat_cols + return [i for i, c in enumerate(X.columns) if c in cat_cols] -def get_features(dataframes): - features = dict() - name2target = {d['name']: d['target'] for d in MAPPINGS.values()} - name2features = {d['name']: d.get('features') for d in MAPPINGS.values()} - for dataframe_name, dataframe in dataframes.items(): - X, y = get_target(dataframe, features=name2features[dataframe_name], target=name2target[dataframe_name]) - dataframe_features = dict() - for group_name in FEATURE_GROUPS: - mfe = MFE(groups=group_name) - mfe.fit(X, y) - features_names, features_vals = mfe.extract() - dataframe_features[group_name] = dict(zip(features_names, __replace_nans(features_vals))) - features[dataframe_name] = dataframe_features - return features +def __replace_nans(vals): + return [None if math.isnan(v) else v for v in vals] class NpEncoder(json.JSONEncoder): @@ -81,11 +91,52 @@ def default(self, obj): super(NpEncoder, self).default(obj) +def get_features_from_files(): + features = dict() + for peper_name in PAPER2TARGET.keys(): + file_name = f'features_{peper_name}.json' + features[peper_name] = json.load(open(file_name)) + return features + + +def calculate_features(dataframe_name, dataframe): + X, y = features_target_split(dataframe, features=PAPER2FEATURES[dataframe_name], + target=PAPER2TARGET[dataframe_name]) + dataframe_features = dict() + for group_name in FEATURE_GROUPS: + mfe = MFE(groups=group_name) + cat_cols = get_categorical_columns(dataframe_name, X) + mfe.fit(X.to_numpy(), y.to_numpy(), cat_cols=cat_cols) + features_names, features_vals = mfe.extract() + dataframe_features[group_name] = dict(zip(features_names, __replace_nans(features_vals))) + print(f'Calculated features for {dataframe_name}') + return dataframe_features + + +def save_to_file(objs, file_name): + with open(file_name, 'w') as output_file: + json.dump(objs, output_file, cls=NpEncoder) + print(f'Saved to file {file_name}') + + +def load_from_file(file_name): + objs = json.load(open(file_name)) + print(f'Loaded from {file_name}') + return objs + + def main(): dataframes = load_dataframes() - features = get_features(dataframes) - with open('features.json', 'w') as output_file: - json.dump(features, output_file, cls=NpEncoder) + features = dict() + for dataframe_name, dataframe in dataframes.items(): + features_file_name = f'features_{dataframe_name}.json' + if os.path.exists(features_file_name): + dataframe_features = load_from_file(features_file_name) + else: + dataframe_features = calculate_features(dataframe_name, dataframe) + save_to_file(dataframe_features, features_file_name) + features[dataframe_name] = dataframe_features + save_to_file(features, 'features.json') if __name__ == '__main__':