From dda3bac2b6cf523ce1f2e08b152feef4f9cca17e Mon Sep 17 00:00:00 2001
From: Kirill <kirill.petrov@intel.com>
Date: Mon, 7 Jun 2021 13:07:13 +0300
Subject: [PATCH] Add reduced sklearn config for blog (#73)

---
 configs/blogs/skl_conda_config.json | 427 ++++++++++++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100755 configs/blogs/skl_conda_config.json

diff --git a/configs/blogs/skl_conda_config.json b/configs/blogs/skl_conda_config.json
new file mode 100755
index 000000000..07557d2bf
--- /dev/null
+++ b/configs/blogs/skl_conda_config.json
@@ -0,0 +1,427 @@
+{
+    "common": {
+        "lib": ["sklearn"],
+        "data-format": ["pandas"],
+        "data-order": ["F"],
+        "dtype": ["float64"]
+    },
+    "cases": [
+        {
+            "algorithm": "kmeans",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 1000,
+                    "n_features": 20,
+                    "training": {
+                        "n_samples": 1000000
+                    }
+                }
+            ],
+            "time-method": ["box_filter"],
+            "time-limit": [50],
+            "n-clusters": [1000],
+            "maxiter": [50],
+            "tol": [0.0]
+        },
+        {
+            "algorithm": "kmeans",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 5,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 10000000
+                    }
+                }
+            ],
+            "time-method": ["box_filter"],
+            "time-limit": [50],
+            "n-clusters": [5],
+            "maxiter": [50],
+            "init": ["k-means++"],
+            "tol": [0.0]
+        },
+        {
+            "algorithm": "kmeans",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 20,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 3000000
+                    }
+                }
+            ],
+            "time-method": ["box_filter"],
+            "time-limit": [50],
+            "n-clusters": [20],
+            "maxiter": [50],
+            "tol": [0.0]
+        },
+        {
+            "algorithm": "pca",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 100,
+                    "training": {
+                        "n_samples": 1000000
+                    },
+                    "testing": {
+                        "n_samples": 100000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 2000,
+                    "training": {
+                        "n_samples": 10000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 1000,
+                    "training": {
+                        "n_samples": 30000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 4000,
+                    "training": {
+                        "n_samples": 6000
+                    }
+                }
+            ],
+            "svd-solver": ["full"],
+            "n-components": [10]
+        },
+        {
+            "algorithm": "df_clsf",
+            "dtype": ["float32"],
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": [50],
+            "max-depth": [16],
+            "max-leaf-nodes": [131072],
+            "max-features": [0.2]
+        },
+        {
+            "algorithm": "ridge",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "regression",
+                    "n_features": 20,
+                    "training": {
+                        "n_samples": 10000000
+                    }
+                }
+            ],
+            "alpha": [5]
+        },
+        {
+            "algorithm": "linear",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "regression",
+                    "n_features": 20,
+                    "training": {
+                        "n_samples": 10000000
+                    }
+                }
+            ]
+        },
+        {
+            "algorithm": "log_reg",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 20,
+                    "training": {
+                        "n_samples": 10000000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 100,
+                    "training": {
+                        "n_samples": 2000000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 20,
+                    "training": {
+                        "n_samples": 10000000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 100,
+                    "training": {
+                        "n_samples": 2000000
+                    }
+                }
+            ],
+            "maxiter": [100],
+            "tol": [0]
+        },
+        {
+            "algorithm": "svm",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "a9a",
+                    "training":
+                    {
+                        "x": "data/a9a_x_train.npy",
+                        "y": "data/a9a_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/a9a_x_test.npy",
+                        "y": "data/a9a_y_test.npy"
+                    }
+                }
+            ],
+            "C": [500.0],
+            "kernel": ["rbf"]
+        },
+        {
+            "algorithm": "svm",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "gisette",
+                    "training":
+                    {
+                        "x": "data/gisette_x_train.npy",
+                        "y": "data/gisette_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/gisette_x_test.npy",
+                        "y": "data/gisette_y_test.npy"
+                    }
+                }
+            ],
+            "C": [1.5e-3],
+            "kernel": ["linear"]
+        },
+        {
+            "algorithm": "svm",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "connect",
+                    "training":
+                    {
+                        "x": "data/connect_x_train.npy",
+                        "y": "data/connect_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/connect_x_test.npy",
+                        "y": "data/connect_y_test.npy"
+                    }
+                }
+            ],
+            "C": [100.0],
+            "kernel": ["linear"]
+        },
+        {
+            "algorithm": "svm",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "C": [50.0],
+            "kernel": ["rbf"]
+        },
+        {
+            "algorithm": "dbscan",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 50,
+                    "n_features": 3,
+                    "training": {
+                        "n_samples": 500000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 50,
+                    "n_features": 10,
+                    "training": {
+                        "n_samples": 500000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 100,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 500000
+                    }
+                }
+            ]
+        },
+        {
+            "algorithm": "knn_clsf",
+            "dtype": ["float32"],
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 3,
+                    "training": {
+                        "n_samples": 100000
+                    },
+                    "testing": {
+                        "n_samples": 100000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 10,
+                    "training": {
+                        "n_samples": 100000
+                    },
+                    "testing": {
+                        "n_samples": 100000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 20000
+                    },
+                    "testing": {
+                        "n_samples": 20000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 10,
+                    "n_features": 16,
+                    "training": {
+                        "n_samples": 250000
+                    },
+                    "testing": {
+                        "n_samples": 250000
+                    }
+                }
+            ],
+            "method": ["brute"]
+        },
+        {
+            "algorithm": "knn_clsf",
+            "dtype": ["float32"],
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 20000
+                    },
+                    "testing": {
+                        "n_samples": 20000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 10,
+                    "n_features": 16,
+                    "training": {
+                        "n_samples": 250000
+                    },
+                    "testing": {
+                        "n_samples": 250000
+                    }
+                }
+            ],
+            "method": ["kd_tree"]
+        },
+        {
+            "algorithm": "train_test_split",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 100,
+                    "training": {
+                        "n_samples": 1000000
+                    }
+                }
+            ],
+            "include-y": [""],
+            "train-size": [0.75],
+            "test-size": [0.25]
+        }
+    ]
+}