1. Add embedding_dim to numeric features

2. Update examples in the demo
reczoo · May 20, 2024 · c9f70a5
1 parent 7028d3e
commit c9f70a5
Show file tree

Hide file tree

Showing 29 changed files with 332 additions and 104 deletions.
diff --git a/data/tiny_parquet/feature_map.json b/data/tiny_parquet/feature_map.json
@@ -0,0 +1,123 @@
+{
+    "dataset_id": "tiny_parquet",
+    "num_fields": 14,
+    "total_features": 485,
+    "input_length": 14,
+    "labels": [
+        "clk"
+    ],
+    "features": [
+        {
+            "userid": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 26
+            }
+        },
+        {
+            "adgroup_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 96
+            }
+        },
+        {
+            "pid": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        },
+        {
+            "cate_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 49
+            }
+        },
+        {
+            "campaign_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 99
+            }
+        },
+        {
+            "customer": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 98
+            }
+        },
+        {
+            "brand": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 67
+            }
+        },
+        {
+            "cms_segid": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 11
+            }
+        },
+        {
+            "cms_group_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 11
+            }
+        },
+        {
+            "final_gender_code": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        },
+        {
+            "age_level": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 7
+            }
+        },
+        {
+            "pvalue_level": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        },
+        {
+            "shopping_level": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 5
+            }
+        },
+        {
+            "occupation": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        }
+    ]
+}
diff --git a/data/tiny_parquet/test.parquet b/data/tiny_parquet/test.parquet
diff --git a/data/tiny_parquet/train.parquet b/data/tiny_parquet/train.parquet
diff --git a/data/tiny_parquet/valid.parquet b/data/tiny_parquet/valid.parquet
diff --git a/demo/config/example1_config/dataset_config.yaml b/demo/config/example1_config/dataset_config.yaml
@@ -11,4 +11,3 @@ tiny_example1:
                  "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
                  active: True, dtype: str, type: categorical}]
     label_col: {name: clk, dtype: float}
-
diff --git a/demo/config/example2_config/dataset_config.yaml b/demo/config/example2_config/dataset_config.yaml
@@ -1,7 +1,7 @@
 ### Tiny data for demo only
-tiny_npz:
+tiny_parquet:
     data_root: ../data/
-    data_format: npz
-    train_data: ../data/tiny_npz/train.npz
-    valid_data: ../data/tiny_npz/valid.npz
-    test_data: ../data/tiny_npz/test.npz
+    data_format: parquet
+    train_data: ../data/tiny_parquet/train.parquet
+    valid_data: ../data/tiny_parquet/valid.parquet
+    test_data: ../data/tiny_parquet/test.parquet
diff --git a/demo/config/example2_config/model_config.yaml b/demo/config/example2_config/model_config.yaml
@@ -12,9 +12,9 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_npz:
+DeepFM_test_parquet:
     model: DeepFM
-    dataset_id: tiny_npz
+    dataset_id: tiny_parquet
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification

diff --git a/demo/config/example3_config/dataset_config.yaml b/demo/config/example3_config/dataset_config.yaml
@@ -1,14 +1,7 @@
 ### Tiny data for demo only
-tiny_example3:
+tiny_npz:
     data_root: ../data/
-    data_format: csv
-    train_data: ../data/tiny_csv/train_sample.csv
-    valid_data: ../data/tiny_csv/valid_sample.csv
-    test_data: ../data/tiny_csv/test_sample.csv
-    min_categr_count: 1
-    feature_cols:
-        [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
-                 "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
-                 active: True, dtype: str, type: categorical}]
-    label_col: {name: clk, dtype: float}
-
+    data_format: npz
+    train_data: ../data/tiny_npz/train.npz
+    valid_data: ../data/tiny_npz/valid.npz
+    test_data: ../data/tiny_npz/test.npz
diff --git a/demo/config/example3_config/model_config.yaml b/demo/config/example3_config/model_config.yaml
@@ -12,9 +12,9 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_csv:
+DeepFM_test_npz:
     model: DeepFM
-    dataset_id: tiny_example3
+    dataset_id: tiny_npz
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification
@@ -30,6 +30,6 @@ DeepFM_test_csv:
     embedding_dim: 4
     epochs: 1
     shuffle: True
-    seed: 2019
+    seed: 2023
     monitor: 'AUC'
     monitor_mode: 'max'
diff --git a/demo/config/example4_config/dataset_config.yaml b/demo/config/example4_config/dataset_config.yaml
@@ -7,9 +7,7 @@ tiny_example4:
     test_data: ../data/tiny_csv/test_sample.csv
     min_categr_count: 1
     feature_cols:
-        [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz",
-          embedding_dim: 8, freeze_emb: True},
-         {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
+        [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
                  "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
                  active: True, dtype: str, type: categorical}]
     label_col: {name: clk, dtype: float}
diff --git a/demo/config/example4_config/model_config.yaml b/demo/config/example4_config/model_config.yaml
@@ -12,7 +12,7 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_pretrain: 
+DeepFM_test_csv:
     model: DeepFM
     dataset_id: tiny_example4
     loss: 'binary_crossentropy'
@@ -27,10 +27,9 @@ DeepFM_test_pretrain:
     batch_norm: False
     net_dropout: 0
     batch_size: 128
-    embedding_dim: 8
+    embedding_dim: 4
     epochs: 1
     shuffle: True
-    seed: 2023
+    seed: 2019
     monitor: 'AUC'
     monitor_mode: 'max'
-
diff --git a/demo/config/example5_config/dataset_config.yaml b/demo/config/example5_config/dataset_config.yaml
@@ -1,7 +1,15 @@
 ### Tiny data for demo only
-tiny_seq:
+tiny_example5:
     data_root: ../data/
-    data_format: npz
-    train_data: ../data/tiny_seq/train.npz
-    valid_data: ../data/tiny_seq/valid.npz
-    test_data: ../data/tiny_seq/test.npz
+    data_format: csv
+    train_data: ../data/tiny_csv/train_sample.csv
+    valid_data: ../data/tiny_csv/valid_sample.csv
+    test_data: ../data/tiny_csv/test_sample.csv
+    min_categr_count: 1
+    feature_cols:
+        [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz",
+          embedding_dim: 8, freeze_emb: True},
+         {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
+                 "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
+                 active: True, dtype: str, type: categorical}]
+    label_col: {name: clk, dtype: float}
diff --git a/demo/config/example5_config/model_config.yaml b/demo/config/example5_config/model_config.yaml
@@ -12,32 +12,25 @@ Base:
     feature_specs: null
     feature_config: null
 
-DIN_test:
-    model: DIN
-    dataset_id: tiny_seq
+DeepFM_test_pretrain: 
+    model: DeepFM
+    dataset_id: tiny_example5
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification
     optimizer: adam
-    learning_rate: 1.0e-3
-    embedding_regularizer: 0
+    hidden_units: [64, 32]
+    hidden_activations: relu
     net_regularizer: 0
-    batch_size: 128
-    embedding_dim: 4
-    dnn_hidden_units: [64, 32]
-    dnn_activations: relu
-    attention_hidden_units: [64]
-    attention_hidden_activations: "Dice"
-    attention_output_activation: null
-    attention_dropout: 0
-    din_target_field: adgroup_id
-    din_sequence_field: click_sequence
-    feature_specs: [{name: click_sequence, feature_encoder: null}]
-    net_dropout: 0
+    embedding_regularizer: 1.e-8
+    learning_rate: 1.e-3
     batch_norm: False
+    net_dropout: 0
+    batch_size: 128
+    embedding_dim: 8
     epochs: 1
     shuffle: True
-    seed: 2019
+    seed: 2023
     monitor: 'AUC'
     monitor_mode: 'max'
-    
+
diff --git a/demo/config/example6_config/dataset_config.yaml b/demo/config/example6_config/dataset_config.yaml
@@ -1,21 +1,7 @@
 ### Tiny data for demo only
-tiny_example6:
+tiny_seq:
     data_root: ../data/
-    data_format: csv 
-    train_data: ../data/tiny_csv/custom_preprocess_train_sample.csv
-    valid_data: ../data/tiny_csv/custom_preprocess_valid_sample.csv
-    test_data: ../data/tiny_csv/custom_preprocess_test_sample.csv
-    min_categr_count: 1
-    feature_cols:
-        -   active: true
-            dtype: str
-            name: [msno, song_id, source_system_tab, source_screen_name, source_type,
-                city, gender, registered_via, language]
-            type: categorical
-        - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: genre_ids,
-            type: sequence}
-        - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: artist_name,
-            type: sequence}
-        - {active: true, dtype: str, name: isrc, preprocess: extract_country_code, type: categorical}
-        - {active: true, dtype: str, name: bd, preprocess: bucketize_age, type: categorical}
-    label_col: {dtype: float, name: label}
+    data_format: npz
+    train_data: ../data/tiny_seq/train.npz
+    valid_data: ../data/tiny_seq/valid.npz
+    test_data: ../data/tiny_seq/test.npz
diff --git a/demo/config/example6_config/model_config.yaml b/demo/config/example6_config/model_config.yaml
@@ -12,24 +12,32 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_csv:
-    model: DeepFM
-    dataset_id: tiny_example6
+DIN_test:
+    model: DIN
+    dataset_id: tiny_seq
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification
     optimizer: adam
-    hidden_units: [64, 32]
-    hidden_activations: relu
+    learning_rate: 1.0e-3
+    embedding_regularizer: 0
     net_regularizer: 0
-    embedding_regularizer: 1.e-8
-    learning_rate: 1.e-3
-    batch_norm: False
-    net_dropout: 0
     batch_size: 128
     embedding_dim: 4
+    dnn_hidden_units: [64, 32]
+    dnn_activations: relu
+    attention_hidden_units: [64]
+    attention_hidden_activations: "Dice"
+    attention_output_activation: null
+    attention_dropout: 0
+    din_target_field: adgroup_id
+    din_sequence_field: click_sequence
+    feature_specs: [{name: click_sequence, feature_encoder: null}]
+    net_dropout: 0
+    batch_norm: False
     epochs: 1
     shuffle: True
     seed: 2019
     monitor: 'AUC'
     monitor_mode: 'max'
+
diff --git a/demo/config/example7_config/dataset_config.yaml b/demo/config/example7_config/dataset_config.yaml
@@ -0,0 +1,21 @@
+### Tiny data for demo only
+tiny_example7:
+    data_root: ../data/
+    data_format: csv 
+    train_data: ../data/tiny_csv/custom_preprocess_train_sample.csv
+    valid_data: ../data/tiny_csv/custom_preprocess_valid_sample.csv
+    test_data: ../data/tiny_csv/custom_preprocess_test_sample.csv
+    min_categr_count: 1
+    feature_cols:
+        -   active: true
+            dtype: str
+            name: [msno, song_id, source_system_tab, source_screen_name, source_type,
+                city, gender, registered_via, language]
+            type: categorical
+        - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: genre_ids,
+            type: sequence}
+        - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: artist_name,
+            type: sequence}
+        - {active: true, dtype: str, name: isrc, preprocess: extract_country_code, type: categorical}
+        - {active: true, dtype: str, name: bd, preprocess: bucketize_age, type: categorical}
+    label_col: {dtype: float, name: label}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -11,4 +11,3 @@ tiny_example1:
		"cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"],
		active: True, dtype: str, type: categorical}]
		label_col: {name: clk, dtype: float}