From 9551f330a90c43a2264dd7d8fa4bb2e6fba968fc Mon Sep 17 00:00:00 2001
From: xpai
Date: Thu, 11 Jul 2024 15:30:51 +0800
Subject: [PATCH] + Fix #94 (typo error in tf feature_embedding)

+ Fix issue in build_dataset for skipping rebuilding dataset
+ Fix typo error in DIEN
---
 CHANGELOG.md                                  |  7 +-
 fuxictr/preprocess/build_dataset.py           | 74 +++++++++----------
 .../layers/embeddings/feature_embedding.py    |  2 +-
 model_zoo/DIEN/src/DIEN.py                    |  2 +-
 4 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9de1f7..d61dc86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,10 +4,13 @@
 [Doing] Add support for saving pb file, exporting embeddings
 [Doing] Add support of multi-gpu training
 
-**FuxiCTR v2.3.2, 2024-07-14**
-+ [Fix] Fix typo error in copy_from of version v2.3.1
+**FuxiCTR v2.3.2, 2024-07-11**
 + [Feature] Add TransAct model
 + [Feature] Add new feature type `embedding`, supporting [`meta`, `numeric`, `embedding`, `categorical`, `sequence`]
++ [Fix] Fix typo error in copy_from of version v2.3.1
++ [Fix] Fix issue in build_dataset for skipping rebuilding dataset
++ [Fix] Fix typo error in DIEN (AUGRUCell->AGRUCell)
++ [Fix] Fix typo error in feature_embedding of tf version ([#94](https://github.com/reczoo/FuxiCTR/issues/94))
 
 **FuxiCTR v2.3.1, 2024-06-09**
 + [Fix] Fix customized preprocessors based on polars and update demos

diff --git a/fuxictr/preprocess/build_dataset.py b/fuxictr/preprocess/build_dataset.py
index 8f66185..33666a5 100644
--- a/fuxictr/preprocess/build_dataset.py
+++ b/fuxictr/preprocess/build_dataset.py
@@ -82,47 +82,47 @@ def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=N
     if rebuild_dataset:
         feature_map_path = os.path.join(feature_encoder.data_dir, "feature_map.json")
         if os.path.exists(feature_map_path):
-            logging.warn(f"Skip rebuilding {feature_map_path}. "
" + "Please delete it manually if rebuilding is required.") + else: + # Load data files + train_ddf = feature_encoder.read_data(train_data, **kwargs) + valid_ddf = None + test_ddf = None - # Load data files - train_ddf = feature_encoder.read_data(train_data, **kwargs) - valid_ddf = None - test_ddf = None - - # Split data for train/validation/test - if valid_size > 0 or test_size > 0: - valid_ddf = feature_encoder.read_data(valid_data, **kwargs) - test_ddf = feature_encoder.read_data(test_data, **kwargs) - # TODO: check split_train_test in lazy mode - train_ddf, valid_ddf, test_ddf = split_train_test(train_ddf, valid_ddf, test_ddf, - valid_size, test_size, split_type) - - # fit and transform train_ddf - train_ddf = feature_encoder.preprocess(train_ddf) - feature_encoder.fit(train_ddf, rebuild_dataset=True, **kwargs) - transform(feature_encoder, train_ddf, 'train', block_size=data_block_size) - del train_ddf - gc.collect() - - # Transfrom valid_ddf - if valid_ddf is None and (valid_data is not None): - valid_ddf = feature_encoder.read_data(valid_data, **kwargs) - if valid_ddf is not None: - valid_ddf = feature_encoder.preprocess(valid_ddf) - transform(feature_encoder, valid_ddf, 'valid', block_size=data_block_size) - del valid_ddf + # Split data for train/validation/test + if valid_size > 0 or test_size > 0: + valid_ddf = feature_encoder.read_data(valid_data, **kwargs) + test_ddf = feature_encoder.read_data(test_data, **kwargs) + # TODO: check split_train_test in lazy mode + train_ddf, valid_ddf, test_ddf = split_train_test(train_ddf, valid_ddf, test_ddf, + valid_size, test_size, split_type) + + # fit and transform train_ddf + train_ddf = feature_encoder.preprocess(train_ddf) + feature_encoder.fit(train_ddf, rebuild_dataset=True, **kwargs) + transform(feature_encoder, train_ddf, 'train', block_size=data_block_size) + del train_ddf gc.collect() - # Transfrom test_ddf - if test_ddf is None and (test_data is not None): - test_ddf = feature_encoder.read_data(test_data, **kwargs) - if test_ddf is not None: - test_ddf = feature_encoder.preprocess(test_ddf) - transform(feature_encoder, test_ddf, 'test', block_size=data_block_size) - del test_ddf - gc.collect() - logging.info("Transform csv data to parquet done.") + # Transfrom valid_ddf + if valid_ddf is None and (valid_data is not None): + valid_ddf = feature_encoder.read_data(valid_data, **kwargs) + if valid_ddf is not None: + valid_ddf = feature_encoder.preprocess(valid_ddf) + transform(feature_encoder, valid_ddf, 'valid', block_size=data_block_size) + del valid_ddf + gc.collect() + + # Transfrom test_ddf + if test_ddf is None and (test_data is not None): + test_ddf = feature_encoder.read_data(test_data, **kwargs) + if test_ddf is not None: + test_ddf = feature_encoder.preprocess(test_ddf) + transform(feature_encoder, test_ddf, 'test', block_size=data_block_size) + del test_ddf + gc.collect() + logging.info("Transform csv data to parquet done.") train_data, valid_data, test_data = ( os.path.join(feature_encoder.data_dir, "train"), \ diff --git a/fuxictr/tensorflow/layers/embeddings/feature_embedding.py b/fuxictr/tensorflow/layers/embeddings/feature_embedding.py index 4711fea..95fc9a5 100644 --- a/fuxictr/tensorflow/layers/embeddings/feature_embedding.py +++ b/fuxictr/tensorflow/layers/embeddings/feature_embedding.py @@ -91,7 +91,7 @@ def __init__(self, continue if feature_spec["type"] == "numeric": - self.embedding_layers[feature] = tf.keras.layers.Dense(feat_emb_dim, user_bias=False) + self.embedding_layers[feature] = 
             elif feature_spec["type"] == "categorical":
                 padding_idx = feature_spec.get("padding_idx", None)
                 embedding_matrix = Embedding(feature_spec["vocab_size"],

diff --git a/model_zoo/DIEN/src/DIEN.py b/model_zoo/DIEN/src/DIEN.py
index 43d621f..cad6556 100644
--- a/model_zoo/DIEN/src/DIEN.py
+++ b/model_zoo/DIEN/src/DIEN.py
@@ -325,7 +325,7 @@ def __init__(self, input_size, hidden_size, bias=True, gru_type='AUGRU'):
         if gru_type == "AUGRU":
             self.gru_cell = AUGRUCell(input_size, hidden_size, bias=bias)
         elif gru_type == "AGRU":
-            self.gru_cell = AUGRUCell(input_size, hidden_size, bias=bias)
+            self.gru_cell = AGRUCell(input_size, hidden_size, bias=bias)
 
     def forward(self, packed_seq_emb, attn_score=None, h=None):
         assert isinstance(packed_seq_emb, PackedSequence) and isinstance(attn_score, PackedSequence), \
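
A note on the build_dataset hunk above: before this patch, the function logged the skip warning but still fell through and rebuilt the dataset; the added `else:` branch makes an existing feature_map.json short-circuit the whole rebuild. A minimal sketch of the fixed control flow, using a simplified, hypothetical `encoder` object rather than the real FuxiCTR API:

import logging
import os

def build_dataset_sketch(encoder, data_dir, train_data, **kwargs):
    # Hypothetical, simplified stand-in for fuxictr.preprocess.build_dataset
    feature_map_path = os.path.join(data_dir, "feature_map.json")
    if os.path.exists(feature_map_path):
        # Fixed behavior: an existing feature map skips everything below.
        logging.warning(f"Skip rebuilding {feature_map_path}. "
                        "Please delete it manually if rebuilding is required.")
    else:
        # Reached only on a fresh build, so finished artifacts are never redone.
        train_ddf = encoder.read_data(train_data, **kwargs)
        train_ddf = encoder.preprocess(train_ddf)
        encoder.fit(train_ddf, **kwargs)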
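
On the feature_embedding hunk: `user_bias` is not a valid keyword for `tf.keras.layers.Dense` (the parameter is `use_bias`), so constructing the layer for a numeric feature fails with a TypeError, which is the breakage behind #94. The bias is disabled because a numeric feature x is embedded as the linear map x * w, one learned vector per field scaled by the raw value. A small illustration (hypothetical embedding size, not FuxiCTR's actual layer):

import tensorflow as tf

# Embed a scalar numeric feature as x * w: a Dense layer with no bias
# applied to shape (batch, 1) learns a single direction w of size emb_dim,
# scaled by the feature value. A bias would shift every sample by a
# constant vector, breaking the pure scaling behavior.
embed_numeric = tf.keras.layers.Dense(8, use_bias=False)  # emb_dim=8 (hypothetical)
x = tf.constant([[0.5], [2.0]])  # a batch of two scalar values
emb = embed_numeric(x)           # shape (2, 8); second row is 4x the first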
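
On the DIEN hunk: the `gru_type == "AGRU"` branch previously constructed an AUGRUCell, so requesting AGRU silently ran AUGRU. Per the DIEN paper, the two cells differ only in how the attention score a_t enters the state update: AGRU uses a_t in place of the update gate, while AUGRU rescales the learned gate by a_t. A scalar sketch of that one-line difference (hypothetical toy values, PyTorch as in model_zoo/DIEN):

import torch

# Toy per-unit quantities: previous state, candidate state,
# learned update gate, and attention score in [0, 1].
h_prev, h_tilde = torch.tensor(0.2), torch.tensor(0.9)
u, a = torch.tensor(0.7), torch.tensor(0.4)

h_agru = (1 - a) * h_prev + a * h_tilde           # AGRU: a replaces the gate
u_att = a * u                                     # AUGRU: a rescales the gate
h_augru = (1 - u_att) * h_prev + u_att * h_tilde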