Fix issues #84, #85
xpai committed Apr 16, 2024
1 parent 5b9260e commit 4646e25
Showing 5 changed files with 15 additions and 8 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md

```diff
@@ -4,6 +4,10 @@
 [Doing] Add support for saving pb file, exporting embeddings
 [Doing] Add support of NVTabular data
 
+**FuxiCTR v2.2.1, 2024-04-16**
++ [Fix] Fix issue of evaluation not performed at epoch end when streaming=True ([#85](https://github.com/xue-pai/FuxiCTR/issues/85))
++ [Fix] Fix issue when loading pretrain_emb in npz format ([#84](https://github.com/xue-pai/FuxiCTR/issues/84))
+
 **FuxiCTR v2.2.0, 2024-02-17**
 + [Feature] Add support of npz format for pretrained_emb
 + [Refactor] Change data format from h5 to npz
@@ -16,7 +20,7 @@
 + [Feature] Add GDCN model
 + [Refactor] Rename FINAL model to FinalNet
 + [Refactor] Update RecZoo URLs
-+ [Fix] Fix bug #75
++ [Fix] Fix bug [#75](https://github.com/xue-pai/FuxiCTR/issues/75)
 + [Fix] Fix h5 file extension issue
 + [Fix] Fix typo in FinalNet
```
10 changes: 7 additions & 3 deletions fuxictr/preprocess/tokenizer.py

```diff
@@ -127,9 +127,13 @@ def encode_sequence(self, texts):
         return np.array(sequence_list)
 
     def load_pretrained_vocab(self, feature_dtype, pretrain_path, expand_vocab=True):
-        with h5py.File(pretrain_path, 'r') as hf:
-            keys = hf["key"][:]
-            keys = keys.astype(feature_dtype) # in case mismatch of dtype between int and str
+        if pretrain_path.endswith(".h5"):
+            with h5py.File(pretrain_path, 'r') as hf:
+                keys = hf["key"][:]
+                # in case mismatch of dtype between int and str
+                keys = keys.astype(feature_dtype)
+        elif pretrain_path.endswith(".npz"):
+            keys = np.load(pretrain_path)["key"]
         # Update vocab with pretrained keys in case new tokens appear in validation or test set
         # Do NOT update OOV index here since it is used in PretrainedEmbedding
         if expand_vocab:
```
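For context on fix #84, here is a minimal standalone sketch of writing and reading a pretrained-embedding `.npz` file the way the new `.npz` branch reads it. The file name and the `value` array are illustrative assumptions; only the `key` array is read by the loader above.

```python
import numpy as np

# Build a toy pretrained-embedding file. The patched loader reads only the
# "key" array; the "value" array here is an assumed companion holding vectors.
keys = np.array(["apple", "banana", "cherry"])
values = np.random.rand(3, 4).astype(np.float32)
np.savez("pretrain_emb.npz", key=keys, value=values)

# Read the keys back the same way load_pretrained_vocab does for .npz paths.
loaded_keys = np.load("pretrain_emb.npz")["key"]
print(loaded_keys.tolist())  # ['apple', 'banana', 'cherry']
```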
3 changes: 1 addition & 2 deletions fuxictr/pytorch/dataloaders/npz_block_dataloader.py

```diff
@@ -82,9 +82,8 @@ def __len__(self):
 
     def count_batches_and_samples(self):
         num_samples = 0
-        num_batches = 0
         for block_path in self.data_blocks:
             block_size = np.load(block_path)[self.feature_map.labels[0]].shape[0]
             num_samples += block_size
-            num_batches += int(np.ceil(block_size * 1.0 / self.batch_size))
+        num_batches = int(np.ceil(num_samples / self.batch_size))
         return num_batches, num_samples
```
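The batch-count change matters when block sizes are not multiples of the batch size: summing a ceiling per block can report more batches than one ceiling over the total sample count. A standalone sketch of the arithmetic, with made-up block sizes:

```python
import math

batch_size = 64
block_sizes = [70, 70]  # hypothetical data-block sizes, not from the repo

# Old behavior: a ceiling per block, so every partial block adds a batch.
old_batches = sum(math.ceil(size / batch_size) for size in block_sizes)

# New behavior: a single ceiling over the total sample count.
num_samples = sum(block_sizes)
new_batches = math.ceil(num_samples / batch_size)

print(old_batches, new_batches)  # 4 3
```

Which count is right depends on whether the loader yields a partial batch per block or batches across the whole sample stream; the commit adopts the latter interpretation.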
2 changes: 1 addition & 1 deletion fuxictr/version.py

```diff
@@ -1 +1 @@
-__version__="2.2.0"
+__version__="2.2.1"
```
2 changes: 1 addition & 1 deletion setup.py

```diff
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="fuxictr",
-    version="2.2.0",
+    version="2.2.1",
     author="RECZOO",
     author_email="[email protected]",
     description="A configurable, tunable, and reproducible library for CTR prediction",
```
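The same version string is bumped in both `fuxictr/version.py` and `setup.py`. A common way to avoid such double bumps is to have `setup.py` parse the version out of the module; this is a hypothetical refactor sketch, not something this commit does:

```python
import re

def read_version(path="fuxictr/version.py"):
    # Extract the string assigned to __version__ in the module source,
    # so setup.py can reuse it instead of hardcoding the version twice.
    with open(path) as f:
        match = re.search(r'__version__\s*=\s*"([^"]+)"', f.read())
    if match is None:
        raise ValueError(f"no __version__ found in {path}")
    return match.group(1)

# In setup.py one would then pass version=read_version() to setuptools.setup().
```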
