Merge branch 'main' into fix_nb_error
rnyak authored Oct 8, 2024
2 parents 4efd765 + 73782c9 commit 3b4d47f
Showing 16 changed files with 107 additions and 54 deletions.
4 changes: 4 additions & 0 deletions .github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
# Configuration file for `copy-pr-bot` GitHub App
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/

enabled: true
4 changes: 0 additions & 4 deletions .github/ops-bot.yaml

This file was deleted.

19 changes: 13 additions & 6 deletions .github/workflows/gpu-ci-integration.yml
@@ -3,16 +3,23 @@ name: GPU NOTEBOOK CI
on:
workflow_dispatch:
push:
branches: [main]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [main]
types: [opened, synchronize, reopened]

jobs:
gpu-ci-integration:
runs-on: 1GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
credentials:
username: $oauthtoken
password: ${{ secrets.NGC_TOKEN }}

steps:
- uses: actions/checkout@v3
@@ -29,4 +36,4 @@ jobs:
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
cd ${{ github.workspace }}; tox -e test-gpu-integration -- $branch
tox -e test-gpu-integration -- $branch
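
The run step above resolves a pushed version tag to the release branch that contains it before passing that branch name to tox. A rough, illustrative Python equivalent of that lookup (not part of the change; the CI runs the bash version, and this sketch assumes a local checkout with the release branches already fetched):

import subprocess

def release_branch_for(ref: str, default: str = "main") -> str:
    # Mirror the workflow: list remote release branches containing the ref
    # (e.g. a "v1.2.3" tag) and strip the leading "origin/" as the sed call does.
    out = subprocess.run(
        ["git", "branch", "-r", "--contains", ref,
         "--list", "*release*", "--format", "%(refname:short)"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    return out.splitlines()[0].removeprefix("origin/") if out else default
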
51 changes: 27 additions & 24 deletions .github/workflows/gpu-ci.yml
@@ -3,34 +3,37 @@ name: GPU CI
on:
workflow_dispatch:
push:
branches: [ main ]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
gpu-ci:
runs-on: 2GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
credentials:
username: $oauthtoken
password: ${{ secrets.NGC_TOKEN }}

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
# fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
# fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
tox -e test-gpu -- $branch
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -18,4 +18,4 @@ jobs:
with:
cache: 'pip'
cache-dependency-path: '**/**.txt'
- uses: pre-commit/action@v2.0.3
- uses: pre-commit/action@v3.0.0
2 changes: 1 addition & 1 deletion ci/test_integration.sh
@@ -34,7 +34,7 @@ pip install -r requirements.txt
## Get data
cd t4r_paper_repro
FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt
pip install gdown
pip install gdown==4.6.0
gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV
apt-get update -y
apt-get install unzip -y
14 changes: 13 additions & 1 deletion docs/source/_static/css/custom.css
@@ -31,4 +31,16 @@ p.banner {
border-radius: 4px;
color: #004831;
background: #76b900;
}
}

footer div p {
font-size: 80%;
}

footer div p a {
color: var(--small-font-color);
}

footer div p a:hover {
color: var(--small-font-color);
}
14 changes: 14 additions & 0 deletions docs/source/_templates/footer.html
@@ -0,0 +1,14 @@
{% extends '!footer.html' %}
{% block contentinfo %}
<p>
<a href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank">Privacy Policy</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank">Manage My Privacy</a> |
<a href="https://www.nvidia.com/en-us/preferences/start/" target="_blank">Do Not Sell or Share My Data</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank">Terms of Service</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank">Accessibility</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank">Corporate Policies</a> |
<a href="https://www.nvidia.com/en-us/product-security/" target="_blank">Product Security</a> |
<a href="https://www.nvidia.com/en-us/contact/" target="_blank">Contact</a>
</p>
{{ super() }}
{% endblock %}
9 changes: 0 additions & 9 deletions docs/source/_templates/layout.html

This file was deleted.

9 changes: 8 additions & 1 deletion docs/source/conf.py
@@ -30,6 +30,7 @@
import re
import subprocess
import sys
from datetime import datetime
from typing import List, cast

from natsort import natsorted
@@ -42,8 +43,13 @@

# -- Project information -----------------------------------------------------

year_range = "2021"
year_now = str(datetime.now().year)
if year_range != year_now:
year_range = year_range + chr(8211) + year_now

project = "Transformers4Rec"
copyright = "2021, NVIDIA"
copyright = year_range + ", NVIDIA"
author = "NVIDIA"


@@ -108,6 +114,7 @@
}
html_copy_source = False
html_show_sourcelink = False
html_show_sphinx = False

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
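
The year-range logic added to conf.py above makes the copyright notice track the build year: it starts from a fixed "2021" and appends an en dash plus the current year whenever the build runs later than 2021. A small illustrative helper (hypothetical, not part of conf.py) showing what that produces:

from datetime import datetime

def copyright_years(start="2021", now=None):
    # Reproduce the conf.py logic: a single year while the build year equals the
    # start year, otherwise "start-now" joined with an en dash (chr(8211)).
    year_now = str(now if now is not None else datetime.now().year)
    return start if start == year_now else start + chr(8211) + year_now

copyright_years(now=2021)  # -> "2021"        (copyright reads "2021, NVIDIA")
copyright_years(now=2024)  # -> "2021–2024"   (copyright reads "2021–2024, NVIDIA")
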
@@ -286,6 +286,19 @@
"- <b>per device batch size for evaluation</b>: see above"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c9e83d47-380c-4118-bc29-8bc108163fa0",
"metadata": {},
"outputs": [],
"source": [
"# If only 1 GPU are available, starts a single process to use that GPU\n",
"from torch.cuda import device_count\n",
"num_gpus = device_count()\n",
"NUM_PROCESSES = min(num_gpus, 2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
@@ -502,7 +515,7 @@
"LR = float(os.environ.get(\"LEARNING_RATE\", \"0.0005\"))\n",
"BATCH_SIZE_TRAIN = int(os.environ.get(\"BATCH_SIZE_TRAIN\", \"256\"))\n",
"BATCH_SIZE_VALID = int(os.environ.get(\"BATCH_SIZE_VALID\", \"128\"))\n",
"!python -m torch.distributed.run --nproc_per_node 2 {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
"!python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
]
},
{
@@ -554,7 +567,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.12"
}
},
"nbformat": 4,
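
The two notebook edits above work together: the new cell detects how many GPUs are visible and caps the process count at two, and the launch command then uses that count instead of a hard-coded --nproc_per_node 2, so the example also runs on single-GPU machines. A standalone sketch of the same idea (the script name and output path are placeholders, the hyperparameter values mirror the notebook defaults, and at least one GPU is assumed):

import subprocess
from torch.cuda import device_count

NUM_PROCESSES = min(device_count(), 2)  # one process per GPU, at most two

subprocess.run(
    [
        "python", "-m", "torch.distributed.run",
        "--nproc_per_node", str(NUM_PROCESSES),
        "trainer.py",              # placeholder for {TRAINER_FILE}
        "--path", "./output",      # placeholder for {OUTPUT_DIR}
        "--learning-rate", "0.0005",
        "--per-device-train-batch-size", "256",
        "--per-device-eval-batch-size", "128",
    ],
    check=True,
)
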
@@ -224,7 +224,7 @@ def mask_last_interaction(x):
logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}")
output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt")
with open(output_file, "a") as writer:
writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
# Verify that the recall@10 from train.evaluate() matches the recall@10 calculated manually
if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling):
# TODO fix inference discrepancy for permutation language modeling
2 changes: 1 addition & 1 deletion requirements/base_external.txt
@@ -1,4 +1,4 @@
transformers[torch]>=4.12,<5
transformers[torch]>=4.12,<4.31.0
tqdm>=4.27
pyarrow>=1.0
torchmetrics>=0.10.0
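
base_external.txt now caps transformers below 4.31.0 instead of allowing anything under 5. A quick, illustrative check (not part of the change) that an installed environment falls inside the pinned range:

import transformers
from packaging.version import Version

# Mirrors the requirement "transformers[torch]>=4.12,<4.31.0" from the diff above.
v = Version(transformers.__version__)
assert Version("4.12") <= v < Version("4.31.0"), f"transformers {v} is outside the pinned range"
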
8 changes: 7 additions & 1 deletion requirements/docs.txt
@@ -1,4 +1,9 @@
Sphinx<3.6
Sphinx==3.5.4
sphinxcontrib-applehelp==1.0.4
sphinxcontrib-devhelp==1.0.2
sphinxcontrib-htmlhelp==2.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
sphinx_rtd_theme==1.0.0
sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git
sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git
@@ -8,4 +13,5 @@ jinja2<3.1
markupsafe==2.0.1
natsort==8.0.1
myst-nb<0.14
lxml<5.1
linkify-it-py<1.1
2 changes: 1 addition & 1 deletion transformers4rec/torch/experimental.py
@@ -97,7 +97,7 @@ def forward(self, inputs, training=False, testing=False, **kwargs):
output = seq_rep + context_rep
else:
raise ValueError(
f"The aggregation {self.fusion_aggregation} is not supported,"
f"The aggregation {self.fusion_aggregation} is not supported, "
f"please select one of the following aggregations "
f"['concat', 'elementwise-mul', 'elementwise-sum']"
)
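
The trailing-space fix above matters because adjacent f-string literals concatenate with no separator, so the old message rendered as "...is not supported,please select...". A tiny illustration (the aggregation value is arbitrary):

agg = "mean"
old_msg = (
    f"The aggregation {agg} is not supported,"
    f"please select one of the following aggregations"
)
new_msg = (
    f"The aggregation {agg} is not supported, "
    f"please select one of the following aggregations"
)
print(old_msg)  # ...is not supported,please select...
print(new_msg)  # ...is not supported, please select...
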
2 changes: 1 addition & 1 deletion transformers4rec/torch/ranking_metric.py
@@ -131,7 +131,7 @@ def _metric(self, ks: torch.Tensor, scores: torch.Tensor, labels: torch.Tensor)

# Compute recalls at K
num_relevant = torch.sum(labels, dim=-1)
rel_indices = (num_relevant != 0).nonzero().squeeze()
rel_indices = (num_relevant != 0).nonzero().squeeze(dim=1)
rel_count = num_relevant[rel_indices]

if rel_indices.shape[0] > 0:
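
Background for the squeeze change above: num_relevant holds one count per row, so (num_relevant != 0).nonzero() returns indices of shape (n, 1). An unqualified .squeeze() removes every size-1 dimension and collapses the result to a 0-d tensor when exactly one row has relevant items, which makes the rel_indices.shape[0] check raise an IndexError; .squeeze(dim=1) always leaves a 1-d index tensor. A small illustration of the difference:

import torch

num_relevant = torch.tensor([3, 0, 0])  # only one row has any relevant items
idx_old = (num_relevant != 0).nonzero().squeeze()        # tensor(0), shape torch.Size([])
idx_new = (num_relevant != 0).nonzero().squeeze(dim=1)   # tensor([0]), shape torch.Size([1])
print(idx_new.shape[0] > 0)  # True; idx_old.shape[0] would raise IndexError
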
