From 30f0946447794d8edbfb32a3605c47a08213f98a Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 5 Sep 2023 13:05:21 -0400 Subject: [PATCH 1/6] Use `copy-pr-bot` (#742) This PR replaces the `copy_prs` functionality from the `ops-bot` with the new dedicated `copy-pr-bot` GitHub application. Thorough documentation for the new `copy-pr-bot` application can be viewed below. - https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ **Important**: `copy-pr-bot` enforces signed commits. If an organization member opens a PR that contains unsigned commits, it will be deemed untrusted and therefore require an `/ok to test` comment. See the GitHub docs [here](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) for information on how to set up commit signing. Any time a PR is deemed untrusted, it will receive a comment that looks like this: https://github.com/rapidsai/ci-imgs/pull/63#issuecomment-1688973208. Every subsequent commit on an untrusted PR will require an additional `/ok to test` comment. Any existing PRs that have unsigned commits after this change is merged will require an `/ok to test` comment for each subsequent commit _or_ the PR can be rebased to sign the unsigned commits as mentioned in the docs here: https://docs.gha-runners.nvidia.com/cpr/contributors. This information is all included on the documentation page linked above. 
_I've skipped CI on this PR since it's not a change that is tested._ [skip ci] --- .github/copy-pr-bot.yaml | 4 ++++ .github/ops-bot.yaml | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 .github/copy-pr-bot.yaml delete mode 100644 .github/ops-bot.yaml diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 0000000000..895ba83ee5 --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml deleted file mode 100644 index 84bbe71f46..0000000000 --- a/.github/ops-bot.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# This file controls which features from the `ops-bot` repository below are enabled. -# - https://github.com/rapidsai/ops-bot - -copy_prs: true From 5bef974b6a12c62e6d573421a24148036dbf849d Mon Sep 17 00:00:00 2001 From: Julio Perez <37191411+jperez999@users.noreply.github.com> Date: Mon, 16 Oct 2023 20:54:17 -0400 Subject: [PATCH 2/6] add rapids infra (#753) * add rapids infra * add integration to rapids ci --- .github/workflows/gpu-ci-integration.yml | 19 ++++++--- .github/workflows/gpu-ci.yml | 51 +++++++++++++----------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/.github/workflows/gpu-ci-integration.yml b/.github/workflows/gpu-ci-integration.yml index 18e4fd6407..740f3eccd0 100644 --- a/.github/workflows/gpu-ci-integration.yml +++ b/.github/workflows/gpu-ci-integration.yml @@ -3,16 +3,23 @@ name: GPU NOTEBOOK CI on: workflow_dispatch: push: - branches: [main] + branches: + - main + - "pull-request/[0-9]+" tags: - "v[0-9]+.[0-9]+.[0-9]+" - pull_request: - branches: [main] - types: [opened, synchronize, reopened] jobs: gpu-ci-integration: - runs-on: 1GPU + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest + env: + NVIDIA_VISIBLE_DEVICES: ${{ 
env.NVIDIA_VISIBLE_DEVICES }} + options: --shm-size=1G + credentials: + username: $oauthtoken + password: ${{ secrets.NGC_TOKEN }} steps: - uses: actions/checkout@v3 @@ -29,4 +36,4 @@ jobs: # find the release branch that we're pointing at branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///') fi - cd ${{ github.workspace }}; tox -e test-gpu-integration -- $branch + tox -e test-gpu-integration -- $branch diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 56cc609ead..97d17aa626 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -3,34 +3,37 @@ name: GPU CI on: workflow_dispatch: push: - branches: [ main ] + branches: + - main + - "pull-request/[0-9]+" tags: - "v[0-9]+.[0-9]+.[0-9]+" - pull_request: - branches: [ main ] - types: [opened, synchronize, reopened] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true jobs: gpu-ci: - runs-on: 2GPU + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + options: --shm-size=1G + credentials: + username: $oauthtoken + password: ${{ secrets.NGC_TOKEN }} steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Run tests - run: | - ref_type=${{ github.ref_type }} - branch=main - if [[ $ref_type == "tag"* ]] - then - # fetch release branches (the branch name is not automatically fetched by the actions/checkout step) - git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release* - # find the release branch that we're pointing at - branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///') - fi - cd ${{ github.workspace }}; tox -e test-gpu -- $branch + - uses: 
actions/checkout@v3 + with: + fetch-depth: 0 + - name: Run tests + run: | + ref_type=${{ github.ref_type }} + branch=main + if [[ $ref_type == "tag"* ]] + then + # fetch release branches (the branch name is not automatically fetched by the actions/checkout step) + git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release* + # find the release branch that we're pointing at + branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///') + fi + tox -e test-gpu -- $branch From 625897c3f55135342dbe53c32b7650a8ecb86c75 Mon Sep 17 00:00:00 2001 From: Gabriel Moreira Date: Mon, 6 Nov 2023 14:10:12 -0300 Subject: [PATCH 3/6] Fix transformer and error on example when CI uses single-GPU (#757) * fix on going errors in transformers unit tests * added range to package versions * add linting changes required by linter * update pre-commit hook action * update file to pass lint action * Fixed bug in end-to-end-session-based that was failing test on CI when only a single GPU was available (which prevented multigpu training) --------- Co-authored-by: Julio Co-authored-by: Julio Perez <37191411+jperez999@users.noreply.github.com> --- .github/workflows/lint.yml | 2 +- ...-based-Yoochoose-multigpu-training-PyT.ipynb | 17 +++++++++++++++-- .../t4r_paper_repro/transf_exp_main.py | 2 +- requirements/base_external.txt | 2 +- transformers4rec/torch/experimental.py | 2 +- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 016fc4a5b7..01a528855d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -18,4 +18,4 @@ jobs: with: cache: 'pip' cache-dependency-path: '**/**.txt' - - uses: pre-commit/action@v2.0.3 + - uses: pre-commit/action@v3.0.0 diff --git a/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb 
b/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb index 3bd729412e..3f502234f8 100644 --- a/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb +++ b/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb @@ -286,6 +286,19 @@ "- per device batch size for evaluation: see above" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9e83d47-380c-4118-bc29-8bc108163fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# If only 1 GPU is available, start a single process to use that GPU\n", + "from torch.cuda import device_count\n", + "num_gpus = device_count()\n", + "NUM_PROCESSES = min(num_gpus, 2)" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -502,7 +515,7 @@ "LR = float(os.environ.get(\"LEARNING_RATE\", \"0.0005\"))\n", "BATCH_SIZE_TRAIN = int(os.environ.get(\"BATCH_SIZE_TRAIN\", \"256\"))\n", "BATCH_SIZE_VALID = int(os.environ.get(\"BATCH_SIZE_VALID\", \"128\"))\n", - "!python -m torch.distributed.run --nproc_per_node 2 {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}" + "!python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}" ] }, { @@ -554,7 +567,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py b/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py index 5da463b16c..3cd0efedfa 100644 --- a/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py +++ b/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py @@ -224,7 +224,7 @@ def 
mask_last_interaction(x): logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}") output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt") with open(output_file, "a") as writer: - writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n") + writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n") # Verify that the recall@10 from train.evaluate() matches the recall@10 calculated manually if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling): # TODO fix inference discrepancy for permutation language modeling diff --git a/requirements/base_external.txt b/requirements/base_external.txt index b036a28265..b3282044f2 100644 --- a/requirements/base_external.txt +++ b/requirements/base_external.txt @@ -1,4 +1,4 @@ -transformers[torch]>=4.12,<5 +transformers[torch]>=4.12,<4.31.0 tqdm>=4.27 pyarrow>=1.0 torchmetrics>=0.10.0 diff --git a/transformers4rec/torch/experimental.py b/transformers4rec/torch/experimental.py index 38850b6c30..4631c60b9c 100644 --- a/transformers4rec/torch/experimental.py +++ b/transformers4rec/torch/experimental.py @@ -97,7 +97,7 @@ def forward(self, inputs, training=False, testing=False, **kwargs): output = seq_rep + context_rep else: raise ValueError( - f"The aggregation {self.fusion_aggregation} is not supported," + f"The aggregation {self.fusion_aggregation} is not supported, " f"please select one of the following aggregations " f"['concat', 'elementwise-mul', 'elementwise-sum']" ) From d0cce61b988a1e923545f94ac22499fb57928d18 Mon Sep 17 00:00:00 2001 From: Emma Date: Thu, 11 Jan 2024 15:22:09 +0800 Subject: [PATCH 4/6] Fix version for gdown (#767) --- ci/test_integration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_integration.sh b/ci/test_integration.sh index 82b42b175c..f91a1386aa 100755 --- a/ci/test_integration.sh +++ b/ci/test_integration.sh @@ -34,7 +34,7 @@ pip install -r requirements.txt ## Get 
data cd t4r_paper_repro FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt -pip install gdown +pip install gdown==4.6.0 gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV apt-get update -y apt-get install unzip -y From 23d5e3ba73b3d490400e45bf4feba94ed473432f Mon Sep 17 00:00:00 2001 From: Mike McKiernan Date: Thu, 25 Apr 2024 06:49:56 -0400 Subject: [PATCH 5/6] Add GDPR info to footer (#774) * Add GDPR info to footer Signed-off-by: Mike McKiernan * Update docs dependencies Signed-off-by: Mike McKiernan --------- Signed-off-by: Mike McKiernan --- docs/source/_static/css/custom.css | 14 +++++++++++++- docs/source/_templates/footer.html | 14 ++++++++++++++ docs/source/_templates/layout.html | 9 --------- docs/source/conf.py | 9 ++++++++- requirements/docs.txt | 8 +++++++- 5 files changed, 42 insertions(+), 12 deletions(-) create mode 100644 docs/source/_templates/footer.html delete mode 100644 docs/source/_templates/layout.html diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index 319ddff89a..8a5b94b381 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -31,4 +31,16 @@ p.banner { border-radius: 4px; color: #004831; background: #76b900; -} \ No newline at end of file +} + +footer div p { + font-size: 80%; +} + +footer div p a { + color: var(--small-font-color); +} + +footer div p a:hover { + color: var(--small-font-color); +} diff --git a/docs/source/_templates/footer.html b/docs/source/_templates/footer.html new file mode 100644 index 0000000000..72cae10b79 --- /dev/null +++ b/docs/source/_templates/footer.html @@ -0,0 +1,14 @@ +{% extends '!footer.html' %} +{% block contentinfo %} +

+Privacy Policy | +Manage My Privacy | +Do Not Sell or Share My Data | +Terms of Service | +Accessibility | +Corporate Policies | +Product Security | +Contact +

+{{ super() }} +{% endblock %} \ No newline at end of file diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html deleted file mode 100644 index 76917f64c1..0000000000 --- a/docs/source/_templates/layout.html +++ /dev/null @@ -1,9 +0,0 @@ -{% extends "!layout.html" %} -{% block extrabody %} - -{% endblock %} diff --git a/docs/source/conf.py b/docs/source/conf.py index 05b13b7948..598915e382 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -30,6 +30,7 @@ import re import subprocess import sys +from datetime import datetime from typing import List, cast from natsort import natsorted @@ -42,8 +43,13 @@ # -- Project information ----------------------------------------------------- +year_range = "2021" +year_now = str(datetime.now().year) +if year_range != year_now: + year_range = year_range + chr(8211) + year_now + project = "Transformers4Rec" -copyright = "2021, NVIDIA" +copyright = year_range + ", NVIDIA" author = "NVIDIA" @@ -108,6 +114,7 @@ } html_copy_source = False html_show_sourcelink = False +html_show_sphinx = False # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, diff --git a/requirements/docs.txt b/requirements/docs.txt index a1efbbaaf2..7788b3bc26 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,4 +1,9 @@ -Sphinx<3.6 +Sphinx==3.5.4 +sphinxcontrib-applehelp==1.0.4 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.5 sphinx_rtd_theme==1.0.0 sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git @@ -8,4 +13,5 @@ jinja2<3.1 markupsafe==2.0.1 natsort==8.0.1 myst-nb<0.14 +lxml<5.1 linkify-it-py<1.1 From 73782c926be66fd110a38d5a3297394e8a1718f5 Mon Sep 17 00:00:00 2001 From: sungho-ham <19978686+sungho-ham@users.noreply.github.com> Date: Wed, 7 Aug 2024 05:10:34 +0900 Subject: [PATCH 6/6] Fix recall at k when batch size = 1 (#779) Co-authored-by: sungho-ham --- transformers4rec/torch/ranking_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers4rec/torch/ranking_metric.py b/transformers4rec/torch/ranking_metric.py index 5495b98a45..a281dd83b2 100644 --- a/transformers4rec/torch/ranking_metric.py +++ b/transformers4rec/torch/ranking_metric.py @@ -131,7 +131,7 @@ def _metric(self, ks: torch.Tensor, scores: torch.Tensor, labels: torch.Tensor) # Compute recalls at K num_relevant = torch.sum(labels, dim=-1) - rel_indices = (num_relevant != 0).nonzero().squeeze() + rel_indices = (num_relevant != 0).nonzero().squeeze(dim=1) rel_count = num_relevant[rel_indices] if rel_indices.shape[0] > 0: