Merge branch 'main' into fix_nb_error
rnyak authored Oct 8, 2024
2 parents 4efd765 + 73782c9 commit 3b4d47f
Showing 16 changed files with 107 additions and 54 deletions.
4 changes: 4 additions & 0 deletions .github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
# Configuration file for `copy-pr-bot` GitHub App
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/

enabled: true
4 changes: 0 additions & 4 deletions .github/ops-bot.yaml

This file was deleted.

19 changes: 13 additions & 6 deletions .github/workflows/gpu-ci-integration.yml
@@ -3,16 +3,23 @@ name: GPU NOTEBOOK CI
on:
workflow_dispatch:
push:
branches: [main]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [main]
types: [opened, synchronize, reopened]

jobs:
gpu-ci-integration:
runs-on: 1GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
credentials:
username: $oauthtoken
password: ${{ secrets.NGC_TOKEN }}

steps:
- uses: actions/checkout@v3
@@ -29,4 +36,4 @@ jobs:
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
cd ${{ github.workspace }}; tox -e test-gpu-integration -- $branch
tox -e test-gpu-integration -- $branch
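
The run step above resolves a pushed version tag to the release branch that contains it before passing that branch name to tox. A rough, illustrative Python equivalent of that lookup (not part of the change; the CI runs the bash version, and this sketch assumes a local checkout with the release branches already fetched):

import subprocess

def release_branch_for(ref: str, default: str = "main") -> str:
    # Mirror the workflow: list remote release branches containing the ref
    # (e.g. a "v1.2.3" tag) and strip the leading "origin/" as the sed call does.
    out = subprocess.run(
        ["git", "branch", "-r", "--contains", ref,
         "--list", "*release*", "--format", "%(refname:short)"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    return out.splitlines()[0].removeprefix("origin/") if out else default
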
51 changes: 27 additions & 24 deletions .github/workflows/gpu-ci.yml
@@ -3,34 +3,37 @@ name: GPU CI
on:
workflow_dispatch:
push:
branches: [ main ]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
gpu-ci:
runs-on: 2GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
credentials:
username: $oauthtoken
password: ${{ secrets.NGC_TOKEN }}

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
# fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
# fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
tox -e test-gpu -- $branch
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -18,4 +18,4 @@ jobs:
with:
cache: 'pip'
cache-dependency-path: '**/**.txt'
- uses: pre-commit/action@v2.0.3
- uses: pre-commit/action@v3.0.0
2 changes: 1 addition & 1 deletion ci/test_integration.sh
@@ -34,7 +34,7 @@ pip install -r requirements.txt
## Get data
cd t4r_paper_repro
FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt
pip install gdown
pip install gdown==4.6.0
gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV
apt-get update -y
apt-get install unzip -y
14 changes: 13 additions & 1 deletion docs/source/_static/css/custom.css
@@ -31,4 +31,16 @@ p.banner {
border-radius: 4px;
color: #004831;
background: #76b900;
}
}

footer div p {
font-size: 80%;
}

footer div p a {
color: var(--small-font-color);
}

footer div p a:hover {
color: var(--small-font-color);
}
14 changes: 14 additions & 0 deletions docs/source/_templates/footer.html
@@ -0,0 +1,14 @@
{% extends '!footer.html' %}
{% block contentinfo %}
<p>
<a href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank">Privacy Policy</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank">Manage My Privacy</a> |
<a href="https://www.nvidia.com/en-us/preferences/start/" target="_blank">Do Not Sell or Share My Data</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank">Terms of Service</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank">Accessibility</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank">Corporate Policies</a> |
<a href="https://www.nvidia.com/en-us/product-security/" target="_blank">Product Security</a> |
<a href="https://www.nvidia.com/en-us/contact/" target="_blank">Contact</a>
</p>
{{ super() }}
{% endblock %}
9 changes: 0 additions & 9 deletions docs/source/_templates/layout.html

This file was deleted.

9 changes: 8 additions & 1 deletion docs/source/conf.py
@@ -30,6 +30,7 @@
import re
import subprocess
import sys
from datetime import datetime
from typing import List, cast

from natsort import natsorted
@@ -42,8 +43,13 @@

# -- Project information -----------------------------------------------------

year_range = "2021"
year_now = str(datetime.now().year)
if year_range != year_now:
year_range = year_range + chr(8211) + year_now

project = "Transformers4Rec"
copyright = "2021, NVIDIA"
copyright = year_range + ", NVIDIA"
author = "NVIDIA"


@@ -108,6 +114,7 @@
}
html_copy_source = False
html_show_sourcelink = False
html_show_sphinx = False

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
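
The year-range logic added to conf.py above makes the copyright notice track the build year: it starts from a fixed "2021" and appends an en dash plus the current year whenever the build runs later than 2021. A small illustrative helper (hypothetical, not part of conf.py) showing what that produces:

from datetime import datetime

def copyright_years(start="2021", now=None):
    # Reproduce the conf.py logic: a single year while the build year equals the
    # start year, otherwise "start-now" joined with an en dash (chr(8211)).
    year_now = str(now if now is not None else datetime.now().year)
    return start if start == year_now else start + chr(8211) + year_now

copyright_years(now=2021)  # -> "2021"        (copyright reads "2021, NVIDIA")
copyright_years(now=2024)  # -> "2021–2024"   (copyright reads "2021–2024, NVIDIA")
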
@@ -286,6 +286,19 @@
"- <b>per device batch size for evaluation</b>: see above"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c9e83d47-380c-4118-bc29-8bc108163fa0",
"metadata": {},
"outputs": [],
"source": [
"# If only 1 GPU are available, starts a single process to use that GPU\n",
"from torch.cuda import device_count\n",
"num_gpus = device_count()\n",
"NUM_PROCESSES = min(num_gpus, 2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
@@ -502,7 +515,7 @@
"LR = float(os.environ.get(\"LEARNING_RATE\", \"0.0005\"))\n",
"BATCH_SIZE_TRAIN = int(os.environ.get(\"BATCH_SIZE_TRAIN\", \"256\"))\n",
"BATCH_SIZE_VALID = int(os.environ.get(\"BATCH_SIZE_VALID\", \"128\"))\n",
"!python -m torch.distributed.run --nproc_per_node 2 {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
"!python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
]
},
{
@@ -554,7 +567,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.12"
}
},
"nbformat": 4,
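
The two notebook edits above work together: the new cell detects how many GPUs are visible and caps the process count at two, and the launch command then uses that count instead of a hard-coded --nproc_per_node 2, so the example also runs on single-GPU machines. A standalone sketch of the same idea (the script name and output path are placeholders, the hyperparameter values mirror the notebook defaults, and at least one GPU is assumed):

import subprocess
from torch.cuda import device_count

NUM_PROCESSES = min(device_count(), 2)  # one process per GPU, at most two

subprocess.run(
    [
        "python", "-m", "torch.distributed.run",
        "--nproc_per_node", str(NUM_PROCESSES),
        "trainer.py",              # placeholder for {TRAINER_FILE}
        "--path", "./output",      # placeholder for {OUTPUT_DIR}
        "--learning-rate", "0.0005",
        "--per-device-train-batch-size", "256",
        "--per-device-eval-batch-size", "128",
    ],
    check=True,
)
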
@@ -224,7 +224,7 @@ def mask_last_interaction(x):
logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}")
output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt")
with open(output_file, "a") as writer:
writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
# Verify that the recall@10 from train.evaluate() matches the recall@10 calculated manually
if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling):
# TODO fix inference discrepancy for permutation language modeling
2 changes: 1 addition & 1 deletion requirements/base_external.txt
@@ -1,4 +1,4 @@
transformers[torch]>=4.12,<5
transformers[torch]>=4.12,<4.31.0
tqdm>=4.27
pyarrow>=1.0
torchmetrics>=0.10.0
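
base_external.txt now caps transformers below 4.31.0 instead of allowing anything under 5. A quick, illustrative check (not part of the change) that an installed environment falls inside the pinned range:

import transformers
from packaging.version import Version

# Mirrors the requirement "transformers[torch]>=4.12,<4.31.0" from the diff above.
v = Version(transformers.__version__)
assert Version("4.12") <= v < Version("4.31.0"), f"transformers {v} is outside the pinned range"
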
8 changes: 7 additions & 1 deletion requirements/docs.txt
@@ -1,4 +1,9 @@
Sphinx<3.6
Sphinx==3.5.4
sphinxcontrib-applehelp==1.0.4
sphinxcontrib-devhelp==1.0.2
sphinxcontrib-htmlhelp==2.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
sphinx_rtd_theme==1.0.0
sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git
sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git
@@ -8,4 +13,5 @@ jinja2<3.1
markupsafe==2.0.1
natsort==8.0.1
myst-nb<0.14
lxml<5.1
linkify-it-py<1.1
2 changes: 1 addition & 1 deletion transformers4rec/torch/experimental.py
@@ -97,7 +97,7 @@ def forward(self, inputs, training=False, testing=False, **kwargs):
output = seq_rep + context_rep
else:
raise ValueError(
f"The aggregation {self.fusion_aggregation} is not supported,"
f"The aggregation {self.fusion_aggregation} is not supported, "
f"please select one of the following aggregations "
f"['concat', 'elementwise-mul', 'elementwise-sum']"
)
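
The trailing-space fix above matters because adjacent f-string literals concatenate with no separator, so the old message rendered as "...is not supported,please select...". A tiny illustration (the aggregation value is arbitrary):

agg = "mean"
old_msg = (
    f"The aggregation {agg} is not supported,"
    f"please select one of the following aggregations"
)
new_msg = (
    f"The aggregation {agg} is not supported, "
    f"please select one of the following aggregations"
)
print(old_msg)  # ...is not supported,please select...
print(new_msg)  # ...is not supported, please select...
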
2 changes: 1 addition & 1 deletion transformers4rec/torch/ranking_metric.py
@@ -131,7 +131,7 @@ def _metric(self, ks: torch.Tensor, scores: torch.Tensor, labels: torch.Tensor)

# Compute recalls at K
num_relevant = torch.sum(labels, dim=-1)
rel_indices = (num_relevant != 0).nonzero().squeeze()
rel_indices = (num_relevant != 0).nonzero().squeeze(dim=1)
rel_count = num_relevant[rel_indices]

if rel_indices.shape[0] > 0:
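
Background for the squeeze change above: num_relevant holds one count per row, so (num_relevant != 0).nonzero() returns indices of shape (n, 1). An unqualified .squeeze() removes every size-1 dimension and collapses the result to a 0-d tensor when exactly one row has relevant items, which makes the rel_indices.shape[0] check raise an IndexError; .squeeze(dim=1) always leaves a 1-d index tensor. A small illustration of the difference:

import torch

num_relevant = torch.tensor([3, 0, 0])  # only one row has any relevant items
idx_old = (num_relevant != 0).nonzero().squeeze()        # tensor(0), shape torch.Size([])
idx_new = (num_relevant != 0).nonzero().squeeze(dim=1)   # tensor([0]), shape torch.Size([1])
print(idx_new.shape[0] > 0)  # True; idx_old.shape[0] would raise IndexError
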
