
Commit

Merge branch 'main' into onur/log-props
oyilmaz-nvidia authored Oct 29, 2024
2 parents ac2403f + 217b528 commit 5c06387
Showing 53 changed files with 3,344 additions and 932 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
@@ -79,7 +79,7 @@ jobs:
echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT"
potential_infra_failure=$(cat err.log | grep -Eqi "gpu|cuda|device" && echo true || echo false)
potential_infra_failure=$(cat err.log | grep -Eqiw "device" && echo true || echo false)
echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
exit $EXIT_CODE
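The change above tightens the infra-failure heuristic: instead of flagging any log that mentions gpu, cuda, or device anywhere, only whole-word matches of "device" are treated as potential infra failures. A minimal sketch (not part of the commit, err.log contents invented) of the difference:

    printf 'allocating cuda devices for worker 0\n' > err.log
    # old pattern matches the substring "cuda" (and "device" inside "devices")
    grep -Eqi  "gpu|cuda|device" err.log && echo "old heuristic: potential infra failure"
    # new pattern requires "device" as a whole word, so this line is not flagged
    grep -Eqiw "device" err.log || echo "new heuristic: not flagged"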
46 changes: 34 additions & 12 deletions .github/workflows/cicd-main.yml
@@ -31,6 +31,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true


jobs:
pre-flight:
runs-on: ubuntu-latest
@@ -2589,27 +2590,19 @@ jobs:
mkdir examples/llm/auto_configurator/auto_conf_logs
python examples/llm/auto_configurator/auto_config.py \
--logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
--tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
--log_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--run_number=1
python examples/llm/auto_configurator/auto_config.py \
--logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
--tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
--log_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--run_number=2
python examples/llm/auto_configurator/auto_config.py \
--logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
--tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
--log_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--run_number=3
python examples/llm/auto_configurator/auto_config.py \
--logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \
--tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \
--log_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \
--get_results
AFTER_SCRIPT: |
rm -rf examples/llm/auto_configurator/auto_conf_logs
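The four auto_config.py invocations above differ only in --run_number (1, 2, 3) and the final --get_results pass. A hypothetical loop-based sketch of the same sequence (LOGS is an illustrative variable, not from the workflow):

    # same sequence as the workflow script, written as a loop
    LOGS=/workspace/examples/llm/auto_configurator/auto_conf_logs
    for RUN in 1 2 3; do
      python examples/llm/auto_configurator/auto_config.py \
        --logs_dir="$LOGS" --log_dir="$LOGS" --run_number="$RUN"
    done
    python examples/llm/auto_configurator/auto_config.py \
      --logs_dir="$LOGS" --log_dir="$LOGS" --get_results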
@@ -3887,6 +3880,34 @@ jobs:
rm -rf tests/collections/llm/gpt_pretrain_results
rm -rf tests/collections/llm/gpt_index_mappings
L2_NeMo_2_llama3_pretraining_recipe:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_llama3_pretraining_recipe') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/llama3_pretraining.py \
--seq-length 1024 \
--devices=2 \
--max-steps=6 \
--early-stop=3 \
--experiment-dir=/tmp/llm_tests/llama_pretrain_results \
--data-path=/home/TestData/nlp/megatron_llama/data/rp2_sample_sentencepiece_preproc_text_document \
--tokenizer-path=/home/TestData/nlp/megatron_llama/tokenizer.model \
--index-mapping-dir=/tmp/llm_tests/llama_index_mappings \
python tests/collections/llm/llama3_pretraining.py \
--seq-length 1024 \
--devices=2 \
--max-steps=6 \
--experiment-dir=/tmp/llm_tests/llama_pretrain_results \
--data-path=/home/TestData/nlp/megatron_llama/data/rp2_sample_sentencepiece_preproc_text_document \
--tokenizer-path=/home/TestData/nlp/megatron_llama/tokenizer.model \
--index-mapping-dir=/tmp/llm_tests/llama_index_mappings \
--cp 1 --tp 2 --sp 1
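The two invocations above share every argument except --early-stop=3 on the first run and the --cp 1 --tp 2 --sp 1 parallelism flags on the second. A hypothetical sketch (not part of the workflow) that factors out the shared arguments:

    # COMMON is an illustrative variable; paths and flags are copied from the job above
    COMMON="--seq-length 1024 --devices=2 --max-steps=6 \
      --experiment-dir=/tmp/llm_tests/llama_pretrain_results \
      --data-path=/home/TestData/nlp/megatron_llama/data/rp2_sample_sentencepiece_preproc_text_document \
      --tokenizer-path=/home/TestData/nlp/megatron_llama/tokenizer.model \
      --index-mapping-dir=/tmp/llm_tests/llama_index_mappings"
    python tests/collections/llm/llama3_pretraining.py $COMMON --early-stop=3
    python tests/collections/llm/llama3_pretraining.py $COMMON --cp 1 --tp 2 --sp 1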
L2_NeMo_2_GPT_DDP_Param_Parity_check:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4439,6 +4460,7 @@ jobs:
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
- L2_NeMo_2_HF_MODEL_IMPORT
- L2_NeMo_2_llama3_pretraining_recipe
- L2_NeMo_2_SSM_Pretraining
- L2_NeMo_2_SSM_Finetuning
- L2_NeMo_2_T5_Pretraining
59 changes: 59 additions & 0 deletions .github/workflows/copyright-check.yml
@@ -0,0 +1,59 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Copyright check

on:
pull_request:

jobs:
main:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
fetch-depth: 0

- name: Check files have copyright notice
run: |
cd ${{ github.run_id }}
# Files ending with .py should have Copyright notice in the first 10 lines
find_files_with_missing_copyright() {
find ./ -type f -name '*.py' -not -path "./.git/*" -not -path "./*__init__.py" | while read path; do
echo -en $path"\t"
head -n 10 $path | tr '\n' '\t' | sed 's/\t$/\n/'
done \
| egrep -iv 'Copyright.*NVIDIA CORPORATION.*' \
| egrep -iv '*MIT.*Licen.e.*' \
| egrep -iv '*Copyright.*Apache.*' \
| egrep -iv '*Apache.*License.*' \
| while read line; do
echo $line | cut -d' ' -f1
done
}
declare RESULT=($(find_files_with_missing_copyright)) # (..) = array
if [ "${#RESULT[@]}" -gt 0 ]; then
echo "Error: Found files with missing copyright:"
for (( i=0; i<"${#RESULT[@]}"; i++ )); do
echo "path= ${RESULT[$i]}"
done
exit 1;
else
echo "Ok: All (Python) files start with copyright notice"
fi
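A minimal local sketch (an assumption, not part of the workflow) of the same idea: list .py files whose first 10 lines carry no notice, using a single looser pattern than the workflow's NVIDIA/Apache/MIT checks for brevity:

    # loose local approximation of find_files_with_missing_copyright
    find . -type f -name '*.py' -not -path './.git/*' -not -path './*__init__.py' \
      | while read -r path; do
          head -n 10 "$path" | grep -Eqi 'copyright|license' || echo "missing notice: $path"
        done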
9 changes: 4 additions & 5 deletions .github/workflows/monitor-vms.yml
@@ -2,9 +2,8 @@
name: Reboots VMs in a controlled way
on:
schedule:
- cron: /15 * * * *
- cron: 0/15 * * * *
workflow_dispatch:
pull_request:

jobs:
pre-flight:
@@ -28,7 +27,7 @@ jobs:
| jq -c '[
.runners[]
| select(.status == "online")
| select(.name | contains("gpu")
| select(.name | contains("gpu"))
| {
"vm": .name,
"n_gpus": [
@@ -47,8 +46,8 @@ jobs:
fail-fast: false
matrix:
include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
uses: .github/workflows/monitor-single-vm.yml
uses: ./.github/workflows/monitor-single-vm.yml
with:
vm: ${{ matrix.vm }}
n_gpus: ${{ matrix.n_gpus }}
secrets: inherit
secrets: inherit # pragma: allowlist secret
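The jq fix above adds the closing parenthesis that the contains() call was missing. A small sketch (not part of the workflow, sample payload invented, n_gpus omitted) of the corrected filter:

    # corrected select chain applied to a hand-written runners payload
    echo '{"runners":[{"name":"azure-gpu-vm-01","status":"online"},{"name":"cpu-vm-02","status":"online"}]}' \
      | jq -c '[.runners[] | select(.status == "online") | select(.name | contains("gpu")) | {"vm": .name}]'
    # -> [{"vm":"azure-gpu-vm-01"}]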
64 changes: 33 additions & 31 deletions .github/workflows/secrets-detector.yml
@@ -1,35 +1,37 @@
# # Copyright (c) 2020-2021, NVIDIA CORPORATION.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
# name: Secrets detector
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Secrets detector

# on:
# pull_request:
on:
pull_request:
branches:
- 'main'

# jobs:
# main:
# runs-on: ubuntu-latest
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# with:
# path: ${{ github.run_id }}
# fetch-depth: 0
jobs:
main:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
fetch-depth: 0

# - name: Install secrets detector
# run: pip install detect-secrets
- name: Install secrets detector
run: pip install detect-secrets

# - name: Run on change-set
# run: |
# cd ${{ github.run_id }}
# git diff --name-only --diff-filter=d --merge-base origin/${{ github.base_ref }} -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline
- name: Run on change-set
run: |
cd ${{ github.run_id }}
git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline
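The re-enabled workflow diffs the pull request against origin/main and pipes the changed files into detect-secrets-hook with the repository-root baseline. A hypothetical local run of the same check (assumes detect-secrets is installed and a git version recent enough to support --merge-base):

    pip install detect-secrets
    git fetch origin main
    git diff --name-only --diff-filter=d --merge-base origin/main -z \
      | xargs -0 detect-secrets-hook --baseline .secrets.baseline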
18 changes: 9 additions & 9 deletions .github/workflows/config/.secrets.baseline → .secrets.baseline
@@ -123,13 +123,13 @@
}
],
"results": {
".github/workflows/cicd-main.yml": [
".github/workflows/node-reboot.yml": [
{
"type": "Base64 High Entropy String",
"filename": ".github/workflows/cicd-main.yml",
"hashed_secret": "593951c440200143335452427205ae7c8580d463",
"type": "Secret Keyword",
"filename": ".github/workflows/node-reboot.yml",
"hashed_secret": "3e26d6750975d678acb8fa35a0f69237881576b0",
"is_verified": false,
"line_number": 1503
"line_number": 52
}
],
"docs/source/nlp/question_answering.rst": [
@@ -1229,9 +1229,9 @@
{
"type": "Base64 High Entropy String",
"filename": "tests/infer_data_path.py",
"hashed_secret": "e3fb89ccb261c88146519164f7e8a47786d33fee",
"hashed_secret": "8e0937151cfd9750db688fbe66be37d0c53ed6ab",
"is_verified": false,
"line_number": 271
"line_number": 63
}
],
"tutorials/asr/Multilang_ASR.ipynb": [
@@ -1902,7 +1902,7 @@
"filename": "tutorials/multimodal/Multimodal Data Preparation.ipynb",
"hashed_secret": "b641cbe299c9e27b480cc8a823bb020d45962236",
"is_verified": false,
"line_number": 660
"line_number": 658
}
],
"tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb": [
@@ -2083,5 +2083,5 @@
}
]
},
"generated_at": "2024-09-08T19:00:15Z"
"generated_at": "2024-10-25T13:43:17Z"
}