Merge pull request #1 from Saibo-creator/dev
fix issues reported by Berkay
Saibo-creator authored Oct 11, 2023
2 parents e55bbe2 + 69cf23f commit 19864e8
Showing 43 changed files with 286 additions and 1,665 deletions.
17 changes: 13 additions & 4 deletions .github/workflows/docker-image.yml
@@ -2,9 +2,9 @@ name: Docker Image CI

on:
push:
branches: [ "main" ]
branches: [ "main", "dev" ]
pull_request:
branches: [ "main" ]
branches: [ "main"]

jobs:

@@ -13,6 +13,15 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: checkout code
uses: actions/checkout@v2 # use v2 of the checkout action; it is the stable version from GitHub
- name: Workaround for Disk Space #https://github.com/actions/runner-images/issues/2840
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: 'Create env file'
run: echo "${{ secrets.DOT_ENV }}" > .env
- name: Build the Docker image
run: docker build . --file docker/Dockerfile --tag tmp:$(date +%s)
run: docker build . --file docker/Dockerfile --tag tmp:$(date +%s) --build-arg USER_NAME=geng --build-arg PROJECT_NAME=gcd --secret id=dot_env,src=.env
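
For reference, a minimal sketch of what the `DOT_ENV` secret might contain. The variable names are the ones read by the Dockerfile in this commit; every value below is a placeholder, not a real credential:

```bash
# Hypothetical .env contents: all values are placeholders
USER_ID=1000
GROUP_ID1=1001
GROUP_NAME1=group1
GROUP_ID2=1002
GROUP_NAME2=group2
GITHUB_NAME="Your Name"
GITHUB_EMAIL=you@example.com
WANDB_API_KEY=xxxxxxxxxxxx
HF_API_KEY=hf_xxxxxxxxxxxx
```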
9 changes: 4 additions & 5 deletions .pre-commit-config.yaml
@@ -15,11 +15,10 @@ repos:
- id: check-docstring-first # Checks for a common error of placing code before the docstring.
- id: check-executables-have-shebangs # Checks that executable files have a shebang.
- id: detect-private-key # Detects the presence of private keys.
#- repo: https://github.com/ambv/black # automatic format code style
# rev: 22.3.0
# hooks:
# - id: black

- repo: https://github.com/ambv/black # automatic code formatting
rev: 22.3.0
hooks:
- id: black
- repo: local
hooks:
- id: unittest
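
With the `black` hook re-enabled, the hook suite can be exercised locally with the standard pre-commit commands:

```bash
# Install the git hooks once, then run every hook against the whole repo
pre-commit install
pre-commit run --all-files
```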
34 changes: 5 additions & 29 deletions README.md
@@ -25,36 +25,12 @@ Install the required packages:
pip install -r requirements.txt
```

## 3. Downloading the dataset, grammar objects and models
## Experiments

check the [docs/download_data.md](docs/download_data.md) for instructions on how to download them.


## 4. Build task-specific grammars

c.f. [GF_helper repo](https://github.com/Saibo-creator/GF_helper)


## Running the experiments

```shell
# run the experiments for the CP task
bash run_CP.sh

# run the experiments for the IE task
bash run_IE.sh

# run the experiments for the ED task
bash run_ED.sh
```


The generated prediction sequences will be logged to [Weights and Biases](https://wandb.ai/site).


## Developer Guide

If you want to extend the codebase, please check the [docs/developer_guide.md](docs/developer_guide.md) for more details.
- [Download datasets, grammars and models](docs/download_data.md)
- [Build task-specific grammars](https://github.com/Saibo-creator/GF_helper)
- [Windows-specific setting](docs/windows.md)
- [Running the experiments](docs/run_experiments.md)


## Citation
2 changes: 1 addition & 1 deletion configs/hydra_conf/inference_root.yaml
@@ -39,4 +39,4 @@ logs_subfolder: inference
# determines the log directory's identifier
#run_name: ???

run_name: Task_${task}_Model_${model.name}_Datamodule_${datamodule.name}_Constraint_${model.gf_constraint_module.name}
run_name: Task_${task}_Model_${model.name}_Datamodule_${datamodule.name}_Constraint_${oc.select:model.gf_constraint_module.name,null}
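
The switch to `oc.select` matters for unconstrained runs: a plain `${model.gf_constraint_module.name}` interpolation raises an error when the key is absent, whereas `oc.select` substitutes the given default. A minimal sketch of the behavior (assumes OmegaConf >= 2.1 is installed):

```bash
python - <<'EOF'
from omegaconf import OmegaConf

# No gf_constraint_module is configured, so a plain interpolation would fail;
# oc.select falls back to the default (null -> None) instead.
cfg = OmegaConf.create(
    {"model": {}, "run_name": "Constraint_${oc.select:model.gf_constraint_module.name,null}"}
)
print(cfg.run_name)  # expected: Constraint_None
EOF
```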
6 changes: 0 additions & 6 deletions configs/hydra_conf/model/HFmodel_cp_old.yaml

This file was deleted.

36 changes: 0 additions & 36 deletions configs/hydra_conf/model/HFmodel_default_old.yaml

This file was deleted.

2 changes: 0 additions & 2 deletions configs/hydra_conf/model/HFmodel_ed.yaml
@@ -2,6 +2,4 @@ defaults:
- base_model
- HFmodel_default

#_target_: src.models.ELHFModelPL

_target_: src.models.ED_model.EDHFModelPL
4 changes: 0 additions & 4 deletions configs/hydra_conf/model/HFmodel_ed_old.yaml

This file was deleted.

1 change: 0 additions & 1 deletion configs/hydra_conf/model/HFmodel_ie.yaml
@@ -4,5 +4,4 @@ defaults:

linearization_class_id: ${datamodule.linearization_class_id}


_target_: src.models.IE_model.IEHFModelPL
7 changes: 0 additions & 7 deletions configs/hydra_conf/model/HFmodel_ie_old.yaml

This file was deleted.

106 changes: 104 additions & 2 deletions docker/Dockerfile
@@ -1,5 +1,107 @@
# Container image that runs your code
FROM alpine:3.10
# Choose a base image; ARG lets you set it from the command line,
# for example: docker build --build-arg BASE_IMAGE=xxx --build-arg USER_NAME=xxx --build-arg PROJECT_NAME=xxx .
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:22.02-py3
FROM $BASE_IMAGE
ARG USER_NAME
ARG PROJECT_NAME

# install sudo (necessary for adding a new user, see below)
RUN apt update && \
apt -y install sudo

# Set up your user profile with the right group permissions to access the NFS folders.
# The IDs and names used below come from the .env file, mounted as a build secret.
RUN --mount=type=secret,id=dot_env source /run/secrets/dot_env && \
groupadd -g ${GROUP_ID1} ${GROUP_NAME1} && \
groupadd -g ${GROUP_ID2} ${GROUP_NAME2} && \
useradd -rm -d /home/${USER_NAME} -s /bin/bash -g ${GROUP_ID1} -G sudo,${GROUP_ID2} -u ${USER_ID} ${USER_NAME} && \
chown ${USER_ID} -R /home/${USER_NAME} && \
echo -e "${USER_NAME}\n${USER_NAME}" | passwd ${USER_NAME}

# Set some basic ENV vars for readability
ENV HOME=/home/${USER_NAME}
ENV CONDA_PREFIX=${HOME}/.conda
ENV CONDA=${CONDA_PREFIX}/condabin/conda

# WORKDIR instruction sets the directory the following instructions should be run from
WORKDIR ${HOME}

# [optional and recommended] Install conda
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
RUN bash miniconda.sh -b -p ${CONDA_PREFIX}
RUN ${CONDA} config --set auto_activate_base false
RUN ${CONDA} init bash

# [optional] Github user configuration (necessary for accessing private repos)
RUN --mount=type=secret,id=dot_env source /run/secrets/dot_env && \
git config --global user.name ${GITHUB_NAME}
RUN --mount=type=secret,id=dot_env source /run/secrets/dot_env && \
git config --global user.email ${GITHUB_EMAIL}
# RUN git config --global pull.rebase false

# Prepare the NFS mount
RUN mkdir /mnt/dlabdata1
RUN mkdir /mnt/scratch

# Avoid ASCII errors when reading files in Python
# https://stackoverflow.com/a/60084243/12234753
RUN apt-get install -y locales && locale-gen en_US.UTF-8
ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'


##############################################
#
# project specific
#
##############################################

# Create conda environment and install requirements
COPY docker/requirements.txt .
ENV ENV_NAME=${PROJECT_NAME}
RUN ${CONDA} create -y -n ${ENV_NAME} python=3.9
RUN ${CONDA} run --name ${ENV_NAME} pip install -r requirements.txt

# [Optional] Login to wandb
RUN --mount=type=secret,id=dot_env source /run/secrets/dot_env && \
${CONDA} run -n ${ENV_NAME} wandb login ${WANDB_API_KEY}

# Login to HF
RUN --mount=type=secret,id=dot_env source /run/secrets/dot_env && \
${CONDA} run -n ${ENV_NAME} python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('${HF_API_KEY}')"
#################################################
#
# GF
#
##################################################

RUN wget https://github.com/GrammaticalFramework/gf-core/releases/download/3.11/gf-3.11-ubuntu-20.04.deb
RUN apt-get install ./gf-3.11-ubuntu-20.04.deb


# Change ownership of the whole /home/USER folder so that files created by root,
# such as git-cloned repos, are accessible to the user. By default, they are owned by root.
RUN --mount=type=secret,id=dot_env source /run/secrets/dot_env && \
chown ${USER_ID} -R /home/${USER_NAME}


# Fix a problem with running a Haskell app on an Ubuntu Docker image.
# https://github.com/snoyberg/http-client/issues/292
RUN apt-get install netbase
#################################################
#
# Install zsh
#
##################################################
RUN apt install -y zsh

# N.B. the following commands are run as the user, not as root; don't move them above the USER command.
# Installs zsh via zsh-in-docker with the "robbyrussell" theme, plus autosuggestion and completion plugins from GitHub.
RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.5/zsh-in-docker.sh)" -- \
-t robbyrussell \
-p https://github.com/zsh-users/zsh-autosuggestions \
-p https://github.com/zsh-users/zsh-completions

USER ${USER_NAME}

# Copies your code file from your action repository to the filesystem path `/` of the container
COPY docker/entrypoint.sh /entrypoint.sh
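
Note that the `--mount=type=secret` instructions above require BuildKit. A manual build would look roughly like the following (a sketch mirroring the helper script added below; it assumes you run from the repository root with `docker/.env` populated):

```bash
# BuildKit must be enabled for --secret / --mount=type=secret to work
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
  --build-arg USER_NAME=geng --build-arg PROJECT_NAME=gcd \
  --secret id=dot_env,src=docker/.env .
```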
9 changes: 9 additions & 0 deletions docker/build_image.sh
@@ -0,0 +1,9 @@
#!/bin/sh

project="${1:-gcd}" # Use "gcd" if no argument is provided
tag="${2:-latest}" # Use "latest" if no tag argument is provided
user="${3:-geng}"

docker build -f docker/Dockerfile --build-arg USER_NAME=${user} --build-arg PROJECT_NAME=${project} -t ic-registry.epfl.ch/dlab/"${user}"/"${project}":"${tag}" --secret id=dot_env,src=docker/.env .

#docker push ic-registry.epfl.ch/dlab/"${user}"/"${project}":"${tag}"
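
A usage sketch following the positional defaults above:

```bash
# project=gcd, tag=latest, user=geng; all three arguments are optional
bash docker/build_image.sh gcd latest geng
```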
Empty file removed docker/prod.Dockerfile
2 changes: 2 additions & 0 deletions docker/requirements.txt
@@ -0,0 +1,2 @@
git+https://github.com/huggingface/transformers
wandb~=0.13.7
6 changes: 5 additions & 1 deletion docs/download_data.md
@@ -3,9 +3,11 @@

## Download data for the experiments

At the root of the repository, run the following command to download the data files
```bash
git lfs install
git clone https://huggingface.co/datasets/saibo/GCD-data-v2
mv GCD-data-v2 data
```


@@ -18,7 +20,7 @@ git lfs install
git clone https://huggingface.co/datasets/saibo/GCD-grammar-v2 assets/pgf
```

Unzip the files
Unzip the compressed grammar files
```bash
cd assets/pgf
# unzip and remove the zip files
@@ -46,4 +48,6 @@ Then, we set the environment variable `HF_MODELS_DIR` to `~/models` by running the following command:
export HF_MODELS_DIR=~/models
```

Models such as LLAMA-7B need to be in HuggingFace format.

We don't provide other model weights as they are too large and may have licensing issues.
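
As an illustration, a model directory is expected to be a standard HuggingFace checkout; the listing below is an assumed layout, not files shipped with this repo:

```bash
# Hypothetical sanity check of a HuggingFace-format model directory
ls "$HF_MODELS_DIR/LLAMA-7B"
# expected: config.json, tokenizer files and the weight shards
```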
73 changes: 73 additions & 0 deletions docs/run_experiments.md
@@ -0,0 +1,73 @@
# Run experiments

## Requirements

Check that the environment variable is set correctly:
```shell
echo $HF_MODELS_DIR
```

Check that the data and grammar objects are downloaded correctly:
```shell
ls data assets/grammar_objects
# -> CP ED IE
```

Check that the compiled grammars (PGF files) are downloaded correctly:
```shell
ls assets/pgf
# -> CP ED IE
```

If anything is missing, check [download_data.md](download_data.md) for instructions on how to set it up.


## Run the experiments

### Quick start

Assuming you already have `LLAMA-7B` in `$HF_MODELS_DIR`, run the following commands:

```shell
# run the experiments for the CP task
bash run_CP.sh LLAMA-7B

# run the experiments for the IE task
bash run_IE.sh LLAMA-7B

# run the experiments for the ED task
bash run_ED.sh LLAMA-7B
```

The above scripts run the experiments for the CP, IE and ED tasks, respectively, on a few data samples.
To run the experiments on the full dataset, remove the `datamodule.debug_k=2` option from the scripts; a sketch follows.
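
For instance, assuming the option appears verbatim in each script:

```bash
# Strip the debug subsampling override in place (GNU sed)
sed -i 's/ datamodule.debug_k=2//g' run_CP.sh run_IE.sh run_ED.sh
```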

## Results

The generated prediction sequences will be logged to [Weights and Biases](https://wandb.ai/site).

## Dry run

If you don't have the model yet, you can run the experiments with a dummy model.
```shell
# run the experiments for the CP task
bash run_CP.sh saibo/llama-1B
```

`saibo/llama-1B` is a dummy model with the same tokenizer as `LLAMA-7B` but random weights; it has only two layers, so it is much smaller. Because it is randomly initialized, the predictions will be meaningless, but the pipeline can still be exercised end to end.

## Run experiments without constraints

You can reproduce the results without constraints by removing the constraint flags from the scripts.

For example, removing `+constraint/gf_constraint_module/CP@model.gf_constraint_module="$gf_constraint_module_option"` from `run_CP.sh` will run the CP experiments without constraints.
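
Afterwards, the same entry point performs unconstrained decoding:

```bash
# Unconstrained baseline, once the constraint override has been deleted
bash run_CP.sh LLAMA-7B
```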

