Merge delimiter functionality changes with docker #5

Open
wants to merge 13 commits into master
75 changes: 75 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,75 @@
version: 2.1
workflows:
build:
jobs:
- build-docker-image:
context:
- org-global
filters:
tags:
only: /^v.*/
branches:
ignore: /.*/


aliases:
- &configure_buildx_context
name: Configure remote docker buildx context
command: |
docker buildx create --name remote-kubernetes --driver remote --driver-opt cacert=/certs/ca.pem,cert=/certs/cert.pem,key=/certs/key.pem tcp://buildkitd-0.buildkitd-headless.circleci.svc.cluster.local:1234
docker buildx use remote-kubernetes

- &gcp_auth
name: "Authenticate to Google Cloud"
command: |
# Snippet from https://discuss.circleci.com/t/walk-through-oidc-to-gcp/44224
# Configures application default credentials without requiring gcloud to be installed
GCP_OIDC_AUDIENCE="projects/${GOOGLE_PROJECT_ID}/locations/global/workloadIdentityPools/${OIDC_WIP_ID}/providers/${OIDC_WIP_PROVIDER_ID}"
GCP_IMPERSONATION_URL="https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/${OIDC_SERVICE_ACCOUNT_EMAIL}:generateAccessToken"

mkdir -p ~/.config/gcloud
echo "${CIRCLE_OIDC_TOKEN}" > $HOME/.config/gcloud/oidc_token

cat >> $HOME/.config/gcloud/application_default_credentials.json \<<- EOF
{
"type": "external_account",
"audience": "//iam.googleapis.com/${GCP_OIDC_AUDIENCE}",
"subject_token_type": "urn:ietf:params:oauth:token-type:jwt",
"token_url": "https://sts.googleapis.com/v1/token",
"credential_source": {
"file": "$HOME/.config/gcloud/oidc_token"
},
"service_account_impersonation_url": "${GCP_IMPERSONATION_URL}"
}
EOF
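
The heredoc above writes an `external_account` ADC file so later steps can exchange the CircleCI OIDC token for GCP credentials without installing gcloud. As a quick sanity check (an assumption for illustration, not part of this PR), the resulting file can be exercised with the google-auth Python library:

```
# Sanity-check sketch: assumes google-auth is installed and the file above
# exists at ~/.config/gcloud/application_default_credentials.json.
import google.auth

credentials, project_id = google.auth.default()
print(type(credentials), project_id)  # expect an external-account credential type
```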
- &configure_docker_credential
name: Install docker credential helper
command: |
mkdir -p ~/bin
pushd ~/bin
curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv
popd
export PATH=$PATH:~/bin
mkdir -p $HOME/.config/gcloud/
echo ${GOOGLE_AUTH} > $HOME/.config/gcloud/application_default_credentials.json
docker-credential-gcr configure-docker


jobs:
build-docker-image:
docker:
- image: cimg/base:current-22.04
resource_class: zia-ai/small
steps:
- checkout
- run: *configure_buildx_context
- run: *gcp_auth
- run: *configure_docker_credential
- run:
name: Build docker image
command: |
IMAGE_SHA1=$(echo $CIRCLE_SHA1 | cut -c -7)
IMAGE_BRANCH=$(echo $CIRCLE_BRANCH | sed -e 's/\//\-/g')
IMAGE_TAG="${CIRCLE_TAG:-$IMAGE_BRANCH}"
IMAGE_NAME="clu-integration"
docker buildx build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_SHA1 --progress plain --push .
27 changes: 27 additions & 0 deletions .dockerignore
@@ -0,0 +1,27 @@

.git
.github
.dockerignore
.gitignore

.idea
.vscode

__pycache__/
*.py[cod]
*$py.class
*.so
htmlcov/
.coverage
.coverage.*
.pytest_cache/
.venv
venv

.DS_Store
.AppleDouble
.LSOverride
._*

.vscode
.idea
5 changes: 3 additions & 2 deletions Dockerfile
@@ -11,9 +11,10 @@ WORKDIR /src
# RUN git clone -b <branch_name> --single-branch https://github.com/zia-ai/hf-custom-integration.git .

# Clone from master branch
RUN git clone https://github.com/zia-ai/hf-custom-integration.git .
# RUN git clone https://github.com/zia-ai/hf-custom-integration.git .
COPY . /src

# Generate MTLS credentials from the commands given in the README.md
COPY ./credentials /src/credentials
# COPY ./credentials /src/credentials

RUN poetry config virtualenvs.create false && poetry install
30 changes: 17 additions & 13 deletions README.md
@@ -121,30 +121,34 @@ Example command installing CLI-1.35.0
7. Get the Azure endpoint and API key from https://portal.azure.com/
Go to the resource
Find the endpoint and keys

8. Set HumanFirst environment variables
```
export HF_USERNAME="<HumanFirst Username>"
export HF_PASSWORD="<HumanFirst Password>"
```
9. Set environment variables for running CLU integration
8. Set environment variables for running the CLU integration
```
export CLU_ENDPOINT="<CLU Endpoint>"
export CLU_KEY="<CLU API key>"
```
**Note: If the instance is restarted, run the following command again - `sudo sysctl -w net.ipv4.ip_unprivileged_port_start=443`**

10. Launch the integration service:
9. Launch the integration service (a sketch of the config-string format follows the example below):
```
poetry run python3 -m hf_integration.main ./credentials/mtls-credentials.json 0.0.0.0:443 <integration-generic,clu,example> "<config - key1::value1,key2::value2,..,keyN::valueN>"
```

Example:
```
poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug"
poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500,training_delimiter::---"

CLU config params:
clu_endpoint
clu_key
training_delimiter
workspace_delimiter
clu_language
clu_multilingual
clu_training_mode
max_batch_size
```
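
The config argument packs settings as comma-separated `key::value` pairs. For illustration only, a minimal Python sketch of how such a string could be parsed (the helper is hypothetical; the actual parser in `hf_integration.main` may differ, e.g. for values that themselves contain commas):

```
# Hypothetical helper, not part of the repo: parses "key1::value1,key2::value2"
def parse_config_string(raw: str) -> dict:
    config = {}
    for pair in raw.split(","):
        key, _, value = pair.partition("::")
        config[key.strip()] = value.strip()
    return config

print(parse_config_string("clu_language::en-us,clu_multilingual::True,training_delimiter::---"))
# {'clu_language': 'en-us', 'clu_multilingual': 'True', 'training_delimiter': '---'}
```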

11. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF
10. If the IP address of the integration server changes, use the following command to update the integration server's address in HumanFirst
`hf integrations --id intg-id-here set-address -a <Public IP Address>:443`

## Docker
@@ -208,14 +212,14 @@ Follow the steps here - https://www.notion.so/humanfirst/Custom-NLU-d4bb84f08676

### Create, attach and run the commands manually
```
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null

sudo docker exec -it clu-custom-connector-0 /bin/bash

poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"
poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"

### Run the commands while creating the container
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"

### Free up the port 443
sudo kill -9 $(sudo lsof -t -i :443)
22 changes: 17 additions & 5 deletions hf_integration/clu_converters.py
@@ -137,7 +137,7 @@ class clu_to_hf_converter:
def clu_to_hf_process(
self,
clu_json: dict,
delimiter: str = "-",
delimiter: str,
language: str = "en-us") -> None:

# TODO: note potential clashes with utf16 and utf8 in future depending on PVA
@@ -299,7 +299,10 @@ def clu_to_hf_entity_mapper(self, clu_entity_object: dict, language: str) -> dic
def clu_to_hf_intent_mapper(self, intent_name: str, hf_workspace: humanfirst.objects.HFWorkspace, delimiter: str) -> None:
"""Builds the parent and child structures for an intent name"""
# clu doesn't have separate IDs (current understanding)
intent_hierarchy = intent_name.split(delimiter)
if delimiter != "":
intent_hierarchy = intent_name.split(delimiter)
else:
intent_hierarchy = intent_name
hf_workspace.intent(intent_hierarchy)

def clu_to_hf_utterance_mapper(self,
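
For reference, a standalone sketch of the delimiter semantics the mappers above implement (`to_hierarchy` is illustrative, not part of the codebase): a non-empty delimiter splits a CLU intent name into a parent/child hierarchy, while an empty delimiter keeps the name as a single flat intent.

```
def to_hierarchy(intent_name: str, delimiter: str) -> list:
    # Mirrors the branching above: split on the delimiter when one is set,
    # otherwise keep a single-element hierarchy.
    if delimiter != "":
        return intent_name.split(delimiter)
    return [intent_name]

print(to_hierarchy("billing---update_address", "---"))  # ['billing', 'update_address']
print(to_hierarchy("billing---update_address", ""))     # ['billing---update_address']
```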
@@ -309,7 +312,12 @@ def clu_to_hf_utterance_mapper(self,
delimiter: str) -> None:
"""Builds HF example"""
fully_qualified_intent_name = str(row["intent"])
intent_hierarchy = fully_qualified_intent_name.split(delimiter)

if delimiter != "":
intent_hierarchy = fully_qualified_intent_name.split(delimiter)
else:
intent_hierarchy = fully_qualified_intent_name

try:
tag_name = row["dataset"]
if pandas.isna(tag_name):
@@ -327,8 +335,8 @@ class hf_to_clu_converter:
def hf_to_clu_process(self,
hf_json: dict,
clu_json: dict,
delimiter: str,
language: str = "en-us",
delimiter: str = "-",
skip: bool = False) -> None:
"""Process HF to CLU conversion"""

@@ -337,7 +345,11 @@ def hf_to_clu_process(self,
# get a HFWorkspace object to get fully qualified intent names
logger.info(f"Delimiter {delimiter}")
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter)

if delimiter != "":
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=delimiter)
else:
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=None)

# get the tag for Test dataset
test_tag_id = None
34 changes: 27 additions & 7 deletions hf_integration/model_clu.py
@@ -32,6 +32,7 @@

TRAIN_SPLIT=100
MAX_BATCH_SIZE=1000
TRAINING_DELIMITER = "---"
CLU_SUPPORTED_LANGUAGE_CODES = [
"af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs",
"cy", "da", "de", "el", "en-us", "en-gb", "eo", "es", "et", "eu", "fa",
@@ -147,7 +148,6 @@ def __init__(self, config: dict) -> None:
super().__init__(config)
self.clu_api = clu_apis(clu_endpoint=self.config["clu_endpoint"],
clu_key=self.config["clu_key"])
self.workspace = WorkspaceServiceCLU(config=config)

# Check for language code support
if self.config["clu_language"] in CLU_SUPPORTED_LANGUAGE_CODES:
@@ -174,6 +174,19 @@ def __init__(self, config: dict) -> None:
if self.config["max_batch_size"] <= 0:
raise RuntimeError('Max batch size cannot be less than or equal to 0')

# check for delimiter: fall back to the default when unset or empty
delimiter = self.config.get("training_delimiter") or TRAINING_DELIMITER
self.format_options.hierarchical_delimiter = delimiter
self.config["workspace_delimiter"] = delimiter

self.workspace = WorkspaceServiceCLU(config=config)

def _flip_dict(self, input_dict, delimiter):
# Ensure that all values in the original dictionary are unique
@@ -182,8 +195,12 @@ def _flip_dict(self, input_dict, delimiter):

# Flip the dictionary
flipped_dict = {}
for key, value in input_dict.items():
flipped_dict[value] = [key, value.split(delimiter)[-1]]
if delimiter != "":
for key, value in input_dict.items():
flipped_dict[value] = [key, value.split(delimiter)[-1]]
else:
for key, value in input_dict.items():
flipped_dict[value] = [key, value]
return flipped_dict
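
A worked example of what `_flip_dict` returns with the default `---` delimiter (sample IDs and names are invented):

```
# Invented sample: the intent index maps intent IDs to fully qualified names;
# the flipped index keys by name and keeps [id, leaf_name].
intent_index = {
    "intent-001": "billing---update_address",
    "intent-002": "greetings",
}
flipped = {}
for key, value in intent_index.items():
    flipped[value] = [key, value.split("---")[-1]]
print(flipped)
# {'billing---update_address': ['intent-001', 'update_address'],
#  'greetings': ['intent-002', 'greetings']}
```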


@@ -280,7 +297,9 @@ def on_cancel():
namespace=request.namespace,
integration_id=request.integration_id,
data=request.data,
workspace_id=project_name
workspace_id=project_name,
data_format=self.data_format,
format_options=self.format_options
)

hf_file_path = os.path.join(self.snapshot_path, "import", f"{timestamp}_hf_{request.namespace}_{project_name}.json")
@@ -449,10 +468,11 @@ async def _Classify(self, request: models_pb2.ClassifyRequest, context) -> model
with open(self.handle_map[request.model_id]["hf_file_path"], mode="r", encoding="utf8") as f:
hf_json = json.load(f)

hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, self.config["delimiter"])
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,
self.format_options.hierarchical_delimiter)
intent_index = self._flip_dict(hf_workspace.get_intent_index(
delimiter=self.config["delimiter"]),
delimiter=self.config["delimiter"]
delimiter=self.format_options.hierarchical_delimiter),
delimiter=self.format_options.hierarchical_delimiter
)
predictions = []
predict_results = []