From 7d993bc2248456502f05c470ee0607d0fa24b6a9 Mon Sep 17 00:00:00 2001
From: Mohammed Fayaz Ansar Jelani <amdfayazz@gmail.com>
Date: Tue, 26 Nov 2024 05:17:10 +0000
Subject: [PATCH 1/2] Fix delimiter handling and support an empty delimiter

---
 README.md                        | 16 +++++-----------
 hf_integration/clu_converters.py | 22 +++++++++++++++++-----
 hf_integration/model_clu.py      | 16 ++++++++++++----
 hf_integration/workspace_clu.py  | 32 ++++++++++++++++++++++++++++----
 4 files changed, 62 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
index d2071fa..71dc4f0 100644
--- a/README.md
+++ b/README.md
@@ -121,20 +121,14 @@ Example command installing CLI-1.35.0
 7. Get the Azure endpoint and API key using https://portal.azure.com/
     Go to resource
     Find Endpoint and keys
-    
-8. Set HumanFirst environment variables
-    ```
-    export HF_USERNAME="<HumanFirst Username>"
-    export HF_PASSWORD="<HumanFirst Password>"
-    ```
-9. Set environment variables for running CLU integration
+8. Set environment variables for running CLU integration
     ```
     export CLU_ENDPOINT="<CLU Endpoint>"
     export CLU_KEY="<CLU API key>"
     ```
 **Note: In case of restarting the instance, ensure to run the follwoing command again - `sudo sysctl -w net.ipv4.ip_unprivileged_port_start=443`**
 
-10. Launch the integration service:
+9. Launch the integration service:
     ```
     poetry run python3 -m hf_integration.main ./credentials/mtls-credentials.json 0.0.0.0:443 <integration-generic,clu,example> "<config - key1::value1,key2::value2,..,keyN::valueN>"
     ```
@@ -144,7 +138,7 @@ Example command installing CLI-1.35.0
     poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug"
     ```
 
-11. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF
+10. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF
 `hf integrations --id intg-id-here set-address -a <Public IP Address>:443`
 
 ## Docker
@@ -208,14 +202,14 @@ Follow the steps here - https://www.notion.so/humanfirst/Custom-NLU-d4bb84f08676
 
 ### Create, attach and run the commands manually
 ```
-sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null
+sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null
 
 sudo docker exec -it clu-custom-connector-0 /bin/bash
 
 poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"
 
 ### Run the commands while creating the container
-sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"
+sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"
 
 ### Free up the port 443
 sudo kill -9 $(sudo lsof -t -i :443)
diff --git a/hf_integration/clu_converters.py b/hf_integration/clu_converters.py
index 44d2bf2..4628674 100644
--- a/hf_integration/clu_converters.py
+++ b/hf_integration/clu_converters.py
@@ -137,7 +137,7 @@ class clu_to_hf_converter:
     def clu_to_hf_process(
             self,
             clu_json: dict,
-            delimiter: str = "-",
+            delimiter: str,
             language: str = "en-us") -> None:
 
         # TODO: note potential clashes with utf16 and utf8 in future depending on PVA
@@ -299,7 +299,10 @@ def clu_to_hf_entity_mapper(self, clu_entity_object: dict, language: str) -> dic
     def clu_to_hf_intent_mapper(self, intent_name: str, hf_workspace: humanfirst.objects.HFWorkspace, delimiter: str) -> None:
         """Builds the parent and child structures for an intent name"""
         # clu doesn't have separate IDs (current understanding)
-        intent_hierarchy = intent_name.split(delimiter)
+        if delimiter != "":
+            intent_hierarchy = intent_name.split(delimiter)
+        else:
+            intent_hierarchy = intent_name
         hf_workspace.intent(intent_hierarchy)
 
     def clu_to_hf_utterance_mapper(self, 
@@ -309,7 +312,12 @@ def clu_to_hf_utterance_mapper(self,
                                    delimiter: str) -> None:
         """Builds HF example"""
         fully_qualified_intent_name = str(row["intent"])
-        intent_hierarchy = fully_qualified_intent_name.split(delimiter)
+
+        if delimiter != "":
+            intent_hierarchy = fully_qualified_intent_name.split(delimiter)
+        else:
+            intent_hierarchy = fully_qualified_intent_name
+
         try:
             tag_name = row["dataset"]
             if pandas.isna(tag_name):
@@ -327,8 +335,8 @@ class hf_to_clu_converter:
     def hf_to_clu_process(self,
                         hf_json: dict,
                         clu_json: dict,
+                        delimiter: str,
                         language: str = "en-us",
-                        delimiter: str = "-",
                         skip: bool = False) -> None:
         """Process HF to CLU conversion"""
 
@@ -337,7 +345,11 @@ def hf_to_clu_process(self,
         # get a HFWorkspace object to get fully qualified intent names
         # logger.info("delimiter blah blah")
         logger.info(f"Delimiter {delimiter}")
-        hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter)
+
+        if delimiter != "":
+            hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=delimiter)
+        else:
+            hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=None)
 
         # get the tag for Test dataset
         test_tag_id = None
diff --git a/hf_integration/model_clu.py b/hf_integration/model_clu.py
index 96b0e8a..e90f8a6 100644
--- a/hf_integration/model_clu.py
+++ b/hf_integration/model_clu.py
@@ -174,6 +174,11 @@ def __init__(self, config: dict) -> None:
         if self.config["max_batch_size"] <= 0:
             raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0')
 
+        # check for delimiter
+        if "delimiter" in self.config:
+            if self.config["delimiter"] != "":
+                self.format_options.hierarchical_delimiter=self.config["delimiter"]
+
 
     def _flip_dict(self, input_dict, delimiter):
         # Ensure that all values in the original dictionary are unique
@@ -280,7 +285,9 @@ def on_cancel():
                 namespace=request.namespace,
                 integration_id=request.integration_id,
                 data=request.data,
-                workspace_id=project_name
+                workspace_id=project_name,
+                data_format=self.data_format,
+                format_options=self.format_options
             )
 
             hf_file_path = os.path.join(self.snapshot_path, "import", f"{timestamp}_hf_{request.namespace}_{project_name}.json")
@@ -449,10 +456,11 @@ async def _Classify(self, request: models_pb2.ClassifyRequest, context) -> model
             with open(self.handle_map[request.model_id]["hf_file_path"], mode="r", encoding="utf8") as f:
                 hf_json = json.load(f)
 
-            hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, self.config["delimiter"])
+            hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,
+                                                                    self.format_options.hierarchical_delimiter)
             intent_index = self._flip_dict(hf_workspace.get_intent_index(
-                delimiter=self.config["delimiter"]),
-                delimiter=self.config["delimiter"]
+                delimiter=self.format_options.hierarchical_delimiter),
+                delimiter=self.format_options.hierarchical_delimiter
             )
             predictions = []
             predict_results = []
diff --git a/hf_integration/workspace_clu.py b/hf_integration/workspace_clu.py
index d22911e..c30e44f 100644
--- a/hf_integration/workspace_clu.py
+++ b/hf_integration/workspace_clu.py
@@ -147,6 +147,12 @@ def __init__(self, config: dict) -> None:
         
         self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]]
 
+        # check for delimiter
+        if "delimiter" in self.config:
+            if self.config["delimiter"] != "":
+                self.format_options.hierarchical_delimiter=self.config["delimiter"]
+
+
     def _write_json(self,path: str, data: dict ) -> None:
         with open(path,mode="w",encoding="utf8") as f:
             json.dump(data,f,indent=2)
@@ -154,6 +160,9 @@ def _write_json(self,path: str, data: dict ) -> None:
     def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context) -> workspace_pb2.ListWorkspacesResponse:
         """List Workspaces"""
 
+        print("ListWorkspaces")
+        print(request)
+
         workspaces = []
         for project in self.clu_api.list_projects():
                 workspaces.append(workspace_pb2.Workspace(id=project, name=project))
@@ -163,6 +172,10 @@ def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context)
     def GetWorkspace(self, request: workspace_pb2.GetWorkspaceRequest, context) -> workspace_pb2.Workspace:
         """Get workspace"""
 
+        print("GetWorkspace")
+
+        print(request)
+
         if request.workspace_id in self.clu_api.list_projects():
             return workspace_pb2.Workspace(id=request.workspace_id, name=request.workspace_id)
         else:
@@ -172,6 +185,8 @@ def CreateWorkspace(self, request: workspace_pb2.CreateWorkspaceRequest, context
         """
         Create a new workspace
         """
+        print("CreateWorkspace")
+        print(request)
         self.clu_api.clu_create_project(project_name=request.workspace.name,
                                         des = request.workspace.description,
                                         language=self.language,
@@ -185,14 +200,19 @@ def GetImportParameters(self, request: workspace_pb2.GetImportParametersRequest,
 
         In this case, we specifically request the HF json format
         """
-
+        print("GetImportParameters")
+        print(request)
+        # print(request.language_code)
         return workspace_pb2.GetImportParametersResponse(data_format=self.data_format, format_options=self.format_options)
 
     def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context) -> workspace_pb2.ImportWorkspaceResponse:
         """
         Import a workspace into the integration, from the provided data exported from Studio
         """
-
+        print("ImportWorkspace")
+        print(request)
+        print(f"Hierarchical Delimiter: {request.format_options.hierarchical_delimiter}")
+        # print(request.language_code)
         # Get the current timestamp
         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
         project_name = self.clu_api._remove_non_alphanumeric(input_string=request.workspace_id)
@@ -241,7 +261,7 @@ def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context
         clu_json = self.clu_converter.hf_to_clu_process(
             hf_json=hf_json,
             clu_json=clu_json,
-            delimiter=self.config["delimiter"],
+            delimiter=self.format_options.hierarchical_delimiter,
             language=self.language)
 
         self._write_json(
@@ -258,6 +278,10 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context
         Exports a workspace from the integration, importing it into Studio
         """
 
+        print("ExportWorkspace")
+        print(request)
+        # print(request.language_code)
+
         # Get the current timestamp
         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
 
@@ -268,7 +292,7 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context
 
         hf_json = self.clu_converter.clu_to_hf_process(
             clu_json=clu_project,
-            delimiter=self.config["delimiter"],
+            delimiter=self.format_options.hierarchical_delimiter,
             language=self.language)
 
         self._write_json(

From 2e2a154d1df0e0445e42dfbc32e72a3e6b69dc69 Mon Sep 17 00:00:00 2001
From: Mohammed Fayaz Ansar Jelani <amdfayazz@gmail.com>
Date: Wed, 11 Dec 2024 07:37:58 +0000
Subject: [PATCH 2/2] Replace delimiter config with training_delimiter/workspace_delimiter (default "---")

---
 README.md                       | 12 +++++++++++-
 hf_integration/model_clu.py     | 24 ++++++++++++++++++------
 hf_integration/workspace_clu.py |  6 +++---
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 71dc4f0..99e1f42 100644
--- a/README.md
+++ b/README.md
@@ -135,7 +135,17 @@ Example command installing CLI-1.35.0
 
     Example:
     ```
-    poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug"
+    poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500,training_delimiter::---"
+
+    CLU config params:
+    clu_endpoint
+    clu_key
+    training_delimiter
+    workspace_delimiter
+    clu_language
+    clu_multilingual
+    clu_training_mode
+    max_batch_size
     ```
 
 10. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF
diff --git a/hf_integration/model_clu.py b/hf_integration/model_clu.py
index e90f8a6..419aeb0 100644
--- a/hf_integration/model_clu.py
+++ b/hf_integration/model_clu.py
@@ -32,6 +32,7 @@
 
 TRAIN_SPLIT=100
 MAX_BATCH_SIZE=1000
+TRAINING_DELIMITER = "---"
 CLU_SUPPORTED_LANGUAGE_CODES = [
     "af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", 
     "cy", "da", "de", "el", "en-us", "en-gb", "eo", "es", "et", "eu", "fa", 
@@ -147,7 +148,6 @@ def __init__(self, config: dict) -> None:
         super().__init__(config)
         self.clu_api = clu_apis(clu_endpoint=self.config["clu_endpoint"],
                                 clu_key=self.config["clu_key"])
-        self.workspace = WorkspaceServiceCLU(config=config)
 
         # Check for language code support
         if self.config["clu_language"] in CLU_SUPPORTED_LANGUAGE_CODES:
@@ -175,10 +175,18 @@ def __init__(self, config: dict) -> None:
             raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0')
 
         # check for delimiter
-        if "delimiter" in self.config:
-            if self.config["delimiter"] != "":
-                self.format_options.hierarchical_delimiter=self.config["delimiter"]
+        if "training_delimiter" in self.config:
+            if self.config["training_delimiter"] != "":
+                self.format_options.hierarchical_delimiter=self.config["training_delimiter"]
+                self.config["workspace_delimiter"] = self.config["training_delimiter"]
+            else:
+                self.format_options.hierarchical_delimiter = TRAINING_DELIMITER
+                self.config["workspace_delimiter"] = TRAINING_DELIMITER
+        else:
+            self.format_options.hierarchical_delimiter = TRAINING_DELIMITER
+            self.config["workspace_delimiter"] = TRAINING_DELIMITER
 
+        self.workspace = WorkspaceServiceCLU(config=config)
 
     def _flip_dict(self, input_dict, delimiter):
         # Ensure that all values in the original dictionary are unique
@@ -187,8 +195,12 @@ def _flip_dict(self, input_dict, delimiter):
 
         # Flip the dictionary
         flipped_dict = {}
-        for key, value in input_dict.items():
-            flipped_dict[value] = [key, value.split(delimiter)[-1]]
+        if delimiter != "":
+            for key, value in input_dict.items():
+                flipped_dict[value] = [key, value.split(delimiter)[-1]]
+        else:
+            for key, value in input_dict.items():
+                flipped_dict[value] = [key, value]
         return flipped_dict
 
 
diff --git a/hf_integration/workspace_clu.py b/hf_integration/workspace_clu.py
index c30e44f..ebecb54 100644
--- a/hf_integration/workspace_clu.py
+++ b/hf_integration/workspace_clu.py
@@ -148,9 +148,9 @@ def __init__(self, config: dict) -> None:
         self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]]
 
         # check for delimiter
-        if "delimiter" in self.config:
-            if self.config["delimiter"] != "":
-                self.format_options.hierarchical_delimiter=self.config["delimiter"]
+        if "workspace_delimiter" in self.config:
+            if self.config["workspace_delimiter"] != "":
+                self.format_options.hierarchical_delimiter=self.config["workspace_delimiter"]
 
 
     def _write_json(self,path: str, data: dict ) -> None: