From 7d993bc2248456502f05c470ee0607d0fa24b6a9 Mon Sep 17 00:00:00 2001 From: Mohammed Fayaz Ansar Jelani Date: Tue, 26 Nov 2024 05:17:10 +0000 Subject: [PATCH 1/2] delimiter fix --- README.md | 16 +++++----------- hf_integration/clu_converters.py | 22 +++++++++++++++++----- hf_integration/model_clu.py | 16 ++++++++++++---- hf_integration/workspace_clu.py | 32 ++++++++++++++++++++++++++++---- 4 files changed, 62 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index d2071fa..71dc4f0 100644 --- a/README.md +++ b/README.md @@ -121,20 +121,14 @@ Example command installing CLI-1.35.0 7. Get the Azure endpoint and API key using https://portal.azure.com/ Go to resource Find Endpoint and keys - -8. Set HumanFirst environment variables - ``` - export HF_USERNAME="" - export HF_PASSWORD="" - ``` -9. Set environment variables for running CLU integration +8. Set environment variables for running CLU integration ``` export CLU_ENDPOINT="" export CLU_KEY="" ``` **Note: In case of restarting the instance, ensure to run the follwoing command again - `sudo sysctl -w net.ipv4.ip_unprivileged_port_start=443`** -10. Launch the integration service: +9. Launch the integration service: ``` poetry run python3 -m hf_integration.main ./credentials/mtls-credentials.json 0.0.0.0:443 "" ``` @@ -144,7 +138,7 @@ Example command installing CLI-1.35.0 poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug" ``` -11. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF +10. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF `hf integrations --id intg-id-here set-address -a :443` ## Docker @@ -208,14 +202,14 @@ Follow the steps here - https://www.notion.so/humanfirst/Custom-NLU-d4bb84f08676 ### Create, attach and run the commands manually ``` -sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null +sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null sudo docker exec -it clu-custom-connector-0 /bin/bash poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" ### Run the commands while creating the container -sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" +sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" ### Free up the port 443 sudo kill -9 $(sudo lsof -t -i :443) diff --git a/hf_integration/clu_converters.py b/hf_integration/clu_converters.py index 44d2bf2..4628674 100644 --- a/hf_integration/clu_converters.py +++ b/hf_integration/clu_converters.py @@ -137,7 +137,7 @@ class clu_to_hf_converter: def clu_to_hf_process( self, clu_json: dict, - delimiter: str = "-", + delimiter: str, language: str = "en-us") -> None: # TODO: note potential clashes with utf16 and utf8 in future depending on PVA @@ -299,7 +299,10 @@ def clu_to_hf_entity_mapper(self, clu_entity_object: dict, language: str) -> dic def clu_to_hf_intent_mapper(self, intent_name: str, hf_workspace: humanfirst.objects.HFWorkspace, delimiter: str) -> None: """Builds the parent and child structures for an intent name""" # clu doesn't have separate IDs (current understanding) - intent_hierarchy = intent_name.split(delimiter) + if delimiter != "": + intent_hierarchy = intent_name.split(delimiter) + else: + intent_hierarchy = intent_name hf_workspace.intent(intent_hierarchy) def clu_to_hf_utterance_mapper(self, @@ -309,7 +312,12 @@ def clu_to_hf_utterance_mapper(self, delimiter: str) -> None: """Builds HF example""" fully_qualified_intent_name = str(row["intent"]) - intent_hierarchy = fully_qualified_intent_name.split(delimiter) + + if delimiter != "": + intent_hierarchy = fully_qualified_intent_name.split(delimiter) + else: + intent_hierarchy = fully_qualified_intent_name + try: tag_name = row["dataset"] if pandas.isna(tag_name): @@ -327,8 +335,8 @@ class hf_to_clu_converter: def hf_to_clu_process(self, hf_json: dict, clu_json: dict, + delimiter: str, language: str = "en-us", - delimiter: str = "-", skip: bool = False) -> None: """Process HF to CLU conversion""" @@ -337,7 +345,11 @@ def hf_to_clu_process(self, # get a HFWorkspace object to get fully qualified intent names # logger.info("delimiter blah blah") logger.info(f"Delimiter {delimiter}") - hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter) + + if delimiter != "": + hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=delimiter) + else: + hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=None) # get the tag for Test dataset test_tag_id = None diff --git a/hf_integration/model_clu.py b/hf_integration/model_clu.py index 96b0e8a..e90f8a6 100644 --- a/hf_integration/model_clu.py +++ b/hf_integration/model_clu.py @@ -174,6 +174,11 @@ def __init__(self, config: dict) -> None: if self.config["max_batch_size"] <= 0: raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0') + # check for delimiter + if "delimiter" in self.config: + if self.config["delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["delimiter"] + def _flip_dict(self, input_dict, delimiter): # Ensure that all values in the original dictionary are unique @@ -280,7 +285,9 @@ def on_cancel(): namespace=request.namespace, integration_id=request.integration_id, data=request.data, - workspace_id=project_name + workspace_id=project_name, + data_format=self.data_format, + format_options=self.format_options ) hf_file_path = os.path.join(self.snapshot_path, "import", f"{timestamp}_hf_{request.namespace}_{project_name}.json") @@ -449,10 +456,11 @@ async def _Classify(self, request: models_pb2.ClassifyRequest, context) -> model with open(self.handle_map[request.model_id]["hf_file_path"], mode="r", encoding="utf8") as f: hf_json = json.load(f) - hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, self.config["delimiter"]) + hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, + self.format_options.hierarchical_delimiter) intent_index = self._flip_dict(hf_workspace.get_intent_index( - delimiter=self.config["delimiter"]), - delimiter=self.config["delimiter"] + delimiter=self.format_options.hierarchical_delimiter), + delimiter=self.format_options.hierarchical_delimiter ) predictions = [] predict_results = [] diff --git a/hf_integration/workspace_clu.py b/hf_integration/workspace_clu.py index d22911e..c30e44f 100644 --- a/hf_integration/workspace_clu.py +++ b/hf_integration/workspace_clu.py @@ -147,6 +147,12 @@ def __init__(self, config: dict) -> None: self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]] + # check for delimiter + if "delimiter" in self.config: + if self.config["delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["delimiter"] + + def _write_json(self,path: str, data: dict ) -> None: with open(path,mode="w",encoding="utf8") as f: json.dump(data,f,indent=2) @@ -154,6 +160,9 @@ def _write_json(self,path: str, data: dict ) -> None: def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context) -> workspace_pb2.ListWorkspacesResponse: """List Workspaces""" + print("ListWorkspaces") + print(request) + workspaces = [] for project in self.clu_api.list_projects(): workspaces.append(workspace_pb2.Workspace(id=project, name=project)) @@ -163,6 +172,10 @@ def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context) def GetWorkspace(self, request: workspace_pb2.GetWorkspaceRequest, context) -> workspace_pb2.Workspace: """Get workspace""" + print("GetWorkspace") + + print(request) + if request.workspace_id in self.clu_api.list_projects(): return workspace_pb2.Workspace(id=request.workspace_id, name=request.workspace_id) else: @@ -172,6 +185,8 @@ def CreateWorkspace(self, request: workspace_pb2.CreateWorkspaceRequest, context """ Create a new workspace """ + print("CreateWorkspace") + print(request) self.clu_api.clu_create_project(project_name=request.workspace.name, des = request.workspace.description, language=self.language, @@ -185,14 +200,19 @@ def GetImportParameters(self, request: workspace_pb2.GetImportParametersRequest, In this case, we specifically request the HF json format """ - + print("GetImportParameters") + print(request) + # print(request.language_code) return workspace_pb2.GetImportParametersResponse(data_format=self.data_format, format_options=self.format_options) def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context) -> workspace_pb2.ImportWorkspaceResponse: """ Import a workspace into the integration, from the provided data exported from Studio """ - + print("ImportWorkspace") + print(request) + print(f"Hierarchical Delimiter: {request.format_options.hierarchical_delimiter}") + # print(request.language_code) # Get the current timestamp timestamp = datetime.now().strftime("%Y%m%d%H%M%S") project_name = self.clu_api._remove_non_alphanumeric(input_string=request.workspace_id) @@ -241,7 +261,7 @@ def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context clu_json = self.clu_converter.hf_to_clu_process( hf_json=hf_json, clu_json=clu_json, - delimiter=self.config["delimiter"], + delimiter=self.format_options.hierarchical_delimiter, language=self.language) self._write_json( @@ -258,6 +278,10 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context Exports a workspace from the integration, importing it into Studio """ + print("ExportWorkspace") + print(request) + # print(request.language_code) + # Get the current timestamp timestamp = datetime.now().strftime("%Y%m%d%H%M%S") @@ -268,7 +292,7 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context hf_json = self.clu_converter.clu_to_hf_process( clu_json=clu_project, - delimiter=self.config["delimiter"], + delimiter=self.format_options.hierarchical_delimiter, language=self.language) self._write_json( From 2e2a154d1df0e0445e42dfbc32e72a3e6b69dc69 Mon Sep 17 00:00:00 2001 From: Mohammed Fayaz Ansar Jelani Date: Wed, 11 Dec 2024 07:37:58 +0000 Subject: [PATCH 2/2] training delimiter --- README.md | 12 +++++++++++- hf_integration/model_clu.py | 24 ++++++++++++++++++------ hf_integration/workspace_clu.py | 6 +++--- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 71dc4f0..99e1f42 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,17 @@ Example command installing CLI-1.35.0 Example: ``` - poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug" + poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500,training_delimiter::---" + + CLU config params: + clu_endpoint + clu_key + training_delimiter + workspace_delimiter + clu_language + clu_multilingual + clu_training_mode + max_batch_size ``` 10. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF diff --git a/hf_integration/model_clu.py b/hf_integration/model_clu.py index e90f8a6..419aeb0 100644 --- a/hf_integration/model_clu.py +++ b/hf_integration/model_clu.py @@ -32,6 +32,7 @@ TRAIN_SPLIT=100 MAX_BATCH_SIZE=1000 +TRAINING_DELIMITER = "---" CLU_SUPPORTED_LANGUAGE_CODES = [ "af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en-us", "en-gb", "eo", "es", "et", "eu", "fa", @@ -147,7 +148,6 @@ def __init__(self, config: dict) -> None: super().__init__(config) self.clu_api = clu_apis(clu_endpoint=self.config["clu_endpoint"], clu_key=self.config["clu_key"]) - self.workspace = WorkspaceServiceCLU(config=config) # Check for language code support if self.config["clu_language"] in CLU_SUPPORTED_LANGUAGE_CODES: @@ -175,10 +175,18 @@ def __init__(self, config: dict) -> None: raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0') # check for delimiter - if "delimiter" in self.config: - if self.config["delimiter"] != "": - self.format_options.hierarchical_delimiter=self.config["delimiter"] + if "training_delimiter" in self.config: + if self.config["training_delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["training_delimiter"] + self.config["workspace_delimiter"] = self.config["training_delimiter"] + else: + self.format_options.hierarchical_delimiter = TRAINING_DELIMITER + self.config["workspace_delimiter"] = TRAINING_DELIMITER + else: + self.format_options.hierarchical_delimiter = TRAINING_DELIMITER + self.config["workspace_delimiter"] = TRAINING_DELIMITER + self.workspace = WorkspaceServiceCLU(config=config) def _flip_dict(self, input_dict, delimiter): # Ensure that all values in the original dictionary are unique @@ -187,8 +195,12 @@ def _flip_dict(self, input_dict, delimiter): # Flip the dictionary flipped_dict = {} - for key, value in input_dict.items(): - flipped_dict[value] = [key, value.split(delimiter)[-1]] + if delimiter != "": + for key, value in input_dict.items(): + flipped_dict[value] = [key, value.split(delimiter)[-1]] + else: + for key, value in input_dict.items(): + flipped_dict[value] = [key, value] return flipped_dict diff --git a/hf_integration/workspace_clu.py b/hf_integration/workspace_clu.py index c30e44f..ebecb54 100644 --- a/hf_integration/workspace_clu.py +++ b/hf_integration/workspace_clu.py @@ -148,9 +148,9 @@ def __init__(self, config: dict) -> None: self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]] # check for delimiter - if "delimiter" in self.config: - if self.config["delimiter"] != "": - self.format_options.hierarchical_delimiter=self.config["delimiter"] + if "workspace_delimiter" in self.config: + if self.config["workspace_delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["workspace_delimiter"] def _write_json(self,path: str, data: dict ) -> None: