Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delimiter fix and readme update #4

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,30 +121,34 @@ Example command installing CLI-1.35.0
7. Get the Azure endpoint and API key using https://portal.azure.com/
Go to resource
Find Endpoint and keys

8. Set HumanFirst environment variables
```
export HF_USERNAME="<HumanFirst Username>"
export HF_PASSWORD="<HumanFirst Password>"
```
9. Set environment variables for running CLU integration
8. Set environment variables for running CLU integration
```
export CLU_ENDPOINT="<CLU Endpoint>"
export CLU_KEY="<CLU API key>"
```
**Note: In case of restarting the instance, ensure to run the following command again - `sudo sysctl -w net.ipv4.ip_unprivileged_port_start=443`**

10. Launch the integration service:
9. Launch the integration service:
```
poetry run python3 -m hf_integration.main ./credentials/mtls-credentials.json 0.0.0.0:443 <integration-generic,clu,example> "<config - key1::value1,key2::value2,..,keyN::valueN>"
```

Example:
```
poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug"
poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500,training_delimiter::---"

CLU config params:
clu_endpoint
clu_key
training_delimiter
workspace_delimiter
clu_language
clu_multilingual
clu_training_mode
max_batch_size
```

11. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF
10. If the IP address of the integration server changes, then use the following command to set the IP address of the integration server in HF
`hf integrations --id intg-id-here set-address -a <Public IP Address>:443`

## Docker
Expand Down Expand Up @@ -208,14 +212,14 @@ Follow the steps here - https://www.notion.so/humanfirst/Custom-NLU-d4bb84f08676

### Create, attach and run the commands manually
```
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null

sudo docker exec -it clu-custom-connector-0 /bin/bash

poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"

### Run the commands while creating the container
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"
sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500"

### Free up the port 443
sudo kill -9 $(sudo lsof -t -i :443)
Expand Down
22 changes: 17 additions & 5 deletions hf_integration/clu_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class clu_to_hf_converter:
def clu_to_hf_process(
self,
clu_json: dict,
delimiter: str = "-",
delimiter: str,
language: str = "en-us") -> None:

# TODO: note potential clashes with utf16 and utf8 in future depending on PVA
Expand Down Expand Up @@ -299,7 +299,10 @@ def clu_to_hf_entity_mapper(self, clu_entity_object: dict, language: str) -> dic
def clu_to_hf_intent_mapper(self, intent_name: str, hf_workspace: humanfirst.objects.HFWorkspace, delimiter: str) -> None:
"""Builds the parent and child structures for an intent name"""
# clu doesn't have separate IDs (current understanding)
intent_hierarchy = intent_name.split(delimiter)
if delimiter != "":
intent_hierarchy = intent_name.split(delimiter)
else:
intent_hierarchy = intent_name
hf_workspace.intent(intent_hierarchy)

def clu_to_hf_utterance_mapper(self,
Expand All @@ -309,7 +312,12 @@ def clu_to_hf_utterance_mapper(self,
delimiter: str) -> None:
"""Builds HF example"""
fully_qualified_intent_name = str(row["intent"])
intent_hierarchy = fully_qualified_intent_name.split(delimiter)

if delimiter != "":
intent_hierarchy = fully_qualified_intent_name.split(delimiter)
else:
intent_hierarchy = fully_qualified_intent_name

try:
tag_name = row["dataset"]
if pandas.isna(tag_name):
Expand All @@ -327,8 +335,8 @@ class hf_to_clu_converter:
def hf_to_clu_process(self,
hf_json: dict,
clu_json: dict,
delimiter: str,
language: str = "en-us",
delimiter: str = "-",
skip: bool = False) -> None:
"""Process HF to CLU conversion"""

Expand All @@ -337,7 +345,11 @@ def hf_to_clu_process(self,
# get a HFWorkspace object to get fully qualified intent names
# logger.info("delimiter blah blah")
logger.info(f"Delimiter {delimiter}")
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter)

if delimiter != "":
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=delimiter)
else:
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=None)

# get the tag for Test dataset
test_tag_id = None
Expand Down
34 changes: 27 additions & 7 deletions hf_integration/model_clu.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

TRAIN_SPLIT=100
MAX_BATCH_SIZE=1000
TRAINING_DELIMITER = "---"
CLU_SUPPORTED_LANGUAGE_CODES = [
"af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs",
"cy", "da", "de", "el", "en-us", "en-gb", "eo", "es", "et", "eu", "fa",
Expand Down Expand Up @@ -147,7 +148,6 @@ def __init__(self, config: dict) -> None:
super().__init__(config)
self.clu_api = clu_apis(clu_endpoint=self.config["clu_endpoint"],
clu_key=self.config["clu_key"])
self.workspace = WorkspaceServiceCLU(config=config)

# Check for language code support
if self.config["clu_language"] in CLU_SUPPORTED_LANGUAGE_CODES:
Expand All @@ -174,6 +174,19 @@ def __init__(self, config: dict) -> None:
if self.config["max_batch_size"] <= 0:
raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0')

# check for delimiter
if "training_delimiter" in self.config:
if self.config["training_delimiter"] != "":
self.format_options.hierarchical_delimiter=self.config["training_delimiter"]
self.config["workspace_delimiter"] = self.config["training_delimiter"]
else:
self.format_options.hierarchical_delimiter = TRAINING_DELIMITER
self.config["workspace_delimiter"] = TRAINING_DELIMITER
else:
self.format_options.hierarchical_delimiter = TRAINING_DELIMITER
self.config["workspace_delimiter"] = TRAINING_DELIMITER

self.workspace = WorkspaceServiceCLU(config=config)

def _flip_dict(self, input_dict, delimiter):
# Ensure that all values in the original dictionary are unique
Expand All @@ -182,8 +195,12 @@ def _flip_dict(self, input_dict, delimiter):

# Flip the dictionary
flipped_dict = {}
for key, value in input_dict.items():
flipped_dict[value] = [key, value.split(delimiter)[-1]]
if delimiter != "":
for key, value in input_dict.items():
flipped_dict[value] = [key, value.split(delimiter)[-1]]
else:
for key, value in input_dict.items():
flipped_dict[value] = [key, value]
return flipped_dict


Expand Down Expand Up @@ -280,7 +297,9 @@ def on_cancel():
namespace=request.namespace,
integration_id=request.integration_id,
data=request.data,
workspace_id=project_name
workspace_id=project_name,
data_format=self.data_format,
format_options=self.format_options
)

hf_file_path = os.path.join(self.snapshot_path, "import", f"{timestamp}_hf_{request.namespace}_{project_name}.json")
Expand Down Expand Up @@ -449,10 +468,11 @@ async def _Classify(self, request: models_pb2.ClassifyRequest, context) -> model
with open(self.handle_map[request.model_id]["hf_file_path"], mode="r", encoding="utf8") as f:
hf_json = json.load(f)

hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, self.config["delimiter"])
hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,
self.format_options.hierarchical_delimiter)
intent_index = self._flip_dict(hf_workspace.get_intent_index(
delimiter=self.config["delimiter"]),
delimiter=self.config["delimiter"]
delimiter=self.format_options.hierarchical_delimiter),
delimiter=self.format_options.hierarchical_delimiter
)
predictions = []
predict_results = []
Expand Down
32 changes: 28 additions & 4 deletions hf_integration/workspace_clu.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,22 @@ def __init__(self, config: dict) -> None:

self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]]

# check for delimiter
if "workspace_delimiter" in self.config:
if self.config["workspace_delimiter"] != "":
self.format_options.hierarchical_delimiter=self.config["workspace_delimiter"]


def _write_json(self,path: str, data: dict ) -> None:
with open(path,mode="w",encoding="utf8") as f:
json.dump(data,f,indent=2)

def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context) -> workspace_pb2.ListWorkspacesResponse:
"""List Workspaces"""

print("ListWorkspaces")
print(request)

workspaces = []
for project in self.clu_api.list_projects():
workspaces.append(workspace_pb2.Workspace(id=project, name=project))
Expand All @@ -163,6 +172,10 @@ def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context)
def GetWorkspace(self, request: workspace_pb2.GetWorkspaceRequest, context) -> workspace_pb2.Workspace:
"""Get workspace"""

print("GetWorkspace")

print(request)

if request.workspace_id in self.clu_api.list_projects():
return workspace_pb2.Workspace(id=request.workspace_id, name=request.workspace_id)
else:
Expand All @@ -172,6 +185,8 @@ def CreateWorkspace(self, request: workspace_pb2.CreateWorkspaceRequest, context
"""
Create a new workspace
"""
print("CreateWorkspace")
print(request)
self.clu_api.clu_create_project(project_name=request.workspace.name,
des = request.workspace.description,
language=self.language,
Expand All @@ -185,14 +200,19 @@ def GetImportParameters(self, request: workspace_pb2.GetImportParametersRequest,

In this case, we specifically request the HF json format
"""

print("GetImportParameters")
print(request)
# print(request.language_code)
return workspace_pb2.GetImportParametersResponse(data_format=self.data_format, format_options=self.format_options)

def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context) -> workspace_pb2.ImportWorkspaceResponse:
"""
Import a workspace into the integration, from the provided data exported from Studio
"""

print("ImportWorkspace")
print(request)
print(f"Hierarchical Delimiter: {request.format_options.hierarchical_delimiter}")
# print(request.language_code)
# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
project_name = self.clu_api._remove_non_alphanumeric(input_string=request.workspace_id)
Expand Down Expand Up @@ -241,7 +261,7 @@ def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context
clu_json = self.clu_converter.hf_to_clu_process(
hf_json=hf_json,
clu_json=clu_json,
delimiter=self.config["delimiter"],
delimiter=self.format_options.hierarchical_delimiter,
language=self.language)

self._write_json(
Expand All @@ -258,6 +278,10 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context
Exports a workspace from the integration, importing it into Studio
"""

print("ExportWorkspace")
print(request)
# print(request.language_code)

# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

Expand All @@ -268,7 +292,7 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context

hf_json = self.clu_converter.clu_to_hf_process(
clu_json=clu_project,
delimiter=self.config["delimiter"],
delimiter=self.format_options.hierarchical_delimiter,
language=self.language)

self._write_json(
Expand Down