From 2b7267077f0feb018260a9e765a38d1b5dc6acbf Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Wed, 10 Jan 2024 15:36:27 +0530
Subject: [PATCH] storage init container changes, fixing imports

---
 .../storage_init_container/Dockerfile         |  4 +--
 .../storage_init_container/constants.py       |  1 +
 .../storage_init_container/hugging_face.py    | 30 +++++++++++++++----
 .../storage_init_container/requirements.txt   |  4 +--
 .../kubeflow/storage_init_container/s3.py     | 20 +++++++++++--
 .../storage_init_container/storage.py         |  4 +--
 .../kubeflow/storage_init_container/types.py  | 19 ------------
 7 files changed, 47 insertions(+), 35 deletions(-)
 create mode 100644 sdk/python/kubeflow/storage_init_container/constants.py
 delete mode 100644 sdk/python/kubeflow/storage_init_container/types.py

diff --git a/sdk/python/kubeflow/storage_init_container/Dockerfile b/sdk/python/kubeflow/storage_init_container/Dockerfile
index 3c95a8ebdc..984f3ec5a7 100644
--- a/sdk/python/kubeflow/storage_init_container/Dockerfile
+++ b/sdk/python/kubeflow/storage_init_container/Dockerfile
@@ -5,7 +5,7 @@ FROM python:3.11
 WORKDIR /app
 
 # Copy the Python package and its source code into the container
-COPY . /app/storage
+COPY . /app/storage_init_container
 
 # Copy the requirements.txt file into the container
 COPY requirements.txt /app/requirements.txt
@@ -14,4 +14,4 @@ COPY requirements.txt /app/requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Run storage.py when the container launches
-ENTRYPOINT ["python", "storage/storage.py"]
+ENTRYPOINT ["python", "-m", "storage_init_container.storage"]
diff --git a/sdk/python/kubeflow/storage_init_container/constants.py b/sdk/python/kubeflow/storage_init_container/constants.py
new file mode 100644
index 0000000000..1a2af2196d
--- /dev/null
+++ b/sdk/python/kubeflow/storage_init_container/constants.py
@@ -0,0 +1 @@
+INIT_CONTAINER_MOUNT_PATH = "/workspace"
diff --git a/sdk/python/kubeflow/storage_init_container/hugging_face.py b/sdk/python/kubeflow/storage_init_container/hugging_face.py
index 8c6ef2236b..9f42c10a4b 100644
--- a/sdk/python/kubeflow/storage_init_container/hugging_face.py
+++ b/sdk/python/kubeflow/storage_init_container/hugging_face.py
@@ -1,8 +1,22 @@
-from kubeflow.storage_init_container.types import *
-from kubeflow.storage_init_container.abstract_model_provider import modelProvider
-from kubeflow.storage_init_container.abstract_dataset_provider import datasetProvider
-
-INIT_CONTAINER_MOUNT_PATH = "/workspace"
+from dataclasses import dataclass, field
+import transformers
+from peft import LoraConfig
+from urllib.parse import urlparse
+import json, os
+from typing import Union
+from .constants import INIT_CONTAINER_MOUNT_PATH
+from .abstract_model_provider import modelProvider
+from .abstract_dataset_provider import datasetProvider
+
+
+TRANSFORMER_TYPES = Union[
+    transformers.AutoModelForSequenceClassification,
+    transformers.AutoModelForTokenClassification,
+    transformers.AutoModelForQuestionAnswering,
+    transformers.AutoModelForCausalLM,
+    transformers.AutoModelForMaskedLM,
+    transformers.AutoModelForImageClassification,
+]
 
 
 @dataclass
@@ -28,7 +42,9 @@ def download_dir(self, value):
 
 @dataclass
 class HuggingFaceTrainParams:
-    training_parameters: TrainingArguments = field(default_factory=TrainingArguments)
+    training_parameters: transformers.TrainingArguments = field(
+        default_factory=transformers.TrainingArguments
+    )
     lora_config: LoraConfig = field(default_factory=LoraConfig)
 
 
@@ -82,6 +98,8 @@ def load_config(self, serialised_args):
 
     def download_dataset(self):
print("downloading dataset") + import huggingface_hub + from datasets import load_dataset if self.config.access_token: huggingface_hub.login(self.config.access_token) diff --git a/sdk/python/kubeflow/storage_init_container/requirements.txt b/sdk/python/kubeflow/storage_init_container/requirements.txt index ca4206f3e2..75d19e2919 100644 --- a/sdk/python/kubeflow/storage_init_container/requirements.txt +++ b/sdk/python/kubeflow/storage_init_container/requirements.txt @@ -5,6 +5,4 @@ transformers>=4.20.0 peft>=0.3.0 huggingface_hub==0.16.4 datasets>=2.13.2 -torch>=1.13.1 -torchvision>=0.9.1 -torchaudio>=0.8.1 + diff --git a/sdk/python/kubeflow/storage_init_container/s3.py b/sdk/python/kubeflow/storage_init_container/s3.py index 930e2ee228..2a322c70ee 100644 --- a/sdk/python/kubeflow/storage_init_container/s3.py +++ b/sdk/python/kubeflow/storage_init_container/s3.py @@ -1,5 +1,9 @@ -from .types import * -from kubeflow.storage_init_container.abstract_dataset_provider import datasetProvider +from dataclasses import dataclass, field +import json, os +import boto3 +from urllib.parse import urlparse +from .abstract_dataset_provider import datasetProvider +from .constants import INIT_CONTAINER_MOUNT_PATH @dataclass @@ -10,7 +14,9 @@ class S3DatasetParams: region_name: str = None access_key: str = None secret_key: str = None - download_dir: str = field(default="/workspace/datasets") + download_dir: str = field( + default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "datasets") + ) def is_valid_url(self, url): try: @@ -30,6 +36,14 @@ def __post_init__(self): raise ValueError("bucket_name or endpoint_url or file_key is None") self.is_valid_url(self.endpoint_url) + @property + def download_dir(self): + return self.download_dir + + @download_dir.setter + def download_dir(self, value): + raise AttributeError("Cannot modify read-only field 'download_dir'") + class S3(datasetProvider): def load_config(self, serialised_args): diff --git a/sdk/python/kubeflow/storage_init_container/storage.py b/sdk/python/kubeflow/storage_init_container/storage.py index 173b81c591..73937ad822 100644 --- a/sdk/python/kubeflow/storage_init_container/storage.py +++ b/sdk/python/kubeflow/storage_init_container/storage.py @@ -1,6 +1,6 @@ import argparse -from hugging_face import HuggingFace, HuggingFaceDataset -from s3 import S3 +from .hugging_face import HuggingFace, HuggingFaceDataset +from .s3 import S3 def model_factory(model_provider, model_provider_parameters): diff --git a/sdk/python/kubeflow/storage_init_container/types.py b/sdk/python/kubeflow/storage_init_container/types.py deleted file mode 100644 index 0a124104ac..0000000000 --- a/sdk/python/kubeflow/storage_init_container/types.py +++ /dev/null @@ -1,19 +0,0 @@ -from dataclasses import dataclass, field -from urllib.parse import urlparse -import json, os -from datasets import load_dataset -from peft import LoraConfig -import transformers -from transformers import TrainingArguments -import enum -import huggingface_hub -from typing import Union - -TRANSFORMER_TYPES = Union[ - transformers.AutoModelForSequenceClassification, - transformers.AutoModelForTokenClassification, - transformers.AutoModelForQuestionAnswering, - transformers.AutoModelForCausalLM, - transformers.AutoModelForMaskedLM, - transformers.AutoModelForImageClassification, -]