Release/4.3.1 #28

Merged 10 commits on Mar 25, 2024
2 changes: 1 addition & 1 deletion cla.md
@@ -55,4 +55,4 @@ required to provide support for your Contributions, except to the extent you des

 You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions
 into the project. The decision to use or incorporate your contributions into the project will be made at the
-sole discretion of the maintainers or their authorized delegates.
+sole discretion of the maintainers or their authorized delegates.
3 changes: 3 additions & 0 deletions demo_helpers/MANIFEST.in
@@ -0,0 +1,3 @@
include demo_helpers/datasets/README.md
include demo_helpers/pretrained_models/m5.pt
include demo_helpers/pretrained_models/pointnet.pth
26 changes: 17 additions & 9 deletions demo_helpers/demo_helpers/compute_performance.py
@@ -182,16 +182,24 @@ def pytorch_model_inference(dataset, model):
         out = model(**inputs)

         if not isinstance(out, torch.Tensor):
-            if "logits" in out:
-                out = out.logits
-            elif "start_logits" in out and "end_logits" in out:
-                out = torch.vstack((out["start_logits"], out["end_logits"]))
-            elif "last_hidden_state" in out:
-                out = out.last_hidden_state
+            if isinstance(out, tuple):
+                if len(out) == 1:
+                    out = out[0]
+                else:
+                    raise ValueError("Cannot handle tuple with len", len(out))
+            elif isinstance(out, dict):
+                if "logits" in out:
+                    out = out.logits
+                elif "start_logits" in out and "end_logits" in out:
+                    out = torch.vstack((out["start_logits"], out["end_logits"]))
+                elif "last_hidden_state" in out:
+                    out = out.last_hidden_state
+                else:
+                    raise ValueError(
+                        "Unknown output key. List of keys:", list(out.keys())
+                    )
             else:
-                raise ValueError(
-                    "Unknown output key. List of keys:", list(out.keys())
-                )
+                raise ValueError("Unknown output type", type(out))
         pred.append(out)

     return dataset.postprocess(pred)
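The dict branch works because Hugging Face `ModelOutput` containers are `OrderedDict` subclasses, so they pass the `isinstance(out, dict)` check while still allowing attribute access like `out.logits`. A minimal sketch of the same normalization as a standalone helper (the `normalize_output` name is hypothetical, not part of this PR):

```python
import torch

def normalize_output(out):
    """Collapse common model-output containers to a single tensor,
    mirroring the branching in pytorch_model_inference above."""
    if isinstance(out, torch.Tensor):
        return out
    if isinstance(out, tuple):
        if len(out) == 1:
            return out[0]
        raise ValueError("Cannot handle tuple with len", len(out))
    if isinstance(out, dict):
        if "logits" in out:
            return out["logits"]
        if "start_logits" in out and "end_logits" in out:
            return torch.vstack((out["start_logits"], out["end_logits"]))
        if "last_hidden_state" in out:
            return out["last_hidden_state"]
        raise ValueError("Unknown output key. List of keys:", list(out.keys()))
    raise ValueError("Unknown output type", type(out))

# A QA-style dict output is stacked into one (2, seq_len) tensor.
qa_out = {"start_logits": torch.zeros(1, 8), "end_logits": torch.ones(1, 8)}
print(normalize_output(qa_out).shape)  # torch.Size([2, 8])
```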
2 changes: 1 addition & 1 deletion demo_helpers/demo_helpers/dataset.py
@@ -856,7 +856,7 @@ def _download_dataset(self):

     def preprocess(self):
         return [
-            {"image_arrays": self.coco[i][0].unsqueeze(0).numpy()}
+            {"images": self.coco[i][0].unsqueeze(0).numpy()}
             for i in range(len(self.coco))
         ]

25 changes: 22 additions & 3 deletions demo_helpers/demo_helpers/model_download.py
@@ -1,19 +1,23 @@
import os
import zipfile

from datasets.utils.file_utils import cached_path
from groqflow.common.build import DEFAULT_CACHE_DIR


YOLOV6N_ONNX = "yolov6n_onnx"
YOLOV6N_MODEL = "yolov6n_model"
YOLOV6N_SOURCE = "yolov6n_source"


DATA_URLS = {
YOLOV6N_ONNX: "https://github.com/meituan/YOLOv6/releases/download/0.1.0/yolov6n.onnx",
YOLOV6N_MODEL: "https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6n.pt",
YOLOV6N_SOURCE: "https://github.com/meituan/YOLOv6/archive/refs/tags/0.4.0.zip",
}


DST_PATHS = {
YOLOV6N_ONNX: "onnx_models/yolov6n.onnx",
YOLOV6N_MODEL: "pytorch_models/yolov6_nano/yolov6n.pt",
YOLOV6N_SOURCE: "pytorch_models/yolov6_nano/YOLOv6",
}


@@ -27,3 +31,18 @@ def download_model(model):
    download_path = cached_path(url)
    os.symlink(download_path, dst_path)
    return dst_path


def download_source(source):
    dst_path = os.path.join(DEFAULT_CACHE_DIR, DST_PATHS[source])
    if os.path.exists(dst_path):
        return dst_path

    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    url = DATA_URLS[source]
    download_path = cached_path(url)
    with zipfile.ZipFile(download_path, "r") as zip_ref:
        extracted_dir = os.path.dirname(dst_path)
        zip_ref.extractall(extracted_dir)
        os.rename(
            os.path.join(extracted_dir, zip_ref.infolist()[0].filename), dst_path
        )
    return dst_path
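Together, `download_model` and the new `download_source` give call sites a stable path under the GroqFlow cache, downloading and extracting only on first use. A usage sketch, assuming `DEFAULT_CACHE_DIR` is writable:

```python
from demo_helpers.model_download import (
    YOLOV6N_MODEL,
    YOLOV6N_SOURCE,
    download_model,
    download_source,
)

# The first call downloads (and, for the source archive, unzips); repeat
# calls short-circuit on os.path.exists and return the cached path.
weights = download_model(YOLOV6N_MODEL)   # .../pytorch_models/yolov6_nano/yolov6n.pt
source = download_source(YOLOV6N_SOURCE)  # .../pytorch_models/yolov6_nano/YOLOv6
```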
37 changes: 37 additions & 0 deletions demo_helpers/demo_helpers/models.py
@@ -1,8 +1,18 @@
import os
import subprocess
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F

from demo_helpers.model_download import (
    YOLOV6N_MODEL,
    YOLOV6N_SOURCE,
    download_model,
    download_source,
)


class M5(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
@@ -130,6 +140,33 @@ def forward(self, input):
        return self.logsoftmax(output)


def get_yolov6n_model():
    weights = download_model(YOLOV6N_MODEL)
    source = download_source(YOLOV6N_SOURCE)
    export_script = os.path.join(source, "deploy/ONNX/export_onnx.py")

    cmd = [
        sys.executable,
        export_script,
        "--weights",
        weights,
        "--img",
        "640",
        "--batch",
        "1",
        "--simplify",
    ]
    p = subprocess.Popen(
        cmd, cwd=source, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    p.communicate()
    if p.returncode != 0:
        raise RuntimeError("Unable to get ONNX model")

    onnx_file = weights.replace(".pt", ".onnx")
    return onnx_file


def load_pretrained(model_name):
    """Loads a pre-trained model

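Rather than tracing the network in-process, `get_yolov6n_model` above shells out to the YOLOv6 repository's own `deploy/ONNX/export_onnx.py`. A sketch of the equivalent call using `subprocess.run`, which folds the manual `returncode` check into `check=True` (the `export_yolov6n_onnx` name is illustrative, not part of this PR):

```python
import os
import subprocess
import sys

def export_yolov6n_onnx(source: str, weights: str) -> str:
    """Run YOLOv6's export script and return the path of the ONNX file
    it writes next to the .pt weights."""
    cmd = [
        sys.executable,
        os.path.join(source, "deploy/ONNX/export_onnx.py"),
        "--weights", weights,
        "--img", "640",
        "--batch", "1",
        "--simplify",
    ]
    # check=True raises CalledProcessError (with captured output) on failure.
    subprocess.run(cmd, cwd=source, capture_output=True, check=True)
    return weights.replace(".pt", ".onnx")
```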
8 changes: 4 additions & 4 deletions demo_helpers/setup.py
@@ -10,16 +10,16 @@
     packages=find_packages(
         exclude=["*.__pycache__.*"],
     ),
+    include_package_data=True,
     install_requires=[
-        "charset-normalizer==2.1.0",
-        "torch>=1.12.0",
+        "charset-normalizer==3.3.2",
         "transformers>=4.20.0",
         "datasets>=2.3.2",
         "prettytable>=3.3.0",
         "wget>=3.2",
         "setuptools==57.2.0",
-        "torchvision>=0.11.3",
-        "torchaudio>=0.12.1",
+        "torchvision==0.16.0",
+        "torchaudio==2.1.0",
         "path>=16.4.0",
     ],
     classifiers=[],
1 change: 1 addition & 0 deletions docs/readme.md
@@ -1,4 +1,5 @@
# Documentation

The following are links to GroqFlow documentation:

- [Install Guide](install.md): Instructions on how to install GroqFlow.
22 changes: 20 additions & 2 deletions docs/known_issues.md → docs/release_notes.md
@@ -1,7 +1,25 @@
-# GroqFlow Known Issues
+# Release Notes
+
+## v4.3.1
+
+### Changes
+
+* Support for SDK 0.11.
+* Add beta support for the groq-torch-importer front end.
+* Clean up package dependencies.
+* Various bug fixes.
+
+### Known Issues
+
+* The Yolo v6 proof point downloads the PyTorch weights and invokes the export script to obtain the ONNX file.
+* Pip install of GroqFlow may complain about an incompatible protobuf version.
+
+## v4.2.1
+
+### Known Issues

 * Runtime errors due to mismatches in tensor sizes may occur even though GroqFlow checks the data shape. (G14148)
 * Whacky terminal line wrapping when printing groqit error messages. (G13235)
 * GroqFlow requires both the runtime and developer packages to be installed. (G18283, G18284)
 * The GroqFlow BERT Quantization Proof Point fails to compile in SDK 0.9.3 due to a scheduling error. (G16739)
-* The Yolo v6 Proof Point fails to run the evaluation after compilation in SDK 0.9.2.1. (G18209)
+* The Yolo v6 Proof Point fails to run the evaluation after compilation in SDK 0.9.2.1. (G18209)
28 changes: 26 additions & 2 deletions groqflow/common/build.py
@@ -25,6 +25,7 @@
"dont_use_sdk": "GROQFLOW_BAKE_SDK",
"debug": "GROQFLOW_DEBUG",
"internal": "GROQFLOW_INTERNAL_FEATURES",
"torch_importer": "GROQFLOW_USE_TORCH_IMPORTER",
}

# Allow an environment variable to override the default
@@ -63,6 +64,13 @@
# the topology argument to groqit().
TOPOLOGY = DRAGONFLY

# Allow users to use the Torch importer and bypass ONNX. Only applicable to
# Torch models; it has no effect on other model types.
if os.environ.get(environment_variables["torch_importer"]):
    USE_TORCH_IMPORTER = True
else:
    USE_TORCH_IMPORTER = False


class Backend(enum.Enum):
    AUTO = "auto"
@@ -170,6 +178,9 @@ class GroqInfo(of_build.Info):
     num_parameters: Optional[int] = None
     opt_onnx_unsupported_ops: Optional[List[str]] = None
     opt_onnx_all_ops_supported: Optional[bool] = None
+    torch_script_exported: Optional[bool] = None
+    torch_importer_success: Optional[bool] = None
+    torch_importer_command: Optional[str] = None
     compiler_success: Optional[bool] = None
     compiler_command: Optional[str] = None
     assembler_success: Optional[bool] = None

@@ -181,8 +192,8 @@ class GroqInfo(of_build.Info):
     estimated_pcie_output_latency: Optional[float] = None
     estimated_throughput: Optional[float] = None
     estimated_latency: Optional[float] = None
-    compiled_onnx_input_bytes: Optional[int] = None
-    compiled_onnx_output_bytes: Optional[int] = None
+    compiled_model_input_bytes: Optional[int] = None
+    compiled_model_output_bytes: Optional[int] = None
     compiler_ram_bytes: Optional[float] = None


@@ -233,6 +244,19 @@ def latency_file(self):
            of_build.output_dir(self.cache_dir, self.config.build_name), "latency.npy"
        )

    @property
    def torch_script_dir(self):
        return os.path.join(
            of_build.output_dir(self.cache_dir, self.config.build_name), "torchscript"
        )

    @property
    def torch_script_file(self):
        return os.path.join(
            self.torch_script_dir,
            f"{self.config.build_name}.pt",
        )

    @property
    def compile_dir(self):
        return os.path.join(
18 changes: 15 additions & 3 deletions groqflow/common/sdk_helpers.py
@@ -33,7 +33,11 @@ def get_num_chips_available(pci_devices=None):

     # Capture the list of pci devices on the system using the linux lspci utility
     if pci_devices is None:
-        pci_devices = subprocess.check_output([lspci, "-n"]).decode("utf-8").split("\n")
+        pci_devices = (
+            subprocess.check_output([lspci, "-n"], stderr=subprocess.DEVNULL)
+            .decode("utf-8")
+            .split("\n")
+        )

     # Unique registered vendor id: 1de0, and device id: "0000"
     groq_card_id = "1de0:0000"
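Redirecting stderr to `DEVNULL` keeps `lspci` warnings from leaking into the captured device list. A standalone sketch of the same probe, counting devices that match the Groq vendor:device pair shown above:

```python
import subprocess

def count_groq_devices() -> int:
    """Count Groq PCIe cards by scanning `lspci -n` output for the
    registered vendor:device id 1de0:0000."""
    try:
        pci_devices = (
            subprocess.check_output(["lspci", "-n"], stderr=subprocess.DEVNULL)
            .decode("utf-8")
            .split("\n")
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        return 0  # lspci missing or failed: report no devices
    return sum("1de0:0000" in device for device in pci_devices)
```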
@@ -74,7 +78,11 @@ def _installed_package_version(package: str, os_version: OS) -> Union[bool, str]
     # Get package info
     try:
         cmd = ["apt-cache", "policy", package]
-        package_info = subprocess.check_output(cmd).decode("utf-8").split("\n")
+        package_info = (
+            subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
+            .decode("utf-8")
+            .split("\n")
+        )
     except (FileNotFoundError, subprocess.CalledProcessError) as e:
         raise exp.Error("apt-cache policy command failed") from e

@@ -89,7 +97,11 @@ def _installed_package_version(package: str, os_version: OS) -> Union[bool, str]
     # Get package info
     cmd = ["dnf", "info", package]
     try:
-        package_info = subprocess.check_output(cmd).decode("utf-8").split("\n")
+        package_info = (
+            subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
+            .decode("utf-8")
+            .split("\n")
+        )
     except FileNotFoundError as e:
         raise exp.Error("dnf info command failed") from e
     except subprocess.CalledProcessError as e:
4 changes: 2 additions & 2 deletions groqflow/groqmodel/groqmodel.py
@@ -139,13 +139,13 @@ def estimate_performance(self) -> GroqEstimatedPerformance:

         # Calculate compute latency and estimate PCIe latency
         self.state.info.estimated_pcie_input_latency = (
-            self.state.info.compiled_onnx_input_bytes / pcie_bandwidth
+            self.state.info.compiled_model_input_bytes / pcie_bandwidth
         ) + pcie_latency
         self.state.info.deterministic_compute_latency = on_chip_compute_cycles / (
             frequency
         )
         self.state.info.estimated_pcie_output_latency = (
-            self.state.info.compiled_onnx_output_bytes / pcie_bandwidth
+            self.state.info.compiled_model_output_bytes / pcie_bandwidth
         ) + pcie_latency

         # When pipelined, the reported cycle is the duration of a single pipelining stage
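Each direction of the estimate reduces to `bytes / bandwidth + fixed_latency`. A worked example with hypothetical numbers (the real values come from the compiled model and the SDK):

```python
compiled_model_input_bytes = 4 * 1024 * 1024  # hypothetical: 4 MiB of inputs
pcie_bandwidth = 24e9                         # hypothetical: bytes per second
pcie_latency = 10e-6                          # hypothetical: seconds of fixed cost

estimated_pcie_input_latency = (
    compiled_model_input_bytes / pcie_bandwidth
) + pcie_latency
print(f"{estimated_pcie_input_latency * 1e6:.1f} us")  # ~184.8 us
```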