#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example: run a GPTQ-quantized model with the neural_speed Python API.

Usage:
    python python_api_example_for_gptq.py model_path
"""
import sys

from transformers import AutoTokenizer, TextStreamer

from neural_speed import Model

# Require exactly one CLI argument (the model path / HF model name).
# Bug fix: the original printed the usage message but did not exit, so a
# missing argument fell through and crashed with IndexError on sys.argv[1].
# The usage string also named the wrong script file.
if len(sys.argv) != 2:
    print("Usage: python python_api_example_for_gptq.py model_path")
    sys.exit(1)
model_name = sys.argv[1]

prompt = "Once upon a time, a little girl"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
# TextStreamer prints tokens to stdout as they are generated.
streamer = TextStreamer(tokenizer)

model = Model()
# If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True.
model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)