-
Notifications
You must be signed in to change notification settings - Fork 0
/
searchindex.js
1 lines (1 loc) · 56.4 KB
/
searchindex.js
1
Search.setIndex({"alltitles": {"Achieving Peak Throughput": [[16, "achieving-peak-throughput"]], "Add Unit Tests": [[13, "add-unit-tests"]], "Add a Runner": [[8, "add-a-runner"]], "Add the model to the test suite": [[20, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "Avoid out-of-memory by Tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[16, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[11, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Batches": [[4, "Batches"]], "Batching": [[10, "batching"]], "Benchmark": [[12, "benchmark"]], "Benchmark and Profiling": [[12, null]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[21, "cuda-error-an-illegal-memory-access-was-encountered"]], "CUDA out of memory": [[21, "cuda-out-of-memory"]], "Chat Completions": [[4, "Chat-Completions"]], "Chat Template": [[6, "Chat-Template"]], "Choices Methods in SGLang": [[9, null]], "Classify (reward model)": [[2, "Classify-(reward-model)"]], "Clean": [[0, "clean"]], "Common Notes": [[22, "common-notes"]], "Completions": [[4, "Completions"]], "Constrained Decoding": [[10, "constrained-decoding"]], "Contributor Guide": [[13, null]], "Control Flow": [[10, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[14, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Models": [[20, "embedding-models"]], "Encode (embedding model)": [[2, "Encode-(embedding-model)"]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Example: Run Llama 3.1 405B": [[1, "example-run-llama-3-1-405b"]], "Examples": [[19, "examples"]], "Flush Cache": [[2, "Flush-Cache"]], "Format Your Code": [[13, "format-your-code"]], "Frequently Asked Questions": [[15, null]], "Frontend Tutorial": [[11, null]], "Frontend: Structured Generation Language (SGLang)": [[10, null]], "Generate (text generation model)": [[2, "Generate-(text-generation-model)"]], "Generative Models": [[20, "generative-models"]], "Get Maximum Total Number of Tokens": [[2, "Get-Maximum-Total-Number-of-Tokens"]], "Get Memory Pool Size": [[2, "Get-Memory-Pool-Size"]], "Get Model Info": [[2, "Get-Model-Info"]], "Get Server Args": [[2, "Get-Server-Args"]], "Getting Started": [[11, null]], "Grafana Dashboard": [[18, "grafana-dashboard"]], "Greedy Token Selection": [[9, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[16, null]], "Health Check": [[2, "Health-Check"]], "How to Support a New Model": [[20, "how-to-support-a-new-model"]], "Install SGLang": [[22, null]], "Interactive debugging": [[20, "interactive-debugging"]], "JSON": [[4, "JSON"]], "JSON Decoding": [[10, "json-decoding"]], "JSON Format": [[14, "json-format"]], "Jinja Format": [[14, "jinja-format"]], "Language Feature": [[10, "language-feature"]], "Launch A Server": [[2, "Launch-A-Server"], [4, "Launch-A-Server"], [5, "Launch-A-Server"], [6, "Launch-A-Server"], [23, "Launch-A-Server"]], "Learn more": [[17, null]], "Make a release in GitHub": [[7, "make-a-release-in-github"]], "Method 1: With pip": [[22, "method-1-with-pip"]], "Method 2: From source": [[22, "method-2-from-source"]], "Method 3: Using docker": [[22, "method-3-using-docker"]], "Method 4: Using docker compose": [[22, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[22, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[9, "methods"]], "More Examples": [[10, "more-examples"]], "Multi modal": [[19, "multi-modal"]], "Multi-Modality": [[10, "multi-modality"]], "Multiple-Image Inputs": [[6, "Multiple-Image-Inputs"]], "Native APIs": [[2, null]], "Non-streaming Asynchronous Generation": [[3, "Non-streaming-Asynchronous-Generation"]], "Non-streaming Synchronous Generation": [[3, "Non-streaming-Synchronous-Generation"]], "Normal": [[19, "normal"]], "Offline Batch Inference": [[3, "Offline-Batch-Inference"]], "Offline Engine API": [[3, null]], "OpenAI APIs - Completions": [[4, null]], "OpenAI APIs - Embedding": [[5, null]], "OpenAI APIs - Vision": [[6, null]], "OpenAI Compatible API": [[1, "openai-compatible-api"]], "Other tips": [[12, "other-tips"]], "Parallelism": [[10, "parallelism"]], "Parameters": [[4, "Parameters"], [4, "id2"]], "Port a model from vLLM to SGLang": [[20, "port-a-model-from-vllm-to-sglang"]], "Production Metrics": [[18, null]], "Profile with Nsight": [[12, "profile-with-nsight"]], "PyPI Package Release Process": [[7, null]], "Quick Start": [[1, "quick-start"], [10, "quick-start"]], "Quick Start: Sending Requests": [[23, null]], "References": [[11, null]], "Regular expression": [[4, "Regular-expression"]], "Reward Models": [[20, "reward-models"]], "Roles": [[10, "roles"]], "SGLang Documentation": [[0, null], [11, null]], "Sampling Parameters in SGLang Runtime": [[19, null]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-Hosted Runners for GitHub Action": [[8, null]], "Setup Guide": [[18, "setup-guide"]], "Step 1: Start a docker container.": [[8, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[8, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[8, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[10, "streaming"], [19, "streaming"], [23, "Streaming"], [23, "id1"]], "Streaming Asynchronous Generation": [[3, "Streaming-Asynchronous-Generation"]], "Streaming Synchronous Generation": [[3, "Streaming-Synchronous-Generation"]], "Structured decoding (JSON, Regex)": [[4, "Structured-decoding-(JSON,-Regex)"], [19, "structured-decoding-json-regex"]], "Supported Models": [[20, null]], "Test the correctness": [[20, "test-the-correctness"]], "The results are not deterministic, even with a temperature of 0": [[15, "the-results-are-not-deterministic-even-with-a-temperature-of-0"]], "Tips and Implementation Details": [[10, "tips-and-implementation-details"]], "Token Length Normalized": [[9, "token-length-normalized"]], "Troubleshooting": [[21, null]], "Try Advanced Options": [[16, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[16, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[16, "tune-schedule-conservativeness"]], "Tune --schedule-policy": [[16, "tune-schedule-policy"]], "Tune Your Request Submission Speed": [[16, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[9, "unconditional-likelihood-normalized"]], "Update Weights": [[2, "Update-Weights"]], "Update the version in code": [[7, "update-the-version-in-code"]], "Upload the PyPI package": [[7, "upload-the-pypi-package"]], "Usage": [[4, "Usage"], [4, "id1"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Input IDs": [[5, "Using-Input-IDs"]], "Using Local Models": [[10, "using-local-models"]], "Using Native Generation APIs": [[23, "Using-Native-Generation-APIs"]], "Using OpenAI Models": [[10, "using-openai-models"]], "Using OpenAI Python Client": [[5, "Using-OpenAI-Python-Client"], [6, "Using-OpenAI-Python-Client"], [23, "Using-OpenAI-Python-Client"]], "Using Python Requests": [[5, "Using-Python-Requests"], [6, "Using-Python-Requests"], [23, "Using-Python-Requests"]], "Using cURL": [[5, "Using-cURL"], [6, "Using-cURL"], [23, "Using-cURL"]]}, "docnames": ["README", "backend/backend", "backend/native_api", "backend/offline_engine_api", "backend/openai_api_completions", "backend/openai_api_embeddings", "backend/openai_api_vision", "developer/release_process", "developer/setup_github_runner", "frontend/choices_methods", "frontend/frontend", "index", "references/benchmark_and_profiling", "references/contributor_guide", "references/custom_chat_template", "references/faq", "references/hyperparameter_tuning", "references/learn_more", "references/production_metrics", "references/sampling_params", "references/supported_models", "references/troubleshooting", "start/install", "start/send_request"], "envversion": {"nbsphinx": 4, "sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend/backend.md", "backend/native_api.ipynb", "backend/offline_engine_api.ipynb", "backend/openai_api_completions.ipynb", "backend/openai_api_embeddings.ipynb", "backend/openai_api_vision.ipynb", "developer/release_process.md", "developer/setup_github_runner.md", "frontend/choices_methods.md", "frontend/frontend.md", "index.rst", "references/benchmark_and_profiling.md", "references/contributor_guide.md", "references/custom_chat_template.md", "references/faq.md", "references/hyperparameter_tuning.md", "references/learn_more.md", "references/production_metrics.md", "references/sampling_params.md", "references/supported_models.md", "references/troubleshooting.md", "start/install.md", "start/send_request.ipynb"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 4, 5, 6, 9, 10, 16, 18, 19, 20, 22, 23], "0": [1, 2, 3, 4, 5, 6, 8, 10, 16, 18, 19, 22, 23], "00": [3, 4, 5, 6, 23], "0000": 16, "0006747245788574219": 5, "0006804466247558594": 5, "000682830810546875": 5, "001": 18, "0020961761474609375": 5, "0020999908447265625": 5, "003025054931640625": 5, "0030345916748046875": 5, "005": 18, "006198883056640625": 5, "006214141845703125": 5, "006cae7e590f": 4, "007263183594": 18, "00807952880859375": 5, "00830078125": 5, "00830841064453125": 5, "009002685546875": 5, "01": [3, 4, 5, 6, 10, 16, 18, 23], "01239013671875": 5, "01438140869140625": 5, "015": 18, "02": [3, 4, 5, 6, 18], "025": 18, "02it": 5, "03": [4, 5, 6, 18], "04": [4, 5, 6, 8, 18, 23], "04it": 23, "05": [4, 5, 18], "06": [4, 5, 18], "06e8a80d08fd": 4, "07": [4, 5], "075": 18, "0780844688416": 18, "08": [4, 18], "08it": 6, "09": 4, "09it": 4, "1": [2, 3, 4, 5, 6, 10, 12, 16, 18, 19, 20, 23], "10": [2, 3, 4, 5, 12, 18], "100": [3, 4, 5, 6, 10, 18, 23], "1000": [12, 18], "10000": 18, "100000": 18, "10081": 4, "10177": 4, "1024": [12, 22], "1025173": 5, "10347": 4, "104": 6, "10597": 4, "106": 4, "107": 6, "10715": 4, "108a41b3": 4, "10984": 4, "11": [3, 4, 5, 6, 23], "112025": 4, "11284": 4, "113": 23, "118": 4, "1182": 18, "1187": 18, "11b": 6, "12": [3, 4, 6, 8, 22], "121": 4, "122": 4, "123": 4, "12425": 4, "1266": 18, "127": [1, 4, 5, 6, 23], "1270": 18, "128": [1, 4, 19, 22], "128009": [4, 6, 23], "12895": 6, "128g": 8, "129": 4, "12910": 6, "12950": 6, "13": [4, 5, 6, 23], "130": 4, "131072": [4, 5, 6, 23], "132025": 4, "1335": 15, "134": 4, "1350": 18, "136": [4, 23], "137": [4, 23], "139394": 18, "14": [4, 5, 23], "146": 4, "147": 4, "15": [5, 6, 18, 23], "150": 4, "152025": 4, "1563": 18, "16": [1, 4, 6, 10, 23], "160": [4, 5, 6, 23], "161721": 18, "1623": 18, "16384": [4, 5, 6, 18, 23], "16425": 4, "16g": 22, "17": [5, 23], "172": 1, "172025": 4, "1729": 15, "1732317242": 4, "1732317243": 4, "1732317244": 4, "1732317245": 4, "1732317248": 4, "1732317399": 6, "1732317400": 6, "1732317453": 23, "1732317454": 23, "1748": 18, "18": [4, 5, 6, 23], "18243": 4, "1825": 4, "186": 4, "1883": 18, "18905": 4, "18it": [3, 4], "19": [4, 5], "191": 18, "192025": 4, "19786": 4, "1980": 4, "1998": 4, "1b": 2, "2": [1, 2, 3, 4, 5, 6, 10, 14, 18, 19, 20, 23], "20": [4, 6, 18], "200": [4, 5, 6, 18, 23], "2000": 18, "20000": [1, 18], "2019": 3, "202": 18, "2024": [4, 5, 6, 23], "20425": 4, "2048": [2, 16, 21], "2049": [4, 6, 23], "20527": 4, "207": 18, "207632945": 23, "2095": 18, "20th": 3, "21": [4, 5, 18], "2102": 18, "2104": 18, "212025": 4, "2124": 18, "21456": 4, "2147000": 4, "2190": 4, "22": [3, 4, 5, 6, 18, 23], "22423": 4, "2254": 4, "23": [3, 4, 5, 6, 23], "233": 16, "23377": 4, "24": 4, "243": [3, 4, 5, 6, 23], "24425": 4, "24438": 4, "24885": 18, "24h": 4, "25": [3, 4, 5, 10, 18, 23], "256": [4, 5, 6, 10, 12, 23], "25762": 4, "27": [4, 5, 6], "27b": 20, "27it": 23, "28": [4, 5, 6, 23], "28425": 4, "29": [5, 6, 23], "298440": 6, "29846": 18, "29b9": 4, "2f34740108fc": 4, "3": [2, 3, 4, 5, 6, 10, 12, 14, 16, 18, 19, 20, 23], "30": [4, 5, 18, 23], "300": [4, 5, 6, 23], "3000": 18, "30000": [1, 4, 5, 6, 10, 14, 18, 19, 20, 22, 23], "30010": 2, "30020": 2, "30030": 2, "30060": 18, "3072": 2, "31": [4, 6], "311": 18, "3150733": 4, "3151742": 5, "3152749": 6, "3154524": 23, "317": 16, "32": [1, 4, 5, 6, 12, 19, 22, 23], "32025": 4, "32425": 4, "3279": 4, "32g": 22, "33": [4, 5, 23], "33230": 6, "33240": 6, "33254": 6, "334": 4, "34": [4, 5, 6, 23], "35": [3, 4, 6, 23], "36": [3, 4], "36425": 4, "37": [4, 23], "370": 4, "370959": 16, "38": 4, "3871": 4, "39": [3, 4, 5, 6, 23], "39184": 18, "39408": 23, "39420": 23, "39432": 23, "39440": 23, "39454": 23, "39458": 23, "3b": 2, "3e9f": 4, "4": [1, 3, 4, 5, 6, 10, 18, 20, 23], "40": [4, 5, 6, 23], "4005": 5, "40425": 4, "40948": 4, "4096": [1, 4, 5, 6, 16, 21, 23], "4097": 18, "41": [4, 5, 6, 23], "419f": 4, "42": [4, 6, 23], "421": 18, "421951215": 5, "422029": 18, "422424": 18, "422425": 18, "424529": 18, "424549": 18, "425": 4, "4267ab99d950": 4, "42944": 4, "42it": 4, "43": [4, 5, 6, 23], "43590": 23, "43596": 23, "43598": 23, "44": 4, "44018": 5, "44026": 5, "44030": 5, "4425": 4, "442913": [4, 23], "44425": 4, "448": 18, "44d2": 4, "45": 4, "45404": 4, "45410": 4, "45426": 4, "456": 18, "4594": 16, "45bf": 4, "46": [4, 23], "47": 4, "470668074": 4, "48": 4, "48425": 4, "48ca": 4, "49": [6, 18], "4aa20d31f2c44f49968cc077a504736": 23, "4c13": 4, "4ea4": 4, "4f09": 4, "5": [1, 3, 4, 5, 6, 10, 18, 19, 20, 23], "50": [3, 4, 16, 18, 23], "500": [4, 16, 18], "5000": 18, "50000": [1, 18], "500552": 18, "506780": 18, "51": 4, "511": 18, "512": 12, "51802": 6, "51804": 6, "52": 1, "52025": 4, "52144": 4, "52425": 4, "53": 23, "5393948555": 18, "54": [4, 18], "55": [3, 4, 23], "56": [4, 6], "56068": 4, "56076": 4, "56086": 4, "56280": 6, "56292": 6, "563": 18, "56730": 5, "56740": 5, "56750": 5, "57": [4, 5, 23], "58": [4, 6], "58464": 5, "59": [4, 5, 6, 23], "598": 18, "5b": 20, "5de2": 4, "6": [3, 4, 5, 6, 8, 20, 22, 23], "60": [6, 12, 23], "60701a3a4efc42478a27440c07796d2a": 23, "61": [4, 18], "61387": 18, "62": 4, "62cca6dec16e4c45b1655348ca50aaab": 6, "62f": 4, "62it": 3, "63": [4, 23], "63it": 3, "64": [1, 4, 5, 12, 19, 23], "6452": 6, "6462": 6, "6463": 6, "6469": 6, "6481": 6, "6490": 6, "6491": 6, "6531": 6, "656": 18, "66": 6, "67": 4, "675": 4, "67it": 6, "687409856": 6, "69it": 23, "6fd9eb40": 4, "7": [1, 4, 5, 6, 18, 23], "70": 12, "70517": 18, "707": 18, "70it": [6, 23], "71": 5, "72025": 4, "723a": 4, "72b": [6, 20], "73": [4, 6, 23], "733": 18, "74": 4, "75": [3, 4, 18, 23], "75it": 6, "76": 4, "77": 4, "77172a66f936": 4, "78": [4, 5, 6, 23], "78835": 18, "7b": [1, 2, 5, 6, 14, 19, 20], "7bb1750ba72244f3b80b1e7497fdb9fc": 4, "7fa2af80": 12, "8": [1, 3, 4, 5, 6, 19, 20, 22, 23], "80": [4, 6], "8000": 0, "81": 4, "810": 4, "812": 4, "814": 18, "8192": [4, 5, 6, 18, 23], "81it": 4, "82": 16, "825": 18, "83": 4, "84": 6, "8425": 4, "85it": 3, "86": [4, 5, 23], "87": 4, "88": [4, 5, 6, 23], "88it": 6, "89": 4, "8955": 4, "8b": [1, 2, 3, 4, 10, 12, 19, 20, 22, 23], "9": [1, 4, 10, 16], "90": 4, "900": 18, "90it": 6, "91": 5, "9152": 4, "916c": 4, "92": 4, "92025": 4, "92it": 23, "93": 23, "9336": 4, "95": [1, 3, 4, 15], "9507": 4, "95e4": 4, "9602": 18, "96it": 3, "97": 6, "9711": 4, "979": 18, "99": 6, "9930": 4, "997": 18, "99934530258": 18, "9998": 16, "9b": 18, "9d65": 4, "9e7e": 4, "A": [3, 10, 12, 16, 22], "As": [3, 6], "At": 3, "But": 3, "By": [14, 19], "For": [1, 3, 4, 9, 20, 23], "If": [1, 14, 16, 19, 21, 22], "In": [1, 3, 4, 5, 6, 10, 15, 23], "It": [1, 2, 4, 9, 10, 11, 14, 16, 19, 22], "NOT": 14, "On": 16, "One": 3, "Or": 1, "THE": 4, "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 18, 19, 20, 22, 23], "Their": 4, "Then": [8, 10, 18], "There": 14, "To": [0, 1, 3, 4, 5, 6, 10, 12, 15, 16, 18, 20, 22, 23], "With": 3, "__init__": 7, "__main__": 1, "__name__": 1, "_build": 0, "a01501defee640fd962b1836ed5b03af": 4, "a10": 22, "a100": 22, "a546": 4, "a78d": 4, "abil": 3, "abl": 20, "about": [1, 3, 4, 10, 14, 15, 16, 17], "abov": [9, 12, 19, 21, 22], "acceler": [1, 16, 22], "accept": [4, 19], "access": [0, 1, 3, 18, 22], "accord": [10, 12, 22], "account": [3, 15], "accumul": 15, "achiev": [4, 15], "across": [9, 15], "activ": 11, "ad": 22, "adam": 3, "add": [1, 3, 4, 5, 6, 10, 12, 15, 16, 19, 22], "addit": [3, 9, 10], "addition": 3, "addr": 1, "address": [1, 10, 15], "adjust": 6, "administ": 3, "adopt": 11, "ador": 3, "adv": 12, "advanc": [3, 4, 11], "after": 23, "again": 4, "against": 9, "ai": [1, 3, 4, 22], "alia": 22, "alibaba": [2, 5, 20], "aliv": 10, "all": [0, 1, 3, 6, 8, 9, 10, 13, 16, 20, 22], "all_other_model": 20, "allow": [12, 22], "almost": [1, 16, 20], "alon": 3, "also": [1, 2, 3, 4, 5, 6, 10, 14, 15, 16, 19, 21, 23], "altern": [9, 10], "alwai": 16, "am": 3, "amd": [8, 22], "ami": 3, "an": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 18, 19, 22, 23], "analysi": 4, "ancient": 4, "ani": [1, 4, 10, 19, 22], "annot": 12, "anoth": [6, 20], "answer": [9, 10], "answer_1": 10, "answer_2": 10, "anthrop": 10, "antidisestablishmentarian": 9, "anxieti": 3, "anyon": 23, "anyth": 3, "apart": 2, "api": [9, 10, 11, 14, 19, 20, 22], "api_kei": [1, 4, 5, 6, 23], "appear": 19, "append": 4, "appli": 4, "applic": [1, 2, 4, 5, 6, 11, 23], "apply_chat_templ": 2, "approach": [4, 22], "apt": [8, 12], "aqueduct": 4, "ar": [1, 2, 3, 4, 5, 6, 9, 10, 12, 14, 16, 18, 19, 20, 22, 23], "arc": 3, "arch": 8, "architectur": [2, 3, 4, 12, 23], "area": 3, "arg": [9, 22], "argument": [2, 10, 12, 19], "aris": 15, "around": 3, "art": [3, 4, 23], "articl": 3, "artifici": [3, 4], "artwork": 3, "ask": 11, "assembli": 4, "assert": [2, 4], "asset": 6, "assist": [1, 2, 3, 4, 6, 9, 10, 14, 19, 23], "assistant_begin": 10, "assistant_end": 10, "assum": 18, "async": 3, "async_gener": 3, "asyncio": 3, "atmospher": 4, "attain": 16, "attent": [11, 20, 22], "attention_backend": [4, 5, 6, 23], "attract": [3, 4, 9, 10], "audio": [4, 23], "auror": 10, "aurora": 4, "australia": [4, 23], "author": [3, 4], "auto": [3, 4, 5, 6, 23], "autom": 3, "automat": [2, 4, 6, 19], "autoregress": 10, "autosc": 22, "autotoken": [2, 5], "avail": [1, 2, 4, 5, 6, 18, 22, 23], "averag": 9, "avoid": [4, 21, 22], "await": 3, "awq": 11, "b": [2, 22], "b0e3f1816f44": 4, "b0e7": 4, "b28f3e781d7d40388ce7e076cde96cd8": 23, "bab03c9cc0f94278a0f077978ac8d59f": 23, "back": [2, 3, 6, 11], "backend": [9, 12, 22], "backend_input_fil": 4, "backend_result_fil": 4, "background": 6, "backyard": 3, "bad": 9, "bai": 3, "baichuan2": 20, "balanc": [4, 6, 10], "base": [2, 9, 19], "base64": 19, "base_gpu_id": [4, 5, 6, 23], "base_url": [1, 4, 5, 6, 23], "bash": [7, 8], "batch": [1, 5, 6, 11, 12, 15, 16, 19, 22, 23], "batch_89c5c37b": 4, "batch_bda3b481": 4, "batch_detail": 4, "batch_eea73405": 4, "batch_id": 4, "batch_job": 4, "batch_request": 4, "batch_respons": 4, "batchrequestcount": 4, "bbb0cb0c": 4, "beauti": 3, "becam": 4, "becaus": [6, 10, 16], "becom": 3, "been": [3, 4], "befor": [3, 6, 12, 19], "begin": [4, 5, 6, 10, 23], "beij": 4, "being": [3, 16], "below": [3, 8, 10, 19, 22], "bench_offline_throughput": 12, "bench_one_batch": [12, 20, 22], "bench_serv": 12, "benchmark": 11, "benefit": 3, "berlin": 9, "bespok": 9, "best": 3, "better": [1, 4, 16, 20, 22], "between": [1, 2, 19], "bfloat16": [4, 6, 23], "bia": [3, 10], "bias": 3, "big": 3, "bin": 8, "bit": 3, "black": [4, 5, 6, 23], "blackwood": 4, "blob": [6, 19], "block": [4, 6, 10, 23], "blog": 17, "blogpost": 9, "blood": 10, "blue": [4, 5, 6, 23], "board": 6, "bodi": [4, 10], "bogart": 10, "book": 3, "bool": 19, "born": 10, "both": [3, 16, 21], "bottleneck": 16, "bottom": 6, "bound": 2, "bra": 4, "brain": 3, "branch": [3, 22], "bras\u00edlia": [4, 23], "brazil": [4, 23], "break": [19, 23], "breathabl": 4, "breathtak": 3, "browser": 0, "bucket": 3, "bug": 4, "bui": 3, "build": [1, 3, 7, 22], "built": [3, 4, 22], "busi": [3, 6], "c": [4, 5, 6, 22, 23], "c51865f6a5f343d5becaa6a831cec6a2": 6, "c86f3bee8a16": 4, "cab": 6, "cach": [1, 4, 5, 6, 8, 11, 12, 15, 16, 18, 21, 22, 23], "cache_hit_r": 18, "cached_token": [18, 23], "calcul": 10, "call": [9, 10, 11], "campaign": 4, "can": [1, 2, 3, 4, 6, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "canberra": [4, 23], "cancel": 4, "cancelled_job": 4, "candid": 4, "cannot": 19, "cap": 22, "capit": [1, 2, 3, 4, 9, 10, 19, 23], "captain": 4, "captiv": 3, "captur": [4, 6, 23], "car": [3, 6], "case": [3, 4, 16], "cater": 3, "cathedr": 3, "caus": 15, "cd": [7, 13, 18, 22], "center": [3, 4], "central": 3, "centuri": [3, 4], "ceremoni": 3, "chain": 11, "challeng": 3, "chang": [8, 15, 20], "charact": 10, "character_gen": 10, "character_regex": 10, "chat": [1, 10, 11, 19, 20, 23], "chat_exampl": 10, "chat_templ": [4, 5, 6, 14, 23], "chatcomplet": [4, 23], "chatcompletionmessag": [4, 23], "chatglm": 20, "chatml": [6, 14, 19, 20], "check": [1, 4, 22], "check_output": [5, 6, 23], "checkpoint": [1, 3, 4, 5, 6, 12, 23], "chief": 3, "children": 3, "china": 4, "choic": [4, 6, 10, 11, 23], "choices_method": 9, "chunk": [1, 3, 4, 6, 11, 19, 21, 23], "chunked_prefill_s": [4, 5, 6, 23], "ci": 13, "cicero": 4, "citi": [3, 6, 23], "citizen": 3, "civil": 4, "clariti": [4, 5, 6, 23], "class": [3, 19], "classifi": 23, "clean": 4, "cli": 12, "click": [3, 18], "client": [1, 4, 12], "climat": 3, "clone": [0, 22], "cloth": 6, "cluster": 22, "co": [11, 14], "code": [4, 5, 6, 10, 12, 15, 20, 23], "collabor": 3, "collect": 3, "colleg": 3, "color": [4, 5, 6, 12, 23], "colosseum": 4, "com": [6, 7, 8, 12, 19, 22], "combin": [4, 5, 6, 23], "come": [3, 16], "command": [1, 3, 5, 8, 12, 13, 20, 22], "commit": 13, "common": [3, 21], "commun": 11, "compar": 20, "comparison": [9, 20], "compat": [2, 4, 5, 6, 10, 14, 19, 23], "compil": [1, 16], "complet": [1, 2, 3, 5, 6, 10, 11, 23], "completion_token": [4, 6, 23], "completion_tokens_detail": [4, 23], "completion_tokens_wo_jump_forward": 23, "completion_window": 4, "completionchoic": 4, "completionusag": [4, 23], "complex": [3, 10], "compos": 18, "compuls": 3, "comput": [1, 4, 6, 10, 12, 15, 16], "concept": 4, "concern": 3, "concis": 4, "conda": 22, "confid": [3, 9], "config": [1, 12], "configur": 2, "connect": [10, 22], "conquest": 4, "consequ": 3, "consid": [3, 19], "constrain": [1, 4, 11, 16, 19], "constrained_json_whitespace_pattern": [4, 5, 6, 23], "constraint": [4, 10, 19], "contact": 3, "contain": 9, "content": [1, 2, 4, 6, 10, 23], "context": 18, "context_len": [4, 5, 6, 18, 23], "context_length": [4, 5, 6, 23], "continu": [2, 3, 10, 11], "contribut": 14, "contributor": 11, "control": 11, "conv": 2, "convers": 14, "convert": 20, "cool": 3, "copi": 22, "core": [10, 11], "corner": 6, "correct": [3, 12, 19], "cosmo": 4, "cost": 4, "could": 19, "count": 4, "counter": 18, "countri": [1, 3, 4, 23], "cours": 3, "cover": [4, 5, 6], "coverag": 20, "cpu": [1, 16], "creat": [1, 3, 4, 5, 6, 20, 23], "created_at": 4, "creativ": 4, "crew": 4, "critic": 12, "crop": 3, "crucial": 3, "ctrl": [4, 5, 6, 23], "cu121": 22, "cubla": 15, "cuda": [1, 4, 5, 6, 8, 12, 15, 22, 23], "cuda_graph_max_b": [4, 5, 6, 23], "cuda_visible_devic": 8, "cuisin": [3, 23], "cultur": [3, 23], "curl": [1, 2, 8, 19], "curl_command": [6, 23], "curl_id": 5, "curl_text": 5, "current": [1, 2, 3, 4, 15, 16, 18], "custom": [1, 3, 4, 11], "custom_id": 4, "custom_serv": 3, "d": [0, 1, 4, 5, 6, 10, 12, 22, 23], "d165": 4, "d375": 4, "daili": 3, "dame": 3, "dark": 4, "data": [1, 2, 4, 5, 6, 16, 19, 22, 23], "dataclass": 19, "dataset": 12, "date": 3, "dbrx": 20, "de": 3, "deactiv": 22, "deadlock": 1, "death": 10, "deb": 12, "deceas": 10, "decis": 3, "decod": [1, 6, 11, 16, 21, 23], "decode_log_interv": [4, 5, 6, 23], "decode_unicod": [19, 23], "decor": 10, "decreas": [16, 21], "deepseek": [11, 20], "def": [1, 3, 9, 10], "default": [1, 4, 6, 9, 14, 16, 19, 22], "defin": [10, 14], "del_respons": 4, "delai": 12, "delet": 4, "delete_ckpt_after_load": [4, 5, 6, 23], "delta": [4, 23], "delv": 3, "demonstr": 3, "depend": 22, "depict": 6, "deploi": [3, 22], "deploy": 22, "depress": 3, "derek": 3, "describ": [6, 9, 19], "descript": [12, 19], "design": [3, 11], "destin": [3, 23], "detail": [2, 3, 4], "detailed_tip": 10, "detect": 3, "determin": 9, "detoken": 19, "dev": [1, 8, 22], "devel": 8, "develop": [3, 4, 12], "devic": [1, 4, 5, 6, 8, 22, 23], "devtool": 12, "diagnosi": 3, "dict": 19, "die": 3, "diet": 10, "differ": [2, 3, 6, 15, 20], "difficult": [3, 19], "dimens": 2, "direct": 3, "directli": 1, "directori": 20, "disabl": [1, 4, 5, 12, 15, 19], "disable_cuda_graph": [4, 5, 6, 23], "disable_cuda_graph_pad": [4, 5, 6, 23], "disable_custom_all_reduc": [4, 5, 6, 23], "disable_disk_cach": [4, 5, 6, 23], "disable_jump_forward": [4, 5, 6, 23], "disable_mla": [4, 5, 6, 23], "disable_overlap_schedul": [4, 5, 6, 23], "disable_radix_cach": [4, 5, 6, 23], "discuss": 3, "diseas": 3, "dispatch": 15, "displac": 3, "displai": [4, 5, 6, 23], "dist_init_addr": [4, 5, 6, 23], "distant": 4, "distrib_releas": 12, "distribut": [4, 5, 6, 23], "divers": [3, 4], "diversityartifici": 3, "divorc": 3, "dn": 10, "do": [3, 4, 6, 8, 12, 16, 19], "doc": [9, 12, 14, 19, 22], "doc_site_path": 0, "docker": [1, 18], "dockerfil": 22, "dockerx": 22, "document": [3, 14, 22], "doe": [1, 12, 16], "donald": 9, "done": [8, 19, 23], "down": [3, 6, 9], "download": [12, 19], "download_dir": [4, 5, 6, 23], "dp": [1, 4, 5, 6, 23], "dp_size": [4, 5, 6, 23], "dpkg": 12, "dri": [8, 22], "drive": [3, 6], "drun": 22, "dry": 3, "ds_channel_config_path": [4, 5, 6, 23], "ds_heavy_channel_num": [4, 5, 6, 23], "ds_heavy_channel_typ": [4, 5, 6, 23], "ds_heavy_token_num": [4, 5, 6, 23], "ds_sparse_decode_threshold": [4, 5, 6, 23], "dtype": [1, 4, 5, 6, 23], "duck": 9, "due": [9, 16, 21], "dummi": 12, "dump": [4, 5, 19], "durat": 12, "dure": [1, 2, 4, 16, 19, 21], "dynam": [12, 15], "e": [4, 8, 12, 20, 22], "e2e_request_latency_second": 18, "e2e_request_latency_seconds_bucket": 18, "e2e_request_latency_seconds_count": 18, "e2e_request_latency_seconds_sum": 18, "e4b8": 4, "e5": [5, 11], "e8d55f30": 4, "each": 1, "earli": 16, "earlier": 9, "easi": [11, 20, 21], "easier": 10, "easili": 3, "eater": 10, "echo": [8, 12], "ecosystem": 3, "edit": 8, "ef7bcb464fc94973a3587103e66e8d29": 4, "effici": [1, 3, 11], "eiffel": 3, "either": 19, "elect": 3, "elector": 3, "element": 4, "eleutherai": 9, "elev": 3, "elif": 10, "els": 4, "embed": [1, 4, 11, 23], "embedding_process": [2, 5, 6], "emerg": 3, "emma": 3, "empir": 4, "empti": 1, "en": 14, "enabl": [1, 4, 5, 6, 10, 15, 16, 22], "enable_cache_report": [4, 5, 6, 23], "enable_double_spars": [4, 5, 6, 23], "enable_dp_attent": [4, 5, 6, 23], "enable_metr": [4, 5, 6, 23], "enable_mixed_chunk": [4, 5, 6, 23], "enable_nan_detect": [4, 5, 6, 23], "enable_p2p_check": [4, 5, 6, 23], "enable_torch_compil": [4, 5, 6, 23], "encod": [5, 19, 23], "encount": 22, "encourag": [4, 19], "end": [3, 4, 5, 6, 10, 18, 19, 20, 23], "endless": [3, 4], "endpoint": [1, 4, 19, 22, 23], "engin": [4, 10, 11, 12, 15], "england": 19, "enjoi": 3, "enough": [1, 16], "ensur": 3, "entryclass": 20, "enumer": 10, "env": [1, 22], "environ": [1, 5, 6, 8], "eo": [16, 19], "episod": 3, "equip": [3, 4], "equiti": 3, "equival": [4, 5, 6, 15, 23], "error": [1, 2, 4, 16], "especi": [3, 16], "essenti": [3, 4], "establish": 4, "etc": [11, 12], "eth0": 1, "ethic": 3, "europ": 3, "evalu": [2, 3], "even": [3, 9], "event": 3, "everi": 3, "everyon": 3, "everyth": 3, "evid": 3, "examin": 3, "exampl": [2, 3, 4, 5, 8, 9, 18, 20, 22], "example_imag": [6, 19], "exaon": 20, "except": 4, "excit": [3, 4], "exec": 12, "execut": [3, 4, 6, 22, 23], "execute_shell_command": [2, 4, 5, 6, 23], "exercis": 10, "exist": [4, 20], "expand": [4, 10], "expans": 4, "experi": 3, "experienc": 3, "experiment": [1, 16], "expert": 3, "explain": 3, "explor": [3, 4], "export": [0, 1, 8, 10], "expos": [2, 18], "express": [10, 19], "extend": 9, "extens": [11, 20], "exterior": 3, "extern": [10, 11], "extra_bodi": 4, "ey": 4, "f": [1, 2, 3, 4, 5, 10, 18, 22], "face": [1, 4, 6, 14], "fact": 3, "factor": 15, "fail": [2, 4, 9], "failur": 22, "fair": 3, "fals": [2, 4, 5, 6, 19, 23], "famili": 3, "famou": 3, "fan": 3, "fantasi": 3, "far": [3, 19], "farm": 3, "fashion": [3, 23], "fast": 11, "faster": 11, "father": 3, "favor": 16, "favorit": 3, "fcf": 16, "featur": [1, 4, 11], "feder": 3, "feel": 3, "fetch": 12, "fi": 3, "fiction": 3, "field": 3, "figur": 3, "file": [0, 4, 12, 13, 14, 18, 19, 20, 21], "file_respons": 4, "file_storage_pth": [4, 5, 6, 23], "fill": 10, "fillmor": 9, "final": [4, 15], "financ": 3, "find": [3, 10, 17, 20], "finish_reason": [4, 6, 23], "fire": [4, 5, 6, 23], "first": [1, 2, 5, 6, 10, 12, 16, 18], "fix": 21, "flashinf": [4, 5, 6, 11, 22, 23], "flexibl": [6, 11, 23], "float": 19, "float16": 5, "flow": 11, "fluenci": 4, "flush": [3, 10, 19, 23], "flush_cach": 2, "flynn": 3, "focus": [3, 4], "folder": [8, 12, 13], "follow": [1, 2, 3, 4, 5, 8, 10, 12, 16, 18, 19, 20, 21], "foo": 4, "forev": 8, "fork": [10, 12], "format": [3, 4, 5, 6, 10, 12, 19, 23], "forward": [11, 20], "forward_batch": 20, "found": [2, 3, 10], "foundat": 4, "four": 3, "fp16": 1, "fp8": [1, 11, 16, 22], "fp8_e5m2": 1, "fraction": [1, 6, 21], "framework": 11, "franc": [1, 2, 3, 4, 9, 10, 19, 23], "free": 3, "frequenc": 19, "frequency_penalti": [4, 19], "frequent": [11, 16], "friend": 3, "from": [2, 3, 4, 5, 6, 10, 13, 14, 15, 23], "from_pretrain": [2, 5], "frontend": [14, 22], "full": [1, 3, 16], "fulli": 4, "function": [9, 10, 20], "function_cal": [4, 23], "further": 22, "futur": [1, 2, 3, 20], "g": [6, 8, 12, 20, 22], "galleri": 3, "garden": 3, "gaug": 18, "gaze": 4, "gb": [4, 5, 6, 23], "gem": 3, "gemini": 10, "gemma": [11, 18, 20], "gemma2forsequenceclassif": 20, "gen": [4, 6, 9, 10, 16, 18, 23], "gen_throughput": 18, "gener": [0, 1, 4, 6, 11, 18, 19], "generatereqinput": 19, "generation_tokens_tot": 18, "georg": 3, "get": [4, 5, 6, 20, 22, 23], "get_max_total_num_token": 2, "get_memory_pool_s": 2, "get_model_info": [2, 4, 5, 6, 23], "get_server_arg": 2, "gift": 3, "git": [8, 22], "github": [0, 6, 19, 21, 22], "githubusercont": 6, "give": [4, 8, 20], "given": [4, 19], "glm": 20, "gloo_socket_ifnam": 1, "gnupg": 12, "go": 3, "golf": 3, "good": 16, "googl": [10, 18], "govern": [3, 4], "gpt": 10, "gptq": [4, 5, 6, 11, 23], "gpu": [1, 8, 16, 18, 22], "grammar_backend": [4, 5, 6, 23], "graph": [1, 4, 6, 12, 23], "gravit": 4, "greedy_token_select": 9, "groceri": 3, "grok": 20, "group": [8, 22], "grow": 3, "gryffindor": 10, "gte": [2, 5, 11, 20], "guarante": [4, 19], "gui": 3, "guid": [11, 19, 22, 23], "guidanc": 3, "h": 1, "h100": 22, "ha": [3, 16, 20], "habit": 3, "had": 4, "half": 10, "hand": 16, "handl": [2, 12, 19, 23], "happen": [3, 16, 21], "hard": 3, "harri": 10, "harvest": 3, "hasattr": 4, "have": [0, 1, 3, 6, 9, 15, 16, 18], "head": 3, "health_gener": 2, "healthi": [10, 16], "hear": 3, "hello": [1, 3], "helm": 4, "help": [1, 3, 4, 10, 16, 18, 19, 20, 21], "here": [1, 2, 3, 4, 10, 18, 19, 23], "hf": 14, "hf_home": 8, "hf_token": [8, 22], "hf_xxx": 8, "hi": 4, "hidden": 3, "high": [3, 4, 9, 16, 19], "higher": 4, "highest": [9, 10], "highlight": [4, 5, 6, 23], "histogram": 18, "histori": [3, 23], "historian": 4, "hit": [4, 5, 6, 18, 19, 23], "hold": 6, "home": [3, 4, 22], "homogen": 3, "hood": 15, "host": [1, 2, 4, 5, 6, 22, 23], "hostnam": 1, "hous": 10, "how": [1, 3, 9, 10, 13], "howev": 3, "html": [0, 12], "http": [0, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 18, 19, 22, 23], "hub": [3, 22], "hufflepuff": 10, "hug": [1, 4, 6, 14], "huge": 3, "huggingfac": [8, 14, 20, 22], "human": [3, 4], "humor": 6, "husband": 3, "hyperparamet": [1, 11], "i": [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23], "icon": [3, 6], "id": [4, 6, 19, 23], "idea": 6, "ident": 20, "ignor": [4, 19], "ignore_eo": 19, "im_end": [14, 19], "im_start": [14, 19], "imag": [10, 19, 22], "image_data": 19, "image_fil": 10, "image_id": 22, "image_qa": 10, "image_url": 6, "imagin": 3, "implement": [4, 9, 15, 20, 23], "implic": 19, "import": [1, 2, 3, 4, 5, 6, 10, 12, 16, 18, 19, 23], "impress": 4, "improv": [4, 5, 6, 23], "includ": [3, 4, 10, 11], "inclus": 3, "incorrect": 9, "increas": 16, "increasingli": 3, "incur": 9, "independ": 22, "indetermin": 15, "index": [4, 6, 12, 23], "indic": 16, "industri": [3, 11], "inf": [18, 19], "infam": 3, "infer": [1, 19], "influenti": 3, "info": [3, 4, 5, 6, 23], "inform": [2, 3, 4, 10, 19], "infra": 22, "init": [1, 4, 5, 6, 23], "initi": [4, 9, 15], "innov": 3, "input": [1, 4, 10, 11, 12, 19, 22], "input_file_id": 4, "input_file_path": 4, "input_id": [5, 19], "input_ids_embed": 5, "insid": 8, "instal": [0, 4, 5, 6, 7, 8, 11, 12, 13, 23], "installationguid": 12, "instanc": 9, "instead": 21, "instinct": 22, "instruct": [1, 2, 3, 4, 5, 6, 10, 12, 19, 20, 22, 23], "int": 19, "int4": 11, "int4wo": 1, "integ": [4, 19], "integr": 11, "intellig": [3, 4], "intend": 6, "interact": 11, "interest": 3, "interfac": [11, 20], "interior": 3, "interleav": 6, "intern": 3, "internlm": 20, "internlm2": 20, "internlm2forrewardmodel": 20, "interpret": 4, "intersect": 3, "intertwin": 3, "intfloat": 5, "introduc": [2, 15], "intuit": 11, "investig": 15, "invok": 10, "io": 0, "ip": [1, 10], "ipc": [1, 22], "iron": [3, 6], "is_embed": [4, 5, 6, 23], "is_gener": 2, "isgener": 3, "issu": [10, 15, 21, 22], "itali": 4, "iter_lin": [19, 23], "its": [2, 3, 4, 9], "j": 3, "jacket": 6, "januari": 3, "japan": [4, 10, 23], "jaxon": 4, "job": [3, 4], "join": 3, "joke": 4, "json": [1, 2, 5, 6, 12, 18, 23], "json_decod": 10, "json_model_override_arg": [4, 5, 6, 23], "json_output": 10, "json_schema": [4, 19], "jsonl": 4, "jump": 11, "just": [3, 4, 8, 14], "k": 19, "k8": 22, "keep": [3, 4], "kei": [2, 3, 10, 12], "kepler": 4, "kernel": [11, 15, 21, 22], "kfd": [8, 22], "kid": 3, "kingdom": 10, "knowledg": 4, "known": 3, "kv": [1, 16, 21, 22], "kv_cache_dtyp": [4, 5, 6, 23], "l": [6, 19], "l4": 22, "l40": 22, "la": 3, "lab": [6, 19, 20], "label": 8, "lack": 3, "landmark": 3, "lang": [6, 19], "languag": [4, 6, 11, 14, 22, 23], "larg": [6, 11, 16], "largest": 3, "last": [4, 8, 22], "late": 4, "latenc": [12, 18], "later": [8, 9], "latest": [1, 4, 22], "latin": 4, "lattic": 3, "launch": [1, 3, 10, 12, 14, 19, 22], "launch_serv": [1, 2, 4, 5, 6, 10, 12, 14, 19, 20, 22, 23], "law": 4, "layer": [15, 20], "layer_id": 20, "le": 18, "lead": [3, 15], "learn": [1, 3, 4, 11, 13, 20], "least": 19, "leav": 3, "left": 6, "len": [12, 19, 23], "length": [4, 10, 18, 19, 23], "less": 4, "let": 1, "level": [3, 4, 19], "librari": 10, "life": 3, "light": [3, 4], "like": [3, 4, 16], "limit": [3, 9], "line": 4, "link": 3, "lint": 13, "linux": 8, "lisa": 3, "list": [1, 3, 4, 6, 10, 12, 19, 20, 21, 23], "lit": 4, "literatur": 4, "live": 3, "ll": 3, "llama": [2, 3, 4, 6, 10, 11, 12, 14, 19, 20, 22, 23], "llama3": [6, 20], "llama_3_vis": 6, "llamaembeddingmodel": 20, "llamaforcausallm": [4, 23], "llamaforsequenceclassif": 20, "llava": [6, 11, 19, 20], "llava_llama_3": [6, 20], "llm": [1, 3, 9, 11], "lm_eval": [4, 5, 6, 23], "lmm": [6, 19, 20], "lmsysorg": [1, 8, 22], "load": [1, 3, 4, 5, 6, 12, 14, 16, 19, 23], "load_balance_method": [4, 5, 6, 23], "load_format": [4, 5, 6, 23], "load_imag": 19, "local": [4, 5, 6, 22], "local_example_llava_next": 10, "localhost": [0, 1, 2, 4, 5, 6, 10, 18, 19, 23], "locat": [3, 19], "log": [4, 5, 6, 10, 16, 23], "log_level": [4, 5, 6, 23], "log_level_http": [4, 5, 6, 23], "log_request": [4, 5, 6, 23], "logit": [10, 19, 20], "logitsprocessor": 20, "logo": 6, "logprob": [4, 6, 9, 19, 23], "logprob_start_len": 19, "london": [4, 9], "long": [1, 4], "longer": [4, 9], "longest": 16, "look": [3, 14, 16], "loop": 10, "lora_path": [4, 5, 6, 23], "louvr": 3, "love": [3, 23], "lover": 3, "low": 19, "lower": [4, 16], "lpm": [4, 5, 6, 16, 23], "lsb": 12, "lt": [3, 4, 5, 6, 23], "luxuri": 3, "m": [0, 1, 2, 3, 4, 5, 6, 10, 12, 14, 19, 20, 22, 23], "machin": [3, 22], "made": 3, "magic": 10, "mai": [4, 5, 6, 10, 12, 15, 21, 23], "main": [1, 3, 6, 14, 19], "mainli": 2, "maintain": [3, 20], "major": [3, 4, 20], "make": [0, 3, 4, 11, 16, 20], "maker": 3, "male": 3, "man": 6, "manag": 10, "mani": [3, 9, 15, 16, 20], "manipul": 4, "manner": 19, "mark": 3, "marri": 3, "martin": 3, "mask": 10, "master": 3, "match": [2, 16], "matched_stop": [4, 6, 23], "materi": 17, "math": 10, "mathemat": 15, "max": [3, 21], "max_check": 4, "max_loras_per_batch": [4, 5, 6, 23], "max_new_token": [1, 16, 19, 23], "max_prefill_token": [4, 5, 6, 18, 23], "max_running_request": [4, 5, 6, 18, 23], "max_token": [1, 4, 6, 10, 23], "max_total_num_token": [4, 5, 6, 18, 23], "max_total_token": [4, 5, 6, 23], "maximum": [18, 19], "md": 13, "me": 4, "mean": [16, 19], "meant": 6, "medic": 3, "meet": 1, "mem": [1, 4, 5, 6, 21, 23], "mem_fraction_stat": [4, 5, 6, 23], "member": 3, "memori": [1, 4, 5, 6, 12, 23], "mention": 6, "messag": [1, 2, 4, 6, 10, 23], "meta": [1, 2, 3, 4, 6, 10, 12, 14, 19, 22, 23], "meta_info": 23, "method": [4, 11], "metropolitan": 3, "mi": 22, "michael": 3, "mild": [3, 4], "militari": [3, 4], "millard": 9, "million": 3, "min": 19, "min_new_token": 19, "min_p": 19, "minicpm": 20, "minim": 3, "ministri": 10, "minut": [4, 6, 23], "mislead": 9, "miss": 14, "mistral": [5, 11, 20], "mitig": 3, "mix": 19, "mixtral": 20, "mllamaforconditionalgener": 6, "modal": [1, 11], "mode": [3, 4, 15], "model": [3, 4, 5, 6, 8, 9, 11, 12, 14, 16, 18, 19, 22, 23], "model_path": [1, 2, 3, 4, 5, 6, 23], "moder": 4, "modern": 4, "moe": 20, "mona": 3, "monitor": [3, 4, 18], "mood": 3, "more": [1, 3, 4, 11, 15, 19, 22, 23], "most": [3, 14, 16, 20], "mostli": 15, "mount": 8, "much": 3, "muggl": 10, "multi": [1, 11], "multi_turn_quest": 10, "multimod": 6, "multipl": [1, 3, 4], "multitask": 6, "museum": 3, "must": [2, 3, 19, 23], "my": [1, 3], "my_model": 14, "my_model_templ": 14, "n": [2, 3, 4, 10, 19, 23], "n1": [4, 23], "n2": [4, 23], "n3": [4, 23], "na": 4, "name": [1, 2, 3, 4, 8, 9, 10, 12, 14, 18, 19], "namespac": 18, "nativ": 11, "natur": 4, "navig": 3, "nbecaus": 4, "nccl": 1, "ndescrib": 19, "nearli": 4, "need": [3, 4, 6, 8, 10, 12, 14, 20, 22], "nemo": 20, "nest": 10, "network": [2, 22], "neural": 2, "new": [2, 3, 4, 5, 6, 7, 8, 11, 16, 18, 19, 23], "new_seq": 18, "new_token": 18, "new_token_ratio": 16, "next": [6, 20], "ngener": [1, 3], "nin": 4, "nlist": 4, "nlp": [2, 5, 20], "nnode": [1, 4, 5, 6, 23], "node": [1, 2, 12], "node_rank": [4, 5, 6, 23], "non": [2, 10], "nondeterminist": 15, "none": [3, 4, 5, 6, 19, 23], "normal": 10, "north": 3, "note": [2, 4, 5, 6, 8, 12, 14, 19, 20, 23], "notebook": [4, 5, 6, 23], "notic": 15, "notr": 3, "novel": [3, 4], "now": [2, 3, 10, 22], "nprompt": 3, "npython": 4, "nsy": 12, "null": [6, 22], "num": 12, "num_continuous_decode_step": [4, 5, 6, 23], "num_requests_run": 18, "num_requests_wait": 18, "number": [3, 16, 18, 19], "numer": [2, 15], "nutrient": 4, "nvidia": [8, 12], "nvtx": 12, "nyou": 19, "o": [4, 5, 8, 12, 19], "oath": 3, "object": [4, 6, 19, 23], "obtain": 9, "occasion": 16, "occup": 10, "ocean": 3, "off": 6, "offer": 11, "offic": 3, "offici": 14, "offlin": [1, 11, 12], "often": 3, "ok": [4, 5, 6, 23], "okai": 16, "old": 3, "olmo": 20, "omit": 9, "onc": [1, 2, 5, 6, 9, 23], "one": [2, 4, 6, 9, 10, 15, 19], "onevis": [6, 19, 20], "ongo": 3, "onli": [2, 3, 4, 6, 9, 10, 12, 15, 19, 20, 22], "onlin": 12, "only_run": 20, "oom": [16, 21], "open": [3, 4, 11, 22], "openai": [2, 9, 11, 14, 19, 20, 22], "openai_api_kei": [8, 10], "oper": 22, "opportun": 3, "opt": 22, "option": [9, 19], "orang": 6, "order": 10, "origin": [2, 4, 5, 6, 23], "orsai": 3, "other": [3, 4, 9, 16, 20, 22], "otherwis": 6, "our": [3, 15], "out": [1, 3, 4, 10, 12, 22], "outcom": 3, "outlin": [4, 5, 6, 23], "output": [1, 2, 3, 4, 5, 6, 12, 15, 18, 19, 20, 22, 23], "output_file_id": 4, "ov": [6, 19, 20], "over": 3, "overhead": [3, 16], "overlap": [1, 4, 5, 9, 16], "overrid": 14, "overwhelm": 3, "ovid": 4, "own": [1, 22], "owner": 3, "p": [1, 3, 18, 19, 22], "p2p": 1, "pad": 15, "page": [11, 21], "paint": 3, "pairwis": 2, "pantheon": 4, "paragraph": 10, "parallel": [1, 11, 16, 19], "paramet": [2, 11, 16, 21, 23], "pari": [3, 4, 9, 19, 23], "part": [3, 20], "pass": [10, 13, 20], "passion": 3, "past": 6, "path": [0, 1, 2, 4, 5, 6, 9, 10, 12, 14, 19, 20, 22, 23], "patronu": 10, "pattern": [4, 19], "peer": 1, "penal": 19, "penalti": [4, 19], "peopl": 3, "per": [15, 18], "perfect": 4, "perform": [1, 9, 19], "perhap": 6, "period": 4, "perpetu": 3, "person": 3, "phi": 20, "phoenix": 10, "phrase": 4, "pioneer": 3, "pip": [0, 7, 8, 12], "pip3": 13, "place": 3, "plai": 3, "plain": 3, "plan": 22, "planet": 4, "plant": 3, "playground": 20, "pleas": [1, 3, 4, 6, 10, 12, 21, 22], "png": [6, 19], "polit": 3, "pool": [1, 4, 5, 6, 16, 21, 23], "poorli": 9, "popul": [3, 4, 19], "popular": [3, 4, 6], "port": [1, 2, 4, 5, 6, 10, 14, 19, 22, 23], "portion": 15, "post": [2, 4, 5, 6, 19, 23], "potenti": 15, "potter": 10, "pre": 13, "precis": 3, "predict": 9, "prefer": 4, "prefil": [1, 4, 5, 6, 11, 12, 18, 20, 21, 23], "prefix": [11, 15, 16], "prerequisit": 12, "presence_penalti": [4, 19], "present": 3, "presid": [1, 3, 9], "press": [4, 5, 6, 23], "pretti": 3, "prev": [19, 23], "pride": 3, "primit": [9, 10], "principl": 3, "print": [1, 3, 4, 10, 12, 19, 23], "print_highlight": [2, 3, 4, 5, 6, 23], "priorit": 3, "probabl": 10, "process": [3, 4, 5, 6, 12, 18, 23], "product": 3, "profil": 11, "program": [4, 11, 22], "programm": 4, "progress": 15, "progress_bar": 10, "project": [0, 6, 7, 8, 14, 15, 17, 19, 22], "prometheu": 18, "promot": 3, "prompt": [1, 2, 3, 4, 10, 11, 12, 19], "prompt_token": [4, 6, 23], "prompt_tokens_detail": [4, 6, 23], "prompt_tokens_tot": 18, "proper": 22, "properti": [4, 19], "prosper": 3, "provid": [1, 2, 3, 4, 5, 6, 10, 11, 12, 22, 23], "pub": 12, "public": 3, "pull": 8, "pure": 10, "purpos": 4, "puzzl": 3, "py": [0, 3, 4, 5, 6, 7, 10, 12, 14, 19, 20, 23], "pydant": 10, "pyproject": 7, "python": [1, 2, 3, 4, 7, 10, 12, 14, 19, 20, 22], "python3": [0, 1, 2, 6, 8, 12, 19, 20, 22], "pytorch": [15, 22], "q": 10, "qk": [4, 5, 6, 23], "qualiti": [2, 3], "quantiz": [1, 4, 5, 6, 11, 22, 23], "queri": 20, "question": [10, 11], "question_1": 10, "question_2": 10, "queu": 18, "queue": [4, 5, 6, 16, 23], "queue_req": 18, "quick": [3, 11, 12], "quick_start": 10, "quit": [4, 5, 6, 23], "qwen": [1, 6, 11, 20], "qwen2": [1, 2, 5, 6, 19, 20], "qwen2forcausallm": 5, "r": [0, 3, 10, 20], "rachel": 3, "radix": [2, 12, 15], "radixattent": [11, 20], "rais": [2, 4], "ranch": 3, "random": [12, 15], "random_se": [4, 5, 6, 23], "rang": [2, 4, 11, 16], "rank": 1, "rapid": 3, "rate": [4, 5, 6, 18, 23], "ravenclaw": 10, "raw": [6, 19], "rb": 4, "re": 3, "reach": [3, 19], "read": 4, "readi": [4, 5, 6, 23], "readm": 13, "readme_exampl": 10, "real": 12, "realli": 3, "reason": 4, "recogn": 3, "recommend": [3, 12, 22], "recoveri": 22, "reduc": [1, 3, 4, 16, 21], "refer": [1, 4, 5, 6, 20, 21, 23], "reference_hf": 20, "refresh": 3, "refus": [4, 23], "regex": 10, "registr": 3, "regular": [10, 19], "regular_expression_gen": 10, "relat": [6, 14, 22], "releas": [12, 22], "remain": 15, "rememb": [3, 5, 6], "remot": [20, 22], "remov": [0, 4, 20], "renderd176": 8, "renderd184": 8, "repeat": 19, "repetit": 4, "repetition_penalti": [4, 19], "replac": [1, 20, 22], "repo": 12, "report": [1, 21], "reproduc": 4, "req": [4, 5, 6, 16, 23], "request": [1, 2, 4, 10, 11, 15, 18, 19, 21], "request_count": 4, "request_generation_token": 18, "request_generation_tokens_bucket": 18, "request_generation_tokens_count": 18, "request_generation_tokens_sum": 18, "request_id": 4, "request_prompt_token": 18, "request_prompt_tokens_bucket": 18, "request_prompt_tokens_count": 18, "request_prompt_tokens_sum": 18, "requir": [0, 3, 4, 19], "reserv": 3, "resid": 3, "resourc": [20, 22], "respons": [1, 2, 3, 4, 5, 6, 9, 19, 23], "response1": 2, "response2": 2, "response_format": 4, "response_json": 2, "rest": 3, "restart": [2, 8], "restaur": 3, "result": [4, 9], "result_cont": 4, "result_file_id": 4, "retracted_req": 16, "retriev": 4, "return": 19, "return_logprob": 19, "return_text_in_logprob": 19, "reus": 20, "reward": [11, 23], "reward_process": 2, "rich": [3, 4], "rid": 19, "right": 3, "risk": 3, "rm": [8, 22], "rmsnorm": 20, "robot": 3, "rocm": 22, "rocm620": [8, 22], "role": [1, 2, 3, 4, 6, 23], "roll": [2, 4, 5, 6, 23], "roman": 4, "romanc": [3, 23], "rome": 4, "root": [1, 15, 22], "roughli": 15, "round_robin": [4, 5, 6, 23], "run": [0, 3, 4, 5, 6, 10, 12, 13, 15, 18, 20, 21, 23], "run_batch": 10, "runner_allow_runasroot": 8, "runtim": [2, 11, 22], "runtimeendpoint": [9, 10], "safetensor": [3, 4, 5, 6, 23], "sai": 4, "same": [2, 10, 12, 15, 19, 20], "sampl": [2, 11, 20, 22, 23], "sampling_backend": [4, 5, 6, 23], "sampling_param": [1, 3, 19, 23], "scale": 22, "scene": 6, "schedul": [1, 3, 4, 5], "schedule_conserv": [4, 5, 6, 23], "schedule_polici": [4, 5, 6, 23], "schema": [4, 10, 19], "sci": 3, "scienc": 3, "scientif": 4, "script": [3, 12, 20], "search": [4, 10], "seccomp": 22, "second": [3, 4, 6, 18], "secret": 22, "section": [19, 21], "secur": 22, "see": [1, 3, 10, 16, 19, 20, 21, 22, 23], "seed": [4, 15], "select": [10, 18, 22], "self": [3, 4, 5, 6], "senat": 4, "send": [1, 6, 11, 15, 16, 19], "sentenc": [6, 19], "sep": 14, "sep_styl": 14, "separ": [4, 5, 6, 23], "seq": [4, 5, 6, 23], "sequenc": [4, 18], "seri": 4, "serv": [1, 3, 11, 12, 16, 22], "served_model_nam": [4, 5, 6, 23], "server": [0, 3, 10, 12, 14, 15, 16, 18, 19, 20], "server_arg": [4, 5, 6, 23], "server_process": [2, 4, 23], "serverarg": [4, 5, 6, 23], "servic": [3, 4, 5, 6, 22], "service_ti": [4, 23], "session_id": 23, "set": [1, 4, 10, 12, 14, 15, 19, 22], "set_default_backend": 10, "settlement": 4, "sever": [4, 6, 12, 23], "sgl": [0, 1, 3, 6, 7, 8, 9, 10, 15, 17, 19, 22], "sgl_branch": 22, "sglang": [2, 3, 4, 5, 6, 7, 8, 12, 13, 15, 17, 18, 23], "sglang_is_in_ci": 8, "sglang_storag": [4, 5, 6, 23], "sglang_use_modelscop": 1, "sh": 7, "shard": [3, 4, 5, 6, 23], "share": [8, 16], "shell": [5, 6, 23], "ship": 4, "shirt": 6, "shm": [8, 22], "shop": 3, "short": [4, 15, 19], "shorter": 9, "should": [3, 14, 20], "show": [6, 10], "show_time_cost": [4, 5, 6, 23], "showcas": 4, "shutdown": 3, "sight": 3, "sigmoid": 2, "siluandmul": 20, "similar": [2, 4, 19, 20], "similarli": 15, "simpl": 10, "simpli": 9, "simplifi": 12, "simultan": 3, "sinc": 4, "singl": [1, 4, 12, 19, 20, 22], "singleton": 2, "size": [1, 6, 8, 12, 15, 20, 21, 22], "sk": [8, 10], "skill": 4, "skip": 19, "skip_special_token": 19, "skip_tokenizer_init": [4, 5, 6, 23], "sky": 22, "skyserv": 22, "skywork": [2, 11, 20], "sleep": [3, 4, 8], "slide": 17, "slight": 15, "slightli": [4, 15], "slower": [4, 15], "slytherin": 10, "sm75": 22, "small": [1, 3, 6, 15, 16, 20], "smaller": 1, "smollm": 20, "smooth": [4, 5, 6], "snippet": [6, 12], "so": [3, 4, 5, 6, 12, 19, 23], "societi": 3, "soil": 4, "solut": 15, "solv": 3, "some": [3, 8, 10, 12, 20, 21], "someth": 3, "sometim": 21, "sound": 3, "sourc": [11, 12], "space": [4, 19], "spaces_between_special_token": 19, "spaceship": 4, "speak": 15, "special": [3, 19], "specif": [1, 4, 20, 22], "specifi": [1, 4, 6, 8, 9, 10, 14, 19], "speed": 4, "spend": 3, "split": 4, "sport": 3, "spot": 3, "srt": [11, 19, 20, 22], "stabl": 4, "stablelm": 20, "stai": 10, "stair": 3, "stand": [6, 16], "star": 4, "start": [4, 5, 6, 12, 18, 19, 20], "startswith": [19, 23], "startup": [4, 5, 6, 23], "state": [1, 3, 4, 10], "static": [1, 6, 12, 21], "statu": [4, 10, 22], "status_cod": 4, "still": 15, "stood": 4, "stop": [4, 6, 10, 16, 19, 23], "stop_str": 14, "stop_token_id": 19, "stori": 4, "str": 19, "strategi": 1, "stream": [1, 4], "stream_interv": [4, 5, 6, 23], "street": 6, "stress": 3, "string": [4, 16, 19], "strip": [4, 19, 23], "strong": [4, 9], "structur": 11, "struggl": 3, "student": 10, "stun": 3, "subprocess": [5, 6, 23], "subset": 9, "succeed": 2, "success": 2, "successfulli": 4, "suggest": 16, "summari": 10, "summer": 3, "suppli": 9, "support": [1, 2, 3, 4, 5, 6, 9, 10, 11, 19, 22], "sure": [0, 20], "surpris": 3, "surround": 3, "swing": 3, "switch": 22, "sworn": 3, "symbol": 3, "sys_ptrac": 22, "system": [1, 3, 4, 10, 12, 14, 19, 22], "system_fingerprint": [4, 23], "t": 22, "t4": 22, "tabl": 4, "take": [3, 4, 6, 16, 23], "talk": 3, "taxi": 6, "teacher": 10, "teaser": 3, "tech": 3, "technolog": 4, "technologi": [3, 4], "tee": 12, "tell": 4, "temper": 3, "temperatur": [1, 3, 4, 6, 10, 19, 23], "templat": [1, 4, 10, 11, 19, 20], "tensor": [1, 2, 11], "term": [3, 15], "termin": [4, 5, 6, 22, 23], "terminate_process": [2, 4, 5, 6, 23], "territori": 4, "test": [2, 3, 4, 6, 8, 12, 19, 23], "test_generation_model": 20, "test_oth": 20, "test_vision_openai_serv": 20, "testgenerationmodel": 20, "text": [1, 3, 4, 5, 6, 19, 20, 23], "text_complet": 4, "text_embed": 5, "text_it": 10, "text_qa": 10, "thei": [3, 4, 6, 19], "them": [21, 22], "therefor": 2, "thi": [0, 1, 2, 3, 4, 5, 6, 9, 10, 12, 14, 15, 16, 19, 20, 21, 22, 23], "thing": [3, 16], "think": 3, "three": 4, "through": [3, 4, 10], "throughput": [1, 4, 6, 18, 23], "till": 22, "time": [1, 2, 3, 4, 5, 12, 15, 18, 19], "time_per_output_token_second": 18, "time_per_output_token_seconds_bucket": 18, "time_per_output_token_seconds_count": 18, "time_per_output_token_seconds_sum": 18, "time_to_first_token_second": 18, "time_to_first_token_seconds_bucket": 18, "time_to_first_token_seconds_count": 18, "time_to_first_token_seconds_sum": 18, "tip": 21, "tip_suggest": 10, "tmp": 8, "todai": 1, "togeth": [1, 4, 5, 6, 16, 23], "token": [1, 4, 5, 6, 10, 11, 14, 16, 18, 19, 22, 23], "token_id": 19, "token_length_norm": 9, "token_usag": 18, "tokenizer_mod": [4, 5, 6, 23], "tokenizer_path": [4, 5, 6, 23], "tokenizers_parallel": 5, "tokyo": [4, 23], "tolkien": 3, "toml": 7, "too": 16, "tool": 10, "tool_cal": [4, 23], "tool_us": 10, "top": [3, 19], "top_k": 19, "top_logprobs_num": 19, "top_p": [1, 3, 4, 19], "topic": 4, "torch": [1, 4, 5, 6, 16, 23], "torch2": 22, "torch_compile_max_b": [4, 5, 6, 23], "torchao": 1, "torchao_config": [4, 5, 6, 23], "total": [1, 4, 18], "total_token": [4, 6, 23], "tour": 3, "tourist": 3, "tower": 3, "tp": [1, 20, 22], "tp0": [4, 5, 6, 23], "tp_size": [4, 5, 6, 23], "tr": 12, "trace": 12, "track": 15, "train": [2, 12], "transform": [2, 3, 5, 14, 20], "transit": [4, 5, 6], "transpar": 3, "travel": [3, 4], "treat": 2, "tri": 3, "trigger": 2, "triomph": 3, "triton": 22, "triton_attention_reduce_in_fp32": [4, 5, 6, 23], "troubleshoot": 11, "true": [1, 2, 3, 4, 5, 6, 8, 10, 12, 19, 23], "truncat": 12, "trunk": 6, "trust": [3, 20], "trust_remote_cod": [4, 5, 6, 23], "try": [1, 3, 4, 21], "tune": [1, 11, 21], "turbo": 10, "turn": [6, 10], "tutori": [4, 5, 6], "twelv": 4, "twice": 15, "twine": 7, "two": [1, 3, 4, 6, 10, 14, 15, 20], "txt": 0, "type": [1, 2, 4, 5, 6, 18, 19, 23], "typic": [4, 5, 6, 23], "u": [3, 9], "ubiquit": 3, "ubuntu": 12, "ubuntu1804": 12, "ubuntu22": 8, "unconditional_likelihood_norm": 9, "unconfin": 22, "under": [12, 13, 15, 20], "understand": 20, "unexpect": 6, "union": 19, "unit": [1, 3, 4, 10], "unittest": 20, "unnecessari": 3, "until": 19, "up": [4, 5, 6, 18, 22, 23], "updat": [0, 8, 12], "update_weight": 2, "upgrad": 22, "upload": [4, 18], "upload_pypi": 7, "uploaded_fil": 4, "upon": [1, 2, 5], "urban": 6, "url": [2, 4, 6, 19, 23], "us": [2, 3, 4, 8, 9, 12, 13, 14, 15, 16, 18, 19], "us_president_exampl": 9, "usabl": [4, 5, 6, 23], "usag": [1, 5, 6, 9, 16, 18, 21, 22, 23], "user": [1, 2, 3, 4, 6, 9, 10, 14, 16, 19, 23], "usual": 4, "utf": [4, 19, 23], "util": [2, 3, 4, 5, 6, 16, 19, 23], "uvicorn": [4, 5, 6, 23], "v": [1, 8, 22], "v0": [2, 8, 20, 22], "v1": [1, 2, 4, 5, 6, 23], "valid": 4, "valu": [1, 3, 16, 19], "valuabl": 20, "variabl": [1, 8], "varianc": 15, "variant": 12, "variou": [1, 4], "vast": 4, "ve": 3, "vehicl": 3, "veri": [4, 6, 16, 19, 20], "verifi": 4, "version": 22, "vertexai": 10, "via": 18, "vibrant": 3, "vicuna_v1": 6, "video": [8, 17, 22], "view": [1, 3], "virgil": 4, "virtual": 3, "visibl": 3, "vision": [1, 4, 11, 20, 23], "visit": [0, 3, 23], "visitor": 3, "vl": [6, 20], "vl2": 6, "vote": 3, "w": [4, 10, 19], "wa": 4, "wai": [3, 20], "wait": [4, 5, 6, 18, 23], "wait_for_serv": [2, 4, 5, 6, 23], "waiting_request_latency_second": 18, "waiting_request_latency_seconds_bucket": 18, "waiting_request_latency_seconds_count": 18, "waiting_request_latency_seconds_sum": 18, "wand": 10, "want": [1, 3, 19], "warn": 16, "wash": 3, "washington": 4, "watch": 3, "watchdog_timeout": [4, 5, 6, 23], "we": [1, 2, 3, 4, 5, 6, 15, 23], "wear": 6, "web": 4, "weight": [1, 3, 4, 5, 6, 8, 12, 23], "weight_util": [3, 4, 5, 6, 23], "welcom": 14, "welfar": 3, "well": 20, "were": 4, "what": [2, 4, 6, 9, 10, 23], "when": [2, 3, 4, 10, 14, 15, 16, 19], "where": [3, 9], "whether": [2, 3, 19], "which": [3, 4, 6, 14, 15, 16, 19, 21, 23], "while": [3, 4, 5, 6, 8, 12, 15, 19, 22, 23], "white": 6, "whl": 22, "who": [3, 4, 23], "why": 4, "wide": [4, 11], "wife": 3, "winner": 3, "winter": 3, "within": 10, "without": [2, 3, 4, 12, 22], "wood": 10, "word": [4, 10], "work": [1, 3, 8, 14, 16], "workflow": 10, "workload": [1, 16], "world": 3, "write": [0, 3, 4], "x64": 8, "x86_64": 12, "xvers": 20, "xxx": 8, "y": [8, 12], "yaml": [18, 22], "year": [3, 4], "yellow": 6, "yi": 20, "yml": 22, "york": 3, "you": [0, 1, 2, 3, 4, 6, 8, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "your": [0, 1, 3, 4, 6, 10, 11, 22, 23], "zip": [1, 3]}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Native APIs", "Offline Engine API", "OpenAI APIs - Completions", "OpenAI APIs - Embedding", "OpenAI APIs - Vision", "PyPI Package Release Process", "Set Up Self-Hosted Runners for GitHub Action", "Choices Methods in SGLang", "Frontend: Structured Generation Language (SGLang)", "SGLang Documentation", "Benchmark and Profiling", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Frequently Asked Questions", "Guide on Hyperparameter Tuning", "Learn more", "Production Metrics", "Sampling Parameters in SGLang Runtime", "Supported Models", "Troubleshooting", "Install SGLang", "Quick Start: Sending Requests"], "titleterms": {"0": 15, "1": [1, 8, 22], "2": [8, 22], "3": [1, 8, 22], "4": 22, "405b": 1, "5": 22, "A": [2, 4, 5, 6, 23], "The": 15, "With": 22, "access": 21, "achiev": 16, "action": 8, "add": [8, 13, 20], "addit": 1, "advanc": 16, "an": 21, "api": [1, 2, 3, 4, 5, 6, 23], "ar": 15, "arg": 2, "argument": 1, "ask": 15, "asynchron": 3, "avoid": 16, "backend": [1, 11], "batch": [3, 4, 10], "benchmark": 12, "build": 0, "cach": 2, "chat": [4, 6, 14], "check": 2, "choic": 9, "chunk": 16, "classifi": 2, "clean": 0, "client": [5, 6, 23], "cloud": 22, "code": [7, 13], "common": 22, "compat": 1, "complet": 4, "compos": 22, "config": 8, "configur": 8, "conserv": 16, "constrain": 10, "contain": 8, "contributor": 13, "control": 10, "correct": 20, "cuda": 21, "curl": [5, 6, 23], "custom": 14, "dashboard": 18, "debug": 20, "decod": [4, 10, 19], "depend": 0, "deploi": 0, "detail": 10, "determinist": 15, "docker": [8, 22], "document": [0, 11], "dp": 16, "embed": [2, 5, 20], "encod": 2, "encount": 21, "engin": [1, 3], "error": 21, "even": 15, "exampl": [1, 10, 19], "express": 4, "featur": 10, "flow": 10, "flush": 2, "format": [13, 14], "fraction": 16, "frequent": 15, "from": [1, 20, 22], "frontend": [10, 11], "gener": [2, 3, 10, 20, 23], "get": [2, 11], "github": [7, 8], "grafana": 18, "greedi": 9, "guid": [13, 16, 18], "health": 2, "host": 8, "how": 20, "http": 1, "hyperparamet": 16, "id": 5, "illeg": 21, "imag": 6, "implement": 10, "infer": 3, "info": 2, "input": [5, 6], "instal": 22, "interact": 20, "jinja": 14, "json": [4, 10, 14, 19], "kubernet": 22, "languag": 10, "launch": [2, 4, 5, 6, 23], "learn": 17, "length": 9, "likelihood": 9, "llama": 1, "local": 10, "make": 7, "max": 16, "maximum": 2, "mem": 16, "memori": [2, 16, 21], "method": [9, 22], "metric": 18, "modal": [10, 19], "model": [1, 2, 10, 20], "modelscop": 1, "more": [10, 17], "multi": [10, 19], "multipl": 6, "nativ": [2, 23], "new": 20, "non": 3, "normal": [9, 19], "note": 22, "nsight": 12, "number": 2, "offlin": 3, "openai": [1, 4, 5, 6, 10, 23], "option": 16, "other": 12, "out": [16, 21], "packag": 7, "parallel": 10, "paramet": [4, 19], "peak": 16, "pip": 22, "polici": 16, "pool": 2, "port": 20, "prefil": 16, "preview": 0, "process": 7, "product": 18, "profil": 12, "pypi": 7, "python": [5, 6, 23], "question": 15, "quick": [1, 10, 23], "refer": 11, "regex": [4, 19], "regular": 4, "releas": 7, "request": [5, 6, 16, 23], "result": 15, "reward": [2, 20], "role": 10, "run": [1, 8, 16, 22], "runner": 8, "runtim": [1, 14, 19], "sampl": 19, "schedul": 16, "select": 9, "self": 8, "send": 23, "serv": 0, "server": [1, 2, 4, 5, 6, 23], "set": 8, "setup": 18, "sglang": [0, 1, 9, 10, 11, 14, 19, 20, 22], "sh": 8, "size": [2, 16], "skypilot": 22, "sourc": 22, "speed": 16, "srt": 1, "start": [1, 8, 10, 11, 23], "static": 16, "step": 8, "stream": [3, 10, 19, 23], "structur": [4, 10, 19], "submiss": 16, "suit": 20, "support": 20, "synchron": 3, "temperatur": 15, "templat": [6, 14], "test": [13, 20], "text": 2, "throughput": 16, "tip": [10, 12], "token": [2, 9], "total": 2, "tp": 16, "troubleshoot": 21, "try": 16, "tune": 16, "tutori": 11, "uncondit": 9, "unit": 13, "up": 8, "updat": [2, 7], "upload": 7, "us": [1, 5, 6, 10, 22, 23], "usag": 4, "version": 7, "vision": 6, "vllm": 20, "wa": 21, "websit": 0, "weight": 2, "without": 1, "your": [13, 16]}})