> ## Documentation Index
> Fetch the complete documentation index at: https://lmsysorg.mintlify.app/llms.txt
> Use this file to discover all available pages before exploring further.

# DeepSeek-V3.2

export const DeepSeekV32Deployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h200',
        label: 'H200',
        default: true
      }, {
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }]
    },
    modelname: {
      name: 'modelname',
      title: 'Model Name',
      items: [{
        id: 'v32',
        label: 'DeepSeek-V3.2',
        default: true
      }, {
        id: 'v32speciale',
        label: 'DeepSeek-V3.2-Speciale',
        default: false
      }, {
        id: 'v32exp',
        label: 'DeepSeek-V3.2-Exp',
        default: false
      }]
    },
    strategy: {
      name: 'strategy',
      title: 'Deployment Strategy',
      type: 'checkbox',
      items: [{
        id: 'tp',
        label: 'TP',
        default: true,
        required: true
      }, {
        id: 'dp',
        label: 'DP attention',
        default: false
      }, {
        id: 'ep',
        label: 'EP',
        default: false
      }, {
        id: 'mtp',
        label: 'Multi-token Prediction',
        default: false
      }]
    },
    reasoningParser: {
      name: 'reasoningParser',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const defaultItem = option.items.find(item => item.default);
        initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      } else {
        return {
          ...prev,
          [optionName]: currentValues.filter(id => id !== itemId)
        };
      }
    });
  };
  const generateCommand = () => {
    const {hardware, modelname, strategy, reasoningParser, toolcall} = values;
    const strategyArray = Array.isArray(strategy) ? strategy : [];
    if (modelname === 'v32speciale' && toolcall === 'enabled') {
      return `# Error: DeepSeek-V3.2-Speciale doesn't support tool calling\n# Please select "Disabled" for Tool Call Parser or choose a different model`;
    }
    const modelMap = {
      'v32': 'DeepSeek-V3.2',
      'v32exp': 'DeepSeek-V3.2-Exp',
      'v32speciale': 'DeepSeek-V3.2-Speciale'
    };
    const modelName = `deepseek-ai/${modelMap[modelname]}`;
    let cmd = 'python3 -m sglang.launch_server \\\n';
    cmd += `  --model-path ${modelName}`;
    if (hardware === 'mi355x') {
      cmd += ` \\\n  --trust-remote-code \\\n  --nsa-prefill-backend tilelang \\\n  --nsa-decode-backend tilelang \\\n  --cuda-graph-max-bs 64`;
    }
    cmd += ` \\\n  --tp 8`;
    if (strategyArray.includes('dp')) {
      cmd += ` \\\n  --dp 8 \\\n  --enable-dp-attention`;
    }
    if (strategyArray.includes('ep')) {
      cmd += ` \\\n  --ep 8`;
    }
    if (strategyArray.includes('mtp')) {
      cmd += ` \\\n  --speculative-algorithm EAGLE \\\n  --speculative-num-steps 3 \\\n  --speculative-eagle-topk 1 \\\n  --speculative-num-draft-tokens 4`;
    }
    if (toolcall === 'enabled' && modelname !== 'v32speciale') {
      if (modelname === 'v32exp') {
        cmd += ` \\\n  --tool-call-parser deepseekv31`;
      } else if (modelname === 'v32') {
        cmd += ` \\\n  --tool-call-parser deepseekv32`;
      }
    }
    if (reasoningParser === 'enabled') {
      cmd += ` \\\n  --reasoning-parser deepseek-v3`;
    }
    if (toolcall === 'enabled' && modelname === 'v32exp') {
      cmd += ` \\\n  --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja`;
    }
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const isDisabled = item.required;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={e => handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} onChange={() => handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

The DeepSeek-V3.2 series includes three model variants, each optimized for different use cases:

**[DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)** is an upgraded version of DeepSeek-V3.1-Terminus, introducing the DeepSeek Sparse Attention (DSA) mechanism through continued training. DSA is a fine-grained sparse attention mechanism powered by a lightning indexer, enabling DeepSeek-V3.2-Exp to achieve significant efficiency improvements in long-context scenarios. As an intermediate step toward the next-generation architecture, V3.2-Exp builds upon V3.1-Terminus by introducing DeepSeek Sparse Attention—a sparse attention mechanism designed to explore and validate optimizations for training and inference efficiency in long-context scenarios. Recommended for general conversations, long-context processing, and efficient inference.

**[DeepSeek-V3.2](https://huggingface.co/deepseek-ai/DeepSeek-V3.2)** is the standard version suitable for general tasks and conversational scenarios. For local deployment, we recommend setting the sampling parameters to temperature = 1.0, top\_p = 0.95. Recommended for standard conversations and general tasks.

**[DeepSeek-V3.2-Speciale](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale)** is a special variant designed exclusively for deep reasoning tasks. This model is specifically optimized for scenarios requiring complex logical reasoning and deep thinking. For local deployment, we recommend setting the sampling parameters to temperature = 1.0, top\_p = 0.95. Recommended for deep reasoning tasks, complex logical problems, and mathematical reasoning.

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/installation) for installation instructions.

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, and thinking capabilities.

<DeepSeekV32Deployment />

### 3.2 Configuration Tips

For more detailed configuration tips, please refer to [DeepSeek-V3.2 Usage](../../../docs/basic_usage/deepseek_v32).

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [Basic API Usage](../../../docs/get-started/quickstart)

### 4.2 Advanced Usage

#### 4.2.1 Reasoning Parser

DeepSeek-V3.2 supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections:

```shell Command theme={null}
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3.2-Exp \
  --reasoning-parser deepseek-v3 \
  --tp 8 \
  --host 0.0.0.0 \
  --port 8000
```

**Streaming with Thinking Process:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Enable streaming to see the thinking process in real-time
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.2-Exp",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    temperature=0.7,
    max_tokens=2048,
    extra_body = {"chat_template_kwargs": {"thinking": True}},
    stream=True
)

# Process the stream
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
To solve this problem, I need to calculate 15% of 240.
Step 1: Convert 15% to decimal: 15% = 0.15
Step 2: Multiply 240 by 0.15
Step 3: 240 × 0.15 = 36
=============== Content =================

The answer is 36. To find 15% of 240, we multiply 240 by 0.15, which equals 36.
```

**Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions.

#### 4.2.2 Tool Calling

DeepSeek-V3.2 and DeepSeek-V3.2-Exp support tool calling capabilities. Enable the tool call parser:

**Note:** DeepSeek-V3.2-Speciale does **NOT** support tool calling. It is designed exclusively for deep reasoning tasks.

**Deployment Command:**

```shell Command theme={null}
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3.2-Exp \
  --tool-call-parser deepseekv31 \
  --reasoning-parser deepseek-v3 \
  --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja \
  --tp 8 \
  --host 0.0.0.0 \
  --port 8000
```

For DeepSeek-V3.2, use `--tool-call-parser deepseekv32` instead.

**Python Example (with Thinking Process):**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request with streaming to see thinking process
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.2-Exp",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    extra_body = {"chat_template_kwargs": {"thinking": True}},
    temperature=0.7,
    stream=True
)

# Process streaming response
thinking_started = False
has_thinking = False
tool_calls_accumulator = {}

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Accumulate tool calls
        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            # Close thinking section if needed
            if has_thinking and thinking_started:
                print("\n=============== Content =================\n", flush=True)
                thinking_started = False

            for tool_call in delta.tool_calls:
                index = tool_call.index
                if index not in tool_calls_accumulator:
                    tool_calls_accumulator[index] = {
                        'name': None,
                        'arguments': ''
                    }

                if tool_call.function:
                    if tool_call.function.name:
                        tool_calls_accumulator[index]['name'] = tool_call.function.name
                    if tool_call.function.arguments:
                        tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments

        # Print content
        if delta.content:
            print(delta.content, end="", flush=True)

# Print accumulated tool calls
for index, tool_call in sorted(tool_calls_accumulator.items()):
    print(f"🔧 Tool Call: {tool_call['name']}")
    print(f"   Arguments: {tool_call['arguments']}")

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
The user is asking about the weather in Beijing. I need to use the get_weather function to retrieve this information.
I should call the function with location="Beijing".
=============== Content =================

🔧 Tool Call: get_weather
   Arguments: {"location": "Beijing", "unit": "celsius"}
```

**Note:**

* The reasoning parser shows how the model decides to use a tool
* Tool calls are clearly marked with the function name and arguments
* You can then execute the function and send the result back to continue the conversation

**Handling Tool Call Results:**

```python Example theme={null}
# After getting the tool call, execute the function
def get_weather(location, unit="celsius"):
    # Your actual weather API call here
    return f"The weather in {location} is 22°{unit[0].upper()} and sunny."

# Send tool result back to the model
messages = [
    {"role": "user", "content": "What's the weather in Beijing?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Beijing", "unit": "celsius"}'
            }
        }]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": get_weather("Beijing", "celsius")
    }
]

final_response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.2-Exp",
    messages=messages,
    temperature=0.7
)

print(final_response.choices[0].message.content)
# Output: "The weather in Beijing is currently 22°C and sunny."
```

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: NVIDIA B200 GPU (8x)
* Model: DeepSeek-V3.2-Exp
* Tensor Parallelism: 8
* sglang version: 0.5.6

We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT\_Vicuna\_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios. To simulate real-world usage patterns, we configure each request with 1024 input tokens and 1024 output tokens, representing typical medium-length conversations with detailed responses.

#### 5.1.1 Latency-Sensitive Benchmark

* Model Deployment Command:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3.2-Exp \
  --tp 8 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --host 0.0.0.0 \
  --port 8000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 8000 \
  --model deepseek-ai/DeepSeek-V3.2-Exp \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 10 \
  --max-concurrency 1
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  29.11
Total input tokens:                      1972
Total input text tokens:                 1972
Total input vision tokens:               0
Total generated tokens:                  2784
Total generated tokens (retokenized):    2777
Request throughput (req/s):              0.34
Input token throughput (tok/s):          67.73
Output token throughput (tok/s):         95.62
Peak output token throughput (tok/s):    157.00
Peak concurrent requests:                3
Total token throughput (tok/s):          163.36
Concurrency:                             1.00
Accept length:                           2.46
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   2909.74
Median E2E Latency (ms):                 3088.27
P90 E2E Latency (ms):                    4200.62
P99 E2E Latency (ms):                    5588.52
---------------Time to First Token----------------
Mean TTFT (ms):                          317.58
Median TTFT (ms):                        191.31
P99 TTFT (ms):                           740.79
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          9.09
Median TPOT (ms):                        9.25
P99 TPOT (ms):                           11.73
---------------Inter-Token Latency----------------
Mean ITL (ms):                           9.35
Median ITL (ms):                         7.64
P95 ITL (ms):                            22.81
P99 ITL (ms):                            23.33
Max ITL (ms):                            31.45
==================================================
```

#### 5.1.2 Throughput-Sensitive Benchmark

* Model Deployment Command:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3.2-Exp \
  --tp 8 \
  --ep 8 \
  --dp 8 \
  --enable-dp-attention \
  --host 0.0.0.0 \
  --port 8000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 8000 \
  --model deepseek-ai/DeepSeek-V3.2-Exp \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 1000 \
  --max-concurrency 100
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     1000
Benchmark duration (s):                  219.09
Total input tokens:                      301701
Total input text tokens:                 301701
Total input vision tokens:               0
Total generated tokens:                  188375
Total generated tokens (retokenized):    187443
Request throughput (req/s):              4.56
Input token throughput (tok/s):          1377.06
Output token throughput (tok/s):         859.80
Peak output token throughput (tok/s):    2465.00
Peak concurrent requests:                109
Total token throughput (tok/s):          2236.86
Concurrency:                             88.05
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   19291.23
Median E2E Latency (ms):                 11927.39
---------------Time to First Token----------------
Mean TTFT (ms):                          530.36
Median TTFT (ms):                        444.00
P99 TTFT (ms):                           1504.78
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          106.16
Median TPOT (ms):                        106.69
P99 TPOT (ms):                           221.12
---------------Inter-Token Latency----------------
Mean ITL (ms):                           100.46
Median ITL (ms):                         41.73
P95 ITL (ms):                            225.67
P99 ITL (ms):                            392.37
Max ITL (ms):                            975.03
==================================================
```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000
```

* **Test Results**:
  * DeepSeek-V3.2-Exp
    ```text Output theme={null}
    Accuracy: 0.980
    Invalid: 0.000
    Latency: 19.128 s
    Output throughput: 965.919 token/s
    ```

#### 5.2.2 MMLU Benchmark

* **Benchmark Command:**

```shell Command theme={null}
cd sglang
bash benchmark/mmlu/download_data.sh
python3 benchmark/mmlu/bench_sglang.py --nsub 10 --port 8000
```

* **Test Results**:
  * DeepSeek-V3.2-Exp
    ```text Output theme={null}
    subject: abstract_algebra, #q:100, acc: 0.780
    subject: anatomy, #q:135, acc: 0.874
    subject: astronomy, #q:152, acc: 0.961
    subject: business_ethics, #q:100, acc: 0.860
    subject: clinical_knowledge, #q:265, acc: 0.925
    subject: college_biology, #q:144, acc: 0.972
    subject: college_chemistry, #q:100, acc: 0.660
    subject: college_computer_science, #q:100, acc: 0.880
    subject: college_mathematics, #q:100, acc: 0.840
    subject: college_medicine, #q:173, acc: 0.879
    Total latency: 7.961
    Average accuracy: 0.879
    ```


Built with [Mintlify](https://mintlify.com).