diff --git a/static/js/cookbook.js b/static/js/cookbook.js index edcbab33e..6e710c1bd 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -188,6 +188,19 @@ export function _isMetal() { return ['metal', 'mps', 'apple'].includes(String(_hwfitCache?.system?.backend || '').toLowerCase()); } +/** Jinja chat template for Gemma 4 thinking mode: system content is prefixed with the think control token, assistant messages are rendered as model turns, and the generation prompt opens a model turn on the thought channel. The \n escapes are real newlines in the resulting string. */ const GEMMA4_THINKING_CHAT_TEMPLATE = `{% for message in messages %}{% if message['role'] == 'system' %}<|turn>system\n<|think|>{{ message['content'] }}\n{% elif message['role'] == 'user' %}<|turn>user\n{{ message['content'] }}\n{% elif message['role'] == 'assistant' %}<|turn>model\n{{ message['content'] }}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|turn>model\n<|channel>thought{% endif %}`; + +/** Return true when the lower-cased model name contains gemma-4 or gemma4; a missing name is treated as the empty string. Detection is purely name-based. */ function _isGemma4ThinkingModel(modelName) { + const n = (modelName || '').toLowerCase(); + return n.includes('gemma-4') || n.includes('gemma4'); +} + +/** Return the Gemma 4 thinking template passed through _shellQuote for matching model names, or an empty string for every other model so callers can skip the flag. */ function _gemma4ThinkingChatTemplateArg(modelName) { + return _isGemma4ThinkingModel(modelName) + ? _shellQuote(GEMMA4_THINKING_CHAT_TEMPLATE) + : ''; +} + /** Detect model-specific vLLM optimizations */ function _detectModelOptimizations(modelName) { const n = (modelName || '').toLowerCase(); @@ -388,6 +401,8 @@ export function _buildServeCmd(f, modelName, backend) { const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim(); if (_extraEnv) cmd += _extraEnv + ' '; cmd += `${_vllmBin} serve ${modelName} --host 0.0.0.0 --port ${f.port || '8000'}`; + /* Gemma 4 names get the inline thinking template appended to the vLLM command; other models leave the command untouched. NOTE(review): the quoted template contains real newlines, so the command becomes multi-line - confirm every consumer of cmd tolerates that. */ const _gemma4ChatTemplate = _gemma4ThinkingChatTemplateArg(modelName); + if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`; cmd += ` --tensor-parallel-size ${f.tp || '1'}`; cmd += ` --max-model-len ${f.ctx || '8192'}`; cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`; @@ -418,6 +433,8 @@ export function _buildServeCmd(f, modelName, backend) { const _extraEnv = (f.extra_env ?? 
'').toString().replace(/\s+/g, ' ').trim(); if (_extraEnv) cmd += _extraEnv + ' '; cmd += `${_py3Bin} -m sglang.launch_server --model-path ${modelName} --host 0.0.0.0 --port ${f.port || '30000'}`; + /* Same Gemma 4 template handling as the vLLM branch. NOTE(review): presumably the SGLang --chat-template flag expects a built-in template name or a template file path rather than an inline Jinja string - TODO confirm against the SGLang server arguments before relying on this. */ const _gemma4ChatTemplate = _gemma4ThinkingChatTemplateArg(modelName); + if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`; if (f.tp && f.tp !== '1') cmd += ` --tp ${f.tp}`; if (f.ctx) cmd += ` --context-length ${f.ctx}`; if (f.gpu_mem && f.gpu_mem !== '0.90') cmd += ` --mem-fraction-static ${f.gpu_mem}`; diff --git a/tests/test_cookbook_gemma4_thinking_template.py b/tests/test_cookbook_gemma4_thinking_template.py new file mode 100644 index 000000000..f331cd1d9 --- /dev/null +++ b/tests/test_cookbook_gemma4_thinking_template.py @@ -0,0 +1,31 @@ +"""Regression coverage for issue #2929: Gemma 4 thinking chat template. + +Gemma 4 thinking models need the `<|think|>` control token in the system +instruction, while the generation prompt should start the model turn with the +thought channel. Cookbook serve commands should supply that template for +OpenAI-compatible servers instead of relying on a generic chat template that +cannot toggle thinking mode. 
+""" +from pathlib import Path + +SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js" + + +def test_gemma4_thinking_template_uses_google_documented_thinking_placement(): + text = SRC.read_text(encoding="utf-8") + + assert "GEMMA4_THINKING_CHAT_TEMPLATE" in text + assert "<|turn>system\\n<|think|>{{ message['content'] }}" in text + assert "<|turn>user" in text + assert "<|turn>model" in text + assert "<|turn>model\\n<|channel>thought" in text + assert "<|turn>model\\n<|think|><|channel>thought" not in text + + +def test_vllm_and_sglang_apply_gemma4_thinking_template(): + text = SRC.read_text(encoding="utf-8") + + assert "function _isGemma4ThinkingModel" in text + assert "const _gemma4ChatTemplate" in text + assert "if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`;" in text + assert text.count("_gemma4ThinkingChatTemplateArg(modelName)") >= 2