vLLM/Recipes

inclusionAI/Ling-2.6-flash

Ling-2.6-flash (BailingMoeV2_5): an instruct model with 104B total / 7.4B active parameters, hybrid linear + MLA attention, and a 128K context window, optimized for agent workloads.

moe · 104B / 7.4B · 131,072 ctx · vLLM nightly+ · text
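Since the card notes the model requires a recent vLLM nightly build, a minimal launch sketch might look like the following. The parallelism degree is an assumption for illustration (a 104B-parameter MoE will not fit on a single typical GPU); only the model ID and the 131,072-token context length come from the card itself.

```shell
# Sketch, not a verified recipe. Assumes a vLLM nightly build with
# BailingMoeV2_5 support and 4 GPUs (tensor-parallel degree is illustrative).
vllm serve inclusionAI/Ling-2.6-flash \
  --tensor-parallel-size 4 \
  --max-model-len 131072 \
  --trust-remote-code
```

`--max-model-len` matches the advertised 131,072-token context; lowering it reduces KV-cache memory pressure if the full window is not needed.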