diff --git a/roles/litellm/templates/litellm-config.yaml.j2 b/roles/litellm/templates/litellm-config.yaml.j2 index 91e3e60..ef442a1 100644 --- a/roles/litellm/templates/litellm-config.yaml.j2 +++ b/roles/litellm/templates/litellm-config.yaml.j2 @@ -1,16 +1,18 @@ # {{ ansible_managed }} # -# LiteLLM proxy config. Routes OpenAI-shaped requests to Meridian's -# /v1/messages (Anthropic format). Meridian (same host, :3456) ignores the -# upstream API key, so we pass a placeholder. +# LiteLLM proxy config. Routes OpenAI-shaped requests to backends: +# - Claude models → Meridian's /v1/messages (same host, :3456), which +# ignores the upstream API key (placeholder passed below). +# - Local models → Anvil's Ollama (ollama_chat/ and ollama/ providers, +# base URL http://192.168.1.150:11434). Set per-model api_base in vars. model_list: {% for m in litellm_models %} - model_name: {{ m.name }} litellm_params: model: {{ m.backend }} - api_base: http://127.0.0.1:{{ meridian_port }} - api_key: placeholder-meridian-ignores-this + api_base: {{ m.api_base | default('http://127.0.0.1:' ~ meridian_port) }} + api_key: {{ m.api_key | default('placeholder-meridian-ignores-this') }} {% endfor %} general_settings: diff --git a/vars/main.yml b/vars/main.yml index 4b47270..8e19470 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -53,27 +53,67 @@ litellm_venv: /opt/litellm/venv litellm_port: 4000 litellm_host: "0.0.0.0" litellm_package_spec: "litellm[proxy]==1.55.10" -# Models map onto Meridian's pinned Anthropic-shape backend on 127.0.0.1:3456. -# Native Claude aliases AND OpenAI-named aliases — some clients (paperless-ai's -# setup wizard) hardcode `model=gpt-4o-mini` for validation regardless of -# what you set as the default model, so we shadow the common OpenAI names too. +# Anvil — local Ollama backend (Strix Halo iGPU, gfx1151). IP not DNS keeps +# the inference path off the resolver. Ollama has no auth (placeholder api_key). 
+anvil_ollama_base: "http://192.168.1.150:11434" + +# ALL homelab LLM load routes LOCAL as of 2026-05-28. Every standard alias that +# clients already use (claude-*, gpt-*) now resolves to Anvil/Ollama — no client +# reconfig needed. Meridian still runs, but Claude/Max is reachable ONLY via the +# explicit *-max escape-hatch aliases below (use them for vision or hard +# reasoning — llama3.x is text-only and weaker on complex tasks). +# +# Size split: mini/haiku-class → llama3.1:8b; everything else → llama3.3:70b. +# Single GPU, OLLAMA_NUM_PARALLEL=1 — concurrent/mixed requests queue and the +# 70B+8B can't both stay resident in the ~62 GB budget (expect model swaps). litellm_models: - # Native Claude aliases (preferred for new clients) + # ---- Default aliases → LOCAL (Anvil/Ollama) ---- - name: claude-haiku-4-5 - backend: anthropic/claude-haiku-4-5 - - name: claude-sonnet-4-6 - backend: anthropic/claude-sonnet-4-6 - - name: claude-opus-4-7 - backend: anthropic/claude-opus-4-7 - # OpenAI-name shadows — for clients that probe gpt-* names regardless of - # config (paperless-ai wizard, Open WebUI defaults, etc.). All actually - # backed by Claude on the Max sub. 
+ backend: ollama_chat/llama3.1:8b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth - name: gpt-4o-mini - backend: anthropic/claude-haiku-4-5 + backend: ollama_chat/llama3.1:8b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth + - name: claude-sonnet-4-6 + backend: ollama_chat/llama3.3:70b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth + - name: claude-opus-4-7 + backend: ollama_chat/llama3.3:70b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth - name: gpt-4o - backend: anthropic/claude-sonnet-4-6 + backend: ollama_chat/llama3.3:70b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth - name: gpt-4-turbo + backend: ollama_chat/llama3.3:70b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth + # Direct local model names (explicit). nomic-embed-text uses ollama/, not ollama_chat/ (presumably because it is an embedding model — TODO confirm) + - name: llama-3.3-70b + backend: ollama_chat/llama3.3:70b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth + - name: llama-3.1-8b + backend: ollama_chat/llama3.1:8b + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth + - name: nomic-embed-text + backend: ollama/nomic-embed-text + api_base: "{{ anvil_ollama_base }}" + api_key: ollama-no-auth + # ---- Escape hatches → Claude via Meridian/Max ---- + # No api_base/api_key → template defaults (127.0.0.1:meridian_port, placeholder + # key). Reach Claude by name when local can't do the job (vision, hard reasoning). + - name: claude-haiku-4-5-max + backend: anthropic/claude-haiku-4-5 + - name: claude-sonnet-4-6-max backend: anthropic/claude-sonnet-4-6 + - name: claude-opus-4-7-max + backend: anthropic/claude-opus-4-7 # Master key is required by LiteLLM. Pulled at deploy time from Infisical # /meridian/vault_litellm_master_key and passed via -e on the playbook # (see deploy.sh).