litellm: route all homelab LLM load to Anvil/Ollama by default

Per-model api_base/api_key overrides in the template (default stays Meridian's local port). All standard aliases (claude-*, gpt-*) now point at Anvil's Ollama (mini/haiku-class -> llama3.1:8b, rest -> llama3.3:70b). Claude/Max reachable only via new *-max escape-hatch aliases. Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
2026-05-28 11:16:46 -04:00
parent 8adecb417a
commit c29e24b51b
2 changed files with 62 additions and 20 deletions
@@ -1,16 +1,18 @@
 # {{ ansible_managed }}
 #
-# LiteLLM proxy config. Routes OpenAI-shaped requests to Meridian's
+# LiteLLM proxy config. Routes OpenAI-shaped requests to backends:
-# /v1/messages (Anthropic format). Meridian (same host, :3456) ignores the
+#   - Claude models → Meridian's /v1/messages (same host, :3456), which
-# upstream API key, so we pass a placeholder.
+#     ignores the upstream API key (placeholder passed below).
 #   - Local models  → Anvil's Ollama (ollama_chat/ and ollama/ providers,
 #     e.g. http://192.168.1.150:11434). Set per-model api_base/api_key in vars.
 model_list:
 {% for m in litellm_models %}
  - model_name: {{ m.name }}
    litellm_params:
      model: {{ m.backend }}
-      api_base: http://127.0.0.1:{{ meridian_port }}
+      api_base: {{ m.api_base | default('http://127.0.0.1:' ~ meridian_port) }}
-      api_key: placeholder-meridian-ignores-this
+      api_key: {{ m.api_key | default('placeholder-meridian-ignores-this') }}
 {% endfor %}
 general_settings:
@@ -53,27 +53,67 @@ litellm_venv: /opt/litellm/venv
 litellm_port: 4000
 litellm_host: "0.0.0.0"
 litellm_package_spec: "litellm[proxy]==1.55.10"
-# Models map onto Meridian's pinned Anthropic-shape backend on 127.0.0.1:3456.
+# Anvil — local Ollama backend (Strix Halo iGPU, gfx1151). IP not DNS keeps
-# Native Claude aliases AND OpenAI-named aliases — some clients (paperless-ai's
+# the inference path off the resolver. Ollama has no auth (placeholder api_key).
-# setup wizard) hardcode `model=gpt-4o-mini` for validation regardless of
+anvil_ollama_base: "http://192.168.1.150:11434"
-# what you set as the default model, so we shadow the common OpenAI names too.
+
 # ALL homelab LLM load routes LOCAL as of 2026-05-28. Every standard alias that
 # clients already use (claude-*, gpt-*) now resolves to Anvil/Ollama — no client
 # reconfig needed. Meridian still runs, but Claude/Max is reachable ONLY via the
 # explicit *-max escape-hatch aliases below (use them for vision or hard
 # reasoning — llama3.x is text-only and weaker on complex tasks).
 #
 # Size split: mini/haiku-class → llama3.1:8b; everything else → llama3.3:70b.
 # Single GPU, OLLAMA_NUM_PARALLEL=1 — concurrent/mixed requests queue and the
 # 70B+8B can't both stay resident in the ~62 GB budget (expect model swaps).
 litellm_models:
-  # Native Claude aliases (preferred for new clients)
+  # ---- Default aliases → LOCAL (Anvil/Ollama) ----
  - name: claude-haiku-4-5
-    backend: anthropic/claude-haiku-4-5
+    backend: ollama_chat/llama3.1:8b
-  - name: claude-sonnet-4-6
+    api_base: "{{ anvil_ollama_base }}"
-    backend: anthropic/claude-sonnet-4-6
+    api_key: ollama-no-auth
  - name: claude-opus-4-7
    backend: anthropic/claude-opus-4-7
  # OpenAI-name shadows — for clients that probe gpt-* names regardless of
  # config (paperless-ai wizard, Open WebUI defaults, etc.). These now resolve
  # to Anvil/Ollama like the other default aliases, not to Claude on the Max sub.
  - name: gpt-4o-mini
-    backend: anthropic/claude-haiku-4-5
+    backend: ollama_chat/llama3.1:8b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  - name: claude-sonnet-4-6
    backend: ollama_chat/llama3.3:70b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  - name: claude-opus-4-7
    backend: ollama_chat/llama3.3:70b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  - name: gpt-4o
-    backend: anthropic/claude-sonnet-4-6
+    backend: ollama_chat/llama3.3:70b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  - name: gpt-4-turbo
    backend: ollama_chat/llama3.3:70b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  # Direct local model names (explicit)
  - name: llama-3.3-70b
    backend: ollama_chat/llama3.3:70b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  - name: llama-3.1-8b
    backend: ollama_chat/llama3.1:8b
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  - name: nomic-embed-text
    backend: ollama/nomic-embed-text
    api_base: "{{ anvil_ollama_base }}"
    api_key: ollama-no-auth
  # ---- Escape hatches → Claude via Meridian/Max ----
  # No api_base → template default (127.0.0.1:meridian_port). Reach Claude by
  # name when local can't do the job (vision, hard reasoning).
  - name: claude-haiku-4-5-max
    backend: anthropic/claude-haiku-4-5
  - name: claude-sonnet-4-6-max
    backend: anthropic/claude-sonnet-4-6
  - name: claude-opus-4-7-max
    backend: anthropic/claude-opus-4-7
 # Master key is required by LiteLLM. Pulled at deploy time from Infisical
 # /meridian/vault_litellm_master_key and passed via -e on the playbook
 # (see deploy.sh).