litellm: route all homelab LLM load to Anvil/Ollama by default

Add per-model api_base/api_key overrides to the template (the default stays Meridian's local port). All standard aliases (claude-*, gpt-*) now point at Anvil's Ollama (mini/haiku-class -> llama3.1:8b, everything else -> llama3.3:70b). Claude/Max is reachable only via the new *-max escape-hatch aliases.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
2026-05-28 11:16:46 -04:00
parent 8adecb417a
commit c29e24b51b
2 changed files with 62 additions and 20 deletions
@@ -53,27 +53,67 @@ litellm_venv: /opt/litellm/venv
 litellm_port: 4000
 litellm_host: "0.0.0.0"
 litellm_package_spec: "litellm[proxy]==1.55.10"
-# Models map onto Meridian's pinned Anthropic-shape backend on 127.0.0.1:3456.
-# Native Claude aliases AND OpenAI-named aliases — some clients (paperless-ai's
-# setup wizard) hardcode `model=gpt-4o-mini` for validation regardless of
-# what you set as the default model, so we shadow the common OpenAI names too.
+# Anvil — local Ollama backend (Strix Halo iGPU, gfx1151). Addressed by IP, not
+# DNS, so inference never depends on the resolver. No auth: api_key is a dummy.
+anvil_ollama_base: "http://192.168.1.150:11434"
+
+# ALL homelab LLM load routes LOCAL as of 2026-05-28. Every standard alias that
+# clients already use (claude-*, gpt-*) now resolves to Anvil/Ollama — no client
+# reconfig needed. Meridian still runs, but Claude/Max is reachable ONLY via the
+# explicit *-max escape-hatch aliases below (use them for vision or hard
+# reasoning — llama3.x is text-only and weaker on complex tasks).
+#
+# Size split: mini/haiku-class → llama3.1:8b; everything else → llama3.3:70b.
+# Single GPU, OLLAMA_NUM_PARALLEL=1 — concurrent requests queue, and the 70B and
+# 8B can't both stay resident in the ~62 GB budget (mixed traffic swaps models).
 litellm_models:
-  # Native Claude aliases (preferred for new clients)
+  # ---- Default aliases → LOCAL (Anvil/Ollama) ----
  - name: claude-haiku-4-5
-    backend: anthropic/claude-haiku-4-5
-  - name: claude-sonnet-4-6
-    backend: anthropic/claude-sonnet-4-6
-  - name: claude-opus-4-7
-    backend: anthropic/claude-opus-4-7
-  # OpenAI-name shadows — for clients that probe gpt-* names regardless of
-  # config (paperless-ai wizard, Open WebUI defaults, etc.). All actually
-  # backed by Claude on the Max sub.
+    backend: ollama_chat/llama3.1:8b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
  - name: gpt-4o-mini
-    backend: anthropic/claude-haiku-4-5
+    backend: ollama_chat/llama3.1:8b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
+  - name: claude-sonnet-4-6
+    backend: ollama_chat/llama3.3:70b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
+  - name: claude-opus-4-7
+    backend: ollama_chat/llama3.3:70b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
  - name: gpt-4o
-    backend: anthropic/claude-sonnet-4-6
+    backend: ollama_chat/llama3.3:70b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
  - name: gpt-4-turbo
+    backend: ollama_chat/llama3.3:70b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
+  # Explicit local model names (nomic-embed-text uses ollama/, not ollama_chat/)
+  - name: llama-3.3-70b
+    backend: ollama_chat/llama3.3:70b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
+  - name: llama-3.1-8b
+    backend: ollama_chat/llama3.1:8b
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
+  - name: nomic-embed-text
+    backend: ollama/nomic-embed-text
+    api_base: "{{ anvil_ollama_base }}"
+    api_key: ollama-no-auth
+  # ---- Escape hatches → Claude via Meridian/Max ----
+  # No api_base/api_key here → the template default (127.0.0.1:meridian_port)
+  # applies. Use these when local can't do the job (vision, hard reasoning).
+  - name: claude-haiku-4-5-max
+    backend: anthropic/claude-haiku-4-5
+  - name: claude-sonnet-4-6-max
    backend: anthropic/claude-sonnet-4-6
+  - name: claude-opus-4-7-max
+    backend: anthropic/claude-opus-4-7
 # Master key is required by LiteLLM. Pulled at deploy time from Infisical
 # /meridian/vault_litellm_master_key and passed via -e on the playbook
 # (see deploy.sh).