From 211d26cc63ae5018c3dfde353ad9f3ff85c8e75c Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 5 Jun 2026 12:05:55 -0400 Subject: [PATCH] litellm: re-index models with local_/proxy_/direct_ prefixes + scaffold OpenAI+Gemini Backend-prefix taxonomy so the Open WebUI picker is self-documenting and a model name can't lie about where it routes: local_* -> Anvil/Ollama (free) e.g. local_qwen2.5-72b proxy_* -> Claude via Meridian/Max e.g. proxy_claude-sonnet-4-6 direct_* -> metered OpenAI/Gemini e.g. direct_gpt-4o, direct_gemini-2.0-flash Drops the redundant -max suffix (proxy_ already implies Max). api_base is now emitted only when a model defines it, so direct_* hit the provider default endpoint instead of Meridian. direct_* are SCAFFOLDED (no live keys): litellm.env writes a placeholder so the proxy boots; deploy.sh pulls OPENAI_API_KEY/ GEMINI_API_KEY from Infisical /meridian if present (non-fatal). They 401 until real keys land. Co-Authored-By: Claude Opus 4.8 (1M context) --- deploy.sh | 19 +++++ .../litellm/templates/litellm-config.yaml.j2 | 19 +++-- roles/litellm/templates/litellm.env.j2 | 5 ++ vars/main.yml | 77 ++++++++++++++----- 4 files changed, 96 insertions(+), 24 deletions(-) diff --git a/deploy.sh b/deploy.sh index 887da31..b73b15b 100755 --- a/deploy.sh +++ b/deploy.sh @@ -30,6 +30,25 @@ if [[ -z "${LITELLM_MASTER_KEY:-}" ]]; then export LITELLM_MASTER_KEY fi +# Optional provider keys for direct_* models. Non-fatal: if a key isn't in +# Infisical /meridian yet, litellm.env falls back to a placeholder and the +# direct_* model 401s on call (proxy_* + local_* keep working). Drop the secret +# into Infisical /meridian to activate, then re-deploy. 
+for keyvar in OPENAI_API_KEY GEMINI_API_KEY; do + if [[ -z "${!keyvar:-}" ]]; then + secret_name="vault_$(echo "$keyvar" | tr '[:upper:]' '[:lower:]')" + val="$(infisical secrets get "$secret_name" \ + --projectId 50062d7c-06ff-4d5c-8ca3-6c0cdba9f270 \ + --env prod --path /meridian --plain 2>/dev/null || true)" + if [[ -n "$val" ]]; then + echo "==> Pulled ${keyvar} from Infisical (direct_* enabled)." + export "$keyvar=$val" + else + echo "==> ${keyvar} not in Infisical /meridian — direct_* for this provider stays scaffolded (401 until set)." + fi + fi +done + echo "==> Checking connectivity to ${HOST_USER}@${HOST_IP} ..." if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "${HOST_USER}@${HOST_IP}" true 2>/dev/null; then echo " Cannot SSH to ${HOST_IP} — refreshing host key ..." diff --git a/roles/litellm/templates/litellm-config.yaml.j2 b/roles/litellm/templates/litellm-config.yaml.j2 index ef442a1..c6cf276 100644 --- a/roles/litellm/templates/litellm-config.yaml.j2 +++ b/roles/litellm/templates/litellm-config.yaml.j2 @@ -1,17 +1,24 @@ # {{ ansible_managed }} # -# LiteLLM proxy config. Routes OpenAI-shaped requests to backends: -# - Claude models → Meridian's /v1/messages (same host, :3456), which -# ignores the upstream API key (placeholder passed below). -# - Local models → Anvil's Ollama (openai/ provider, OpenAI-compatible -# endpoint at http://192.168.1.150:11434). Set per-model api_base in vars. +# LiteLLM proxy config. Routes OpenAI-shaped requests to backends by the +# model-name prefix set in vars (litellm_models): +# - proxy_* → Meridian's /v1/messages (same host, :3456), which ignores the +# upstream API key (placeholder); the Max-OAuth sub pays. Explicit api_base. +# - local_* → Anvil's Ollama (OpenAI-compatible, http://192.168.1.150:11434). +# Explicit api_base. +# - direct_* → a public provider (OpenAI/Gemini). NO api_base → LiteLLM uses +# the provider default endpoint; api_key reads os.environ/{PROVIDER}_API_KEY. 
+# api_base is emitted only when a model defines it; omit it to reach a provider +# default. model_list: {% for m in litellm_models %} - model_name: {{ m.name }} litellm_params: model: {{ m.backend }} - api_base: {{ m.api_base | default('http://127.0.0.1:' ~ meridian_port) }} +{% if m.api_base is defined %} + api_base: {{ m.api_base }} +{% endif %} api_key: {{ m.api_key | default('placeholder-meridian-ignores-this') }} {% endfor %} diff --git a/roles/litellm/templates/litellm.env.j2 b/roles/litellm/templates/litellm.env.j2 index b18456e..a84b9d3 100644 --- a/roles/litellm/templates/litellm.env.j2 +++ b/roles/litellm/templates/litellm.env.j2 @@ -1,2 +1,7 @@ # {{ ansible_managed }} LITELLM_MASTER_KEY={{ litellm_master_key }} +# direct_* provider keys. A placeholder keeps os.environ/{PROVIDER}_API_KEY resolvable so the +# proxy boots; real keys come from Infisical /meridian via deploy.sh. Until then +# direct_* models 401 on call (proxy_* + local_* are unaffected). +OPENAI_API_KEY={{ litellm_openai_api_key | default('', true) or 'sk-PLACEHOLDER-set-in-infisical-meridian' }} +GEMINI_API_KEY={{ litellm_gemini_api_key | default('', true) or 'PLACEHOLDER-set-in-infisical-meridian' }} diff --git a/vars/main.yml b/vars/main.yml index 571d318..6470285 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -57,45 +57,86 @@ litellm_package_spec: "litellm[proxy]==1.55.10" # the inference path off the resolver. Ollama has no auth (placeholder api_key). anvil_ollama_base: "http://192.168.1.150:11434" -# Model list — honest names only. Shadow aliases (claude-*/gpt-*) were removed -# 2026-05-29 because the dual meaning (was-Claude, now-local) was a constant -# foot-gun in the Open WebUI picker. Local models keep their real names; Claude -# is reached only via the explicit *-max aliases. 
+# Model list — a prefix states each model's backend up front, so the Open WebUI +# picker is self-documenting (re-indexed 2026-06-05): +# local_* → Anvil/Ollama, local GPU, no metered cost +# proxy_* → Claude via Meridian's Max-OAuth bridge, no per-token cost +# direct_* → a real provider API (OpenAI/Gemini), METERED billing, key from +# Infisical /meridian # -# Trade-off: any client that hard-codes a claude-*/gpt-* model name in a probe -# (paperless-ai wizard hits gpt-4o-mini; see [[litellm-openai-alias-shadowing]]) -# will 400 with `Invalid model name` until that alias is re-added. +# This replaces the bare-name scheme. Shadow aliases (bare claude-*/gpt-*) were +# already removed 2026-05-29 because the dual meaning (was-Claude, now-local) was +# a foot-gun; the prefix makes that impossible — a name can no longer lie about +# where it routes. See [[litellm-openai-alias-shadowing]]. +# +# Trade-off: clients that hard-code a bare model name in a probe (paperless-ai +# wizard hits gpt-4o-mini) 400 until pointed at the prefixed name — or, for +# OpenAI specifically, at direct_gpt-4o-mini once a key is in Infisical. # # Single GPU, OLLAMA_NUM_PARALLEL=1 — concurrent/mixed requests queue and the # 70B+8B can't both stay resident in the ~62 GB budget (expect model swaps). litellm_models: - # ---- Local (Anvil/Ollama) ---- - - name: qwen2.5-72b + # ---- local_* → Anvil/Ollama, local GPU, no cost ---- + - name: local_qwen2.5-72b backend: ollama_chat/qwen2.5:72b api_base: "{{ anvil_ollama_base }}" api_key: ollama-no-auth - - name: llama-3.3-70b + - name: local_llama-3.3-70b backend: ollama_chat/llama3.3:70b api_base: "{{ anvil_ollama_base }}" api_key: ollama-no-auth - - name: llama-3.1-8b + - name: local_llama-3.1-8b backend: ollama_chat/llama3.1:8b api_base: "{{ anvil_ollama_base }}" api_key: ollama-no-auth - - name: nomic-embed-text + # Embeddings don't actually route through LiteLLM 1.55.10 (broken async ollama + # handler → 500); consumers hit Anvil direct. 
Kept as a catalog entry only. + - name: local_nomic-embed-text backend: ollama/nomic-embed-text api_base: "{{ anvil_ollama_base }}" api_key: ollama-no-auth - # ---- Escape hatches → Claude via Meridian/Max ---- - # No api_base → template default (127.0.0.1:meridian_port). Reach Claude by - # name when local can't do the job (vision, hard reasoning). - - name: claude-haiku-4-5-max + # ---- proxy_* → Claude via Meridian/Max, no per-token cost ---- + # api_base pins Meridian's local Anthropic endpoint (the OAuth Max sub pays). + # Reach Claude by name when local can't do the job (vision, hard reasoning). + - name: proxy_claude-haiku-4-5 backend: anthropic/claude-haiku-4-5 - - name: claude-sonnet-4-6-max + api_base: "http://127.0.0.1:{{ meridian_port }}" + - name: proxy_claude-sonnet-4-6 backend: anthropic/claude-sonnet-4-6 - - name: claude-opus-4-7-max + api_base: "http://127.0.0.1:{{ meridian_port }}" + - name: proxy_claude-opus-4-7 backend: anthropic/claude-opus-4-7 + api_base: "http://127.0.0.1:{{ meridian_port }}" + # ---- direct_* → public provider APIs, METERED, keys from Infisical /meridian ---- + # SCAFFOLDED 2026-06-05 with NO real keys: these 401 on call until + # OPENAI_API_KEY / GEMINI_API_KEY land in Infisical /meridian and deploy.sh + # pulls them (litellm.env carries a placeholder so the proxy still boots). + # No api_base → LiteLLM uses each provider's default endpoint; routing is by + # the backend's provider prefix (openai/, gemini/). + - name: direct_gpt-4o + backend: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + - name: direct_gpt-4o-mini + backend: openai/gpt-4o-mini + api_key: os.environ/OPENAI_API_KEY + - name: direct_o3-mini + backend: openai/o3-mini + api_key: os.environ/OPENAI_API_KEY + - name: direct_gemini-2.0-flash + backend: gemini/gemini-2.0-flash + api_key: os.environ/GEMINI_API_KEY + - name: direct_gemini-1.5-pro + backend: gemini/gemini-1.5-pro + api_key: os.environ/GEMINI_API_KEY # Master key is required by LiteLLM. 
Pulled at deploy time from Infisical # /meridian/vault_litellm_master_key and passed via -e on the playbook # (see deploy.sh). litellm_master_key: "{{ lookup('env', 'LITELLM_MASTER_KEY') | default('CHANGE_ME', true) }}" + +# Provider keys for direct_* models. Optional — deploy.sh pulls them from +# Infisical /meridian if present, else they stay empty and litellm.env writes a +# placeholder so the proxy still boots (direct_* models just 401 until a real +# key lands). Store them in Infisical /meridian as vault_openai_api_key / +# vault_gemini_api_key (the names deploy.sh looks up) to activate them. +litellm_openai_api_key: "{{ lookup('env', 'OPENAI_API_KEY') | default('', true) }}" +litellm_gemini_api_key: "{{ lookup('env', 'GEMINI_API_KEY') | default('', true) }}"