litellm: re-index models with local_/proxy_/direct_ prefixes + scaffold OpenAI+Gemini
Backend-prefix taxonomy so the Open WebUI picker is self-documenting and a model name can't lie about where it routes:

  local_*  -> Anvil/Ollama (free), e.g. local_qwen2.5-72b
  proxy_*  -> Claude via Meridian/Max, e.g. proxy_claude-sonnet-4-6
  direct_* -> metered OpenAI/Gemini, e.g. direct_gpt-4o, direct_gemini-2.0-flash

Drops the redundant -max suffix (proxy_ already implies Max). api_base is now emitted only when a model defines it, so direct_* models hit the provider default endpoint instead of Meridian.

direct_* are SCAFFOLDED (no live keys): litellm.env writes a placeholder so the proxy boots; deploy.sh pulls OPENAI_API_KEY / GEMINI_API_KEY from Infisical /meridian if present (non-fatal). They return 401 until real keys land.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,25 @@ if [[ -z "${LITELLM_MASTER_KEY:-}" ]]; then
|
|||||||
export LITELLM_MASTER_KEY
|
export LITELLM_MASTER_KEY
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Optional provider keys for direct_* models. Non-fatal: if a key cannot be
# pulled from Infisical /meridian, litellm.env falls back to a placeholder and
# the direct_* model 401s on call (proxy_* + local_* keep working). To activate,
# add the secret to Infisical /meridian under the name vault_<lowercased env var>
# (vault_openai_api_key, vault_gemini_api_key), then re-deploy.

#######################################
# Export one provider API key from Infisical unless it is already set.
# Arguments: $1 - name of the environment variable (e.g. OPENAI_API_KEY)
# Outputs:   one status line on stdout
# Returns:   0 always; a missing key must never abort the deploy
#######################################
pull_provider_key() {
  local keyvar=$1
  local secret_name val

  # A value supplied by the caller's environment wins; nothing to fetch.
  if [[ -n "${!keyvar:-}" ]]; then
    return 0
  fi

  # Report a missing CLI as such instead of claiming the secret is absent.
  if ! command -v infisical >/dev/null 2>&1; then
    echo "==> infisical CLI not found — cannot look up ${keyvar}; direct_* for this provider stays scaffolded (401 until set)."
    return 0
  fi

  # Secrets are stored as vault_<lowercased env var name>. tr (not ${var,,})
  # keeps this working on bash 3.2.
  secret_name="vault_$(printf '%s' "$keyvar" | tr '[:upper:]' '[:lower:]')"

  # stderr is silenced on purpose (the lookup is best-effort), but the exit
  # status is honoured: stdout from a FAILED lookup must never be exported as
  # if it were the key.
  if val="$(infisical secrets get "$secret_name" \
      --projectId 50062d7c-06ff-4d5c-8ca3-6c0cdba9f270 \
      --env prod --path /meridian --plain 2>/dev/null)" && [[ -n "$val" ]]; then
    echo "==> Pulled ${keyvar} from Infisical (direct_* enabled)."
    export "$keyvar=$val"
  else
    echo "==> ${keyvar} not in Infisical /meridian — direct_* for this provider stays scaffolded (401 until set)."
  fi
  return 0
}

for keyvar in OPENAI_API_KEY GEMINI_API_KEY; do
  pull_provider_key "$keyvar"
done
|
||||||
|
|
||||||
echo "==> Checking connectivity to ${HOST_USER}@${HOST_IP} ..."
|
echo "==> Checking connectivity to ${HOST_USER}@${HOST_IP} ..."
|
||||||
if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "${HOST_USER}@${HOST_IP}" true 2>/dev/null; then
|
if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "${HOST_USER}@${HOST_IP}" true 2>/dev/null; then
|
||||||
echo " Cannot SSH to ${HOST_IP} — refreshing host key ..."
|
echo " Cannot SSH to ${HOST_IP} — refreshing host key ..."
|
||||||
|
|||||||
@@ -1,17 +1,24 @@
|
|||||||
# {{ ansible_managed }}
|
# {{ ansible_managed }}
|
||||||
#
|
#
|
||||||
# LiteLLM proxy config. Routes OpenAI-shaped requests to backends:
|
# LiteLLM proxy config. Routes OpenAI-shaped requests to backends by the
|
||||||
# - Claude models → Meridian's /v1/messages (same host, :3456), which
|
# model-name prefix set in vars (litellm_models):
|
||||||
# ignores the upstream API key (placeholder passed below).
|
# - proxy_* → Meridian's /v1/messages (same host, :3456), which ignores the
|
||||||
# - Local models → Anvil's Ollama (openai/ provider, OpenAI-compatible
|
# upstream API key (placeholder); the Max-OAuth sub pays. Explicit api_base.
|
||||||
# endpoint at http://192.168.1.150:11434). Set per-model api_base in vars.
|
# - local_* → Anvil's Ollama (OpenAI-compatible, http://192.168.1.150:11434).
|
||||||
|
# Explicit api_base.
|
||||||
|
# - direct_* → a public provider (OpenAI/Gemini). NO api_base → LiteLLM uses
|
||||||
|
# the provider default endpoint; api_key reads os.environ/<PROVIDER>_API_KEY.
|
||||||
|
# api_base is emitted only when a model defines it; omit it to reach a provider
|
||||||
|
# default.
|
||||||
|
|
||||||
model_list:
|
model_list:
|
||||||
{% for m in litellm_models %}
|
{% for m in litellm_models %}
|
||||||
- model_name: {{ m.name }}
|
- model_name: {{ m.name }}
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: {{ m.backend }}
|
model: {{ m.backend }}
|
||||||
api_base: {{ m.api_base | default('http://127.0.0.1:' ~ meridian_port) }}
|
{% if m.api_base is defined %}
|
||||||
|
api_base: {{ m.api_base }}
|
||||||
|
{% endif %}
|
||||||
api_key: {{ m.api_key | default('placeholder-meridian-ignores-this') }}
|
api_key: {{ m.api_key | default('placeholder-meridian-ignores-this') }}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
|
|||||||
@@ -1,2 +1,7 @@
|
|||||||
# {{ ansible_managed }}
|
# {{ ansible_managed }}
|
||||||
LITELLM_MASTER_KEY={{ litellm_master_key }}
|
LITELLM_MASTER_KEY={{ litellm_master_key }}
|
||||||
|
# direct_* provider keys. A placeholder keeps os.environ/<KEY> resolvable so the
|
||||||
|
# proxy boots; real keys come from Infisical /meridian via deploy.sh. Until then
|
||||||
|
# direct_* models 401 on call (proxy_* + local_* are unaffected).
|
||||||
|
OPENAI_API_KEY={{ litellm_openai_api_key | default('', true) or 'sk-PLACEHOLDER-set-in-infisical-meridian' }}
|
||||||
|
GEMINI_API_KEY={{ litellm_gemini_api_key | default('', true) or 'PLACEHOLDER-set-in-infisical-meridian' }}
|
||||||
|
|||||||
+59
-18
@@ -57,45 +57,86 @@ litellm_package_spec: "litellm[proxy]==1.55.10"
|
|||||||
# the inference path off the resolver. Ollama has no auth (placeholder api_key).
|
# the inference path off the resolver. Ollama has no auth (placeholder api_key).
|
||||||
anvil_ollama_base: "http://192.168.1.150:11434"
|
anvil_ollama_base: "http://192.168.1.150:11434"
|
||||||
|
|
||||||
# Model list — honest names only. Shadow aliases (claude-*/gpt-*) were removed
|
# Model list — a prefix states each model's backend up front, so the Open WebUI
|
||||||
# 2026-05-29 because the dual meaning (was-Claude, now-local) was a constant
|
# picker is self-documenting (re-indexed 2026-06-05):
|
||||||
# foot-gun in the Open WebUI picker. Local models keep their real names; Claude
|
# local_* → Anvil/Ollama, local GPU, no metered cost
|
||||||
# is reached only via the explicit *-max aliases.
|
# proxy_* → Claude via Meridian's Max-OAuth bridge, no per-token cost
|
||||||
|
# direct_* → a real provider API (OpenAI/Gemini), METERED billing, key from
|
||||||
|
# Infisical /meridian
|
||||||
#
|
#
|
||||||
# Trade-off: any client that hard-codes a claude-*/gpt-* model name in a probe
|
# This replaces the bare-name scheme. Shadow aliases (bare claude-*/gpt-*) were
|
||||||
# (paperless-ai wizard hits gpt-4o-mini; see [[litellm-openai-alias-shadowing]])
|
# already removed 2026-05-29 because the dual meaning (was-Claude, now-local) was
|
||||||
# will 400 with `Invalid model name` until that alias is re-added.
|
# a foot-gun; the prefix makes that impossible — a name can no longer lie about
|
||||||
|
# where it routes. See [[litellm-openai-alias-shadowing]].
|
||||||
|
#
|
||||||
|
# Trade-off: clients that hard-code a bare model name in a probe (paperless-ai
|
||||||
|
# wizard hits gpt-4o-mini) 400 until pointed at the prefixed name — or, for
|
||||||
|
# OpenAI specifically, at direct_gpt-4o-mini once a key is in Infisical.
|
||||||
#
|
#
|
||||||
# Single GPU, OLLAMA_NUM_PARALLEL=1 — concurrent/mixed requests queue and the
|
# Single GPU, OLLAMA_NUM_PARALLEL=1 — concurrent/mixed requests queue and the
|
||||||
# 70B+8B can't both stay resident in the ~62 GB budget (expect model swaps).
|
# 70B+8B can't both stay resident in the ~62 GB budget (expect model swaps).
|
||||||
litellm_models:
|
litellm_models:
|
||||||
# ---- Local (Anvil/Ollama) ----
|
# ---- local_* → Anvil/Ollama, local GPU, no cost ----
|
||||||
- name: qwen2.5-72b
|
- name: local_qwen2.5-72b
|
||||||
backend: ollama_chat/qwen2.5:72b
|
backend: ollama_chat/qwen2.5:72b
|
||||||
api_base: "{{ anvil_ollama_base }}"
|
api_base: "{{ anvil_ollama_base }}"
|
||||||
api_key: ollama-no-auth
|
api_key: ollama-no-auth
|
||||||
- name: llama-3.3-70b
|
- name: local_llama-3.3-70b
|
||||||
backend: ollama_chat/llama3.3:70b
|
backend: ollama_chat/llama3.3:70b
|
||||||
api_base: "{{ anvil_ollama_base }}"
|
api_base: "{{ anvil_ollama_base }}"
|
||||||
api_key: ollama-no-auth
|
api_key: ollama-no-auth
|
||||||
- name: llama-3.1-8b
|
- name: local_llama-3.1-8b
|
||||||
backend: ollama_chat/llama3.1:8b
|
backend: ollama_chat/llama3.1:8b
|
||||||
api_base: "{{ anvil_ollama_base }}"
|
api_base: "{{ anvil_ollama_base }}"
|
||||||
api_key: ollama-no-auth
|
api_key: ollama-no-auth
|
||||||
- name: nomic-embed-text
|
# Embeddings don't actually route through LiteLLM 1.55.10 (broken async ollama
|
||||||
|
# handler → 500); consumers hit Anvil direct. Kept as a catalog entry only.
|
||||||
|
- name: local_nomic-embed-text
|
||||||
backend: ollama/nomic-embed-text
|
backend: ollama/nomic-embed-text
|
||||||
api_base: "{{ anvil_ollama_base }}"
|
api_base: "{{ anvil_ollama_base }}"
|
||||||
api_key: ollama-no-auth
|
api_key: ollama-no-auth
|
||||||
# ---- Escape hatches → Claude via Meridian/Max ----
|
# ---- proxy_* → Claude via Meridian/Max, no per-token cost ----
|
||||||
# No api_base → template default (127.0.0.1:meridian_port). Reach Claude by
|
# api_base pins Meridian's local Anthropic endpoint (the OAuth Max sub pays).
|
||||||
# name when local can't do the job (vision, hard reasoning).
|
# Reach Claude by name when local can't do the job (vision, hard reasoning).
|
||||||
- name: claude-haiku-4-5-max
|
- name: proxy_claude-haiku-4-5
|
||||||
backend: anthropic/claude-haiku-4-5
|
backend: anthropic/claude-haiku-4-5
|
||||||
- name: claude-sonnet-4-6-max
|
api_base: "http://127.0.0.1:{{ meridian_port }}"
|
||||||
|
- name: proxy_claude-sonnet-4-6
|
||||||
backend: anthropic/claude-sonnet-4-6
|
backend: anthropic/claude-sonnet-4-6
|
||||||
- name: claude-opus-4-7-max
|
api_base: "http://127.0.0.1:{{ meridian_port }}"
|
||||||
|
- name: proxy_claude-opus-4-7
|
||||||
backend: anthropic/claude-opus-4-7
|
backend: anthropic/claude-opus-4-7
|
||||||
|
api_base: "http://127.0.0.1:{{ meridian_port }}"
|
||||||
|
# ---- direct_* → public provider APIs, METERED, keys from Infisical /meridian ----
|
||||||
|
# SCAFFOLDED 2026-06-05 with NO real keys: these 401 on call until
|
||||||
|
# vault_openai_api_key / vault_gemini_api_key land in Infisical /meridian and deploy.sh
|
||||||
|
# pulls them (litellm.env carries a placeholder so the proxy still boots).
|
||||||
|
# No api_base → LiteLLM uses each provider's default endpoint; routing is by
|
||||||
|
# the backend's provider prefix (openai/, gemini/).
|
||||||
|
- name: direct_gpt-4o
|
||||||
|
backend: openai/gpt-4o
|
||||||
|
api_key: os.environ/OPENAI_API_KEY
|
||||||
|
- name: direct_gpt-4o-mini
|
||||||
|
backend: openai/gpt-4o-mini
|
||||||
|
api_key: os.environ/OPENAI_API_KEY
|
||||||
|
- name: direct_o3-mini
|
||||||
|
backend: openai/o3-mini
|
||||||
|
api_key: os.environ/OPENAI_API_KEY
|
||||||
|
- name: direct_gemini-2.0-flash
|
||||||
|
backend: gemini/gemini-2.0-flash
|
||||||
|
api_key: os.environ/GEMINI_API_KEY
|
||||||
|
- name: direct_gemini-1.5-pro
|
||||||
|
backend: gemini/gemini-1.5-pro
|
||||||
|
api_key: os.environ/GEMINI_API_KEY
|
||||||
# Master key is required by LiteLLM. Pulled at deploy time from Infisical
|
# Master key is required by LiteLLM. Pulled at deploy time from Infisical
|
||||||
# /meridian/vault_litellm_master_key and passed via -e on the playbook
|
# /meridian/vault_litellm_master_key and passed via -e on the playbook
|
||||||
# (see deploy.sh).
|
# (see deploy.sh).
|
||||||
litellm_master_key: "{{ lookup('env', 'LITELLM_MASTER_KEY') | default('CHANGE_ME', true) }}"
|
litellm_master_key: "{{ lookup('env', 'LITELLM_MASTER_KEY') | default('CHANGE_ME', true) }}"
|
||||||
|
|
||||||
|
# Provider keys for direct_* models. Optional — deploy.sh pulls them from
|
||||||
|
# Infisical /meridian if present, else they stay empty and litellm.env writes a
|
||||||
|
# placeholder so the proxy still boots (direct_* models just 401 until a real
|
||||||
|
# key lands). Add vault_openai_api_key / vault_gemini_api_key to Infisical /meridian to
|
||||||
|
# activate them.
|
||||||
|
litellm_openai_api_key: "{{ lookup('env', 'OPENAI_API_KEY') | default('', true) }}"
|
||||||
|
litellm_gemini_api_key: "{{ lookup('env', 'GEMINI_API_KEY') | default('', true) }}"
|
||||||
|
|||||||
Reference in New Issue
Block a user