AI Module

SDK reference for AI completions and streaming

The ai module provides LLM completions using your platform-configured provider.

from bifrost import ai

ai.complete()

Get a completion from the configured LLM.

async def complete(
    prompt: str | None = None,
    *,
    messages: list[dict[str, str]] | None = None,
    system: str | None = None,
    response_format: type[T] | None = None,
    knowledge: list[str] | None = None,
    max_tokens: int | None = None,
    temperature: float | None = None,
    org_id: str | None = None,
    model: str | None = None,
) -> AIResponse | T
| Parameter | Type | Description |
| --- | --- | --- |
| prompt | str | Simple prompt string |
| messages | list[dict] | Chat messages with role and content |
| system | str | System prompt (prepended to messages) |
| response_format | type | Pydantic model for structured output |
| knowledge | list[str] | Knowledge namespaces for RAG |
| max_tokens | int | Override default max tokens |
| temperature | float | Override default temperature (0.0-2.0) |
| org_id | str | Organization context (auto-set in workflows) |
| model | str | Override default model |

Returns an AIResponse, or an instance of response_format if one was provided.

# Simple prompt
response = await ai.complete("Explain Kubernetes")
print(response.content)

# With system prompt
response = await ai.complete(
    "What should I do?",
    system="You are a helpful assistant."
)

# Message format
response = await ai.complete(
    messages=[
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there!"},
        {"role": "user", "content": "How are you?"}
    ]
)

# Structured output
from pydantic import BaseModel

class Analysis(BaseModel):
    sentiment: str
    score: float

result = await ai.complete(
    "Analyze: Great product!",
    response_format=Analysis
)
print(result.sentiment)  # "positive"

# With RAG
response = await ai.complete(
    "What is the refund policy?",
    knowledge=["policies", "faq"]
)
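
Generation settings can also be overridden per call. A minimal sketch combining several overrides; the specific values and the model name are illustrative assumptions, not recommended defaults:

# Override generation parameters for a single call
response = await ai.complete(
    "Summarize this incident report in two sentences.",
    system="You are a concise technical writer.",
    max_tokens=200,      # cap the response length
    temperature=0.2,     # lower temperature for more deterministic output
    model="gpt-4o",      # assumed model name; use whatever your provider exposes
)
print(response.content)
print(f"{response.input_tokens} in / {response.output_tokens} out via {response.model}")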

ai.stream()

Stream tokens as they are generated.

async def stream(
    prompt: str | None = None,
    *,
    messages: list[dict[str, str]] | None = None,
    system: str | None = None,
    knowledge: list[str] | None = None,
    max_tokens: int | None = None,
    temperature: float | None = None,
    org_id: str | None = None,
    model: str | None = None,
) -> AsyncGenerator[AIStreamChunk, None]

Takes the same parameters as ai.complete(), except response_format is not supported.

| Parameter | Type | Description |
| --- | --- | --- |
| prompt | str | Simple prompt string |
| messages | list[dict] | Chat messages with role and content |
| system | str | System prompt (prepended to messages) |
| knowledge | list[str] | Knowledge namespaces for RAG |
| max_tokens | int | Override default max tokens |
| temperature | float | Override default temperature (0.0-2.0) |
| org_id | str | Organization context (auto-set in workflows) |
| model | str | Override default model |

Yields AIStreamChunk objects with:

  • content - Text content of this chunk
  • done - Whether this is the final chunk
  • input_tokens - Total input tokens (only on final chunk)
  • output_tokens - Total output tokens (only on final chunk)
async for chunk in ai.stream("Write a story"):
    print(chunk.content, end="", flush=True)
    if chunk.done:
        print(f"\nTokens: {chunk.input_tokens + chunk.output_tokens}")

ai.get_model_info()

Get information about the configured LLM provider.

async def get_model_info() -> dict[str, Any]

Returns a dictionary with provider, model, and configuration details.

info = await ai.get_model_info()
print(f"Using {info['provider']}/{info['model']}")
class AIResponse(BaseModel):
    content: str        # Generated text
    input_tokens: int   # Tokens in prompt
    output_tokens: int  # Tokens in response
    model: str          # Model used

class AIStreamChunk(BaseModel):
    content: str                      # Chunk text
    done: bool                        # Is final chunk
    input_tokens: int | None = None   # Only on final
    output_tokens: int | None = None  # Only on final
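
A quick sketch of working with AIResponse fields directly, for example to log token usage after a completion; the logging setup is an assumption for illustration:

import logging

logger = logging.getLogger(__name__)

response = await ai.complete("Explain Kubernetes")

# AIResponse exposes the generated text plus usage metadata
logger.info(
    "model=%s input_tokens=%d output_tokens=%d",
    response.model,
    response.input_tokens,
    response.output_tokens,
)
print(response.content)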