Evaluations SDK

Create and manage prompt evaluations. Evaluations are definition entities that specify how to evaluate a prompt against a dataset. Results are produced by running the evaluation or via the optimization loop.
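A minimal end-to-end sketch using the methods documented below (the IDs are illustrative):

from mutagent import Mutagent
from mutagent.models import PromptIdDatasetIdName

with Mutagent() as client:
    # 1. Define the evaluation.
    evaluation = client.prompt_evaluations.create_evaluation(
        body=PromptIdDatasetIdName(prompt_id=42, dataset_id=7, name="Smoke test"),
    )
    # 2. Trigger a run; execution happens asynchronously.
    client.prompt_evaluations.run_evaluation(id_=evaluation["id"])
    # 3. Fetch results once the run completes (see Poll for Completion below).
    result = client.prompt_evaluations.get_evaluation_result(id_=evaluation["id"])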

List Evaluations

Retrieve evaluations with optional filters:
from mutagent import Mutagent

with Mutagent() as client:
    result = client.prompt_evaluations.list_evaluations(
        prompt_id=42,
        limit=20,
        offset=0,
    )
    for eval_ in result.get("data", []):
        print(eval_["id"], eval_["name"])

Filter parameters

Parameter        Type  Description
prompt_id        int   Filter by prompt ID
prompt_group_id  str   Filter by prompt group UUID
dataset_id       int   Filter by dataset ID
name             str   Filter by evaluation name
created_by       str   Filter by creator email
is_latest        bool  Filter by latest-version flag
limit            int   Results per page
offset           int   Number of results to skip
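
A sketch of paging through filtered results; the filter values here are illustrative, and the "data" envelope follows the listing example above:

from mutagent import Mutagent

with Mutagent() as client:
    limit, offset = 20, 0
    while True:
        # Page through the latest evaluations created by one user.
        page = client.prompt_evaluations.list_evaluations(
            created_by="dev@example.com",  # illustrative filter value
            is_latest=True,
            limit=limit,
            offset=offset,
        )
        evaluations = page.get("data", [])
        if not evaluations:
            break
        for eval_ in evaluations:
            print(eval_["id"], eval_["name"])
        offset += limit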

Create Evaluation

Create an evaluation definition linking a prompt to a dataset:
from mutagent import Mutagent
from mutagent.models import PromptIdDatasetIdName

with Mutagent() as client:
    evaluation = client.prompt_evaluations.create_evaluation(
        body=PromptIdDatasetIdName(
            prompt_id=42,
            dataset_id=7,
            name="Customer Support Quality Eval",
            description="Evaluate tone, accuracy, and helpfulness",
            eval_config={
                "metrics": ["g_eval", "semantic_similarity"],
                "threshold": 0.8,
            },
            llm_config={
                "model": "claude-sonnet-4-6",
                "temperature": 0,
            },
            tags=["production", "baseline"],
        )
    )
    print("Created evaluation:", evaluation["id"])

PromptIdDatasetIdName fields

Field        Type       Required  Description
prompt_id    int        Yes       ID of the prompt to evaluate
name         str        Yes       Human-readable name (max 255 chars)
dataset_id   int        No        ID of the test dataset
description  str        No        Evaluation purpose and methodology
eval_config  Any        No        Metrics, thresholds, evaluation parameters
llm_config   Any        No        Model, temperature, LLM execution settings
tags         list[str]  No        Organization tags
metadata     Any        No        Arbitrary metadata
created_by   str        No        Creator email
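
Only prompt_id and name are required; a minimal creation sketch:

from mutagent import Mutagent
from mutagent.models import PromptIdDatasetIdName

with Mutagent() as client:
    # Only the required fields; everything else uses server defaults.
    evaluation = client.prompt_evaluations.create_evaluation(
        body=PromptIdDatasetIdName(prompt_id=42, name="Minimal eval"),
    )
    print("Created evaluation:", evaluation["id"])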

Get Evaluation

with Mutagent() as client:
    evaluation = client.prompt_evaluations.get_evaluation(id_=456)
    print(evaluation["name"])
    print("Dataset:", evaluation["datasetId"])

Update Evaluation

from mutagent.models import NameDescriptionEvalConfig

with Mutagent() as client:
    updated = client.prompt_evaluations.update_evaluation(
        id_=456,
        body=NameDescriptionEvalConfig(
            description="Updated description",
        ),
    )

Delete Evaluation

with Mutagent() as client:
    client.prompt_evaluations.delete_evaluation(id_=456)

Run Evaluation

Trigger an evaluation run. Runs execute asynchronously; see Poll for Completion below for retrieving results:
with Mutagent() as client:
    run = client.prompt_evaluations.run_evaluation(id_=456)
    print("Run started:", run)

Get Results

Retrieve the execution results for an evaluation:
with Mutagent() as client:
    result = client.prompt_evaluations.get_evaluation_result(id_=456)
    print("Score:", result.get("score"))
    print("Passed:", result.get("success"))

Get Evaluation History

with Mutagent() as client:
    history = client.prompt_evaluations.get_evaluation_history(id_=456)
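
The shape of the returned history isn't documented on this page. A hedged sketch that assumes the same "data" envelope used by list_evaluations (an assumption, not a documented contract):

with Mutagent() as client:
    history = client.prompt_evaluations.get_evaluation_history(id_=456)
    # Assumption: history shares the {"data": [...]} envelope of list_evaluations.
    for entry in history.get("data", []):
        print(entry)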

Create Evaluation Version

with Mutagent() as client:
    new_version = client.prompt_evaluations.create_evaluation_version(id_=456)
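
New versions interact with the is_latest filter from List Evaluations; a sketch using only the calls documented above:

with Mutagent() as client:
    new_version = client.prompt_evaluations.create_evaluation_version(id_=456)
    # is_latest filters on the latest-version flag (see the filter table above).
    latest = client.prompt_evaluations.list_evaluations(prompt_id=42, is_latest=True)
    for eval_ in latest.get("data", []):
        print(eval_["id"], eval_["name"])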

Poll for Completion

Since evaluations run asynchronously, poll for results:
import time
from mutagent import Mutagent

def wait_for_results(eval_id: int, max_attempts: int = 30) -> dict:
    """Poll until evaluation results are available, then return them."""
    with Mutagent() as client:
        for i in range(max_attempts):
            try:
                # The result call fails until the run completes, so retry on error.
                result = client.prompt_evaluations.get_evaluation_result(id_=eval_id)
                print(f"Score: {result.get('score')} | Passed: {result.get('success')}")
                return result
            except Exception:
                print(f"Waiting for results... (attempt {i + 1}/{max_attempts})")
                time.sleep(2)

    raise TimeoutError("Timed out waiting for evaluation results")
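
A usage sketch that triggers a run and then blocks on the helper above:

with Mutagent() as client:
    client.prompt_evaluations.run_evaluation(id_=456)

result = wait_for_results(456)
print("Final score:", result.get("score"))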

Async version

import asyncio
from mutagent import AsyncMutagent

async def wait_for_results_async(eval_id: int, max_attempts: int = 30) -> dict:
    """Async variant: poll until results are available, then return them."""
    async with AsyncMutagent() as client:
        for i in range(max_attempts):
            try:
                result = await client.prompt_evaluations.get_evaluation_result(id_=eval_id)
                print(f"Score: {result.get('score')}")
                return result
            except Exception:
                # Back off without blocking the event loop.
                await asyncio.sleep(2)
    raise TimeoutError("Timed out waiting for evaluation results")
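
From synchronous code, the async helper can be driven with asyncio.run:

asyncio.run(wait_for_results_async(456))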

Method Reference

All methods live on the client.prompt_evaluations namespace.

Method                                   Description
list_evaluations(...)                    List evaluations with filters
create_evaluation(body)                  Create evaluation definition
get_evaluation(id_)                      Get evaluation by ID
update_evaluation(id_, body)             Update evaluation
delete_evaluation(id_)                   Delete evaluation
run_evaluation(id_)                      Trigger evaluation run
get_evaluation_result(id_)               Get evaluation results
get_evaluation_history(id_)              Get evaluation run history
create_evaluation_version(id_)           Create new evaluation version
get_evaluation_results_aggregated(...)   Get results aggregated by version
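
get_evaluation_results_aggregated appears only in this reference. A hedged sketch, assuming it takes the evaluation ID keyword like the other methods (the exact signature isn't shown on this page):

with Mutagent() as client:
    aggregated = client.prompt_evaluations.get_evaluation_results_aggregated(id_=456)
    # Shape of the aggregated payload is an assumption.
    print(aggregated)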