Skip to main content

Evaluations SDK

Run and manage prompt evaluations.

List Evaluations

// Request a page of at most 20 evaluations and pull the list out of the response.
const { evaluations } = await client.promptEvaluations.getApiPromptsEvaluations({
  limit: 20,
});

// `evaluations` may be absent from the response, so only iterate when it is present.
if (evaluations) {
  for (const item of evaluations) {
    console.log(item.name, '-', item.status);
  }
}

Create Evaluation

// Create an evaluation that ties a prompt group to a dataset.
const evaluation = await client.promptEvaluations.postApiPromptsEvaluations({
  // Placeholder identifiers: substitute your own prompt group and dataset.
  promptGroupId: 'uuid-xxxx',
  datasetId: 123,
  name: 'Quality Check',
  // Metrics to compute for this evaluation.
  evalConfig: {
    metrics: ['g_eval', 'semantic_similarity'],
  },
  // Model settings for the evaluation.
  // NOTE(review): temperature 0 is presumably chosen for repeatable scoring; confirm.
  llmConfig: {
    model: 'gpt-5.1',
    temperature: 0,
  },
});

// The response carries the id of the newly created evaluation.
console.log('Created evaluation:', evaluation.id);

Get Evaluation

// Id of the evaluation to look up (placeholder value).
const evaluationId = 456;

// Fetch a single evaluation by id.
const evaluation = await client.promptEvaluations.getApiPromptsEvaluationsById({
  id: evaluationId,
});

// Print its name, the dataset it runs against, and its evaluation config.
console.log(evaluation.name);
console.log('Dataset:', evaluation.datasetId);
console.log('Config:', evaluation.evalConfig);

Run Evaluation

// Id of the evaluation to execute (placeholder value).
const evaluationToRun = 456;

// Start a run; the response reports the run id and its status.
const run = await client.promptEvaluations.postApiPromptsEvaluationsByIdRun({
  id: evaluationToRun,
});

console.log('Run started:', run.runId);
console.log('Status:', run.status);

Get Results

// Fetch the result record for evaluation 456.
const results = await client.promptEvaluations.getApiPromptsEvaluationsByIdResult({
  id: 456,
});

console.log('Overall Score:', results.score);
console.log('Items evaluated:', results.itemCount);

// `metrics` is optional; fall back to an empty list so the loop is a no-op when it is absent.
for (const metric of results.metrics ?? []) {
  // Each metric score is printed with two decimal places.
  console.log(`${metric.name}: ${metric.score.toFixed(2)}`);
}

Poll for Completion

/**
 * Polls the result endpoint for an evaluation until it reaches a terminal
 * status.
 *
 * Resolves once the status is `'completed'` (after logging the score) and
 * rejects when the status is `'failed'`. While the evaluation is still in
 * progress, the current progress value is logged and the function sleeps for
 * `intervalMs` before polling again.
 *
 * @param evalId - Id of the evaluation whose result is polled.
 * @param options - Optional polling settings.
 *   `intervalMs` is the delay between polls (default 2000 ms).
 *   `timeoutMs` is the maximum total time to keep polling; the default of
 *   `Infinity` preserves the original "poll until terminal" behaviour.
 * @throws Error when the evaluation reports `'failed'`, or when `timeoutMs`
 *   elapses before a terminal status is seen.
 */
async function waitForEvaluation(
  evalId: number,
  options: { intervalMs?: number; timeoutMs?: number } = {},
): Promise<void> {
  const { intervalMs = 2000, timeoutMs = Infinity } = options;
  // With the default Infinity timeout the deadline is never reached.
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const results = await client.promptEvaluations.getApiPromptsEvaluationsByIdResult({
      id: evalId,
    });

    if (results.status === 'completed') {
      console.log('Done! Score:', results.score);
      return;
    }

    if (results.status === 'failed') {
      // `error` is optional on the result; avoid a literal "undefined" in the message.
      throw new Error('Evaluation failed: ' + (results.error ?? 'unknown error'));
    }

    // Checked after the poll so that at least one request is always made.
    if (Date.now() >= deadline) {
      throw new Error(
        `Timed out after ${timeoutMs} ms waiting for evaluation ${evalId}`,
      );
    }

    console.log('Progress:', results.progress, '%');
    await new Promise<void>(resolve => setTimeout(resolve, intervalMs));
  }
}

Type Definitions

/**
 * An evaluation definition: which prompt group is evaluated against which
 * dataset, and with what configuration.
 */
interface Evaluation {
  // --- Identity ---

  /** Numeric identifier of the evaluation. */
  id: number;
  /** Display name of the evaluation. */
  name: string;
  /** Optional free-text description. */
  description?: string;

  // --- What is evaluated ---

  /** Identifier of the prompt group under evaluation. */
  promptGroupId: string;
  /** Identifier of the dataset the evaluation runs against. */
  datasetId: number;

  // --- Configuration (shapes are not specified by this type) ---

  /** Evaluation settings; typed as `unknown`, so narrow before reading. */
  evalConfig: unknown;
  /** LLM settings; typed as `unknown`, so narrow before reading. */
  llmConfig: unknown;

  // --- Bookkeeping ---

  /** Tags attached to the evaluation. */
  tags: string[];
  /** Arbitrary metadata; typed as `unknown`, so narrow before reading. */
  metadata: unknown;
  /** Creation timestamp as a string. NOTE(review): format not shown here; confirm. */
  createdAt: string;
  /** Creator of the evaluation. NOTE(review): id vs. display name not shown here; confirm. */
  createdBy: string;
}

/** Lifecycle states a result record can report. */
type EvaluationStatus = 'pending' | 'running' | 'completed' | 'failed';

/** Score for a single named metric. */
interface EvaluationMetricScore {
  name: string;
  score: number;
}

/**
 * Result record for one run of an evaluation.
 *
 * Only the identifiers and `status` are always present; every other field is
 * optional and must be checked before use.
 */
interface EvaluationResult {
  /** Id of the evaluation this result belongs to. */
  evaluationId: number;
  /** Id of the run that produced this result. */
  runId: string;
  /** Current lifecycle state of the run. */
  status: EvaluationStatus;
  /** Progress value, when reported. NOTE(review): presumably a percentage; confirm. */
  progress?: number;
  /** Overall score, when available. */
  score?: number;
  /** Per-metric scores, when available. */
  metrics?: EvaluationMetricScore[];
  /** Number of items evaluated, when available. */
  itemCount?: number;
  /** Error description, when available. */
  error?: string;
}

Method Reference

| Method | Description |
| --- | --- |
| `getApiPromptsEvaluations()` | List evaluations |
| `postApiPromptsEvaluations({ ...data })` | Create evaluation |
| `getApiPromptsEvaluationsById({ id })` | Get evaluation |
| `postApiPromptsEvaluationsByIdRun({ id })` | Run evaluation |
| `getApiPromptsEvaluationsByIdResult({ id })` | Get results |