Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 63 additions & 7 deletions app/pages/evals.vue
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import rawData from '~~/public/agent-results.json'
// Resolve registered components by name so they can be passed to h() in the
// render functions of the table column definitions (UTooltip, UIcon and UBadge
// are used that way further down; UButton and UAvatar presumably likewise — confirm).
const UButton = resolveComponent('UButton')
const UBadge = resolveComponent('UBadge')
const UAvatar = resolveComponent('UAvatar')
const UTooltip = resolveComponent('UTooltip')
const UIcon = resolveComponent('UIcon')

definePageMeta({
heroBackground: 'opacity-70 -z-10'
Expand All @@ -19,6 +21,10 @@ interface EvalResultItem {
duration: number
evalPath: string
timestamp: string
firstRunSuccess?: boolean
passedRuns?: number
totalRuns?: number
passRate?: number
}
}

Expand All @@ -28,6 +34,8 @@ interface Experiment {
modelName: string
agentHarness: string
avgDuration?: number
passAt1?: number
avgPassRate?: number
}

interface ModelRow {
Expand All @@ -36,6 +44,7 @@ interface ModelRow {
timestamp: string
totalEvals: number
successRate: number
passAt1?: number
avgDuration: number
evals: EvalResultItem[]
}
Expand Down Expand Up @@ -71,9 +80,12 @@ const experimentMap = computed(() => {
return map
})

// Sort by success rate, then most recent run date first
// Sort by success rate, then first-try rate (tiebreak), then most recent run date first
/**
 * Comparator for leaderboard rows, best row first.
 *
 * Ordering keys, in priority:
 * 1. `successRate`, descending.
 * 2. `passAt1` (first-try rate), descending. A row without `passAt1` is treated
 *    as -1, so it sorts after any row that has one.
 * 3. `timestamp`, most recent first. A timestamp that does not parse as a date
 *    sorts last.
 *
 * @param a - first row to compare
 * @param b - second row to compare
 * @returns a negative number when `a` goes first, a positive number when `b`
 *          goes first, and 0 when the rows tie on every key; never `NaN`
 */
function sortRows(a: ModelRow, b: ModelRow): number {
  if (b.successRate !== a.successRate) return b.successRate - a.successRate

  const aPassAt1 = a.passAt1 ?? -1
  const bPassAt1 = b.passAt1 ?? -1
  if (bPassAt1 !== aPassAt1) return bPassAt1 - aPassAt1

  // An empty or malformed timestamp parses to NaN, and a NaN difference makes the
  // comparator inconsistent. Map it to -Infinity so such rows order as "oldest".
  const toMillis = (timestamp: string): number => {
    const ms = new Date(timestamp).getTime()
    return Number.isNaN(ms) ? Number.NEGATIVE_INFINITY : ms
  }
  const aTime = toMillis(a.timestamp)
  const bTime = toMillis(b.timestamp)

  // Check equality first: (-Infinity) - (-Infinity) would be NaN again.
  if (aTime === bTime) return 0
  return bTime - aTime
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

Expand All @@ -90,6 +102,7 @@ const allResults = computed<ModelRow[]>(() => {
timestamp: experiment?.timestamp || '',
totalEvals: evals.length,
successRate: evals.length ? Math.round((successes / evals.length) * 100) : 0,
passAt1: experiment?.passAt1,
avgDuration: experiment?.avgDuration ?? 0,
evals
})
Expand Down Expand Up @@ -127,11 +140,15 @@ const filteredResults = computed(() => {
rows = rows.map((r) => {
const evals = r.evals.filter(e => selectedCategories.value.includes(getEvalCategory(e.evalPath)))
const successes = evals.filter(e => e.result.success).length
const firstTries = evals.filter(e => e.result.firstRunSuccess).length
return {
...r,
evals,
totalEvals: evals.length,
successRate: evals.length ? Math.round((successes / evals.length) * 100) : 0
successRate: evals.length ? Math.round((successes / evals.length) * 100) : 0,
// Recompute first-try rate from the filtered subset so the column and sort tiebreak
// match the shown evals; keep undefined for older data that lacks firstRunSuccess.
passAt1: r.passAt1 != null && evals.length ? firstTries / evals.length : undefined
}
}).sort(sortRows)
}
Expand All @@ -152,7 +169,8 @@ const modelIconMap: Record<string, string> = {
gpt: 'i-simple-icons-openai',
cursor: 'i-simple-icons-cursor',
gemini: 'i-simple-icons-googlegemini',
devstral: 'i-simple-icons-mistralai'
devstral: 'i-simple-icons-mistralai',
minimax: 'i-simple-icons-minimax'
}

function getModelIcon(model: string): string {
Expand Down Expand Up @@ -242,6 +260,22 @@ const columns: TableColumn<ModelRow>[] = [
}
},
cell: ({ row }) => h('span', {}, `${row.original.successRate}%`)
},
{
accessorKey: 'passAt1',
header: () => h(UTooltip, {
text: 'Passed on the first attempt. Each eval allows up to 4 attempts; Success counts an eval as passed if any attempt succeeds.'
}, () => h('span', { class: 'inline-flex items-center gap-1' }, [
h('span', {}, 'First-Try Rate'),
h(UIcon, { name: 'i-lucide-info', class: 'size-3.5 text-dimmed' })
])),
meta: {
class: {
th: 'text-right',
td: 'text-right text-muted'
}
},
cell: ({ row }) => h('span', {}, row.original.passAt1 != null ? `${Math.round(row.original.passAt1 * 100)}%` : '—')
}
]

Expand All @@ -260,10 +294,24 @@ const evalColumns: TableColumn<EvalResultItem>[] = [
td: 'text-center'
}
},
cell: ({ row }) => h(UBadge, {
color: row.original.result.success ? 'success' : 'error',
variant: 'subtle'
}, () => row.original.result.success ? 'Pass' : 'Fail')
cell: ({ row }) => {
const { success, passedRuns, totalRuns } = row.original.result
const children = [
h(UBadge, {
color: success ? 'success' : 'error',
variant: 'subtle'
}, () => success ? 'Pass' : 'Fail')
]
if (totalRuns && totalRuns > 1) {
children.push(h(UTooltip, {
text: `${passedRuns} of ${totalRuns} attempts passed`
}, () => h(UBadge, {
color: 'neutral',
variant: 'subtle'
}, () => `${passedRuns}/${totalRuns}`)))
}
return h('div', { class: 'flex items-center justify-center gap-1.5' }, children)
}
},
{
id: 'duration',
Expand Down Expand Up @@ -371,6 +419,14 @@ const evalColumns: TableColumn<EvalResultItem>[] = [
/>
</template>
</UTable>

<div class="mt-4 text-sm text-dimmed text-pretty text-justify bg-elevated/50 p-4 rounded-lg">
Each evaluation is attempted up to 4 times.
<span class="text-default font-medium">Success Rate</span> is the percentage of evals that passed on at least one attempt;
<span class="text-default font-medium">First-Try Rate</span> is the percentage that passed on the first attempt, used to break ties between models with the same success rate.
<span class="text-default font-medium">Avg Duration</span> is the mean time an agent took per eval. Expand a row to see per-eval results, where a
<span class="text-default font-medium">1/3</span> badge means the eval failed twice before passing.
</div>
</UContainer>
</UPageBody>
</div>
Expand Down
Loading