Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ const result = await runUserFlow({

### `assert(options: AssertionOptions)`

Multi-model consensus assertion. Runs Claude and Gemini in parallel; if they disagree, a third model arbitrates.
Multi-model consensus assertion. Runs Claude and Gemini in parallel; if they disagree, a third model arbitrates by default (configurable — see [Consensus Policy](#consensus-policy)).

```typescript
const result = await assert({
Expand All @@ -226,6 +226,25 @@ const result = await assert({
});
```

### Consensus Policy

When the primary (Claude) and secondary (Gemini) assertion models reach the same verdict, the result is used directly. When they **disagree**, you choose how Passmark resolves it:

| Policy | Behavior |
|---|---|
| `consult-arbiter-on-disagreement` *(default)* | Calls the arbiter model (Gemini 3.1 Pro) to break the tie. |
| `fail-on-disagreement` | Treats any disagreement as a failure immediately — no arbiter call. The returned reasoning includes each model's verdict, confidence score, and reasoning so you can inspect what they saw differently. |

Pick `fail-on-disagreement` when you'd rather surface ambiguity/flakiness in the UI under test than let a single model swing the result. Pick the default when you trust the arbiter to make the final call.

```typescript
configure({
assertions: {
consensusPolicy: "fail-on-disagreement",
},
});
```

### Video Assertions

For UI that's only visible for a second or two — toast messages, snackbar confirmations, transient banners — a single end-of-flow screenshot often misses the evidence. Set `video: true` on an assertion inside `runSteps` and Passmark will record the entire step run with `page.screencast`, upload the resulting `.webm` to Gemini's Files API, and evaluate the assertion against the full video:
Expand Down
78 changes: 78 additions & 0 deletions src/__tests__/assertion.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ vi.mock("../utils", () => ({
}));

import { assert } from "../assertion";
import { configure, resetConfig } from "../config";
import { withTimeout } from "../utils";
import { generateText } from "ai";

Expand Down Expand Up @@ -85,6 +86,7 @@ function makeGenerateTextImpl(opts: {

// Runs before every test in this file: clears recorded mock calls and resets
// the config, so a consensusPolicy set through configure() in one test does
// not carry over into tests that rely on the default policy.
// NOTE(review): resetConfig() presumably restores defaults — confirm in ../config.
beforeEach(() => {
vi.clearAllMocks();
resetConfig();
});

describe("assert consensus logic", () => {
Expand Down Expand Up @@ -233,3 +235,79 @@ describe("assert consensus logic", () => {
expect(res).toContain("✅ passed");
});
});

describe("consensusPolicy", () => {
  /** Shape of the structured verdict each mocked model returns. */
  type ModelVerdict = {
    assertionPassed: boolean;
    confidenceScore: number;
    reasoning: string;
  };

  // Stand-in for the `expect` option handed to `assert`; its `toBe` matcher is a no-op.
  const noopExpect = ((_actual: unknown, _message?: string) => ({
    toBe: (_expected: unknown) => {},
  })) as any;

  /**
   * Installs a `generateText` mock in which the "anthropic" model passes the
   * assertion and the "gemini-3-flash" model fails it, forcing a disagreement.
   * Calls routed to the "3.1-pro-preview" (arbiter) model are recorded and
   * answered with `arbiterVerdict`.
   *
   * @param arbiterVerdict Structured output the arbiter model should return.
   * @returns A probe reporting whether the arbiter model was ever invoked.
   */
  function stubDisagreeingModels(arbiterVerdict: ModelVerdict): {
    wasArbiterCalled: () => boolean;
  } {
    let arbiterHit = false;
    const reply = (verdict: ModelVerdict) => ({ output: verdict }) as any;

    vi.mocked(generateText).mockImplementation((async (args: any) => {
      const modelId = String(args.model ?? "");

      // Calls without a structured-output request get a plain text reply.
      if (!args.output) {
        return { text: "claude text" } as any;
      }

      if (modelId.includes("anthropic")) {
        return reply({
          assertionPassed: true,
          confidenceScore: 90,
          reasoning: "Claude says pass",
        });
      }

      // Checked before the generic Gemini match, mirroring the arbiter's model id.
      if (modelId.includes("3.1-pro-preview")) {
        arbiterHit = true;
        return reply({ ...arbiterVerdict });
      }

      if (modelId.includes("gemini-3-flash")) {
        return reply({
          assertionPassed: false,
          confidenceScore: 70,
          reasoning: "Gemini says fail",
        });
      }

      return reply({ assertionPassed: false, confidenceScore: 0, reasoning: "unknown" });
    }) as any);

    return { wasArbiterCalled: () => arbiterHit };
  }

  it('fails on disagreement when policy is "fail-on-disagreement" and skips the arbiter', async () => {
    configure({ assertions: { consensusPolicy: "fail-on-disagreement" } });

    const page = createMockPage();
    const arbiter = stubDisagreeingModels({
      assertionPassed: true,
      confidenceScore: 80,
      reasoning: "Arbiter should NOT be called",
    });

    const outcome = await assert({
      page,
      assertion: "The page shows 3 items",
      test: mockTest,
      expect: noopExpect,
      failSilently: true,
      maxRetries: 0, // skip the outer retry loop so we observe a single attempt
    });

    expect(arbiter.wasArbiterCalled()).toBe(false);
    for (const fragment of [
      "❌ failed",
      "Claude says pass",
      "Gemini says fail",
      "fail-on-disagreement",
    ]) {
      expect(outcome).toContain(fragment);
    }
  });

  it("still consults the arbiter on disagreement when policy is the default", async () => {
    // Deliberately no configure() call: the default "consult-arbiter-on-disagreement" applies.
    const page = createMockPage();
    const arbiter = stubDisagreeingModels({
      assertionPassed: true,
      confidenceScore: 75,
      reasoning: "Arbiter: pass",
    });

    const outcome = await assert({
      page,
      assertion: "The page shows 3 items",
      test: mockTest,
      expect: noopExpect,
      failSilently: true,
    });

    expect(arbiter.wasArbiterCalled()).toBe(true);
    for (const fragment of ["✅ passed", "Arbiter: pass"]) {
      expect(outcome).toContain(fragment);
    }
  });
});
22 changes: 21 additions & 1 deletion src/assertion.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { generateText, ModelMessage, Output } from "ai";
import { z } from "zod";
import { getModelId } from "./config";
import { getConsensusPolicy, getModelId } from "./config";
import { ASSERTION_MODEL_TIMEOUT, THINKING_BUDGET_DEFAULT } from "./constants";
import { logger } from "./logger";
import { resolveModel } from "./models";
Expand Down Expand Up @@ -312,6 +312,26 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe

// Check if models disagree on assertionPassed
if (claudeResult.assertionPassed !== geminiResult.assertionPassed) {
const policy = getConsensusPolicy();

if (policy === "fail-on-disagreement") {
logger.debug(
"Models disagree on assertion result; failing per consensusPolicy=fail-on-disagreement.",
);
const lower = Math.min(
claudeResult.confidenceScore,
geminiResult.confidenceScore,
);
return {
assertionPassed: false,
confidenceScore: Math.round(lower),
reasoning:
`Assertion failed: models disagreed and consensusPolicy is "fail-on-disagreement".\n` +
`Claude (passed=${claudeResult.assertionPassed}, ${claudeResult.confidenceScore}%): ${claudeResult.reasoning}\n` +
`Gemini (passed=${geminiResult.assertionPassed}, ${geminiResult.confidenceScore}%): ${geminiResult.reasoning}`,
};
}

logger.debug("Models disagree on assertion result, consulting arbiter...");
const arbiterResult = await withTimeout(
getArbiterDecision(claudeResult, geminiResult),
Expand Down
31 changes: 31 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,27 @@ export type RedisConfig = {
url?: string;
};

/**
* How a disagreement between the primary and secondary assertion models
* (one says pass, the other says fail) is resolved.
* - "consult-arbiter-on-disagreement" (default): a third arbiter model is
* consulted to break the tie. Best when you trust the arbiter's judgement.
* - "fail-on-disagreement": the assertion fails immediately and the arbiter
* is not called. Strictest setting — useful when you'd rather surface
* flakiness/ambiguity than risk a single model being wrong.
*/
export type ConsensusPolicy =
| "consult-arbiter-on-disagreement"
| "fail-on-disagreement";

/** Settings for the multi-model assertion consensus engine (the `assertions` key of the config). */
export type AssertionsConfig = {
/**
* How to resolve disagreements between the primary and secondary
* assertion models. Optional: when omitted, `getConsensusPolicy()` falls
* back to "consult-arbiter-on-disagreement".
*/
consensusPolicy?: ConsensusPolicy;
};

export type TelemetryConfig = {
/**
* Axiom API token for OpenTelemetry tracing of AI calls.
Expand All @@ -98,6 +119,8 @@ type Config = {
redis?: RedisConfig;
/** Telemetry (Axiom) connection. When omitted, falls back to `AXIOM_TOKEN`/`AXIOM_DATASET` env vars. */
telemetry?: TelemetryConfig;
/** Behavior of the multi-model assertion consensus engine. */
assertions?: AssertionsConfig;
/**
* Directory used to temporarily store video recordings for video-flagged
* assertions. Defaults to `/tmp/passmark-recordings`. Files are deleted
Expand Down Expand Up @@ -161,6 +184,14 @@ export function getMode(): AIMode {
return getConfig().ai?.mode ?? "snapshot";
}

/**
 * Resolves the consensus policy currently in effect.
 *
 * @returns The `assertions.consensusPolicy` value from the active config, or
 * "consult-arbiter-on-disagreement" when none is set, so configs that do not
 * mention a policy keep the arbiter behavior.
 */
export function getConsensusPolicy(): ConsensusPolicy {
  const configuredPolicy = getConfig().assertions?.consensusPolicy;
  return configuredPolicy ?? "consult-arbiter-on-disagreement";
}

/**
* Effective AI config for a single step / call after merging overrides with
* the global config. `getModelId` looks up a model with the same precedence
Expand Down