From 7e2f26710e18bf6da942427d9c9e93a59b43713e Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Mon, 30 Mar 2026 18:49:36 -0700 Subject: [PATCH 1/2] add native agent file uploads --- README.md | 3 + .../lib/v3/agent/prompts/agentSystemPrompt.ts | 16 +- packages/core/lib/v3/agent/tools/index.ts | 34 +++- packages/core/lib/v3/agent/tools/upload.ts | 121 +++++++++++++++ .../core/lib/v3/agent/utils/actionMapping.ts | 40 +++++ packages/core/lib/v3/cache/AgentCache.ts | 21 +++ packages/core/lib/v3/types/private/cache.ts | 8 + packages/core/lib/v3/types/public/agent.ts | 10 +- packages/core/lib/v3/v3.ts | 19 ++- .../integration/agent-hybrid-mode.spec.ts | 7 + .../integration/agent-upload-tool.spec.ts | 145 ++++++++++++++++++ .../agent-system-prompt-variables.test.ts | 1 + .../unit/agent-upload-tool-execute.test.ts | 117 ++++++++++++++ .../tests/unit/agent-upload-tools.test.ts | 41 +++++ packages/docs/v3/basics/agent.mdx | 23 ++- .../prompting-best-practices.mdx | 6 +- packages/docs/v3/references/agent.mdx | 10 +- 17 files changed, 607 insertions(+), 15 deletions(-) create mode 100644 packages/core/lib/v3/agent/tools/upload.ts create mode 100644 packages/core/tests/integration/agent-upload-tool.spec.ts create mode 100644 packages/core/tests/unit/agent-upload-tool-execute.test.ts create mode 100644 packages/core/tests/unit/agent-upload-tools.test.ts diff --git a/README.md b/README.md index 56dda5f6d3..caddceca03 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,9 @@ await stagehand.act("click on the stagehand repo"); // Use agent() for multi-step tasks const agent = stagehand.agent(); await agent.execute("Get to the latest PR"); +await agent.execute( + 'Upload the file at "/Users/me/Documents/resume.pdf" to the resume file input', +); // Use extract() to get structured data from the page const { author, title } = await stagehand.extract( diff --git a/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts b/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts index d5b6637e8b..8a7ca3dc8f 100644 --- a/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts +++ b/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts @@ -60,6 +60,11 @@ function buildToolsSection( description: "Perform a specific atomic action (click, type, etc.) - ONLY use when element is in ariaTree but NOT visible in screenshot. Less reliable but can interact with out-of-viewport elements.", }, + { + name: "upload", + description: + "Upload one or more local files into a file input when the user has provided a file path", + }, { name: "dragAndDrop", description: "Drag and drop an element" }, { name: "clickAndHold", description: "Click and hold on an element" }, { name: "keys", description: "Press a keyboard key" }, @@ -89,6 +94,11 @@ function buildToolsSection( name: "act", description: "Perform a specific atomic action (click, type)", }, + { + name: "upload", + description: + "Upload one or more local files into a file input when the user has provided a file path", + }, { name: "keys", description: "Press a keyboard key" }, { name: "fillForm", description: "Fill out a form" }, { name: "think", description: "Think about the task" }, @@ -147,6 +157,7 @@ export function buildAgentSystemPrompt( `Tool selection priority: Use specific tools (click, type) when elements are visible in viewport for maximum reliability.`, `Always use screenshot to get proper grounding of the coordinates you want to type/click into.`, `When interacting with an input, always use the type tool to type into the input, over clicking and then typing into it.`, + `When the task requires uploading a file and a local path is available, use the upload tool instead of clicking the visible upload button.`, `Use ariaTree as a secondary tool when elements aren't visible in screenshot or to get full page context.`, `Only use act when element is in ariaTree but NOT visible in screenshot.`, ] @@ -154,6 +165,7 @@ export function buildAgentSystemPrompt( `Tool selection priority: Use act tool for all clicking and typing on a page.`, `Always check ariaTree first to understand full page content without scrolling - it shows all elements including those below the fold.`, `When interacting with an input, always use the act tool to type into the input, over clicking and then typing.`, + `When the task requires uploading a file and a local path is available, use the upload tool instead of clicking the visible upload button.`, `If an element is present in the ariaTree, use act to interact with it directly - this eliminates the need to scroll.`, `Use screenshot for visual confirmation when needed, but rely primarily on ariaTree for element detection.`, ]; @@ -213,8 +225,8 @@ export function buildAgentSystemPrompt( // Build variables section only if variables are provided const hasVariables = variables && Object.keys(variables).length > 0; const variableToolsNote = isHybridMode - ? "Use %variableName% syntax in the type, fillFormVision, or act tool's value/text/action fields." - : "Use %variableName% syntax in the act or fillForm tool's action fields."; + ? "Use %variableName% syntax in the type, fillFormVision, act, or upload tool's text/action/path fields." + : "Use %variableName% syntax in the act, fillForm, or upload tool's action/path fields."; const variableEntries = getVariablePromptEntries(variables); const variablesSection = hasVariables ? ` diff --git a/packages/core/lib/v3/agent/tools/index.ts b/packages/core/lib/v3/agent/tools/index.ts index ecaba3fad4..cd1e4b1c9d 100644 --- a/packages/core/lib/v3/agent/tools/index.ts +++ b/packages/core/lib/v3/agent/tools/index.ts @@ -16,6 +16,7 @@ import { fillFormVisionTool } from "./fillFormVision.js"; import { thinkTool } from "./think.js"; import { searchTool as browserbaseSearchTool } from "./browserbaseSearch.js"; import { searchTool as braveSearchTool } from "./braveSearch.js"; +import { uploadTool } from "./upload.js"; import type { ToolSet, InferUITools } from "ai"; import type { V3 } from "../../v3.js"; @@ -33,7 +34,7 @@ export interface V3AgentToolOptions { logger?: (message: LogLine) => void; /** * Tool mode determines which set of tools are available. - * - 'dom' (default): Uses DOM-based tools (act, fillForm) - removes coordinate-based tools + * - 'dom' (default): Uses DOM-based tools (act, fillForm, upload) - removes coordinate-based tools * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, etc.) - removes fillForm */ mode?: AgentToolMode; @@ -156,6 +157,8 @@ export function createAgentTools(v3: V3, options?: V3AgentToolOptions) { extract: "— try using a smaller or simpler schema", fillForm: "(it may continue executing in the background) — try filling fewer fields at once or use a different tool", + upload: + "— make sure the path exists locally and the target describes the actual file input element", }; const unwrappedTools: ToolSet = { @@ -173,6 +176,7 @@ export function createAgentTools(v3: V3, options?: V3AgentToolOptions) { screenshot: screenshotTool(v3), scroll: mode === "hybrid" ? scrollVisionTool(v3, provider) : scrollTool(v3), type: typeTool(v3, provider, variables), + upload: uploadTool(v3, executionModel, variables, toolTimeout), }; if (options?.useSearch && options.browserbaseApiKey) { @@ -206,6 +210,33 @@ export function createAgentTools(v3: V3, options?: V3AgentToolOptions) { export type AgentTools = ReturnType; +export function createCuaAgentTools( + v3: V3, + tools: ToolSet = {}, + options?: Pick< + V3AgentToolOptions, + "executionModel" | "toolTimeout" | "variables" + >, +): ToolSet { + const builtInUploadTool = wrapToolWithTimeout( + uploadTool( + v3, + options?.executionModel, + options?.variables, + options?.toolTimeout, + ), + "upload()", + v3, + options?.toolTimeout, + "— make sure the path exists locally and the target describes the actual file input element", + ); + + return { + upload: builtInUploadTool, + ...tools, + }; +} + /** * Type map of all agent tools for strong typing of tool calls and results. * Note: `search` is optional — enabled via useSearch: true (Browserbase) or BRAVE_API_KEY env var (legacy). @@ -229,6 +260,7 @@ export type AgentToolTypesMap = { | ReturnType; think: ReturnType; type: ReturnType; + upload: ReturnType; wait: ReturnType; }; diff --git a/packages/core/lib/v3/agent/tools/upload.ts b/packages/core/lib/v3/agent/tools/upload.ts new file mode 100644 index 0000000000..9aba347df3 --- /dev/null +++ b/packages/core/lib/v3/agent/tools/upload.ts @@ -0,0 +1,121 @@ +import path from "path"; +import { tool } from "ai"; +import { z } from "zod"; +import type { V3 } from "../../v3.js"; +import type { AgentModelConfig, Variables } from "../../types/public/agent.js"; +import { substituteVariables } from "../utils/variables.js"; +import { TimeoutError } from "../../types/public/sdkErrors.js"; + +const uploadInputInstruction = (target: string) => + `Find the actual element for ${target}. Return the real upload input element itself, not a visible button, wrapper, label, or drag-and-drop container.`; + +function summarizePaths(paths: string[]): string[] { + return paths.map((filePath) => path.basename(filePath)); +} + +export const uploadTool = ( + v3: V3, + executionModel?: string | AgentModelConfig, + variables?: Variables, + toolTimeout?: number, +) => { + const hasVariables = variables && Object.keys(variables).length > 0; + const availableVariables = hasVariables + ? Object.keys(variables).join(", ") + : ""; + + return tool({ + description: + "Upload one or more local files into a file input. Use this instead of clicking upload buttons when the user has provided a file path.", + inputSchema: z.object({ + target: z + .string() + .describe( + 'Describe the actual file input target, e.g. "resume file input", "CV upload field", or "profile photo uploader".', + ), + paths: z + .array(z.string().min(1)) + .min(1, "Provide at least one local file path to upload") + .describe( + hasVariables + ? `One or more local file paths to upload. You may use %variableName% placeholders. Available variables: ${availableVariables}` + : "One or more local file paths to upload.", + ), + }), + execute: async ({ target, paths }) => { + try { + const resolvedPaths = paths.map((filePath) => + substituteVariables(filePath, variables).trim(), + ); + const summarizedPaths = summarizePaths(resolvedPaths); + + v3.logger({ + category: "agent", + message: "Agent calling tool: upload", + level: 1, + auxiliary: { + target: { + value: target, + type: "string", + }, + files: { + value: JSON.stringify(summarizedPaths), + type: "object", + }, + }, + }); + + const observeOptions = executionModel + ? { model: executionModel, variables, timeout: toolTimeout } + : { variables, timeout: toolTimeout }; + const matches = await v3.observe( + uploadInputInstruction(target), + observeOptions, + ); + const fileInput = matches.find( + (match) => + typeof match.selector === "string" && + match.selector !== "not-supported", + ); + + if (!fileInput?.selector) { + return { + success: false, + error: `Could not find a file input for ${target}. Ask the agent to target the actual upload input field.`, + }; + } + + const page = await v3.context.awaitActivePage(); + const uploadValue = + resolvedPaths.length === 1 ? resolvedPaths[0]! : resolvedPaths; + + await page.deepLocator(fileInput.selector).setInputFiles(uploadValue); + + if (v3.isAgentReplayActive()) { + v3.recordAgentReplayStep({ + type: "upload", + target, + selector: fileInput.selector, + paths: resolvedPaths, + }); + } + + return { + success: true, + target: fileInput.description || target, + selector: fileInput.selector, + files: summarizedPaths, + fileCount: resolvedPaths.length, + }; + } catch (error) { + if (error instanceof TimeoutError) { + throw error; + } + return { + success: false, + error: error instanceof Error ? error.message : String(error), + }; + } + }, + }); +}; diff --git a/packages/core/lib/v3/agent/utils/actionMapping.ts b/packages/core/lib/v3/agent/utils/actionMapping.ts index 866a4a0fa4..bde21544f8 100644 --- a/packages/core/lib/v3/agent/utils/actionMapping.ts +++ b/packages/core/lib/v3/agent/utils/actionMapping.ts @@ -38,6 +38,8 @@ export function mapToolResultToActions({ return mapActToolResult(toolResult, args, reasoning); case "fillForm": return mapFillFormToolResult(toolResult, args, reasoning); + case "upload": + return [mapUploadToolResult(toolResult, args, reasoning)]; default: return [createStandardAction(toolCallName, toolResult, args, reasoning)]; } @@ -111,6 +113,44 @@ function mapFillFormToolResult( return actions; } +function mapUploadToolResult( + toolResult: unknown, + args: Record, + reasoning?: string, +): AgentAction { + const action: AgentAction = { + type: "upload", + reasoning, + taskCompleted: false, + target: args.target, + }; + + if (!toolResult || typeof toolResult !== "object") { + return action; + } + + const result = toolResult as Record; + const output = (result.output as Record) || result; + + if (typeof output.selector === "string") { + action.selector = output.selector; + } + if (Array.isArray(output.files)) { + action.files = output.files; + } + if (typeof output.fileCount === "number") { + action.fileCount = output.fileCount; + } + if (typeof output.success === "boolean") { + action.success = output.success; + } + if (typeof output.error === "string") { + action.error = output.error; + } + + return action; +} + function createStandardAction( toolCallName: string, toolResult: unknown, diff --git a/packages/core/lib/v3/cache/AgentCache.ts b/packages/core/lib/v3/cache/AgentCache.ts index e922be6c4f..e9d6f68ac8 100644 --- a/packages/core/lib/v3/cache/AgentCache.ts +++ b/packages/core/lib/v3/cache/AgentCache.ts @@ -9,6 +9,7 @@ import type { AgentReplayNavBackStep, AgentReplayScrollStep, AgentReplayStep, + AgentReplayUploadStep, AgentReplayWaitStep, CachedAgentEntry, SanitizedAgentExecuteOptions, @@ -650,6 +651,9 @@ export class AgentCache { case "goto": await this.replayAgentGotoStep(step as AgentReplayGotoStep, ctx); return step; + case "upload": + await this.replayAgentUploadStep(step as AgentReplayUploadStep, ctx); + return step; case "scroll": await this.replayAgentScrollStep(step as AgentReplayScrollStep, ctx); return step; @@ -771,6 +775,23 @@ export class AgentCache { await page.goto(step.url, { waitUntil: step.waitUntil ?? "load" }); } + private async replayAgentUploadStep( + step: AgentReplayUploadStep, + ctx: V3Context, + ): Promise { + const page = await ctx.awaitActivePage(); + await waitForCachedSelector({ + page, + selector: step.selector, + timeout: this.domSettleTimeoutMs, + logger: this.logger, + context: "upload", + }); + const uploadValue = + step.paths.length === 1 ? (step.paths[0] ?? "") : step.paths; + await page.deepLocator(step.selector).setInputFiles(uploadValue); + } + private async replayAgentScrollStep( step: AgentReplayScrollStep, ctx: V3Context, diff --git a/packages/core/lib/v3/types/private/cache.ts b/packages/core/lib/v3/types/private/cache.ts index 074f4e59b7..9ea7541bf7 100644 --- a/packages/core/lib/v3/types/private/cache.ts +++ b/packages/core/lib/v3/types/private/cache.ts @@ -87,6 +87,7 @@ export type AgentReplayStep = | AgentReplayActStep | AgentReplayFillFormStep | AgentReplayGotoStep + | AgentReplayUploadStep | AgentReplayScrollStep | AgentReplayWaitStep | AgentReplayNavBackStep @@ -115,6 +116,13 @@ export interface AgentReplayGotoStep { waitUntil?: LoadState; } +export interface AgentReplayUploadStep { + type: "upload"; + target: string; + selector: string; + paths: string[]; +} + export interface AgentReplayScrollStep { type: "scroll"; deltaX?: number; diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts index 7278e0c2d1..03253b0022 100644 --- a/packages/core/lib/v3/types/public/agent.ts +++ b/packages/core/lib/v3/types/public/agent.ts @@ -308,6 +308,7 @@ export interface AgentExecuteOptionsBase { * **DOM mode (default):** * - `act` - Perform semantic actions (click, type, etc.) * - `fillForm` - Fill form fields using DOM selectors + * - `upload` - Upload local files into file inputs * - `ariaTree` - Get accessibility tree of the page * - `extract` - Extract structured data from page * - `goto` - Navigate to a URL @@ -327,6 +328,7 @@ export interface AgentExecuteOptionsBase { * - `clickAndHold` - Click and hold at coordinates * - `fillFormVision` - Fill forms using vision/coordinates * - `act` - Perform semantic actions + * - `upload` - Upload local files into file inputs * - `ariaTree` - Get accessibility tree * - `extract` - Extract data from page * - `goto` - Navigate to URL @@ -373,7 +375,7 @@ export interface AgentExecuteOptionsBase { /** * Variables that the agent can use when filling forms or typing text. * The agent will see variable names and descriptions in the system prompt, - * and can use them via `%variableName%` syntax in act/type/fillForm tool calls. + * and can use them via `%variableName%` syntax in act/type/fillForm/upload tool calls. * * Accepts both simple values and rich objects with descriptions (same type as `act`). * @@ -397,7 +399,7 @@ export interface AgentExecuteOptionsBase { * Timeout in milliseconds for each agent tool call. * If a tool call exceeds this duration, it will be aborted and * reported back to the LLM as a timeout error so it can retry or adjust. - * For tools that call v3 methods (act, extract, fillForm, ariaTree), the + * For tools that call v3 methods (act, extract, fillForm, upload, ariaTree), the * timeout is also forwarded to the underlying v3 call for true cancellation. * @default 45000 (45 seconds) */ @@ -635,7 +637,7 @@ export type AgentModelConfig = { /** * Agent tool mode determines which set of tools are available to the agent. - * - 'dom': Uses DOM-based tools (act, fillForm) - better for structured page interactions + * - 'dom': Uses DOM-based tools (act, fillForm, upload) - better for structured page interactions * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, etc.) - better for visual/screenshot-based interactions * - 'cua': Uses Computer Use Agent (CUA) providers like Anthropic Claude or Google Gemini for screenshot-based automation */ @@ -677,7 +679,7 @@ export type AgentConfig = { stream?: boolean; /** * Tool mode for the agent. Determines which set of tools are available. - * - 'dom' (default): Uses DOM-based tools (act, fillForm) for structured interactions + * - 'dom' (default): Uses DOM-based tools (act, fillForm, upload) for structured interactions * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, clickAndHold, fillFormVision) * for visual/screenshot-based interactions * - 'cua': Uses Computer Use Agent (CUA) providers for screenshot-based automation diff --git a/packages/core/lib/v3/v3.ts b/packages/core/lib/v3/v3.ts index a3bd060398..bafd74690c 100644 --- a/packages/core/lib/v3/v3.ts +++ b/packages/core/lib/v3/v3.ts @@ -21,6 +21,7 @@ import { ObserveHandler } from "./handlers/observeHandler.js"; import { V3AgentHandler } from "./handlers/v3AgentHandler.js"; import { V3CuaAgentHandler } from "./handlers/v3CuaAgentHandler.js"; import { CAPTCHA_CUA_SYSTEM_PROMPT_NOTE } from "./agent/utils/captchaSolver.js"; +import { createCuaAgentTools } from "./agent/tools/index.js"; import { createBrowserbaseSession } from "./launch/browserbase.js"; import { launchLocalChrome } from "./launch/local.js"; import { LLMClient } from "./llm/LLMClient.js"; @@ -79,6 +80,9 @@ import { StagehandInitError, AgentStreamResult, } from "./types/public/index.js"; + +const AGENT_UPLOAD_SYSTEM_PROMPT_NOTE = + "\nIf the task requires uploading a file and the user has provided a local file path, use the upload tool instead of clicking the visible upload button. Target the actual file input element."; import { V3Context } from "./understudy/context.js"; import { Page } from "./understudy/page.js"; import { resolveModel } from "../modelUtils.js"; @@ -1898,9 +1902,21 @@ export class V3 { : null, }); - const tools = options?.integrations + const resolvedUserTools = options?.integrations ? await resolveTools(options.integrations, options.tools) : (options?.tools ?? {}); + const tools = createCuaAgentTools(this, resolvedUserTools, { + executionModel: options?.executionModel ?? options?.model, + variables: + typeof instructionOrOptions === "object" + ? instructionOrOptions.variables + : undefined, + toolTimeout: + typeof instructionOrOptions === "object" + ? (instructionOrOptions.toolTimeout ?? + DEFAULT_AGENT_TOOL_TIMEOUT_MS) + : DEFAULT_AGENT_TOOL_TIMEOUT_MS, + }); const handler = new V3CuaAgentHandler( this, @@ -1911,6 +1927,7 @@ export class V3 { userProvidedInstructions: (options.systemPrompt ?? `You are a helpful assistant that can use a web browser.\nDo not ask follow up questions, the user will trust your judgement.`) + + AGENT_UPLOAD_SYSTEM_PROMPT_NOTE + (this.isCaptchaAutoSolveEnabled ? CAPTCHA_CUA_SYSTEM_PROMPT_NOTE : ""), diff --git a/packages/core/tests/integration/agent-hybrid-mode.spec.ts b/packages/core/tests/integration/agent-hybrid-mode.spec.ts index 4557d55b5e..a54eea02cd 100644 --- a/packages/core/tests/integration/agent-hybrid-mode.spec.ts +++ b/packages/core/tests/integration/agent-hybrid-mode.spec.ts @@ -27,6 +27,7 @@ test.describe("Stagehand agent hybrid mode", () => { // DOM mode should have these tools expect(tools).toHaveProperty("act"); expect(tools).toHaveProperty("fillForm"); + expect(tools).toHaveProperty("upload"); expect(tools).toHaveProperty("ariaTree"); expect(tools).toHaveProperty("screenshot"); expect(tools).toHaveProperty("extract"); @@ -57,6 +58,7 @@ test.describe("Stagehand agent hybrid mode", () => { // Hybrid mode should also have common tools expect(tools).toHaveProperty("act"); + expect(tools).toHaveProperty("upload"); expect(tools).toHaveProperty("ariaTree"); expect(tools).toHaveProperty("screenshot"); expect(tools).toHaveProperty("extract"); @@ -76,6 +78,7 @@ test.describe("Stagehand agent hybrid mode", () => { // Should behave like DOM mode expect(tools).toHaveProperty("fillForm"); + expect(tools).toHaveProperty("upload"); expect(tools).not.toHaveProperty("click"); expect(tools).not.toHaveProperty("type"); }); @@ -93,10 +96,12 @@ test.describe("Stagehand agent hybrid mode", () => { expect(prompt).toContain("ariaTree"); expect(prompt).toContain("act"); expect(prompt).toContain("fillForm"); + expect(prompt).toContain("upload"); // Should have DOM-specific strategy expect(prompt).toContain("Use act tool for all clicking and typing"); expect(prompt).toContain("Always check ariaTree first"); + expect(prompt).toContain("use the upload tool instead of clicking"); }); test("Hybrid mode system prompt emphasizes screenshot and coordinate tools", () => { @@ -111,12 +116,14 @@ test.describe("Stagehand agent hybrid mode", () => { expect(prompt).toContain("type"); expect(prompt).toContain("fillFormVision"); expect(prompt).toContain("dragAndDrop"); + expect(prompt).toContain("upload"); // Should have hybrid-specific strategy expect(prompt).toContain( "Use specific tools (click, type) when elements are visible", ); expect(prompt).toContain("Always use screenshot"); + expect(prompt).toContain("use the upload tool instead of clicking"); }); test("System prompt includes custom instructions when provided", () => { diff --git a/packages/core/tests/integration/agent-upload-tool.spec.ts b/packages/core/tests/integration/agent-upload-tool.spec.ts new file mode 100644 index 0000000000..e1c7395a8d --- /dev/null +++ b/packages/core/tests/integration/agent-upload-tool.spec.ts @@ -0,0 +1,145 @@ +import { expect, test } from "@playwright/test"; +import { promises as fs } from "fs"; +import path from "path"; +import crypto from "crypto"; +import type { ToolCallOptions } from "ai"; +import { V3 } from "../../lib/v3/v3.js"; +import { v3TestConfig } from "./v3.config.js"; +import { createAgentTools } from "../../lib/v3/agent/tools/index.js"; + +const FILE_UPLOAD_V2_URL = + "https://browserbase.github.io/stagehand-eval-sites/sites/file-uploads-2/"; + +const RESUME_INPUT = "#resumeUpload"; +const RESUME_SUCCESS = "#resumeSuccess"; +const toolCallOptions: ToolCallOptions = { + toolCallId: "upload-integration-call", + messages: [], +}; + +type UploadTool = { + execute: ( + input: { + target: string; + paths: string[]; + }, + options: ToolCallOptions, + ) => Promise<{ + success: boolean; + selector?: string; + files?: string[]; + error?: string; + }>; +}; + +test.describe("Stagehand agent upload tool", () => { + let v3: V3; + const fixtures: string[] = []; + + test.beforeEach(async () => { + v3 = new V3({ + ...v3TestConfig, + experimental: true, + }); + await v3.init(); + }); + + test.afterEach(async () => { + await v3?.close?.().catch(() => {}); + await Promise.all( + fixtures.splice(0).map((file) => fs.unlink(file).catch(() => {})), + ); + }); + + const createFixture = async ( + namePrefix: string, + contents: string, + ext = ".txt", + ): Promise => { + const normalizedExt = ext.startsWith(".") ? ext : `.${ext}`; + const filename = `${namePrefix}-${crypto.randomBytes(4).toString("hex")}${normalizedExt}`; + const filePath = path.resolve(process.cwd(), filename); + await fs.writeFile(filePath, contents, "utf-8"); + fixtures.push(filePath); + return filePath; + }; + + const getUploadTool = (): UploadTool => + createAgentTools(v3, { + mode: "dom", + toolTimeout: 45_000, + }).upload as unknown as UploadTool; + + test("uploads a resume by targeting the file input semantically", async () => { + test.setTimeout(90_000); + + const page = v3.context.pages()[0]; + await page.goto(FILE_UPLOAD_V2_URL); + const fixture = await createFixture( + "resume-agent", + "

resume

", + ".pdf", + ); + + const result = await getUploadTool().execute( + { + target: "the Resume file upload input", + paths: [fixture], + }, + toolCallOptions, + ); + + expect(result.success).toBe(true); + expect(result.selector).toBeTruthy(); + expect(result.files).toEqual([path.basename(fixture)]); + + await expect + .poll( + () => + page.evaluate((selector) => { + const el = document.querySelector(selector); + if (!el) return ""; + const display = window.getComputedStyle(el).display; + if (display === "none") return ""; + return el.textContent ?? ""; + }, RESUME_SUCCESS), + { message: "wait for resume upload success" }, + ) + .toContain("Resume uploaded!"); + + await expect + .poll( + () => + page.evaluate((selector) => { + const el = document.querySelector(selector); + if (!(el instanceof HTMLInputElement)) return 0; + return el.files?.length ?? 0; + }, RESUME_INPUT), + { message: "wait for resume file count" }, + ) + .toBe(1); + }); + + test("returns a helpful error when the local file path does not exist", async () => { + test.setTimeout(90_000); + + const page = v3.context.pages()[0]; + await page.goto(FILE_UPLOAD_V2_URL); + + const missingPath = path.resolve( + process.cwd(), + `missing-upload-${crypto.randomBytes(4).toString("hex")}.pdf`, + ); + + const result = await getUploadTool().execute( + { + target: "the Resume file upload input", + paths: [missingPath], + }, + toolCallOptions, + ); + + expect(result.success).toBe(false); + expect(result.error).toContain("file not found"); + }); +}); diff --git a/packages/core/tests/unit/agent-system-prompt-variables.test.ts b/packages/core/tests/unit/agent-system-prompt-variables.test.ts index bf556d1c68..fe6fe9087c 100644 --- a/packages/core/tests/unit/agent-system-prompt-variables.test.ts +++ b/packages/core/tests/unit/agent-system-prompt-variables.test.ts @@ -20,5 +20,6 @@ describe("buildAgentSystemPrompt variables", () => { 'The login email', ); expect(prompt).toContain(''); + expect(prompt).toContain("upload tool's"); }); }); diff --git a/packages/core/tests/unit/agent-upload-tool-execute.test.ts b/packages/core/tests/unit/agent-upload-tool-execute.test.ts new file mode 100644 index 0000000000..78ba962de7 --- /dev/null +++ b/packages/core/tests/unit/agent-upload-tool-execute.test.ts @@ -0,0 +1,117 @@ +import { describe, expect, it, vi } from "vitest"; +import type { ToolCallOptions } from "ai"; +import { uploadTool } from "../../lib/v3/agent/tools/upload.js"; +import type { V3 } from "../../lib/v3/v3.js"; + +const toolCallOptions: ToolCallOptions = { + toolCallId: "upload-test-call", + messages: [], +}; + +function createStubV3() { + const setInputFiles = vi.fn().mockResolvedValue(undefined); + const deepLocator = vi.fn().mockReturnValue({ + setInputFiles, + }); + const awaitActivePage = vi.fn().mockResolvedValue({ + deepLocator, + }); + const observe = vi.fn().mockResolvedValue([ + { + description: "Resume upload input", + selector: "xpath=//input[@type='file']", + }, + ]); + const recordAgentReplayStep = vi.fn(); + const logger = vi.fn(); + + const v3 = { + observe, + context: { + awaitActivePage, + }, + isAgentReplayActive: () => true, + recordAgentReplayStep, + logger, + } as unknown as V3; + + return { + v3, + spies: { + awaitActivePage, + deepLocator, + logger, + observe, + recordAgentReplayStep, + setInputFiles, + }, + }; +} + +describe("uploadTool", () => { + it("uploads files found via observe and records a replay step", async () => { + const { v3, spies } = createStubV3(); + const tool = uploadTool(v3, undefined, { + resumePath: "/tmp/resume.pdf", + }); + + const result = await tool.execute( + { + target: "the resume file input", + paths: ["%resumePath%"], + }, + toolCallOptions, + ); + + expect(spies.observe).toHaveBeenCalledWith( + expect.stringContaining("the resume file input"), + { + variables: { resumePath: "/tmp/resume.pdf" }, + timeout: undefined, + }, + ); + expect(spies.deepLocator).toHaveBeenCalledWith( + "xpath=//input[@type='file']", + ); + expect(spies.setInputFiles).toHaveBeenCalledWith("/tmp/resume.pdf"); + expect(spies.recordAgentReplayStep).toHaveBeenCalledWith({ + type: "upload", + target: "the resume file input", + selector: "xpath=//input[@type='file']", + paths: ["/tmp/resume.pdf"], + }); + expect(result).toEqual({ + success: true, + target: "Resume upload input", + selector: "xpath=//input[@type='file']", + files: ["resume.pdf"], + fileCount: 1, + }); + }); + + it("returns a helpful error when observe cannot find a supported file input", async () => { + const { v3, spies } = createStubV3(); + spies.observe.mockResolvedValueOnce([ + { + description: "Upload button", + selector: "not-supported", + }, + ]); + const tool = uploadTool(v3); + + const result = await tool.execute( + { + target: "the resume file input", + paths: ["/tmp/resume.pdf"], + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: + "Could not find a file input for the resume file input. Ask the agent to target the actual upload input field.", + }); + expect(spies.setInputFiles).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/core/tests/unit/agent-upload-tools.test.ts b/packages/core/tests/unit/agent-upload-tools.test.ts new file mode 100644 index 0000000000..33efa90b6e --- /dev/null +++ b/packages/core/tests/unit/agent-upload-tools.test.ts @@ -0,0 +1,41 @@ +import { describe, expect, it, vi } from "vitest"; +import { V3 } from "../../lib/v3/v3.js"; +import { + createAgentTools, + createCuaAgentTools, +} from "../../lib/v3/agent/tools/index.js"; + +function createStubV3(): V3 { + return { + logger: vi.fn(), + browserbaseApiKey: undefined, + isAgentReplayActive: () => false, + recordAgentReplayStep: vi.fn(), + } as unknown as V3; +} + +describe("agent upload tool registration", () => { + it("includes upload in DOM and hybrid toolsets", () => { + const v3 = createStubV3(); + + expect(createAgentTools(v3, { mode: "dom" })).toHaveProperty("upload"); + expect(createAgentTools(v3, { mode: "hybrid" })).toHaveProperty("upload"); + }); + + it("adds built-in upload to CUA tools", () => { + const v3 = createStubV3(); + const tools = createCuaAgentTools(v3); + + expect(tools).toHaveProperty("upload"); + }); + + it("lets user-provided CUA tools override the built-in upload tool", () => { + const v3 = createStubV3(); + const customUpload = { description: "custom upload tool" }; + const tools = createCuaAgentTools(v3, { + upload: customUpload as never, + }); + + expect(tools.upload).toBe(customUpload); + }); +}); diff --git a/packages/docs/v3/basics/agent.mdx b/packages/docs/v3/basics/agent.mdx index f3274b110a..27e881e17e 100644 --- a/packages/docs/v3/basics/agent.mdx +++ b/packages/docs/v3/basics/agent.mdx @@ -44,6 +44,7 @@ Some advanced features are only available with certain agent modes: | Custom tools | ✅ | ✅ | ✅ | | MCP integrations | ✅ | ✅ | ✅ | | System prompt | ✅ | ✅ | ✅ | +| Native file uploads | ✅ | ✅ | ✅ | | Variables | ❌ | ✅ | ✅ | | Streaming | ❌ | ✅ | ✅ | | Callbacks | ❌ | ✅ | ✅ | @@ -55,6 +56,22 @@ Some advanced features are only available with certain agent modes: | Coordinate-based actions | ✅ | ❌ | ✅ | | Visual cursor highlight | ✅ | ❌ | ✅ | +### Native File Uploads + +When the user already knows the local file path, the agent can upload it natively as part of the workflow without manually calling `setInputFiles()`. + +```typescript +const agent = stagehand.agent(); + +await agent.execute({ + instruction: + 'Upload the file at "/Users/me/Documents/resume.pdf" to the resume file input, then continue the application flow.', + maxSteps: 10, +}); +``` + +Use the actual local path in the instruction. The agent will use its built-in `upload` tool instead of clicking the visible upload button. + ### Computer Use Agents You can use specialized computer use models from Google, OpenAI, Anthropic, or Microsoft as shown below, with `mode` set to `"cua"`. To compare the performance of different computer use models, you can visit our [evals page](https://www.stagehand.dev/agent-evals). @@ -417,6 +434,7 @@ The tools you can exclude depend on the agent mode: |------|-------------| | `act` | Perform semantic actions (click, type, etc.) | | `fillForm` | Fill form fields using DOM selectors | +| `upload` | Upload local files into file inputs | | `ariaTree` | Get accessibility tree of the page | | `extract` | Extract structured data from page | | `goto` | Navigate to a URL | @@ -440,6 +458,7 @@ The tools you can exclude depend on the agent mode: | `clickAndHold` | Click and hold at coordinates | | `fillFormVision` | Fill forms using vision/coordinates | | `act` | Perform semantic actions | +| `upload` | Upload local files into file inputs | | `ariaTree` | Get accessibility tree | | `extract` | Extract data from page | | `goto` | Navigate to URL | @@ -562,6 +581,7 @@ Variables work with the following agent tools: |------|-------| | `act` | Use `%variableName%` in the action description | | `fillForm` | Use `%variableName%` in field values | +| `upload` | Use `%variableName%` in file paths | @@ -572,6 +592,7 @@ Variables work with the following agent tools: | `type` | Use `%variableName%` in the text to type | | `fillFormVision` | Use `%variableName%` in field values | | `act` | Use `%variableName%` in the action description | +| `upload` | Use `%variableName%` in file paths | @@ -1180,4 +1201,4 @@ if (firstResult.success === true) { Extract structured data from observed elements - \ No newline at end of file + diff --git a/packages/docs/v3/best-practices/prompting-best-practices.mdx b/packages/docs/v3/best-practices/prompting-best-practices.mdx index 1745188816..384aa8d27c 100644 --- a/packages/docs/v3/best-practices/prompting-best-practices.mdx +++ b/packages/docs/v3/best-practices/prompting-best-practices.mdx @@ -61,6 +61,10 @@ await stagehand.act("type into search"); // Good await stagehand.act("click the submit button"); await stagehand.act("select 'Option 1' from dropdown"); +await stagehand.agent().execute({ + instruction: + 'Upload the file at "/Users/me/Documents/resume.pdf" to the resume file input', +}); // Bad await stagehand.act("click submit"); @@ -246,4 +250,4 @@ await agent.execute("Add some items to cart"); 2. **Add complexity gradually** - Build up to complex workflows 3. **Monitor results** - Use logging to understand what's happening 4. **Iterate based on failures** - Refine prompts when they don't work -Remember: Good prompting is iterative. When in doubt, be more specific rather than less. \ No newline at end of file +Remember: Good prompting is iterative. When in doubt, be more specific rather than less. diff --git a/packages/docs/v3/references/agent.mdx b/packages/docs/v3/references/agent.mdx index be702649ff..52fca0b541 100644 --- a/packages/docs/v3/references/agent.mdx +++ b/packages/docs/v3/references/agent.mdx @@ -123,8 +123,8 @@ interface AgentInstance { Tool mode for the agent. Determines which set of tools are available to the agent. **Modes:** - - `"dom"` (default): Uses DOM-based tools (`act`, `fillForm`) for structured page interactions. Works with any model. - - `"hybrid"`: Uses both DOM-based and coordinate-based tools (`act`, `click`, `type`, `dragAndDrop`, `clickAndHold`, `fillForm`) for visual/screenshot-based interactions. Requires models with reliable coordinate-based action capabilities. + - `"dom"` (default): Uses DOM-based tools (`act`, `fillForm`, `upload`) for structured page interactions. Works with any model. + - `"hybrid"`: Uses both DOM-based and coordinate-based tools (`act`, `upload`, `click`, `type`, `dragAndDrop`, `clickAndHold`, `fillForm`) for visual/screenshot-based interactions. Requires models with reliable coordinate-based action capabilities. - `"cua"`: Uses Computer Use Agent (CUA) providers like Anthropic Claude, Google Gemini, or OpenAI for screenshot-based automation. This is the preferred way to enable CUA mode (replaces the deprecated `cua: true` option). **Default:** `"dom"` @@ -253,9 +253,9 @@ interface AgentStreamCallbacks { **Available tools by mode:** - **DOM mode (default):** `act`, `fillForm`, `ariaTree`, `extract`, `goto`, `scroll`, `keys`, `navback`, `screenshot`, `think`, `wait`, `search` + **DOM mode (default):** `act`, `fillForm`, `upload`, `ariaTree`, `extract`, `goto`, `scroll`, `keys`, `navback`, `screenshot`, `think`, `wait`, `search` - **Hybrid mode:** `click`, `type`, `dragAndDrop`, `clickAndHold`, `fillFormVision`, `act`, `ariaTree`, `extract`, `goto`, `scroll`, `keys`, `navback`, `screenshot`, `think`, `wait`, `search` + **Hybrid mode:** `click`, `type`, `dragAndDrop`, `clickAndHold`, `fillFormVision`, `act`, `upload`, `ariaTree`, `extract`, `goto`, `scroll`, `keys`, `navback`, `screenshot`, `think`, `wait`, `search` **Non-CUA agents only.** Requires `experimental: true`. Not available when `cua: true`. @@ -866,4 +866,4 @@ The following errors may be thrown by the `agent()` method: - **StagehandDefaultError** - General execution error with detailed message - **AgentAbortError** - Thrown when agent execution is cancelled via an `AbortSignal` - **StreamingCallbacksInNonStreamingModeError** - Thrown when streaming-only callbacks (`onChunk`, `onFinish`, `onError`, `onAbort`) are used without `stream: true` -- **ExperimentalNotConfiguredError** - Thrown when experimental features (callbacks, signal, messages, streaming) are used without `experimental: true` in Stagehand constructor \ No newline at end of file +- **ExperimentalNotConfiguredError** - Thrown when experimental features (callbacks, signal, messages, streaming) are used without `experimental: true` in Stagehand constructor From 5d7ab5f83d9cab96436036cd91d21c67d7193840 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Mon, 30 Mar 2026 19:31:01 -0700 Subject: [PATCH 2/2] guard agent upload tool selection --- .../lib/v3/agent/prompts/agentSystemPrompt.ts | 23 +- packages/core/lib/v3/agent/tools/act.ts | 9 + packages/core/lib/v3/agent/tools/click.ts | 9 + .../core/lib/v3/agent/tools/fillFormVision.ts | 11 + packages/core/lib/v3/agent/tools/fillform.ts | 11 + packages/core/lib/v3/agent/tools/keys.ts | 14 +- packages/core/lib/v3/agent/tools/type.ts | 9 + .../lib/v3/agent/utils/fileUploadGuard.ts | 49 +++++ .../unit/agent-file-upload-guard.test.ts | 199 ++++++++++++++++++ 9 files changed, 327 insertions(+), 7 deletions(-) create mode 100644 packages/core/lib/v3/agent/utils/fileUploadGuard.ts create mode 100644 packages/core/tests/unit/agent-file-upload-guard.test.ts diff --git a/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts b/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts index 8a7ca3dc8f..fe3878b3eb 100644 --- a/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts +++ b/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts @@ -48,17 +48,17 @@ function buildToolsSection( { name: "click", description: - "Click on an element (PREFERRED - more reliable when element is visible in viewport)", + "Click on an element (PREFERRED - more reliable when element is visible in viewport). Never use this for file upload buttons or file inputs.", }, { name: "type", description: - "Type text into an element (PREFERRED - more reliable when element is visible in viewport)", + "Type text into an element (PREFERRED - more reliable when element is visible in viewport). Never use this for file inputs or local file paths.", }, { name: "act", description: - "Perform a specific atomic action (click, type, etc.) - ONLY use when element is in ariaTree but NOT visible in screenshot. Less reliable but can interact with out-of-viewport elements.", + "Perform a specific atomic action (click, type, etc.) - ONLY use when element is in ariaTree but NOT visible in screenshot. Less reliable but can interact with out-of-viewport elements. Never use this for file upload buttons, file inputs, or local file paths.", }, { name: "upload", @@ -67,7 +67,11 @@ function buildToolsSection( }, { name: "dragAndDrop", description: "Drag and drop an element" }, { name: "clickAndHold", description: "Click and hold on an element" }, - { name: "keys", description: "Press a keyboard key" }, + { + name: "keys", + description: + "Press a keyboard key or type into the currently focused element. Never use this to enter local file paths for uploads.", + }, { name: "fillFormVision", description: "Fill out a form using coordinates", @@ -92,14 +96,19 @@ function buildToolsSection( }, { name: "act", - description: "Perform a specific atomic action (click, type)", + description: + "Perform a specific atomic action (click, type). Never use this for file upload buttons, file inputs, or local file paths.", }, { name: "upload", description: "Upload one or more local files into a file input when the user has provided a file path", }, - { name: "keys", description: "Press a keyboard key" }, + { + name: "keys", + description: + "Press a keyboard key or type into the currently focused element. Never use this to enter local file paths for uploads.", + }, { name: "fillForm", description: "Fill out a form" }, { name: "think", description: "Think about the task" }, { name: "extract", description: "Extract structured data" }, @@ -158,6 +167,7 @@ export function buildAgentSystemPrompt( `Always use screenshot to get proper grounding of the coordinates you want to type/click into.`, `When interacting with an input, always use the type tool to type into the input, over clicking and then typing into it.`, `When the task requires uploading a file and a local path is available, use the upload tool instead of clicking the visible upload button.`, + `Never use click, type, act, fillForm, or fillFormVision to interact with a file input or upload button. Those tools cannot complete native file selection reliably.`, `Use ariaTree as a secondary tool when elements aren't visible in screenshot or to get full page context.`, `Only use act when element is in ariaTree but NOT visible in screenshot.`, ] @@ -166,6 +176,7 @@ export function buildAgentSystemPrompt( `Always check ariaTree first to understand full page content without scrolling - it shows all elements including those below the fold.`, `When interacting with an input, always use the act tool to type into the input, over clicking and then typing.`, `When the task requires uploading a file and a local path is available, use the upload tool instead of clicking the visible upload button.`, + `Never use click, type, act, fillForm, or fillFormVision to interact with a file input or upload button. Those tools cannot complete native file selection reliably.`, `If an element is present in the ariaTree, use act to interact with it directly - this eliminates the need to scroll.`, `Use screenshot for visual confirmation when needed, but rely primarily on ariaTree for element detection.`, ]; diff --git a/packages/core/lib/v3/agent/tools/act.ts b/packages/core/lib/v3/agent/tools/act.ts index c6131640fa..f52db7ba36 100644 --- a/packages/core/lib/v3/agent/tools/act.ts +++ b/packages/core/lib/v3/agent/tools/act.ts @@ -4,6 +4,7 @@ import type { V3 } from "../../v3.js"; import type { Action } from "../../types/public/methods.js"; import type { AgentModelConfig, Variables } from "../../types/public/agent.js"; import { TimeoutError } from "../../types/public/sdkErrors.js"; +import { getFileUploadGuardError } from "../utils/fileUploadGuard.js"; export const actTool = ( v3: V3, @@ -24,6 +25,14 @@ export const actTool = ( }), execute: async ({ action }) => { try { + const fileUploadGuardError = getFileUploadGuardError(action); + if (fileUploadGuardError) { + return { + success: false, + error: fileUploadGuardError, + }; + } + v3.logger({ category: "agent", message: `Agent calling tool: act`, diff --git a/packages/core/lib/v3/agent/tools/click.ts b/packages/core/lib/v3/agent/tools/click.ts index 514f02b1a3..314fb25a9f 100644 --- a/packages/core/lib/v3/agent/tools/click.ts +++ b/packages/core/lib/v3/agent/tools/click.ts @@ -7,6 +7,7 @@ import type { ModelOutputContentItem, } from "../../types/public/agent.js"; import { processCoordinates } from "../utils/coordinateNormalization.js"; +import { getFileUploadGuardError } from "../utils/fileUploadGuard.js"; import { ensureXPath } from "../utils/xpath.js"; import { waitAndCaptureScreenshot } from "../utils/screenshotHandler.js"; @@ -26,6 +27,14 @@ export const clickTool = (v3: V3, provider?: string) => }), execute: async ({ describe, coordinates }): Promise => { try { + const fileUploadGuardError = getFileUploadGuardError(describe); + if (fileUploadGuardError) { + return { + success: false, + error: fileUploadGuardError, + }; + } + const page = await v3.context.awaitActivePage(); const processed = processCoordinates( coordinates[0], diff --git a/packages/core/lib/v3/agent/tools/fillFormVision.ts b/packages/core/lib/v3/agent/tools/fillFormVision.ts index 6776384253..75a530ffbf 100644 --- a/packages/core/lib/v3/agent/tools/fillFormVision.ts +++ b/packages/core/lib/v3/agent/tools/fillFormVision.ts @@ -8,6 +8,7 @@ import type { Variables, } from "../../types/public/agent.js"; import { processCoordinates } from "../utils/coordinateNormalization.js"; +import { getFileUploadGuardError } from "../utils/fileUploadGuard.js"; import { ensureXPath } from "../utils/xpath.js"; import { waitAndCaptureScreenshot } from "../utils/screenshotHandler.js"; import { substituteVariables } from "../utils/variables.js"; @@ -62,6 +63,16 @@ MANDATORY USE CASES (always use fillFormVision for these): }), execute: async ({ fields }): Promise => { try { + const fileUploadGuardError = getFileUploadGuardError( + ...fields.flatMap((field) => [field.action, field.value]), + ); + if (fileUploadGuardError) { + return { + success: false, + error: fileUploadGuardError, + }; + } + const page = await v3.context.awaitActivePage(); // Process coordinates and substitute variables for each field diff --git a/packages/core/lib/v3/agent/tools/fillform.ts b/packages/core/lib/v3/agent/tools/fillform.ts index 4502d5c454..0948a9da75 100644 --- a/packages/core/lib/v3/agent/tools/fillform.ts +++ b/packages/core/lib/v3/agent/tools/fillform.ts @@ -4,6 +4,7 @@ import type { V3 } from "../../v3.js"; import type { Action } from "../../types/public/methods.js"; import type { AgentModelConfig, Variables } from "../../types/public/agent.js"; import { TimeoutError } from "../../types/public/sdkErrors.js"; +import { getFileUploadGuardError } from "../utils/fileUploadGuard.js"; export const fillFormTool = ( v3: V3, @@ -30,6 +31,16 @@ export const fillFormTool = ( }), execute: async ({ fields }) => { try { + const fileUploadGuardError = getFileUploadGuardError( + ...fields.map((field) => field.action), + ); + if (fileUploadGuardError) { + return { + success: false, + error: fileUploadGuardError, + }; + } + v3.logger({ category: "agent", message: `Agent calling tool: fillForm`, diff --git a/packages/core/lib/v3/agent/tools/keys.ts b/packages/core/lib/v3/agent/tools/keys.ts index b8e581472f..530efb91cc 100644 --- a/packages/core/lib/v3/agent/tools/keys.ts +++ b/packages/core/lib/v3/agent/tools/keys.ts @@ -1,6 +1,7 @@ import { tool } from "ai"; import { z } from "zod"; import type { V3 } from "../../v3.js"; +import { getFileUploadGuardError } from "../utils/fileUploadGuard.js"; export const keysTool = (v3: V3) => tool({ @@ -8,7 +9,9 @@ export const keysTool = (v3: V3) => Use method="type" to enter text into the currently focused element. Preferred when: input is already focused, text needs to flow across multiple fields (e.g., verification codes) -Use method="press" for navigation keys (Enter, Tab, Escape, Backspace, arrows) and keyboard shortcuts (Cmd+A, Ctrl+C, Shift+Tab).`, +Use method="press" for navigation keys (Enter, Tab, Escape, Backspace, arrows) and keyboard shortcuts (Cmd+A, Ctrl+C, Shift+Tab). + +Never use this tool to type local file paths for uploads. Use the upload tool instead.`, inputSchema: z.object({ method: z.enum(["press", "type"]), value: z @@ -20,6 +23,15 @@ Use method="press" for navigation keys (Enter, Tab, Escape, Backspace, arrows) a }), execute: async ({ method, value, repeat }) => { try { + const fileUploadGuardError = + method === "type" ? getFileUploadGuardError(value) : null; + if (fileUploadGuardError) { + return { + success: false, + error: fileUploadGuardError, + }; + } + const page = await v3.context.awaitActivePage(); v3.logger({ category: "agent", diff --git a/packages/core/lib/v3/agent/tools/type.ts b/packages/core/lib/v3/agent/tools/type.ts index 4a2a0eb46a..0018b9d147 100644 --- a/packages/core/lib/v3/agent/tools/type.ts +++ b/packages/core/lib/v3/agent/tools/type.ts @@ -8,6 +8,7 @@ import type { Variables, } from "../../types/public/agent.js"; import { processCoordinates } from "../utils/coordinateNormalization.js"; +import { getFileUploadGuardError } from "../utils/fileUploadGuard.js"; import { ensureXPath } from "../utils/xpath.js"; import { waitAndCaptureScreenshot } from "../utils/screenshotHandler.js"; import { substituteVariables } from "../utils/variables.js"; @@ -38,6 +39,14 @@ export const typeTool = (v3: V3, provider?: string, variables?: Variables) => { text, }): Promise => { try { + const fileUploadGuardError = getFileUploadGuardError(describe, text); + if (fileUploadGuardError) { + return { + success: false, + error: fileUploadGuardError, + }; + } + const page = await v3.context.awaitActivePage(); const processed = processCoordinates( coordinates[0], diff --git a/packages/core/lib/v3/agent/utils/fileUploadGuard.ts b/packages/core/lib/v3/agent/utils/fileUploadGuard.ts new file mode 100644 index 0000000000..ea7e71c165 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/fileUploadGuard.ts @@ -0,0 +1,49 @@ +const FILE_UPLOAD_INTENT_PATTERNS = [ + /\bupload\b/i, + /\battach\b/i, + /\bresume\b/i, + /\bcv\b/i, + /\bcover letter\b/i, + /\bfile input\b/i, + /\bfile upload\b/i, + /\bfile chooser\b/i, + /\bchoose file\b/i, + /\bselect file\b/i, + /\bbrowse files?\b/i, + /\bdrag-and-drop\b/i, + /\bagent profile\b/i, + /\bprofile photo\b/i, +]; + +const LOCAL_PATH_PATTERN = + /(?:^|[\s"'`])(?:\/[^\s"'`]+|~\/[^\s"'`]+|[A-Za-z]:\\[^\s"'`]+)(?=$|[\s"'`])/; +const FILE_EXTENSION_PATTERN = + /\.(pdf|docx?|txt|rtf|png|jpe?g|gif|webp|csv|json|zip|mp3|wav|m4a|mov|mp4)\b/i; + +export const FILE_UPLOAD_GUARD_ERROR = + "File uploads must use the upload tool. Do not click, type into, or fill file inputs with click, act, type, fillForm, or fillFormVision. If no local path is available yet, ask the user for one."; + +export function getFileUploadGuardError(...texts: Array) { + const combined = texts + .filter( + (text): text is string => + typeof text === "string" && text.trim().length > 0, + ) + .join(" "); + + if (!combined) { + return null; + } + + const hasFileIntent = FILE_UPLOAD_INTENT_PATTERNS.some((pattern) => + pattern.test(combined), + ); + const hasLocalPath = + LOCAL_PATH_PATTERN.test(combined) || FILE_EXTENSION_PATTERN.test(combined); + + if (!hasFileIntent && !hasLocalPath) { + return null; + } + + return FILE_UPLOAD_GUARD_ERROR; +} diff --git a/packages/core/tests/unit/agent-file-upload-guard.test.ts b/packages/core/tests/unit/agent-file-upload-guard.test.ts new file mode 100644 index 0000000000..73017814ad --- /dev/null +++ b/packages/core/tests/unit/agent-file-upload-guard.test.ts @@ -0,0 +1,199 @@ +import { describe, expect, it, vi } from "vitest"; +import type { ToolCallOptions } from "ai"; +import { actTool } from "../../lib/v3/agent/tools/act.js"; +import { clickTool } from "../../lib/v3/agent/tools/click.js"; +import { fillFormTool } from "../../lib/v3/agent/tools/fillform.js"; +import { fillFormVisionTool } from "../../lib/v3/agent/tools/fillFormVision.js"; +import { keysTool } from "../../lib/v3/agent/tools/keys.js"; +import { typeTool } from "../../lib/v3/agent/tools/type.js"; +import { + FILE_UPLOAD_GUARD_ERROR, + getFileUploadGuardError, +} from "../../lib/v3/agent/utils/fileUploadGuard.js"; +import type { V3 } from "../../lib/v3/v3.js"; + +const toolCallOptions: ToolCallOptions = { + toolCallId: "file-upload-guard-test", + messages: [], +}; + +function createMockV3() { + const page = { + click: vi.fn(), + keyPress: vi.fn(), + type: vi.fn(), + }; + const awaitActivePage = vi.fn().mockResolvedValue(page); + const act = vi.fn(); + const observe = vi.fn(); + + const v3 = { + logger: vi.fn(), + act, + observe, + context: { + awaitActivePage, + }, + isAgentReplayActive: () => false, + recordAgentReplayStep: vi.fn(), + } as unknown as V3; + + return { + v3, + spies: { + act, + awaitActivePage, + observe, + page, + }, + }; +} + +describe("file upload guard", () => { + it("detects file upload intent from paths and upload language", () => { + expect( + getFileUploadGuardError( + 'upload "/tmp/resume.pdf" to the resume file input', + ), + ).toBe(FILE_UPLOAD_GUARD_ERROR); + expect(getFileUploadGuardError("click the Upload CV button")).toBe( + FILE_UPLOAD_GUARD_ERROR, + ); + expect(getFileUploadGuardError("click the Continue button")).toBeNull(); + }); + + it("prevents act from typing local file paths", async () => { + const { v3, spies } = createMockV3(); + const tool = actTool(v3); + + const result = await tool.execute!( + { + action: 'type "/tmp/resume.pdf" into the Agent Profile file input', + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: FILE_UPLOAD_GUARD_ERROR, + }); + expect(spies.act).not.toHaveBeenCalled(); + }); + + it("prevents fillForm from treating uploads as standard fields", async () => { + const { v3, spies } = createMockV3(); + const tool = fillFormTool(v3); + + const result = await tool.execute!( + { + fields: [ + { + action: 'type "/tmp/resume.pdf" into the resume upload field', + }, + ], + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: FILE_UPLOAD_GUARD_ERROR, + }); + expect(spies.observe).not.toHaveBeenCalled(); + expect(spies.act).not.toHaveBeenCalled(); + }); + + it("prevents click from targeting upload controls", async () => { + const { v3, spies } = createMockV3(); + const tool = clickTool(v3); + + const result = await tool.execute!( + { + describe: "the Upload resume button", + coordinates: [100, 200], + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: FILE_UPLOAD_GUARD_ERROR, + }); + expect(spies.awaitActivePage).not.toHaveBeenCalled(); + expect(spies.page.click).not.toHaveBeenCalled(); + }); + + it("prevents type from sending a file path into an input", async () => { + const { v3, spies } = createMockV3(); + const tool = typeTool(v3); + + const result = await tool.execute!( + { + describe: "the Agent Profile file input", + text: "/tmp/resume.pdf", + coordinates: [100, 200], + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: FILE_UPLOAD_GUARD_ERROR, + }); + expect(spies.awaitActivePage).not.toHaveBeenCalled(); + expect(spies.page.click).not.toHaveBeenCalled(); + expect(spies.page.type).not.toHaveBeenCalled(); + }); + + it("prevents fillFormVision from routing uploads through typing", async () => { + const { v3, spies } = createMockV3(); + const tool = fillFormVisionTool(v3); + + const result = await tool.execute!( + { + fields: [ + { + action: "type John into the first name field", + value: "John", + coordinates: { x: 10, y: 10 }, + }, + { + action: 'type "/tmp/resume.pdf" into the CV upload field', + value: "/tmp/resume.pdf", + coordinates: { x: 20, y: 20 }, + }, + ], + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: FILE_UPLOAD_GUARD_ERROR, + }); + expect(spies.awaitActivePage).not.toHaveBeenCalled(); + expect(spies.page.click).not.toHaveBeenCalled(); + expect(spies.page.type).not.toHaveBeenCalled(); + }); + + it("prevents keys(type) from typing local file paths", async () => { + const { v3, spies } = createMockV3(); + const tool = keysTool(v3); + + const result = await tool.execute!( + { + method: "type", + value: "/tmp/resume.pdf", + }, + toolCallOptions, + ); + + expect(result).toEqual({ + success: false, + error: FILE_UPLOAD_GUARD_ERROR, + }); + expect(spies.awaitActivePage).not.toHaveBeenCalled(); + expect(spies.page.type).not.toHaveBeenCalled(); + expect(spies.page.keyPress).not.toHaveBeenCalled(); + }); +});