Skip to content

Commit 11db324

Browse files
committed
feat: add guided research brief interview functionality with multi-language support
- Implemented guided brief interview questions and responses in multiple languages (English, Korean, Japanese, Chinese, Spanish, French, German, Portuguese, Russian).
- Added a new utility to detect preflight-only metrics in experiments.
- Enhanced tests for paper analysis and research brief file generation to cover new features and edge cases.
- Updated the terminal app to handle guided brief interviews and startup automation commands.
1 parent dcd316b commit 11db324

19 files changed

+2362
-106
lines changed

ISSUES.md

Lines changed: 197 additions & 42 deletions
Large diffs are not rendered by default.

src/core/agents/implementSessionManager.ts

Lines changed: 99 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ export class ImplementSessionManager {
337337
const metricsPath = path.join(runDir, "metrics.json");
338338
const defaultPublicDir = buildPublicExperimentDir(this.deps.workspaceRoot, run);
339339
const experimentLlmProfile = resolveExperimentLlmProfile(this.deps.config);
340-
const useCodexSession =
340+
const canUseCodexSession =
341341
typeof this.deps.codex?.runTurnStream === "function" &&
342342
this.deps.config?.providers?.llm_mode !== "openai_api" &&
343343
this.deps.config?.providers?.llm_mode !== "ollama";
@@ -369,6 +369,8 @@ export class ImplementSessionManager {
369369
longTermMemory,
370370
environmentSnapshot
371371
);
372+
const useCodexSession =
373+
canUseCodexSession && !shouldFallbackToStagedImplementLlm(taskSpec.context.previous_summary || "");
372374
await writeJsonFile(path.join(runDir, "implement_task_spec.json"), taskSpec);
373375

374376
const queueProgressUpdate = (
@@ -688,6 +690,41 @@ export class ImplementSessionManager {
688690
}
689691
}
690692
});
693+
if (this.deps.llm && shouldFallbackToStagedImplementLlm(result.finalText)) {
694+
emitImplementObservation(
695+
"codex",
696+
"Codex implement turn reported a filesystem tooling blocker; retrying this attempt in staged_llm mode.",
697+
{
698+
attempt,
699+
threadId: activeThreadId,
700+
publicDir: defaultPublicDir
701+
}
702+
);
703+
const llmTimeoutMs = getImplementLlmTimeoutMs(this.deps.config);
704+
const filesystemFallbackPrompt = this.buildFilesystemFallbackRecoveryPrompt({
705+
taskSpec,
706+
searchLocalization,
707+
branchPlan,
708+
attempt
709+
});
710+
const filesystemFallbackSystemPrompt = appendFilesystemFallbackOverrideToPrompt(attemptSystemPrompt);
711+
const completion = await this.completeStagedLlmRequest({
712+
prompt: filesystemFallbackPrompt,
713+
systemPrompt: filesystemFallbackSystemPrompt,
714+
timeoutMs: llmTimeoutMs,
715+
abortSignal,
716+
attempt,
717+
threadId: activeThreadId,
718+
publicDir: defaultPublicDir,
719+
emitImplementObservation,
720+
reasoningEffort: experimentLlmProfile.reasoningEffort
721+
});
722+
result = {
723+
threadId: completion.threadId || activeThreadId,
724+
finalText: completion.text,
725+
events: []
726+
};
727+
}
691728
} else {
692729
if (!this.deps.llm) {
693730
throw new Error("implement_experiments is configured for staged_llm mode, but no LLM client is available.");
@@ -1717,6 +1754,43 @@ export class ImplementSessionManager {
17171754
return lines.join("\n");
17181755
}
17191756

1757+
/**
 * Builds the user prompt sent to the staged LLM when a Codex implement turn
 * reported a filesystem/tooling blocker and the attempt is retried.
 *
 * The task spec, localization result and branch plan are first passed through
 * `rewriteWorkspacePathsForSandbox` and then through their respective
 * `compact...ForStagedLlmPrompt` helpers before being embedded as
 * pretty-printed JSON.
 *
 * @param params.taskSpec Task spec for the current implement run.
 * @param params.searchLocalization Search-backed localization hints.
 * @param params.branchPlan Branch focus for this attempt.
 * @param params.attempt Attempt number shown in the prompt header.
 * @returns The prompt text, with sections joined by newlines.
 */
private buildFilesystemFallbackRecoveryPrompt(params: {
  taskSpec: ImplementTaskSpec;
  searchLocalization: LocalizationResult;
  branchPlan: BranchPlan;
  attempt: number;
}): string {
  const { taskSpec, searchLocalization, branchPlan, attempt } = params;

  // Rewrite workspace paths first, in the same order as the inputs are listed.
  const rewrittenSpec = rewriteWorkspacePathsForSandbox(taskSpec, this.deps.workspaceRoot);
  const rewrittenLocalization = rewriteWorkspacePathsForSandbox(searchLocalization, this.deps.workspaceRoot);
  const rewrittenPlan = rewriteWorkspacePathsForSandbox(branchPlan, this.deps.workspaceRoot);

  // Then shrink each structure to its prompt-sized form.
  const compactSpec = compactTaskSpecForStagedLlmPrompt(rewrittenSpec);
  const compactLocalization = compactLocalizationForStagedLlmPrompt(rewrittenLocalization);
  const compactPlan = compactBranchPlanForStagedLlmPrompt(rewrittenPlan);

  const preamble = [
    `Implementation attempt ${attempt}/${MAX_IMPLEMENT_ATTEMPTS} (filesystem-blocker recovery mode).`,
    "The previous Codex filesystem/tooling blocker has already been detected and handled by AutoLabOS.",
    "Do NOT repeat the blocker narrative, sandbox explanation, or any request to retry Codex filesystem actions.",
    "Treat this as a fresh staged_llm implementation task and return ONLY one JSON object.",
    "A valid response MUST include non-empty file_edits for every created or modified text artifact needed for the runnable experiment bundle.",
    "At minimum, emit file_edits for the runnable script and any required config or README referenced by your commands.",
    "If inspection is incomplete, synthesize the smallest bounded implementation that satisfies the locked task spec, branch focus, and localization hints."
  ];

  // Each context section is a heading, its JSON payload, and a blank separator line.
  const contextSections = [
    "Task spec:",
    JSON.stringify(compactSpec, null, 2),
    "",
    "Search-backed localization hints:",
    JSON.stringify(compactLocalization, null, 2),
    "",
    "Branch focus:",
    JSON.stringify(compactPlan, null, 2),
    ""
  ];

  const outputContract = [
    "Output contract reminder:",
    "- Return ONLY one JSON object with keys: summary, experiment_mode, run_command, test_command, working_dir, changed_files, artifacts, public_dir, public_artifacts, script_path, metrics_path, localization, assumptions, file_edits.",
    "- file_edits must contain full UTF-8 contents for each referenced file.",
    "- Responses that only describe the blocker or omit file_edits are invalid."
  ];

  return [...preamble, ...contextSections, ...outputContract].join("\n");
}
1793+
17201794
private async completeStagedLlmRequest(input: {
17211795
prompt: string;
17221796
systemPrompt: string;
@@ -3582,6 +3656,30 @@ function isDeferredExecutionArtifactPath(filePath: string): boolean {
35823656
);
35833657
}
35843658

3659+
/**
 * Reports whether `finalText` contains one of the known filesystem/sandbox
 * blocker phrases, compared case-insensitively.
 *
 * @param finalText Text to scan for blocker phrases.
 * @returns `true` when any blocker phrase occurs in the text.
 */
function shouldFallbackToStagedImplementLlm(finalText: string): boolean {
  const haystack = finalText.toLowerCase();
  // Markers are stored lowercase because the text is lowercased before matching.
  const blockerMarkers = [
    "bwrap: loopback: failed rtm_newaddr: operation not permitted",
    "codex local filesystem action",
    "sandbox startup failure"
  ];
  return blockerMarkers.some((marker) => haystack.includes(marker));
}
3667+
3668+
/**
 * Appends the "Filesystem-blocker recovery mode" instruction section to a prompt.
 *
 * The result is the given prompt, a blank line, the section heading, and one
 * "- "-prefixed line per rule, all joined with newlines.
 *
 * @param prompt Prompt text to extend.
 * @returns The prompt followed by the recovery-mode section.
 */
function appendFilesystemFallbackOverrideToPrompt(prompt: string): string {
  const recoveryRules = [
    "A previous Codex workspace filesystem/tooling blocker has already been detected and handled by AutoLabOS.",
    "Do NOT repeat the blocker narrative, sandbox failure explanation, or any request to retry Codex filesystem actions.",
    "In this staged_llm mode, you must synthesize the implementation directly as structured file_edits.",
    "A valid response must include file_edits for each created or modified text artifact needed for the runnable experiment bundle.",
    "At minimum, emit file_edits for the runnable script and any required config or README referenced by your commands.",
    "If prior attempts failed before materializing files, treat that as resolved context rather than the answer.",
    "If inspection is incomplete, generate the smallest bounded implementation that satisfies the task spec, localization hints, and verification command."
  ];
  const bulletLines = recoveryRules.map((rule) => `- ${rule}`);
  // The empty entry produces the blank line between the prompt and the section.
  return [prompt, "", "Filesystem-blocker recovery mode:", ...bulletLines].join("\n");
}
3682+
35853683
async function publishReusableArtifacts(params: {
35863684
changedFiles: string[];
35873685
artifacts: string[];

src/core/analysis/paperAnalyzer.ts

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,20 @@ export interface PaperAnalysisResult {
7777
rawJson: RawPaperAnalysis;
7878
}
7979

80+
/**
 * Produces a `PaperAnalysisResult` without calling the LLM, using the
 * deterministic abstract-timeout fallback draft.
 *
 * The draft is passed through `normalizePaperAnalysis` and is also returned
 * unchanged as `rawJson`.
 *
 * @param args.paper Corpus row for the paper being analyzed.
 * @param args.source Resolved source for the paper.
 * @param args.failureReason Failure description forwarded to the fallback draft builder.
 * @param args.attempts Attempt count to report; defaults to 1 when null or undefined.
 * @returns The normalized analysis with `attempts` and `rawJson` set.
 */
export function synthesizeDeterministicAbstractFallbackResult(args: {
  paper: AnalysisCorpusRow;
  source: ResolvedPaperSource;
  failureReason: string;
  attempts?: number;
}): PaperAnalysisResult {
  const rawJson = buildDeterministicAbstractTimeoutFallback(args.paper, args.source, args.failureReason);
  const normalized = normalizePaperAnalysis(args.paper, args.source, rawJson);
  const attempts = args.attempts ?? 1;
  // Spread first so the explicit attempts/rawJson values take precedence.
  return { ...normalized, attempts, rawJson };
}
93+
8094
export const ANALYSIS_SYSTEM_PROMPT = [
8195
"You are a scientific literature analyst for AutoLabOS.",
8296
"Return one JSON object only.",
@@ -169,6 +183,17 @@ export async function analyzePaperWithLlm(args: {
169183
args.onProgress?.(
170184
`Analysis attempt ${attempt}/${imageBearingAttemptLimit} failed: ${describeAnalysisAttemptFailureReason(lastError)}`
171185
);
186+
if (shouldSynthesizeAbstractTimeoutFallback(args.source, lastError)) {
187+
args.onProgress?.(
188+
"Abstract-only analysis still timed out. Using a deterministic abstract fallback analysis to preserve a minimal, source-grounded summary."
189+
);
190+
return synthesizeDeterministicAbstractFallbackResult({
191+
paper: args.paper,
192+
source: args.source,
193+
failureReason: lastError.message,
194+
attempts: attempt
195+
});
196+
}
172197
if (isPaperAnalysisTimeoutError(lastError)) {
173198
throw lastError;
174199
}
@@ -608,6 +633,77 @@ function isAbortError(error: unknown): boolean {
608633
return message.includes("aborted") || message.includes("abort");
609634
}
610635

636+
/**
 * Decides whether a failed analysis attempt should be replaced by the
 * deterministic abstract fallback: the source type must be "abstract" and the
 * error must be recognized by `isPaperAnalysisTimeoutError`.
 *
 * @param source Resolved source used for the failed attempt.
 * @param error The error raised by the attempt.
 * @returns `true` only when both conditions hold.
 */
function shouldSynthesizeAbstractTimeoutFallback(source: ResolvedPaperSource, error: unknown): boolean {
  if (source.sourceType !== "abstract") {
    return false;
  }
  return isPaperAnalysisTimeoutError(error);
}
639+
640+
/**
 * Builds a raw analysis draft from a paper's title and abstract only, for use
 * when analysis timed out.
 *
 * The draft carries one summary, at most one key finding, fixed placeholder
 * text for unknown fields, and a single evidence item with confidence 0.3.
 *
 * @param paper Corpus row supplying `abstract` and `title`.
 * @param source Resolved source; its `text` is the second choice for the evidence span.
 * @param failureReason Failure description embedded in the reproducibility note.
 * @returns The synthesized raw analysis draft.
 */
function buildDeterministicAbstractTimeoutFallback(
  paper: AnalysisCorpusRow,
  source: ResolvedPaperSource,
  failureReason: string
): RawPaperAnalysis {
  const abstractText = paper.abstract?.trim() || "";
  const summary = summarizeAbstractForTimeoutFallback(abstractText, paper.title);
  const leadSentence = firstMeaningfulSentence(abstractText);

  // Evidence span prefers the abstract, then the source text, then the title.
  const spanSource = abstractText || source.text || paper.title;
  const evidenceSpan = trimToLength(spanSource, 240);
  const claim = trimToLength(leadSentence || summary, 220);

  const keyFindings: string[] = [];
  if (leadSentence) {
    keyFindings.push(trimToLength(leadSentence, 180));
  }

  const evidenceItem = {
    claim,
    method_slot: "Not specified from abstract-only fallback.",
    result_slot: summary,
    limitation_slot: "Full-text extraction or extraction review did not complete before timeout.",
    dataset_slot: "Not specified.",
    metric_slot: "Not specified.",
    evidence_span: evidenceSpan,
    confidence: 0.3,
    confidence_reason:
      "This item was synthesized from title/abstract only after repeated analysis timeouts, so it should be treated as weak abstract-only evidence."
  };

  return {
    summary,
    key_findings: keyFindings,
    limitations: ["Abstract-only fallback; no verified full-text extraction completed before timeout."],
    datasets: [],
    metrics: [],
    novelty: "Not established from abstract-only fallback evidence.",
    reproducibility_notes: [`Synthesized from title/abstract only after analysis timed out (${failureReason}).`],
    evidence_items: [evidenceItem]
  };
}
676+
677+
/**
 * Produces a short summary (at most 280 UTF-16 code units) for the
 * abstract-only timeout fallback.
 *
 * Uses the first meaningful sentence of the abstract when there is one;
 * otherwise falls back to a fixed sentence that names the paper title.
 *
 * @param abstract Abstract text; may be empty or whitespace-only.
 * @param title Paper title used in the fallback sentence.
 * @returns The summary text, truncated by `trimToLength`.
 */
function summarizeAbstractForTimeoutFallback(abstract: string, title: string): string {
  const sentence = firstMeaningfulSentence(abstract);
  if (sentence) {
    return trimToLength(sentence, 280);
  }
  // firstMeaningfulSentence yields undefined only when the abstract is empty
  // after whitespace normalization, so there is no abstract text worth
  // returning here. A whitespace-only abstract previously leaked through as a
  // blank summary; use the title-based sentence instead.
  return trimToLength(`Abstract-only fallback for "${title}".`, 280);
}
687+
688+
/**
 * Returns the first sentence of `text` that is longer than 24 characters.
 *
 * Whitespace runs are collapsed to single spaces before splitting. Sentences
 * are split at whitespace that follows `.`, `!` or `?`. When no sentence is
 * long enough, the whole normalized text is returned.
 *
 * @param text Text to scan.
 * @returns The chosen sentence, or `undefined` when the text is empty or whitespace-only.
 */
function firstMeaningfulSentence(text: string): string | undefined {
  const collapsed = text.replace(/\s+/g, " ").trim();
  if (collapsed.length === 0) {
    return undefined;
  }
  for (const piece of collapsed.split(/(?<=[.!?])\s+/u)) {
    const candidate = piece.trim();
    if (candidate.length > 24) {
      return candidate;
    }
  }
  // Nothing long enough to count as a sentence: fall back to everything.
  return collapsed;
}
699+
700+
/**
 * Shortens `text` so that it is at most `maxLength` UTF-16 code units long,
 * marking the cut with a trailing ellipsis character.
 *
 * Text that already fits is returned unchanged. Otherwise the text is cut to
 * `maxLength - 1` code units, trailing whitespace is removed, and "…" is
 * appended. The cut never lands between the two halves of a surrogate pair:
 * if it would, the whole pair is dropped so that no lone surrogate is left
 * in the result.
 *
 * @param text Text to shorten.
 * @param maxLength Maximum length in UTF-16 code units, ellipsis included.
 * @returns The original text, or the shortened text ending in "…".
 */
function trimToLength(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let cut = Math.max(0, maxLength - 1);
  if (cut > 0) {
    const lastKept = text.charCodeAt(cut - 1);
    const firstDropped = text.charCodeAt(cut);
    const lastKeptIsHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
    const firstDroppedIsLowSurrogate = firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    // Cutting here would keep half of an astral character (e.g. an emoji);
    // back off one unit so the pair is removed as a whole.
    if (lastKeptIsHighSurrogate && firstDroppedIsLowSurrogate) {
      cut -= 1;
    }
  }
  return `${text.slice(0, cut).trimEnd()}…`;
}
706+
611707
export function buildPaperAnalysisPrompt(
612708
paper: AnalysisCorpusRow,
613709
source: ResolvedPaperSource,
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
/**
 * Checks a metrics object for signs that only a preflight ran.
 *
 * A string `mode` equal to "preflight" (ignoring case and surrounding
 * whitespace) takes precedence; otherwise a string `notes` containing
 * "no training/evaluation executed" (case-insensitive) is matched.
 *
 * @param metrics Metrics object; only `mode` and `notes` are read.
 * @returns A human-readable reason, or `null` when neither signal is present.
 */
export function detectPreflightOnlyMetrics(metrics: Record<string, unknown>): string | null {
  const { mode, notes } = metrics;

  if (typeof mode === "string" && mode.trim().toLowerCase() === "preflight") {
    return "Experiment only emitted preflight metrics; no training or evaluation was executed.";
  }

  if (typeof notes === "string" && /no training\/evaluation executed/i.test(notes.trim())) {
    return "Experiment reported that no training or evaluation was executed.";
  }

  return null;
}

0 commit comments

Comments
 (0)