lhy0718
diff --git a/‎ISSUES.md‎
Lines changed: 197 additions & 42 deletions b/‎ISSUES.md‎
Lines changed: 197 additions & 42 deletions
diff --git a/‎src/core/agents/implementSessionManager.ts‎
Lines changed: 99 additions & 1 deletion b/‎src/core/agents/implementSessionManager.ts‎
Lines changed: 99 additions & 1 deletion
diff --git a/‎src/core/analysis/paperAnalyzer.ts‎
Lines changed: 96 additions & 0 deletions b/‎src/core/analysis/paperAnalyzer.ts‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎src/core/experiments/executedMetrics.ts‎
Lines changed: 11 additions & 0 deletions b/‎src/core/experiments/executedMetrics.ts‎
Lines changed: 11 additions & 0 deletions
@@ -337,7 +337,7 @@ export class ImplementSessionManager {
     const metricsPath = path.join(runDir, "metrics.json");
     const defaultPublicDir = buildPublicExperimentDir(this.deps.workspaceRoot, run);
     const experimentLlmProfile = resolveExperimentLlmProfile(this.deps.config);
-    const useCodexSession =
+    const canUseCodexSession =
       typeof this.deps.codex?.runTurnStream === "function" &&
       this.deps.config?.providers?.llm_mode !== "openai_api" &&
       this.deps.config?.providers?.llm_mode !== "ollama";
@@ -369,6 +369,8 @@ export class ImplementSessionManager {
       longTermMemory,
       environmentSnapshot
     );
+    const useCodexSession =
+      canUseCodexSession && !shouldFallbackToStagedImplementLlm(taskSpec.context.previous_summary || "");
     await writeJsonFile(path.join(runDir, "implement_task_spec.json"), taskSpec);
 
     const queueProgressUpdate = (
@@ -688,6 +690,41 @@ export class ImplementSessionManager {
                 }
               }
             });
+            if (this.deps.llm && shouldFallbackToStagedImplementLlm(result.finalText)) {
+              emitImplementObservation(
+                "codex",
+                "Codex implement turn reported a filesystem tooling blocker; retrying this attempt in staged_llm mode.",
+                {
+                  attempt,
+                  threadId: activeThreadId,
+                  publicDir: defaultPublicDir
+                }
+              );
+              const llmTimeoutMs = getImplementLlmTimeoutMs(this.deps.config);
+              const filesystemFallbackPrompt = this.buildFilesystemFallbackRecoveryPrompt({
+                taskSpec,
+                searchLocalization,
+                branchPlan,
+                attempt
+              });
+              const filesystemFallbackSystemPrompt = appendFilesystemFallbackOverrideToPrompt(attemptSystemPrompt);
+              const completion = await this.completeStagedLlmRequest({
+                prompt: filesystemFallbackPrompt,
+                systemPrompt: filesystemFallbackSystemPrompt,
+                timeoutMs: llmTimeoutMs,
+                abortSignal,
+                attempt,
+                threadId: activeThreadId,
+                publicDir: defaultPublicDir,
+                emitImplementObservation,
+                reasoningEffort: experimentLlmProfile.reasoningEffort
+              });
+              result = {
+                threadId: completion.threadId || activeThreadId,
+                finalText: completion.text,
+                events: []
+              };
+            }
           } else {
             if (!this.deps.llm) {
               throw new Error("implement_experiments is configured for staged_llm mode, but no LLM client is available.");
@@ -1717,6 +1754,43 @@ export class ImplementSessionManager {
     return lines.join("\n");
   }
 
+  private buildFilesystemFallbackRecoveryPrompt(params: {
+    taskSpec: ImplementTaskSpec;
+    searchLocalization: LocalizationResult;
+    branchPlan: BranchPlan;
+    attempt: number;
+  }): string {
+    const sandboxTaskSpec = rewriteWorkspacePathsForSandbox(params.taskSpec, this.deps.workspaceRoot);
+    const sandboxSearchLocalization = rewriteWorkspacePathsForSandbox(params.searchLocalization, this.deps.workspaceRoot);
+    const sandboxBranchPlan = rewriteWorkspacePathsForSandbox(params.branchPlan, this.deps.workspaceRoot);
+    const promptTaskSpec = compactTaskSpecForStagedLlmPrompt(sandboxTaskSpec);
+    const promptSearchLocalization = compactLocalizationForStagedLlmPrompt(sandboxSearchLocalization);
+    const promptBranchPlan = compactBranchPlanForStagedLlmPrompt(sandboxBranchPlan);
+
+    return [
+      `Implementation attempt ${params.attempt}/${MAX_IMPLEMENT_ATTEMPTS} (filesystem-blocker recovery mode).`,
+      "The previous Codex filesystem/tooling blocker has already been detected and handled by AutoLabOS.",
+      "Do NOT repeat the blocker narrative, sandbox explanation, or any request to retry Codex filesystem actions.",
+      "Treat this as a fresh staged_llm implementation task and return ONLY one JSON object.",
+      "A valid response MUST include non-empty file_edits for every created or modified text artifact needed for the runnable experiment bundle.",
+      "At minimum, emit file_edits for the runnable script and any required config or README referenced by your commands.",
+      "If inspection is incomplete, synthesize the smallest bounded implementation that satisfies the locked task spec, branch focus, and localization hints.",
+      "Task spec:",
+      JSON.stringify(promptTaskSpec, null, 2),
+      "",
+      "Search-backed localization hints:",
+      JSON.stringify(promptSearchLocalization, null, 2),
+      "",
+      "Branch focus:",
+      JSON.stringify(promptBranchPlan, null, 2),
+      "",
+      "Output contract reminder:",
+      "- Return ONLY one JSON object with keys: summary, experiment_mode, run_command, test_command, working_dir, changed_files, artifacts, public_dir, public_artifacts, script_path, metrics_path, localization, assumptions, file_edits.",
+      "- file_edits must contain full UTF-8 contents for each referenced file.",
+      "- Responses that only describe the blocker or omit file_edits are invalid."
+    ].join("\n");
+  }
+
   private async completeStagedLlmRequest(input: {
     prompt: string;
     systemPrompt: string;
@@ -3582,6 +3656,30 @@ function isDeferredExecutionArtifactPath(filePath: string): boolean {
   );
 }
 
/**
 * Reports whether the given text contains one of the known blocker markers,
 * in which case the caller switches the implement attempt to staged_llm mode.
 *
 * Matching is a case-insensitive substring search.
 *
 * @param finalText Text to scan (e.g. a turn's final text or a previous summary).
 * @returns `true` when any marker is present, otherwise `false`.
 */
function shouldFallbackToStagedImplementLlm(finalText: string): boolean {
  // Markers are stored lowercased because the haystack is lowercased before matching.
  const blockerMarkers = [
    "bwrap: loopback: failed rtm_newaddr: operation not permitted",
    "codex local filesystem action",
    "sandbox startup failure"
  ];
  const haystack = finalText.toLowerCase();
  return blockerMarkers.some((marker) => haystack.includes(marker));
}
+
/**
 * Returns the given system prompt followed by a blank line and the
 * "Filesystem-blocker recovery mode" override section.
 *
 * @param prompt Base system prompt; it is kept verbatim as the first part of the result.
 * @returns The prompt with the override section appended, joined by newlines.
 */
function appendFilesystemFallbackOverrideToPrompt(prompt: string): string {
  const overrideSection = [
    "Filesystem-blocker recovery mode:",
    "- A previous Codex workspace filesystem/tooling blocker has already been detected and handled by AutoLabOS.",
    "- Do NOT repeat the blocker narrative, sandbox failure explanation, or any request to retry Codex filesystem actions.",
    "- In this staged_llm mode, you must synthesize the implementation directly as structured file_edits.",
    "- A valid response must include file_edits for each created or modified text artifact needed for the runnable experiment bundle.",
    "- At minimum, emit file_edits for the runnable script and any required config or README referenced by your commands.",
    "- If prior attempts failed before materializing files, treat that as resolved context rather than the answer.",
    "- If inspection is incomplete, generate the smallest bounded implementation that satisfies the task spec, localization hints, and verification command."
  ];
  // The empty element yields the blank line separating the base prompt from the override.
  return [prompt, "", ...overrideSection].join("\n");
}
+
 async function publishReusableArtifacts(params: {
   changedFiles: string[];
   artifacts: string[];
 
@@ -77,6 +77,20 @@ export interface PaperAnalysisResult {
   rawJson: RawPaperAnalysis;
 }
 
/**
 * Builds a `PaperAnalysisResult` from the deterministic abstract fallback draft.
 *
 * The draft comes from `buildDeterministicAbstractTimeoutFallback`, is passed
 * through `normalizePaperAnalysis`, and is also returned unchanged as `rawJson`.
 *
 * @param args.paper Corpus row of the paper being analyzed.
 * @param args.source Resolved source the analysis was attempted on.
 * @param args.failureReason Failure description embedded in the fallback draft.
 * @param args.attempts Number of attempts to report; defaults to 1 when omitted.
 * @returns The normalized fallback analysis with `attempts` and `rawJson` set.
 */
export function synthesizeDeterministicAbstractFallbackResult(args: {
  paper: AnalysisCorpusRow;
  source: ResolvedPaperSource;
  failureReason: string;
  attempts?: number;
}): PaperAnalysisResult {
  const { paper, source, failureReason, attempts } = args;
  const rawFallback = buildDeterministicAbstractTimeoutFallback(paper, source, failureReason);
  const normalized = normalizePaperAnalysis(paper, source, rawFallback);
  // Spread first so the explicit attempts/rawJson values win over anything normalization produced.
  return { ...normalized, attempts: attempts ?? 1, rawJson: rawFallback };
}
+
 export const ANALYSIS_SYSTEM_PROMPT = [
   "You are a scientific literature analyst for AutoLabOS.",
   "Return one JSON object only.",
@@ -169,6 +183,17 @@ export async function analyzePaperWithLlm(args: {
       args.onProgress?.(
         `Analysis attempt ${attempt}/${imageBearingAttemptLimit} failed: ${describeAnalysisAttemptFailureReason(lastError)}`
       );
+      if (shouldSynthesizeAbstractTimeoutFallback(args.source, lastError)) {
+        args.onProgress?.(
+          "Abstract-only analysis still timed out. Using a deterministic abstract fallback analysis to preserve a minimal, source-grounded summary."
+        );
+        return synthesizeDeterministicAbstractFallbackResult({
+          paper: args.paper,
+          source: args.source,
+          failureReason: lastError.message,
+          attempts: attempt
+        });
+      }
       if (isPaperAnalysisTimeoutError(lastError)) {
         throw lastError;
       }
@@ -608,6 +633,77 @@ function isAbortError(error: unknown): boolean {
   return message.includes("aborted") || message.includes("abort");
 }
 
/**
 * Decides whether a failed analysis attempt should be replaced by the
 * deterministic abstract fallback.
 *
 * @param source Resolved source the attempt ran against.
 * @param error Error raised by the attempt.
 * @returns `true` only when the source type is "abstract" and
 *   `isPaperAnalysisTimeoutError` accepts the error.
 */
function shouldSynthesizeAbstractTimeoutFallback(source: ResolvedPaperSource, error: unknown): boolean {
  if (source.sourceType !== "abstract") {
    return false;
  }
  return isPaperAnalysisTimeoutError(error);
}
+
/**
 * Builds a raw analysis draft from the paper's title and abstract only,
 * for use when the analysis itself timed out.
 *
 * The draft carries a single evidence item with confidence 0.3, empty
 * `datasets`/`metrics`, and notes that record the failure reason.
 *
 * @param paper Corpus row; `abstract` and `title` are read.
 * @param source Resolved source; `text` is used for the evidence span when the abstract is empty.
 * @param failureReason Failure description embedded in `reproducibility_notes`.
 * @returns The synthesized raw analysis object.
 */
function buildDeterministicAbstractTimeoutFallback(
  paper: AnalysisCorpusRow,
  source: ResolvedPaperSource,
  failureReason: string
): RawPaperAnalysis {
  const abstractText = paper.abstract?.trim() || "";
  const summaryText = summarizeAbstractForTimeoutFallback(abstractText, paper.title);
  const leadSentence = firstMeaningfulSentence(abstractText);
  // Without a usable sentence there is nothing to report as a finding.
  const keyFindings = leadSentence ? [trimToLength(leadSentence, 180)] : [];

  return {
    summary: summaryText,
    key_findings: keyFindings,
    limitations: ["Abstract-only fallback; no verified full-text extraction completed before timeout."],
    datasets: [],
    metrics: [],
    novelty: "Not established from abstract-only fallback evidence.",
    reproducibility_notes: [
      `Synthesized from title/abstract only after analysis timed out (${failureReason}).`
    ],
    evidence_items: [
      {
        // Prefer the lead sentence as the claim; otherwise reuse the summary.
        claim: trimToLength(leadSentence || summaryText, 220),
        method_slot: "Not specified from abstract-only fallback.",
        result_slot: summaryText,
        limitation_slot: "Full-text extraction or extraction review did not complete before timeout.",
        dataset_slot: "Not specified.",
        metric_slot: "Not specified.",
        // Span source preference: abstract, then the resolved source text, then the title.
        evidence_span: trimToLength(abstractText || source.text || paper.title, 240),
        confidence: 0.3,
        confidence_reason:
          "This item was synthesized from title/abstract only after repeated analysis timeouts, so it should be treated as weak abstract-only evidence."
      }
    ]
  };
}
+
/**
 * Produces a summary of at most 280 characters for the abstract fallback.
 *
 * Preference order: the first meaningful sentence of the abstract, then the
 * trimmed abstract itself, then a placeholder that names the paper title.
 *
 * A whitespace-only abstract is treated as empty, so it yields the title
 * placeholder rather than a blank summary.
 *
 * @param abstract Abstract text; may be empty or whitespace-only.
 * @param title Paper title used in the placeholder summary.
 * @returns The summary, truncated via `trimToLength`.
 */
function summarizeAbstractForTimeoutFallback(abstract: string, title: string): string {
  // Trim before the truthiness check: "   " is truthy but carries no content.
  const candidate = firstMeaningfulSentence(abstract) || abstract.trim();
  const summary = candidate || `Abstract-only fallback for "${title}".`;
  return trimToLength(summary, 280);
}
+
/**
 * Returns the first sentence of `text` that is longer than 24 characters.
 *
 * Whitespace runs are collapsed to single spaces first. Sentences are split
 * after `.`, `!` or `?` followed by whitespace, except when:
 *   - the terminator belongs to a common abbreviation ("et al.", "e.g.",
 *     "i.e.", "vs.", "cf.", "Fig.", "Eq.", "Sec.", "Ref."), or
 *   - the next character is a lowercase ASCII letter, which indicates the
 *     sentence is continuing.
 * Without these guards, "Smith et al. propose a method ..." was split after
 * "al.", the short head was filtered out, and a headless fragment was returned.
 *
 * @param text Arbitrary text, typically a paper abstract.
 * @returns The first sufficiently long sentence; the whole normalized text
 *   when no sentence is long enough; `undefined` for empty or blank input.
 */
function firstMeaningfulSentence(text: string): string | undefined {
  const normalized = text.replace(/\s+/g, " ").trim();
  if (!normalized) {
    return undefined;
  }
  // Boundary = terminator, not part of a known abbreviation, then whitespace
  // that is not followed by a lowercase continuation.
  const sentenceBoundary =
    /(?<=[.!?])(?<!\b(?:et al|e\.g|i\.e|vs|cf|[Ff]igs?|[Ee]qs?|[Ss]ec|[Rr]efs?)\.)\s+(?![a-z])/u;
  const firstLongSentence = normalized
    .split(sentenceBoundary)
    .map((sentence) => sentence.trim())
    .find((sentence) => sentence.length > 24);
  return firstLongSentence || normalized;
}
+
/**
 * Truncates `text` to at most `maxLength` UTF-16 code units, ending a
 * truncated result with a single "…" character.
 *
 * Text that already fits is returned unchanged. Trailing whitespace before
 * the ellipsis is removed. The cut never lands inside a surrogate pair, so
 * the result cannot contain a lone high surrogate. A non-positive `maxLength`
 * yields an empty string, because even the ellipsis would exceed the limit.
 *
 * @param text Text to truncate.
 * @param maxLength Maximum length of the result in UTF-16 code units.
 * @returns The original text, or a shortened prefix followed by "…".
 */
function trimToLength(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  if (maxLength <= 0) {
    return "";
  }
  // Reserve one code unit for the ellipsis.
  let cut = Math.floor(maxLength) - 1;
  if (cut > 0) {
    const lastKept = text.charCodeAt(cut - 1);
    // A high surrogate (0xD800–0xDBFF) at the cut edge means its pair was split; drop it too.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
      cut -= 1;
    }
  }
  return `${text.slice(0, cut).trimEnd()}…`;
}
+
 export function buildPaperAnalysisPrompt(
   paper: AnalysisCorpusRow,
   source: ResolvedPaperSource,
 
@@ -0,0 +1,11 @@
/**
 * Inspects a metrics object for signs that no training or evaluation was run.
 *
 * @param metrics Parsed metrics; only string-valued `mode` and `notes` are considered.
 * @returns A message when `mode` equals "preflight" (trimmed, case-insensitive)
 *   or when `notes` contains "no training/evaluation executed" (case-insensitive);
 *   otherwise `null`.
 */
export function detectPreflightOnlyMetrics(metrics: Record<string, unknown>): string | null {
  // Non-string values are treated as absent.
  const asTrimmedText = (value: unknown): string => (typeof value === "string" ? value.trim() : "");
  const normalizedMode = asTrimmedText(metrics.mode).toLowerCase();
  const noteText = asTrimmedText(metrics.notes);

  // The mode check takes precedence over the notes check.
  if (normalizedMode === "preflight") {
    return "Experiment only emitted preflight metrics; no training or evaluation was executed.";
  }
  return /no training\/evaluation executed/i.test(noteText)
    ? "Experiment reported that no training or evaluation was executed."
    : null;
}