fix: harden LLM guardrails + extract shared scan helper

TPEmist · TPEmist · commit 336ad1df1553 · 2026-04-09T15:56:04.000-07:00
- XML entity escaping for user-controlled values in LLM prompt
- Strict boolean validation of the LLM response's "approved" field (must be exactly True; no truthy coercion)
- Unicode NFKC normalization of the reasoning text before keyword and injection-pattern detection
- Regex word boundaries on injection patterns to reduce false positives
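- Extract _scan_and_validate() helper to deduplicate security scan logic in request_virtual_card and request_purchaser_info (payment rejections now use the shared message wording)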
diff --git a/pop_pay/engine/guardrails.py b/pop_pay/engine/guardrails.py
@@ -1,5 +1,6 @@
 import re
 import os
+import unicodedata
 from urllib.parse import urlparse
 from pop_pay.core.models import PaymentIntent, GuardrailPolicy
 
@@ -62,7 +63,7 @@ async def evaluate_intent(self, intent: PaymentIntent, policy: GuardrailPolicy)
 
         # Rule 2: Hallucination/Loop detection
         if policy.block_hallucination_loops:
-            reasoning_lower = intent.reasoning.lower()
+            reasoning_lower = unicodedata.normalize("NFKC", intent.reasoning).lower()
             loop_keywords = ["retry", "failed again", "loop", "ignore previous", "stuck"]
 
             for keyword in loop_keywords:
@@ -72,11 +73,11 @@ async def evaluate_intent(self, intent: PaymentIntent, policy: GuardrailPolicy)
             # Rule 3: Injection pattern detection
             injection_patterns = [
                 r'\{.*".*".*:',                         # JSON-like structure
-                r'output\s*:',                           # "output:" pattern
-                r'you are now',                          # role injection
-                r'ignore (all |previous |your |the )',   # instruction override
-                r'already (approved|authorized|confirmed)',  # false pre-approval
-                r'system (says|has|override)',            # system impersonation
+                r'\boutput\s*:',                          # "output:" pattern
+                r'\byou are now\b',                      # role injection
+                r'\bignore (all |previous |your |the )', # instruction override
+                r'\balready (approved|authorized|confirmed)\b',  # false pre-approval
+                r'\bsystem (says|has|override)\b',       # system impersonation
             ]
             for pattern in injection_patterns:
                 if re.search(pattern, reasoning_lower):
diff --git a/pop_pay/engine/llm_guardrails.py b/pop_pay/engine/llm_guardrails.py
@@ -1,8 +1,13 @@
 import json
+from html import escape as _html_escape
 from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
 from pop_pay.core.models import PaymentIntent, GuardrailPolicy
 from pop_pay.engine.guardrails import GuardrailEngine
 
+
+def _escape_xml(s: str) -> str:
+    return _html_escape(s, quote=True)
+
 # Exceptions that warrant a retry (rate limits, transient server errors).
 # Defined at module level so the @retry decorator can reference them before
 # openai is imported — the actual classes are resolved lazily inside the engine.
@@ -34,10 +39,10 @@ async def evaluate_intent(self, intent: PaymentIntent, policy: GuardrailPolicy)
         prompt = f"""Evaluate the following agent payment intent and determine if it should be approved.
 
 <payment_request>
-  <vendor>{intent.target_vendor}</vendor>
+  <vendor>{_escape_xml(intent.target_vendor)}</vendor>
   <amount>{intent.requested_amount}</amount>
-  <allowed_categories>{policy.allowed_categories}</allowed_categories>
-  <agent_reasoning>{intent.reasoning}</agent_reasoning>
+  <allowed_categories>{_escape_xml(str(policy.allowed_categories))}</allowed_categories>
+  <agent_reasoning>{_escape_xml(intent.reasoning)}</agent_reasoning>
 </payment_request>
 
 Rules:
@@ -61,7 +66,8 @@ async def evaluate_intent(self, intent: PaymentIntent, policy: GuardrailPolicy)
             response = await self.client.chat.completions.create(**kwargs)
             result_text = response.choices[0].message.content
             result = json.loads(result_text)
-            return result.get("approved", False), result.get("reason", "Unknown")
+            approved = result.get("approved", False) is True
+            return approved, result.get("reason", "Unknown")
         except self._openai.APIStatusError as e:
             # Re-raise retriable status codes (rate limit, server errors) so
             # tenacity's @retry decorator can back off and retry.
diff --git a/pop_pay/mcp_server.py b/pop_pay/mcp_server.py
@@ -267,6 +267,45 @@ async def _request_human_approval(
     return True, "auto-approved (no approval webhook configured)"
 
 
+async def _scan_and_validate(page_url: str, action_label: str = "Payment") -> tuple[bool, str]:
+    """Run security scan on page_url. Returns (ok, message).
+
+    ok=True means scan passed (or was skipped). ok=False means the caller
+    should return `message` immediately as a rejection.
+    """
+    if not page_url:
+        return True, f" (security scan skipped — no page_url provided)"
+
+    # Check cache first (reuse recent scan within 5 minutes)
+    cached = snapshot_cache.get(page_url)
+    if cached and datetime.now() - cached["timestamp"] < timedelta(minutes=5):
+        scan_result = {
+            "flags": cached["flags"],
+            "snapshot_id": cached["snapshot_id"],
+            "safe": "hidden_instructions_detected" not in cached["flags"],
+            "error": None,
+        }
+    else:
+        scan_result = await _scan_page(page_url)
+
+    if scan_result.get("error"):
+        return False, (
+            f"{action_label} rejected. Security scan failed: {scan_result['error']} "
+            f"Snapshot ID: {scan_result['snapshot_id']}. "
+            f"Fix the URL or skip page_url if the page has no associated URL."
+        )
+
+    if not scan_result["safe"]:
+        return False, (
+            f"{action_label} rejected. Security scan detected hidden prompt injection. "
+            f"Snapshot ID: {scan_result['snapshot_id']}. "
+            f"Flags: {scan_result['flags']}. "
+            f"Do not retry this."
+        )
+
+    return True, ""
+
+
 # ---------------------------------------------------------------------------
 # MCP Tools
 # ---------------------------------------------------------------------------
@@ -303,37 +342,10 @@ async def request_virtual_card(
     # -------------------------------------------------------------------
     # P1: Automatic security scan (runs whenever page_url is provided)
     # -------------------------------------------------------------------
-    scan_note = ""
-    if page_url:
-        # Check cache first (reuse recent scan within 5 minutes)
-        cached = snapshot_cache.get(page_url)
-        if cached and datetime.now() - cached["timestamp"] < timedelta(minutes=5):
-            scan_result = {
-                "flags": cached["flags"],
-                "snapshot_id": cached["snapshot_id"],
-                "safe": "hidden_instructions_detected" not in cached["flags"],
-                "error": None,
-            }
-        else:
-            scan_result = await _scan_page(page_url)
-
-        if scan_result.get("error"):
-            # Network/URL error — treat as unsafe; do not issue card
-            return (
-                f"Payment rejected. Security scan failed: {scan_result['error']} "
-                f"Snapshot ID: {scan_result['snapshot_id']}. "
-                f"Fix the URL or skip page_url if the checkout has no associated URL."
-            )
-
-        if not scan_result["safe"]:
-            return (
-                f"Payment rejected. Security scan detected hidden prompt injection. "
-                f"Snapshot ID: {scan_result['snapshot_id']}. "
-                f"Flags: {scan_result['flags']}. "
-                f"Do not retry this payment."
-            )
-    else:
-        scan_note = " (security scan skipped — no page_url provided)"
+    scan_ok, scan_msg = await _scan_and_validate(page_url, action_label="Payment")
+    if not scan_ok:
+        return scan_msg
+    scan_note = scan_msg  # empty string when scan passed, skip note when no page_url
 
     # Human approval gate (if POP_APPROVAL_WEBHOOK is configured)
     require_approval = os.getenv("POP_REQUIRE_HUMAN_APPROVAL", "false").lower() == "true"
@@ -503,34 +515,9 @@ async def request_purchaser_info(
     # -------------------------------------------------------------------
     # P1: Automatic security scan (runs whenever page_url is provided)
     # -------------------------------------------------------------------
-    if page_url:
-        # Check cache first (reuse recent scan within 5 minutes)
-        cached = snapshot_cache.get(page_url)
-        if cached and datetime.now() - cached["timestamp"] < timedelta(minutes=5):
-            scan_result = {
-                "flags": cached["flags"],
-                "snapshot_id": cached["snapshot_id"],
-                "safe": "hidden_instructions_detected" not in cached["flags"],
-                "error": None,
-            }
-        else:
-            scan_result = await _scan_page(page_url)
-
-        if scan_result.get("error"):
-            # Network/URL error — treat as unsafe; do not inject info
-            return (
-                f"Billing info rejected. Security scan failed: {scan_result['error']} "
-                f"Snapshot ID: {scan_result['snapshot_id']}. "
-                f"Fix the URL or skip page_url if the page has no associated URL."
-            )
-
-        if not scan_result["safe"]:
-            return (
-                f"Billing info rejected. Security scan detected hidden prompt injection. "
-                f"Snapshot ID: {scan_result['snapshot_id']}. "
-                f"Flags: {scan_result['flags']}. "
-                f"Do not retry this."
-            )
+    scan_ok, scan_msg = await _scan_and_validate(page_url, action_label="Billing info")
+    if not scan_ok:
+        return scan_msg
 
     if injector is None:
         return (