|
| 1 | +#!/usr/bin/env node |
| 2 | + |
| 3 | +/** |
| 4 | + * Scans files for hidden Unicode characters commonly used in: |
| 5 | + * - LLM prompt injection (tag characters encoding invisible instructions) |
| 6 | + * - Trojan Source attacks (BiDi overrides making code render differently than it executes) |
| 7 | + * - Supply-chain attacks via invisible payloads in source/config/docs |
| 8 | + * |
| 9 | + * Also detects markdown-specific side-channels (HTML comments, hidden reference |
| 10 | + * links) that are invisible when rendered but readable by AI tools. |
| 11 | + * |
| 12 | + * Usage: |
| 13 | + * node scripts/scan-hidden-chars.mjs [file...] |
| 14 | + * With no arguments, scans all tracked text files in the repo. |
| 15 | + */ |
| 16 | + |
| 17 | +import { readFileSync } from "node:fs"; |
| 18 | +import { execSync } from "node:child_process"; |
| 19 | + |
| 20 | + |
| 21 | +// --------------------------------------------------------------------------- |
| 22 | +// Invisible / rendering-manipulating Unicode characters. |
| 23 | +// Organized by attack vector. Each entry: [description, regex] |
| 24 | +// --------------------------------------------------------------------------- |
/**
 * Named detectors for invisible or rendering-altering characters.
 *
 * Each entry is a `[label, regex]` pair. Every regex carries the `g` flag so a
 * caller can enumerate all occurrences within a line; the order of entries is
 * the order in which findings are reported for a given line.
 */
const PATTERNS = [
  // Prompt-injection carriers -------------------------------------------------
  // U+E0000–U+E007F ("tag" block): each code point mirrors an ASCII character,
  // so a whole hidden prompt can be spelled out without rendering anything.
  ["Tag character", /[\u{E0000}-\u{E007F}]/gu],
  // U+E0100–U+E01EF: supplementary variation selectors, usable the same way.
  ["Variation selector supplement", /[\u{E0100}-\u{E01EF}]/gu],

  // Trojan Source carriers ----------------------------------------------------
  // Bidirectional marks, embeddings, overrides and isolates: they reorder how
  // text is displayed without changing what the compiler/interpreter reads.
  ["BiDi control character", /[\u061C\u200E\u200F\u202A-\u202E\u2066-\u2069]/g],

  // Zero-width characters -----------------------------------------------------
  // Can sit between visible characters or encode bits by presence/absence.
  ["Zero-width space", /\u200B/g],
  ["Zero-width non-joiner", /\u200C/g],
  ["Zero-width joiner", /\u200D/g],
  ["Zero-width no-break space", /\uFEFF/g],
  ["Word joiner / invisible formatting", /[\u2060-\u2064]/g],

  // Miscellaneous invisibles --------------------------------------------------
  // Characters that typically render as nothing yet can carry a payload.
  ["Soft hyphen", /\u00AD/g],
  ["Combining grapheme joiner", /\u034F/g],
  ["Mongolian vowel separator", /\u180E/g],
  ["Hangul filler", /[\u115F\u1160\u3164\uFFA0]/g],
  ["Khmer inherent vowel", /[\u17B4\u17B5]/g],
  ["Line/paragraph separator", /[\u2028\u2029]/g],
  ["Interlinear annotation", /[\uFFF9-\uFFFB]/g],
];
| 56 | + |
| 57 | +// --------------------------------------------------------------------------- |
| 58 | +// Markdown side-channel patterns. |
| 59 | +// These are invisible when rendered (GitHub, VS Code preview) but parsed as |
| 60 | +// raw text by AI tools that read repo context (Cursor, Copilot, etc.). |
| 61 | +// --------------------------------------------------------------------------- |
// File extensions (lower-case, including the dot) treated as Markdown.
const MD_EXTENSIONS = new Set([".md", ".mdx", ".markdown"]);

// HTML comments whose trimmed body is longer than this many characters are
// reported. Legitimate pragmas such as <!-- prettier-ignore --> are short;
// long comments can hide prompt injections that never show up in rendered
// markdown.
const HTML_COMMENT_LENGTH_THRESHOLD = 80;

// Matches <!-- ... --> (lazily, across newlines). Capture group 1 is the
// comment body. Carries the `g` flag so every comment in a file can be
// enumerated.
const HTML_COMMENT_RE = /<!--([\s\S]*?)-->/g;

// Link reference definitions abused as markdown "comments":
//   [//]: # (hidden text here)
//   [//]: # "hidden text here"
//   [//]: # 'hidden text here'
// All three title delimiters are valid CommonMark, and a definition may be
// indented by up to three spaces (four or more makes it an indented code
// block, which IS rendered). Each form renders as nothing but is still read
// by tools that consume the raw file. Capture group 1 is the hidden text.
const HIDDEN_REF_LINK_RE = /^ {0,3}\[\/\/\]: #\s*[("'](.*?)[)"']\s*$/;
| 77 | + |
| 78 | +// --------------------------------------------------------------------------- |
| 79 | +// Scanning |
| 80 | +// --------------------------------------------------------------------------- |
// Running count of findings across every file scanned so far.
let totalFindings = 0;

/**
 * Reports whether a path should receive the markdown-specific checks.
 *
 * The extension is taken as everything from the last "." in the path and is
 * compared case-insensitively against MD_EXTENSIONS.
 *
 * @param {string} filepath - Path of the file being scanned.
 * @returns {boolean} True when the path's extension is a known Markdown one.
 */
function isMarkdown(filepath) {
  const lastDot = filepath.lastIndexOf(".");
  const suffix = filepath.slice(lastDot).toLowerCase();
  return MD_EXTENSIONS.has(suffix);
}
| 87 | + |
/**
 * Formats the first code point of `text` as upper-case hex, zero-padded to at
 * least four digits (e.g. "200B", "E0041").
 */
function toHexCodePoint(text) {
  return text.codePointAt(0).toString(16).toUpperCase().padStart(4, "0");
}

/**
 * True for a U+FEFF sitting at the very first position of the file, where it
 * is a legitimate byte-order mark rather than a hidden character.
 */
function isLeadingBom(codePoint, lineIdx, index) {
  return codePoint === "FEFF" && lineIdx === 0 && index === 0;
}

/**
 * Scans one file and prints each finding to stderr as
 * `path:line:col — description (U+XXXX)`.
 *
 * Every file gets the invisible-character scan (the named PATTERNS first, then
 * a catch-all for the Unicode "Format" category). Files that isMarkdown()
 * accepts additionally get the markdown side-channel checks. The number of
 * findings is added to the module-level `totalFindings` counter.
 *
 * A file that cannot be read is skipped with a warning on stderr; it does not
 * count as a finding and does not abort the run.
 *
 * @param {string} filepath - Path of the file to scan.
 * @returns {void}
 */
function scanFile(filepath) {
  let content;
  try {
    content = readFileSync(filepath, "utf-8");
  } catch (err) {
    // Stay best-effort (e.g. tracked files deleted from the working tree), but
    // never silently: a skipped file must not look like a clean one.
    console.error(
      ` ${filepath} — skipped, could not read file (${err.code ?? err.message})`,
    );
    return;
  }

  const lines = content.split("\n");
  const fileFindings = [];

  // --- Invisible Unicode character scan (all files) ---
  for (const [lineIdx, line] of lines.entries()) {
    // Offsets already reported by a named pattern, so the catch-all below
    // does not report the same character twice.
    const reported = new Set();

    // Named patterns first — gives descriptive output for known attack vectors.
    for (const [name, pattern] of PATTERNS) {
      // matchAll starts from the regex's current lastIndex; the patterns are
      // shared module-level objects, so make sure each line starts at 0.
      pattern.lastIndex = 0;
      for (const match of line.matchAll(pattern)) {
        const codePoint = toHexCodePoint(match[0]);
        if (isLeadingBom(codePoint, lineIdx, match.index)) continue;

        reported.add(match.index);
        fileFindings.push({
          line: lineIdx + 1,
          col: match.index + 1,
          name,
          codePoint,
        });
      }
    }

    // Catch-all: \p{Cf} covers the entire Unicode "Format" category, so any
    // format character not matched by a named pattern is still reported.
    for (const match of line.matchAll(/\p{Cf}/gu)) {
      if (reported.has(match.index)) continue;
      const codePoint = toHexCodePoint(match[0]);
      if (isLeadingBom(codePoint, lineIdx, match.index)) continue;

      fileFindings.push({
        line: lineIdx + 1,
        col: match.index + 1,
        name: "Unicode format character",
        codePoint,
      });
    }
  }

  // --- Markdown side-channel scan (Markdown files only, per isMarkdown) ---
  if (isMarkdown(filepath)) {
    // Long HTML comments (potential hidden instructions). Matched against the
    // whole file because a comment may span several lines.
    HTML_COMMENT_RE.lastIndex = 0;
    for (const commentMatch of content.matchAll(HTML_COMMENT_RE)) {
      const body = commentMatch[1].trim();
      if (body.length <= HTML_COMMENT_LENGTH_THRESHOLD) continue;

      // Derive line/column of the comment start from the text preceding it.
      const upToMatch = content.slice(0, commentMatch.index);
      fileFindings.push({
        line: upToMatch.split("\n").length,
        // lastIndexOf is -1 on the first line, which yields index + 1.
        col: commentMatch.index - upToMatch.lastIndexOf("\n"),
        name: `Long HTML comment (${body.length} chars) — may hide prompt injection`,
      });
    }

    // Hidden reference-link "comments" with a non-empty body.
    for (const [lineIdx, line] of lines.entries()) {
      const refMatch = line.match(HIDDEN_REF_LINK_RE);
      if (refMatch && refMatch[1].length > 0) {
        fileFindings.push({
          line: lineIdx + 1,
          col: 1,
          name: "Hidden reference link — invisible when rendered",
        });
      }
    }
  }

  totalFindings += fileFindings.length;
  for (const f of fileFindings) {
    // Markdown side-channel findings carry no code point.
    const suffix = f.codePoint ? ` (U+${f.codePoint})` : "";
    console.error(` ${filepath}:${f.line}:${f.col} — ${f.name}${suffix}`);
  }
}
| 191 | + |
| 192 | +// Determine which files to scan. |
// Files to scan: every non-flag command-line argument, or — when none are
// given — all tracked files with one of the default text extensions.
let files = process.argv.slice(2).filter((arg) => !arg.startsWith("-"));

if (files.length === 0) {
  // Includes every extension isMarkdown() recognizes, so the markdown checks
  // also run on .mdx / .markdown files in a default scan.
  const extensions = "ts js mjs cjs json md mdx markdown yml yaml"
    .split(" ")
    .map((ext) => `'*.${ext}'`)
    .join(" ");
  // -z makes git emit NUL-terminated, verbatim paths. Without it, paths with
  // non-ASCII or special characters are C-quoted ("caf\303\251.md"), the
  // quoted name cannot be opened, and the file would escape the scan.
  const tracked = execSync(`git ls-files -z -- ${extensions}`, {
    encoding: "utf-8",
  });
  files = tracked.split("\0").filter(Boolean);
}

for (const file of files) {
  scanFile(file);
}

// Any finding fails the run so the script can gate commits / CI.
if (totalFindings > 0) {
  console.error(
    `\n Found ${totalFindings} hidden character(s) that may indicate prompt injection or Trojan Source attacks.`,
  );
  process.exit(1);
}
0 commit comments