Skip to content

Commit 966d0f7

Browse files
authored
chore: add character scanner (#336)
* feat: add hidden Unicode character scanner for prompt injection defense Scans all tracked text files for invisible characters used in LLM prompt injection, Trojan Source, and supply-chain attacks. Covers tag characters, BiDi controls, zero-width chars, and all Unicode Cf format characters. Also detects markdown side-channels (long HTML comments, hidden reference links). Runs as a pre-commit hook (Lefthook) and CI step on PRs. * fix: resolve ESLint errors in hidden char scanner Add Node globals for .mjs scripts in ESLint config. Remove unnecessary regex escapes in reference link pattern.
1 parent cdb6e5c commit 966d0f7

4 files changed

Lines changed: 229 additions & 0 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ jobs:
1313
- uses: actions/checkout@v4
1414
- uses: ./.github/actions/setup
1515

16+
- name: Scan for hidden characters
17+
run: node scripts/scan-hidden-chars.mjs
18+
1619
- name: Check formatting
1720
run: pnpm prettier --check "src/**/*.ts"
1821

eslint.config.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ export default [
4141
},
4242
},
4343
},
44+
{
45+
files: ["scripts/**/*.mjs"],
46+
languageOptions: {
47+
globals: {
48+
...globals.node,
49+
},
50+
},
51+
},
4452
{
4553
ignores: ["dist/**", "node_modules/**"],
4654
},

lefthook.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@ pre-commit:
1010
type-check:
1111
glob: "*.{ts,js}"
1212
run: pnpm type-check
13+
scan-hidden-chars:
14+
glob: "*.{ts,js,mjs,cjs,json,md,yml,yaml}"
15+
run: node scripts/scan-hidden-chars.mjs {staged_files}

scripts/scan-hidden-chars.mjs

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* Scans files for hidden Unicode characters commonly used in:
5+
* - LLM prompt injection (tag characters encoding invisible instructions)
6+
* - Trojan Source attacks (BiDi overrides making code render differently than it executes)
7+
* - Supply-chain attacks via invisible payloads in source/config/docs
8+
*
9+
* Also detects markdown-specific side-channels (HTML comments, hidden reference
10+
* links) that are invisible when rendered but readable by AI tools.
11+
*
12+
* Usage:
13+
* node scripts/scan-hidden-chars.mjs [file...]
14+
* With no arguments, scans all tracked text files in the repo.
15+
*/
16+
17+
import { readFileSync } from "node:fs";
18+
import { execSync } from "node:child_process";
19+
20+
21+
// ---------------------------------------------------------------------------
22+
// Invisible / rendering-manipulating Unicode characters.
23+
// Organized by attack vector. Each entry: [description, regex]
24+
// ---------------------------------------------------------------------------
25+
// Table of named detectors; scanFile runs every entry, in this order, against
// each line of a file and reports the entry's description with the code point.
// Every regex carries the `g` flag so exec() can walk all matches in a line;
// scanFile resets `lastIndex` before each use because these regex objects are
// shared across calls. The `u` flag on the first two entries is what makes the
// `\u{...}` code point escapes valid.
// Format characters (\p{Cf}) that have no entry here are still reported by the
// catch-all in scanFile, under a generic name.
const PATTERNS = [
26+
// --- Primary prompt injection vector ---
27+
// Tag characters: U+E0000–E007F. Used in 2025-2026 attacks to encode entire
28+
// hidden prompts (each tag char maps to an ASCII char) invisible to reviewers.
29+
["Tag character", /[\u{E0000}-\u{E007F}]/gu],
30+
// Variation Selectors Supplement: U+E0100–E01EF. Similar encoding potential.
31+
["Variation selector supplement", /[\u{E0100}-\u{E01EF}]/gu],
32+
33+
// --- Trojan Source attack vector ---
34+
// BiDi override/embedding/isolate characters. Make code render differently
35+
// than it executes — e.g., swapping the apparent order of operands.
36+
["BiDi control character", /[\u061C\u200E\u200F\u202A-\u202E\u2066-\u2069]/g],
37+
38+
// --- Zero-width characters: payload encoding ---
39+
// Hide text between visible characters, or encode Base64 via presence/absence.
40+
["Zero-width space", /\u200B/g],
41+
["Zero-width non-joiner", /\u200C/g],
42+
["Zero-width joiner", /\u200D/g],
43+
["Zero-width no-break space", /\uFEFF/g],
44+
["Word joiner / invisible formatting", /[\u2060-\u2064]/g],
45+
46+
// --- Other invisible characters ---
47+
// Each renders as nothing in most environments and can carry hidden payloads.
48+
["Soft hyphen", /\u00AD/g],
49+
["Combining grapheme joiner", /\u034F/g],
50+
["Mongolian vowel separator", /\u180E/g],
51+
["Hangul filler", /[\u115F\u1160\u3164\uFFA0]/g],
52+
["Khmer inherent vowel", /[\u17B4\u17B5]/g],
53+
["Line/paragraph separator", /[\u2028\u2029]/g],
54+
["Interlinear annotation", /[\uFFF9-\uFFFB]/g],
55+
];
56+
57+
// ---------------------------------------------------------------------------
58+
// Markdown side-channel patterns.
59+
// These are invisible when rendered (GitHub, VS Code preview) but parsed as
60+
// raw text by AI tools that read repo context (Cursor, Copilot, etc.).
61+
// ---------------------------------------------------------------------------
62+
// File extensions (lowercase, with the leading dot) that get the markdown
// side-channel checks in addition to the invisible-character scan.
const MD_EXTENSIONS = new Set([".md", ".mdx", ".markdown"]);

// HTML comments longer than this are suspicious — legitimate pragmas like
// <!-- prettier-ignore --> are short. Long comments can hide prompt injections
// that are invisible in rendered markdown.
const HTML_COMMENT_LENGTH_THRESHOLD = 80;

// Matches <!-- ... --> including multiline. Captures the comment body.
const HTML_COMMENT_RE = /<!--([\s\S]*?)-->/g;

// Hidden reference links used as markdown "comments":
//   [//]: # (hidden text here)
//   [//]: # "hidden text here"
//   [//]: # 'hidden text here'
// Invisible when rendered, but parsed by AI context scrapers.
// Markdown accepts parentheses, double quotes, or single quotes as link-title
// delimiters, so all three are matched; leaving one out would give an attacker
// an undetected way to hide text. Capture group 1 is the hidden text.
const HIDDEN_REF_LINK_RE = /^\[\/\/\]: #\s*[("'](.*?)[)"']\s*$/;
77+
78+
// ---------------------------------------------------------------------------
79+
// Scanning
80+
// ---------------------------------------------------------------------------
81+
// Running count of findings across every scanned file. scanFile adds each
// file's findings to it; the end of the script exits with status 1 when it is
// greater than zero.
let totalFindings = 0;
82+
83+
/**
 * Tells whether a path should get the markdown side-channel checks.
 *
 * The suffix is taken from the last "." in the whole path (not just the file
 * name) and compared case-insensitively against MD_EXTENSIONS.
 *
 * @param {string} filepath - Path of the file being scanned.
 * @returns {boolean} True when the path ends in a known markdown extension.
 */
function isMarkdown(filepath) {
  const dotAt = filepath.lastIndexOf(".");
  const suffix = filepath.slice(dotAt).toLowerCase();
  return MD_EXTENSIONS.has(suffix);
}
87+
88+
/**
 * Formats the first code point of `text` as uppercase hex, zero-padded to at
 * least four digits (e.g. "200B", "E0041").
 *
 * @param {string} text - Matched text; only its first code point is used.
 * @returns {string} Hex code point without the "U+" prefix.
 */
function formatCodePoint(text) {
  return text.codePointAt(0).toString(16).toUpperCase().padStart(4, "0");
}

/**
 * Finds invisible Unicode characters in the given lines.
 *
 * Each line is checked against the named PATTERNS first (descriptive output),
 * then against the whole Unicode "Format" category as a catch-all for anything
 * not already reported at the same position.
 *
 * @param {string[]} lines - File content split on "\n".
 * @returns {{line: number, col: number, name: string, codePoint: string}[]}
 *   Findings with 1-based line and column (column counts UTF-16 code units).
 */
function findInvisibleChars(lines) {
  const findings = [];
  // Created once instead of once per line. matchAll() iterates a private copy,
  // so this regex's own lastIndex always stays at 0.
  const formatCharPattern = /\p{Cf}/gu;

  lines.forEach((line, lineIdx) => {
    // Columns already reported by a named pattern on this line.
    const reported = new Set();

    const record = (match, name) => {
      const codePoint = formatCodePoint(match[0]);
      // Allow BOM (U+FEFF) at the very start of a file — that's legitimate.
      if (codePoint === "FEFF" && lineIdx === 0 && match.index === 0) return;
      reported.add(match.index);
      findings.push({
        line: lineIdx + 1,
        col: match.index + 1,
        name,
        codePoint,
      });
    };

    for (const [name, pattern] of PATTERNS) {
      // matchAll() starts its copy at the source regex's lastIndex, and these
      // regex objects are shared, so make sure the search starts at 0.
      pattern.lastIndex = 0;
      for (const match of line.matchAll(pattern)) {
        record(match, name);
      }
    }

    // Catch-all: any format character not already matched above
    // (e.g., Arabic format chars, deprecated formatting, script-specific controls).
    for (const match of line.matchAll(formatCharPattern)) {
      if (!reported.has(match.index)) {
        record(match, "Unicode format character");
      }
    }
  });

  return findings;
}

/**
 * Finds markdown constructs that are invisible when rendered: HTML comments
 * whose trimmed body exceeds HTML_COMMENT_LENGTH_THRESHOLD, and non-empty
 * hidden reference links matched by HIDDEN_REF_LINK_RE.
 *
 * @param {string} content - Full file content (comments may span lines).
 * @param {string[]} lines - The same content split on "\n".
 * @returns {{line: number, col: number, name: string}[]} Findings, comments
 *   first and reference links second.
 */
function findMarkdownSideChannels(content, lines) {
  const findings = [];

  HTML_COMMENT_RE.lastIndex = 0;
  for (const comment of content.matchAll(HTML_COMMENT_RE)) {
    const body = comment[1].trim();
    if (body.length <= HTML_COMMENT_LENGTH_THRESHOLD) continue;

    // Locate the comment start: line = newlines before it + 1, column =
    // distance from the preceding newline (lastIndexOf gives -1 on line 1).
    const upToMatch = content.slice(0, comment.index);
    findings.push({
      line: upToMatch.split("\n").length,
      col: comment.index - upToMatch.lastIndexOf("\n"),
      name: `Long HTML comment (${body.length} chars) — may hide prompt injection`,
    });
  }

  lines.forEach((line, lineIdx) => {
    const refMatch = line.match(HIDDEN_REF_LINK_RE);
    if (refMatch && refMatch[1].length > 0) {
      findings.push({
        line: lineIdx + 1,
        col: 1,
        name: "Hidden reference link — invisible when rendered",
      });
    }
  });

  return findings;
}

/**
 * Scans one file and prints every finding to stderr as
 * ` path:line:col  description (U+XXXX)`, adding the number of findings to the
 * module-level `totalFindings` counter.
 *
 * A file that cannot be read is skipped with a warning on stderr rather than
 * silently, so a skipped file is never mistaken for a clean one. The warning
 * is not counted as a finding and does not affect the exit status.
 *
 * @param {string} filepath - Path of the file to scan, read as UTF-8.
 * @returns {void}
 */
function scanFile(filepath) {
  let content;
  try {
    content = readFileSync(filepath, "utf-8");
  } catch (err) {
    console.error(` ${filepath}: skipped, could not read file (${err.code ?? err.message})`);
    return;
  }

  const lines = content.split("\n");
  const fileFindings = [
    ...findInvisibleChars(lines),
    ...(isMarkdown(filepath) ? findMarkdownSideChannels(content, lines) : []),
  ];

  totalFindings += fileFindings.length;
  for (const f of fileFindings) {
    const suffix = f.codePoint ? ` (U+${f.codePoint})` : "";
    // Two spaces keep the location visually apart from the description.
    console.error(` ${filepath}:${f.line}:${f.col}  ${f.name}${suffix}`);
  }
}
191+
192+
// Determine which files to scan: explicit path arguments when given (anything
// starting with "-" is ignored), otherwise every tracked file with one of the
// text extensions below.
let files = process.argv.slice(2).filter((arg) => !arg.startsWith("-"));

if (files.length === 0) {
  const extensions = "ts js mjs cjs json md yml yaml"
    .split(" ")
    .map((ext) => `'*.${ext}'`)
    .join(" ");
  // -z makes git emit NUL-terminated, unquoted paths. Without it, paths that
  // contain non-ASCII or special bytes are printed C-quoted (for example
  // "caf\303\251.md"), which cannot be opened, so those files would drop out
  // of the scan.
  const tracked = execSync(`git ls-files -z -- ${extensions}`, {
    encoding: "utf-8",
    // The default output limit is 1 MiB, which a large repository's file list
    // can exceed.
    maxBuffer: 64 * 1024 * 1024,
  });
  files = tracked.split("\0").filter(Boolean);
}

for (const file of files) {
  scanFile(file);
}

// A non-zero exit status fails the pre-commit hook and the CI step.
if (totalFindings > 0) {
  console.error(
    `\n Found ${totalFindings} hidden character(s) that may indicate prompt injection or Trojan Source attacks.`,
  );
  process.exit(1);
}

0 commit comments

Comments
 (0)