Skip to content

Commit c93de82

Browse files
committed
Optimized text normalization
1 parent c46a354 commit c93de82

File tree

1 file changed

+25
-19
lines changed

1 file changed

+25
-19
lines changed

examples/official/annotation_tool/benchmark/gui_benchmark.py

Lines changed: 25 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -179,24 +179,29 @@ def load_and_convert(file_path: str) -> Dict[str, List[str]]:
179179
class AnnotationValidator:
180180
"""Validate detected barcodes against expected values."""
181181

182-
_CONTROL_TOKEN_MAP = {
183-
'<NUL>': '\x00',
184-
'␀': '\x00',
185-
'<EOT>': '\x04',
186-
'␄': '\x04',
187-
'<GS>': '\x1d',
188-
'[GS]': '\x1d',
189-
'␝': '\x1d',
190-
'<RS>': '\x1e',
191-
'␞': '\x1e',
182+
_CONTROL_CHAR_NAMES = {
183+
0x00: 'NUL',
184+
0x01: 'SOH',
185+
0x04: 'EOT',
186+
0x10: 'DLE',
187+
0x1C: 'FS',
188+
0x1D: 'GS',
189+
0x1E: 'RS',
192190
}
193191

194192
@staticmethod
195193
def _normalize_text(value: str) -> str:
196194
"""Normalize line endings and control-character placeholders."""
197195
normalized = value.replace('\r\n', '\n').replace('\r', '\n').strip()
198-
for token, replacement in AnnotationValidator._CONTROL_TOKEN_MAP.items():
199-
normalized = normalized.replace(token, replacement)
196+
197+
for code, name in AnnotationValidator._CONTROL_CHAR_NAMES.items():
198+
raw = chr(code)
199+
normalized = normalized.replace(f'<{name}>', raw)
200+
normalized = normalized.replace(f'[{name}]', raw)
201+
normalized = normalized.replace(chr(0x2400 + code), raw)
202+
203+
# U+2420 is the control-picture symbol for space.
204+
normalized = normalized.replace('␠', ' ')
200205
return normalized
201206

202207
@staticmethod
@@ -547,13 +552,14 @@ def _display_text(s: str) -> str:
547552
from html import escape
548553

549554
normalized = AnnotationValidator._normalize_text(str(s))
550-
visible = (
551-
normalized
552-
.replace('\x00', '<NUL>')
553-
.replace('\x04', '<EOT>')
554-
.replace('\x1d', '<GS>')
555-
.replace('\x1e', '<RS>')
556-
)
555+
visible_parts = []
556+
for ch in normalized:
557+
code = ord(ch)
558+
if code in AnnotationValidator._CONTROL_CHAR_NAMES:
559+
visible_parts.append(f'<{AnnotationValidator._CONTROL_CHAR_NAMES[code]}>')
560+
else:
561+
visible_parts.append(ch)
562+
visible = ''.join(visible_parts)
557563
return escape(visible).replace('\n', '<br>')
558564

559565
@staticmethod

0 commit comments

Comments
 (0)