-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathconfig.yaml
More file actions
114 lines (94 loc) · 5.52 KB
/
config.yaml
File metadata and controls
114 lines (94 loc) · 5.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
---
# Voice Input Method Configuration
# Copy this file and edit to match your setup.

# Recognizer backend: "funasr", "sherpa-sensevoice", "sensevoice-lm", "sherpa-nano", "qwen3-asr"
#   funasr: default, fast ONNX inference, supports hotwords and streaming
#   sherpa-sensevoice: built-in punctuation + ITN, small model (229MB int8)
#   sensevoice-lm: SenseVoice CTC + KenLM rescoring, better accuracy than greedy decode
#   sherpa-nano: LLM-based, best accuracy for dialects/accents, requires sherpa-onnx
#   qwen3-asr: LLM-based, 52 languages + 22 Chinese dialects, SOTA accuracy, requires sherpa-onnx
recognizer_backend: sherpa-nano

# --- ASR model (funasr backend) ---
# model_type: "seaco_paraformer" (supports hotwords)
model_type: seaco_paraformer
# Path to the FunASR model directory.
# Download with: modelscope download --model pofice/speech_seaco_paraformer_large_onnx
# Leave empty to use the default modelscope cache path.
model_dir: ""
# Use INT8 quantization for faster inference (recommended)
quantize: true

# --- sherpa-sensevoice backend paths (only used when recognizer_backend="sherpa-sensevoice") ---
# Download: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
sensevoice_model_path: ""  # e.g. /path/to/sherpa-onnx-sense-voice-.../model.int8.onnx
sensevoice_tokens_path: ""  # e.g. /path/to/sherpa-onnx-sense-voice-.../tokens.txt
sensevoice_language: "zh"  # zh / en / ja / ko / yue

# --- sensevoice-lm backend (only used when recognizer_backend="sensevoice-lm") ---
# Uses the same model/tokens as sherpa-sensevoice, plus a KenLM language model
# for CTC beam search rescoring. Better accuracy for Chinese text.
# pip install pyctcdecode kenlm
sensevoice_lm_path: ""  # path to KenLM .bin or .arpa file (empty = no LM, pure beam search)
sensevoice_lm_alpha: 0.5  # LM weight (0 = ignore LM, 1 = strong LM influence)
sensevoice_lm_beta: 1.0  # word insertion bonus (higher = fewer deletions)
sensevoice_lm_beam_width: 20  # beam search width (higher = slower but potentially better)

# --- sherpa-nano backend path (only used when recognizer_backend="sherpa-nano") ---
# Directory must contain encoder_adaptor.int8.onnx, llm.int8.onnx, embedding.int8.onnx, Qwen3-0.6B/
nano_model_dir: "models/sherpa-onnx-funasr-nano-int8-2025-12-30"
# LLM prompts for sherpa-nano (optional). You can stuff business context here
# as a stronger alternative to hotwords, since the model is LLM-based.
nano_system_prompt: "You are a voice transcription assistant. The speaker often talks about coding tools like Claude Code, ChatGPT, Cursor, GitHub Copilot, and Python libraries. Transcribe these names accurately."
nano_user_prompt: "语音转写:"

# --- qwen3-asr backend path (only used when recognizer_backend="qwen3-asr") ---
# Download: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-qwen3-asr-0.6B-int8-2026-03-25.tar.bz2
# Directory must contain conv_frontend.onnx, encoder.onnx, decoder.onnx, tokenizer/
qwen3_model_dir: ""
# KV cache length: increase for longer audio (512 ≈ 30s, 1024 ≈ 60s, 2048 ≈ 2min)
qwen3_max_total_len: 512
# Max output tokens: increase for longer audio (128 ≈ 80 chars, 512 ≈ 300 chars)
qwen3_max_new_tokens: 128

# --- Audio recording ---
sample_rate: 44100
channels: 2

# Global hotkey to trigger voice input (hold to record, release to transcribe)
# Options: scroll_lock, pause, f6, f7, f8, f9, f10, f11, f12, fn (macOS only, toggle mode)
hotkey: fn
# Toggle hotkey: press once to start recording, press again to stop and transcribe.
# Useful for long recordings. Leave empty to disable.
# Options: alt (macOS Option key), alt_r, caps_lock, f6-f12, etc.
toggle_hotkey: alt

# --- UI ---
window_title: Rtxime
window_width: 200
window_height: 100
window_opacity: 0.8

# --- Streaming recognition (real-time text while you speak, funasr backend only) ---
streaming: false
# Path to streaming model directory.
# Download with: modelscope download --model damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online
streaming_model_dir: ""
# Chunk size [left_context, body, right_context] in frames.
# Default [5, 10, 5] = 600ms per chunk. Use [5, 8, 5] for 480ms lower latency.
# chunk_size: [5, 10, 5]
# 2pass mode: stream results during recording, then correct with offline model on release.
# Gives real-time feedback + higher accuracy. Requires both models to be loaded.
two_pass: false

# --- Features ---
enable_hotwords: true  # Load custom hotwords from hotwords.txt
enable_traditional_chinese: true  # Enable simplified↔traditional Chinese toggle
enable_noise_reduction: true  # Apply noise reduction before recognition
strip_trailing_punctuation: true  # Remove trailing 。!?,. etc. from recognized text

# --- VAD (Voice Activity Detection) for long audio segmentation ---
# Automatically splits long audio into segments before ASR.
# Required for LLM backends (sherpa-nano, qwen3-asr) with long recordings.
# Also helps encoder backends (funasr, sensevoice) avoid memory issues.
enable_vad: true
vad_max_speech_duration: 15  # max seconds per segment (15s safe for all backends)
# vad_model_path: ""  # path to silero_vad.onnx (auto-detected from ./silero_vad.onnx, ./models/, ~/.cache/sherpa-onnx/)
# Download: wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

# Platform override (auto-detected if empty)
# Options: x11, wayland, windows, macos
platform: ""

# Resource file names (searched in CWD first, then package resources/)
hotwords_file: hotwords.txt
library_file: library.txt
warmup_file: warmup.wav
icon_file: icon.png
style_file: style.css