Skip to content

Commit 7b920c9

Browse files
authored
Merge pull request #4 from rezhajulio/feat/duplicate-spam-detection
feat: duplicate message spam detection
2 parents 02d0967 + 6e7c230 commit 7b920c9

11 files changed

Lines changed: 733 additions & 3 deletions

.env.example

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,23 @@ CAPTCHA_ENABLED=false
3636
# Default: 120 seconds (2 minutes)
3737
CAPTCHA_TIMEOUT_SECONDS=120
3838

39+
# Enable duplicate message spam detection (true/false)
40+
# Detects and restricts users who paste the same message repeatedly
41+
DUPLICATE_SPAM_ENABLED=true
42+
43+
# Time window in seconds for detecting duplicate messages
44+
DUPLICATE_SPAM_WINDOW_SECONDS=120
45+
46+
# Number of similar messages within the window before restricting
47+
DUPLICATE_SPAM_THRESHOLD=2
48+
49+
# Minimum normalized text length to consider (avoids false positives on short messages)
50+
DUPLICATE_SPAM_MIN_LENGTH=20
51+
52+
# Similarity threshold (0.0-1.0) for matching duplicate messages
53+
# 0.95 catches minor edits, 0.97 only near-exact copies, 0.90 is more aggressive
54+
DUPLICATE_SPAM_SIMILARITY=0.95
55+
3956
# Path to groups.json for multi-group support (optional)
4057
# If this file exists, per-group settings are loaded from it instead of the
4158
# GROUP_ID/WARNING_TOPIC_ID/etc. fields above. See groups.json.example.

groups.json.example

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@
99
"captcha_timeout_seconds": 120,
1010
"new_user_probation_hours": 72,
1111
"new_user_violation_threshold": 3,
12-
"rules_link": "https://t.me/pythonID/290029/321799"
12+
"rules_link": "https://t.me/pythonID/290029/321799",
13+
"duplicate_spam_enabled": true,
14+
"duplicate_spam_window_seconds": 120,
15+
"duplicate_spam_threshold": 2,
16+
"duplicate_spam_min_length": 20,
17+
"duplicate_spam_similarity": 0.95
1318
},
1419
{
1520
"group_id": -1009876543210,
@@ -21,6 +26,11 @@
2126
"captcha_timeout_seconds": 180,
2227
"new_user_probation_hours": 168,
2328
"new_user_violation_threshold": 2,
24-
"rules_link": "https://t.me/mygroup/rules"
29+
"rules_link": "https://t.me/mygroup/rules",
30+
"duplicate_spam_enabled": true,
31+
"duplicate_spam_window_seconds": 60,
32+
"duplicate_spam_threshold": 2,
33+
"duplicate_spam_min_length": 20,
34+
"duplicate_spam_similarity": 0.90
2535
}
2636
]

src/bot/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@ class Settings(BaseSettings):
7979
captcha_timeout_seconds: int = 120
8080
new_user_probation_hours: int = 72 # 3 days default
8181
new_user_violation_threshold: int = 3 # restrict after this many violations
82+
duplicate_spam_enabled: bool = True
83+
duplicate_spam_window_seconds: int = 120
84+
duplicate_spam_threshold: int = 2
85+
duplicate_spam_min_length: int = 20
86+
duplicate_spam_similarity: float = 0.95
8287
groups_config_path: str = "groups.json"
8388
logfire_token: str | None = None
8489
logfire_service_name: str = "pythonid-bot"

src/bot/constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,21 @@ def format_hours_display(hours: int) -> str:
211211
"📌 [Peraturan Grup]({rules_link})"
212212
)
213213

214+
# Duplicate message spam notification
215+
# Notification template used when the user WAS successfully restricted.
# Filled with str.format(); placeholders: {user_mention}, {count}, {rules_link}.
# The text uses Markdown markup (*bold*, [label](url)).
DUPLICATE_SPAM_RESTRICTION = (
    "🚫 *Spam Pesan Duplikat*\n\n"
    "{user_mention} telah dibatasi karena mengirim pesan yang sama "
    "sebanyak {count} kali dalam waktu singkat.\n\n"
    "📌 [Peraturan Grup]({rules_link})"
)
221+
222+
# Fallback notification template used when restricting the user failed.
# Filled with str.format(); placeholders: {user_mention}, {count}, {rules_link}.
# The text uses Markdown markup (*bold*, [label](url)).
DUPLICATE_SPAM_RESTRICTION_NO_RESTRICT = (
    "🚫 *Spam Pesan Duplikat*\n\n"
    "Pesan duplikat dari {user_mention} telah dihapus "
    "({count} pesan yang sama dalam waktu singkat).\n\n"
    "📌 [Peraturan Grup]({rules_link})"
)
228+
214229
# Whitelisted URL domains for new user probation
215230
# These domains are allowed even during probation period
216231
# Matches exact domain or subdomains (e.g., "github.com" matches "www.github.com")

src/bot/group_config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ class GroupConfig(BaseModel):
3535
new_user_probation_hours: int = 72
3636
new_user_violation_threshold: int = 3
3737
rules_link: str = "https://t.me/pythonID/290029/321799"
38+
duplicate_spam_enabled: bool = True
39+
duplicate_spam_window_seconds: int = 120
40+
duplicate_spam_threshold: int = 2
41+
duplicate_spam_min_length: int = 20
42+
duplicate_spam_similarity: float = 0.95
3843

3944
@field_validator("group_id")
4045
@classmethod
@@ -181,6 +186,11 @@ def build_group_registry(settings: object) -> GroupRegistry:
181186
new_user_probation_hours=settings.new_user_probation_hours,
182187
new_user_violation_threshold=settings.new_user_violation_threshold,
183188
rules_link=settings.rules_link,
189+
duplicate_spam_enabled=settings.duplicate_spam_enabled,
190+
duplicate_spam_window_seconds=settings.duplicate_spam_window_seconds,
191+
duplicate_spam_threshold=settings.duplicate_spam_threshold,
192+
duplicate_spam_min_length=settings.duplicate_spam_min_length,
193+
duplicate_spam_similarity=settings.duplicate_spam_similarity,
184194
)
185195
registry.register(config)
186196

src/bot/handlers/duplicate_spam.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
"""
2+
Duplicate message spam detection handler.
3+
4+
This module detects users who spam by repeatedly posting the same or
5+
very similar messages within a short time window. When the threshold
6+
is reached, the triggering message is deleted and the user is restricted.
7+
8+
Uses an in-memory rolling window per (group_id, user_id) to track
9+
recent messages. No database state is needed — restrictions applied
10+
here are NOT reversible via the DM unrestriction flow (no UserWarning
11+
record is created).
12+
"""
13+
14+
import logging
15+
import re
16+
import unicodedata
17+
from collections import deque
18+
from dataclasses import dataclass
19+
from datetime import UTC, datetime
20+
from difflib import SequenceMatcher
21+
22+
from telegram import Update
23+
from telegram.ext import ApplicationHandlerStop, ContextTypes
24+
25+
from bot.constants import (
26+
DUPLICATE_SPAM_RESTRICTION,
27+
DUPLICATE_SPAM_RESTRICTION_NO_RESTRICT,
28+
RESTRICTED_PERMISSIONS,
29+
)
30+
from bot.group_config import GroupConfig, get_group_config_for_update
31+
from bot.services.telegram_utils import get_user_mention
32+
33+
logger = logging.getLogger(__name__)
34+
35+
RECENT_MESSAGES_KEY = "duplicate_spam_recent"
36+
37+
38+
@dataclass
class RecentMessage:
    """A recent message entry for duplicate detection."""

    # Time the handler recorded the message (set from datetime.now(UTC)).
    timestamp: datetime
    # Message text or caption after normalize_text(); this is what is compared.
    normalized_text: str
    # Telegram message id of the recorded message.
    message_id: int
45+
46+
47+
def normalize_text(text: str) -> str:
    """
    Normalize text for duplicate comparison.

    Applies NFKC normalization, lowercases, removes every character that is
    neither a word character nor whitespace (punctuation, emoji, symbols),
    then collapses runs of whitespace to a single space and trims both ends.

    Args:
        text: Raw message text or caption.

    Returns:
        The normalized string. Empty when the input contained nothing but
        punctuation, symbols, or whitespace.
    """
    # Normalize before lowercasing: NFKC can itself introduce uppercase
    # letters (e.g. U+210C becomes "H") that an earlier lower() would miss.
    text = unicodedata.normalize("NFKC", text).lower()
    # Strip punctuation/symbols BEFORE collapsing whitespace, otherwise the
    # gaps they leave behind ("a , b" -> "a  b", "! a" -> " a") survive and
    # make otherwise identical messages compare as different.
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()
59+
60+
61+
def is_similar(a: str, b: str, threshold: float = 0.95) -> bool:
    """
    Check if two normalized texts are similar enough to be duplicates.

    Args:
        a: First normalized text.
        b: Second normalized text.
        threshold: Minimum SequenceMatcher ratio (0.0-1.0) to count as similar.

    Returns:
        True when the texts are equal or their similarity ratio is at least
        ``threshold``.
    """
    if a == b:
        return True
    # autojunk must be off: with the default heuristic, characters that occur
    # frequently in a text of 200+ characters are dropped as match anchors,
    # so long near-identical messages can score far below their true ratio.
    matcher = SequenceMatcher(None, a, b, autojunk=False)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio();
    # checking them first skips the expensive comparison for obvious misses
    # without changing the result.
    return (
        matcher.real_quick_ratio() >= threshold
        and matcher.quick_ratio() >= threshold
        and matcher.ratio() >= threshold
    )
66+
67+
68+
def _get_recent_messages(
    context: ContextTypes.DEFAULT_TYPE, group_id: int, user_id: int
) -> deque[RecentMessage]:
    """
    Return the rolling message history for one (group, user) pair.

    The histories live in ``context.bot_data`` under RECENT_MESSAGES_KEY,
    keyed by ``(group_id, user_id)``. Both the outer mapping and the
    per-pair deque are created on first access.
    """
    histories: dict[tuple[int, int], deque[RecentMessage]] = (
        context.bot_data.setdefault(RECENT_MESSAGES_KEY, {})
    )
    return histories.setdefault((group_id, user_id), deque())
79+
80+
81+
def _prune_old_messages(
    dq: deque[RecentMessage], window_seconds: int, now: datetime
) -> None:
    """
    Drop entries from the left of ``dq`` that fall outside the time window.

    Entries are removed while the leftmost one is strictly older than
    ``window_seconds`` relative to ``now``; the deque is mutated in place.
    """
    while dq:
        age_seconds = (now - dq[0].timestamp).total_seconds()
        if age_seconds <= window_seconds:
            # Leftmost entry is still inside the window; stop pruning.
            break
        dq.popleft()
87+
88+
89+
def count_similar_in_window(
    dq: deque[RecentMessage], normalized: str, threshold: float = 0.95
) -> int:
    """
    Count the entries in ``dq`` whose text is similar to ``normalized``.

    Similarity is decided by ``is_similar`` using the given ``threshold``.
    """
    matches = 0
    for entry in dq:
        if is_similar(normalized, entry.normalized_text, threshold):
            matches += 1
    return matches
94+
95+
96+
async def handle_duplicate_spam(
    update: Update, context: ContextTypes.DEFAULT_TYPE
) -> None:
    """
    Detect and act on duplicate message spam in a monitored group.

    Every sufficiently long text/caption from a non-bot, non-admin user is
    recorded in an in-memory history per (group_id, user_id). Once the number
    of similar messages inside the configured window (including the current
    one) reaches the configured threshold, the current message is deleted and
    restriction plus notification are delegated to ``_enforce_restriction``.

    Raises:
        ApplicationHandlerStop: after a detection has been handled.
    """
    message = update.message
    if not message or not message.from_user:
        return

    group_config = get_group_config_for_update(update)
    if group_config is None or not group_config.duplicate_spam_enabled:
        return

    user = message.from_user
    if user.is_bot:
        return

    # Admins of this group are exempt from duplicate detection.
    exempt_ids = context.bot_data.get("group_admin_ids", {}).get(
        group_config.group_id, []
    )
    if user.id in exempt_ids:
        return

    raw_text = message.text or message.caption
    if not raw_text:
        return

    normalized = normalize_text(raw_text)
    if len(normalized) < group_config.duplicate_spam_min_length:
        # Too short to judge reliably; skipping avoids false positives.
        return

    now = datetime.now(UTC)
    history = _get_recent_messages(context, group_config.group_id, user.id)
    _prune_old_messages(history, group_config.duplicate_spam_window_seconds, now)

    # Count BEFORE recording the current message so it is not matched
    # against itself.
    earlier_matches = count_similar_in_window(
        history, normalized, group_config.duplicate_spam_similarity
    )
    history.append(
        RecentMessage(
            timestamp=now,
            normalized_text=normalized,
            message_id=message.message_id,
        )
    )

    # The current message counts as one occurrence.
    total_count = earlier_matches + 1
    if total_count < group_config.duplicate_spam_threshold:
        return

    user_mention = get_user_mention(user)

    logger.info(
        f"Duplicate spam detected: user_id={user.id}, "
        f"group_id={group_config.group_id}, count={total_count}"
    )

    # Deletion is best-effort: a failure is logged and enforcement continues.
    try:
        await message.delete()
        logger.info(f"Deleted duplicate spam from user_id={user.id}")
    except Exception:
        logger.error(
            f"Failed to delete duplicate spam: user_id={user.id}",
            exc_info=True,
        )

    await _enforce_restriction(context, group_config, user, user_mention, total_count)

    raise ApplicationHandlerStop
169+
170+
171+
async def _enforce_restriction(
    context: ContextTypes.DEFAULT_TYPE,
    group_config: GroupConfig,
    user: object,
    user_mention: str,
    count: int,
) -> None:
    """
    Restrict the user and post a notification to the warning topic.

    Both steps are best-effort: failures are logged and never propagated.
    The notification wording depends on whether the restriction succeeded.

    Args:
        context: Handler context providing ``bot``.
        group_config: Settings of the group the spam was seen in.
        user: The offending user; only ``user.id`` is read here.
        user_mention: Pre-formatted mention inserted into the notification.
        count: Number of similar messages reported in the notification.
    """
    was_restricted = False
    try:
        await context.bot.restrict_chat_member(
            chat_id=group_config.group_id,
            user_id=user.id,
            permissions=RESTRICTED_PERMISSIONS,
        )
        was_restricted = True
        logger.info(f"Restricted user_id={user.id} for duplicate spam")
    except Exception:
        logger.error(
            f"Failed to restrict user for duplicate spam: user_id={user.id}",
            exc_info=True,
        )

    try:
        # Choose the wording that matches what actually happened.
        if was_restricted:
            template = DUPLICATE_SPAM_RESTRICTION
        else:
            template = DUPLICATE_SPAM_RESTRICTION_NO_RESTRICT
        notification_text = template.format(
            user_mention=user_mention,
            count=count,
            rules_link=group_config.rules_link,
        )
        await context.bot.send_message(
            chat_id=group_config.group_id,
            message_thread_id=group_config.warning_topic_id,
            text=notification_text,
            parse_mode="Markdown",
        )
        logger.info(f"Sent duplicate spam notification for user_id={user.id}")
    except Exception:
        logger.error(
            f"Failed to send duplicate spam notification: user_id={user.id}",
            exc_info=True,
        )

src/bot/main.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from bot.group_config import get_group_registry, init_group_registry
2020
from bot.handlers import captcha
2121
from bot.handlers.anti_spam import handle_inline_keyboard_spam, handle_new_user_spam
22+
from bot.handlers.duplicate_spam import handle_duplicate_spam
2223
from bot.handlers.dm import handle_dm
2324
from bot.handlers.message import handle_message
2425
from bot.handlers.topic_guard import guard_warning_topic
@@ -294,7 +295,16 @@ def main() -> None:
294295
)
295296
logger.info("Registered handler: anti_spam_handler (group=0)")
296297

297-
# Handler 10: Group message handler - monitors messages in monitored
298+
# Handler 10: Duplicate message spam handler - detects repeated identical messages
299+
application.add_handler(
300+
MessageHandler(
301+
filters.ChatType.GROUPS & ~filters.COMMAND,
302+
handle_duplicate_spam,
303+
)
304+
)
305+
logger.info("Registered handler: duplicate_spam_handler (group=0)")
306+
307+
# Handler 11: Group message handler - monitors messages in monitored
298308
# groups and warns/restricts users with incomplete profiles
299309
application.add_handler(
300310
MessageHandler(

tests/test_config.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,36 @@ def test_captcha_timeout_timedelta(self, monkeypatch):
123123

124124
assert settings.captcha_timeout_timedelta == timedelta(seconds=90)
125125

126+
def test_duplicate_spam_defaults(self, monkeypatch):
    """Test that duplicate_spam fields have correct defaults."""
    # Only the settings this test needs are supplied; the duplicate_spam_*
    # values must therefore come from the class defaults.
    monkeypatch.setenv("TELEGRAM_BOT_TOKEN", "test_token")
    monkeypatch.setenv("GROUP_ID", "-100999")
    monkeypatch.setenv("WARNING_TOPIC_ID", "1")

    settings = Settings(_env_file=None)

    assert settings.duplicate_spam_enabled is True
    assert settings.duplicate_spam_window_seconds == 120
    assert settings.duplicate_spam_threshold == 2
    assert settings.duplicate_spam_min_length == 20
    # The similarity default was previously not covered by this test.
    assert settings.duplicate_spam_similarity == 0.95
138+
139+
def test_duplicate_spam_from_env(self, monkeypatch):
    """Test that duplicate_spam fields are read from environment variables."""
    monkeypatch.setenv("TELEGRAM_BOT_TOKEN", "test_token")
    monkeypatch.setenv("GROUP_ID", "-100999")
    monkeypatch.setenv("WARNING_TOPIC_ID", "1")
    # Every override differs from its default so a silently ignored
    # variable cannot pass unnoticed.
    monkeypatch.setenv("DUPLICATE_SPAM_ENABLED", "false")
    monkeypatch.setenv("DUPLICATE_SPAM_WINDOW_SECONDS", "300")
    monkeypatch.setenv("DUPLICATE_SPAM_THRESHOLD", "5")
    monkeypatch.setenv("DUPLICATE_SPAM_MIN_LENGTH", "50")
    monkeypatch.setenv("DUPLICATE_SPAM_SIMILARITY", "0.9")

    settings = Settings(_env_file=None)

    assert settings.duplicate_spam_enabled is False
    assert settings.duplicate_spam_window_seconds == 300
    assert settings.duplicate_spam_threshold == 5
    assert settings.duplicate_spam_min_length == 50
    # The similarity override was previously not covered by this test.
    assert settings.duplicate_spam_similarity == 0.9
155+
126156

127157
class TestSettingsValidation:
128158
def test_group_id_must_be_negative(self, monkeypatch):

0 commit comments

Comments
 (0)