From f2ae8a31b5896211773eb36627c0a7e7c910961d Mon Sep 17 00:00:00 2001 From: Rodion Mostovoi <36400912+rodion-m@users.noreply.github.com> Date: Tue, 21 Apr 2026 10:16:21 +0500 Subject: [PATCH] Surface grep_search file-name matching + matchedByName flag (#375) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pairs with CodeAlive-AI/backend#376. The backend's grep_search now also matches file names/paths for literal queries and flags name-only hits with matchedByName=true (omitted when null via global JsonIgnoreCondition on the .NET side). Previously the MCP layer dropped matchedByName entirely in transform_grep_response, so the new signal never reached LLM agents even though the backend emitted it. Changes: - response_transformer.transform_grep_response now forwards matchedByName into the MCP dict output, only when the backend set it (mirrors the backend's omit-on-null wire semantics so content-match responses stay byte-identical to the pre-change shape). - grep_search tool docstring updated: mentions literal file-name matching, explains the Form.xml use case, documents the matchedByName contract (empty matches, location points at line 1 as a file-level reference — do NOT interpret it as a content match), and flags the Phase 1 limitation that regex=true still only searches content. - README.md one-line summary of grep_search extended accordingly. - Unit test test_grep_forwards_matched_by_name_flag asserts name-only hits surface the flag and content hits do not. Tests: 17/17 response_transformer tests pass (16 pre-existing + 1 new). Full MCP unit suite: 249 passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- src/tests/test_response_transformer.py | 46 ++++++++++++++++++++++++++ src/tools/search.py | 26 ++++++++++++--- src/utils/response_transformer.py | 6 ++++ 4 files changed, 74 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 09cad24..ef8eedf 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Once connected, you'll have access to these powerful tools: 1. **`get_data_sources`** - List your indexed repositories and workspaces 2. **`semantic_search`** - Canonical semantic search across indexed artifacts -3. **`grep_search`** - Exact text or regex search with line-level matches +3. **`grep_search`** - Exact literal or regex text search inside file content, plus literal file-name/path matching (returns files like `Form.xml` even when their content never mentions the name), with line-level previews for content matches 4. **`fetch_artifacts`** - Load the full source for relevant search hits 5. **`get_artifact_relationships`** - Expand call graph, inheritance, and reference relationships for one artifact 6. **`chat`** - Slower synthesized codebase Q&A, typically only after search diff --git a/src/tests/test_response_transformer.py b/src/tests/test_response_transformer.py index cd89511..ad0a359 100644 --- a/src/tests/test_response_transformer.py +++ b/src/tests/test_response_transformer.py @@ -354,3 +354,49 @@ def test_grep_unicode_in_line_text(self): line = result["results"][0]["matches"][0]["lineText"] assert "ТипШтрихкода" in line assert "GS1_DataMatrix" in line + + def test_grep_forwards_matched_by_name_flag(self): + """Name-only hits must carry matchedByName=True through to the MCP output + so LLM agents can distinguish a file-level name match from a content match. + Content hits must NOT include the field (backend omits null via + JsonIgnoreCondition.WhenWritingNull; the transformer mirrors that).""" + response = { + "results": [ + { + "kind": "File", + "identifier": "biterp/.../Ext/Form.xml", + "location": { + "path": "bsl-checks/src/test/resources/checks/VerifyMetadata/CommonForms/Форма/Ext/Form.xml", + "range": {"start": {"line": 1}, "end": {"line": 1}}, + }, + "matchCount": 0, + "matches": [], + "matchedByName": True, + }, + { + "kind": "File", + "identifier": "biterp/.../renames.txt", + "location": {"path": "renames.txt"}, + "matchCount": 2, + "matches": [ + { + "lineNumber": 3, + "startColumn": 1, + "endColumn": 9, + "lineText": "Form.xml -> Form2.xml", + } + ], + # matchedByName intentionally absent — backend omits it for content hits + }, + ] + } + + result = transform_grep_response(response) + + assert len(result["results"]) == 2 + name_only, content_hit = result["results"] + assert name_only["matchedByName"] is True + assert name_only["matchCount"] == 0 + assert "matches" not in name_only # transformer only copies matches when non-empty + assert "matchedByName" not in content_hit + assert content_hit["matchCount"] == 2 diff --git a/src/tools/search.py b/src/tools/search.py index e3d704b..bdbef36 100644 --- a/src/tools/search.py +++ b/src/tools/search.py @@ -239,16 +239,20 @@ async def grep_search( regex: bool = False, ) -> Dict[str, Any]: """ - Search indexed code by exact text or regex — finds code containing - a specific string. + Search indexed code by exact text or regex — matches file content + and, for literal queries, also file names/paths. Use this when you know WHAT TEXT to look for: an identifier, an error - message, a config key, a literal string that must appear in the source. + message, a config key, or a file whose name you know (even if nothing + inside the file references that name — 1C `Form.xml`, `.mdo`, config + XML, media files, etc.). **When to use grep_search:** - Specific identifiers: class/function/variable names, domain events (e.g. `RepositoryDeleted`, `handlePayment`, `AUTH_PROVIDERS`) - Literal strings: error messages, URLs, config keys, file paths + - File names whose content may never contain their own name + (e.g. `Form.xml`, `schema.graphql`, `appsettings.json`) - Import paths, TODO/FIXME comments, annotations - Regex patterns: `def test_.*async`, `Status\\.(Alive|Failed)` - Finding ALL occurrences of a known symbol across the codebase @@ -276,6 +280,8 @@ async def grep_search( max_results: Maximum number of results to return (1–500). regex: If True, treat `query` as a regex pattern. Default: False (literal). + **Regex currently matches file content only** — file-name/path + matching is literal-substring only. This is a known limitation. Returns: {"results": [...], "hint": "..."} @@ -283,9 +289,14 @@ async def grep_search( Each result contains: - path: file path - identifier: pass to `fetch_artifacts` for full source - - matchCount: total matches in this file + - matchCount: total matches in this file (0 for file-name-only hits) - matches: array of line-level hits, each with: - lineNumber, startColumn, endColumn, lineText + - matchedByName: present and `true` only when the artifact matched + by its file name/path and has no content match. In that case + `matches` is empty and `location.line` defaults to 1 as a + file-level reference — do NOT interpret `location.line` as an + actual line match. Content-match results omit this field. The `hint` reminds you that line previews are evidence only — load full source via `fetch_artifacts` or local `Read()` before reasoning. @@ -295,7 +306,12 @@ async def grep_search( grep_search(query="ConnectionString", data_sources=["backend"]) - 2. Regex search for test methods: + 2. Find a file by name (returns the file even if nothing inside + it references `Form.xml`): + grep_search(query="Form.xml", + data_sources=["biterp-bsl"]) + + 3. Regex search for test methods (content only): grep_search(query="def test_.*auth", data_sources=["backend"], extensions=[".py"], diff --git a/src/utils/response_transformer.py b/src/utils/response_transformer.py index 63b2b23..7981c50 100644 --- a/src/utils/response_transformer.py +++ b/src/utils/response_transformer.py @@ -95,6 +95,12 @@ def transform_grep_response(grep_results: Dict[str, Any]) -> Dict[str, Any]: item["matches"] = [ _build_match_dict(match) for match in result["matches"] ] + # Forward matchedByName only when the backend set it (name-only hits). + # The backend omits the field for content matches via System.Text.Json + # WhenWritingNull, so `get("matchedByName")` is None/missing for those + # and we skip it here to keep the happy path free of an extra key. + if result.get("matchedByName"): + item["matchedByName"] = True formatted_results.append(item) if not formatted_results: