From f2ae8a31b5896211773eb36627c0a7e7c910961d Mon Sep 17 00:00:00 2001
From: Rodion Mostovoi <36400912+rodion-m@users.noreply.github.com>
Date: Tue, 21 Apr 2026 10:16:21 +0500
Subject: [PATCH] Surface grep_search file-name matching + matchedByName flag
 (#375)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pairs with CodeAlive-AI/backend#376. The backend's grep_search now also
matches file names/paths for literal queries and flags name-only hits
with matchedByName=true (the field is omitted when null because the .NET
side serializes with a global JsonIgnoreCondition.WhenWritingNull).
Previously the MCP layer dropped matchedByName entirely in
transform_grep_response, so the new signal never reached LLM agents even
though the backend emitted it.

Changes:
- response_transformer.transform_grep_response now forwards
  matchedByName into the MCP dict output, only when the backend set it
  (mirrors the backend's omit-on-null wire semantics so content-match
  responses stay byte-identical to the pre-change shape).
- grep_search tool docstring updated: mentions literal file-name
  matching, explains the Form.xml use case, documents the matchedByName
  contract (empty matches, location points at line 1 as a file-level
  reference — do NOT interpret it as a content match), and flags the
  Phase 1 limitation that regex=true still only searches content.
- README.md one-line summary of grep_search extended accordingly.
- Unit test test_grep_forwards_matched_by_name_flag asserts name-only
  hits surface the flag and content hits do not.

Tests: 17/17 response_transformer tests pass (16 pre-existing + 1 new).
Full MCP unit suite: 249 passed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md                              |  2 +-
 src/tests/test_response_transformer.py | 46 ++++++++++++++++++++++++++
 src/tools/search.py                    | 26 ++++++++++++---
 src/utils/response_transformer.py      |  6 ++++
 4 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 09cad24..ef8eedf 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Once connected, you'll have access to these powerful tools:
 
 1. **`get_data_sources`** - List your indexed repositories and workspaces
 2. **`semantic_search`** - Canonical semantic search across indexed artifacts
-3. **`grep_search`** - Exact text or regex search with line-level matches
+3. **`grep_search`** - Exact literal or regex text search inside file content, plus literal file-name/path matching (returns files like `Form.xml` even when their content never mentions the name), with line-level previews for content matches
 4. **`fetch_artifacts`** - Load the full source for relevant search hits
 5. **`get_artifact_relationships`** - Expand call graph, inheritance, and reference relationships for one artifact
 6. **`chat`** - Slower synthesized codebase Q&A, typically only after search
diff --git a/src/tests/test_response_transformer.py b/src/tests/test_response_transformer.py
index cd89511..ad0a359 100644
--- a/src/tests/test_response_transformer.py
+++ b/src/tests/test_response_transformer.py
@@ -354,3 +354,49 @@ def test_grep_unicode_in_line_text(self):
         line = result["results"][0]["matches"][0]["lineText"]
         assert "ТипШтрихкода" in line
         assert "GS1_DataMatrix" in line
+
+    def test_grep_forwards_matched_by_name_flag(self):
+        """Name-only hits must carry matchedByName=True through to the MCP output
+        so LLM agents can distinguish a file-level name match from a content match.
+        Content hits must NOT include the field (backend omits null via
+        JsonIgnoreCondition.WhenWritingNull; the transformer mirrors that)."""
+        response = {
+            "results": [
+                {
+                    "kind": "File",
+                    "identifier": "biterp/.../Ext/Form.xml",
+                    "location": {
+                        "path": "bsl-checks/src/test/resources/checks/VerifyMetadata/CommonForms/Форма/Ext/Form.xml",
+                        "range": {"start": {"line": 1}, "end": {"line": 1}},
+                    },
+                    "matchCount": 0,
+                    "matches": [],
+                    "matchedByName": True,
+                },
+                {
+                    "kind": "File",
+                    "identifier": "biterp/.../renames.txt",
+                    "location": {"path": "renames.txt"},
+                    "matchCount": 2,
+                    "matches": [
+                        {
+                            "lineNumber": 3,
+                            "startColumn": 1,
+                            "endColumn": 9,
+                            "lineText": "Form.xml -> Form2.xml",
+                        }
+                    ],
+                    # matchedByName intentionally absent — backend omits it for content hits
+                },
+            ]
+        }
+
+        result = transform_grep_response(response)
+
+        assert len(result["results"]) == 2
+        name_only, content_hit = result["results"]
+        assert name_only["matchedByName"] is True
+        assert name_only["matchCount"] == 0
+        assert "matches" not in name_only  # transformer only copies matches when non-empty
+        assert "matchedByName" not in content_hit
+        assert content_hit["matchCount"] == 2
diff --git a/src/tools/search.py b/src/tools/search.py
index e3d704b..bdbef36 100644
--- a/src/tools/search.py
+++ b/src/tools/search.py
@@ -239,16 +239,20 @@ async def grep_search(
     regex: bool = False,
 ) -> Dict[str, Any]:
     """
-    Search indexed code by exact text or regex — finds code containing
-    a specific string.
+    Search indexed code by exact text or regex — matches file content
+    and, for literal queries, also file names/paths.
 
     Use this when you know WHAT TEXT to look for: an identifier, an error
-    message, a config key, a literal string that must appear in the source.
+    message, a config key, or a file whose name you know (even if nothing
+    inside the file references that name — 1C `Form.xml`, `.mdo`, config
+    XML, media files, etc.).
 
     **When to use grep_search:**
     - Specific identifiers: class/function/variable names, domain events
       (e.g. `RepositoryDeleted`, `handlePayment`, `AUTH_PROVIDERS`)
     - Literal strings: error messages, URLs, config keys, file paths
+    - File names whose content may never contain their own name
+      (e.g. `Form.xml`, `schema.graphql`, `appsettings.json`)
     - Import paths, TODO/FIXME comments, annotations
     - Regex patterns: `def test_.*async`, `Status\\.(Alive|Failed)`
     - Finding ALL occurrences of a known symbol across the codebase
@@ -276,6 +280,8 @@ async def grep_search(
         max_results: Maximum number of results to return (1–500).
 
         regex: If True, treat `query` as a regex pattern. Default: False (literal).
+               **Regex currently matches file content only** — file-name/path
+               matching is literal-substring only. This is a known limitation.
 
     Returns:
         {"results": [...], "hint": "..."}
@@ -283,9 +289,14 @@ async def grep_search(
         Each result contains:
         - path: file path
         - identifier: pass to `fetch_artifacts` for full source
-        - matchCount: total matches in this file
+        - matchCount: total matches in this file (0 for file-name-only hits)
         - matches: array of line-level hits, each with:
           - lineNumber, startColumn, endColumn, lineText
+        - matchedByName: present and `true` only when the artifact matched
+          by its file name/path and has no content match. In that case
+          `matches` is omitted and `location.line` defaults to 1 as a
+          file-level reference — do NOT interpret `location.line` as an
+          actual line match. Content-match results omit this field.
 
         The `hint` reminds you that line previews are evidence only — load
         full source via `fetch_artifacts` or local `Read()` before reasoning.
@@ -295,7 +306,12 @@ async def grep_search(
            grep_search(query="ConnectionString",
                        data_sources=["backend"])
 
-        2. Regex search for test methods:
+        2. Find a file by name (returns the file even if nothing inside
+           it references `Form.xml`):
+           grep_search(query="Form.xml",
+                       data_sources=["biterp-bsl"])
+
+        3. Regex search for test methods (content only):
            grep_search(query="def test_.*auth",
                        data_sources=["backend"],
                        extensions=[".py"],
diff --git a/src/utils/response_transformer.py b/src/utils/response_transformer.py
index 63b2b23..7981c50 100644
--- a/src/utils/response_transformer.py
+++ b/src/utils/response_transformer.py
@@ -95,6 +95,12 @@ def transform_grep_response(grep_results: Dict[str, Any]) -> Dict[str, Any]:
             item["matches"] = [
                 _build_match_dict(match) for match in result["matches"]
             ]
+        # Forward matchedByName only when the backend set it (name-only hits).
+        # The backend omits the field for content matches via System.Text.Json
+        # WhenWritingNull, so `result.get("matchedByName")` is None there; an
+        # explicit false is skipped too, keeping content hits free of the key.
+        if result.get("matchedByName"):
+            item["matchedByName"] = True
         formatted_results.append(item)
 
     if not formatted_results: