getsentry · kddubey · Mar 23, 2025 · Mar 15, 2025 · Mar 15, 2025 · Mar 15, 2025
@@ -2,7 +2,7 @@
 import subprocess
 import textwrap
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import cast
+from typing import Literal, TypeAlias, cast
 
 from langfuse.decorators import observe
 from pydantic import BaseModel
@@ -54,6 +54,35 @@ def _get_repo_names(self) -> list[str]:
         else:
             raise ValueError(f"Unsupported context type: {type(self.context)}")
 
+    def _semantic_file_search_completion(
+        self, query: str, valid_file_paths: str, repo_names: list[str], llm_client: LlmClient
+    ):
+        """
+        Ask the LLM to pick the single file that best matches `query`.
+
+        Args:
+            query: Description of what the sought file contains; interpolated into the prompt.
+            valid_file_paths: Pre-rendered listing of candidate files; interpolated into the
+                prompt as-is.
+            repo_names: Names of the repos the answer may come from. Used only to constrain the
+                `repo_name` field of the structured response.
+            llm_client: Client whose `generate_structured` method performs the completion.
+
+        Returns:
+            `response.parsed` from the client: an object exposing `file_path` and `repo_name`.
+            NOTE(review): presumably this can be `None` when nothing could be parsed -- confirm
+            against `LlmClient.generate_structured`; callers should handle a falsy result.
+        """
+        prompt = textwrap.dedent(
+            """
+            I'm searching for the file in this codebase that contains {query}. Please pick the most relevant file from the following list:
+            ------------
+            {valid_file_paths}
+            """
+        ).format(query=query, valid_file_paths=valid_file_paths)
+
+        # `Literal` de-duplicates its members, so the lower bound has to be checked against the
+        # distinct names: otherwise a repeated name (e.g. ["a", "a"]) passes the length check
+        # and still produces a single-option Literal.
+        unique_repo_names = tuple(dict.fromkeys(repo_names))
+
+        if len(repo_names) < 100 and len(unique_repo_names) >= 2:
+            # Lower bound avoids Gemini-Pydantic incompatibility.
+            # Upper bound is b/c structured output can't handle too many options in a Literal.
+            RepoName: TypeAlias = Literal[unique_repo_names]  # type: ignore[valid-type]
+        else:
+            RepoName: TypeAlias = str  # type: ignore[no-redef]
+
+        # Defined per call because the allowed `repo_name` values depend on `repo_names`.
+        class FileLocation(BaseModel):
+            file_path: str
+            repo_name: RepoName  # type: ignore
+
+        response = llm_client.generate_structured(
+            prompt=prompt,
+            model=GeminiProvider(model_name="gemini-2.0-flash-001"),
+            response_format=FileLocation,
+        )
+        return response.parsed
+
     @observe(name="Semantic File Search")
     @ai_track(description="Semantic File Search")
     @inject
@@ -73,32 +102,17 @@ def semantic_file_search(self, query: str, llm_client: LlmClient = injected):
 
         self.context.event_manager.add_log(f'Searching for "{query}"...')
 
-        class FilePath(BaseModel):
-            file_path: str
-            repo_name: str
-
         all_valid_paths = "\n".join(
             [
                 f"FILES IN REPO {repo_name}:\n{files_per_repo[repo_name]}\n------------"
                 for repo_name in repo_names
             ]
         )
-        prompt = textwrap.dedent(
-            """
-            I'm searching for the file in this codebase that contains {query}. Please pick the most relevant file from the following list:
-            ------------
-            {valid_file_paths}
-            """
-        ).format(query=query, valid_file_paths=all_valid_paths)
-
-        response = llm_client.generate_structured(
-            prompt=prompt,
-            model=GeminiProvider(model_name="gemini-2.0-flash-001"),
-            response_format=FilePath,
+        file_location = self._semantic_file_search_completion(
+            query, all_valid_paths, repo_names, llm_client
         )
-        result = response.parsed
-        file_path = result.file_path if result else None
-        repo_name = result.repo_name if result else None
+        file_path = file_location.file_path if file_location else None
+        repo_name = file_location.repo_name if file_location else None
         if file_path is None or repo_name is None:
             return "Could not figure out which file matches what you were looking for. You'll have to try yourself."
 

@@ -1,7 +1,9 @@
+import textwrap
 from unittest.mock import MagicMock, patch
 
 import pytest
 
+from seer.automation.agent.client import LlmClient
 from seer.automation.autofix.autofix_context import AutofixContext
 from seer.automation.autofix.tools import BaseTools
 from seer.automation.codebase.repo_client import RepoClientType
@@ -81,6 +83,47 @@ def test_semantic_file_search_found(self, autofix_tools: BaseTools):
         expected = "This file might be what you're looking for: `src/file1.py`. Contents:\n\ntest file contents"
         assert result == expected
 
+    @pytest.mark.vcr()
+    @pytest.mark.parametrize(
+        "repo_names",
+        (
+            ["owner/repo", "owner/another-repo"],
+            ["owner/another-repo"],  # fall back to str RepoName
+            ["owner/repo", "owner/another-repo"] * 100,  # fall back to str RepoName
+        ),
+    )
+    def test_semantic_file_search_completion(self, autofix_tools: BaseTools, repo_names: list[str]):
+        """
+        For each `repo_names` variant, the completion should select
+        `tests/clients/test_gemini.py` from `owner/another-repo`.
+        """
+        query = "find the file which tests google's LLM"
+        valid_file_paths = textwrap.dedent(
+            """
+            FILES IN REPO owner/repo:
+            src/
+            └──something.py
+            tests/
+            └──another/
+                └──test_thing.py
+            ------------
+            FILES IN REPO owner/another-repo:
+            src/
+            └──clients/
+                ├──claude.py
+                ├──gemini.py
+                └──openai.py
+            tests/
+            └──clients/
+                ├──test_claude.py
+                ├──test_gemini.py
+                └──test_openai.py
+            """
+        )
+
+        llm_client = LlmClient()
+        file_location = autofix_tools._semantic_file_search_completion(
+            query, valid_file_paths, repo_names, llm_client
+        )
+        # Fail with a clear assertion instead of an AttributeError if nothing was parsed.
+        assert file_location is not None
+        assert file_location.repo_name == "owner/another-repo"
+        assert file_location.file_path == "tests/clients/test_gemini.py"
+
     def test_semantic_file_search_not_found_no_file_path(self, autofix_tools: BaseTools):
         dummy_repo = MagicMock(full_name="owner/test_repo")
         autofix_tools.context.state.get.return_value.readable_repos = [dummy_repo]