Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(autofix) Constrain repo name in semantic search #2176

Merged
merged 5 commits into from
Mar 23, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 34 additions & 20 deletions src/seer/automation/autofix/tools.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
import subprocess
import textwrap
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import cast
from typing import Literal, TypeAlias, cast

from langfuse.decorators import observe
from pydantic import BaseModel
@@ -54,6 +54,35 @@ def _get_repo_names(self) -> list[str]:
else:
raise ValueError(f"Unsupported context type: {type(self.context)}")

def _semantic_file_search_completion(
    self, query: str, valid_file_paths: str, repo_names: list[str], llm_client: LlmClient
):
    """Ask the LLM which listed file (and which repo) best matches ``query``.

    Args:
        query: Description of what the sought-after file contains.
        valid_file_paths: Pre-rendered listing of candidate files; it is
            interpolated verbatim into the prompt.
        repo_names: Repo names used to constrain the ``repo_name`` field of
            the structured response when their count allows it.
        llm_client: Client used to issue the structured-output request.

    Returns:
        The ``parsed`` attribute of the structured response, shaped like the
        local ``FileLocation`` model (``file_path`` and ``repo_name``).
    """
    repo_count = len(repo_names)
    if 2 <= repo_count < 100:
        # Constrain the answer to the known repo names.
        # Lower bound avoids Gemini-Pydantic incompatibility.
        # Upper bound is b/c structured output can't handle too many options in a Literal.
        RepoName: TypeAlias = Literal[tuple(repo_names)]  # type: ignore[valid-type]
    else:
        # Outside the supported range: accept any string as the repo name.
        RepoName: TypeAlias = str  # type: ignore[no-redef]

    # The response schema is built per call because the allowed repo names
    # are only known at runtime.
    class FileLocation(BaseModel):
        file_path: str
        repo_name: RepoName  # type: ignore

    prompt_template = textwrap.dedent(
        """
        I'm searching for the file in this codebase that contains {query}. Please pick the most relevant file from the following list:
        ------------
        {valid_file_paths}
        """
    )

    completion = llm_client.generate_structured(
        prompt=prompt_template.format(query=query, valid_file_paths=valid_file_paths),
        model=GeminiProvider(model_name="gemini-2.0-flash-001"),
        response_format=FileLocation,
    )
    return completion.parsed

@observe(name="Semantic File Search")
@ai_track(description="Semantic File Search")
@inject
@@ -73,32 +102,17 @@ def semantic_file_search(self, query: str, llm_client: LlmClient = injected):

self.context.event_manager.add_log(f'Searching for "{query}"...')

class FilePath(BaseModel):
file_path: str
repo_name: str

all_valid_paths = "\n".join(
[
f"FILES IN REPO {repo_name}:\n{files_per_repo[repo_name]}\n------------"
for repo_name in repo_names
]
)
prompt = textwrap.dedent(
"""
I'm searching for the file in this codebase that contains {query}. Please pick the most relevant file from the following list:
------------
{valid_file_paths}
"""
).format(query=query, valid_file_paths=all_valid_paths)

response = llm_client.generate_structured(
prompt=prompt,
model=GeminiProvider(model_name="gemini-2.0-flash-001"),
response_format=FilePath,
file_location = self._semantic_file_search_completion(
query, all_valid_paths, repo_names, llm_client
)
result = response.parsed
file_path = result.file_path if result else None
repo_name = result.repo_name if result else None
file_path = file_location.file_path if file_location else None
repo_name = file_location.repo_name if file_location else None
if file_path is None or repo_name is None:
return "Could not figure out which file matches what you were looking for. You'll have to try yourself."

Binary file not shown.
Binary file not shown.
Binary file not shown.
43 changes: 43 additions & 0 deletions tests/automation/autofix/test_autofix_tools.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import textwrap
from unittest.mock import MagicMock, patch

import pytest

from seer.automation.agent.client import LlmClient
from seer.automation.autofix.autofix_context import AutofixContext
from seer.automation.autofix.tools import BaseTools
from seer.automation.codebase.repo_client import RepoClientType
@@ -81,6 +83,47 @@ def test_semantic_file_search_found(self, autofix_tools: BaseTools):
expected = "This file might be what you're looking for: `src/file1.py`. Contents:\n\ntest file contents"
assert result == expected

# NOTE(review): the vcr marker presumably replays recorded HTTP interactions
# instead of calling the live LLM API — confirm against the test setup.
@pytest.mark.vcr()
@pytest.mark.parametrize(
"repo_names",
(
# Two names: inside the 2 <= len < 100 window, so repo_name is a Literal of the names.
["owner/repo", "owner/another-repo"],
["owner/another-repo"], # one name, below the lower bound: fall back to str RepoName
["owner/repo", "owner/another-repo"] * 100, # 200 entries, past the upper bound: fall back to str RepoName
),
)
def test_semantic_file_search_completion(self, autofix_tools: BaseTools, repo_names: list[str]):
# Verifies that _semantic_file_search_completion picks the same file and repo
# for every parametrized repo_names list, whether repo_name is constrained
# to a Literal or left as a plain str.
query = "find the file which tests google's LLM"
# Listing for two repos; only owner/another-repo contains a gemini test file.
valid_file_paths = textwrap.dedent(
"""
FILES IN REPO owner/repo:
src/
└──something.py
tests/
└──another/
└──test_thing.py
------------
FILES IN REPO owner/another-repo:
src/
└──clients/
├──claude.py
├──gemini.py
└──openai.py
tests/
└──clients/
├──test_claude.py
├──test_gemini.py
└──test_openai.py
"""
)

# A real client is constructed here rather than a mock.
llm_client = LlmClient()
file_location = autofix_tools._semantic_file_search_completion(
query, valid_file_paths, repo_names, llm_client
)
# The expected answer is the gemini test file in the second repo.
assert file_location.repo_name == "owner/another-repo"
assert file_location.file_path == "tests/clients/test_gemini.py"

def test_semantic_file_search_not_found_no_file_path(self, autofix_tools: BaseTools):
dummy_repo = MagicMock(full_name="owner/test_repo")
autofix_tools.context.state.get.return_value.readable_repos = [dummy_repo]