Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Store Bot Comment Embeddings from Closed PRs #2100

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
70 changes: 70 additions & 0 deletions src/migrations/versions/ee9136e6ff84_migration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Migration

Revision ID: ee9136e6ff84
Revises: e0fcdc14251c
Create Date: 2025-03-06 14:31:33.084873

"""

import sqlalchemy as sa
from alembic import op
from pgvector.sqlalchemy import Vector # type: ignore
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "ee9136e6ff84"
down_revision = "e0fcdc14251c"
branch_labels = None
depends_on = None


def upgrade():
    """Create the ``review_comments_embedding`` table and its three indexes.

    The table holds one 768-dimension embedding per row together with the
    comment body, a good-pattern flag, JSONB metadata and the provider/owner/
    repo/PR identifiers. After the table is created, an HNSW index (cosine
    ops) is added on ``embedding``, plus plain indexes on ``is_good_pattern``
    and on ``(owner, repo, pr_id)``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "review_comments_embedding",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("provider", sa.String(), nullable=False),
        sa.Column("owner", sa.String(), nullable=False),
        sa.Column("repo", sa.String(), nullable=False),
        sa.Column("pr_id", sa.BigInteger(), nullable=False),
        sa.Column("body", sa.String(), nullable=False),
        sa.Column("is_good_pattern", sa.Boolean(), nullable=False),
        sa.Column("embedding", Vector(dim=768), nullable=False),
        sa.Column("comment_metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
        # No server-side default is declared here, so inserts must supply created_at.
        sa.Column("created_at", sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint("id"),
        # NOTE(review): this key has no per-comment identifier, so it allows only one
        # row per (provider, pr_id, repo, owner) - confirm that is intended.
        sa.UniqueConstraint("provider", "pr_id", "repo", "owner"),
    )
    with op.batch_alter_table("review_comments_embedding", schema=None) as batch_op:
        # Approximate-nearest-neighbour index over the embedding column.
        batch_op.create_index(
            "ix_review_comments_embedding_hnsw",
            ["embedding"],
            unique=False,
            postgresql_using="hnsw",
            postgresql_with={"m": 16, "ef_construction": 200},
            postgresql_ops={"embedding": "vector_cosine_ops"},
        )
        batch_op.create_index(
            "ix_review_comments_is_good_pattern", ["is_good_pattern"], unique=False
        )
        batch_op.create_index(
            "ix_review_comments_repo_owner_pr", ["owner", "repo", "pr_id"], unique=False
        )

    # ### end Alembic commands ###


def downgrade():
    """Drop the three indexes, then the ``review_comments_embedding`` table itself."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("review_comments_embedding", schema=None) as batch_op:
        batch_op.drop_index("ix_review_comments_repo_owner_pr")
        batch_op.drop_index("ix_review_comments_is_good_pattern")
        batch_op.drop_index(
            "ix_review_comments_embedding_hnsw",
            postgresql_using="hnsw",
            postgresql_with={"m": 16, "ef_construction": 200},
            postgresql_ops={"embedding": "vector_cosine_ops"},
        )

    op.drop_table("review_comments_embedding")
    # ### end Alembic commands ###
9 changes: 9 additions & 0 deletions src/seer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
CodecovTaskRequest,
CodegenBaseRequest,
CodegenBaseResponse,
CodegenPrClosedResponse,
CodegenPrReviewResponse,
CodegenPrReviewStateRequest,
CodegenPrReviewStateResponse,
Expand All @@ -60,6 +61,7 @@
CodegenUnitTestsStateResponse,
)
from seer.automation.codegen.tasks import (
codegen_pr_closed,
codegen_pr_review,
codegen_relevant_warnings,
codegen_unittest,
Expand Down Expand Up @@ -257,6 +259,11 @@ def codegen_unit_tests_endpoint(data: CodegenBaseRequest) -> CodegenUnitTestsRes
return codegen_unittest(data)


@json_api(blueprint, "/v1/automation/codegen/pr-closed")
def codegen_pr_closed_endpoint(data: CodegenBaseRequest) -> CodegenPrClosedResponse:
    # HTTP entry point for the PR-closed flow: hand the request straight to the
    # codegen_pr_closed task helper and return whatever it produces.
    response = codegen_pr_closed(data)
    return response


@json_api(blueprint, "/v1/automation/codegen/unit-tests/state")
def codegen_unit_tests_state_endpoint(
data: CodegenUnitTestsStateRequest,
Expand Down Expand Up @@ -307,6 +314,8 @@ def codecov_request_endpoint(
return codegen_pr_review_endpoint(data.data)
elif data.request_type == "unit-tests":
return codegen_unit_tests_endpoint(data.data)
elif data.request_type == "pr-closed":
return codegen_pr_closed_endpoint(data.data)
raise ValueError(f"Unsupported request_type: {data.request_type}")


Expand Down
22 changes: 22 additions & 0 deletions src/seer/automation/codebase/repo_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,31 @@ def get_codecov_pr_review_app_credentials(
return app_id, private_key


@inject
def get_codecov_pr_closed_app_credentials(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can just reuse the PR review credentials; that way we don't have to add additional env values.

config: AppConfig = injected,
) -> tuple[int | str | None, str | None]:
app_id = config.GITHUB_CODECOV_PR_CLOSED_APP_ID
private_key = config.GITHUB_CODECOV_PR_CLOSED_PRIVATE_KEY

if not app_id:
logger.warning("No key set GITHUB_CODECOV_PR_CLOSED_APP_ID")
if not private_key:
logger.warning("No key set GITHUB_CODECOV_PR_CLOSED_PRIVATE_KEY")

if not app_id or not private_key:
sentry_sdk.capture_message("Invalid credentials for codecov pr closed app.")
return get_write_app_credentials()

return app_id, private_key


class RepoClientType(str, Enum):
    """Names the credential set a ``RepoClient`` should be constructed with."""

    READ = "read"
    WRITE = "write"
    CODECOV_UNIT_TEST = "codecov_unit_test"
    CODECOV_PR_REVIEW = "codecov_pr_review"
    CODECOV_PR_CLOSED = "codecov_pr_closed"


class RepoClient:
Expand Down Expand Up @@ -225,6 +245,8 @@ def from_repo_definition(cls, repo_def: RepoDefinition, type: RepoClientType):
return cls(*get_codecov_unit_test_app_credentials(), repo_def)
elif type == RepoClientType.CODECOV_PR_REVIEW:
return cls(*get_codecov_pr_review_app_credentials(), repo_def)
elif type == RepoClientType.CODECOV_PR_CLOSED:
return cls(*get_codecov_pr_closed_app_credentials(), repo_def)

return cls(*get_read_app_credentials(), repo_def)

Expand Down
17 changes: 15 additions & 2 deletions src/seer/automation/codegen/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ class CodegenPrReviewRequest(CodegenBaseRequest):
pass


# Request model for the PR-closed flow; declares no fields beyond CodegenBaseRequest.
class CodegenPrClosedRequest(CodegenBaseRequest):
    pass


class CodegenContinuation(CodegenState):
request: CodegenBaseRequest

Expand Down Expand Up @@ -82,6 +86,10 @@ class CodegenUnitTestsResponse(CodegenBaseResponse):
pass


# Response model for the PR-closed flow; declares no fields beyond CodegenBaseResponse.
class CodegenPrClosedResponse(CodegenBaseResponse):
    pass


class CodegenUnitTestsStateRequest(BaseModel):
run_id: int

Expand Down Expand Up @@ -207,6 +215,11 @@ class CodePredictRelevantWarningsOutput(BaseComponentOutput):


class CodecovTaskRequest(BaseModel):
    # Envelope for a task request: `request_type` names the flow and `data`
    # carries that flow's payload.
    #
    # Each field is annotated exactly once. The earlier, narrower annotations of
    # `data` and `request_type` were shadowed by these and had no effect.
    data: (
        CodegenUnitTestsRequest
        | CodegenPrReviewRequest
        | CodegenRelevantWarningsRequest
        | CodegenPrClosedRequest
    )
    external_owner_id: str
    request_type: Literal["unit-tests", "pr-review", "relevant-warnings", "pr-closed"]
154 changes: 154 additions & 0 deletions src/seer/automation/codegen/pr_closed_step.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from asyncio.log import logger
from typing import Any

from github.PullRequestComment import PullRequestComment
from langfuse.decorators import observe
from sentry_sdk.ai.monitoring import ai_track
from sqlalchemy.dialects.postgresql import insert

from celery_app.app import celery_app
from seer.automation.agent.embeddings import GoogleProviderEmbeddings
from seer.automation.autofix.config import (
AUTOFIX_EXECUTION_HARD_TIME_LIMIT_SECS,
AUTOFIX_EXECUTION_SOFT_TIME_LIMIT_SECS,
)
from seer.automation.codebase.repo_client import RepoClientType
from seer.automation.codegen.step import CodegenStep
from seer.automation.models import RepoDefinition
from seer.automation.pipeline import PipelineStepTaskRequest
from seer.automation.state import DbStateRunTypes
from seer.db import DbReviewCommentEmbedding, Session


# Request payload for the PR-closed pipeline step.
class PrClosedStepRequest(PipelineStepTaskRequest):
    # Passed to repo.get_pull() by PrClosedStep._invoke to load the pull request.
    pr_id: int
    # presumably identifies the repository the PR belongs to - confirm against the step context
    repo_definition: RepoDefinition


class CommentAnalyzer:
    """
    Handles comment analysis logic

    Decides whether a review comment was written by the bot and classifies it
    as a good or bad pattern from its thumbs-up / thumbs-down reactions.
    """

    def __init__(self, bot_username: str = "codecov-ai-reviewer[bot]"):
        """Store the login that identifies the bot's own comments."""
        self.bot_username = bot_username

    def is_bot_comment(self, comment: "PullRequestComment") -> bool:
        """Check if comment is authored by bot

        Returns False, rather than raising, when the comment carries no user
        object.
        """
        author = comment.user
        return author is not None and author.login == self.bot_username

    def analyze_reactions(self, comment: "PullRequestComment") -> tuple[bool, bool]:
        """
        Analyze reactions on a comment
        Returns: (is_good_pattern, is_bad_pattern)

        Only "+1" and "-1" reactions are counted. A tie, including a comment
        with no reactions at all, counts as a good pattern. The two flags are
        always opposites of each other.
        """
        upvotes = 0
        downvotes = 0
        # Single pass: the reactions are fetched once and counted once, so a
        # one-shot iterable is not exhausted before the downvotes are tallied.
        for reaction in comment.get_reactions():
            if reaction.content == "+1":
                upvotes += 1
            elif reaction.content == "-1":
                downvotes += 1

        is_good_pattern = upvotes >= downvotes
        return is_good_pattern, not is_good_pattern


@celery_app.task(
    soft_time_limit=AUTOFIX_EXECUTION_SOFT_TIME_LIMIT_SECS,
    time_limit=AUTOFIX_EXECUTION_HARD_TIME_LIMIT_SECS,
)
def pr_closed_task(*args, request: dict[str, Any]):
    """Celery entry point: build a PrClosedStep from the raw request dict and run it.

    Positional arguments are accepted but not used.
    """
    step = PrClosedStep(request, DbStateRunTypes.PR_CLOSED)
    step.invoke()


class PrClosedStep(CodegenStep):
    """
    This class represents the PR Closed step in the codegen pipeline. It is responsible for
    processing a closed or merged PR, including gathering and analyzing comment reactions.

    For each review comment authored by the bot, the comment body is embedded and inserted
    into ``DbReviewCommentEmbedding`` along with its good-pattern flag and metadata.
    """

    name = "PrClosedStep"
    max_retries = 2

    @staticmethod
    def _instantiate_request(request: dict[str, Any]) -> PrClosedStepRequest:
        """Validate the raw request dict into a ``PrClosedStepRequest``."""
        return PrClosedStepRequest.model_validate(request)

    @staticmethod
    def get_task():
        """Return the Celery task that runs this step."""
        return pr_closed_task

    def __init__(self, request: dict[str, Any], type: DbStateRunTypes):
        super().__init__(request, type)
        self.analyzer = CommentAnalyzer()

    def _process_comment(self, comment: PullRequestComment, pr):
        """Embed one bot comment and insert it into the embeddings table.

        Any exception is logged with the comment id and PR URL, then re-raised.
        """
        try:
            is_good_pattern, is_bad_pattern = self.analyzer.analyze_reactions(comment)

            # Use the step's own logger, like every other log call in this class,
            # instead of the logger imported from asyncio.log.
            self.logger.info(
                f"Processing bot comment id {comment.id} on PR {pr.url}: "
                f"good_pattern={is_good_pattern}, "
                f"bad_pattern={is_bad_pattern}"
            )

            model = GoogleProviderEmbeddings.model(
                "text-embedding-005", task_type="CODE_RETRIEVAL_QUERY"
            )
            # encode() expects list[str], returns 2D array
            embedding = model.encode([comment.body])[0]

            with Session() as session:
                insert_stmt = insert(DbReviewCommentEmbedding).values(
                    provider="github",
                    owner=pr.base.repo.owner.login,
                    repo=pr.base.repo.name,
                    pr_id=pr.number,
                    body=comment.body,
                    is_good_pattern=is_good_pattern,
                    comment_metadata={
                        "url": comment.html_url,
                        "comment_id": comment.id,
                        "location": (
                            {"file_path": comment.path, "line_number": comment.position}
                            if hasattr(comment, "path")
                            else None
                        ),
                        "timestamps": {
                            "created_at": comment.created_at.isoformat(),
                            "updated_at": comment.updated_at.isoformat(),
                        },
                    },
                    embedding=embedding,
                )

                # NOTE(review): the conflict target has no comment identifier, so once one
                # bot comment is stored for a PR, every later bot comment on that PR hits
                # the conflict and is silently skipped - confirm that is intended.
                session.execute(
                    insert_stmt.on_conflict_do_nothing(
                        index_elements=["provider", "pr_id", "repo", "owner"]
                    )
                )
                session.commit()

        except Exception as e:
            self.logger.error(f"Error processing comment {comment.id} on PR {pr.url}: {e}")
            raise

    @observe(name="Codegen - PR Closed")
    @ai_track(description="Codegen - PR Closed Step")
    def _invoke(self, **kwargs):
        """Load the PR, process every bot-authored review comment, then mark the run completed.

        Errors raised while iterating or processing comments are logged and re-raised.
        """
        self.logger.info("Executing Codegen - PR Closed Step")
        self.context.event_manager.mark_running()

        repo_client = self.context.get_repo_client(type=RepoClientType.CODECOV_PR_CLOSED)
        pr = repo_client.repo.get_pull(self.request.pr_id)

        try:
            review_comments = pr.get_review_comments()

            for comment in review_comments:
                if self.analyzer.is_bot_comment(comment):
                    self._process_comment(comment, pr)

            self.context.event_manager.mark_completed()

        except Exception as e:
            self.logger.error(f"Error processing closed PR {pr.url}: {e}")
            raise
Loading
Loading