From 344b3edf724edb8494c29fc75cc0d288e7147c41 Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Mon, 29 Jul 2024 22:54:00 -0500 Subject: [PATCH 01/11] Add schema models and ORM columns --- src/mainframe/models/orm.py | 5 +++ src/mainframe/models/schemas.py | 54 +++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/src/mainframe/models/orm.py b/src/mainframe/models/orm.py index b2c6265f..cb4d57d2 100644 --- a/src/mainframe/models/orm.py +++ b/src/mainframe/models/orm.py @@ -27,6 +27,9 @@ relationship, ) +from mainframe.models import Pydantic +from mainframe.models.schemas import Files + class Base(MappedAsDataclass, DeclarativeBase, kw_only=True): pass @@ -99,6 +102,8 @@ class Scan(Base): commit_hash: Mapped[Optional[str]] = mapped_column(default=None) + files: Mapped[Optional[Files]] = mapped_column(Pydantic(Files), default=None) + Index(None, Scan.status, postgresql_where=or_(Scan.status == Status.QUEUED, Scan.status == Status.PENDING)) diff --git a/src/mainframe/models/schemas.py b/src/mainframe/models/schemas.py index cad4d420..59beb8f5 100644 --- a/src/mainframe/models/schemas.py +++ b/src/mainframe/models/schemas.py @@ -1,10 +1,54 @@ +from __future__ import annotations + import datetime from enum import Enum -from typing import Any, Optional +from typing import TYPE_CHECKING, Annotated, Any, Optional + +from pydantic import BaseModel, Field, field_serializer, ConfigDict, RootModel + +if TYPE_CHECKING: + from mainframe.models.orm import Scan + +type MetaValue = int | float | bool | str | bytes + + +class Range(BaseModel): + """Represents the range in the source file that was matched.""" + + start: int + end: int + + +class Match(BaseModel): + """Represents a specific match by a pattern in a rule.""" + + range: Range + data: list[Annotated[int, Field(ge=0, lt=256)]] -from pydantic import BaseModel, Field, field_serializer, ConfigDict -from .orm import Scan +class PatternMatch(BaseModel): + """Represents the data matched by a pattern inside a rule.""" + + identifier: str + matches: list[Match] + + +class RuleMatch(BaseModel): + """Represents details information on a single rule match.""" + + identifier: str + patterns: list[PatternMatch] + metadata: dict[str, MetaValue] + + +class File(BaseModel): + """Represents a file and the rule matches for it.""" + + path: str + matches: list[RuleMatch] + + +Files = RootModel[list[File]] class ServerMetadata(BaseModel): @@ -44,6 +88,8 @@ class Package(BaseModel): commit_hash: Optional[str] + files: Optional[Files] + @classmethod def from_db(cls, scan: Scan): return cls( @@ -64,6 +110,7 @@ def from_db(cls, scan: Scan): finished_at=scan.finished_at, finished_by=scan.finished_by, commit_hash=scan.commit_hash, + files=scan.files, ) @field_serializer( @@ -132,6 +179,7 @@ class PackageScanResult(PackageSpecifier): score: int = 0 inspector_url: Optional[str] = None rules_matched: list[str] = [] + files: Optional[Files] = None class PackageScanResultFail(PackageSpecifier): From 3f7dc5feafbf26b36c54a9725d130b2404102be3 Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Mon, 29 Jul 2024 22:54:46 -0500 Subject: [PATCH 02/11] Exclude TYPE_CHECKING code from coverage Code inside a `if TYPE_CHECKING:` block will never be covered by coverage.py, because it's for type checkers (which coverage.py is not) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 08538a4c..95c85685 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,3 +147,4 @@ omit = [ [tool.coverage.report] fail_under = 100 +exclude_also = ["if TYPE_CHECKING:"] From 6a4db305f6fa8144d963c9c67064c98b78df05f0 Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Mon, 29 Jul 2024 22:56:58 -0500 Subject: [PATCH 03/11] Add utility to bridge Pydantic models and SQLAlchemy ORM Use SQLAlchemy's TypeDecorator to build a wrapper that can serialize and deserialize from a Pydantic model into a PostgreSQL JSONB column --- src/mainframe/models/__init__.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/mainframe/models/__init__.py b/src/mainframe/models/__init__.py index 7bb09cc1..1658d353 100644 --- a/src/mainframe/models/__init__.py +++ b/src/mainframe/models/__init__.py @@ -1 +1,26 @@ """Database models.""" + +from typing import Optional, Any, Type +from pydantic import BaseModel +from sqlalchemy import Dialect, TypeDecorator +from sqlalchemy.dialects.postgresql import JSONB + + +class Pydantic[T: BaseModel](TypeDecorator[T]): + """TypeDecorator to convert between Pydantic models and JSONB.""" + + impl = JSONB + + def __init__(self, pydantic_type: Type[T]): + super().__init__() + self.PydanticType = pydantic_type + + def process_bind_param(self, value: Optional[T], dialect: Dialect) -> dict[str, Any]: + if value: + return value.model_dump() + else: + return {} + + def process_result_value(self, value: Any, dialect: Dialect) -> Optional[T]: + if value: + return self.PydanticType.model_validate(value) From 5dbb2f5161ce3507b8f8540fecab050f8f0b9951 Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Mon, 29 Jul 2024 22:57:42 -0500 Subject: [PATCH 04/11] Add migration Add a migration that adds/removes the files column on the scans table --- .../587c186d91ee_better_match_information.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 alembic/versions/587c186d91ee_better_match_information.py diff --git a/alembic/versions/587c186d91ee_better_match_information.py b/alembic/versions/587c186d91ee_better_match_information.py new file mode 100644 index 00000000..cacffaf8 --- /dev/null +++ b/alembic/versions/587c186d91ee_better_match_information.py @@ -0,0 +1,29 @@ +"""better-match-information + +Revision ID: 587c186d91ee +Revises: 6991bcb18f89 +Create Date: 2024-07-27 19:51:33.408128 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "587c186d91ee" +down_revision = "6991bcb18f89" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("scans", sa.Column("files", postgresql.JSONB(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("scans", "files") + # ### end Alembic commands ### From ae2b2ad2d7c792cd85a2013683f9a4de66971263 Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Mon, 29 Jul 2024 22:58:14 -0500 Subject: [PATCH 05/11] Save detailed results from submit job results endpoint into DB --- src/mainframe/endpoints/package.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mainframe/endpoints/package.py b/src/mainframe/endpoints/package.py index 3fcad80e..fb2bd8b9 100644 --- a/src/mainframe/endpoints/package.py +++ b/src/mainframe/endpoints/package.py @@ -78,6 +78,7 @@ def submit_results( scan.score = result.score scan.finished_by = auth.subject scan.commit_hash = result.commit + scan.files = result.files # These are the rules that already have an entry in the database rules = session.scalars(select(Rule).where(Rule.name.in_(result.rules_matched))).all() From 71a15101820e1890f916e91093599a31765e414c Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Mon, 29 Jul 2024 22:58:55 -0500 Subject: [PATCH 06/11] Tests --- tests/test_package.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_package.py b/tests/test_package.py index f3a5865e..fb8663e4 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -20,10 +20,16 @@ from mainframe.json_web_token import AuthenticationData from mainframe.models.orm import Scan, Status from mainframe.models.schemas import ( + File, + Files, + Match, Package, PackageScanResult, PackageScanResultFail, PackageSpecifier, + PatternMatch, + Range, + RuleMatch, ) from mainframe.rules import Rules @@ -80,6 +86,32 @@ def test_package_lookup_rejects_invalid_combinations( assert e.value.status_code == 400 +def test_package_lookup_files(db_session: Session): + """Test that `lookup_package_info` returns detailed file information.""" + + range_ = Range(start=0, end=5) + match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF]) + pattern = PatternMatch(identifier="$pat", matches=[match]) + rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5}) + file = File(path="dist1/a/b.py", matches=[rule]) + files = Files([file]) + scan = Scan( + name="abc", + version="1.0.0", + status=Status.FINISHED, + queued_by="remmy", + files=files, + ) + + with db_session.begin(): + db_session.add(scan) + db_session.commit() + + package = lookup_package_info(db_session, name="abc", version="1.0.0")[0] + + assert package.files == files + + def test_handle_success(db_session: Session, test_data: list[Scan], auth: AuthenticationData, rules_state: Rules): job = get_jobs(db_session, auth, rules_state, batch=1) @@ -88,6 +120,13 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen name = job.name version = job.version + range_ = Range(start=0, end=5) + match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF]) + pattern = PatternMatch(identifier="$pat", matches=[match]) + rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5}) + file = File(path="dist1/a/b.py", matches=[rule]) + files = Files([file]) + body = PackageScanResult( name=job.name, version=job.version, @@ -95,6 +134,7 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen score=2, inspector_url="test inspector url", rules_matched=["a", "b", "c"], + files=files, ) submit_results(body, db_session, auth) @@ -107,6 +147,7 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen assert record.score == 2 assert record.inspector_url == "test inspector url" assert {rule.name for rule in record.rules} == {"a", "b", "c"} + assert record.files == files else: assert all(scan.status != Status.QUEUED for scan in test_data) From 872f0f0d58f2f0a19a336ea9208da3737b0e6d83 Mon Sep 17 00:00:00 2001 From: Robin <74519799+Robin5605@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:13:56 -0500 Subject: [PATCH 07/11] Enable caching in Pydantic <-> SQLA TypeDecorator Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com> Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com> --- src/mainframe/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mainframe/models/__init__.py b/src/mainframe/models/__init__.py index 1658d353..3dedc5ab 100644 --- a/src/mainframe/models/__init__.py +++ b/src/mainframe/models/__init__.py @@ -10,6 +10,7 @@ class Pydantic[T: BaseModel](TypeDecorator[T]): """TypeDecorator to convert between Pydantic models and JSONB.""" impl = JSONB + cache_ok = True def __init__(self, pydantic_type: Type[T]): super().__init__() From fe481f064ced90484ce79c5a7203de9e4f463ac8 Mon Sep 17 00:00:00 2001 From: Robin <74519799+Robin5605@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:15:07 -0500 Subject: [PATCH 08/11] Use snake case for pydantic_type arg in converter Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com> Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com> --- src/mainframe/models/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mainframe/models/__init__.py b/src/mainframe/models/__init__.py index 3dedc5ab..a0f2e447 100644 --- a/src/mainframe/models/__init__.py +++ b/src/mainframe/models/__init__.py @@ -14,7 +14,7 @@ class Pydantic[T: BaseModel](TypeDecorator[T]): def __init__(self, pydantic_type: Type[T]): super().__init__() - self.PydanticType = pydantic_type + self.pydantic_type = pydantic_type def process_bind_param(self, value: Optional[T], dialect: Dialect) -> dict[str, Any]: if value: @@ -24,4 +24,4 @@ def process_bind_param(self, value: Optional[T], dialect: Dialect) -> dict[str, def process_result_value(self, value: Any, dialect: Dialect) -> Optional[T]: if value: - return self.PydanticType.model_validate(value) + return self.pydantic_type.model_validate(value) From e04b165b43faf71548e988c89c757833e378d5e4 Mon Sep 17 00:00:00 2001 From: Robin <74519799+Robin5605@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:17:16 -0500 Subject: [PATCH 09/11] Make some docstrings more accurate Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com> Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com> --- src/mainframe/models/schemas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mainframe/models/schemas.py b/src/mainframe/models/schemas.py index 59beb8f5..28d3ea68 100644 --- a/src/mainframe/models/schemas.py +++ b/src/mainframe/models/schemas.py @@ -13,7 +13,7 @@ class Range(BaseModel): - """Represents the range in the source file that was matched.""" + """Represents the inclusive range in the source file that was matched.""" start: int end: int @@ -34,7 +34,7 @@ class PatternMatch(BaseModel): class RuleMatch(BaseModel): - """Represents details information on a single rule match.""" + """Represents the matches of a rule on a file""" identifier: str patterns: list[PatternMatch] From 92560f35c981d162d82ab63f9b99f5914409d3e3 Mon Sep 17 00:00:00 2001 From: Robin <74519799+Robin5605@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:18:02 -0500 Subject: [PATCH 10/11] Change range end to 4 in tests This should also imply that the ranges are inclusive Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com> Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com> --- tests/test_package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_package.py b/tests/test_package.py index fb8663e4..a8df4d28 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -89,7 +89,7 @@ def test_package_lookup_rejects_invalid_combinations( def test_package_lookup_files(db_session: Session): """Test that `lookup_package_info` returns detailed file information.""" - range_ = Range(start=0, end=5) + range_ = Range(start=0, end=4) match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF]) pattern = PatternMatch(identifier="$pat", matches=[match]) rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5}) @@ -120,7 +120,7 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen name = job.name version = job.version - range_ = Range(start=0, end=5) + range_ = Range(start=0, end=4) match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF]) pattern = PatternMatch(identifier="$pat", matches=[match]) rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5}) From 520c68076c21946a33787d5f98a894d99d94ed6a Mon Sep 17 00:00:00 2001 From: Robin5605 Date: Sat, 10 Aug 2024 01:19:16 -0500 Subject: [PATCH 11/11] Update database schema documentation --- docs/source/database_schema.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/database_schema.rst b/docs/source/database_schema.rst index 9d49e73c..faf3828b 100644 --- a/docs/source/database_schema.rst +++ b/docs/source/database_schema.rst @@ -44,7 +44,8 @@ Database Schema pending_by text, finished_by text, commit_hash text, - fail_reason text + fail_reason text, + files jsonb ); ALTER TABLE ONLY public.download_urls