From 344b3edf724edb8494c29fc75cc0d288e7147c41 Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Mon, 29 Jul 2024 22:54:00 -0500
Subject: [PATCH 01/11] Add schema models and ORM columns

---
 src/mainframe/models/orm.py     |  5 +++
 src/mainframe/models/schemas.py | 54 +++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/src/mainframe/models/orm.py b/src/mainframe/models/orm.py
index b2c6265f..cb4d57d2 100644
--- a/src/mainframe/models/orm.py
+++ b/src/mainframe/models/orm.py
@@ -27,6 +27,9 @@
     relationship,
 )
 
+from mainframe.models import Pydantic
+from mainframe.models.schemas import Files
+
 
 class Base(MappedAsDataclass, DeclarativeBase, kw_only=True):
     pass
@@ -99,6 +102,8 @@ class Scan(Base):
 
     commit_hash: Mapped[Optional[str]] = mapped_column(default=None)
 
+    files: Mapped[Optional[Files]] = mapped_column(Pydantic(Files), default=None)
+
 
 Index(None, Scan.status, postgresql_where=or_(Scan.status == Status.QUEUED, Scan.status == Status.PENDING))
 
diff --git a/src/mainframe/models/schemas.py b/src/mainframe/models/schemas.py
index cad4d420..59beb8f5 100644
--- a/src/mainframe/models/schemas.py
+++ b/src/mainframe/models/schemas.py
@@ -1,10 +1,54 @@
+from __future__ import annotations
+
 import datetime
 from enum import Enum
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Optional
+
+from pydantic import BaseModel, Field, field_serializer, ConfigDict, RootModel
+
+if TYPE_CHECKING:
+    from mainframe.models.orm import Scan
+
+type MetaValue = int | float | bool | str | bytes
+
+
+class Range(BaseModel):
+    """Represents the range in the source file that was matched."""
+
+    start: int
+    end: int
+
+
+class Match(BaseModel):
+    """Represents a specific match by a pattern in a rule."""
+
+    range: Range
+    data: list[Annotated[int, Field(ge=0, lt=256)]]
 
-from pydantic import BaseModel, Field, field_serializer, ConfigDict
 
-from .orm import Scan
+class PatternMatch(BaseModel):
+    """Represents the data matched by a pattern inside a rule."""
+
+    identifier: str
+    matches: list[Match]
+
+
+class RuleMatch(BaseModel):
+    """Represents details information on a single rule match."""
+
+    identifier: str
+    patterns: list[PatternMatch]
+    metadata: dict[str, MetaValue]
+
+
+class File(BaseModel):
+    """Represents a file and the rule matches for it."""
+
+    path: str
+    matches: list[RuleMatch]
+
+
+Files = RootModel[list[File]]
 
 
 class ServerMetadata(BaseModel):
@@ -44,6 +88,8 @@ class Package(BaseModel):
 
     commit_hash: Optional[str]
 
+    files: Optional[Files]
+
     @classmethod
     def from_db(cls, scan: Scan):
         return cls(
@@ -64,6 +110,7 @@ def from_db(cls, scan: Scan):
             finished_at=scan.finished_at,
             finished_by=scan.finished_by,
             commit_hash=scan.commit_hash,
+            files=scan.files,
         )
 
     @field_serializer(
@@ -132,6 +179,7 @@ class PackageScanResult(PackageSpecifier):
     score: int = 0
     inspector_url: Optional[str] = None
     rules_matched: list[str] = []
+    files: Optional[Files] = None
 
 
 class PackageScanResultFail(PackageSpecifier):

From 3f7dc5feafbf26b36c54a9725d130b2404102be3 Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Mon, 29 Jul 2024 22:54:46 -0500
Subject: [PATCH 02/11] Exclude TYPE_CHECKING code from coverage

Code inside an `if TYPE_CHECKING:` block never executes at runtime, so
coverage.py can never mark it as covered; exclude it from the report.
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 08538a4c..95c85685 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -147,3 +147,4 @@ omit = [
 
 [tool.coverage.report]
 fail_under = 100
+exclude_also = ["if TYPE_CHECKING:"]

From 6a4db305f6fa8144d963c9c67064c98b78df05f0 Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Mon, 29 Jul 2024 22:56:58 -0500
Subject: [PATCH 03/11] Add utility to bridge Pydantic models and SQLAlchemy
 ORM

Use SQLAlchemy's TypeDecorator to build a wrapper that serializes a
Pydantic model into a PostgreSQL JSONB column and deserializes it back.
---
 src/mainframe/models/__init__.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/mainframe/models/__init__.py b/src/mainframe/models/__init__.py
index 7bb09cc1..1658d353 100644
--- a/src/mainframe/models/__init__.py
+++ b/src/mainframe/models/__init__.py
@@ -1 +1,26 @@
 """Database models."""
+
+from typing import Optional, Any, Type
+from pydantic import BaseModel
+from sqlalchemy import Dialect, TypeDecorator
+from sqlalchemy.dialects.postgresql import JSONB
+
+
+class Pydantic[T: BaseModel](TypeDecorator[T]):
+    """TypeDecorator to convert between Pydantic models and JSONB."""
+
+    impl = JSONB
+
+    def __init__(self, pydantic_type: Type[T]):
+        super().__init__()
+        self.PydanticType = pydantic_type
+
+    def process_bind_param(self, value: Optional[T], dialect: Dialect) -> dict[str, Any]:
+        if value:
+            return value.model_dump()
+        else:
+            return {}
+
+    def process_result_value(self, value: Any, dialect: Dialect) -> Optional[T]:
+        if value:
+            return self.PydanticType.model_validate(value)

From 5dbb2f5161ce3507b8f8540fecab050f8f0b9951 Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Mon, 29 Jul 2024 22:57:42 -0500
Subject: [PATCH 04/11] Add migration

Add a migration that adds the files column to the scans table on upgrade
and drops it on downgrade
---
 .../587c186d91ee_better_match_information.py  | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 alembic/versions/587c186d91ee_better_match_information.py

diff --git a/alembic/versions/587c186d91ee_better_match_information.py b/alembic/versions/587c186d91ee_better_match_information.py
new file mode 100644
index 00000000..cacffaf8
--- /dev/null
+++ b/alembic/versions/587c186d91ee_better_match_information.py
@@ -0,0 +1,29 @@
+"""better-match-information
+
+Revision ID: 587c186d91ee
+Revises: 6991bcb18f89
+Create Date: 2024-07-27 19:51:33.408128
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "587c186d91ee"
+down_revision = "6991bcb18f89"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column("scans", sa.Column("files", postgresql.JSONB(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("scans", "files")
+    # ### end Alembic commands ###

From ae2b2ad2d7c792cd85a2013683f9a4de66971263 Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Mon, 29 Jul 2024 22:58:14 -0500
Subject: [PATCH 05/11] Save detailed results from submit job results endpoint
 into DB

---
 src/mainframe/endpoints/package.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mainframe/endpoints/package.py b/src/mainframe/endpoints/package.py
index 3fcad80e..fb2bd8b9 100644
--- a/src/mainframe/endpoints/package.py
+++ b/src/mainframe/endpoints/package.py
@@ -78,6 +78,7 @@ def submit_results(
         scan.score = result.score
         scan.finished_by = auth.subject
         scan.commit_hash = result.commit
+        scan.files = result.files
 
         # These are the rules that already have an entry in the database
         rules = session.scalars(select(Rule).where(Rule.name.in_(result.rules_matched))).all()

From 71a15101820e1890f916e91093599a31765e414c Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Mon, 29 Jul 2024 22:58:55 -0500
Subject: [PATCH 06/11] Tests

---
 tests/test_package.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/test_package.py b/tests/test_package.py
index f3a5865e..fb8663e4 100644
--- a/tests/test_package.py
+++ b/tests/test_package.py
@@ -20,10 +20,16 @@
 from mainframe.json_web_token import AuthenticationData
 from mainframe.models.orm import Scan, Status
 from mainframe.models.schemas import (
+    File,
+    Files,
+    Match,
     Package,
     PackageScanResult,
     PackageScanResultFail,
     PackageSpecifier,
+    PatternMatch,
+    Range,
+    RuleMatch,
 )
 from mainframe.rules import Rules
 
@@ -80,6 +86,32 @@ def test_package_lookup_rejects_invalid_combinations(
     assert e.value.status_code == 400
 
 
+def test_package_lookup_files(db_session: Session):
+    """Test that `lookup_package_info` returns detailed file information."""
+
+    range_ = Range(start=0, end=5)
+    match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF])
+    pattern = PatternMatch(identifier="$pat", matches=[match])
+    rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5})
+    file = File(path="dist1/a/b.py", matches=[rule])
+    files = Files([file])
+    scan = Scan(
+        name="abc",
+        version="1.0.0",
+        status=Status.FINISHED,
+        queued_by="remmy",
+        files=files,
+    )
+
+    with db_session.begin():
+        db_session.add(scan)
+        db_session.commit()
+
+    package = lookup_package_info(db_session, name="abc", version="1.0.0")[0]
+
+    assert package.files == files
+
+
 def test_handle_success(db_session: Session, test_data: list[Scan], auth: AuthenticationData, rules_state: Rules):
     job = get_jobs(db_session, auth, rules_state, batch=1)
 
@@ -88,6 +120,13 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen
         name = job.name
         version = job.version
 
+        range_ = Range(start=0, end=5)
+        match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF])
+        pattern = PatternMatch(identifier="$pat", matches=[match])
+        rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5})
+        file = File(path="dist1/a/b.py", matches=[rule])
+        files = Files([file])
+
         body = PackageScanResult(
             name=job.name,
             version=job.version,
@@ -95,6 +134,7 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen
             score=2,
             inspector_url="test inspector url",
             rules_matched=["a", "b", "c"],
+            files=files,
         )
         submit_results(body, db_session, auth)
 
@@ -107,6 +147,7 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen
         assert record.score == 2
         assert record.inspector_url == "test inspector url"
         assert {rule.name for rule in record.rules} == {"a", "b", "c"}
+        assert record.files == files
     else:
         assert all(scan.status != Status.QUEUED for scan in test_data)
 

From 872f0f0d58f2f0a19a336ea9208da3737b0e6d83 Mon Sep 17 00:00:00 2001
From: Robin <74519799+Robin5605@users.noreply.github.com>
Date: Tue, 30 Jul 2024 17:13:56 -0500
Subject: [PATCH 07/11] Enable caching in Pydantic <-> SQLA TypeDecorator

Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com>
Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com>
---
 src/mainframe/models/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mainframe/models/__init__.py b/src/mainframe/models/__init__.py
index 1658d353..3dedc5ab 100644
--- a/src/mainframe/models/__init__.py
+++ b/src/mainframe/models/__init__.py
@@ -10,6 +10,7 @@ class Pydantic[T: BaseModel](TypeDecorator[T]):
     """TypeDecorator to convert between Pydantic models and JSONB."""
 
     impl = JSONB
+    cache_ok = True
 
     def __init__(self, pydantic_type: Type[T]):
         super().__init__()

From fe481f064ced90484ce79c5a7203de9e4f463ac8 Mon Sep 17 00:00:00 2001
From: Robin <74519799+Robin5605@users.noreply.github.com>
Date: Tue, 30 Jul 2024 17:15:07 -0500
Subject: [PATCH 08/11] Use snake case for pydantic_type arg in converter

Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com>
Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com>
---
 src/mainframe/models/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mainframe/models/__init__.py b/src/mainframe/models/__init__.py
index 3dedc5ab..a0f2e447 100644
--- a/src/mainframe/models/__init__.py
+++ b/src/mainframe/models/__init__.py
@@ -14,7 +14,7 @@ class Pydantic[T: BaseModel](TypeDecorator[T]):
 
     def __init__(self, pydantic_type: Type[T]):
         super().__init__()
-        self.PydanticType = pydantic_type
+        self.pydantic_type = pydantic_type
 
     def process_bind_param(self, value: Optional[T], dialect: Dialect) -> dict[str, Any]:
         if value:
@@ -24,4 +24,4 @@ def process_bind_param(self, value: Optional[T], dialect: Dialect) -> dict[str,
 
     def process_result_value(self, value: Any, dialect: Dialect) -> Optional[T]:
         if value:
-            return self.PydanticType.model_validate(value)
+            return self.pydantic_type.model_validate(value)

From e04b165b43faf71548e988c89c757833e378d5e4 Mon Sep 17 00:00:00 2001
From: Robin <74519799+Robin5605@users.noreply.github.com>
Date: Tue, 30 Jul 2024 17:17:16 -0500
Subject: [PATCH 09/11] Make some docstrings more accurate

Co-authored-by: jonathan-d-zhang
<69145546+jonathan-d-zhang@users.noreply.github.com>
Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com>
---
 src/mainframe/models/schemas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mainframe/models/schemas.py b/src/mainframe/models/schemas.py
index 59beb8f5..28d3ea68 100644
--- a/src/mainframe/models/schemas.py
+++ b/src/mainframe/models/schemas.py
@@ -13,7 +13,7 @@
 
 
 class Range(BaseModel):
-    """Represents the range in the source file that was matched."""
+    """Represents the inclusive range in the source file that was matched."""
 
     start: int
     end: int
@@ -34,7 +34,7 @@ class PatternMatch(BaseModel):
 
 
 class RuleMatch(BaseModel):
-    """Represents details information on a single rule match."""
+    """Represents the matches of a rule on a file"""
 
     identifier: str
     patterns: list[PatternMatch]

From 92560f35c981d162d82ab63f9b99f5914409d3e3 Mon Sep 17 00:00:00 2001
From: Robin <74519799+Robin5605@users.noreply.github.com>
Date: Tue, 30 Jul 2024 17:18:02 -0500
Subject: [PATCH 10/11] Change range end to 4 in tests

This is also intended to make it apparent that ranges are inclusive.

Co-authored-by: jonathan-d-zhang <69145546+jonathan-d-zhang@users.noreply.github.com>
Signed-off-by: Robin <74519799+Robin5605@users.noreply.github.com>
---
 tests/test_package.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_package.py b/tests/test_package.py
index fb8663e4..a8df4d28 100644
--- a/tests/test_package.py
+++ b/tests/test_package.py
@@ -89,7 +89,7 @@ def test_package_lookup_rejects_invalid_combinations(
 def test_package_lookup_files(db_session: Session):
     """Test that `lookup_package_info` returns detailed file information."""
 
-    range_ = Range(start=0, end=5)
+    range_ = Range(start=0, end=4)
     match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF])
     pattern = PatternMatch(identifier="$pat", matches=[match])
     rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5})
@@ -120,7 +120,7 @@ def test_handle_success(db_session: Session, test_data: list[Scan], auth: Authen
         name = job.name
         version = job.version
 
-        range_ = Range(start=0, end=5)
+        range_ = Range(start=0, end=4)
         match = Match(range=range_, data=[0xDE, 0xAD, 0xBE, 0xEF])
         pattern = PatternMatch(identifier="$pat", matches=[match])
         rule = RuleMatch(identifier="rule1", patterns=[pattern], metadata={"author": "remmy", "score": 5})

From 520c68076c21946a33787d5f98a894d99d94ed6a Mon Sep 17 00:00:00 2001
From: Robin5605 <robinjefferson123@gmail.com>
Date: Sat, 10 Aug 2024 01:19:16 -0500
Subject: [PATCH 11/11] Update database schema documentation

---
 docs/source/database_schema.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/database_schema.rst b/docs/source/database_schema.rst
index 9d49e73c..faf3828b 100644
--- a/docs/source/database_schema.rst
+++ b/docs/source/database_schema.rst
@@ -44,7 +44,8 @@ Database Schema
         pending_by text,
         finished_by text,
         commit_hash text,
-        fail_reason text
+        fail_reason text,
+        files jsonb
     );
 
     ALTER TABLE ONLY public.download_urls