From 821de8829b595bdf0eea32e64273e688449d1428 Mon Sep 17 00:00:00 2001 From: Spyros Date: Wed, 25 Sep 2024 09:25:08 +0100 Subject: [PATCH] move cycle detection to the graph 'add_edge' method to fix databases with existing cycles --- application/database/db.py | 52 ++++++++++----------- application/database/inmemory_graph.py | 62 +++++++++++++++++--------- 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/application/database/db.py b/application/database/db.py index 1f2df3ff..12d7b743 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -1,3 +1,4 @@ +from sqlalchemy import CheckConstraint import networkx as nx import uuid import neo4j @@ -129,6 +130,7 @@ class InternalLinks(BaseModel): # type: ignore cre, name="uq_pair", ), + sqla.CheckConstraint("type != 'PartOf'", name="No 'PartOf' links"), ) @@ -683,8 +685,10 @@ def with_graph(self) -> "Node_collection": graph_singleton = inmemory_graph.Singleton_Graph_Storage.instance() self.graph.with_graph( graph=graph_singleton, - graph_data=self.__get_all_nodes_and_cres(), + graph_data=self.__get_all_nodes_and_cres(cres_only=True), ) + logger.info("Successfully loaded CRE graph in memory") + return self def __get_external_links(self) -> List[Tuple[CRE, Node, str]]: @@ -734,13 +738,16 @@ def __get_unlinked_cres(self) -> List[CRE]: ) return cres - def __get_all_nodes_and_cres(self) -> List[cre_defs.Document]: + def __get_all_nodes_and_cres( + self, cres_only: bool = False + ) -> List[cre_defs.Document]: result = [] nodes = [] cres = [] - node_ids = self.session.query(Node.id).all() - for nid in node_ids: - result.extend(self.get_nodes(db_id=nid[0])) + if not cres_only: + node_ids = self.session.query(Node.id).all() + for nid in node_ids: + result.extend(self.get_nodes(db_id=nid[0])) cre_ids = self.session.query(CRE.id).all() for cid in cre_ids: @@ -1436,7 +1443,7 @@ def add_cre(self, cre: cre_defs.CRE) -> CRE: entry.tags = ",".join(cre.tags) return entry else: - logger.info("did not know of %s ,adding" % cre.name) + logger.info("did not know of cre %s ,adding" % cre.name) entry = CRE( description=cre.description, name=cre.name, @@ -1452,6 +1459,9 @@ def add_cre(self, cre: cre_defs.CRE) -> CRE: def add_node( self, node: cre_defs.Node, comparison_skip_attributes: List = ["link"] ) -> Optional[Node]: + if not node: + raise ValueError(f"Node is None") + return None dbnode = dbNodeFromNode(node) if not dbnode: logger.warning(f"{node} could not be transformed to a DB object") @@ -1472,7 +1482,7 @@ def add_node( return entry else: logger.info( - f"did not know of {dbnode.name}:{dbnode.section}:{dbnode.section_id} ,adding" + f"did not know of node {dbnode.name}:{dbnode.section}:{dbnode.section_id} ,adding" ) self.session.add(dbnode) self.session.commit() @@ -1590,32 +1600,16 @@ def add_internal_link( higher_cre = CREfromDB(higher) lower_cre = CREfromDB(lower) link_to = cre_defs.Link(document=lower_cre, ltype=ltype) - cycle = self.graph.introduces_cycle(doc_from=higher_cre, link_to=link_to) - - if not cycle: + try: + self.graph.add_link(doc_from=higher_cre, link_to=link_to) self.session.add( InternalLinks(type=ltype.value, cre=lower.id, group=higher.id) ) self.session.commit() - if self.graph: - self.graph.add_link(doc_from=higher_cre, link_to=link_to) - else: - for item in cycle: - from_id = item[0].replace("CRE: ", "") - to_id = item[1].replace("CRE: ", "") - from_cre = self.session.query(CRE).filter(lower.id == from_id).first() - to_cre = self.session.query(CRE).filter(lower.id == to_id).first() - if from_cre and to_cre: - item[0].replace(from_id, from_cre.name) - item[1].replace(to_id, to_cre.name) - - logger.warning( - f"A link between CREs {higher.external_id}-{higher.name} and" - f" {lower.external_id}-{lower.name} " - f"would introduce cycle {cycle}, skipping" - ) - return None - return cre_defs.Link(document=lower, ltype=ltype) + return cre_defs.Link(document=lower, ltype=ltype) + except inmemory_graph.CycleDetectedError as cde: + # cycle detected, do nothing + pass def add_link( self, diff --git a/application/database/inmemory_graph.py b/application/database/inmemory_graph.py index 2f49abcf..be747326 100644 --- a/application/database/inmemory_graph.py +++ b/application/database/inmemory_graph.py @@ -10,6 +10,11 @@ logger.setLevel(logging.INFO) +class CycleDetectedError(Exception): + def __init__(self, message): + super().__init__(message) + + class Singleton_Graph_Storage(nx.DiGraph): __instance: "Singleton_Graph_Storage" = None @@ -30,10 +35,10 @@ class CRE_Graph: def get_raw_graph(self): return self.__graph - def with_graph(cls, graph: nx.Graph, graph_data: List[defs.Document]): - cls.__graph = graph + def with_graph(self, graph: nx.Graph, graph_data: List[defs.Document]): + self.__graph = graph if not len(graph.edges): - cls.__load_cre_graph(graph_data) + self.__load_cre_graph(graph_data) def introduces_cycle(self, doc_from: defs.Document, link_to: defs.Link): try: @@ -62,7 +67,7 @@ def introduces_cycle(self, doc_from: defs.Document, link_to: defs.Link): def has_cycle(self): try: - ex = nx.find_cycle(self.__graph) + ex = nx.find_cycle(self.__graph, orientation="original") return ex except nx.exception.NetworkXNoCycle: return None @@ -132,30 +137,40 @@ def get_path(self, start: str, end: str) -> List[Tuple[str, str]]: except nx.NetworkXNoPath: return [] - def add_cre(cls, cre: defs.CRE): + def add_cre(self, cre: defs.CRE): if not isinstance(cre, defs.CRE): raise ValueError( f"inmemory graph add_cre takes only cre objects, instead got {type(cre)}" ) graph_cre = f"{defs.Credoctypes.CRE.value}: {cre.id}" - if cre and graph_cre not in cls.__graph.nodes(): - cls.__graph.add_node(graph_cre, internal_id=cre.id) + if cre and graph_cre not in self.__graph.nodes(): + self.__graph.add_node(graph_cre, internal_id=cre.id) - def add_dbnode(cls, dbnode: defs.Node): + def add_dbnode(self, dbnode: defs.Node): graph_node = "Node: " + str(dbnode.id) - - if dbnode and graph_node not in cls.__graph.nodes(): - cls.__graph.add_node(graph_node, internal_id=dbnode.id) - else: - logger.error("Called with dbnode being none") + if dbnode and graph_node not in self.__graph.nodes(): + self.__graph.add_node(graph_node, internal_id=dbnode.id) def add_link(self, doc_from: defs.Document, link_to: defs.Link): + logger.debug( + f"adding link {doc_from.id}, {link_to.document.id} ltype: {link_to.ltype}" + ) + if ( + doc_from.doctype == defs.Credoctypes.CRE + and link_to.document.doctype == defs.Credoctypes.CRE + ): + cycle = self.introduces_cycle(doc_from=doc_from, link_to=link_to) + if cycle: + warn = f"A link between CREs {doc_from.id}-{doc_from.name} and {link_to.document.id}-{link_to.document.name} would introduce cycle {cycle}, skipping" + logger.warning(warn) + raise CycleDetectedError(warn) + self.__graph = self.__add_graph_edge( doc_from=doc_from, link_to=link_to, graph=self.__graph ) def __add_graph_edge( - cls, + self, doc_from: defs.Document, link_to: defs.Link, graph: nx.DiGraph, @@ -218,15 +233,22 @@ def __add_graph_edge( ) return graph - def __load_cre_graph(cls, documents: List[defs.Document]): + def __load_cre_graph(self, documents: List[defs.Document]): for doc in documents: + if not doc: + continue if doc.doctype == defs.Credoctypes.CRE: - cls.add_cre(cre=doc) + self.add_cre(cre=doc) else: - cls.add_dbnode(dbnode=doc) + self.add_dbnode(dbnode=doc) for link in doc.links: + if not link.document: + logger.error(f"doc {doc}, has a link with a document that's None") if link.document.doctype == defs.Credoctypes.CRE: - cls.add_cre(cre=link.document) + self.add_cre(cre=link.document) else: - cls.add_dbnode(dbnode=link.document) - cls.__add_graph_edge(doc_from=doc, link_to=link, graph=cls.__graph) + self.add_dbnode(dbnode=link.document) + try: + self.add_link(doc_from=doc, link_to=link) + except CycleDetectedError as cde: + pass